diff --git a/README.md b/README.md
index da413001a51a74be9de4c70f6c1b8f23732a3597..a1d6777e123b65c7242e13259de1440733bfa682 100644
--- a/README.md
+++ b/README.md
@@ -23,7 +23,7 @@ from https://github.com/18F/open-source-guide/blob/18f-pages/pages/making-readme
 4.What is the goal of this project? -->
 
-**PaddleSpeech** is an open-source toolkit on [PaddlePaddle](https://github.com/PaddlePaddle/Paddle) platform for a variety of critical tasks in speech, with the state-of-art and influential models.
+**PaddleSpeech** is an open-source toolkit on the [PaddlePaddle](https://github.com/PaddlePaddle/Paddle) platform for a variety of critical tasks in speech and audio, with state-of-the-art and influential models.
 
 ##### Speech-to-Text
 
@@ -86,77 +86,76 @@ from https://github.com/18F/open-source-guide/blob/18f-pages/pages/making-readme
 For more synthesized audios, please refer to [PaddleSpeech Text-to-Speech samples](https://paddlespeech.readthedocs.io/en/latest/tts/demo.html).
 
+##### Speech Translation
+
+| Input Audio | Translations Result |
+| :---: | :--- |
+| (audio sample) | 我 在 这栋 建筑 的 古老 门上 敲门。 |
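+
+The demo above can also be reproduced programmatically. Below is a minimal, hypothetical sketch in Python; it assumes the `STExecutor` class that backs the `st` command of the [PaddleSpeech Command Line](./paddlespeech/cli/README.md) (the import path, call signature, and file name are assumptions for illustration, not a documented API):
+
+```python
+from paddlespeech.cli import STExecutor  # assumed export path of the CLI's executor
+
+st = STExecutor()
+# Translate English speech into Chinese text; the file name is a placeholder
+# for any 16 kHz English recording.
+print(st(audio_file="input_16k.wav"))
+```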
+
 
 Via the easy-to-use, efficient, flexible and scalable implementation, our vision is to empower both industrial application and academic research, including training, inference & testing modules, and deployment process. To be more specific, this toolkit features at:
-- **Fast and Light-weight**: we provide high-speed and ultra-lightweight models that are convenient for industrial deployment.
+- **Ease of Use**: low barriers to install, and a [CLI](#quick-start) is available to quick-start your journey.
+- **Align to the State-of-the-Art**: we provide high-speed and ultra-lightweight models as well as cutting-edge technology.
 - **Rule-based Chinese frontend**: our frontend contains Text Normalization and Grapheme-to-Phoneme (G2P, including Polyphone and Tone Sandhi). Moreover, we use self-defined linguistic rules to adapt Chinese context.
 - **Varieties of Functions that Vitalize both Industrial and Academia**:
-  - *Implementation of critical audio tasks*: this toolkit contains audio functions like Speech Translation, Automatic Speech Recognition, Text-to-Speech Synthesis, Voice Cloning, etc.
+  - *Implementation of critical audio tasks*: this toolkit contains audio functions like Audio Classification, Speech Translation, Automatic Speech Recognition, Text-to-Speech Synthesis, etc.
   - *Integration of mainstream models and datasets*: the toolkit implements modules that participate in the whole pipeline of the speech tasks, and uses mainstream datasets like LibriSpeech, LJSpeech, AIShell, CSMSC, etc. See also [model list](#model-list) for more details.
-  - *Cascaded models application*: as an extension of the application of traditional audio tasks, we combine the workflows of aforementioned tasks with other fields like Natural language processing (NLP), like Punctuation Restoration.
+  - *Cascaded models application*: as an extension of traditional audio tasks, we combine the workflows of the aforementioned tasks with other fields like Natural Language Processing (NLP) and Computer Vision (CV).
 
 ## Installation
 
-The base environment in this page is
-- Ubuntu 16.04
-- python>=3.7
-- paddlepaddle>=2.2.0
-
-If you want to set up PaddleSpeech in other environment, please see the [installation](./docs/source/install.md) documents for all the alternatives.
+We strongly recommend that users install PaddleSpeech on *Linux* with *python>=3.7* and *paddlepaddle>=2.2.0*, where `paddlespeech` can be easily installed with `pip`:
+```shell
+pip install paddlespeech
+```
+If you want to set up PaddleSpeech in another environment, please see the [installation](./docs/source/install.md) documents for all the alternatives.
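+
+To verify the installation, a quick sanity check can be run. This is a minimal sketch: `paddle.__version__` is the standard PaddlePaddle version attribute, and both imports should succeed after `pip install paddlespeech`:
+```python
+import paddle
+import paddlespeech  # should import without errors once installed
+
+print(paddle.__version__)  # expect 2.2.0 or later
+```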
 
 ## Quick Start
 
-Developers can have a try of our model with only a few lines of code.
-
-A tiny DeepSpeech2 **Speech-to-Text** model training on toy set of LibriSpeech:
+Developers can try our models with the [PaddleSpeech Command Line](./paddlespeech/cli/README.md). Change `--input` to test your own audio or text.
+
+**Audio Classification**
+```shell
+paddlespeech cls --input input.wav
+```
+**Automatic Speech Recognition**
 ```shell
-cd examples/tiny/asr0/
-# source the environment
-source path.sh
-source ../../../utils/parse_options.sh
-# prepare data
-bash ./local/data.sh
-# train model, all `ckpt` under `exp` dir, if you use paddlepaddle-gpu, you can set CUDA_VISIBLE_DEVICES before the train script
-./local/train.sh conf/deepspeech2.yaml deepspeech2 offline
-# avg n best model to get the test model, in this case, n = 1
-avg.sh best exp/deepspeech2/checkpoints 1
-# evaluate the test model
-./local/test.sh conf/deepspeech2.yaml exp/deepspeech2/checkpoints/avg_1 offline
+paddlespeech asr --lang zh --input input_16k.wav
 ```
+**Speech Translation** (English to Chinese)
 
-For **Text-to-Speech**, try pretrained FastSpeech2 + Parallel WaveGAN on CSMSC:
+(not supported on Windows yet)
+```shell
+paddlespeech st --input input_16k.wav
+```
+**Text-to-Speech**
 ```shell
-cd examples/csmsc/tts3
-# download the pretrained models and unaip them
-wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip
-unzip pwg_baker_ckpt_0.4.zip
-wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip
-unzip fastspeech2_nosil_baker_ckpt_0.4.zip
-# source the environment
-source path.sh
-# run end-to-end synthesize
-FLAGS_allocator_strategy=naive_best_fit \
-FLAGS_fraction_of_gpu_memory_to_use=0.01 \
-python3 ${BIN_DIR}/synthesize_e2e.py \
-    --fastspeech2-config=fastspeech2_nosil_baker_ckpt_0.4/default.yaml \
-    --fastspeech2-checkpoint=fastspeech2_nosil_baker_ckpt_0.4/snapshot_iter_76000.pdz \
-    --fastspeech2-stat=fastspeech2_nosil_baker_ckpt_0.4/speech_stats.npy \
-    --pwg-config=pwg_baker_ckpt_0.4/pwg_default.yaml \
-    --pwg-checkpoint=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
-    --pwg-stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
-    --text=${BIN_DIR}/../sentences.txt \
-    --output-dir=exp/default/test_e2e \
-    --inference-dir=exp/default/inference \
-    --phones-dict=fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt
+paddlespeech tts --input "你好,欢迎使用百度飞桨深度学习框架!" --output output.wav
 ```
+(The input text means "Hello, welcome to the Baidu PaddlePaddle deep learning framework!")
 
-If you want to try more functions like training and tuning, please see [Speech-to-Text Quick Start](./docs/source/asr/quick_start.md) and [Text-to-Speech Quick Start](./docs/source/tts/quick_start.md).
+If you want to try more functions like training and tuning, please have a look at [Speech-to-Text Quick Start](./docs/source/asr/quick_start.md) and [Text-to-Speech Quick Start](./docs/source/tts/quick_start.md).
 
 ## Model List
 
-PaddleSpeech supports a series of most popular models, summarized in [released models](./docs/source/released_model.md) with available pretrained models.
+PaddleSpeech supports a series of the most popular models. They are summarized in [released models](./docs/source/released_model.md), with available pretrained models attached.
 
-Speech-to-Text module contains *Acoustic Model* and *Language Model*, with the following details:
+**Speech-to-Text** contains *Acoustic Model* and *Language Model*, with the following details: