diff --git a/audio/.gitignore b/audio/.gitignore deleted file mode 100644 index e649619ec90aee665e7d96b4c829e0ef8ee75d98..0000000000000000000000000000000000000000 --- a/audio/.gitignore +++ /dev/null @@ -1,7 +0,0 @@ -.ipynb_checkpoints/** -*.ipynb -nohup.out -__pycache__/ -*.wav -*.m4a -obsolete/** diff --git a/audio/.pre-commit-config.yaml b/audio/.pre-commit-config.yaml deleted file mode 100644 index 4100f3480771d7affa0354ebeb8829f3cba0698e..0000000000000000000000000000000000000000 --- a/audio/.pre-commit-config.yaml +++ /dev/null @@ -1,45 +0,0 @@ -repos: -- repo: local - hooks: - - id: yapf - name: yapf - entry: yapf - language: system - args: [-i, --style .style.yapf] - files: \.py$ - -- repo: https://github.com/pre-commit/pre-commit-hooks - rev: a11d9314b22d8f8c7556443875b731ef05965464 - hooks: - - id: check-merge-conflict - - id: check-symlinks - - id: end-of-file-fixer - - id: trailing-whitespace - - id: detect-private-key - - id: check-symlinks - - id: check-added-large-files - -- repo: https://github.com/pycqa/isort - rev: 5.8.0 - hooks: - - id: isort - name: isort (python) - - id: isort - name: isort (cython) - types: [cython] - - id: isort - name: isort (pyi) - types: [pyi] - -- repo: local - hooks: - - id: flake8 - name: flake8 - entry: flake8 - language: system - args: - - --count - - --select=E9,F63,F7,F82 - - --show-source - - --statistics - files: \.py$ diff --git a/audio/.style.yapf b/audio/.style.yapf deleted file mode 100644 index 4741fb4f3bbc6681088cf9e960321e7b857a93a8..0000000000000000000000000000000000000000 --- a/audio/.style.yapf +++ /dev/null @@ -1,3 +0,0 @@ -[style] -based_on_style = pep8 -column_limit = 80 diff --git a/audio/LICENSE b/audio/LICENSE deleted file mode 100644 index 261eeb9e9f8b2b4b0d119366dda99c6fd7d35c64..0000000000000000000000000000000000000000 --- a/audio/LICENSE +++ /dev/null @@ -1,201 +0,0 @@ - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). 
- - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. 
You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. 
In no event and under no legal theory,
-      whether in tort (including negligence), contract, or otherwise,
-      unless required by applicable law (such as deliberate and grossly
-      negligent acts) or agreed to in writing, shall any Contributor be
-      liable to You for damages, including any direct, indirect, special,
-      incidental, or consequential damages of any character arising as a
-      result of this License or out of the use or inability to use the
-      Work (including but not limited to damages for loss of goodwill,
-      work stoppage, computer failure or malfunction, or any and all
-      other commercial damages or losses), even if such Contributor
-      has been advised of the possibility of such damages.
-
-   9. Accepting Warranty or Additional Liability. While redistributing
-      the Work or Derivative Works thereof, You may choose to offer,
-      and charge a fee for, acceptance of support, warranty, indemnity,
-      or other liability obligations and/or rights consistent with this
-      License. However, in accepting such obligations, You may act only
-      on Your own behalf and on Your sole responsibility, not on behalf
-      of any other Contributor, and only if You agree to indemnify,
-      defend, and hold each Contributor harmless for any liability
-      incurred by, or claims asserted against, such Contributor by reason
-      of your accepting any such warranty or additional liability.
-
-   END OF TERMS AND CONDITIONS
-
-   APPENDIX: How to apply the Apache License to your work.
-
-      To apply the Apache License to your work, attach the following
-      boilerplate notice, with the fields enclosed by brackets "[]"
-      replaced with your own identifying information. (Don't include
-      the brackets!)  The text should be enclosed in the appropriate
-      comment syntax for the file format. We also recommend that a
-      file or class name and description of purpose be included on the
-      same "printed page" as the copyright notice for easier
-      identification within third-party archives.
-
-   Copyright [yyyy] [name of copyright owner]
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
diff --git a/audio/README.md b/audio/README.md
deleted file mode 100644
index 9607fd86e497f825e1e4f0957829f9d20caffe2b..0000000000000000000000000000000000000000
--- a/audio/README.md
+++ /dev/null
@@ -1,37 +0,0 @@
-# PaddleAudio: The audio library for PaddlePaddle
-
-## Introduction
-PaddleAudio is an audio toolkit that speeds up your audio research and development loop in PaddlePaddle. It currently provides a collection of audio datasets, feature-extraction functions, audio transforms, and state-of-the-art pre-trained models for sound tagging/classification and anomalous sound detection. More models and features are on the roadmap.
-
-
-## Features
-- Spectrogram and related features are compatible with librosa.
-- State-of-the-art models for sound tagging on Audioset, sound classification on ESC-50, and more to come.
-- Ready-to-use audio embeddings with a single line of code; sound embeddings are available now, with more on the roadmap.
-- Data loading support for common open-source audio datasets in multiple languages, including English and Mandarin.
-
-
-## Install
-```
-git clone https://github.com/PaddlePaddle/models
-cd models/PaddleAudio
-pip install .
-
-```
-
-## Quick start
-### Audio loading and feature extraction
-```
-import paddleaudio as pa
-s, r = pa.load(f)
-mel_spect = pa.melspectrogram(s, sr=r)
-```
-
-### Examples
-We provide a set of examples to help you get started with PaddleAudio quickly.
-- [PANNs: acoustic scene and event analysis using pre-trained models](./examples/panns)
-- [Environmental sound classification on the ESC-50 dataset](./examples/sound_classification)
-- [Training an audio-tagging network on Audioset](./examples/audioset_training)
-
-Please refer to the [example directory](./examples) for more details.
diff --git a/audio/examples/panns/README.md b/audio/examples/panns/README.md
deleted file mode 100644
index 243ebf8e51f066bb386806153ec373847e0170a4..0000000000000000000000000000000000000000
--- a/audio/examples/panns/README.md
+++ /dev/null
@@ -1,128 +0,0 @@
-# Audio Tagging
-
-Sound classification is a single-label classification task, but a given piece of audio can carry multiple labels. For example, a recording made in an ordinary office may contain people speaking, keyboard strokes, mouse clicks, and other indoor background sounds. For general-purpose sound recognition and sound detection, predicting multiple labels for one piece of audio is therefore highly practical.
-
-At IEEE ICASSP 2017, Google released the large-scale audio dataset [Audioset](https://research.google.com/audioset/). The dataset contains 632 audio classes and 2,084,320 human-labeled 10-second sound clips taken from YouTube videos. It now covers 2.1 million annotated videos and 5,800 hours of audio, and the labeled sound samples span 527 label classes.
-
-`PANNs` ([PANNs: Large-Scale Pretrained Audio Neural Networks for Audio Pattern Recognition](https://arxiv.org/pdf/1912.10211.pdf)) are sound classification/recognition models trained on the Audioset dataset. Because their pre-training task is multi-label sound recognition, they can be used for real-time audio tagging.
-
-This example uses pre-trained `PANNs` models to tag input audio in real time against the Audioset label set, and outputs the top-k classes and their scores at each time step as text.
-
-
-## Model introduction
-
-PaddleAudio provides the CNN14, CNN10, and CNN6 pre-trained PANNs models for users to choose from:
-- CNN14: 12 convolutional layers and 2 fully connected layers, 79.6M parameters, embedding dimension 2048.
-- CNN10: 8 convolutional layers and 2 fully connected layers, 4.9M parameters, embedding dimension 512.
-- CNN6: 4 convolutional layers and 2 fully connected layers, 4.5M parameters, embedding dimension 512.
-
-
-## Quick start
-
-### Model prediction
-
-```shell
-export CUDA_VISIBLE_DEVICES=0
-python audio_tag.py --device gpu --wav ./cat.wav --sample_duration 2 --hop_duration 0.3 --output_dir ./output_dir
-```
-
-Configurable arguments:
-
-- `device`: Device to run prediction on, cpu or gpu; defaults to gpu. When gpu is used, the GPU card can be selected via `CUDA_VISIBLE_DEVICES`, as in the example above.
-- `wav`: Audio file to predict on.
-- `sample_duration`: Duration, in seconds, of the audio segment used for each prediction; defaults to 2s.
-- `hop_duration`: Interval, in seconds, between two consecutive prediction segments; defaults to 0.3s.
-- `output_dir`: Directory where prediction results are saved; defaults to `./output_dir`.
-
-The example code uses the pre-trained `CNN14` model. To switch to another pre-trained model, use one of the following:
-```python
-from paddleaudio.models.panns import cnn14, cnn10, cnn6
-
-# CNN14
-model = cnn14(pretrained=True, extract_embedding=False)
-# CNN10
-model = cnn10(pretrained=True, extract_embedding=False)
-# CNN6
-model = cnn6(pretrained=True, extract_embedding=False)
-```
-
-Output:
-```
-[2021-04-30 19:15:41,025] [ INFO] - Saved tagging results to ./output_dir/audioset_tagging_sr_44100.npz
-```
-
-After the run, the scores are saved to an `.npz` file in `output_dir`.
-
-
-### Generating tagging label text
-```shell
-python parse_result.py --tagging_file ./output_dir/audioset_tagging_sr_44100.npz --top_k 10 --smooth True --smooth_size 5 --label_file ./assets/audioset_labels.txt --output_dir ./output_dir
-```
-
-Configurable arguments:
-
-- `tagging_file`: The model prediction result file.
-- `top_k`: Number of highest-scoring labels to keep from the predictions; defaults to 10.
-- `smooth`: Whether to apply posterior-probability smoothing to the predictions; defaults to True.
-- `smooth_size`: Number of samples in the smoothing window; defaults to 5.
-- `label_file`: Text file with the Audioset classes corresponding to the predictions.
-- `output_dir`: Directory where the label text is saved; defaults to `./output_dir`.
-
-Output:
-```
-[2021-04-30 19:26:58,743] [ INFO] - Posterior smoothing...
-[2021-04-30 19:26:58,746] [ INFO] - Saved tagging labels to ./output_dir/audioset_tagging_sr_44100.txt
-```
-
-After the run, the text results are saved to a `.txt` file in `output_dir`.
-
-
-## Tagging label text
-
-The final text output looks like the following.
-The top-k results for each time window of the sample are separated by blank lines. In each block, the first line carries the time information: the number marks the starting point of the tagging result, given as the ratio of the current time `t` to the total audio length `T`; the next k lines are the corresponding labels and scores.
-
-```
-0.0
-Cat: 0.9144676923751831
-Animal: 0.8855036497116089
-Domestic animals, pets: 0.804577112197876
-Meow: 0.7422927021980286
-Music: 0.19959309697151184
-Inside, small room: 0.12550437450408936
-Caterwaul: 0.021584441885352135
-Purr: 0.020247288048267365
-Speech: 0.018197158351540565
-Vehicle: 0.007446660194545984
-
-0.059197544398158296
-Cat: 0.9250872135162354
-Animal: 0.8957151174545288
-Domestic animals, pets: 0.8228275775909424
-Meow: 0.7650775909423828
-Music: 0.20210561156272888
-Inside, small room: 0.12290887534618378
-Caterwaul: 0.029371455311775208
-Purr: 0.018731823191046715
-Speech: 0.017130598425865173
-Vehicle: 0.007748497650027275
-
-0.11839508879631659
-Cat: 0.9336574673652649
-Animal: 0.9111202359199524
-Domestic animals, pets: 0.8349071145057678
-Meow: 0.7761964797973633
-Music: 0.20467285811901093
-Inside, small room: 0.10709915310144424
-Caterwaul: 0.05370649695396423
-Purr: 0.018830426037311554
-Speech: 0.017361722886562347
-Vehicle: 0.006929398979991674
-
-...
-...
-```
-
-The following [Demo](https://bj.bcebos.com/paddleaudio/media/audio_tagging_demo.mp4) shows an example of rendering the tagging labels onto a video, with multi-label prediction performed on the audio in real time.
-
-![](https://bj.bcebos.com/paddleaudio/media/audio_tagging_demo.gif)
diff --git a/audio/examples/panns/assets/audioset_labels.txt b/audio/examples/panns/assets/audioset_labels.txt
deleted file mode 100644
index 6fccf56a769d23191f5482b2805127cb68c70888..0000000000000000000000000000000000000000
--- a/audio/examples/panns/assets/audioset_labels.txt
+++ /dev/null
@@ -1,527 +0,0 @@
-Speech
-Male speech, man speaking
-Female speech, woman speaking
-Child speech, kid speaking
-Conversation
-Narration, monologue
-Babbling
-Speech synthesizer
-Shout
-Bellow
-Whoop
-Yell
-Battle cry
-Children shouting
-Screaming
-Whispering
-Laughter
-Baby laughter
-Giggle
-Snicker
-Belly laugh
-Chuckle, chortle
-Crying, sobbing
-Baby cry, infant cry
-Whimper
-Wail, moan
-Sigh
-Singing
-Choir
-Yodeling
-Chant
-Mantra
-Male singing
-Female singing
-Child singing
-Synthetic singing
-Rapping
-Humming
-Groan
-Grunt
-Whistling
-Breathing
-Wheeze
-Snoring
-Gasp
-Pant
-Snort
-Cough
-Throat clearing
-Sneeze
-Sniff
-Run
-Shuffle
-Walk, footsteps
-Chewing, mastication
-Biting
-Gargling
-Stomach rumble
-Burping, eructation
-Hiccup
-Fart
-Hands
-Finger snapping
-Clapping
-Heart sounds, heartbeat
-Heart murmur
-Cheering
-Applause
-Chatter
-Crowd
-Hubbub, speech noise, speech babble
-Children playing
-Animal
-Domestic animals, pets
-Dog
-Bark
-Yip
-Howl
-Bow-wow
-Growling
-Whimper (dog)
-Cat
-Purr
-Meow
-Hiss
-Caterwaul
-Livestock, farm animals, working animals
-Horse
-Clip-clop
-Neigh, whinny
-Cattle, bovinae
-Moo
-Cowbell
-Pig
-Oink
-Goat
-Bleat
-Sheep
-Fowl
-Chicken, rooster
-Cluck
-Crowing, cock-a-doodle-doo
-Turkey
-Gobble
-Duck
-Quack
-Goose
-Honk
-Wild animals
-Roaring cats (lions, tigers)
-Roar
-Bird
-Bird vocalization, bird call, bird song
-Chirp, tweet
-Squawk
-Pigeon, dove
-Coo
-Crow
-Caw
-Owl
-Hoot
-Bird flight, flapping wings
-Canidae, dogs, wolves
-Rodents, rats, mice
-Mouse
-Patter
-Insect
-Cricket
-Mosquito
-Fly, housefly
-Buzz
-Bee, wasp, etc.
-Frog -Croak -Snake -Rattle -Whale vocalization -Music -Musical instrument -Plucked string instrument -Guitar -Electric guitar -Bass guitar -Acoustic guitar -Steel guitar, slide guitar -Tapping (guitar technique) -Strum -Banjo -Sitar -Mandolin -Zither -Ukulele -Keyboard (musical) -Piano -Electric piano -Organ -Electronic organ -Hammond organ -Synthesizer -Sampler -Harpsichord -Percussion -Drum kit -Drum machine -Drum -Snare drum -Rimshot -Drum roll -Bass drum -Timpani -Tabla -Cymbal -Hi-hat -Wood block -Tambourine -Rattle (instrument) -Maraca -Gong -Tubular bells -Mallet percussion -Marimba, xylophone -Glockenspiel -Vibraphone -Steelpan -Orchestra -Brass instrument -French horn -Trumpet -Trombone -Bowed string instrument -String section -Violin, fiddle -Pizzicato -Cello -Double bass -Wind instrument, woodwind instrument -Flute -Saxophone -Clarinet -Harp -Bell -Church bell -Jingle bell -Bicycle bell -Tuning fork -Chime -Wind chime -Change ringing (campanology) -Harmonica -Accordion -Bagpipes -Didgeridoo -Shofar -Theremin -Singing bowl -Scratching (performance technique) -Pop music -Hip hop music -Beatboxing -Rock music -Heavy metal -Punk rock -Grunge -Progressive rock -Rock and roll -Psychedelic rock -Rhythm and blues -Soul music -Reggae -Country -Swing music -Bluegrass -Funk -Folk music -Middle Eastern music -Jazz -Disco -Classical music -Opera -Electronic music -House music -Techno -Dubstep -Drum and bass -Electronica -Electronic dance music -Ambient music -Trance music -Music of Latin America -Salsa music -Flamenco -Blues -Music for children -New-age music -Vocal music -A capella -Music of Africa -Afrobeat -Christian music -Gospel music -Music of Asia -Carnatic music -Music of Bollywood -Ska -Traditional music -Independent music -Song -Background music -Theme music -Jingle (music) -Soundtrack music -Lullaby -Video game music -Christmas music -Dance music -Wedding music -Happy music -Funny music -Sad music -Tender music -Exciting music -Angry music -Scary music -Wind -Rustling leaves -Wind noise (microphone) -Thunderstorm -Thunder -Water -Rain -Raindrop -Rain on surface -Stream -Waterfall -Ocean -Waves, surf -Steam -Gurgling -Fire -Crackle -Vehicle -Boat, Water vehicle -Sailboat, sailing ship -Rowboat, canoe, kayak -Motorboat, speedboat -Ship -Motor vehicle (road) -Car -Vehicle horn, car horn, honking -Toot -Car alarm -Power windows, electric windows -Skidding -Tire squeal -Car passing by -Race car, auto racing -Truck -Air brake -Air horn, truck horn -Reversing beeps -Ice cream truck, ice cream van -Bus -Emergency vehicle -Police car (siren) -Ambulance (siren) -Fire engine, fire truck (siren) -Motorcycle -Traffic noise, roadway noise -Rail transport -Train -Train whistle -Train horn -Railroad car, train wagon -Train wheels squealing -Subway, metro, underground -Aircraft -Aircraft engine -Jet engine -Propeller, airscrew -Helicopter -Fixed-wing aircraft, airplane -Bicycle -Skateboard -Engine -Light engine (high frequency) -Dental drill, dentist's drill -Lawn mower -Chainsaw -Medium engine (mid frequency) -Heavy engine (low frequency) -Engine knocking -Engine starting -Idling -Accelerating, revving, vroom -Door -Doorbell -Ding-dong -Sliding door -Slam -Knock -Tap -Squeak -Cupboard open or close -Drawer open or close -Dishes, pots, and pans -Cutlery, silverware -Chopping (food) -Frying (food) -Microwave oven -Blender -Water tap, faucet -Sink (filling or washing) -Bathtub (filling or washing) -Hair dryer -Toilet flush -Toothbrush -Electric toothbrush -Vacuum cleaner -Zipper (clothing) -Keys 
jangling -Coin (dropping) -Scissors -Electric shaver, electric razor -Shuffling cards -Typing -Typewriter -Computer keyboard -Writing -Alarm -Telephone -Telephone bell ringing -Ringtone -Telephone dialing, DTMF -Dial tone -Busy signal -Alarm clock -Siren -Civil defense siren -Buzzer -Smoke detector, smoke alarm -Fire alarm -Foghorn -Whistle -Steam whistle -Mechanisms -Ratchet, pawl -Clock -Tick -Tick-tock -Gears -Pulleys -Sewing machine -Mechanical fan -Air conditioning -Cash register -Printer -Camera -Single-lens reflex camera -Tools -Hammer -Jackhammer -Sawing -Filing (rasp) -Sanding -Power tool -Drill -Explosion -Gunshot, gunfire -Machine gun -Fusillade -Artillery fire -Cap gun -Fireworks -Firecracker -Burst, pop -Eruption -Boom -Wood -Chop -Splinter -Crack -Glass -Chink, clink -Shatter -Liquid -Splash, splatter -Slosh -Squish -Drip -Pour -Trickle, dribble -Gush -Fill (with liquid) -Spray -Pump (liquid) -Stir -Boiling -Sonar -Arrow -Whoosh, swoosh, swish -Thump, thud -Thunk -Electronic tuner -Effects unit -Chorus effect -Basketball bounce -Bang -Slap, smack -Whack, thwack -Smash, crash -Breaking -Bouncing -Whip -Flap -Scratch -Scrape -Rub -Roll -Crushing -Crumpling, crinkling -Tearing -Beep, bleep -Ping -Ding -Clang -Squeal -Creak -Rustle -Whir -Clatter -Sizzle -Clicking -Clickety-clack -Rumble -Plop -Jingle, tinkle -Hum -Zing -Boing -Crunch -Silence -Sine wave -Harmonic -Chirp tone -Sound effect -Pulse -Inside, small room -Inside, large room or hall -Inside, public space -Outside, urban or manmade -Outside, rural or natural -Reverberation -Echo -Noise -Environmental noise -Static -Mains hum -Distortion -Sidetone -Cacophony -White noise -Pink noise -Throbbing -Vibration -Television -Radio -Field recording diff --git a/audio/examples/panns/audio_tag.py b/audio/examples/panns/audio_tag.py deleted file mode 100644 index 6f08cd1ce3260852391d77095e6ea9d1d1f1e3ad..0000000000000000000000000000000000000000 --- a/audio/examples/panns/audio_tag.py +++ /dev/null @@ -1,111 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import argparse -import os -from typing import List - -import numpy as np -import paddle -from paddleaudio.backends import load as load_audio -from paddleaudio.features import melspectrogram -from paddleaudio.models.panns import cnn14 -from paddleaudio.utils import logger - -# yapf: disable -parser = argparse.ArgumentParser(__doc__) -parser.add_argument('--device', choices=['cpu', 'gpu'], default='gpu', help='Select which device to predict, defaults to gpu.') -parser.add_argument('--wav', type=str, required=True, help='Audio file to infer.') -parser.add_argument('--sample_duration', type=float, default=2.0, help='Duration(in seconds) of tagging samples to predict.') -parser.add_argument('--hop_duration', type=float, default=0.3, help='Duration(in seconds) between two samples.') -parser.add_argument('--output_dir', type=str, default='./output_dir', help='Directory to save tagging result.') -args = parser.parse_args() -# yapf: enable - - -def split(waveform: np.ndarray, win_size: int, hop_size: int): - """ - Split into N waveforms. - N is decided by win_size and hop_size. - """ - assert isinstance(waveform, np.ndarray) - time = [] - data = [] - for i in range(0, len(waveform), hop_size): - segment = waveform[i:i + win_size] - if len(segment) < win_size: - segment = np.pad(segment, (0, win_size - len(segment))) - data.append(segment) - time.append(i / len(waveform)) - return time, data - - -def batchify(data: List[List[float]], - sample_rate: int, - batch_size: int, - **kwargs): - """ - Extract features from waveforms and create batches. - """ - examples = [] - for waveform in data: - feats = melspectrogram(waveform, sample_rate, **kwargs).transpose() - examples.append(feats) - - # Seperates data into some batches. - one_batch = [] - for example in examples: - one_batch.append(example) - if len(one_batch) == batch_size: - yield one_batch - one_batch = [] - if one_batch: - yield one_batch - - -def predict(model, data: List[List[float]], sample_rate: int, - batch_size: int=1): - """ - Use pretrained model to make predictions. - """ - batches = batchify(data, sample_rate, batch_size) - results = None - model.eval() - for batch in batches: - feats = paddle.to_tensor(batch).unsqueeze(1) \ - # (batch_size, num_frames, num_melbins) -> (batch_size, 1, num_frames, num_melbins) - - audioset_scores = model(feats) - if results is None: - results = audioset_scores.numpy() - else: - results = np.concatenate((results, audioset_scores.numpy())) - - return results - - -if __name__ == '__main__': - paddle.set_device(args.device) - model = cnn14(pretrained=True, extract_embedding=False) - waveform, sr = load_audio(args.wav, sr=None) - time, data = split(waveform, - int(args.sample_duration * sr), - int(args.hop_duration * sr)) - results = predict(model, data, sr, batch_size=8) - - if not os.path.exists(args.output_dir): - os.makedirs(args.output_dir) - time = np.arange(0, 1, int(args.hop_duration * sr) / len(waveform)) - output_file = os.path.join(args.output_dir, f'audioset_tagging_sr_{sr}.npz') - np.savez(output_file, time=time, scores=results) - logger.info(f'Saved tagging results to {output_file}') diff --git a/audio/examples/panns/parse_result.py b/audio/examples/panns/parse_result.py deleted file mode 100644 index 056c573f2e369c48ca099de9c08ad9dd07d5159a..0000000000000000000000000000000000000000 --- a/audio/examples/panns/parse_result.py +++ /dev/null @@ -1,83 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import argparse -import ast -import os -from typing import Dict - -import numpy as np -from paddleaudio.utils import logger - -# yapf: disable -parser = argparse.ArgumentParser(__doc__) -parser.add_argument('--tagging_file', type=str, required=True, help='') -parser.add_argument('--top_k', type=int, default=10, help='Get top k predicted results of audioset labels.') -parser.add_argument('--smooth', type=ast.literal_eval, default=True, help='Set "True" to apply posterior smoothing.') -parser.add_argument('--smooth_size', type=int, default=5, help='Window size of posterior smoothing.') -parser.add_argument('--label_file', type=str, default='./assets/audioset_labels.txt', help='File of audioset labels.') -parser.add_argument('--output_dir', type=str, default='./output_dir', help='Directory to save tagging labels.') -args = parser.parse_args() -# yapf: enable - - -def smooth(results: np.ndarray, win_size: int): - """ - Execute posterior smoothing in-place. - """ - for i in range(len(results) - 1, -1, -1): - if i < win_size - 1: - left = 0 - else: - left = i + 1 - win_size - results[i] = np.sum(results[left:i + 1], axis=0) / (i - left + 1) - - -def generate_topk_label(k: int, label_map: Dict, result: np.ndarray): - """ - Return top k result. - """ - result = np.asarray(result) - topk_idx = (-result).argsort()[:k] - - ret = '' - for idx in topk_idx: - label, score = label_map[idx], result[idx] - ret += f'{label}: {score}\n' - return ret - - -if __name__ == "__main__": - label_map = {} - with open(args.label_file, 'r') as f: - for i, l in enumerate(f.readlines()): - label_map[i] = l.strip() - - results = np.load(args.tagging_file, allow_pickle=True) - times, scores = results['time'], results['scores'] - - if args.smooth: - logger.info('Posterior smoothing...') - smooth(scores, win_size=args.smooth_size) - - if not os.path.exists(args.output_dir): - os.makedirs(args.output_dir) - output_file = os.path.join( - args.output_dir, - os.path.basename(args.tagging_file).split('.')[0] + '.txt') - with open(output_file, 'w') as f: - for time, score in zip(times, scores): - f.write(f'{time}\n') - f.write(generate_topk_label(args.top_k, label_map, score) + '\n') - - logger.info(f'Saved tagging labels to {output_file}') diff --git a/audio/paddleaudio/datasets/aishell.py b/audio/paddleaudio/datasets/aishell.py deleted file mode 100644 index d84d9876c07a9dd6bbf32bd8d187ded4a73b7a23..0000000000000000000000000000000000000000 --- a/audio/paddleaudio/datasets/aishell.py +++ /dev/null @@ -1,154 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import codecs -import collections -import json -import os -from typing import Dict - -from paddle.io import Dataset -from tqdm import tqdm - -from ..backends import load as load_audio -from ..utils.download import decompress -from ..utils.download import download_and_decompress -from ..utils.env import DATA_HOME -from ..utils.log import logger -from .dataset import feat_funcs - -__all__ = ['AISHELL1'] - - -class AISHELL1(Dataset): - """ - This Open Source Mandarin Speech Corpus, AISHELL-ASR0009-OS1, is 178 hours long. - It is a part of AISHELL-ASR0009, of which utterance contains 11 domains, including - smart home, autonomous driving, and industrial production. The whole recording was - put in quiet indoor environment, using 3 different devices at the same time: high - fidelity microphone (44.1kHz, 16-bit,); Android-system mobile phone (16kHz, 16-bit), - iOS-system mobile phone (16kHz, 16-bit). Audios in high fidelity were re-sampled - to 16kHz to build AISHELL- ASR0009-OS1. 400 speakers from different accent areas - in China were invited to participate in the recording. The manual transcription - accuracy rate is above 95%, through professional speech annotation and strict - quality inspection. The corpus is divided into training, development and testing - sets. - - Reference: - AISHELL-1: An Open-Source Mandarin Speech Corpus and A Speech Recognition Baseline - https://arxiv.org/abs/1709.05522 - """ - - archieves = [ - { - 'url': 'http://www.openslr.org/resources/33/data_aishell.tgz', - 'md5': '2f494334227864a8a8fec932999db9d8', - }, - ] - text_meta = os.path.join('data_aishell', 'transcript', - 'aishell_transcript_v0.8.txt') - utt_info = collections.namedtuple('META_INFO', - ('file_path', 'utt_id', 'text')) - audio_path = os.path.join('data_aishell', 'wav') - manifest_path = os.path.join('data_aishell', 'manifest') - subset = ['train', 'dev', 'test'] - - def __init__(self, subset: str='train', feat_type: str='raw', **kwargs): - assert subset in self.subset, 'Dataset subset must be one in {}, but got {}'.format( - self.subset, subset) - self.subset = subset - self.feat_type = feat_type - self.feat_config = kwargs - self._data = self._get_data() - super(AISHELL1, self).__init__() - - def _get_text_info(self) -> Dict[str, str]: - ret = {} - with open(os.path.join(DATA_HOME, self.text_meta), 'r') as rf: - for line in rf.readlines()[1:]: - utt_id, text = map(str.strip, line.split(' ', - 1)) # utt_id, text - ret.update({utt_id: ''.join(text.split())}) - return ret - - def _get_data(self): - if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \ - not os.path.isfile(os.path.join(DATA_HOME, self.text_meta)): - download_and_decompress(self.archieves, DATA_HOME) - # Extract *wav from *.tar.gz. 
- for root, _, files in os.walk( - os.path.join(DATA_HOME, self.audio_path)): - for file in files: - if file.endswith('.tar.gz'): - decompress(os.path.join(root, file)) - os.remove(os.path.join(root, file)) - - text_info = self._get_text_info() - - data = [] - for root, _, files in os.walk( - os.path.join(DATA_HOME, self.audio_path, self.subset)): - for file in files: - if file.endswith('.wav'): - utt_id = os.path.splitext(file)[0] - if utt_id not in text_info: # There are some utt_id that without label - continue - text = text_info[utt_id] - file_path = os.path.join(root, file) - data.append(self.utt_info(file_path, utt_id, text)) - - return data - - def _convert_to_record(self, idx: int): - sample = self._data[idx] - - record = {} - # To show all fields in a namedtuple: `type(sample)._fields` - for field in type(sample)._fields: - record[field] = getattr(sample, field) - - waveform, sr = load_audio( - sample[0]) # The first element of sample is file path - feat_func = feat_funcs[self.feat_type] - feat = feat_func( - waveform, sample_rate=sr, - **self.feat_config) if feat_func else waveform - record.update({'feat': feat, 'duration': len(waveform) / sr}) - return record - - def create_manifest(self, prefix='manifest'): - if not os.path.isdir(os.path.join(DATA_HOME, self.manifest_path)): - os.makedirs(os.path.join(DATA_HOME, self.manifest_path)) - - manifest_file = os.path.join(DATA_HOME, self.manifest_path, - f'{prefix}.{self.subset}') - with codecs.open(manifest_file, 'w', 'utf-8') as f: - for idx in tqdm(range(len(self))): - record = self._convert_to_record(idx) - record_line = json.dumps( - { - 'utt': record['utt_id'], - 'feat': record['file_path'], - 'feat_shape': (record['duration'], ), - 'text': record['text'] - }, - ensure_ascii=False) - f.write(record_line + '\n') - logger.info(f'Manifest file {manifest_file} created.') - - def __getitem__(self, idx): - record = self._convert_to_record(idx) - return tuple(record.values()) - - def __len__(self): - return len(self._data) diff --git a/audio/paddleaudio/datasets/dcase.py b/audio/paddleaudio/datasets/dcase.py deleted file mode 100644 index 47b0c9150667a3b77434e9f090cae9b2412d8293..0000000000000000000000000000000000000000 --- a/audio/paddleaudio/datasets/dcase.py +++ /dev/null @@ -1,298 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import collections -import os -from typing import List -from typing import Tuple - -from ..utils.download import download_and_decompress -from ..utils.env import DATA_HOME -from .dataset import AudioClassificationDataset - -__all__ = ['UrbanAcousticScenes', 'UrbanAudioVisualScenes'] - - -class UrbanAcousticScenes(AudioClassificationDataset): - """ - TAU Urban Acoustic Scenes 2020 Mobile Development dataset contains recordings from - 12 European cities in 10 different acoustic scenes using 4 different devices. - Additionally, synthetic data for 11 mobile devices was created based on the original - recordings. 
Of the 12 cities, two are present only in the evaluation set. - - Reference: - A multi-device dataset for urban acoustic scene classification - https://arxiv.org/abs/1807.09840 - """ - - source_url = 'https://zenodo.org/record/3819968/files/' - base_name = 'TAU-urban-acoustic-scenes-2020-mobile-development' - archieves = [ - { - 'url': source_url + base_name + '.meta.zip', - 'md5': '6eae9db553ce48e4ea246e34e50a3cf5', - }, - { - 'url': source_url + base_name + '.audio.1.zip', - 'md5': 'b1e85b8a908d3d6a6ab73268f385d5c8', - }, - { - 'url': source_url + base_name + '.audio.2.zip', - 'md5': '4310a13cc2943d6ce3f70eba7ba4c784', - }, - { - 'url': source_url + base_name + '.audio.3.zip', - 'md5': 'ed38956c4246abb56190c1e9b602b7b8', - }, - { - 'url': source_url + base_name + '.audio.4.zip', - 'md5': '97ab8560056b6816808dedc044dcc023', - }, - { - 'url': source_url + base_name + '.audio.5.zip', - 'md5': 'b50f5e0bfed33cd8e52cb3e7f815c6cb', - }, - { - 'url': source_url + base_name + '.audio.6.zip', - 'md5': 'fbf856a3a86fff7520549c899dc94372', - }, - { - 'url': source_url + base_name + '.audio.7.zip', - 'md5': '0dbffe7b6e45564da649378723284062', - }, - { - 'url': source_url + base_name + '.audio.8.zip', - 'md5': 'bb6f77832bf0bd9f786f965beb251b2e', - }, - { - 'url': source_url + base_name + '.audio.9.zip', - 'md5': 'a65596a5372eab10c78e08a0de797c9e', - }, - { - 'url': source_url + base_name + '.audio.10.zip', - 'md5': '2ad595819ffa1d56d2de4c7ed43205a6', - }, - { - 'url': source_url + base_name + '.audio.11.zip', - 'md5': '0ad29f7040a4e6a22cfd639b3a6738e5', - }, - { - 'url': source_url + base_name + '.audio.12.zip', - 'md5': 'e5f4400c6b9697295fab4cf507155a2f', - }, - { - 'url': source_url + base_name + '.audio.13.zip', - 'md5': '8855ab9f9896422746ab4c5d89d8da2f', - }, - { - 'url': source_url + base_name + '.audio.14.zip', - 'md5': '092ad744452cd3e7de78f988a3d13020', - }, - { - 'url': source_url + base_name + '.audio.15.zip', - 'md5': '4b5eb85f6592aebf846088d9df76b420', - }, - { - 'url': source_url + base_name + '.audio.16.zip', - 'md5': '2e0a89723e58a3836be019e6996ae460', - }, - ] - label_list = [ - 'airport', 'shopping_mall', 'metro_station', 'street_pedestrian', - 'public_square', 'street_traffic', 'tram', 'bus', 'metro', 'park' - ] - - meta = os.path.join(base_name, 'meta.csv') - meta_info = collections.namedtuple('META_INFO', ( - 'filename', 'scene_label', 'identifier', 'source_label')) - subset_meta = { - 'train': os.path.join(base_name, 'evaluation_setup', 'fold1_train.csv'), - 'dev': - os.path.join(base_name, 'evaluation_setup', 'fold1_evaluate.csv'), - 'test': os.path.join(base_name, 'evaluation_setup', 'fold1_test.csv'), - } - subset_meta_info = collections.namedtuple('SUBSET_META_INFO', - ('filename', 'scene_label')) - audio_path = os.path.join(base_name, 'audio') - - def __init__(self, mode: str='train', feat_type: str='raw', **kwargs): - """ - Ags: - mode (:obj:`str`, `optional`, defaults to `train`): - It identifies the dataset mode (train or dev). - feat_type (:obj:`str`, `optional`, defaults to `raw`): - It identifies the feature type that user wants to extrace of an audio file. 
- """ - files, labels = self._get_data(mode) - super(UrbanAcousticScenes, self).__init__( - files=files, labels=labels, feat_type=feat_type, **kwargs) - - def _get_meta_info(self, subset: str=None, - skip_header: bool=True) -> List[collections.namedtuple]: - if subset is None: - meta_file = self.meta - meta_info = self.meta_info - else: - assert subset in self.subset_meta, f'Subset must be one in {list(self.subset_meta.keys())}, but got {subset}.' - meta_file = self.subset_meta[subset] - meta_info = self.subset_meta_info - - ret = [] - with open(os.path.join(DATA_HOME, meta_file), 'r') as rf: - lines = rf.readlines()[1:] if skip_header else rf.readlines() - for line in lines: - ret.append(meta_info(*line.strip().split('\t'))) - return ret - - def _get_data(self, mode: str) -> Tuple[List[str], List[int]]: - if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \ - not os.path.isfile(os.path.join(DATA_HOME, self.meta)): - download_and_decompress(self.archieves, DATA_HOME) - - meta_info = self._get_meta_info(subset=mode, skip_header=True) - - files = [] - labels = [] - for sample in meta_info: - filename, label = sample[:2] - filename = os.path.basename(filename) - target = self.label_list.index(label) - - files.append(os.path.join(DATA_HOME, self.audio_path, filename)) - labels.append(int(target)) - - return files, labels - - -class UrbanAudioVisualScenes(AudioClassificationDataset): - """ - TAU Urban Audio Visual Scenes 2021 Development dataset contains synchronized audio - and video recordings from 12 European cities in 10 different scenes. - This dataset consists of 10-seconds audio and video segments from 10 - acoustic scenes. The total amount of audio in the development set is 34 hours. - - Reference: - A Curated Dataset of Urban Scenes for Audio-Visual Scene Analysis - https://arxiv.org/abs/2011.00030 - """ - - source_url = 'https://zenodo.org/record/4477542/files/' - base_name = 'TAU-urban-audio-visual-scenes-2021-development' - - archieves = [ - { - 'url': source_url + base_name + '.meta.zip', - 'md5': '76e3d7ed5291b118372e06379cb2b490', - }, - { - 'url': source_url + base_name + '.audio.1.zip', - 'md5': '186f6273f8f69ed9dbdc18ad65ac234f', - }, - { - 'url': source_url + base_name + '.audio.2.zip', - 'md5': '7fd6bb63127f5785874a55aba4e77aa5', - }, - { - 'url': source_url + base_name + '.audio.3.zip', - 'md5': '61396bede29d7c8c89729a01a6f6b2e2', - }, - { - 'url': source_url + base_name + '.audio.4.zip', - 'md5': '6ddac89717fcf9c92c451868eed77fe1', - }, - { - 'url': source_url + base_name + '.audio.5.zip', - 'md5': 'af4820756cdf1a7d4bd6037dc034d384', - }, - { - 'url': source_url + base_name + '.audio.6.zip', - 'md5': 'ebd11ec24411f2a17a64723bd4aa7fff', - }, - { - 'url': source_url + base_name + '.audio.7.zip', - 'md5': '2be39a76aeed704d5929d020a2909efd', - }, - { - 'url': source_url + base_name + '.audio.8.zip', - 'md5': '972d8afe0874720fc2f28086e7cb22a9', - }, - ] - label_list = [ - 'airport', 'shopping_mall', 'metro_station', 'street_pedestrian', - 'public_square', 'street_traffic', 'tram', 'bus', 'metro', 'park' - ] - - meta_base_path = os.path.join(base_name, base_name + '.meta') - meta = os.path.join(meta_base_path, 'meta.csv') - meta_info = collections.namedtuple('META_INFO', ( - 'filename_audio', 'filename_video', 'scene_label', 'identifier')) - subset_meta = { - 'train': - os.path.join(meta_base_path, 'evaluation_setup', 'fold1_train.csv'), - 'dev': - os.path.join(meta_base_path, 'evaluation_setup', 'fold1_evaluate.csv'), - 'test': - os.path.join(meta_base_path, 
'evaluation_setup', 'fold1_test.csv'), - } - subset_meta_info = collections.namedtuple('SUBSET_META_INFO', ( - 'filename_audio', 'filename_video', 'scene_label')) - audio_path = os.path.join(base_name, 'audio') - - def __init__(self, mode: str='train', feat_type: str='raw', **kwargs): - """ - Ags: - mode (:obj:`str`, `optional`, defaults to `train`): - It identifies the dataset mode (train or dev). - feat_type (:obj:`str`, `optional`, defaults to `raw`): - It identifies the feature type that user wants to extrace of an audio file. - """ - files, labels = self._get_data(mode) - super(UrbanAudioVisualScenes, self).__init__( - files=files, labels=labels, feat_type=feat_type, **kwargs) - - def _get_meta_info(self, subset: str=None, - skip_header: bool=True) -> List[collections.namedtuple]: - if subset is None: - meta_file = self.meta - meta_info = self.meta_info - else: - assert subset in self.subset_meta, f'Subset must be one in {list(self.subset_meta.keys())}, but got {subset}.' - meta_file = self.subset_meta[subset] - meta_info = self.subset_meta_info - - ret = [] - with open(os.path.join(DATA_HOME, meta_file), 'r') as rf: - lines = rf.readlines()[1:] if skip_header else rf.readlines() - for line in lines: - ret.append(meta_info(*line.strip().split('\t'))) - return ret - - def _get_data(self, mode: str) -> Tuple[List[str], List[int]]: - if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \ - not os.path.isfile(os.path.join(DATA_HOME, self.meta)): - download_and_decompress(self.archieves, - os.path.join(DATA_HOME, self.base_name)) - - meta_info = self._get_meta_info(subset=mode, skip_header=True) - - files = [] - labels = [] - for sample in meta_info: - filename, _, label = sample[:3] - filename = os.path.basename(filename) - target = self.label_list.index(label) - - files.append(os.path.join(DATA_HOME, self.audio_path, filename)) - labels.append(int(target)) - - return files, labels diff --git a/audio/paddleaudio/datasets/librispeech.py b/audio/paddleaudio/datasets/librispeech.py deleted file mode 100644 index c3b3c83df869d58133ed325fad240dc31fa06522..0000000000000000000000000000000000000000 --- a/audio/paddleaudio/datasets/librispeech.py +++ /dev/null @@ -1,199 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import codecs -import collections -import json -import os -from typing import Dict - -from paddle.io import Dataset -from tqdm import tqdm - -from ..backends import load as load_audio -from ..utils.download import download_and_decompress -from ..utils.env import DATA_HOME -from ..utils.log import logger -from .dataset import feat_funcs - -__all__ = ['LIBRISPEECH'] - - -class LIBRISPEECH(Dataset): - """ - LibriSpeech is a corpus of approximately 1000 hours of 16kHz read English speech, - prepared by Vassil Panayotov with the assistance of Daniel Povey. The data is - derived from read audiobooks from the LibriVox project, and has been carefully - segmented and aligned. 
- - Reference: - LIBRISPEECH: AN ASR CORPUS BASED ON PUBLIC DOMAIN AUDIO BOOKS - http://www.danielpovey.com/files/2015_icassp_librispeech.pdf - https://arxiv.org/abs/1709.05522 - """ - - source_url = 'http://www.openslr.org/resources/12/' - archieves = [ - { - 'url': source_url + 'train-clean-100.tar.gz', - 'md5': '2a93770f6d5c6c964bc36631d331a522', - }, - { - 'url': source_url + 'train-clean-360.tar.gz', - 'md5': 'c0e676e450a7ff2f54aeade5171606fa', - }, - { - 'url': source_url + 'train-other-500.tar.gz', - 'md5': 'd1a0fd59409feb2c614ce4d30c387708', - }, - { - 'url': source_url + 'dev-clean.tar.gz', - 'md5': '42e2234ba48799c1f50f24a7926300a1', - }, - { - 'url': source_url + 'dev-other.tar.gz', - 'md5': 'c8d0bcc9cca99d4f8b62fcc847357931', - }, - { - 'url': source_url + 'test-clean.tar.gz', - 'md5': '32fa31d27d2e1cad72775fee3f4849a9', - }, - { - 'url': source_url + 'test-other.tar.gz', - 'md5': 'fb5a50374b501bb3bac4815ee91d3135', - }, - ] - speaker_meta = os.path.join('LibriSpeech', 'SPEAKERS.TXT') - utt_info = collections.namedtuple('META_INFO', ( - 'file_path', 'utt_id', 'text', 'spk_id', 'spk_gender')) - audio_path = 'LibriSpeech' - manifest_path = os.path.join('LibriSpeech', 'manifest') - subset = [ - 'train-clean-100', 'train-clean-360', 'train-clean-500', 'dev-clean', - 'dev-other', 'test-clean', 'test-other' - ] - - def __init__(self, - subset: str='train-clean-100', - feat_type: str='raw', - **kwargs): - assert subset in self.subset, 'Dataset subset must be one in {}, but got {}'.format( - self.subset, subset) - self.subset = subset - self.feat_type = feat_type - self.feat_config = kwargs - self._data = self._get_data() - super(LIBRISPEECH, self).__init__() - - def _get_speaker_info(self) -> Dict[str, str]: - ret = {} - with open(os.path.join(DATA_HOME, self.speaker_meta), 'r') as rf: - for line in rf.readlines(): - if ';' in line: # Skip dataset abstract - continue - spk_id, gender = map(str.strip, - line.split('|')[:2]) # spk_id, gender - ret.update({spk_id: gender}) - return ret - - def _get_text_info(self, trans_file) -> Dict[str, str]: - ret = {} - with open(trans_file, 'r') as rf: - for line in rf.readlines(): - utt_id, text = map(str.strip, line.split(' ', - 1)) # utt_id, text - ret.update({utt_id: text}) - return ret - - def _get_data(self): - if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \ - not os.path.isfile(os.path.join(DATA_HOME, self.speaker_meta)): - download_and_decompress(self.archieves, DATA_HOME, - len(self.archieves)) - - # Speaker info - speaker_info = self._get_speaker_info() - - # Text info - text_info = {} - for root, _, files in os.walk( - os.path.join(DATA_HOME, self.audio_path, self.subset)): - for file in files: - if file.endswith('.trans.txt'): - text_info.update( - self._get_text_info(os.path.join(root, file))) - - data = [] - for root, _, files in os.walk( - os.path.join(DATA_HOME, self.audio_path, self.subset)): - for file in files: - if file.endswith('.flac'): - utt_id = os.path.splitext(file)[0] - spk_id = utt_id.split('-')[0] - if utt_id not in text_info \ - or spk_id not in speaker_info : # Skip samples with incomplete data - continue - file_path = os.path.join(root, file) - text = text_info[utt_id] - spk_gender = speaker_info[spk_id] - data.append( - self.utt_info(file_path, utt_id, text, spk_id, - spk_gender)) - - return data - - def _convert_to_record(self, idx: int): - sample = self._data[idx] - - record = {} - # To show all fields in a namedtuple: `type(sample)._fields` - for field in type(sample)._fields: - record[field] = 
getattr(sample, field) - - waveform, sr = load_audio( - sample[0]) # The first element of sample is file path - feat_func = feat_funcs[self.feat_type] - feat = feat_func( - waveform, sample_rate=sr, - **self.feat_config) if feat_func else waveform - record.update({'feat': feat, 'duration': len(waveform) / sr}) - return record - - def create_manifest(self, prefix='manifest'): - if not os.path.isdir(os.path.join(DATA_HOME, self.manifest_path)): - os.makedirs(os.path.join(DATA_HOME, self.manifest_path)) - - manifest_file = os.path.join(DATA_HOME, self.manifest_path, - f'{prefix}.{self.subset}') - with codecs.open(manifest_file, 'w', 'utf-8') as f: - for idx in tqdm(range(len(self))): - record = self._convert_to_record(idx) - record_line = json.dumps( - { - 'utt': record['utt_id'], - 'feat': record['file_path'], - 'feat_shape': (record['duration'], ), - 'text': record['text'], - 'spk': record['spk_id'], - 'gender': record['spk_gender'], - }, - ensure_ascii=False) - f.write(record_line + '\n') - logger.info(f'Manifest file {manifest_file} created.') - - def __getitem__(self, idx): - record = self._convert_to_record(idx) - return tuple(record.values()) - - def __len__(self): - return len(self._data) diff --git a/audio/paddleaudio/datasets/ravdess.py b/audio/paddleaudio/datasets/ravdess.py deleted file mode 100644 index d886aad27f55ad2f592d44a7710a762f4ad30028..0000000000000000000000000000000000000000 --- a/audio/paddleaudio/datasets/ravdess.py +++ /dev/null @@ -1,136 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import collections -import os -import random -from typing import List -from typing import Tuple - -from ..utils.download import download_and_decompress -from ..utils.env import DATA_HOME -from .dataset import AudioClassificationDataset - -__all__ = ['RAVDESS'] - - -class RAVDESS(AudioClassificationDataset): - """ - The RAVDESS contains 24 professional actors (12 female, 12 male), vocalizing two - lexically-matched statements in a neutral North American accent. Speech emotions - includes calm, happy, sad, angry, fearful, surprise, and disgust expressions. - Each expression is produced at two levels of emotional intensity (normal, strong), - with an additional neutral expression. 
- - Reference: - The Ryerson Audio-Visual Database of Emotional Speech and Song (RAVDESS): - A dynamic, multimodal set of facial and vocal expressions in North American English - https://doi.org/10.1371/journal.pone.0196391 - """ - - archieves = [ - { - 'url': - 'https://zenodo.org/record/1188976/files/Audio_Song_Actors_01-24.zip', - 'md5': - '5411230427d67a21e18aa4d466e6d1b9', - }, - { - 'url': - 'https://zenodo.org/record/1188976/files/Audio_Speech_Actors_01-24.zip', - 'md5': - 'bc696df654c87fed845eb13823edef8a', - }, - ] - label_list = [ - 'neutral', 'calm', 'happy', 'sad', 'angry', 'fearful', 'disgust', - 'surprised' - ] - meta_info = collections.namedtuple( - 'META_INFO', ('modality', 'vocal_channel', 'emotion', - 'emotion_intensity', 'statement', 'repitition', 'actor')) - speech_path = os.path.join(DATA_HOME, 'Audio_Speech_Actors_01-24') - song_path = os.path.join(DATA_HOME, 'Audio_Song_Actors_01-24') - - def __init__(self, - mode='train', - seed=0, - n_folds=5, - split=1, - feat_type='raw', - **kwargs): - """ - Ags: - mode (:obj:`str`, `optional`, defaults to `train`): - It identifies the dataset mode (train or dev). - seed (:obj:`int`, `optional`, defaults to 0): - Set the random seed to shuffle samples. - n_folds (:obj:`int`, `optional`, defaults to 5): - Split the dataset into n folds. 1 fold for dev dataset and n-1 for train dataset. - split (:obj:`int`, `optional`, defaults to 1): - It specify the fold of dev dataset. - feat_type (:obj:`str`, `optional`, defaults to `raw`): - It identifies the feature type that user wants to extrace of an audio file. - """ - assert split <= n_folds, f'The selected split should not be larger than n_fold, but got {split} > {n_folds}' - files, labels = self._get_data(mode, seed, n_folds, split) - super(RAVDESS, self).__init__( - files=files, labels=labels, feat_type=feat_type, **kwargs) - - def _get_meta_info(self, files) -> List[collections.namedtuple]: - ret = [] - for file in files: - basename_without_extend = os.path.basename(file)[:-4] - ret.append(self.meta_info(*basename_without_extend.split('-'))) - return ret - - def _get_data(self, mode, seed, n_folds, - split) -> Tuple[List[str], List[int]]: - if not os.path.isdir(self.speech_path) and not os.path.isdir( - self.song_path): - download_and_decompress(self.archieves, DATA_HOME) - - wav_files = [] - for root, _, files in os.walk(self.speech_path): - for file in files: - if file.endswith('.wav'): - wav_files.append(os.path.join(root, file)) - - for root, _, files in os.walk(self.song_path): - for file in files: - if file.endswith('.wav'): - wav_files.append(os.path.join(root, file)) - - random.seed(seed) # shuffle samples to split data - random.shuffle( - wav_files - ) # make sure using the same seed to create train and dev dataset - meta_info = self._get_meta_info(wav_files) - - files = [] - labels = [] - n_samples_per_fold = len(meta_info) // n_folds - for idx, sample in enumerate(meta_info): - _, _, emotion, _, _, _, _ = sample - target = int(emotion) - 1 - fold = idx // n_samples_per_fold + 1 - - if mode == 'train' and int(fold) != split: - files.append(wav_files[idx]) - labels.append(target) - - if mode != 'train' and int(fold) == split: - files.append(wav_files[idx]) - labels.append(target) - - return files, labels diff --git a/audio/test/README.md b/audio/test/README.md deleted file mode 100644 index e5dbc537c0c66226fedf2ebda01345f19d0ef07e..0000000000000000000000000000000000000000 --- a/audio/test/README.md +++ /dev/null @@ -1,41 +0,0 @@ -# PaddleAudio Testing Guide - - - - -# 
Testing -First clone a version of the project by -``` -git clone https://github.com/PaddlePaddle/models.git - -``` -Then install the project in your virtual environment. -``` -cd models/PaddleAudio -python setup.py bdist_wheel -pip install -e .[dev] -``` -The requirements for testing will be installed along with PaddleAudio. - -Now run -``` -pytest test -``` - -If it goes well, you will see outputs like these: -``` -platform linux -- Python 3.7.10, pytest-6.2.4, py-1.10.0, pluggy-0.13.1 -rootdir: ./models/PaddleAudio -plugins: hydra-core-1.0.6 -collected 16 items - -test/unit_test/test_backend.py ........... [ 68%] -test/unit_test/test_features.py ..... [100%] - -==================================================== warnings summary ==================================================== -. -. -. --- Docs: https://docs.pytest.org/en/stable/warnings.html -============================================ 16 passed, 11 warnings in 6.76s ============================================= -``` diff --git a/audio/test/unit_test/test_backend.py b/audio/test/unit_test/test_backend.py deleted file mode 100644 index 1bf1504e0dc38a85a31c719197d2fd774900f294..0000000000000000000000000000000000000000 --- a/audio/test/unit_test/test_backend.py +++ /dev/null @@ -1,113 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import librosa -import numpy as np -import paddleaudio -import pytest - -TEST_FILE = './test/data/test_audio.wav' - - -def relative_err(a, b, real=True): - """compute relative error of two matrices or vectors""" - if real: - return np.sum((a - b)**2) / (EPS + np.sum(a**2) + np.sum(b**2)) - else: - err = np.sum((a.real - b.real)**2) / \ - (EPS + np.sum(a.real**2) + np.sum(b.real**2)) - err += np.sum((a.imag - b.imag)**2) / \ - (EPS + np.sum(a.imag**2) + np.sum(b.imag**2)) - - return err - - -@pytest.mark.filterwarnings("ignore::DeprecationWarning") -def load_audio(): - x, r = librosa.load(TEST_FILE, sr=16000) - print(f'librosa: mean: {np.mean(x)}, std:{np.std(x)}') - return x, r - - -# start testing -x, r = load_audio() -EPS = 1e-8 - - -def test_load(): - s, r = paddleaudio.load(TEST_FILE, sr=16000) - assert r == 16000 - assert s.dtype == 'float32' - - s, r = paddleaudio.load( - TEST_FILE, sr=16000, offset=1, duration=2, dtype='int16') - assert len(s) / r == 2.0 - assert r == 16000 - assert s.dtype == 'int16' - - -def test_depth_convert(): - y = paddleaudio.depth_convert(x, 'int16') - assert len(y) == len(x) - assert y.dtype == 'int16' - assert np.max(y) <= 32767 - assert np.min(y) >= -32768 - assert np.std(y) > EPS - - y = paddleaudio.depth_convert(x, 'int8') - assert len(y) == len(x) - assert y.dtype == 'int8' - assert np.max(y) <= 127 - assert np.min(y) >= -128 - assert np.std(y) > EPS - - -# test case for resample -rs_test_data = [ - (32000, 'kaiser_fast'), - (16000, 'kaiser_fast'), - (8000, 'kaiser_fast'), - (32000, 'kaiser_best'), - (16000, 'kaiser_best'), - (8000, 'kaiser_best'), - (22050, 'kaiser_best'), - (44100, 'kaiser_best'), -] - - -@pytest.mark.parametrize('sr,mode', rs_test_data) -def test_resample(sr, mode): - y = paddleaudio.resample(x, 16000, sr, mode=mode) - factor = sr / 16000 - err = relative_err(len(y), len(x) * factor) - print('err:', err) - assert err < EPS - - -def test_normalize(): - y = paddleaudio.normalize(x, norm_type='linear', mul_factor=0.5) - assert np.max(y) < 0.5 + EPS - - y = paddleaudio.normalize(x, norm_type='linear', mul_factor=2.0) - assert np.max(y) <= 2.0 + EPS - - y = paddleaudio.normalize(x, norm_type='gaussian', mul_factor=1.0) - print('np.std(y):', np.std(y)) - assert np.abs(np.std(y) - 1.0) < EPS - - -if __name__ == '__main__': - test_load() - test_depth_convert() - test_resample(22050, 'kaiser_fast') - test_normalize() diff --git a/audio/test/unit_test/test_features.py b/audio/test/unit_test/test_features.py deleted file mode 100644 index 9e4e29cb317cb21c9e224522bafc95ba8b7f301e..0000000000000000000000000000000000000000 --- a/audio/test/unit_test/test_features.py +++ /dev/null @@ -1,143 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import librosa -import numpy as np -import paddleaudio as pa -import pytest - - -@pytest.mark.filterwarnings("ignore::DeprecationWarning") -def load_audio(): - x, r = librosa.load('./test/data/test_audio.wav') - #x,r = librosa.load('../data/test_audio.wav',sr=16000) - return x, r - - -## start testing -x, r = load_audio() -EPS = 1e-8 - - -def relative_err(a, b, real=True): - """compute relative error of two matrices or vectors""" - if real: - return np.sum((a - b)**2) / (EPS + np.sum(a**2) + np.sum(b**2)) - else: - err = np.sum((a.real - b.real)**2) / ( - EPS + np.sum(a.real**2) + np.sum(b.real**2)) - err += np.sum((a.imag - b.imag)**2) / ( - EPS + np.sum(a.imag**2) + np.sum(b.imag**2)) - - return err - - -@pytest.mark.filterwarnings("ignore::DeprecationWarning") -def test_melspectrogram(): - a = pa.melspectrogram( - x, - window_size=512, - sr=16000, - hop_length=320, - n_mels=64, - fmin=50, - to_db=False, ) - b = librosa.feature.melspectrogram( - x, - sr=16000, - n_fft=512, - win_length=512, - hop_length=320, - n_mels=64, - fmin=50) - assert relative_err(a, b) < EPS - - -@pytest.mark.filterwarnings("ignore::DeprecationWarning") -def test_melspectrogram_db(): - - a = pa.melspectrogram( - x, - window_size=512, - sr=16000, - hop_length=320, - n_mels=64, - fmin=50, - to_db=True, - ref=1.0, - amin=1e-10, - top_db=None) - b = librosa.feature.melspectrogram( - x, - sr=16000, - n_fft=512, - win_length=512, - hop_length=320, - n_mels=64, - fmin=50) - b = pa.power_to_db(b, ref=1.0, amin=1e-10, top_db=None) - assert relative_err(a, b) < EPS - - -@pytest.mark.filterwarnings("ignore::DeprecationWarning") -def test_stft(): - a = pa.stft(x, n_fft=1024, hop_length=320, win_length=512) - b = librosa.stft(x, n_fft=1024, hop_length=320, win_length=512) - assert a.shape == b.shape - assert relative_err(a, b, real=False) < EPS - - -@pytest.mark.filterwarnings("ignore::DeprecationWarning") -def test_split_frames(): - a = librosa.util.frame(x, frame_length=512, hop_length=320) - b = pa.split_frames(x, frame_length=512, hop_length=320) - assert relative_err(a, b) < EPS - - -@pytest.mark.filterwarnings("ignore::DeprecationWarning") -def test_mfcc(): - kwargs = { - 'window_size': 512, - 'hop_length': 320, - 'n_mels': 64, - 'fmin': 50, - 'to_db': False - } - a = pa.mfcc( - x, - #sample_rate=16000, - spect=None, - n_mfcc=20, - dct_type=2, - norm='ortho', - lifter=0, - **kwargs) - S = librosa.feature.melspectrogram( - x, - sr=16000, - n_fft=512, - win_length=512, - hop_length=320, - n_mels=64, - fmin=50) - b = librosa.feature.mfcc( - x, sr=16000, S=S, n_mfcc=20, dct_type=2, norm='ortho', lifter=0) - assert relative_err(a, b) < EPS - - -if __name__ == '__main__': - test_melspectrogram() - test_melspectrogram_db() - test_stft() - test_split_frames() - test_mfcc() diff --git a/examples/dataset/aidatatang_200zh/.gitignore b/dataset/aidatatang_200zh/.gitignore similarity index 100% rename from examples/dataset/aidatatang_200zh/.gitignore rename to dataset/aidatatang_200zh/.gitignore diff --git a/examples/dataset/aidatatang_200zh/README.md b/dataset/aidatatang_200zh/README.md similarity index 100% rename from examples/dataset/aidatatang_200zh/README.md rename to dataset/aidatatang_200zh/README.md diff --git a/examples/dataset/aidatatang_200zh/aidatatang_200zh.py b/dataset/aidatatang_200zh/aidatatang_200zh.py similarity index 100% rename from examples/dataset/aidatatang_200zh/aidatatang_200zh.py rename to dataset/aidatatang_200zh/aidatatang_200zh.py diff --git a/examples/dataset/aishell/.gitignore 
b/dataset/aishell/.gitignore similarity index 100% rename from examples/dataset/aishell/.gitignore rename to dataset/aishell/.gitignore diff --git a/examples/dataset/aishell/README.md b/dataset/aishell/README.md similarity index 100% rename from examples/dataset/aishell/README.md rename to dataset/aishell/README.md diff --git a/examples/dataset/aishell/aishell.py b/dataset/aishell/aishell.py similarity index 100% rename from examples/dataset/aishell/aishell.py rename to dataset/aishell/aishell.py diff --git a/examples/dataset/aishell3/README.md b/dataset/aishell3/README.md similarity index 100% rename from examples/dataset/aishell3/README.md rename to dataset/aishell3/README.md diff --git a/examples/dataset/chime3_background/chime3_background.py b/dataset/chime3_background/chime3_background.py similarity index 100% rename from examples/dataset/chime3_background/chime3_background.py rename to dataset/chime3_background/chime3_background.py diff --git a/examples/dataset/gigaspeech/.gitignore b/dataset/gigaspeech/.gitignore similarity index 100% rename from examples/dataset/gigaspeech/.gitignore rename to dataset/gigaspeech/.gitignore diff --git a/examples/dataset/gigaspeech/README.md b/dataset/gigaspeech/README.md similarity index 100% rename from examples/dataset/gigaspeech/README.md rename to dataset/gigaspeech/README.md diff --git a/examples/dataset/gigaspeech/gigaspeech.py b/dataset/gigaspeech/gigaspeech.py similarity index 100% rename from examples/dataset/gigaspeech/gigaspeech.py rename to dataset/gigaspeech/gigaspeech.py diff --git a/examples/dataset/gigaspeech/run.sh b/dataset/gigaspeech/run.sh similarity index 100% rename from examples/dataset/gigaspeech/run.sh rename to dataset/gigaspeech/run.sh diff --git a/examples/dataset/librispeech/.gitignore b/dataset/librispeech/.gitignore similarity index 100% rename from examples/dataset/librispeech/.gitignore rename to dataset/librispeech/.gitignore diff --git a/examples/dataset/librispeech/librispeech.py b/dataset/librispeech/librispeech.py similarity index 100% rename from examples/dataset/librispeech/librispeech.py rename to dataset/librispeech/librispeech.py diff --git a/examples/dataset/magicdata/README.md b/dataset/magicdata/README.md similarity index 100% rename from examples/dataset/magicdata/README.md rename to dataset/magicdata/README.md diff --git a/examples/dataset/mini_librispeech/.gitignore b/dataset/mini_librispeech/.gitignore similarity index 100% rename from examples/dataset/mini_librispeech/.gitignore rename to dataset/mini_librispeech/.gitignore diff --git a/examples/dataset/mini_librispeech/mini_librispeech.py b/dataset/mini_librispeech/mini_librispeech.py similarity index 100% rename from examples/dataset/mini_librispeech/mini_librispeech.py rename to dataset/mini_librispeech/mini_librispeech.py diff --git a/examples/dataset/multi_cn/README.md b/dataset/multi_cn/README.md similarity index 100% rename from examples/dataset/multi_cn/README.md rename to dataset/multi_cn/README.md diff --git a/examples/dataset/musan/.gitignore b/dataset/musan/.gitignore similarity index 100% rename from examples/dataset/musan/.gitignore rename to dataset/musan/.gitignore diff --git a/examples/dataset/musan/musan.py b/dataset/musan/musan.py similarity index 100% rename from examples/dataset/musan/musan.py rename to dataset/musan/musan.py diff --git a/examples/dataset/primewords/README.md b/dataset/primewords/README.md similarity index 100% rename from examples/dataset/primewords/README.md rename to dataset/primewords/README.md diff --git 
a/examples/dataset/rir_noise/.gitignore b/dataset/rir_noise/.gitignore similarity index 100% rename from examples/dataset/rir_noise/.gitignore rename to dataset/rir_noise/.gitignore diff --git a/examples/dataset/rir_noise/rir_noise.py b/dataset/rir_noise/rir_noise.py similarity index 100% rename from examples/dataset/rir_noise/rir_noise.py rename to dataset/rir_noise/rir_noise.py diff --git a/examples/dataset/st-cmds/README.md b/dataset/st-cmds/README.md similarity index 100% rename from examples/dataset/st-cmds/README.md rename to dataset/st-cmds/README.md diff --git a/examples/dataset/ted_en_zh/.gitignore b/dataset/ted_en_zh/.gitignore similarity index 100% rename from examples/dataset/ted_en_zh/.gitignore rename to dataset/ted_en_zh/.gitignore diff --git a/examples/dataset/ted_en_zh/ted_en_zh.py b/dataset/ted_en_zh/ted_en_zh.py similarity index 100% rename from examples/dataset/ted_en_zh/ted_en_zh.py rename to dataset/ted_en_zh/ted_en_zh.py diff --git a/examples/dataset/thchs30/.gitignore b/dataset/thchs30/.gitignore similarity index 100% rename from examples/dataset/thchs30/.gitignore rename to dataset/thchs30/.gitignore diff --git a/examples/dataset/thchs30/README.md b/dataset/thchs30/README.md similarity index 100% rename from examples/dataset/thchs30/README.md rename to dataset/thchs30/README.md diff --git a/examples/dataset/thchs30/thchs30.py b/dataset/thchs30/thchs30.py similarity index 100% rename from examples/dataset/thchs30/thchs30.py rename to dataset/thchs30/thchs30.py diff --git a/examples/dataset/timit/.gitignore b/dataset/timit/.gitignore similarity index 100% rename from examples/dataset/timit/.gitignore rename to dataset/timit/.gitignore diff --git a/examples/dataset/timit/timit.py b/dataset/timit/timit.py similarity index 100% rename from examples/dataset/timit/timit.py rename to dataset/timit/timit.py diff --git a/examples/dataset/timit/timit_kaldi_standard_split.py b/dataset/timit/timit_kaldi_standard_split.py similarity index 100% rename from examples/dataset/timit/timit_kaldi_standard_split.py rename to dataset/timit/timit_kaldi_standard_split.py diff --git a/examples/dataset/voxforge/run_data.sh b/dataset/voxforge/run_data.sh similarity index 78% rename from examples/dataset/voxforge/run_data.sh rename to dataset/voxforge/run_data.sh index 5af9d0cc6a0036d18de78e1d9cc9f6fb689531c0..26214a8249e94ba5385d83a975b0083a4c4318b9 100644 --- a/examples/dataset/voxforge/run_data.sh +++ b/dataset/voxforge/run_data.sh @@ -1,10 +1,10 @@ #! 
/usr/bin/env bash -TARGET_DIR=${MAIN_ROOT}/examples/dataset/voxforge +TARGET_DIR=${MAIN_ROOT}/dataset/voxforge mkdir -p ${TARGET_DIR} # download data, generate manifests -python ${MAIN_ROOT}/examples/dataset/voxforge/voxforge.py \ +python ${MAIN_ROOT}/dataset/voxforge/voxforge.py \ --manifest_prefix="${TARGET_DIR}/manifest" \ --target_dir="${TARGET_DIR}" \ --is_merge_dialect=True \ diff --git a/examples/dataset/voxforge/voxforge.py b/dataset/voxforge/voxforge.py similarity index 100% rename from examples/dataset/voxforge/voxforge.py rename to dataset/voxforge/voxforge.py diff --git a/demos/style_fs2/style_syn.py b/demos/style_fs2/style_syn.py index 5b8ce35139aea0edb084cd3b1d33b702b27d2628..0ed87e7cb497314de435ca993d4b21f18b16700a 100644 --- a/demos/style_fs2/style_syn.py +++ b/demos/style_fs2/style_syn.py @@ -34,7 +34,9 @@ def evaluate(args, fastspeech2_config, pwg_config): sentences = [] with open(args.text, 'rt') as f: for line in f: - utt_id, sentence = line.strip().split() + items = line.strip().split() + utt_id = items[0] + sentence = "".join(items[1:]) sentences.append((utt_id, sentence)) with open(args.phones_dict, "r") as f: diff --git a/docs/source/reference.md b/docs/source/reference.md index ab56344d8857d82471a547d893bf8b4c57616eb8..39e2a4688b46e0c4225a1313f5ec68f29fd1d9ce 100644 --- a/docs/source/reference.md +++ b/docs/source/reference.md @@ -6,7 +6,7 @@ We borrowed a lot of code from these repos to build `model` and `engine`, thanks - Apache-2.0 License - python/shell `utils` - kaldi feat preprocessing -- data pipe line and `transform` +- data pipe line and `transformer` - some tts models, like `fastspeech2` and GAN-based `vocoder` * [wenet](https://github.com/wenet-e2e/wenet/blob/main/LICENSE) diff --git a/docs/source/released_model.md b/docs/source/released_model.md index 2e3d8106ec043fde2949850baabc91fec2144e0f..367b7c4b88a8674c4f797edd711a6dfe739fa411 100644 --- a/docs/source/released_model.md +++ b/docs/source/released_model.md @@ -1,16 +1,18 @@ # Released Models ## Speech-to-Text Models + ### Acoustic Model Released in paddle 2.X Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER | Hours of speech | example link :-------------:| :------------:| :-----: | -----: | :----------------- |:--------- | :---------- | :--------- | :----------- -[Ds2 Online Aishell ASR0 Model](https://deepspeech.bj.bcebos.com/release2.2/aishell/s0/ds2_online_aishll_CER8.02_release.tar.gz) | Aishell Dataset | Char-based | 345 MB | 2 Conv + 5 LSTM layers with only forward direction | 0.080218 |-| 151 h | [D2 Online Aishell S0 Example](../../examples/aishell/asr0) -[Ds2 Offline Aishell ASR0 Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s0/aishell.s0.ds2.offline.cer6p65.release.tar.gz)| Aishell Dataset | Char-based | 306 MB | 2 Conv + 3 bidirectional GRU layers| 0.065 |-| 151 h | [Ds2 Offline Aishell S0 Example](../../examples/aishell/asr0) +[Ds2 Online Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/aishell_ds2_online_cer8.00_release.tar.gz) | Aishell Dataset | Char-based | 345 MB | 2 Conv + 5 LSTM layers with only forward direction | 0.080 |-| 151 h | [D2 Online Aishell S0 Example](../../examples/aishell/asr0) +[Ds2 Offline Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/ds2.model.tar.gz)| Aishell Dataset | Char-based | 306 MB | 2 Conv + 3 bidirectional GRU layers| 0.064 |-| 151 h | [Ds2 Offline Aishell S0 Example](../../examples/aishell/asr0) [Conformer Online Aishell ASR1 
Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s1/aishell.chunk.release.tar.gz) | Aishell Dataset | Char-based | 283 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0594 |-| 151 h | [Conformer Online Aishell S1 Example](../../examples/aishell/s1) [Conformer Offline Aishell ASR1 Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s1/aishell.release.tar.gz) | Aishell Dataset | Char-based | 284 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0547 |-| 151 h | [Conformer Offline Aishell S1 Example](../../examples/aishell/s1) [Conformer Librispeech ASR1 Model](https://deepspeech.bj.bcebos.com/release2.1/librispeech/s1/conformer.release.tar.gz) | Librispeech Dataset | subword-based | 287 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring |-| 0.0325 | 960 h | [Conformer Librispeech S1 example](../../example/librispeech/s1) -[Transformer Librispeech ASR1 Model](https://deepspeech.bj.bcebos.com/release2.2/librispeech/s1/librispeech.s1.transformer.all.wer5p62.release.tar.gz) | Librispeech Dataset | subword-based | 131 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: Attention rescoring |-| 0.0456 | 960 h | [Transformer Librispeech S1 example](../../example/librispeech/s1) -[Transformer Librispeech ASR2 Model](https://deepspeech.bj.bcebos.com/release2.2/librispeech/s2/libri_transformer_espnet_wer3p84.release.tar.gz) | Librispeech Dataset | subword-based | 131 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: Attention |-| 0.0384 | 960 h | [Transformer Librispeech S2 example](../../example/librispeech/s2) +[Transformer Librispeech ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr1/transformer.model.tar.gz) | Librispeech Dataset | subword-based | 131 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: Attention rescoring |-| 0.0410 | 960 h | [Transformer Librispeech S1 example](../../example/librispeech/s1) +[Transformer Librispeech ASR2 Model](https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr2/transformer.model.tar.gz) | Librispeech Dataset | subword-based | 131 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: JoinCTC w/ LM |-| 0.024 | 960 h | [Transformer Librispeech S2 example](../../example/librispeech/s2) + ### Acoustic Model Transformed from paddle 1.8 Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER | Hours of speech @@ -20,14 +22,15 @@ Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER | [Ds2 Offline Baidu en8k model](https://deepspeech.bj.bcebos.com/eng_models/baidu_en8k_v1.8_to_v2.x.tar.gz)|Baidu Internal English Dataset| Word-based| 273 MB| 2 Conv + 3 bidirectional GRU layers |-| 0.0541 | 8628 h| ### Language Model Released - Language Model | Training Data | Token-based | Size | Descriptions :-------------:| :------------:| :-----: | -----: | :----------------- [English LM](https://deepspeech.bj.bcebos.com/en_lm/common_crawl_00.prune01111.trie.klm) | [CommonCrawl(en.00)](http://web-language-models.s3-website-us-east-1.amazonaws.com/ngrams/en/deduped/en.00.deduped.xz) | Word-based | 8.3 GB | Pruned with 0 1 1 1 1;
About 1.85 billion n-grams;
'trie' binary with '-a 22 -q 8 -b 8' [Mandarin LM Small](https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm) | Baidu Internal Corpus | Char-based | 2.8 GB | Pruned with 0 1 2 4 4;
About 0.13 billion n-grams;
'probing' binary with default settings [Mandarin LM Large](https://deepspeech.bj.bcebos.com/zh_lm/zhidao_giga.klm) | Baidu Internal Corpus | Char-based | 70.4 GB | No Pruning;
About 3.7 billion n-grams;
'probing' binary with default settings + ## Text-to-Speech Models + ### Acoustic Models Model Type | Dataset| Example Link | Pretrained Models|Static Models|Siize(static) :-------------:| :------------:| :-----: | :-----:| :-----:| :-----: @@ -40,7 +43,6 @@ FastSpeech2| LJSpeech |[fastspeech2-ljspeech](https://github.com/PaddlePaddle/Pa FastSpeech2| VCTK |[fastspeech2-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/tts3)|[fastspeech2_nosil_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_vctk_ckpt_0.5.zip)||| ### Vocoders - Model Type | Dataset| Example Link | Pretrained Models| Static Models|Size(static) :-------------:| :------------:| :-----: | :-----:| :-----:| :-----: WaveFlow| LJSpeech |[waveflow-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc0)|[waveflow_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/waveflow/waveflow_ljspeech_ckpt_0.3.zip)||| diff --git a/docs/source/tts/README.md b/docs/source/tts/README.md index 1b639af7aaecbd8c0c984d42ced690c7eb2fa4e6..0b2b2d1dad7590751b145212b32a1d0147b3481c 100644 --- a/docs/source/tts/README.md +++ b/docs/source/tts/README.md @@ -5,20 +5,6 @@ Parakeet aims to provide a flexible, efficient and state-of-the-art text-to-spee
- -## News -- Oct-12-2021, Refector examples code. -- Oct-12-2021, Parallel WaveGAN with LJSpeech. Check [examples/GANVocoder/parallelwave_gan/ljspeech](./examples/GANVocoder/parallelwave_gan/ljspeech). -- Oct-12-2021, FastSpeech2/FastPitch with LJSpeech. Check [examples/fastspeech2/ljspeech](./examples/fastspeech2/ljspeech). -- Sep-14-2021, Reconstruction of TransformerTTS. Check [examples/transformer_tts/ljspeech](./examples/transformer_tts/ljspeech). -- Aug-31-2021, Chinese Text Frontend. Check [examples/text_frontend](./examples/text_frontend). -- Aug-23-2021, FastSpeech2/FastPitch with AISHELL-3. Check [examples/fastspeech2/aishell3](./examples/fastspeech2/aishell3). -- Aug-03-2021, FastSpeech2/FastPitch with CSMSC. Check [examples/fastspeech2/baker](./examples/fastspeech2/baker). -- Jul-19-2021, SpeedySpeech with CSMSC. Check [examples/speedyspeech/baker](./examples/speedyspeech/baker). -- Jul-01-2021, Parallel WaveGAN with CSMSC. Check [examples/GANVocoder/parallelwave_gan/baker](./examples/GANVocoder/parallelwave_gan/baker). -- Jul-01-2021, Montreal-Forced-Aligner. Check [examples/use_mfa](./examples/use_mfa). -- May-07-2021, Voice Cloning in Chinese. Check [examples/tacotron2_aishell3](./examples/tacotron2_aishell3). - ## Overview In order to facilitate exploiting the existing TTS models directly and developing the new ones, Parakeet selects typical models and provides their reference implementations in PaddlePaddle. Further more, Parakeet abstracts the TTS pipeline and standardizes the procedure of data preprocessing, common modules sharing, model configuration, and the process of training and synthesis. The models supported here include Text FrontEnd, end-to-end Acoustic models and Vocoders: @@ -38,50 +24,11 @@ In order to facilitate exploiting the existing TTS models directly and developin - [Transfer Learning from Speaker Verification to Multispeaker Text-to-Speech Synthesis](https://arxiv.org/pdf/1806.04558v4.pdf) - [【GE2E】Generalized End-to-End Loss for Speaker Verification](https://arxiv.org/abs/1710.10467) -## Setup -It's difficult to install some dependent libraries for this repo in Windows system, we recommend that you **DO NOT** use Windows system, please use `Linux`. - -Make sure the library `libsndfile1` is installed, e.g., on Ubuntu. - -```bash -sudo apt-get install libsndfile1 -``` -### Install PaddlePaddle -See [install](https://www.paddlepaddle.org.cn/install/quick) for more details. This repo requires PaddlePaddle **2.1.2** or above. - -### Install Parakeet -```bash -git clone https://github.com/PaddlePaddle/Parakeet -cd Parakeet -pip install -e . -``` - -If some python dependent packages cannot be installed successfully, you can run the following script first. -(replace `python3.6` with your own python version) -```bash -sudo apt install -y python3.6-dev -``` - -See [install](https://paddle-parakeet.readthedocs.io/en/latest/install.html) for more details. 
- -## Examples -Entries to the introduction, and the launch of training and synthsis for different example models: - -- [>>> Chinese Text Frontend](./examples/text_frontend) -- [>>> FastSpeech2/FastPitch](./examples/fastspeech2) -- [>>> Montreal-Forced-Aligner](./examples/use_mfa) -- [>>> Parallel WaveGAN](./examples/GANVocoder/parallelwave_gan) -- [>>> SpeedySpeech](./examples/speedyspeech) -- [>>> Tacotron2_AISHELL3](./examples/tacotron2_aishell3) -- [>>> GE2E](./examples/ge2e) -- [>>> WaveFlow](./examples/waveflow) -- [>>> TransformerTTS](./examples/transformer_tts) -- [>>> Tacotron2](./examples/tacotron2) ## Audio samples -### TTS models (Acoustic Model + Neural Vocoder) -Check our [website](https://paddleparakeet.readthedocs.io/en/latest/demo.html) for audio sampels. + +Check our [website](https://paddlespeech.readthedocs.io/en/latest/tts/demo.html) for audio sampels. ## Released Model diff --git a/docs/source/tts/advanced_usage.md b/docs/source/tts/advanced_usage.md index 0540a1c3f980c454edd99bb3fe71839bdba5cee1..040889649d44c723e3c9f7a1a90054c463f07d58 100644 --- a/docs/source/tts/advanced_usage.md +++ b/docs/source/tts/advanced_usage.md @@ -290,7 +290,7 @@ The following is the basic `ArgumentParser`: 1. `--config` is used to support configuration file parsing, and the configuration file itself handles the unique options of each experiment. 2. `--train-metadata` is the path to the training data. 3. `--output-dir` is the dir to save the training results.(if there are checkpoints in `checkpoints/` of `--output-dir` , it's defalut to reload the newest checkpoint to train) -4. `--device` and `--nprocs` determine operation modes,`--device` specifies the type of running device, whether to run on `cpu` or `gpu`. `--nprocs` refers to the number of training processes. If `nprocs` > 1, it means that multi process parallel training is used. (Note: currently only GPU multi card multi process training is supported.) +4. `--ngpu` determine operation modes,`--ngpu` refers to the number of training processes. If `ngpu` > 0, it means using GPU, else CPU is used. Developers can refer to the examples in `examples` to write the default configuration file when adding new experiments. 
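For readers skimming this hunk, here is a minimal sketch of how a launcher might pass the options described above after the `--nproc` to `--ngpu` rename. The flag names (`--config`, `--train-metadata`, `--output-dir`, `--ngpu`) come from the documentation text itself; the file paths and the GPU-counting one-liner are illustrative assumptions, not code from this repository.

```bash
# Illustrative sketch only -- not a script from this repo.
# Derive the number of training processes from the visible GPUs
# (assumption: one process per visible GPU); an empty CUDA_VISIBLE_DEVICES
# yields ngpu=0, which selects CPU as described above.
ngpu=$(echo ${CUDA_VISIBLE_DEVICES} | awk -F "," '{print NF}')

python3 -u train.py \
    --ngpu ${ngpu} \
    --config conf/default.yaml \
    --train-metadata dump/train/norm/metadata.jsonl \
    --output-dir exp/default
```

Running the same command with `--ngpu 0` (or with no visible GPUs) falls back to CPU, which matches the behaviour the updated text describes.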
diff --git a/docs/topic/ctc/ctc_loss.ipynb b/docs/topic/ctc/ctc_loss.ipynb index c0da1f3230cec8961e6b2ff6d7fb431c3cec6bb9..081a6388519d2087b432b51d073e7d492f4b097a 100644 --- a/docs/topic/ctc/ctc_loss.ipynb +++ b/docs/topic/ctc/ctc_loss.ipynb @@ -343,6 +343,16 @@ " $$" ] }, + { + "cell_type": "markdown", + "id": "41637c03", + "metadata": {}, + "source": [ + "## Source Code\n", + "本人在 [warp-ctc](https://github.com/zh794390558/warp-ctc) 上加了注释,并调整 index 的索引方式,便于理解代码。\n", + "对比上面的公式推导和lattice图可以快速理解 ctc 实现。" + ] + }, { "cell_type": "markdown", "id": "coordinated-music", @@ -372,7 +382,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -386,7 +396,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.1" + "version": "3.7.0" }, "toc": { "base_numbering": 1, diff --git a/examples/aishell/asr0/local/data.sh b/examples/aishell/asr0/local/data.sh index 1032cedc828932d76e3d71951a745af5ad0d6213..ec692eba621830e307d7f15c1194a09383bbe1cf 100755 --- a/examples/aishell/asr0/local/data.sh +++ b/examples/aishell/asr0/local/data.sh @@ -9,7 +9,7 @@ dict_dir=data/lang_char mkdir -p data mkdir -p ${dict_dir} -TARGET_DIR=${MAIN_ROOT}/examples/dataset +TARGET_DIR=${MAIN_ROOT}/dataset mkdir -p ${TARGET_DIR} if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then diff --git a/examples/aishell/asr0/local/export.sh b/examples/aishell/asr0/local/export.sh index a5e62c28d2fa23a5ef9b9e2a0281b025aa943a30..426a72fe54cfae8af4c564611e3b794f988f5468 100755 --- a/examples/aishell/asr0/local/export.sh +++ b/examples/aishell/asr0/local/export.sh @@ -14,7 +14,7 @@ jit_model_export_path=$3 model_type=$4 python3 -u ${BIN_DIR}/export.py \ ---nproc ${ngpu} \ +--ngpu ${ngpu} \ --config ${config_path} \ --checkpoint_path ${ckpt_path_prefix} \ --export_path ${jit_model_export_path} \ diff --git a/examples/aishell/asr0/local/test.sh b/examples/aishell/asr0/local/test.sh index 2ae0740b3e8d44ab03e45f4c1b5dbb945657705e..8cbff23520a61523e5d8599e627f897ef583d127 100755 --- a/examples/aishell/asr0/local/test.sh +++ b/examples/aishell/asr0/local/test.sh @@ -19,7 +19,7 @@ if [ $? -ne 0 ]; then fi python3 -u ${BIN_DIR}/test.py \ ---nproc ${ngpu} \ +--ngpu ${ngpu} \ --config ${config_path} \ --result_file ${ckpt_prefix}.rsl \ --checkpoint_path ${ckpt_prefix} \ diff --git a/examples/aishell/asr0/local/test_export.sh b/examples/aishell/asr0/local/test_export.sh index f0a30ce56fbc4cc43b559295fba7ef3ac3b3be26..4f5e5c8b6371da9e8ee19c939ccba47b815f5617 100755 --- a/examples/aishell/asr0/local/test_export.sh +++ b/examples/aishell/asr0/local/test_export.sh @@ -19,7 +19,7 @@ if [ $? -ne 0 ]; then fi python3 -u ${BIN_DIR}/test_export.py \ ---nproc ${ngpu} \ +--ngpu ${ngpu} \ --config ${config_path} \ --result_file ${jit_model_export_path}.rsl \ --export_path ${jit_model_export_path} \ diff --git a/examples/aishell/asr0/local/test_hub.sh b/examples/aishell/asr0/local/test_hub.sh index d01496c4941ceee95d56e3a856f1f34c1b16c444..f9fc457500ff60c0752a2d4616e9993640ff02ee 100755 --- a/examples/aishell/asr0/local/test_hub.sh +++ b/examples/aishell/asr0/local/test_hub.sh @@ -13,6 +13,17 @@ ckpt_prefix=$2 model_type=$3 audio_file=$4 +mkdir -p data +wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/zh/demo_01_03.wav -P data/ +if [ $? -ne 0 ]; then + exit 1 +fi + +if [ ! 
-f ${audio_file} ]; then + echo "Plase input the right audio_file path" + exit 1 +fi + # download language model bash local/download_lm_ch.sh if [ $? -ne 0 ]; then @@ -20,7 +31,7 @@ if [ $? -ne 0 ]; then fi python3 -u ${BIN_DIR}/test_hub.py \ ---nproc ${ngpu} \ +--ngpu ${ngpu} \ --config ${config_path} \ --result_file ${ckpt_prefix}.rsl \ --checkpoint_path ${ckpt_prefix} \ diff --git a/examples/aishell/asr0/local/train.sh b/examples/aishell/asr0/local/train.sh index edbf3383070ed0c8a71a125c0e77371d0a4412b6..54c642b637d62c9ac0b6be54e1a8d125d8efaf96 100755 --- a/examples/aishell/asr0/local/train.sh +++ b/examples/aishell/asr0/local/train.sh @@ -21,7 +21,7 @@ if [ ${seed} != 0 ]; then fi python3 -u ${BIN_DIR}/train.py \ ---nproc ${ngpu} \ +--ngpu ${ngpu} \ --config ${config_path} \ --output exp/${ckpt_name} \ --model_type ${model_type} \ diff --git a/examples/aishell/asr0/run.sh b/examples/aishell/asr0/run.sh index 59b7bfe30fc59aa1611fe5342bea31ad26dea5a6..d8ae60623a7762a313ffe49cf61ebf7e6ccf0d76 100755 --- a/examples/aishell/asr0/run.sh +++ b/examples/aishell/asr0/run.sh @@ -8,6 +8,7 @@ stop_stage=100 conf_path=conf/deepspeech2.yaml #conf/deepspeech2.yaml or conf/deepspeeech2_online.yaml avg_num=1 model_type=offline # offline or online +audio_file=data/demo_01_03.wav source ${MAIN_ROOT}/utils/parse_options.sh || exit 1; @@ -15,7 +16,6 @@ avg_ckpt=avg_${avg_num} ckpt=$(basename ${conf_path} | awk -F'.' '{print $1}') echo "checkpoint name ${ckpt}" -audio_file="data/tmp.wav" if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then # prepare data diff --git a/examples/aishell/asr1/READEME.md b/examples/aishell/asr1/READEME.md index c255d0bdcc19d12bf396d556ed1abf18f26dc611..e9fd3017ca090cc271719e71e06a88ad7509f939 100644 --- a/examples/aishell/asr1/READEME.md +++ b/examples/aishell/asr1/READEME.md @@ -4,7 +4,7 @@ This example contains code used to train a Transformer or [Conformer](http://arx ## Overview -All the scirpts you need are in the ```run.sh```. There are several stages in the ```run.sh```, and each stage has its function. +All the scirpts you need are in ```run.sh```. There are several stages in ```run.sh```, and each stage has its function. | Stage | Function | | :---- | :----------------------------------------------------------- | @@ -16,7 +16,7 @@ All the scirpts you need are in the ```run.sh```. There are several stages in th | 5 | Infer the single audio file | -You can choose to run a range of stages by setting the ```stage``` and ```stop_stage ``` . +You can choose to run a range of stages by setting ```stage``` and ```stop_stage ```. For example, if you want to execute the code in stage 2 and stage 3, you can run this script: @@ -33,19 +33,17 @@ bash run.sh --stage 0 --stop_stage 0 -The document below will describe the scripts in the ```run.sh``` in detail. +The document below will describe the scripts in ```run.sh``` in detail. ## The Environment Variables -The path.sh contains the environment variable. +The path.sh contains the environment variables. ```bash source path.sh ``` -This script needs to be run firstly. - -And another script is also needed: +This script needs to be run firstly. And another script is also needed: ```bash source ${MAIN_ROOT}/utils/parse_options.sh @@ -57,10 +55,10 @@ It will support the way of using```--varibale value``` in the shell scripts. ## The Local Variables -Some local variables are set in the ```run.sh```. -```gpus``` denotes the GPU number you want to use. If you set ```gpus=```, it means you only use CPU. 
+Some local variables are set in ```run.sh```. +```gpus``` denotes the GPU number you want to use. If you set ```gpus=```, it means you only use CPU. -```stage``` denotes the number of stage you want to start from in the expriments. +```stage``` denotes the number of stage you want to start from in the expriments. ```stop stage```denotes the number of stage you want to end at in the expriments. ```conf_path``` denotes the config path of the model. @@ -71,7 +69,7 @@ Some local variables are set in the ```run.sh```. ```ckpt``` denotes the checkpoint prefix of the model, e.g. "conformer" -You can set the local variables (except ```ckpt```) when you use the ```run.sh``` +You can set the local variables (except ```ckpt```) when you use ```run.sh``` For example, you can set the ```gpus``` and ``avg_num`` when you use the command line.: @@ -83,7 +81,7 @@ bash run.sh --gpus 0,1 --avg_num 20 ## Stage 0: Data Processing -To use this example, you need to process data firstly and you can use stage 0 in the ```run.sh``` to do this. The code is shown below: +To use this example, you need to process data firstly and you can use stage 0 in ```run.sh``` to do this. The code is shown below: ```bash if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then @@ -129,12 +127,12 @@ data/ ## Stage 1: Model Training -If you want to train the model. you can use stage 1 in the ```run.sh```. The code is shown below. +If you want to train the model. you can use stage 1 in ```run.sh```. The code is shown below. ```bash if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then # train model, all `ckpt` under `exp` dir - CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} + CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} fi ``` @@ -149,14 +147,14 @@ or you can run these scripts in the command line (only use CPU). ```bash source path.sh bash ./local/data.sh -CUDA_VISIBLE_DEVICES= ./local/train.sh conf/conformer.yaml conformer +CUDA_VISIBLE_DEVICES= ./local/train.sh conf/conformer.yaml conformer ``` -## Stage 2: Top-k Models Averaging +## Stage 2: Top-k Models Averaging -After training the model, we need to get the final model for testing and inference. In every epoch, the model checkpoint is saved, so we can choose the best model from them based on the validation loss or we can sort them and average the parameters of the top-k models to get the final model. We can use stage 2 to do this, and the code is shown below: +After training the model, we need to get the final model for testing and inference. In every epoch, the model checkpoint is saved, so we can choose the best model from them based on the validation loss or we can sort them and average the parameters of the top-k models to get the final model. We can use stage 2 to do this, and the code is shown below: ```bash if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then @@ -166,7 +164,7 @@ After training the model, we need to get the final model for testing and infere ``` The ```avg.sh``` is in the ```../../../utils/``` which is define in the ```path.sh```. -If you want to get the final model, you can use the script below to execute stage 0, stage 1, and stage 2: +If you want to get the final model, you can use the script below to execute stage 0, stage 1, and stage 2: ```bash bash run.sh --stage 0 --stop_stage 2 @@ -185,7 +183,7 @@ avg.sh best exp/conformer/checkpoints 20 ## Stage 3: Model Testing -The test stage is to evaluate the model performance.. 
The code of test stage is shown below: +The test stage is to evaluate the model performance. The code of test stage is shown below: ```bash if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then @@ -194,7 +192,7 @@ The test stage is to evaluate the model performance.. The code of test stage is fi ``` -If you want to train a model and test it, you can use the script below to execute stage 0, stage 1, stage 2, and stage 3 : +If you want to train a model and test it, you can use the script below to execute stage 0, stage 1, stage 2, and stage 3 : ```bash bash run.sh --stage 0 --stop_stage 3 @@ -228,7 +226,7 @@ wget https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/transformer.model.tar.g ``` -using the ```tar``` scripts to unpack the model and then you can use the script to test the modle. +using the ```tar``` scripts to unpack the model and then you can use the script to test the modle. For example: @@ -238,7 +236,7 @@ tar xzvf transformer.model.tar.gz source path.sh # If you have process the data and get the manifest file, you can skip the following 2 steps bash local/data.sh --stage -1 --stop_stage -1 -bash local/data.sh --stage 2 --stop_stage 2 +bash local/data.sh --stage 2 --stop_stage 2 CUDA_VISIBLE_DEVICES= ./local/test.sh conf/transformer.yaml exp/transformer/checkpoints/avg_20 ``` @@ -291,7 +289,7 @@ If you want to get the alignment between the audio and the text, you can use the fi ``` -If you want to train the model, test it and do the alignment, you can use the script below to execute stage 0, stage 1, stage 2, and stage 3 : +If you want to train the model, test it and do the alignment, you can use the script below to execute stage 0, stage 1, stage 2, and stage 3 : ```bash bash run.sh --stage 0 --stop_stage 4 @@ -320,26 +318,26 @@ CUDA_VISIBLE_DEVICES= ./local/align.sh conf/conformer.yaml exp/conformer/checkpo ## Stage 5: Single Audio File Inference -In some situations, you want to use the trained model to do the inference for the single audio file. You can use stage 5. The code is shown below +In some situations, you want to use the trained model to do the inference for the single audio file. You can use stage 5. The code is shown below ```bash if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then # test a single .wav file - CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/test_hub.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1 fi ``` -you can train the model by yourself using ```bash run.sh --stage 0 --stop_stage 3```, or you can download the pretrained model by the script below: +you can train the model by yourself using ```bash run.sh --stage 0 --stop_stage 3```, or you can download the pretrained model through the script below: ```bash wget https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/transformer.model.tar.gz tar xzvf transformer.model.tar.gz ``` -You need to prepare an audio file, please confirm the sample rate of the audio is 16K. Assume the path of the audio file is ```data/test_audio.wav```, you can get the result by running the script below. +You need to prepare an audio file, please confirm the sample rate of the audio is 16K. Assume the path of the audio file is ```data/test_audio.wav```, you can get the result by running the script below. 
```bash -CUDA_VISIBLE_DEVICES= ./local/test_wav.sh conf/transformer.yaml exp/transformer/checkpoints/avg_20 data/test_audio.wav +CUDA_VISIBLE_DEVICES= ./local/test_hub.sh conf/transformer.yaml exp/transformer/checkpoints/avg_20 data/test_audio.wav ``` diff --git a/examples/aishell/asr1/local/align.sh b/examples/aishell/asr1/local/align.sh index 279461aafe19967ac054df815b08e60e351fcc7f..c65d611c43e870fe4024235c3a2f433452251d21 100755 --- a/examples/aishell/asr1/local/align.sh +++ b/examples/aishell/asr1/local/align.sh @@ -18,7 +18,7 @@ mkdir -p ${output_dir} # align dump in `result_file` # .tier, .TextGrid dump in `dir of result_file` python3 -u ${BIN_DIR}/alignment.py \ ---nproc ${ngpu} \ +--ngpu ${ngpu} \ --config ${config_path} \ --result_file ${output_dir}/${type}.align \ --checkpoint_path ${ckpt_prefix} \ diff --git a/examples/aishell/asr1/local/data.sh b/examples/aishell/asr1/local/data.sh index 41843231846fde71a4a657599e081ff1f6acd281..3657fd7b6ac9ed3e016ddecf16551bab275e377d 100755 --- a/examples/aishell/asr1/local/data.sh +++ b/examples/aishell/asr1/local/data.sh @@ -8,7 +8,7 @@ dict_dir=data/lang_char mkdir -p data mkdir -p ${dict_dir} -TARGET_DIR=${MAIN_ROOT}/examples/dataset +TARGET_DIR=${MAIN_ROOT}/dataset mkdir -p ${TARGET_DIR} if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then diff --git a/examples/aishell/asr1/local/export.sh b/examples/aishell/asr1/local/export.sh index b562218e7a76c91d3b906d2c099218bf492627a8..6b646b46903f33563ab08604f4539551c78394b4 100755 --- a/examples/aishell/asr1/local/export.sh +++ b/examples/aishell/asr1/local/export.sh @@ -13,7 +13,7 @@ ckpt_path_prefix=$2 jit_model_export_path=$3 python3 -u ${BIN_DIR}/export.py \ ---nproc ${ngpu} \ +--ngpu ${ngpu} \ --config ${config_path} \ --checkpoint_path ${ckpt_path_prefix} \ --export_path ${jit_model_export_path} diff --git a/examples/aishell/asr1/local/test.sh b/examples/aishell/asr1/local/test.sh index 47bd2f6338a7d062b094c327f39f5362fae39865..da159de734a40996eed297beb147c23962b7fba3 100755 --- a/examples/aishell/asr1/local/test.sh +++ b/examples/aishell/asr1/local/test.sh @@ -34,7 +34,7 @@ for type in attention ctc_greedy_search; do output_dir=${ckpt_prefix} mkdir -p ${output_dir} python3 -u ${BIN_DIR}/test.py \ - --nproc ${ngpu} \ + --ngpu ${ngpu} \ --config ${config_path} \ --result_file ${output_dir}/${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ @@ -53,7 +53,7 @@ for type in ctc_prefix_beam_search attention_rescoring; do output_dir=${ckpt_prefix} mkdir -p ${output_dir} python3 -u ${BIN_DIR}/test.py \ - --nproc ${ngpu} \ + --ngpu ${ngpu} \ --config ${config_path} \ --result_file ${output_dir}/${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ diff --git a/examples/aishell/asr1/local/test_hub.sh b/examples/aishell/asr1/local/test_hub.sh index 6e78ec784bb8cf1445ebbe83fd39f4c8a441f418..900ccc4b1eeb94302516380960bdea015ba9f341 100755 --- a/examples/aishell/asr1/local/test_hub.sh +++ b/examples/aishell/asr1/local/test_hub.sh @@ -12,6 +12,17 @@ config_path=$1 ckpt_prefix=$2 audio_file=$3 +mkdir -p data +wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/zh/demo_01_03.wav -P data/ +if [ $? -ne 0 ]; then + exit 1 +fi + +if [ ! 
-f ${audio_file} ]; then + echo "Plase input the right audio_file path" + exit 1 +fi + chunk_mode=false if [[ ${config_path} =~ ^.*chunk_.*yaml$ ]];then chunk_mode=true @@ -29,7 +40,7 @@ for type in attention_rescoring; do output_dir=${ckpt_prefix} mkdir -p ${output_dir} python3 -u ${BIN_DIR}/test_hub.py \ - --nproc ${ngpu} \ + --ngpu ${ngpu} \ --config ${config_path} \ --result_file ${output_dir}/${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ diff --git a/examples/aishell/asr1/local/tlg.sh b/examples/aishell/asr1/local/tlg.sh index f5287f79432dd62a4349ab21e45b773ca184171a..7e1665dddf2732155378cd6e0ba29da0f08d4615 100755 --- a/examples/aishell/asr1/local/tlg.sh +++ b/examples/aishell/asr1/local/tlg.sh @@ -9,7 +9,7 @@ lmtype=srilm source utils/parse_options.sh -data=${MAIN_ROOT}/examples/dataset/${corpus} +data=${MAIN_ROOT}/dataset/${corpus} lexicon=$data/resource_aishell/lexicon.txt text=$data/data_aishell/transcript/aishell_transcript_v0.8.txt diff --git a/examples/aishell/asr1/local/train.sh b/examples/aishell/asr1/local/train.sh index 71af3a006deec9032fcefd561fc46cea274d1e10..1c8593bddf5499f50026bd504470114c642055b0 100755 --- a/examples/aishell/asr1/local/train.sh +++ b/examples/aishell/asr1/local/train.sh @@ -29,7 +29,7 @@ mkdir -p exp python3 -u ${BIN_DIR}/train.py \ --seed ${seed} \ ---nproc ${ngpu} \ +--ngpu ${ngpu} \ --config ${config_path} \ --output exp/${ckpt_name} \ --profiler-options "${profiler_options}" \ diff --git a/examples/aishell/asr1/run.sh b/examples/aishell/asr1/run.sh index 41af244539c757758ef1e584b1906ea3bef05aa9..d07a4ed5ce404881a9e7aca311a891f3a990f4da 100644 --- a/examples/aishell/asr1/run.sh +++ b/examples/aishell/asr1/run.sh @@ -7,6 +7,7 @@ stage=0 stop_stage=50 conf_path=conf/conformer.yaml avg_num=20 +audio_file=data/demo_01_03.wav source ${MAIN_ROOT}/utils/parse_options.sh || exit 1; @@ -14,8 +15,6 @@ avg_ckpt=avg_${avg_num} ckpt=$(basename ${conf_path} | awk -F'.' '{print $1}') echo "checkpoint name ${ckpt}" -audio_file="data/test_single_audio.wav" - if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then # prepare data bash ./local/data.sh || exit -1 diff --git a/examples/aishell3/tts3/README.md b/examples/aishell3/tts3/README.md index eb2cca2e2ef538b4a4d0a0ef363b163ef056766b..4953b8c94c3b6f8c6e923c179d1174095c3f0a70 100644 --- a/examples/aishell3/tts3/README.md +++ b/examples/aishell3/tts3/README.md @@ -17,7 +17,7 @@ tar zxvf data_aishell3.tgz -C data_aishell3 ``` ### Get MFA Result and Extract We use [MFA2.x](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for aishell3_fastspeech2. -You can download from here [aishell3_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/with_tone/aishell3_alignment_tone.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) (use MFA1.x now) of our repo. +You can download from here [aishell3_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/with_tone/aishell3_alignment_tone.tar.gz), or train your own MFA model reference to [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) (use MFA1.x now) of our repo. ## Get Started Assume the path to the dataset is `~/datasets/data_aishell3`. 
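As a quick orientation for the data-preparation steps referenced in the README hunk above, a hedged sketch of fetching and unpacking the corpus and the MFA alignments into the assumed `~/datasets` layout. Only the alignment URL and the `tar` command are quoted in the README itself; the OpenSLR download URL and the target directory are assumptions.

```bash
# Hedged sketch: lay out AISHELL-3 and the MFA alignments the way the README
# assumes (~/datasets/data_aishell3). The OpenSLR mirror URL is an assumption.
mkdir -p ~/datasets && cd ~/datasets

# corpus
wget https://www.openslr.org/resources/93/data_aishell3.tgz
mkdir -p data_aishell3
tar zxvf data_aishell3.tgz -C data_aishell3

# MFA alignment results used to get durations for fastspeech2
wget https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/with_tone/aishell3_alignment_tone.tar.gz
tar zxvf aishell3_alignment_tone.tar.gz
```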
diff --git a/examples/aishell3/tts3/conf/default.yaml b/examples/aishell3/tts3/conf/default.yaml index 1dd782dbeed656429499cf3518e0becda0619202..0159c12f97a682e26447cf5aa044683beeba6409 100644 --- a/examples/aishell3/tts3/conf/default.yaml +++ b/examples/aishell3/tts3/conf/default.yaml @@ -45,7 +45,6 @@ model: postnet_layers: 5 # number of layers of postnset postnet_filts: 5 # filter size of conv layers in postnet postnet_chans: 256 # number of channels of conv layers in postnet - use_masking: True # whether to apply masking for padded part in loss calculation use_scaled_pos_enc: True # whether to use scaled positional encoding encoder_normalize_before: True # whether to perform layer normalization before the input decoder_normalize_before: True # whether to perform layer normalization before the input diff --git a/examples/aishell3/vc0/README.md b/examples/aishell3/vc0/README.md index fa5c6694196b07cab186601373030317bdc87f95..93cb08bd06251a1323187e932d7fb9b896f6d18f 100644 --- a/examples/aishell3/vc0/README.md +++ b/examples/aishell3/vc0/README.md @@ -45,7 +45,8 @@ We use Montreal Force Aligner 1.0. The label in aishell3 include pinyin,so th We use [lexicon.txt](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/lexicon.txt) as the lexicon. -You can download the alignment results from here [alignment_aishell3.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/alignment_aishell3.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) (use MFA1.x now) of our repo. +You can download the alignment results from here [alignment_aishell3.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/alignment_aishell3.tar.gz), or train your own MFA model reference to [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) (use MFA1.x now) of our repo. + ```bash if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then diff --git a/examples/aishell3/vc1/README.md b/examples/aishell3/vc1/README.md index 3784d4d14a96a4c6b4923b68dcd93e6eb2377dec..974b84cade3536da775fcfb2aab5ae0db3df0683 100644 --- a/examples/aishell3/vc1/README.md +++ b/examples/aishell3/vc1/README.md @@ -18,7 +18,7 @@ tar zxvf data_aishell3.tgz -C data_aishell3 ``` ### Get MFA Result and Extract We use [MFA2.x](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for aishell3_fastspeech2. -You can download from here [aishell3_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/with_tone/aishell3_alignment_tone.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) (use MFA1.x now) of our repo. +You can download from here [aishell3_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/with_tone/aishell3_alignment_tone.tar.gz), or train your own MFA model reference to [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) (use MFA1.x now) of our repo. ## Pretrained GE2E Model We use pretrained GE2E model to generate spwaker embedding for each sentence. 
diff --git a/examples/aishell3/vc1/conf/default.yaml b/examples/aishell3/vc1/conf/default.yaml index bdd2a765e1dee40324a05226be7bc590448c7c49..78c32525717eb7becde0f90271a26060c36e6644 100644 --- a/examples/aishell3/vc1/conf/default.yaml +++ b/examples/aishell3/vc1/conf/default.yaml @@ -45,7 +45,6 @@ model: postnet_layers: 5 # number of layers of postnset postnet_filts: 5 # filter size of conv layers in postnet postnet_chans: 256 # number of channels of conv layers in postnet - use_masking: True # whether to apply masking for padded part in loss calculation use_scaled_pos_enc: True # whether to use scaled positional encoding encoder_normalize_before: True # whether to perform layer normalization before the input decoder_normalize_before: True # whether to perform layer normalization before the input diff --git a/examples/aishell3/voc1/README.md b/examples/aishell3/voc1/README.md index ad5f81b1ece5ebce7850acc1659d92d811ff74a0..9189eb728f7309d3ee771b6eedff2da296a776fc 100644 --- a/examples/aishell3/voc1/README.md +++ b/examples/aishell3/voc1/README.md @@ -15,7 +15,7 @@ tar zxvf data_aishell3.tgz -C data_aishell3 ``` ### Get MFA Result and Extract We use [MFA2.x](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for aishell3_fastspeech2. -You can download from here [aishell3_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/with_tone/aishell3_alignment_tone.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) (use MFA1.x now) of our repo. +You can download from here [aishell3_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/with_tone/aishell3_alignment_tone.tar.gz), or train your own MFA model reference to [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) (use MFA1.x now) of our repo. ## Get Started Assume the path to the dataset is `~/datasets/data_aishell3`. 
diff --git a/examples/callcenter/asr1/local/align.sh b/examples/callcenter/asr1/local/align.sh index b679e2ea7fcb1d5ebd5a50dc88cb90e59e3e4789..681c77edea276fd675e2a3bb01b46528944d7cc0 100755 --- a/examples/callcenter/asr1/local/align.sh +++ b/examples/callcenter/asr1/local/align.sh @@ -23,7 +23,7 @@ mkdir -p ${output_dir} # align dump in `result_file` # .tier, .TextGrid dump in `dir of result_file` python3 -u ${BIN_DIR}/alignment.py \ ---nproc ${ngpu} \ +--ngpu ${ngpu} \ --config ${config_path} \ --result_file ${output_dir}/${type}.align \ --checkpoint_path ${ckpt_prefix} \ diff --git a/examples/callcenter/asr1/local/export.sh b/examples/callcenter/asr1/local/export.sh index d5f912e9033b1d71423d87ef0a9cf4b1991d3563..36de2d6677bc8fe20cfd3527b6e5d2886930c8e2 100755 --- a/examples/callcenter/asr1/local/export.sh +++ b/examples/callcenter/asr1/local/export.sh @@ -13,7 +13,7 @@ ckpt_path_prefix=$2 jit_model_export_path=$3 python3 -u ${BIN_DIR}/export.py \ ---nproc ${ngpu} \ +--ngpu ${ngpu} \ --config ${config_path} \ --checkpoint_path ${ckpt_path_prefix} \ --export_path ${jit_model_export_path} diff --git a/examples/callcenter/asr1/local/test.sh b/examples/callcenter/asr1/local/test.sh index 0aa99e196731e5e68489013718ca4a77aa4471ce..fc43c5a20c867e9a34c3c726e27e78b6261bfc85 100755 --- a/examples/callcenter/asr1/local/test.sh +++ b/examples/callcenter/asr1/local/test.sh @@ -28,7 +28,7 @@ for type in attention ctc_greedy_search; do output_dir=${ckpt_prefix} mkdir -p ${output_dir} python3 -u ${BIN_DIR}/test.py \ - --nproc ${ngpu} \ + --ngpu ${ngpu} \ --config ${config_path} \ --result_file ${output_dir}/${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ @@ -47,7 +47,7 @@ for type in ctc_prefix_beam_search attention_rescoring; do output_dir=${ckpt_prefix} mkdir -p ${output_dir} python3 -u ${BIN_DIR}/test.py \ - --nproc ${ngpu} \ + --ngpu ${ngpu} \ --config ${config_path} \ --result_file ${output_dir}/${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ diff --git a/examples/callcenter/asr1/local/train.sh b/examples/callcenter/asr1/local/train.sh index eb8f86626f8988890c89dc29accb5ae1f172e1d6..3e92fd1624db300b69dfe454cb9c0aee4eac8874 100755 --- a/examples/callcenter/asr1/local/train.sh +++ b/examples/callcenter/asr1/local/train.sh @@ -22,7 +22,7 @@ if [ ${seed} != 0 ]; then fi python3 -u ${BIN_DIR}/train.py \ ---nproc ${ngpu} \ +--ngpu ${ngpu} \ --config ${config_path} \ --output exp/${ckpt_name} \ --seed ${seed} diff --git a/examples/csmsc/tts2/README.md b/examples/csmsc/tts2/README.md index 6f84d8f233b00cefd6508168982bff5195b2be7e..c9332967d685a302a68d3e2a5303421cc50f834b 100644 --- a/examples/csmsc/tts2/README.md +++ b/examples/csmsc/tts2/README.md @@ -7,7 +7,7 @@ Download CSMSC from it's [Official Website](https://test.data-baker.com/data/ind ### Get MFA Result and Extract We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for SPEEDYSPEECH. -You can download from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) of our repo. +You can download from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model reference to [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) of our repo. 
## Get Started Assume the path to the dataset is `~/datasets/BZNSYP`. diff --git a/examples/csmsc/tts3/README.md b/examples/csmsc/tts3/README.md index d2a8e66361b3b13323a29db3083676f2bf74884e..cb7d49f900050f0cde1bf0ecea1bac29d52722cf 100644 --- a/examples/csmsc/tts3/README.md +++ b/examples/csmsc/tts3/README.md @@ -7,7 +7,7 @@ Download CSMSC from it's [Official Website](https://test.data-baker.com/data/ind ### Get MFA Result and Extract We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for fastspeech2. -You can download from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) of our repo. +You can download from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model reference to [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) of our repo. ## Get Started Assume the path to the dataset is `~/datasets/BZNSYP`. @@ -209,8 +209,8 @@ Static model can be downloaded here [fastspeech2_nosil_baker_static_0.4.zip](htt Model | Step | eval/loss | eval/l1_loss | eval/duration_loss | eval/pitch_loss| eval/energy_loss :-------------:| :------------:| :-----: | :-----: | :--------: |:--------:|:---------: -default| 2(gpu) x 76000|1.0991|0.59132|0.035815| 0.31915| 0.15287| -conformer| 2(gpu) x 76000|||||| +default| 2(gpu) x 76000|1.0991|0.59132|0.035815|0.31915|0.15287| +conformer| 2(gpu) x 76000|1.0675|0.56103|0.035869|0.31553|0.15509| FastSpeech2 checkpoint contains files listed below. ```text diff --git a/examples/csmsc/tts3/conf/conformer.yaml b/examples/csmsc/tts3/conf/conformer.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a34ef318d7589014a85473cd6570b7fa18532017 --- /dev/null +++ b/examples/csmsc/tts3/conf/conformer.yaml @@ -0,0 +1,109 @@ +########################################################### +# FEATURE EXTRACTION SETTING # +########################################################### + +fs: 24000 # sr +n_fft: 2048 # FFT size. +n_shift: 300 # Hop size. +win_length: 1200 # Window length. + # If set to null, it will be the same as fft_size. +window: "hann" # Window function. + +# Only used for feats_type != raw + +fmin: 80 # Minimum frequency of Mel basis. +fmax: 7600 # Maximum frequency of Mel basis. +n_mels: 80 # The number of mel basis. + +# Only used for the model using pitch features (e.g. FastSpeech2) +f0min: 80 # Maximum f0 for pitch extraction. +f0max: 400 # Minimum f0 for pitch extraction. 
+ + +########################################################### +# DATA SETTING # +########################################################### +batch_size: 64 +num_workers: 4 + + +########################################################### +# MODEL SETTING # +########################################################### +model: + adim: 384 # attention dimension + aheads: 2 # number of attention heads + elayers: 4 # number of encoder layers + eunits: 1536 # number of encoder ff units + dlayers: 4 # number of decoder layers + dunits: 1536 # number of decoder ff units + positionwise_layer_type: conv1d # type of position-wise layer + positionwise_conv_kernel_size: 3 # kernel size of position wise conv layer + duration_predictor_layers: 2 # number of layers of duration predictor + duration_predictor_chans: 256 # number of channels of duration predictor + duration_predictor_kernel_size: 3 # filter size of duration predictor + postnet_layers: 5 # number of layers of postnset + postnet_filts: 5 # filter size of conv layers in postnet + postnet_chans: 256 # number of channels of conv layers in postnet + encoder_normalize_before: True # whether to perform layer normalization before the input + decoder_normalize_before: True # whether to perform layer normalization before the input + reduction_factor: 1 # reduction factor + encoder_type: conformer # encoder type + decoder_type: conformer # decoder type + conformer_pos_enc_layer_type: rel_pos # conformer positional encoding type + conformer_self_attn_layer_type: rel_selfattn # conformer self-attention type + conformer_activation_type: swish # conformer activation type + use_macaron_style_in_conformer: true # whether to use macaron style in conformer + use_cnn_in_conformer: true # whether to use CNN in conformer + conformer_enc_kernel_size: 7 # kernel size in CNN module of conformer-based encoder + conformer_dec_kernel_size: 31 # kernel size in CNN module of conformer-based decoder + init_type: xavier_uniform # initialization type + transformer_enc_dropout_rate: 0.2 # dropout rate for transformer encoder layer + transformer_enc_positional_dropout_rate: 0.2 # dropout rate for transformer encoder positional encoding + transformer_enc_attn_dropout_rate: 0.2 # dropout rate for transformer encoder attention layer + transformer_dec_dropout_rate: 0.2 # dropout rate for transformer decoder layer + transformer_dec_positional_dropout_rate: 0.2 # dropout rate for transformer decoder positional encoding + transformer_dec_attn_dropout_rate: 0.2 # dropout rate for transformer decoder attention layer + pitch_predictor_layers: 5 # number of conv layers in pitch predictor + pitch_predictor_chans: 256 # number of channels of conv layers in pitch predictor + pitch_predictor_kernel_size: 5 # kernel size of conv leyers in pitch predictor + pitch_predictor_dropout: 0.5 # dropout rate in pitch predictor + pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch + pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch + stop_gradient_from_pitch_predictor: true # whether to stop the gradient from pitch predictor to encoder + energy_predictor_layers: 2 # number of conv layers in energy predictor + energy_predictor_chans: 256 # number of channels of conv layers in energy predictor + energy_predictor_kernel_size: 3 # kernel size of conv leyers in energy predictor + energy_predictor_dropout: 0.5 # dropout rate in energy predictor + energy_embed_kernel_size: 1 # kernel size of conv embedding layer for energy + energy_embed_dropout: 0.0 # 
dropout rate after conv embedding layer for energy + stop_gradient_from_energy_predictor: false # whether to stop the gradient from energy predictor to encoder + + + +########################################################### +# UPDATER SETTING # +########################################################### +updater: + use_masking: True # whether to apply masking for padded part in loss calculation + + + +########################################################### +# OPTIMIZER SETTING # +########################################################### +optimizer: + optim: adam # optimizer type + learning_rate: 0.001 # learning rate + +########################################################### +# TRAINING SETTING # +########################################################### +max_epoch: 1000 +num_snapshots: 5 + + +########################################################### +# OTHER SETTING # +########################################################### +seed: 10086 diff --git a/examples/csmsc/tts3/conf/default.yaml b/examples/csmsc/tts3/conf/default.yaml index 32e58c4c6ec22a3d22471c818e38bcbeece35d75..55dca6d859357dc716555140d40ae6b06a6dc21b 100644 --- a/examples/csmsc/tts3/conf/default.yaml +++ b/examples/csmsc/tts3/conf/default.yaml @@ -45,7 +45,6 @@ model: postnet_layers: 5 # number of layers of postnset postnet_filts: 5 # filter size of conv layers in postnet postnet_chans: 256 # number of channels of conv layers in postnet - use_masking: True # whether to apply masking for padded part in loss calculation use_scaled_pos_enc: True # whether to use scaled positional encoding encoder_normalize_before: True # whether to perform layer normalization before the input decoder_normalize_before: True # whether to perform layer normalization before the input diff --git a/examples/csmsc/voc1/README.md b/examples/csmsc/voc1/README.md index cb7a8ecc74ffca511bbcd71f3b98233240ddaf67..e6ee7b4a60ede20cc0f51f7d3496cae15f32f60d 100644 --- a/examples/csmsc/voc1/README.md +++ b/examples/csmsc/voc1/README.md @@ -5,8 +5,8 @@ This example contains code used to train a [parallel wavegan](http://arxiv.org/a Download CSMSC from the [official website](https://www.data-baker.com/data/index/source) and extract it to `~/datasets`. Then the dataset is in directory `~/datasets/BZNSYP`. ### Get MFA Result and Extract -We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) results to cut silence in the edge of audio. -You can download from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) of our repo. +We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) results to cut silence in the edge of audio. +You can download from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model reference to [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) of our repo. ## Get Started Assume the path to the dataset is `~/datasets/BZNSYP`. 
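The new `conf/conformer.yaml` above keeps `use_masking` but moves it from the `model:` block into an `updater:` block, while `default.yaml` simply drops it from `model:`. A minimal, assumed sketch of how a training script might consume that layout (PyYAML and the variable names are illustrative; the patch itself does not show how PaddleSpeech parses the file):

```python
# Minimal sketch: reading the restructured config, assuming PyYAML is available.
import yaml

with open("conf/conformer.yaml") as f:
    config = yaml.safe_load(f)

model_kwargs = config["model"]                       # encoder/decoder/predictor settings
use_masking = config["updater"]["use_masking"]       # loss masking now lives under `updater:`

print(f"conformer encoder type: {model_kwargs['encoder_type']}")
print(f"apply loss masking for padded frames: {use_masking}")
```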
diff --git a/examples/csmsc/voc3/README.md b/examples/csmsc/voc3/README.md index 615cb09e014eb07b0bf2f548ed6d97bf81532eff..52ca51e97a703d71160a6bf229a4ed098eb48099 100644 --- a/examples/csmsc/voc3/README.md +++ b/examples/csmsc/voc3/README.md @@ -5,8 +5,8 @@ This example contains code used to train a [Multi Band MelGAN](https://arxiv.org Download CSMSC from the [official website](https://www.data-baker.com/data/index/source) and extract it to `~/datasets`. Then the dataset is in directory `~/datasets/BZNSYP`. ### Get MFA Result and Extract -We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) results to cut silence in the edge of audio. -You can download from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/use_mfa) of our repo. +We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) results to cut silence in the edge of audio. +You can download from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model reference to [mfa example](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/mfa) of our repo. ## Get Started Assume the path to the dataset is `~/datasets/BZNSYP`. diff --git a/audio/examples/sound_classification/README.md b/examples/esc50/README.md similarity index 74% rename from audio/examples/sound_classification/README.md rename to examples/esc50/README.md index 86a54cb34af1cec4763aebbe9fe60e5d5faa5340..66409754d464b7a674bc28050350fef6c77ec48a 100644 --- a/audio/examples/sound_classification/README.md +++ b/examples/esc50/README.md @@ -21,22 +21,17 @@ PaddleAudio提供了PANNs的CNN14、CNN10和CNN6的预训练模型,可供用 ### 模型训练 -以环境声音分类数据集`ESC50`为示例,运行下面的命令,可在训练集上进行模型的finetune,支持单机的单卡训练和多卡训练。关于如何使用`paddle.distributed.launch`启动多卡训练,请查看[单机多卡训练](https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/02_paddle2.0_develop/06_device_cn.html)。 +以环境声音分类数据集`ESC50`为示例,运行下面的命令,可在训练集上进行模型的finetune,支持单机的单卡训练和多卡训练。 -单卡训练: +启动训练: ```shell -$ python train.py --epochs 50 --batch_size 16 --checkpoint_dir ./checkpoint --save_freq 10 +$ CUDA_VISIBLE_DEVICES=0 ./run.sh 1 ``` -多卡训练: -```shell -$ unset CUDA_VISIBLE_DEVICES -$ python -m paddle.distributed.launch --gpus "0,1" train.py --epochs 50 --batch_size 16 --num_worker 4 --checkpoint_dir ./checkpoint --save_freq 10 -``` +`paddlespeech/cls/exps/panns/train.py` 脚本中可支持配置的参数: -可支持配置的参数: - -- `device`: 选用什么设备进行训练,可选cpu或gpu,默认为gpu。如使用gpu训练则参数gpus指定GPU卡号。 +- `device`: 指定模型预测时使用的设备。 +- `feat_backend`: 选择提取特征的后端,可选`'numpy'`或`'paddle'`,默认为`'numpy'`。 - `epochs`: 训练轮次,默认为50。 - `learning_rate`: Fine-tune的学习率;默认为5e-5。 - `batch_size`: 批处理大小,请结合显存情况进行调整,若出现显存不足,请适当调低这一参数;默认为16。 @@ -47,9 +42,9 @@ $ python -m paddle.distributed.launch --gpus "0,1" train.py --epochs 50 --batch_ 示例代码中使用的预训练模型为`CNN14`,如果想更换为其他预训练模型,可通过以下方式执行: ```python -from model import SoundClassifier from paddleaudio.datasets import ESC50 -from paddleaudio.models.panns import cnn14, cnn10, cnn6 +from paddlespeech.cls.models import SoundClassifier +from paddlespeech.cls.models import cnn14, cnn10, cnn6 # CNN14 backbone = cnn14(pretrained=True, extract_embedding=True) @@ -67,12 +62,14 @@ model = SoundClassifier(backbone, num_class=len(ESC50.label_list)) ### 模型预测 ```shell -python -u predict.py --wav ./dog.wav --top_k 3 --checkpoint ./checkpoint/epoch_50/model.pdparams +$ 
CUDA_VISIBLE_DEVICES=0 ./run.sh 2 ``` -可支持配置的参数: -- `device`: 选用什么设备进行训练,可选cpu或gpu,默认为gpu。如使用gpu训练则参数gpus指定GPU卡号。 +`paddlespeech/cls/exps/panns/predict.py` 脚本中可支持配置的参数: + +- `device`: 指定模型预测时使用的设备。 - `wav`: 指定预测的音频文件。 +- `feat_backend`: 选择提取特征的后端,可选`'numpy'`或`'paddle'`,默认为`'numpy'`。 - `top_k`: 预测显示的top k标签的得分,默认为1。 - `checkpoint`: 模型参数checkpoint文件。 @@ -91,10 +88,10 @@ Cat: 6.579841738130199e-06 模型训练结束后,可以将已保存的动态图参数导出成静态图的模型和参数,然后实施静态图的部署。 ```shell -python -u export_model.py --checkpoint ./checkpoint/epoch_50/model.pdparams --output_dir ./export +$ CUDA_VISIBLE_DEVICES=0 ./run.sh 3 ``` -可支持配置的参数: +`paddlespeech/cls/exps/panns/export_model.py` 脚本中可支持配置的参数: - `checkpoint`: 模型参数checkpoint文件。 - `output_dir`: 导出静态图模型和参数文件的保存目录。 @@ -109,8 +106,13 @@ export #### 2. 模型部署和预测 -`deploy/python/predict.py` 脚本使用了`paddle.inference`模块下的api,提供了python端部署的示例: +`paddlespeech/cls/exps/panns/deploy/predict.py` 脚本使用了`paddle.inference`模块下的api,提供了python端部署的示例: -```sh -python deploy/python/predict.py --model_dir ./export --device gpu +```shell +$ CUDA_VISIBLE_DEVICES=0 ./run.sh 4 ``` + +`paddlespeech/cls/exps/panns/deploy/predict.py` 脚本中可支持配置的主要参数: +- `device`: 指定模型预测时使用的设备。 +- `model_dir`: 导出静态图模型和参数文件的保存目录。 +- `wav`: 指定预测的音频文件。 diff --git a/examples/esc50/cls0/local/export.sh b/examples/esc50/cls0/local/export.sh new file mode 100755 index 0000000000000000000000000000000000000000..160dc7432d28c650281684d0c7a539023bfd0fb0 --- /dev/null +++ b/examples/esc50/cls0/local/export.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +ckpt_dir=$1 +output_dir=$2 + +python3 ${BIN_DIR}/export_model.py \ +--checkpoint ${ckpt_dir}/model.pdparams \ +--output_dir ${output_dir} diff --git a/examples/esc50/cls0/local/infer.sh b/examples/esc50/cls0/local/infer.sh new file mode 100755 index 0000000000000000000000000000000000000000..bc03d68106e6123e0f7b4dfff45bb0d9182f28c1 --- /dev/null +++ b/examples/esc50/cls0/local/infer.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +audio_file=$1 +ckpt_dir=$2 +feat_backend=$3 + +python3 ${BIN_DIR}/predict.py \ +--wav ${audio_file} \ +--feat_backend ${feat_backend} \ +--top_k 10 \ +--checkpoint ${ckpt_dir}/model.pdparams diff --git a/examples/esc50/cls0/local/static_model_infer.sh b/examples/esc50/cls0/local/static_model_infer.sh new file mode 100755 index 0000000000000000000000000000000000000000..9b3abb5d75344d1ce19b029eae574e06704656c1 --- /dev/null +++ b/examples/esc50/cls0/local/static_model_infer.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +device=$1 +model_dir=$2 +audio_file=$3 + +python3 ${BIN_DIR}/deploy/predict.py \ +--device ${device} \ +--model_dir ${model_dir} \ +--wav ${audio_file} diff --git a/examples/esc50/cls0/local/train.sh b/examples/esc50/cls0/local/train.sh new file mode 100755 index 0000000000000000000000000000000000000000..0f0f3d091a314fdadf39c27d4c35e9debaf0984c --- /dev/null +++ b/examples/esc50/cls0/local/train.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +ngpu=$1 +feat_backend=$2 + +num_epochs=50 +batch_size=16 +ckpt_dir=./checkpoint +save_freq=10 + +if [ ${ngpu} -gt 0 ]; then + python3 -m paddle.distributed.launch --gpus $CUDA_VISIBLE_DEVICES ${BIN_DIR}/train.py \ + --epochs ${num_epochs} \ + --feat_backend ${feat_backend} \ + --batch_size ${batch_size} \ + --checkpoint_dir ${ckpt_dir} \ + --save_freq ${save_freq} +else + python3 ${BIN_DIR}/train.py \ + --epochs ${num_epochs} \ + --feat_backend ${feat_backend} \ + --batch_size ${batch_size} \ + --checkpoint_dir ${ckpt_dir} \ + --save_freq ${save_freq} +fi diff --git a/examples/esc50/cls0/path.sh b/examples/esc50/cls0/path.sh new file mode 100644 index 
0000000000000000000000000000000000000000..3eff28e48caca0cfa18173639c4ecc905b73d4fc --- /dev/null +++ b/examples/esc50/cls0/path.sh @@ -0,0 +1,13 @@ +#!/bin/bash +export MAIN_ROOT=`realpath ${PWD}/../../../` + +export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} +export LC_ALL=C + +export PYTHONDONTWRITEBYTECODE=1 +# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C +export PYTHONIOENCODING=UTF-8 +export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} + +MODEL=panns +export BIN_DIR=${MAIN_ROOT}/paddlespeech/cls/exps/${MODEL} \ No newline at end of file diff --git a/examples/esc50/cls0/run.sh b/examples/esc50/cls0/run.sh new file mode 100755 index 0000000000000000000000000000000000000000..7283aa8d7cf85c2acd5bede3eaf77ffc03f848f3 --- /dev/null +++ b/examples/esc50/cls0/run.sh @@ -0,0 +1,33 @@ +#!/bin/bash +set -e +source path.sh + +ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') + +stage=$1 +stop_stage=100 +feat_backend=numpy +audio_file=~/cat.wav +ckpt_dir=./checkpoint/epoch_50 +output_dir=./export +infer_device=cpu + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + ./local/train.sh ${ngpu} ${feat_backend} || exit -1 + exit 0 +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + ./local/infer.sh ${audio_file} ${ckpt_dir} ${feat_backend} || exit -1 + exit 0 +fi + +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + ./local/export.sh ${ckpt_dir} ${output_dir} || exit -1 + exit 0 +fi + +if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then + ./local/static_model_infer.sh ${infer_device} ${output_dir} ${audio_file} || exit -1 + exit 0 +fi diff --git a/examples/librispeech/asr0/local/data.sh b/examples/librispeech/asr0/local/data.sh index fa2c9b2f7df9833ffa08d78accf8af9745ee186c..b97e8c211b13f48e1e775d8271f3efd890bba041 100755 --- a/examples/librispeech/asr0/local/data.sh +++ b/examples/librispeech/asr0/local/data.sh @@ -10,7 +10,7 @@ source ${MAIN_ROOT}/utils/parse_options.sh mkdir -p data mkdir -p ${dict_dir} -TARGET_DIR=${MAIN_ROOT}/examples/dataset +TARGET_DIR=${MAIN_ROOT}/dataset mkdir -p ${TARGET_DIR} if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then diff --git a/examples/librispeech/asr0/local/export.sh b/examples/librispeech/asr0/local/export.sh index a5e62c28d2fa23a5ef9b9e2a0281b025aa943a30..426a72fe54cfae8af4c564611e3b794f988f5468 100755 --- a/examples/librispeech/asr0/local/export.sh +++ b/examples/librispeech/asr0/local/export.sh @@ -14,7 +14,7 @@ jit_model_export_path=$3 model_type=$4 python3 -u ${BIN_DIR}/export.py \ ---nproc ${ngpu} \ +--ngpu ${ngpu} \ --config ${config_path} \ --checkpoint_path ${ckpt_path_prefix} \ --export_path ${jit_model_export_path} \ diff --git a/examples/librispeech/asr0/local/test.sh b/examples/librispeech/asr0/local/test.sh index 4d00f30b852da5a370f5d4934f3caadd2b833c00..a627ef72274e3d67219701d34956fdf2b3a364b2 100755 --- a/examples/librispeech/asr0/local/test.sh +++ b/examples/librispeech/asr0/local/test.sh @@ -19,7 +19,7 @@ if [ $? 
-ne 0 ]; then fi python3 -u ${BIN_DIR}/test.py \ ---nproc ${ngpu} \ +--ngpu ${ngpu} \ --config ${config_path} \ --result_file ${ckpt_prefix}.rsl \ --checkpoint_path ${ckpt_prefix} \ diff --git a/examples/librispeech/asr0/local/test_hub.sh b/examples/librispeech/asr0/local/test_hub.sh index 2e32f24a0bf7dcf60acf029b0c692a667132d1ee..560d6758d3288f59347c2b3e3cc86741d2d6719b 100755 --- a/examples/librispeech/asr0/local/test_hub.sh +++ b/examples/librispeech/asr0/local/test_hub.sh @@ -13,6 +13,17 @@ ckpt_prefix=$2 model_type=$3 audio_file=$4 +mkdir -p data +wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/en/demo_002_en.wav -P data/ +if [ $? -ne 0 ]; then + exit 1 +fi + +if [ ! -f ${audio_file} ]; then + echo "Plase input the right audio_file path" + exit 1 +fi + # download language model bash local/download_lm_en.sh if [ $? -ne 0 ]; then @@ -20,7 +31,7 @@ if [ $? -ne 0 ]; then fi python3 -u ${BIN_DIR}/test_hub.py \ ---nproc ${ngpu} \ +--ngpu ${ngpu} \ --config ${config_path} \ --result_file ${ckpt_prefix}.rsl \ --checkpoint_path ${ckpt_prefix} \ diff --git a/examples/librispeech/asr0/local/train.sh b/examples/librispeech/asr0/local/train.sh index 519df7fe9387a215f6d8835d90869ec0d4d7f5e5..0479398ffe96cef6fd9db1f1ebb1b35ab1099a56 100755 --- a/examples/librispeech/asr0/local/train.sh +++ b/examples/librispeech/asr0/local/train.sh @@ -21,7 +21,7 @@ if [ ${seed} != 0 ]; then fi python3 -u ${BIN_DIR}/train.py \ ---nproc ${ngpu} \ +--ngpu ${ngpu} \ --config ${config_path} \ --output exp/${ckpt_name} \ --model_type ${model_type} \ diff --git a/examples/librispeech/asr0/run.sh b/examples/librispeech/asr0/run.sh index 6c4fc4116c85ddff2d6dc26f285b7fc477f16289..1253b409bd277314399a1fccf7331099d9f0e3b9 100755 --- a/examples/librispeech/asr0/run.sh +++ b/examples/librispeech/asr0/run.sh @@ -8,13 +8,14 @@ stop_stage=100 conf_path=conf/deepspeech2.yaml avg_num=30 model_type=offline +audio_file=data/demo_002_en.wav + source ${MAIN_ROOT}/utils/parse_options.sh || exit 1; avg_ckpt=avg_${avg_num} ckpt=$(basename ${conf_path} | awk -F'.' 
'{print $1}') echo "checkpoint name ${ckpt}" -audio_file="data/tmp.flac" if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then # prepare data bash ./local/data.sh || exit -1 diff --git a/examples/librispeech/asr1/RESULTS.md b/examples/librispeech/asr1/RESULTS.md index 2ea55fc90f7345a27fd46a850a8d8ddf24be30e9..3dad7acbd017f9ba4bcbadde4ec9bb89a5b24e01 100644 --- a/examples/librispeech/asr1/RESULTS.md +++ b/examples/librispeech/asr1/RESULTS.md @@ -21,7 +21,7 @@ ## Transformer | Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER | | --- | --- | --- | --- | --- | --- | --- | --- | -| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | attention | 6.733129533131917 | 0.047874 | -| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | ctc_greedy_search | 6.733129533131917 | 0.053922 | -| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | ctc_prefix_beam_search | 6.733129533131917 | 0.053427 | -| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | attention_rescoring | 6.733129533131917 | 0.041369 | +| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | attention | 6.725063021977743 | 0.047417 | +| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | ctc_greedy_search | 6.725063021977743 | 0.053922 | +| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | ctc_prefix_beam_search | 6.725063021977743 | 0.053180 | +| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | attention_rescoring | 6.725063021977743 | 0.041026 | diff --git a/examples/librispeech/asr1/local/align.sh b/examples/librispeech/asr1/local/align.sh index 279461aafe19967ac054df815b08e60e351fcc7f..c65d611c43e870fe4024235c3a2f433452251d21 100755 --- a/examples/librispeech/asr1/local/align.sh +++ b/examples/librispeech/asr1/local/align.sh @@ -18,7 +18,7 @@ mkdir -p ${output_dir} # align dump in `result_file` # .tier, .TextGrid dump in `dir of result_file` python3 -u ${BIN_DIR}/alignment.py \ ---nproc ${ngpu} \ +--ngpu ${ngpu} \ --config ${config_path} \ --result_file ${output_dir}/${type}.align \ --checkpoint_path ${ckpt_prefix} \ diff --git a/examples/librispeech/asr1/local/data.sh b/examples/librispeech/asr1/local/data.sh index a0bf9a2d3fe02e8cebfdd9a868d62532a6a26543..3037c366fba14cb3778b3174b28d18ffbfb0b2e0 100755 --- a/examples/librispeech/asr1/local/data.sh +++ b/examples/librispeech/asr1/local/data.sh @@ -19,7 +19,7 @@ source ${MAIN_ROOT}/utils/parse_options.sh mkdir -p data mkdir -p ${dict_dir} -TARGET_DIR=${MAIN_ROOT}/examples/dataset +TARGET_DIR=${MAIN_ROOT}/dataset mkdir -p ${TARGET_DIR} if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then diff --git a/examples/librispeech/asr1/local/export.sh b/examples/librispeech/asr1/local/export.sh index b562218e7a76c91d3b906d2c099218bf492627a8..6b646b46903f33563ab08604f4539551c78394b4 100755 --- a/examples/librispeech/asr1/local/export.sh +++ b/examples/librispeech/asr1/local/export.sh @@ -13,7 +13,7 @@ ckpt_path_prefix=$2 jit_model_export_path=$3 python3 -u ${BIN_DIR}/export.py \ ---nproc ${ngpu} \ +--ngpu ${ngpu} \ --config ${config_path} \ --checkpoint_path ${ckpt_path_prefix} \ --export_path ${jit_model_export_path} diff --git a/examples/librispeech/asr1/local/test.sh b/examples/librispeech/asr1/local/test.sh index ceaa77cfa12b7a073391eaf7d6544f8cac884ced..aa06132e48245cc08b8eb89d453d8ade9db5f075 100755 --- a/examples/librispeech/asr1/local/test.sh +++ b/examples/librispeech/asr1/local/test.sh @@ 
-50,7 +50,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then batch_size=64 fi python3 -u ${BIN_DIR}/test.py \ - --nproc ${ngpu} \ + --ngpu ${ngpu} \ --config ${config_path} \ --result_file ${ckpt_prefix}.${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ @@ -74,7 +74,7 @@ for type in ctc_greedy_search; do batch_size=64 fi python3 -u ${BIN_DIR}/test.py \ - --nproc ${ngpu} \ + --ngpu ${ngpu} \ --config ${config_path} \ --result_file ${ckpt_prefix}.${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ @@ -94,7 +94,7 @@ for type in ctc_prefix_beam_search attention_rescoring; do echo "decoding ${type}" batch_size=1 python3 -u ${BIN_DIR}/test.py \ - --nproc ${ngpu} \ + --ngpu ${ngpu} \ --config ${config_path} \ --result_file ${ckpt_prefix}.${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ diff --git a/examples/librispeech/asr1/local/test_hub.sh b/examples/librispeech/asr1/local/test_hub.sh index dcf242e90c367093d05ca9eb910f5e10784a73e4..ca63cf6ce48d88e1f163b55fc9dbfcaaff0c7fbe 100755 --- a/examples/librispeech/asr1/local/test_hub.sh +++ b/examples/librispeech/asr1/local/test_hub.sh @@ -12,6 +12,17 @@ config_path=$1 ckpt_prefix=$2 audio_file=$3 +mkdir -p data +wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/en/demo_002_en.wav -P data/ +if [ $? -ne 0 ]; then + exit 1 +fi + +if [ ! -f ${audio_file} ]; then + echo "Plase input the right audio_file path" + exit 1 +fi + # bpemode (unigram or bpe) nbpe=5000 bpemode=unigram @@ -36,7 +47,7 @@ for type in attention_rescoring; do output_dir=${ckpt_prefix} mkdir -p ${output_dir} python3 -u ${BIN_DIR}/test_hub.py \ - --nproc ${ngpu} \ + --ngpu ${ngpu} \ --config ${config_path} \ --result_file ${output_dir}/${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ diff --git a/examples/librispeech/asr1/local/train.sh b/examples/librispeech/asr1/local/train.sh index 8f92c64694662b99a681bbf709f8da60d4e0eb0d..275d3a4905654a685a65e1341d2e611bf215fefc 100755 --- a/examples/librispeech/asr1/local/train.sh +++ b/examples/librispeech/asr1/local/train.sh @@ -23,7 +23,7 @@ fi # export FLAGS_conv_workspace_size_limit=4000 python3 -u ${BIN_DIR}/train.py \ ---nproc ${ngpu} \ +--ngpu ${ngpu} \ --config ${config_path} \ --output exp/${ckpt_name} \ --seed ${seed} diff --git a/examples/librispeech/asr1/run.sh b/examples/librispeech/asr1/run.sh index 74f7cbc1a4ecea0bfb4b6ae5064bc6773a4a4024..6dbb86e6cff2772efe0ab18ba1eb5e7f7ff18233 100755 --- a/examples/librispeech/asr1/run.sh +++ b/examples/librispeech/asr1/run.sh @@ -9,6 +9,7 @@ stage=0 stop_stage=100 conf_path=conf/transformer.yaml avg_num=30 +audio_file=data/demo_002_en.wav . ${MAIN_ROOT}/utils/parse_options.sh || exit 1; @@ -16,8 +17,6 @@ avg_ckpt=avg_${avg_num} ckpt=$(basename ${conf_path} | awk -F'.' 
'{print $1}') echo "checkpoint name ${ckpt}" -audio_file="data/tmp.flac" - if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then # prepare data bash ./local/data.sh || exit -1 diff --git a/examples/librispeech/asr2/local/align.sh b/examples/librispeech/asr2/local/align.sh index b45f4a0f5c270740189b3a917b335eca7efdd164..626c3574258490c3e62c7f334dddd9c672e23941 100755 --- a/examples/librispeech/asr2/local/align.sh +++ b/examples/librispeech/asr2/local/align.sh @@ -22,7 +22,7 @@ python3 -u ${BIN_DIR}/test.py \ --model-name 'u2_kaldi' \ --run-mode 'align' \ --dict-path ${dict_path} \ ---nproc ${ngpu} \ +--ngpu ${ngpu} \ --config ${config_path} \ --result-file ${output_dir}/${type}.align \ --checkpoint_path ${ckpt_prefix} \ diff --git a/examples/librispeech/asr2/local/data.sh b/examples/librispeech/asr2/local/data.sh index b232f35a0ac18d4d8a7d74d54aa1e146d6b3b286..c98c46952f69cc3e65f327bfb24faabdb90afcc5 100755 --- a/examples/librispeech/asr2/local/data.sh +++ b/examples/librispeech/asr2/local/data.sh @@ -15,7 +15,7 @@ do_delta=false # Set this to somewhere where you want to put your data, or where # someone else has already put it. You'll want to change this # if you're not on the CLSP grid. -datadir=${MAIN_ROOT}/examples/dataset/ +datadir=${MAIN_ROOT}/dataset/ # bpemode (unigram or bpe) nbpe=5000 @@ -36,7 +36,7 @@ recog_set="test_clean test_other dev_clean dev_other" mkdir -p data -TARGET_DIR=${MAIN_ROOT}/examples/dataset +TARGET_DIR=${MAIN_ROOT}/dataset mkdir -p ${TARGET_DIR} if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then # download data, generate manifests diff --git a/examples/librispeech/asr2/local/export.sh b/examples/librispeech/asr2/local/export.sh index 9c66dc62aa603aee0df2d48e90c8c7fe955063c9..1bdce16cd74e4b2542deb51bfb8cad24464fd319 100755 --- a/examples/librispeech/asr2/local/export.sh +++ b/examples/librispeech/asr2/local/export.sh @@ -15,7 +15,7 @@ jit_model_export_path=$3 python3 -u ${BIN_DIR}/test.py \ --model-name 'u2_kaldi' \ --run-mode 'export' \ ---nproc ${ngpu} \ +--ngpu ${ngpu} \ --config ${config_path} \ --checkpoint_path ${ckpt_path_prefix} \ --export_path ${jit_model_export_path} diff --git a/examples/librispeech/asr2/local/test.sh b/examples/librispeech/asr2/local/test.sh index 23670f74f14b61af11259929790f311543711892..d210f2a85e703f537da81dbf2ce840ab0abfb945 100755 --- a/examples/librispeech/asr2/local/test.sh +++ b/examples/librispeech/asr2/local/test.sh @@ -76,7 +76,7 @@ for dmethd in attention ctc_greedy_search ctc_prefix_beam_search attention_resco python3 -u ${BIN_DIR}/test.py \ --model-name u2_kaldi \ --run-mode test \ - --nproc ${ngpu} \ + --ngpu ${ngpu} \ --dict-path ${dict} \ --config ${config_path} \ --checkpoint_path ${ckpt_prefix} \ diff --git a/examples/librispeech/asr2/local/train.sh b/examples/librispeech/asr2/local/train.sh index 33b46c20f1f752c158fb304ce8470e88d6d4a761..898391f4e4c03ed7e6e4290b65f8b4f33ff39a51 100755 --- a/examples/librispeech/asr2/local/train.sh +++ b/examples/librispeech/asr2/local/train.sh @@ -21,7 +21,7 @@ fi python3 -u ${BIN_DIR}/train.py \ --model-name u2_kaldi \ ---nproc ${ngpu} \ +--ngpu ${ngpu} \ --config ${config_path} \ --output exp/${ckpt_name} \ --seed ${seed} diff --git a/examples/ljspeech/tts3/README.md b/examples/ljspeech/tts3/README.md index 86ed00eacf6ad74cdb63d2d567e246bc915eba8f..93f14edc39304d61563f185fe67e810e392dc7e5 100644 --- a/examples/ljspeech/tts3/README.md +++ b/examples/ljspeech/tts3/README.md @@ -7,7 +7,7 @@ Download LJSpeech-1.1 from the [official website](https://keithito.com/LJ-Speech ### 
Get MFA Result and Extract We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for fastspeech2. -You can download from here [ljspeech_alignment.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/LJSpeech-1.1/ljspeech_alignment.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) of our repo. +You can download from here [ljspeech_alignment.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/LJSpeech-1.1/ljspeech_alignment.tar.gz), or train your own MFA model reference to [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) of our repo. ## Get Started Assume the path to the dataset is `~/datasets/LJSpeech-1.1`. diff --git a/examples/ljspeech/tts3/conf/default.yaml b/examples/ljspeech/tts3/conf/default.yaml index cabcca80ba5552ac3d300616b44572f3c297656e..e96422a1902d653e184c5afe39def0f8106200a5 100644 --- a/examples/ljspeech/tts3/conf/default.yaml +++ b/examples/ljspeech/tts3/conf/default.yaml @@ -45,7 +45,6 @@ model: postnet_layers: 5 # number of layers of postnset postnet_filts: 5 # filter size of conv layers in postnet postnet_chans: 256 # number of channels of conv layers in postnet - use_masking: True # whether to apply masking for padded part in loss calculation use_scaled_pos_enc: True # whether to use scaled positional encoding encoder_normalize_before: True # whether to perform layer normalization before the input decoder_normalize_before: True # whether to perform layer normalization before the input diff --git a/examples/ljspeech/voc1/README.md b/examples/ljspeech/voc1/README.md index 7cb69b154713c787c48036030f97bd7d4ce089bc..3830156f9fea437f99c3bed1c86999fb1b048e91 100644 --- a/examples/ljspeech/voc1/README.md +++ b/examples/ljspeech/voc1/README.md @@ -5,7 +5,7 @@ This example contains code used to train a [parallel wavegan](http://arxiv.org/a Download LJSpeech-1.1 from the [official website](https://keithito.com/LJ-Speech-Dataset/). ### Get MFA Result and Extract We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) results to cut silence in the edge of audio. -You can download from here [ljspeech_alignment.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/LJSpeech-1.1/ljspeech_alignment.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) of our repo. +You can download from here [ljspeech_alignment.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/LJSpeech-1.1/ljspeech_alignment.tar.gz), or train your own MFA model reference to [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) of our repo. ## Get Started Assume the path to the dataset is `~/datasets/LJSpeech-1.1`. 
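Several configs in this patch drop `use_masking` ("whether to apply masking for padded part in loss calculation") from the `model:` section. For readers unfamiliar with the option, here is a rough, assumed sketch of what masking the padded frames of a batch during loss calculation typically looks like; the function below is illustrative only and is not the FastSpeech2 implementation in this repo:

```python
# Illustrative only: masking padded frames so they do not contribute to the loss.
import paddle


def masked_l1_loss(pred, target, lengths):
    """L1 loss averaged over valid frames only.

    pred, target: (batch, max_len, feat_dim); lengths: (batch,) valid frame counts.
    """
    max_len = pred.shape[1]
    ids = paddle.arange(max_len).unsqueeze(0)                # (1, max_len)
    mask = (ids < lengths.unsqueeze(1)).astype(pred.dtype)   # (batch, max_len), 1 for valid frames
    diff = paddle.abs(pred - target) * mask.unsqueeze(-1)    # zero out padded positions
    return diff.sum() / (mask.sum() * pred.shape[-1])
```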
diff --git a/examples/other/1xt2x/aishell/local/data.sh b/examples/other/1xt2x/aishell/local/data.sh index 85574260b023f8559c16869666a34b7f42599679..a9d5b1412a5117812e37fbdbc2e168732c8d0528 100755 --- a/examples/other/1xt2x/aishell/local/data.sh +++ b/examples/other/1xt2x/aishell/local/data.sh @@ -12,7 +12,7 @@ stop_stage=100 source ${MAIN_ROOT}/utils/parse_options.sh mkdir -p data -TARGET_DIR=${MAIN_ROOT}/examples/dataset +TARGET_DIR=${MAIN_ROOT}/dataset mkdir -p ${TARGET_DIR} bash local/download_model.sh ${ckpt_dir} diff --git a/examples/other/1xt2x/aishell/local/test.sh b/examples/other/1xt2x/aishell/local/test.sh index 2ae0740b3e8d44ab03e45f4c1b5dbb945657705e..8cbff23520a61523e5d8599e627f897ef583d127 100755 --- a/examples/other/1xt2x/aishell/local/test.sh +++ b/examples/other/1xt2x/aishell/local/test.sh @@ -19,7 +19,7 @@ if [ $? -ne 0 ]; then fi python3 -u ${BIN_DIR}/test.py \ ---nproc ${ngpu} \ +--ngpu ${ngpu} \ --config ${config_path} \ --result_file ${ckpt_prefix}.rsl \ --checkpoint_path ${ckpt_prefix} \ diff --git a/examples/other/1xt2x/baidu_en8k/local/data.sh b/examples/other/1xt2x/baidu_en8k/local/data.sh index 8e378ff053ba78c970a50ca5d938975f3464f50f..9b017324dc512f6b61706db8afb393e5ca88b3f3 100755 --- a/examples/other/1xt2x/baidu_en8k/local/data.sh +++ b/examples/other/1xt2x/baidu_en8k/local/data.sh @@ -13,7 +13,7 @@ unit_type=char source ${MAIN_ROOT}/utils/parse_options.sh mkdir -p data -TARGET_DIR=${MAIN_ROOT}/examples/dataset +TARGET_DIR=${MAIN_ROOT}/dataset mkdir -p ${TARGET_DIR} diff --git a/examples/other/1xt2x/baidu_en8k/local/test.sh b/examples/other/1xt2x/baidu_en8k/local/test.sh index 4d00f30b852da5a370f5d4934f3caadd2b833c00..a627ef72274e3d67219701d34956fdf2b3a364b2 100755 --- a/examples/other/1xt2x/baidu_en8k/local/test.sh +++ b/examples/other/1xt2x/baidu_en8k/local/test.sh @@ -19,7 +19,7 @@ if [ $? -ne 0 ]; then fi python3 -u ${BIN_DIR}/test.py \ ---nproc ${ngpu} \ +--ngpu ${ngpu} \ --config ${config_path} \ --result_file ${ckpt_prefix}.rsl \ --checkpoint_path ${ckpt_prefix} \ diff --git a/examples/other/1xt2x/librispeech/local/data.sh b/examples/other/1xt2x/librispeech/local/data.sh index 7387472d53cd0cb6ed6a73eb9eb8e5d3ba6685aa..43b5426d9d9a3bc9e4fbafcf23251fc7793eb080 100755 --- a/examples/other/1xt2x/librispeech/local/data.sh +++ b/examples/other/1xt2x/librispeech/local/data.sh @@ -14,7 +14,7 @@ unit_type=char source ${MAIN_ROOT}/utils/parse_options.sh mkdir -p data -TARGET_DIR=${MAIN_ROOT}/examples/dataset +TARGET_DIR=${MAIN_ROOT}/dataset mkdir -p ${TARGET_DIR} bash local/download_model.sh ${ckpt_dir} diff --git a/examples/other/1xt2x/librispeech/local/test.sh b/examples/other/1xt2x/librispeech/local/test.sh index 4d00f30b852da5a370f5d4934f3caadd2b833c00..a627ef72274e3d67219701d34956fdf2b3a364b2 100755 --- a/examples/other/1xt2x/librispeech/local/test.sh +++ b/examples/other/1xt2x/librispeech/local/test.sh @@ -19,7 +19,7 @@ if [ $? -ne 0 ]; then fi python3 -u ${BIN_DIR}/test.py \ ---nproc ${ngpu} \ +--ngpu ${ngpu} \ --config ${config_path} \ --result_file ${ckpt_prefix}.rsl \ --checkpoint_path ${ckpt_prefix} \ diff --git a/examples/other/1xt2x/src_deepspeech2x/test_model.py b/examples/other/1xt2x/src_deepspeech2x/test_model.py index a9afc6313f705fd5e04a792048323457d39cab5e..82e190d81bdcecb94fbc2a9626c1cb99bc5d9db4 100644 --- a/examples/other/1xt2x/src_deepspeech2x/test_model.py +++ b/examples/other/1xt2x/src_deepspeech2x/test_model.py @@ -403,7 +403,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): def setup(self): """Setup the experiment. 
""" - paddle.set_device('gpu' if self.args.nprocs > 0 else 'cpu') + paddle.set_device('gpu' if self.args.ngpu > 0 else 'cpu') self.setup_output_dir() self.setup_checkpointer() diff --git a/examples/other/use_mfa/README.md b/examples/other/mfa/README.md similarity index 100% rename from examples/other/use_mfa/README.md rename to examples/other/mfa/README.md diff --git a/examples/other/use_mfa/local/cmudict-0.7b b/examples/other/mfa/local/cmudict-0.7b similarity index 100% rename from examples/other/use_mfa/local/cmudict-0.7b rename to examples/other/mfa/local/cmudict-0.7b diff --git a/examples/other/use_mfa/local/detect_oov.py b/examples/other/mfa/local/detect_oov.py similarity index 100% rename from examples/other/use_mfa/local/detect_oov.py rename to examples/other/mfa/local/detect_oov.py diff --git a/examples/other/use_mfa/local/generate_lexicon.py b/examples/other/mfa/local/generate_lexicon.py similarity index 100% rename from examples/other/use_mfa/local/generate_lexicon.py rename to examples/other/mfa/local/generate_lexicon.py diff --git a/examples/other/use_mfa/local/reorganize_aishell3.py b/examples/other/mfa/local/reorganize_aishell3.py similarity index 100% rename from examples/other/use_mfa/local/reorganize_aishell3.py rename to examples/other/mfa/local/reorganize_aishell3.py diff --git a/examples/other/use_mfa/local/reorganize_baker.py b/examples/other/mfa/local/reorganize_baker.py similarity index 100% rename from examples/other/use_mfa/local/reorganize_baker.py rename to examples/other/mfa/local/reorganize_baker.py diff --git a/examples/other/use_mfa/local/reorganize_ljspeech.py b/examples/other/mfa/local/reorganize_ljspeech.py similarity index 100% rename from examples/other/use_mfa/local/reorganize_ljspeech.py rename to examples/other/mfa/local/reorganize_ljspeech.py diff --git a/examples/other/use_mfa/local/reorganize_vctk.py b/examples/other/mfa/local/reorganize_vctk.py similarity index 100% rename from examples/other/use_mfa/local/reorganize_vctk.py rename to examples/other/mfa/local/reorganize_vctk.py diff --git a/examples/other/use_mfa/run.sh b/examples/other/mfa/run.sh similarity index 100% rename from examples/other/use_mfa/run.sh rename to examples/other/mfa/run.sh diff --git a/examples/ted_en_zh/st0/local/data.sh b/examples/ted_en_zh/st0/local/data.sh index fb4efbe3572400363e2da16d1ba888840686d7a3..097cd3a85d9ae6a78857e2051223a80d10fc8862 100755 --- a/examples/ted_en_zh/st0/local/data.sh +++ b/examples/ted_en_zh/st0/local/data.sh @@ -16,7 +16,7 @@ data_dir=./TED-En-Zh . 
${MAIN_ROOT}/utils/parse_options.sh || exit -1; -TARGET_DIR=${MAIN_ROOT}/examples/dataset +TARGET_DIR=${MAIN_ROOT}/dataset mkdir -p ${TARGET_DIR} mkdir -p data mkdir -p ${dict_dir} diff --git a/examples/ted_en_zh/st0/local/test.sh b/examples/ted_en_zh/st0/local/test.sh index 7235c6f9a167d8c60fd80a0947dea380dde202bc..a9b18dd98810d56cb2c0c94141511bfd8343a346 100755 --- a/examples/ted_en_zh/st0/local/test.sh +++ b/examples/ted_en_zh/st0/local/test.sh @@ -15,7 +15,7 @@ for type in fullsentence; do echo "decoding ${type}" batch_size=32 python3 -u ${BIN_DIR}/test.py \ - --nproc ${ngpu} \ + --ngpu ${ngpu} \ --config ${config_path} \ --result_file ${ckpt_prefix}.${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ diff --git a/examples/ted_en_zh/st0/local/train.sh b/examples/ted_en_zh/st0/local/train.sh index e5fd19ddb8d523acc328f13e3a05c126ae56db29..e366376bb82da7dbefb2744cf1099949a5fadf06 100755 --- a/examples/ted_en_zh/st0/local/train.sh +++ b/examples/ted_en_zh/st0/local/train.sh @@ -20,7 +20,7 @@ if [ ${seed} != 0 ]; then fi python3 -u ${BIN_DIR}/train.py \ ---nproc ${ngpu} \ +--ngpu ${ngpu} \ --config ${config_path} \ --output exp/${ckpt_name} \ --seed ${seed} diff --git a/examples/ted_en_zh/st1/local/data.sh b/examples/ted_en_zh/st1/local/data.sh index 2e9d05d102e6d892eca263ffb41a001087923425..aa958cfde38fadebc8d6e042cb9c1ce9ce21db16 100755 --- a/examples/ted_en_zh/st1/local/data.sh +++ b/examples/ted_en_zh/st1/local/data.sh @@ -15,7 +15,7 @@ data_dir=./TED_EnZh source ${MAIN_ROOT}/utils/parse_options.sh -TARGET_DIR=${MAIN_ROOT}/examples/dataset +TARGET_DIR=${MAIN_ROOT}/dataset mkdir -p ${TARGET_DIR} mkdir -p data mkdir -p ${dict_dir} diff --git a/examples/ted_en_zh/st1/local/test.sh b/examples/ted_en_zh/st1/local/test.sh index 7235c6f9a167d8c60fd80a0947dea380dde202bc..a9b18dd98810d56cb2c0c94141511bfd8343a346 100755 --- a/examples/ted_en_zh/st1/local/test.sh +++ b/examples/ted_en_zh/st1/local/test.sh @@ -15,7 +15,7 @@ for type in fullsentence; do echo "decoding ${type}" batch_size=32 python3 -u ${BIN_DIR}/test.py \ - --nproc ${ngpu} \ + --ngpu ${ngpu} \ --config ${config_path} \ --result_file ${ckpt_prefix}.${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ diff --git a/examples/ted_en_zh/st1/local/train_finetune.sh b/examples/ted_en_zh/st1/local/train_finetune.sh index 36701121741dd1cad0c480764849fb99390c5281..e54c7fff4c20dfd2dcb6bd75c3eed6dd6c983858 100755 --- a/examples/ted_en_zh/st1/local/train_finetune.sh +++ b/examples/ted_en_zh/st1/local/train_finetune.sh @@ -21,7 +21,7 @@ if [ ${seed} != 0 ]; then fi python3 -u ${BIN_DIR}/train.py \ ---nproc ${ngpu} \ +--ngpu ${ngpu} \ --config ${config_path} \ --output exp/${ckpt_name} \ --checkpoint_path ${ckpt_path} \ diff --git a/examples/thchs30/align0/local/data.sh b/examples/thchs30/align0/local/data.sh index 8614a041583f1a79867728d94d3b5fedcf473fb5..6d6fc4e8f08bc462446e45b687ea833835b5dd19 100644 --- a/examples/thchs30/align0/local/data.sh +++ b/examples/thchs30/align0/local/data.sh @@ -6,7 +6,7 @@ stop_stage=100 source ${MAIN_ROOT}/utils/parse_options.sh mkdir -p data -TARGET_DIR=${MAIN_ROOT}/examples/dataset +TARGET_DIR=${MAIN_ROOT}/dataset mkdir -p ${TARGET_DIR} LEXICON_NAME=$1 diff --git a/examples/timit/asr1/local/align.sh b/examples/timit/asr1/local/align.sh index 279461aafe19967ac054df815b08e60e351fcc7f..c65d611c43e870fe4024235c3a2f433452251d21 100755 --- a/examples/timit/asr1/local/align.sh +++ b/examples/timit/asr1/local/align.sh @@ -18,7 +18,7 @@ mkdir -p ${output_dir} # align dump in `result_file` # .tier, .TextGrid dump in `dir of 
result_file` python3 -u ${BIN_DIR}/alignment.py \ ---nproc ${ngpu} \ +--ngpu ${ngpu} \ --config ${config_path} \ --result_file ${output_dir}/${type}.align \ --checkpoint_path ${ckpt_prefix} \ diff --git a/examples/timit/asr1/local/data.sh b/examples/timit/asr1/local/data.sh index fb720932d6665dc99c5e930a03714c06c05bdb47..1f631f7fddcde86ad8bfebfe29132ef456c1c398 100755 --- a/examples/timit/asr1/local/data.sh +++ b/examples/timit/asr1/local/data.sh @@ -12,7 +12,7 @@ TIMIT_path= mkdir -p data mkdir -p ${dict_dir} -TARGET_DIR=${MAIN_ROOT}/examples/dataset +TARGET_DIR=${MAIN_ROOT}/dataset mkdir -p ${TARGET_DIR} diff --git a/examples/timit/asr1/local/export.sh b/examples/timit/asr1/local/export.sh index b562218e7a76c91d3b906d2c099218bf492627a8..6b646b46903f33563ab08604f4539551c78394b4 100755 --- a/examples/timit/asr1/local/export.sh +++ b/examples/timit/asr1/local/export.sh @@ -13,7 +13,7 @@ ckpt_path_prefix=$2 jit_model_export_path=$3 python3 -u ${BIN_DIR}/export.py \ ---nproc ${ngpu} \ +--ngpu ${ngpu} \ --config ${config_path} \ --checkpoint_path ${ckpt_path_prefix} \ --export_path ${jit_model_export_path} diff --git a/examples/timit/asr1/local/test.sh b/examples/timit/asr1/local/test.sh index 575bff572fecd92779a61c41c81c903a6dee3822..08ee0e36571f058367ab4921e498505a498155f7 100755 --- a/examples/timit/asr1/local/test.sh +++ b/examples/timit/asr1/local/test.sh @@ -41,7 +41,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then batch_size=64 fi python3 -u ${BIN_DIR}/test.py \ - --nproc ${ngpu} \ + --ngpu ${ngpu} \ --config ${config_path} \ --result_file ${ckpt_prefix}.${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ @@ -61,7 +61,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then echo "decoding ${type}" batch_size=1 python3 -u ${BIN_DIR}/test.py \ - --nproc ${ngpu} \ + --ngpu ${ngpu} \ --config ${config_path} \ --result_file ${ckpt_prefix}.${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ @@ -80,7 +80,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then echo "decoding ${type}" batch_size=1 python3 -u ${BIN_DIR}/test.py \ - --nproc ${ngpu} \ + --ngpu ${ngpu} \ --config ${config_path} \ --result_file ${ckpt_prefix}.${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ diff --git a/examples/timit/asr1/local/train.sh b/examples/timit/asr1/local/train.sh index 89a64327c1befc875f50fed37f93b4abd021ac8e..9b3fa17750262e775100313b5386d9150ae011b7 100755 --- a/examples/timit/asr1/local/train.sh +++ b/examples/timit/asr1/local/train.sh @@ -20,7 +20,7 @@ if [ ${seed} != 0 ]; then fi python3 -u ${BIN_DIR}/train.py \ ---nproc ${ngpu} \ +--ngpu ${ngpu} \ --config ${config_path} \ --output exp/${ckpt_name} \ --seed ${seed} diff --git a/examples/tiny/asr0/local/data.sh b/examples/tiny/asr0/local/data.sh index 2a544ef89ee269ded158295f4ea58f8e9dbe5776..4251d57143d7772f84dfd6147d62c803e5bdeebd 100755 --- a/examples/tiny/asr0/local/data.sh +++ b/examples/tiny/asr0/local/data.sh @@ -10,7 +10,7 @@ dict_dir=data/lang_char mkdir -p data mkdir -p ${dict_dir} -TARGET_DIR=${MAIN_ROOT}/examples/dataset +TARGET_DIR=${MAIN_ROOT}/dataset mkdir -p ${TARGET_DIR} if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then diff --git a/examples/tiny/asr0/local/export.sh b/examples/tiny/asr0/local/export.sh index a5e62c28d2fa23a5ef9b9e2a0281b025aa943a30..426a72fe54cfae8af4c564611e3b794f988f5468 100755 --- a/examples/tiny/asr0/local/export.sh +++ b/examples/tiny/asr0/local/export.sh @@ -14,7 +14,7 @@ jit_model_export_path=$3 model_type=$4 python3 -u ${BIN_DIR}/export.py \ ---nproc ${ngpu} \ +--ngpu ${ngpu} \ --config 
${config_path} \ --checkpoint_path ${ckpt_path_prefix} \ --export_path ${jit_model_export_path} \ diff --git a/examples/tiny/asr0/local/test.sh b/examples/tiny/asr0/local/test.sh index 4d00f30b852da5a370f5d4934f3caadd2b833c00..a627ef72274e3d67219701d34956fdf2b3a364b2 100755 --- a/examples/tiny/asr0/local/test.sh +++ b/examples/tiny/asr0/local/test.sh @@ -19,7 +19,7 @@ if [ $? -ne 0 ]; then fi python3 -u ${BIN_DIR}/test.py \ ---nproc ${ngpu} \ +--ngpu ${ngpu} \ --config ${config_path} \ --result_file ${ckpt_prefix}.rsl \ --checkpoint_path ${ckpt_prefix} \ diff --git a/examples/tiny/asr0/local/train.sh b/examples/tiny/asr0/local/train.sh index 5b87780aebb1022e7914d6bd702faf7da7d21c14..a69b6ddb90ee9ade0320ed88f91a6d1ba25e93a2 100755 --- a/examples/tiny/asr0/local/train.sh +++ b/examples/tiny/asr0/local/train.sh @@ -27,7 +27,7 @@ model_type=$3 mkdir -p exp python3 -u ${BIN_DIR}/train.py \ ---nproc ${ngpu} \ +--ngpu ${ngpu} \ --config ${config_path} \ --output exp/${ckpt_name} \ --model_type ${model_type} \ diff --git a/examples/tiny/asr1/local/align.sh b/examples/tiny/asr1/local/align.sh index 279461aafe19967ac054df815b08e60e351fcc7f..c65d611c43e870fe4024235c3a2f433452251d21 100755 --- a/examples/tiny/asr1/local/align.sh +++ b/examples/tiny/asr1/local/align.sh @@ -18,7 +18,7 @@ mkdir -p ${output_dir} # align dump in `result_file` # .tier, .TextGrid dump in `dir of result_file` python3 -u ${BIN_DIR}/alignment.py \ ---nproc ${ngpu} \ +--ngpu ${ngpu} \ --config ${config_path} \ --result_file ${output_dir}/${type}.align \ --checkpoint_path ${ckpt_prefix} \ diff --git a/examples/tiny/asr1/local/data.sh b/examples/tiny/asr1/local/data.sh index 1ef9f7768aede38d6d192e8c458b87433aa364c1..16a029a3887942719701eb3f27eacd68d9b8f039 100755 --- a/examples/tiny/asr1/local/data.sh +++ b/examples/tiny/asr1/local/data.sh @@ -14,7 +14,7 @@ bpeprefix="${dict_dir}/bpe_${bpemode}_${nbpe}" mkdir -p data mkdir -p ${dict_dir} -TARGET_DIR=${MAIN_ROOT}/examples/dataset +TARGET_DIR=${MAIN_ROOT}/dataset mkdir -p ${TARGET_DIR} if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then diff --git a/examples/tiny/asr1/local/export.sh b/examples/tiny/asr1/local/export.sh index b562218e7a76c91d3b906d2c099218bf492627a8..6b646b46903f33563ab08604f4539551c78394b4 100755 --- a/examples/tiny/asr1/local/export.sh +++ b/examples/tiny/asr1/local/export.sh @@ -13,7 +13,7 @@ ckpt_path_prefix=$2 jit_model_export_path=$3 python3 -u ${BIN_DIR}/export.py \ ---nproc ${ngpu} \ +--ngpu ${ngpu} \ --config ${config_path} \ --checkpoint_path ${ckpt_path_prefix} \ --export_path ${jit_model_export_path} diff --git a/examples/tiny/asr1/local/test.sh b/examples/tiny/asr1/local/test.sh index 34088ce97582dc9494e8e5ce6ba18adb81499b03..190bacffc8b911e062299a00ddf2ba6b4701c337 100755 --- a/examples/tiny/asr1/local/test.sh +++ b/examples/tiny/asr1/local/test.sh @@ -31,7 +31,7 @@ for type in attention ctc_greedy_search; do batch_size=64 fi python3 -u ${BIN_DIR}/test.py \ - --nproc ${ngpu} \ + --ngpu ${ngpu} \ --config ${config_path} \ --result_file ${ckpt_prefix}.${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ @@ -48,7 +48,7 @@ for type in ctc_prefix_beam_search attention_rescoring; do echo "decoding ${type}" batch_size=1 python3 -u ${BIN_DIR}/test.py \ - --nproc ${ngpu} \ + --ngpu ${ngpu} \ --config ${config_path} \ --result_file ${ckpt_prefix}.${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ diff --git a/examples/tiny/asr1/local/train.sh b/examples/tiny/asr1/local/train.sh index 
71af3a006deec9032fcefd561fc46cea274d1e10..1c8593bddf5499f50026bd504470114c642055b0 100755 --- a/examples/tiny/asr1/local/train.sh +++ b/examples/tiny/asr1/local/train.sh @@ -29,7 +29,7 @@ mkdir -p exp python3 -u ${BIN_DIR}/train.py \ --seed ${seed} \ ---nproc ${ngpu} \ +--ngpu ${ngpu} \ --config ${config_path} \ --output exp/${ckpt_name} \ --profiler-options "${profiler_options}" \ diff --git a/examples/vctk/tts3/README.md b/examples/vctk/tts3/README.md index aab005735134dc5514cc9ee18edc30240ebaf1b4..d7d632cdd305302a88c083639b1f24ce5bb15d64 100644 --- a/examples/vctk/tts3/README.md +++ b/examples/vctk/tts3/README.md @@ -7,8 +7,8 @@ Download VCTK-0.92 from the [official website](https://datashare.ed.ac.uk/handle ### Get MFA Result and Extract We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for fastspeech2. -You can download from here [vctk_alignment.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/VCTK-Corpus-0.92/vctk_alignment.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) of our repo. -ps: we remove three speakers in VCTK-0.92 (see [reorganize_vctk.py](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/examples/other/use_mfa/local/reorganize_vctk.py)): +You can download from here [vctk_alignment.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/VCTK-Corpus-0.92/vctk_alignment.tar.gz), or train your own MFA model reference to [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) of our repo. +ps: we remove three speakers in VCTK-0.92 (see [reorganize_vctk.py](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/examples/other/mfa/local/reorganize_vctk.py)): 1. `p315`, because no txt for it. 2. `p280` and `p362`, because no *_mic2.flac (which is better than *_mic1.flac) for them. diff --git a/examples/vctk/tts3/conf/default.yaml b/examples/vctk/tts3/conf/default.yaml index 09bd348337fa63a3b2f546f23b9e20d041518dd7..4f945a31c595e2f1d06962e5b08c81a0978a2221 100644 --- a/examples/vctk/tts3/conf/default.yaml +++ b/examples/vctk/tts3/conf/default.yaml @@ -45,7 +45,6 @@ model: postnet_layers: 5 # number of layers of postnset postnet_filts: 5 # filter size of conv layers in postnet postnet_chans: 256 # number of channels of conv layers in postnet - use_masking: True # whether to apply masking for padded part in loss calculation use_scaled_pos_enc: True # whether to use scaled positional encoding encoder_normalize_before: True # whether to perform layer normalization before the input decoder_normalize_before: True # whether to perform layer normalization before the input diff --git a/examples/vctk/voc1/README.md b/examples/vctk/voc1/README.md index 154fd7cdee108e776cb1df7312a0a5757509f44e..6aa311fb795c0dda3d96dbb9020bd3236307618f 100644 --- a/examples/vctk/voc1/README.md +++ b/examples/vctk/voc1/README.md @@ -6,9 +6,9 @@ This example contains code used to train a [parallel wavegan](http://arxiv.org/a Download VCTK-0.92 from the [official website](https://datashare.ed.ac.uk/handle/10283/3443) and extract it to `~/datasets`. Then the dataset is in directory `~/datasets/VCTK-Corpus-0.92`. ### Get MFA Result and Extract -We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) results to cut silence in the edge of audio. 
-You can download from here [vctk_alignment.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/VCTK-Corpus-0.92/vctk_alignment.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) of our repo. -ps: we remove three speakers in VCTK-0.92 (see [reorganize_vctk.py](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/examples/other/use_mfa/local/reorganize_vctk.py)): +We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) results to cut silence in the edge of audio. +You can download from here [vctk_alignment.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/VCTK-Corpus-0.92/vctk_alignment.tar.gz), or train your own MFA model reference to [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) of our repo. +ps: we remove three speakers in VCTK-0.92 (see [reorganize_vctk.py](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/examples/other/mfa/local/reorganize_vctk.py)): 1. `p315`, because no txt for it. 2. `p280` and `p362`, because no *_mic2.flac (which is better than *_mic1.flac) for them. diff --git a/examples/wenetspeech/asr1/local/data.sh b/examples/wenetspeech/asr1/local/data.sh index 67b3d5a55ff6fa8c842d540c3814fb35aafa48e1..7dd478d193c199df9d4d025006d9392668be0546 100755 --- a/examples/wenetspeech/asr1/local/data.sh +++ b/examples/wenetspeech/asr1/local/data.sh @@ -27,7 +27,7 @@ set -o pipefail mkdir -p data -TARGET_DIR=${MAIN_ROOT}/examples/dataset +TARGET_DIR=${MAIN_ROOT}/dataset mkdir -p ${TARGET_DIR} if [ ${stage} -le -2 ] && [ ${stop_stage} -ge -2 ]; then diff --git a/examples/wenetspeech/asr1/local/test.sh b/examples/wenetspeech/asr1/local/test.sh index 47bd2f6338a7d062b094c327f39f5362fae39865..da159de734a40996eed297beb147c23962b7fba3 100755 --- a/examples/wenetspeech/asr1/local/test.sh +++ b/examples/wenetspeech/asr1/local/test.sh @@ -34,7 +34,7 @@ for type in attention ctc_greedy_search; do output_dir=${ckpt_prefix} mkdir -p ${output_dir} python3 -u ${BIN_DIR}/test.py \ - --nproc ${ngpu} \ + --ngpu ${ngpu} \ --config ${config_path} \ --result_file ${output_dir}/${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ @@ -53,7 +53,7 @@ for type in ctc_prefix_beam_search attention_rescoring; do output_dir=${ckpt_prefix} mkdir -p ${output_dir} python3 -u ${BIN_DIR}/test.py \ - --nproc ${ngpu} \ + --ngpu ${ngpu} \ --config ${config_path} \ --result_file ${output_dir}/${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ diff --git a/examples/wenetspeech/asr1/local/test_wav.sh b/examples/wenetspeech/asr1/local/test_wav.sh index 13296af2e3d5c0d3b1af753b100f348e4ad2040e..90f92d80bb2526d3896f1f623fb688273f27ad23 100755 --- a/examples/wenetspeech/asr1/local/test_wav.sh +++ b/examples/wenetspeech/asr1/local/test_wav.sh @@ -29,7 +29,7 @@ for type in attention_rescoring; do output_dir=${ckpt_prefix} mkdir -p ${output_dir} python3 -u ${BIN_DIR}/test_wav.py \ - --nproc ${ngpu} \ + --ngpu ${ngpu} \ --config ${config_path} \ --result_file ${output_dir}/${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ diff --git a/audio/paddleaudio/__init__.py b/paddleaudio/__init__.py similarity index 100% rename from audio/paddleaudio/__init__.py rename to paddleaudio/__init__.py diff --git a/audio/paddleaudio/backends/__init__.py b/paddleaudio/backends/__init__.py similarity index 100% rename from audio/paddleaudio/backends/__init__.py rename to paddleaudio/backends/__init__.py diff --git a/audio/paddleaudio/backends/audio.py b/paddleaudio/backends/audio.py 
similarity index 100% rename from audio/paddleaudio/backends/audio.py rename to paddleaudio/backends/audio.py diff --git a/audio/paddleaudio/datasets/__init__.py b/paddleaudio/datasets/__init__.py similarity index 73% rename from audio/paddleaudio/datasets/__init__.py rename to paddleaudio/datasets/__init__.py index e1d2bbc5639659afe5fb03b4e8640da90a15316d..8d2fdab4695d73a4d5cdebe85e46c95660ef66cb 100644 --- a/audio/paddleaudio/datasets/__init__.py +++ b/paddleaudio/datasets/__init__.py @@ -11,24 +11,14 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from .aishell import AISHELL1 -from .dcase import UrbanAcousticScenes -from .dcase import UrbanAudioVisualScenes from .esc50 import ESC50 from .gtzan import GTZAN -from .librispeech import LIBRISPEECH -from .ravdess import RAVDESS from .tess import TESS from .urban_sound import UrbanSound8K __all__ = [ - 'AISHELL1', - 'LIBRISPEECH', 'ESC50', 'UrbanSound8K', 'GTZAN', - 'UrbanAcousticScenes', - 'UrbanAudioVisualScenes', - 'RAVDESS', 'TESS', ] diff --git a/audio/paddleaudio/datasets/dataset.py b/paddleaudio/datasets/dataset.py similarity index 100% rename from audio/paddleaudio/datasets/dataset.py rename to paddleaudio/datasets/dataset.py diff --git a/audio/paddleaudio/datasets/esc50.py b/paddleaudio/datasets/esc50.py similarity index 100% rename from audio/paddleaudio/datasets/esc50.py rename to paddleaudio/datasets/esc50.py diff --git a/audio/paddleaudio/datasets/gtzan.py b/paddleaudio/datasets/gtzan.py similarity index 100% rename from audio/paddleaudio/datasets/gtzan.py rename to paddleaudio/datasets/gtzan.py diff --git a/audio/paddleaudio/datasets/tess.py b/paddleaudio/datasets/tess.py similarity index 100% rename from audio/paddleaudio/datasets/tess.py rename to paddleaudio/datasets/tess.py diff --git a/audio/paddleaudio/datasets/urban_sound.py b/paddleaudio/datasets/urban_sound.py similarity index 100% rename from audio/paddleaudio/datasets/urban_sound.py rename to paddleaudio/datasets/urban_sound.py diff --git a/audio/paddleaudio/features/__init__.py b/paddleaudio/features/__init__.py similarity index 96% rename from audio/paddleaudio/features/__init__.py rename to paddleaudio/features/__init__.py index 8503cfaba31a2355dbd8cc9dbb69981ea45eea08..d8ac7c4b90b09b3f6d2774861b6c0017c24559c9 100644 --- a/audio/paddleaudio/features/__init__.py +++ b/paddleaudio/features/__init__.py @@ -13,3 +13,4 @@ # limitations under the License. 
from .augment import * from .core import * +from .spectrum import * diff --git a/audio/paddleaudio/features/augment.py b/paddleaudio/features/augment.py similarity index 98% rename from audio/paddleaudio/features/augment.py rename to paddleaudio/features/augment.py index 7556bb3c931b023006ef16791deca92e7b59e806..6f903bdba075b3f26d8e3ba55c6f0083c4054f34 100644 --- a/audio/paddleaudio/features/augment.py +++ b/paddleaudio/features/augment.py @@ -15,8 +15,9 @@ from typing import List import numpy as np from numpy import ndarray as array -from paddleaudio.backends import depth_convert -from paddleaudio.utils import ParameterError + +from ..backends import depth_convert +from ..utils import ParameterError __all__ = [ 'depth_augment', diff --git a/audio/paddleaudio/features/core.py b/paddleaudio/features/core.py similarity index 99% rename from audio/paddleaudio/features/core.py rename to paddleaudio/features/core.py index dd25724ff22254ced39e4387e24ccdc411148b12..d3c2e290ed20e1690ecbad3c865eedd5259bd3bf 100644 --- a/audio/paddleaudio/features/core.py +++ b/paddleaudio/features/core.py @@ -21,9 +21,10 @@ import numpy as np import scipy from numpy import ndarray as array from numpy.lib.stride_tricks import as_strided -from paddleaudio.utils import ParameterError from scipy.signal import get_window +from ..utils import ParameterError + __all__ = [ 'stft', 'mfcc', @@ -293,6 +294,7 @@ def stft(x: array, This function is aligned with librosa. """ _check_audio(x) + # By default, use the entire frame if win_length is None: win_length = n_fft @@ -397,7 +399,7 @@ def mfcc(x, This function is NOT strictly aligned with librosa. The following example shows how to get the same result with librosa: - # paddleaudioe mfcc: + # mfcc: kwargs = { 'window_size':512, 'hop_length':320, diff --git a/paddleaudio/features/spectrum.py b/paddleaudio/features/spectrum.py new file mode 100644 index 0000000000000000000000000000000000000000..154b6484ccc7157b8296e0567a4230fa8ef19335 --- /dev/null +++ b/paddleaudio/features/spectrum.py @@ -0,0 +1,461 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import math +from functools import partial +from typing import Optional +from typing import Union + +import paddle +import paddle.nn as nn + +from .window import get_window + +__all__ = [ + 'Spectrogram', + 'MelSpectrogram', + 'LogMelSpectrogram', +] + + +def hz_to_mel(freq: Union[paddle.Tensor, float], + htk: bool=False) -> Union[paddle.Tensor, float]: + """Convert Hz to Mels. + Parameters: + freq: the input tensor of arbitrary shape, or a single floating point number. + htk: use HTK formula to do the conversion. + The default value is False. + Returns: + The frequencies represented in Mel-scale. 
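For readers unfamiliar with the Slaney-style mel scale implemented below: the mapping is linear up to 1 kHz and logarithmic above it. A minimal standalone sketch of that mapping (plain Python floats only; `hz_to_mel_slaney` is an illustrative name and not part of paddleaudio) is:

import math

def hz_to_mel_slaney(freq_hz: float) -> float:
    f_sp = 200.0 / 3                   # mel-per-Hz slope of the linear region
    min_log_hz = 1000.0                # boundary between linear and log regions
    min_log_mel = min_log_hz / f_sp    # = 15.0 mel at the boundary
    logstep = math.log(6.4) / 27.0     # step size of the log region
    if freq_hz < min_log_hz:
        return freq_hz / f_sp
    return min_log_mel + math.log(freq_hz / min_log_hz) / logstep

print(hz_to_mel_slaney(440.0))    # ~6.6  (linear region)
print(hz_to_mel_slaney(4000.0))   # ~35.2 (log region)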
+ """ + + if htk: + if isinstance(freq, paddle.Tensor): + return 2595.0 * paddle.log10(1.0 + freq / 700.0) + else: + return 2595.0 * math.log10(1.0 + freq / 700.0) + + # Fill in the linear part + f_min = 0.0 + f_sp = 200.0 / 3 + + mels = (freq - f_min) / f_sp + + # Fill in the log-scale part + + min_log_hz = 1000.0 # beginning of log region (Hz) + min_log_mel = (min_log_hz - f_min) / f_sp # same (Mels) + logstep = math.log(6.4) / 27.0 # step size for log region + + if isinstance(freq, paddle.Tensor): + target = min_log_mel + paddle.log( + freq / min_log_hz + 1e-10) / logstep # prevent nan with 1e-10 + mask = (freq > min_log_hz).astype(freq.dtype) + mels = target * mask + mels * ( + 1 - mask) # will replace by masked_fill OP in future + else: + if freq >= min_log_hz: + mels = min_log_mel + math.log(freq / min_log_hz + 1e-10) / logstep + + return mels + + +def mel_to_hz(mel: Union[float, paddle.Tensor], + htk: bool=False) -> Union[float, paddle.Tensor]: + """Convert mel bin numbers to frequencies. + Parameters: + mel: the mel frequency represented as a tensor of arbitrary shape, or a floating point number. + htk: use HTK formula to do the conversion. + Returns: + The frequencies represented in hz. + """ + if htk: + return 700.0 * (10.0**(mel / 2595.0) - 1.0) + + f_min = 0.0 + f_sp = 200.0 / 3 + freqs = f_min + f_sp * mel + # And now the nonlinear scale + min_log_hz = 1000.0 # beginning of log region (Hz) + min_log_mel = (min_log_hz - f_min) / f_sp # same (Mels) + logstep = math.log(6.4) / 27.0 # step size for log region + if isinstance(mel, paddle.Tensor): + target = min_log_hz * paddle.exp(logstep * (mel - min_log_mel)) + mask = (mel > min_log_mel).astype(mel.dtype) + freqs = target * mask + freqs * ( + 1 - mask) # will replace by masked_fill OP in future + else: + if mel >= min_log_mel: + freqs = min_log_hz * math.exp(logstep * (mel - min_log_mel)) + + return freqs + + +def mel_frequencies(n_mels: int=64, + f_min: float=0.0, + f_max: float=11025.0, + htk: bool=False, + dtype: str=paddle.float32): + """Compute mel frequencies. + Parameters: + n_mels(int): number of Mel bins. + f_min(float): the lower cut-off frequency, below which the filter response is zero. + f_max(float): the upper cut-off frequency, above which the filter response is zero. + htk(bool): whether to use htk formula. + dtype(str): the datatype of the return frequencies. + Returns: + The frequencies represented in Mel-scale + """ + # 'Center freqs' of mel bands - uniformly spaced between limits + min_mel = hz_to_mel(f_min, htk=htk) + max_mel = hz_to_mel(f_max, htk=htk) + mels = paddle.linspace(min_mel, max_mel, n_mels, dtype=dtype) + freqs = mel_to_hz(mels, htk=htk) + return freqs + + +def fft_frequencies(sr: int, n_fft: int, dtype: str=paddle.float32): + """Compute fourier frequencies. + Parameters: + sr(int): the audio sample rate. + n_fft(float): the number of fft bins. + dtype(str): the datatype of the return frequencies. + Returns: + The frequencies represented in hz. + """ + return paddle.linspace(0, float(sr) / 2, int(1 + n_fft // 2), dtype=dtype) + + +def compute_fbank_matrix(sr: int, + n_fft: int, + n_mels: int=64, + f_min: float=0.0, + f_max: Optional[float]=None, + htk: bool=False, + norm: Union[str, float]='slaney', + dtype: str=paddle.float32): + """Compute fbank matrix. + Parameters: + sr(int): the audio sample rate. + n_fft(int): the number of fft bins. + n_mels(int): the number of Mel bins. + f_min(float): the lower cut-off frequency, below which the filter response is zero. 
+ f_max(float): the upper cut-off frequency, above which the filter response is zero. + htk: whether to use htk formula. + return_complex(bool): whether to return complex matrix. If True, the matrix will + be complex type. Otherwise, the real and image part will be stored in the last + axis of returned tensor. + dtype(str): the datatype of the returned fbank matrix. + Returns: + The fbank matrix of shape (n_mels, int(1+n_fft//2)). + Shape: + output: (n_mels, int(1+n_fft//2)) + """ + + if f_max is None: + f_max = float(sr) / 2 + + # Initialize the weights + weights = paddle.zeros((n_mels, int(1 + n_fft // 2)), dtype=dtype) + + # Center freqs of each FFT bin + fftfreqs = fft_frequencies(sr=sr, n_fft=n_fft, dtype=dtype) + + # 'Center freqs' of mel bands - uniformly spaced between limits + mel_f = mel_frequencies( + n_mels + 2, f_min=f_min, f_max=f_max, htk=htk, dtype=dtype) + + fdiff = mel_f[1:] - mel_f[:-1] #np.diff(mel_f) + ramps = mel_f.unsqueeze(1) - fftfreqs.unsqueeze(0) + #ramps = np.subtract.outer(mel_f, fftfreqs) + + for i in range(n_mels): + # lower and upper slopes for all bins + lower = -ramps[i] / fdiff[i] + upper = ramps[i + 2] / fdiff[i + 1] + + # .. then intersect them with each other and zero + weights[i] = paddle.maximum( + paddle.zeros_like(lower), paddle.minimum(lower, upper)) + + # Slaney-style mel is scaled to be approx constant energy per channel + if norm == 'slaney': + enorm = 2.0 / (mel_f[2:n_mels + 2] - mel_f[:n_mels]) + weights *= enorm.unsqueeze(1) + elif isinstance(norm, int) or isinstance(norm, float): + weights = paddle.nn.functional.normalize(weights, p=norm, axis=-1) + + return weights + + +def power_to_db(magnitude: paddle.Tensor, + ref_value: float=1.0, + amin: float=1e-10, + top_db: Optional[float]=None) -> paddle.Tensor: + """Convert a power spectrogram (amplitude squared) to decibel (dB) units. + The function computes the scaling ``10 * log10(x / ref)`` in a numerically + stable way. + Parameters: + magnitude(Tensor): the input magnitude tensor of any shape. + ref_value(float): the reference value. If smaller than 1.0, the db level + of the signal will be pulled up accordingly. Otherwise, the db level + is pushed down. + amin(float): the minimum value of input magnitude, below which the input + magnitude is clipped(to amin). + top_db(float): the maximum db value of resulting spectrum, above which the + spectrum is clipped(to top_db). + Returns: + The spectrogram in log-scale. + shape: + input: any shape + output: same as input + """ + if amin <= 0: + raise Exception("amin must be strictly positive") + + if ref_value <= 0: + raise Exception("ref_value must be strictly positive") + + ones = paddle.ones_like(magnitude) + log_spec = 10.0 * paddle.log10(paddle.maximum(ones * amin, magnitude)) + log_spec -= 10.0 * math.log10(max(ref_value, amin)) + + if top_db is not None: + if top_db < 0: + raise Exception("top_db must be non-negative") + log_spec = paddle.maximum(log_spec, ones * (log_spec.max() - top_db)) + + return log_spec + + +class Spectrogram(nn.Layer): + def __init__(self, + n_fft: int=512, + hop_length: Optional[int]=None, + win_length: Optional[int]=None, + window: str='hann', + center: bool=True, + pad_mode: str='reflect', + dtype: str=paddle.float32): + """Compute spectrogram of a given signal, typically an audio waveform. + The spectorgram is defined as the complex norm of the short-time + Fourier transformation. + Parameters: + n_fft(int): the number of frequency components of the discrete Fourier transform. 
+ The default value is 2048, + hop_length(int|None): the hop length of the short time FFT. If None, it is set to win_length//4. + The default value is None. + win_length: the window length of the short time FFt. If None, it is set to same as n_fft. + The default value is None. + window(str): the name of the window function applied to the single before the Fourier transform. + The folllowing window names are supported: 'hamming','hann','kaiser','gaussian', + 'exponential','triang','bohman','blackman','cosine','tukey','taylor'. + The default value is 'hann' + center(bool): if True, the signal is padded so that frame t is centered at x[t * hop_length]. + If False, frame t begins at x[t * hop_length] + The default value is True + pad_mode(str): the mode to pad the signal if necessary. The supported modes are 'reflect' + and 'constant'. The default value is 'reflect'. + dtype(str): the data type of input and window. + Notes: + The Spectrogram transform relies on STFT transform to compute the spectrogram. + By default, the weights are not learnable. To fine-tune the Fourier coefficients, + set stop_gradient=False before training. + For more information, see STFT(). + """ + super(Spectrogram, self).__init__() + + if win_length is None: + win_length = n_fft + + fft_window = get_window(window, win_length, fftbins=True, dtype=dtype) + self._stft = partial( + paddle.signal.stft, + n_fft=n_fft, + hop_length=hop_length, + win_length=win_length, + window=fft_window, + center=center, + pad_mode=pad_mode) + + def forward(self, x): + stft = self._stft(x) + spectrogram = paddle.square(paddle.abs(stft)) + return spectrogram + + +class MelSpectrogram(nn.Layer): + def __init__(self, + sr: int=22050, + n_fft: int=512, + hop_length: Optional[int]=None, + win_length: Optional[int]=None, + window: str='hann', + center: bool=True, + pad_mode: str='reflect', + n_mels: int=64, + f_min: float=50.0, + f_max: Optional[float]=None, + htk: bool=False, + norm: Union[str, float]='slaney', + dtype: str=paddle.float32): + """Compute the melspectrogram of a given signal, typically an audio waveform. + The melspectrogram is also known as filterbank or fbank feature in audio community. + It is computed by multiplying spectrogram with Mel filter bank matrix. + Parameters: + sr(int): the audio sample rate. + The default value is 22050. + n_fft(int): the number of frequency components of the discrete Fourier transform. + The default value is 2048, + hop_length(int|None): the hop length of the short time FFT. If None, it is set to win_length//4. + The default value is None. + win_length: the window length of the short time FFt. If None, it is set to same as n_fft. + The default value is None. + window(str): the name of the window function applied to the single before the Fourier transform. + The folllowing window names are supported: 'hamming','hann','kaiser','gaussian', + 'exponential','triang','bohman','blackman','cosine','tukey','taylor'. + The default value is 'hann' + center(bool): if True, the signal is padded so that frame t is centered at x[t * hop_length]. + If False, frame t begins at x[t * hop_length] + The default value is True + pad_mode(str): the mode to pad the signal if necessary. The supported modes are 'reflect' + and 'constant'. + The default value is 'reflect'. + n_mels(int): the mel bins. + f_min(float): the lower cut-off frequency, below which the filter response is zero. + f_max(float): the upper cut-off frequency, above which the filter response is zeros. 
+ htk(bool): whether to use HTK formula in computing fbank matrix. + norm(str|float): the normalization type in computing fbank matrix. Slaney-style is used by default. + You can specify norm=1.0/2.0 to use customized p-norm normalization. + dtype(str): the datatype of fbank matrix used in the transform. Use float64 to increase numerical + accuracy. Note that the final transform will be conducted in float32 regardless of dtype of fbank matrix. + """ + super(MelSpectrogram, self).__init__() + + self._spectrogram = Spectrogram( + n_fft=n_fft, + hop_length=hop_length, + win_length=win_length, + window=window, + center=center, + pad_mode=pad_mode, + dtype=dtype) + self.n_mels = n_mels + self.f_min = f_min + self.f_max = f_max + self.htk = htk + self.norm = norm + if f_max is None: + f_max = sr // 2 + self.fbank_matrix = compute_fbank_matrix( + sr=sr, + n_fft=n_fft, + n_mels=n_mels, + f_min=f_min, + f_max=f_max, + htk=htk, + norm=norm, + dtype=dtype) # float64 for better numerical results + self.register_buffer('fbank_matrix', self.fbank_matrix) + + def forward(self, x): + spect_feature = self._spectrogram(x) + mel_feature = paddle.matmul(self.fbank_matrix, spect_feature) + return mel_feature + + +class LogMelSpectrogram(nn.Layer): + def __init__(self, + sr: int=22050, + n_fft: int=512, + hop_length: Optional[int]=None, + win_length: Optional[int]=None, + window: str='hann', + center: bool=True, + pad_mode: str='reflect', + n_mels: int=64, + f_min: float=50.0, + f_max: Optional[float]=None, + htk: bool=False, + norm: Union[str, float]='slaney', + ref_value: float=1.0, + amin: float=1e-10, + top_db: Optional[float]=None, + dtype: str=paddle.float32): + """Compute log-mel-spectrogram(also known as LogFBank) feature of a given signal, + typically an audio waveform. + Parameters: + sr(int): the audio sample rate. + The default value is 22050. + n_fft(int): the number of frequency components of the discrete Fourier transform. + The default value is 2048, + hop_length(int|None): the hop length of the short time FFT. If None, it is set to win_length//4. + The default value is None. + win_length: the window length of the short time FFt. If None, it is set to same as n_fft. + The default value is None. + window(str): the name of the window function applied to the single before the Fourier transform. + The folllowing window names are supported: 'hamming','hann','kaiser','gaussian', + 'exponential','triang','bohman','blackman','cosine','tukey','taylor'. + The default value is 'hann' + center(bool): if True, the signal is padded so that frame t is centered at x[t * hop_length]. + If False, frame t begins at x[t * hop_length] + The default value is True + pad_mode(str): the mode to pad the signal if necessary. The supported modes are 'reflect' + and 'constant'. + The default value is 'reflect'. + n_mels(int): the mel bins. + f_min(float): the lower cut-off frequency, below which the filter response is zero. + f_max(float): the upper cut-off frequency, above which the filter response is zeros. + ref_value(float): the reference value. If smaller than 1.0, the db level + htk(bool): whether to use HTK formula in computing fbank matrix. + norm(str|float): the normalization type in computing fbank matrix. Slaney-style is used by default. + You can specify norm=1.0/2.0 to use customized p-norm normalization. + dtype(str): the datatype of fbank matrix used in the transform. Use float64 to increase numerical + accuracy. Note that the final transform will be conducted in float32 regardless of dtype of fbank matrix. 
+ amin(float): the minimum value of input magnitude, below which the input of the signal will be pulled up accordingly. + Otherwise, the db level is pushed down. + magnitude is clipped(to amin). For numerical stability, set amin to a larger value, + e.g., 1e-3. + top_db(float): the maximum db value of resulting spectrum, above which the + spectrum is clipped(to top_db). + """ + super(LogMelSpectrogram, self).__init__() + + self._melspectrogram = MelSpectrogram( + sr=sr, + n_fft=n_fft, + hop_length=hop_length, + win_length=win_length, + window=window, + center=center, + pad_mode=pad_mode, + n_mels=n_mels, + f_min=f_min, + f_max=f_max, + htk=htk, + norm=norm, + dtype=dtype) + + self.ref_value = ref_value + self.amin = amin + self.top_db = top_db + + def forward(self, x): + # import ipdb; ipdb.set_trace() + mel_feature = self._melspectrogram(x) + log_mel_feature = power_to_db( + mel_feature, + ref_value=self.ref_value, + amin=self.amin, + top_db=self.top_db) + return log_mel_feature diff --git a/paddleaudio/features/window.py b/paddleaudio/features/window.py new file mode 100644 index 0000000000000000000000000000000000000000..629989fc9cca399e8c4228cf369553570b5f5db4 --- /dev/null +++ b/paddleaudio/features/window.py @@ -0,0 +1,415 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +import math +from typing import List +from typing import Tuple +from typing import Union + +import paddle +from paddle import Tensor + +__all__ = [ + 'get_window', +] + + +def _cat(a: List[Tensor], data_type: str) -> Tensor: + l = [paddle.to_tensor(_a, data_type) for _a in a] + return paddle.concat(l) + + +def _acosh(x: Union[Tensor, float]) -> Tensor: + if isinstance(x, float): + return math.log(x + math.sqrt(x**2 - 1)) + return paddle.log(x + paddle.sqrt(paddle.square(x) - 1)) + + +def _extend(M: int, sym: bool) -> bool: + """Extend window by 1 sample if needed for DFT-even symmetry""" + if not sym: + return M + 1, True + else: + return M, False + + +def _len_guards(M: int) -> bool: + """Handle small or incorrect window lengths""" + if int(M) != M or M < 0: + raise ValueError('Window length M must be a non-negative integer') + + return M <= 1 + + +def _truncate(w: Tensor, needed: bool) -> Tensor: + """Truncate window by 1 sample if needed for DFT-even symmetry""" + if needed: + return w[:-1] + else: + return w + + +def general_gaussian(M: int, p, sig, sym: bool=True, + dtype: str='float64') -> Tensor: + """Compute a window with a generalized Gaussian shape. + This function is consistent with scipy.signal.windows.general_gaussian(). + """ + if _len_guards(M): + return paddle.ones((M, ), dtype=dtype) + M, needs_trunc = _extend(M, sym) + + n = paddle.arange(0, M, dtype=dtype) - (M - 1.0) / 2.0 + w = paddle.exp(-0.5 * paddle.abs(n / sig)**(2 * p)) + + return _truncate(w, needs_trunc) + + +def general_hamming(M: int, alpha: float, sym: bool=True, + dtype: str='float64') -> Tensor: + """Compute a generalized Hamming window. 
+ This function is consistent with scipy.signal.windows.general_hamming() + """ + return general_cosine(M, [alpha, 1. - alpha], sym, dtype=dtype) + + +def taylor(M: int, + nbar=4, + sll=30, + norm=True, + sym: bool=True, + dtype: str='float64') -> Tensor: + """Compute a Taylor window. + The Taylor window taper function approximates the Dolph-Chebyshev window's + constant sidelobe level for a parameterized number of near-in sidelobes. + Parameters: + M(int): window size + nbar, sil, norm: the window-specific parameter. + sym(bool):whether to return symmetric window. + The default value is True + dtype(str): the datatype of returned tensor. + Returns: + Tensor: the window tensor + """ + if _len_guards(M): + return paddle.ones((M, ), dtype=dtype) + M, needs_trunc = _extend(M, sym) + # Original text uses a negative sidelobe level parameter and then negates + # it in the calculation of B. To keep consistent with other methods we + # assume the sidelobe level parameter to be positive. + B = 10**(sll / 20) + A = _acosh(B) / math.pi + s2 = nbar**2 / (A**2 + (nbar - 0.5)**2) + ma = paddle.arange(1, nbar, dtype=dtype) + + Fm = paddle.empty((nbar - 1, ), dtype=dtype) + signs = paddle.empty_like(ma) + signs[::2] = 1 + signs[1::2] = -1 + m2 = ma * ma + for mi in range(len(ma)): + numer = signs[mi] * paddle.prod(1 - m2[mi] / s2 / (A**2 + (ma - 0.5)**2 + )) + if mi == 0: + denom = 2 * paddle.prod(1 - m2[mi] / m2[mi + 1:]) + elif mi == len(ma) - 1: + denom = 2 * paddle.prod(1 - m2[mi] / m2[:mi]) + else: + denom = 2 * paddle.prod(1 - m2[mi] / m2[:mi]) * paddle.prod(1 - m2[ + mi] / m2[mi + 1:]) + + Fm[mi] = numer / denom + + def W(n): + return 1 + 2 * paddle.matmul( + Fm.unsqueeze(0), + paddle.cos(2 * math.pi * ma.unsqueeze(1) * (n - M / 2. + 0.5) / M)) + + w = W(paddle.arange(0, M, dtype=dtype)) + + # normalize (Note that this is not described in the original text [1]) + if norm: + scale = 1.0 / W((M - 1) / 2) + w *= scale + w = w.squeeze() + return _truncate(w, needs_trunc) + + +def general_cosine(M: int, a: float, sym: bool=True, + dtype: str='float64') -> Tensor: + """Compute a generic weighted sum of cosine terms window. + This function is consistent with scipy.signal.windows.general_cosine(). + """ + if _len_guards(M): + return paddle.ones((M, ), dtype=dtype) + M, needs_trunc = _extend(M, sym) + fac = paddle.linspace(-math.pi, math.pi, M, dtype=dtype) + w = paddle.zeros((M, ), dtype=dtype) + for k in range(len(a)): + w += a[k] * paddle.cos(k * fac) + return _truncate(w, needs_trunc) + + +def hamming(M: int, sym: bool=True, dtype: str='float64') -> Tensor: + """Compute a Hamming window. + The Hamming window is a taper formed by using a raised cosine with + non-zero endpoints, optimized to minimize the nearest side lobe. + Parameters: + M(int): window size + sym(bool):whether to return symmetric window. + The default value is True + dtype(str): the datatype of returned tensor. + Returns: + Tensor: the window tensor + """ + return general_hamming(M, 0.54, sym, dtype=dtype) + + +def hann(M: int, sym: bool=True, dtype: str='float64') -> Tensor: + """Compute a Hann window. + The Hann window is a taper formed by using a raised cosine or sine-squared + with ends that touch zero. + Parameters: + M(int): window size + sym(bool):whether to return symmetric window. + The default value is True + dtype(str): the datatype of returned tensor. 
+ Returns: + Tensor: the window tensor + """ + return general_hamming(M, 0.5, sym, dtype=dtype) + + +def tukey(M: int, alpha=0.5, sym: bool=True, dtype: str='float64') -> Tensor: + """Compute a Tukey window. + The Tukey window is also known as a tapered cosine window. + Parameters: + M(int): window size + sym(bool):whether to return symmetric window. + The default value is True + dtype(str): the datatype of returned tensor. + Returns: + Tensor: the window tensor + """ + if _len_guards(M): + return paddle.ones((M, ), dtype=dtype) + + if alpha <= 0: + return paddle.ones((M, ), dtype=dtype) + elif alpha >= 1.0: + return hann(M, sym=sym) + + M, needs_trunc = _extend(M, sym) + + n = paddle.arange(0, M, dtype=dtype) + width = int(alpha * (M - 1) / 2.0) + n1 = n[0:width + 1] + n2 = n[width + 1:M - width - 1] + n3 = n[M - width - 1:] + + w1 = 0.5 * (1 + paddle.cos(math.pi * (-1 + 2.0 * n1 / alpha / (M - 1)))) + w2 = paddle.ones(n2.shape, dtype=dtype) + w3 = 0.5 * (1 + paddle.cos(math.pi * (-2.0 / alpha + 1 + 2.0 * n3 / alpha / + (M - 1)))) + w = paddle.concat([w1, w2, w3]) + + return _truncate(w, needs_trunc) + + +def kaiser(M: int, beta: float, sym: bool=True, dtype: str='float64') -> Tensor: + """Compute a Kaiser window. + The Kaiser window is a taper formed by using a Bessel function. + Parameters: + M(int): window size. + beta(float): the window-specific parameter. + sym(bool):whether to return symmetric window. + The default value is True + Returns: + Tensor: the window tensor + """ + raise NotImplementedError() + + +def gaussian(M: int, std: float, sym: bool=True, + dtype: str='float64') -> Tensor: + """Compute a Gaussian window. + The Gaussian widows has a Gaussian shape defined by the standard deviation(std). + Parameters: + M(int): window size. + std(float): the window-specific parameter. + sym(bool):whether to return symmetric window. + The default value is True + dtype(str): the datatype of returned tensor. + Returns: + Tensor: the window tensor + """ + if _len_guards(M): + return paddle.ones((M, ), dtype=dtype) + M, needs_trunc = _extend(M, sym) + + n = paddle.arange(0, M, dtype=dtype) - (M - 1.0) / 2.0 + sig2 = 2 * std * std + w = paddle.exp(-n**2 / sig2) + + return _truncate(w, needs_trunc) + + +def exponential(M: int, + center=None, + tau=1., + sym: bool=True, + dtype: str='float64') -> Tensor: + """Compute an exponential (or Poisson) window. + Parameters: + M(int): window size. + tau(float): the window-specific parameter. + sym(bool):whether to return symmetric window. + The default value is True + dtype(str): the datatype of returned tensor. + Returns: + Tensor: the window tensor + """ + if sym and center is not None: + raise ValueError("If sym==True, center must be None.") + if _len_guards(M): + return paddle.ones((M, ), dtype=dtype) + M, needs_trunc = _extend(M, sym) + + if center is None: + center = (M - 1) / 2 + + n = paddle.arange(0, M, dtype=dtype) + w = paddle.exp(-paddle.abs(n - center) / tau) + + return _truncate(w, needs_trunc) + + +def triang(M: int, sym: bool=True, dtype: str='float64') -> Tensor: + """Compute a triangular window. + Parameters: + M(int): window size. + sym(bool):whether to return symmetric window. + The default value is True + dtype(str): the datatype of returned tensor. 
+ Returns: + Tensor: the window tensor + """ + if _len_guards(M): + return paddle.ones((M, ), dtype=dtype) + M, needs_trunc = _extend(M, sym) + + n = paddle.arange(1, (M + 1) // 2 + 1, dtype=dtype) + if M % 2 == 0: + w = (2 * n - 1.0) / M + w = paddle.concat([w, w[::-1]]) + else: + w = 2 * n / (M + 1.0) + w = paddle.concat([w, w[-2::-1]]) + + return _truncate(w, needs_trunc) + + +def bohman(M: int, sym: bool=True, dtype: str='float64') -> Tensor: + """Compute a Bohman window. + The Bohman window is the autocorrelation of a cosine window. + Parameters: + M(int): window size. + sym(bool):whether to return symmetric window. + The default value is True + dtype(str): the datatype of returned tensor. + Returns: + Tensor: the window tensor + """ + if _len_guards(M): + return paddle.ones((M, ), dtype=dtype) + M, needs_trunc = _extend(M, sym) + + fac = paddle.abs(paddle.linspace(-1, 1, M, dtype=dtype)[1:-1]) + w = (1 - fac) * paddle.cos(math.pi * fac) + 1.0 / math.pi * paddle.sin( + math.pi * fac) + w = _cat([0, w, 0], dtype) + + return _truncate(w, needs_trunc) + + +def blackman(M: int, sym: bool=True, dtype: str='float64') -> Tensor: + """Compute a Blackman window. + The Blackman window is a taper formed by using the first three terms of + a summation of cosines. It was designed to have close to the minimal + leakage possible. It is close to optimal, only slightly worse than a + Kaiser window. + Parameters: + M(int): window size. + sym(bool):whether to return symmetric window. + The default value is True + dtype(str): the datatype of returned tensor. + Returns: + Tensor: the window tensor + """ + return general_cosine(M, [0.42, 0.50, 0.08], sym, dtype=dtype) + + +def cosine(M: int, sym: bool=True, dtype: str='float64') -> Tensor: + """Compute a window with a simple cosine shape. + Parameters: + M(int): window size. + sym(bool):whether to return symmetric window. + The default value is True + dtype(str): the datatype of returned tensor. + Returns: + Tensor: the window tensor + """ + if _len_guards(M): + return paddle.ones((M, ), dtype=dtype) + M, needs_trunc = _extend(M, sym) + w = paddle.sin(math.pi / M * (paddle.arange(0, M, dtype=dtype) + .5)) + + return _truncate(w, needs_trunc) + + +def get_window(window: Union[str, Tuple[str, float]], + win_length: int, + fftbins: bool=True, + dtype: str='float64') -> Tensor: + """Return a window of a given length and type. + Parameters: + window(str|(str,float)): the type of window to create. + win_length(int): the number of samples in the window. + fftbins(bool): If True, create a "periodic" window. Otherwise, + create a "symmetric" window, for use in filter design. + Returns: + The window represented as a tensor. + """ + sym = not fftbins + + args = () + if isinstance(window, tuple): + winstr = window[0] + if len(window) > 1: + args = window[1:] + elif isinstance(window, str): + if window in ['gaussian', 'exponential']: + raise ValueError("The '" + window + "' window needs one or " + "more parameters -- pass a tuple.") + else: + winstr = window + else: + raise ValueError("%s as window type is not supported." 
% + str(type(window))) + + try: + winfunc = eval(winstr) + except KeyError as e: + raise ValueError("Unknown window type.") from e + + params = (win_length, ) + args + kwargs = {'sym': sym} + return winfunc(*params, dtype=dtype, **kwargs) diff --git a/audio/paddleaudio/utils/__init__.py b/paddleaudio/utils/__init__.py similarity index 100% rename from audio/paddleaudio/utils/__init__.py rename to paddleaudio/utils/__init__.py diff --git a/audio/paddleaudio/utils/download.py b/paddleaudio/utils/download.py similarity index 65% rename from audio/paddleaudio/utils/download.py rename to paddleaudio/utils/download.py index 0a36f29b97f024afea5347773de0db576e8ed875..45a8e57ba0ad31f0921fd03a5a2156f02d34a1cb 100644 --- a/audio/paddleaudio/utils/download.py +++ b/paddleaudio/utils/download.py @@ -17,7 +17,6 @@ from typing import List from paddle.framework import load as load_state_dict from paddle.utils import download -from pathos.multiprocessing import ProcessPool from .log import logger @@ -32,27 +31,18 @@ def decompress(file: str): download._decompress(file) -def download_and_decompress(archives: List[Dict[str, str]], - path: str, - n_workers: int=0): +def download_and_decompress(archives: List[Dict[str, str]], path: str): """ Download archieves and decompress to specific path. """ if not os.path.isdir(path): os.makedirs(path) - if n_workers <= 0: - for archive in archives: - assert 'url' in archive and 'md5' in archive, \ - 'Dictionary keys of "url" and "md5" are required in the archive, but got: {list(archieve.keys())}' + for archive in archives: + assert 'url' in archive and 'md5' in archive, \ + 'Dictionary keys of "url" and "md5" are required in the archive, but got: {list(archieve.keys())}' - download.get_path_from_url(archive['url'], path, archive['md5']) - else: - pool = ProcessPool(nodes=n_workers) - pool.imap(download.get_path_from_url, [_['url'] for _ in archives], - [path] * len(archives), [_['md5'] for _ in archives]) - pool.close() - pool.join() + download.get_path_from_url(archive['url'], path, archive['md5']) def load_state_dict_from_url(url: str, path: str, md5: str=None): diff --git a/audio/paddleaudio/utils/env.py b/paddleaudio/utils/env.py similarity index 100% rename from audio/paddleaudio/utils/env.py rename to paddleaudio/utils/env.py diff --git a/audio/paddleaudio/utils/error.py b/paddleaudio/utils/error.py similarity index 100% rename from audio/paddleaudio/utils/error.py rename to paddleaudio/utils/error.py diff --git a/audio/paddleaudio/utils/log.py b/paddleaudio/utils/log.py similarity index 100% rename from audio/paddleaudio/utils/log.py rename to paddleaudio/utils/log.py diff --git a/audio/paddleaudio/utils/time.py b/paddleaudio/utils/time.py similarity index 100% rename from audio/paddleaudio/utils/time.py rename to paddleaudio/utils/time.py diff --git a/paddlespeech/cli/README.md b/paddlespeech/cli/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4cea85b1484ebb499e812423e6ab61f78a06f18f --- /dev/null +++ b/paddlespeech/cli/README.md @@ -0,0 +1,9 @@ +# PaddleSpeech Command Line + + The simplest approach to use PaddleSpeech models. + + ## Help + `paddlespeech help` + + ## S2T + `paddlespeech s2t --config ./s2t.yaml --input ./zh.wav --device gpu` diff --git a/paddlespeech/cli/__init__.py b/paddlespeech/cli/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1cc7e27f58099501282fa6d6ea4373b745b788f7 --- /dev/null +++ b/paddlespeech/cli/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from .base_commands import BaseCommand +from .base_commands import HelpCommand +from .s2t import S2TExecutor diff --git a/paddlespeech/cli/base_commands.py b/paddlespeech/cli/base_commands.py new file mode 100644 index 0000000000000000000000000000000000000000..97d5cd7fa3ad30ee2338b50cfd5123fe4cd99d05 --- /dev/null +++ b/paddlespeech/cli/base_commands.py @@ -0,0 +1,49 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import List + +from .entry import commands +from .utils import cli_register +from .utils import get_command + +__all__ = [ + 'BaseCommand', + 'HelpCommand', +] + + +@cli_register(name='paddlespeech') +class BaseCommand: + def execute(self, argv: List[str]) -> bool: + help = get_command('paddlespeech.help') + return help().execute(argv) + + +@cli_register(name='paddlespeech.help', description='Show help for commands.') +class HelpCommand: + def execute(self, argv: List[str]) -> bool: + msg = 'Usage:\n' + msg += ' paddlespeech \n\n' + msg += 'Commands:\n' + for command, detail in commands['paddlespeech'].items(): + if command.startswith('_'): + continue + + if '_description' not in detail: + continue + msg += ' {:<15} {}\n'.format(command, + detail['_description']) + + print(msg) + return True diff --git a/paddlespeech/cli/cls/__init.__py b/paddlespeech/cli/cls/__init.__py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/paddlespeech/cli/entry.py b/paddlespeech/cli/entry.py new file mode 100644 index 0000000000000000000000000000000000000000..726cff1afd6832ef36c4a3ad7e9d197063e562e3 --- /dev/null +++ b/paddlespeech/cli/entry.py @@ -0,0 +1,38 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
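The CLI pieces above and below share one registry: `commands` is a nested defaultdict, `cli_register` (defined later in `paddlespeech/cli/utils.py`) stores each command class under a dotted name, and the entry point walks the argv tokens down that tree. A simplified, self-contained sketch of the pattern (not the actual module code; the `demo.hello` command is invented for illustration) is:

from collections import defaultdict

def _command_dict():
    return defaultdict(_command_dict)

commands = _command_dict()

def cli_register(name, description=''):
    def _wrapper(command_cls):
        node = commands
        for part in name.split('.'):        # e.g. 'demo.hello'
            node = node[part]
        node['_entry'] = command_cls
        if description:
            node['_description'] = description
        return command_cls
    return _wrapper

@cli_register(name='demo.hello', description='Toy command.')
class HelloCommand:
    def execute(self, argv):
        print('hello', argv)
        return True

def dispatch(argv):
    # Walk argv tokens down the tree until no deeper match, then run '_entry'.
    node, idx = commands, 0
    for idx, token in enumerate(argv):
        if token not in node:
            break
        node = node[token]
    return 0 if node['_entry']().execute(argv[idx:]) else 1

print(dispatch(['demo', 'hello', '--name', 'world']))   # prints the args, then exit status 0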
+import sys +from collections import defaultdict + +__all__ = ['commands'] + + +def _CommandDict(): + return defaultdict(_CommandDict) + + +def _execute(): + com = commands + for idx, _argv in enumerate(['paddlespeech'] + sys.argv[1:]): + if _argv not in com: + break + com = com[_argv] + + # The method 'execute' of a command instance returns 'True' for a success + # while 'False' for a failure. Here converts this result into a exit status + # in bash: 0 for a success and 1 for a failure. + status = 0 if com['_entry']().execute(sys.argv[idx:]) else 1 + return status + + +commands = _CommandDict() diff --git a/paddlespeech/cli/executor.py b/paddlespeech/cli/executor.py new file mode 100644 index 0000000000000000000000000000000000000000..45472fa4b62de440ed27c737fb9dbc1349ff49b2 --- /dev/null +++ b/paddlespeech/cli/executor.py @@ -0,0 +1,67 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +from abc import ABC +from abc import abstractmethod +from typing import Optional +from typing import Union + +import paddle + + +class BaseExecutor(ABC): + """ + An abstract executor of paddlespeech tasks. + """ + + def __init__(self): + self.input = None + self.output = None + + @abstractmethod + def _get_default_cfg_path(self): + """ + Returns a default config file path of current task. + """ + pass + + @abstractmethod + def _init_from_cfg(self, cfg_path: Optional[os.PathLike]=None): + """ + Init model from a specific config file. + """ + pass + + @abstractmethod + def preprocess(self, input: Union[str, os.PathLike]): + """ + Input preprocess and return paddle.Tensor stored in self.input. + Input content can be a text(t2s), a file(s2t, cls) or a streaming(not supported yet). + """ + pass + + @paddle.no_grad() + @abstractmethod + def infer(self, device: str): + """ + Model inference and result stored in self.output. + """ + pass + + @abstractmethod + def postprocess(self) -> Union[str, os.PathLike]: + """ + Output postprocess and return human-readable results such as texts and audio files. + """ + pass diff --git a/paddlespeech/cli/s2t/__init__.py b/paddlespeech/cli/s2t/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..57e814b9eb792d014108f3c29aad204f98382c99 --- /dev/null +++ b/paddlespeech/cli/s2t/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
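Concrete executors such as `S2TExecutor` below fill in the abstract hooks of `BaseExecutor`; the intended call order is config → preprocess → infer → postprocess. A toy subclass illustrating that contract (the class, its "reverse the text" task, and all names are invented for illustration, not part of the package) might look like:

import os
from typing import Optional, Union

import paddle

from paddlespeech.cli.executor import BaseExecutor


class ReverseExecutor(BaseExecutor):
    """Toy executor: 'infers' the reversed input string."""

    def _get_default_cfg_path(self):
        return None                      # this toy task needs no config file

    def _init_from_cfg(self, cfg_path: Optional[os.PathLike]=None):
        pass                             # a real executor would build its model here

    def preprocess(self, input: Union[str, os.PathLike]):
        # Store the "features" (character codes) in self.input as a Tensor.
        self.input = paddle.to_tensor([ord(c) for c in str(input)])

    @paddle.no_grad()
    def infer(self, device: str='cpu'):
        self.output = paddle.flip(self.input, axis=[0])   # stand-in for a model forward pass

    def postprocess(self) -> Union[str, os.PathLike]:
        return ''.join(chr(int(v)) for v in self.output.numpy())


executor = ReverseExecutor()
executor._init_from_cfg(executor._get_default_cfg_path())
executor.preprocess('hello')
executor.infer()
print(executor.postprocess())            # olleh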
+from .infer import S2TExecutor diff --git a/paddlespeech/cli/s2t/conf/default_conf.yaml b/paddlespeech/cli/s2t/conf/default_conf.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/paddlespeech/cli/s2t/infer.py b/paddlespeech/cli/s2t/infer.py new file mode 100644 index 0000000000000000000000000000000000000000..682279852cd9ca7787a013257f30a576000324e6 --- /dev/null +++ b/paddlespeech/cli/s2t/infer.py @@ -0,0 +1,103 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse +import os +from typing import List +from typing import Optional +from typing import Union + +import paddle + +from ..executor import BaseExecutor +from ..utils import cli_register + +__all__ = ['S2TExecutor'] + + +@cli_register( + name='paddlespeech.s2t', description='Speech to text infer command.') +class S2TExecutor(BaseExecutor): + def __init__(self): + super(S2TExecutor, self).__init__() + + self.parser = argparse.ArgumentParser( + prog='paddlespeech.s2t', add_help=True) + self.parser.add_argument( + '--config', + type=str, + default=None, + help='Config of s2t task. Use deault config when it is None.') + self.parser.add_argument( + '--input', type=str, help='Audio file to recognize.') + self.parser.add_argument( + '--device', + type=str, + default='cpu', + help='Choose device to execute model inference.') + + def _get_default_cfg_path(self): + """ + Returns a default config file path of current task. + """ + pass + + def _init_from_cfg(self, cfg_path: Optional[os.PathLike]=None): + """ + Init model from a specific config file. + """ + pass + + def preprocess(self, input: Union[str, os.PathLike]): + """ + Input preprocess and return paddle.Tensor stored in self.input. + Input content can be a text(t2s), a file(s2t, cls) or a streaming(not supported yet). + """ + pass + + @paddle.no_grad() + def infer(self): + """ + Model inference and result stored in self.output. + """ + pass + + def postprocess(self) -> Union[str, os.PathLike]: + """ + Output postprocess and return human-readable results such as texts and audio files. + """ + pass + + def execute(self, argv: List[str]) -> bool: + parser_args = self.parser.parse_args(argv) + print(parser_args) + + config = parser_args.config + audio_file = parser_args.input + device = parser_args.device + + if config is not None: + assert os.path.isfile(config), 'Config file is not valid.' + else: + config = self._get_default_cfg_path() + + try: + self._init_from_cfg(config) + self.preprocess(audio_file) + self.infer() + res = self.postprocess() # Retrieve result of s2t. 
+ print(res) + return True + except Exception as e: + print(e) + return False diff --git a/paddlespeech/cli/t2s/__init.__py b/paddlespeech/cli/t2s/__init.__py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/paddlespeech/cli/utils.py b/paddlespeech/cli/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..c83deee890763ec4bd1e25c99b1d22272b224712 --- /dev/null +++ b/paddlespeech/cli/utils.py @@ -0,0 +1,86 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +from typing import Any +from typing import Dict +from typing import List + +from paddle.framework import load +from paddle.utils import download + +from .entry import commands + +__all__ = [ + 'cli_register', + 'get_command', + 'download_and_decompress', + 'load_state_dict_from_url', +] + + +def cli_register(name: str, description: str='') -> Any: + def _warpper(command): + items = name.split('.') + + com = commands + for item in items: + com = com[item] + com['_entry'] = command + if description: + com['_description'] = description + return command + + return _warpper + + +def get_command(name: str) -> Any: + items = name.split('.') + com = commands + for item in items: + com = com[item] + + return com['_entry'] + + +def decompress(file: str): + """ + Extracts all files from a compressed file. + """ + assert os.path.isfile(file), "File: {} not exists.".format(file) + download._decompress(file) + + +def download_and_decompress(archives: List[Dict[str, str]], path: str): + """ + Download archieves and decompress to specific path. 
+ """ + if not os.path.isdir(path): + os.makedirs(path) + + for archive in archives: + assert 'url' in archive and 'md5' in archive, \ + 'Dictionary keys of "url" and "md5" are required in the archive, but got: {list(archieve.keys())}' + + download.get_path_from_url(archive['url'], path, archive['md5']) + + +def load_state_dict_from_url(url: str, path: str, md5: str=None): + """ + Download and load a state dict from url + """ + if not os.path.isdir(path): + os.makedirs(path) + + download.get_path_from_url(url, path, md5) + return load(os.path.join(path, os.path.basename(url))) diff --git a/audio/paddleaudio/models/__init__.py b/paddlespeech/cls/exps/__init__.py similarity index 100% rename from audio/paddleaudio/models/__init__.py rename to paddlespeech/cls/exps/__init__.py diff --git a/paddlespeech/t2s/modules/fastspeech2_predictor/__init__.py b/paddlespeech/cls/exps/panns/__init__.py similarity index 100% rename from paddlespeech/t2s/modules/fastspeech2_predictor/__init__.py rename to paddlespeech/cls/exps/panns/__init__.py diff --git a/paddlespeech/cls/exps/panns/deploy/__init__.py b/paddlespeech/cls/exps/panns/deploy/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..185a92b8d94d3426d616c0624f0f2ee04339349e --- /dev/null +++ b/paddlespeech/cls/exps/panns/deploy/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/audio/examples/sound_classification/deploy/python/predict.py b/paddlespeech/cls/exps/panns/deploy/predict.py similarity index 97% rename from audio/examples/sound_classification/deploy/python/predict.py rename to paddlespeech/cls/exps/panns/deploy/predict.py index a99b8980c20ed3a2aa3f1c26fff58cf84c6db7ab..d4e5c22fb12b6453ba6ef6e4192f6a9442b960a9 100644 --- a/audio/examples/sound_classification/deploy/python/predict.py +++ b/paddlespeech/cls/exps/panns/deploy/predict.py @@ -16,16 +16,18 @@ import os import numpy as np from paddle import inference +from scipy.special import softmax + from paddleaudio.backends import load as load_audio from paddleaudio.datasets import ESC50 from paddleaudio.features import melspectrogram -from scipy.special import softmax # yapf: disable parser = argparse.ArgumentParser() parser.add_argument("--model_dir", type=str, required=True, default="./export", help="The directory to static model.") -parser.add_argument("--batch_size", type=int, default=2, help="Batch size per GPU/CPU for training.") parser.add_argument('--device', choices=['cpu', 'gpu', 'xpu'], default="gpu", help="Select which device to train model, defaults to gpu.") +parser.add_argument("--wav", type=str, required=True, help="Audio file to infer.") +parser.add_argument("--batch_size", type=int, default=1, help="Batch size per GPU/CPU for training.") parser.add_argument('--use_tensorrt', type=eval, default=False, choices=[True, False], help='Enable to use tensorrt to speed up.') parser.add_argument("--precision", type=str, default="fp32", choices=["fp32", "fp16"], help='The tensorrt precision.') parser.add_argument('--cpu_threads', type=int, default=10, help='Number of threads to predict when using cpu.') @@ -131,10 +133,7 @@ if __name__ == "__main__": args.use_tensorrt, args.precision, args.cpu_threads, args.enable_mkldnn) - wavs = [ - '~/audio_demo_resource/cat.wav', - '~/audio_demo_resource/dog.wav', - ] + wavs = [args.wav] for i in range(len(wavs)): wavs[i] = os.path.abspath(os.path.expanduser(wavs[i])) diff --git a/audio/examples/sound_classification/export_model.py b/paddlespeech/cls/exps/panns/export_model.py similarity index 94% rename from audio/examples/sound_classification/export_model.py rename to paddlespeech/cls/exps/panns/export_model.py index 1be7b27a369e04e47c7b188fb169b1ed07c0d95d..c295c6a33838b086480ddc4e681341cbd023d560 100644 --- a/audio/examples/sound_classification/export_model.py +++ b/paddlespeech/cls/exps/panns/export_model.py @@ -15,9 +15,10 @@ import argparse import os import paddle -from model import SoundClassifier + from paddleaudio.datasets import ESC50 -from paddleaudio.models.panns import cnn14 +from paddlespeech.cls.models import cnn14 +from paddlespeech.cls.models import SoundClassifier # yapf: disable parser = argparse.ArgumentParser(__doc__) diff --git a/audio/examples/sound_classification/predict.py b/paddlespeech/cls/exps/panns/predict.py similarity index 67% rename from audio/examples/sound_classification/predict.py rename to paddlespeech/cls/exps/panns/predict.py index 30d141cd0dac5c13ce0203d03d0a47700d9b849d..9cfd8b6ce44ab7318a218a52bd1c2eaee28d680a 100644 --- a/audio/examples/sound_classification/predict.py +++ b/paddlespeech/cls/exps/panns/predict.py @@ -16,30 +16,40 @@ import argparse import numpy as np import paddle import paddle.nn.functional as F -from model import SoundClassifier + from paddleaudio.backends import load as load_audio from paddleaudio.datasets import ESC50 +from paddleaudio.features import LogMelSpectrogram from 
paddleaudio.features import melspectrogram -from paddleaudio.models.panns import cnn14 +from paddlespeech.cls.models import cnn14 +from paddlespeech.cls.models import SoundClassifier # yapf: disable parser = argparse.ArgumentParser(__doc__) -parser.add_argument('--device', choices=['cpu', 'gpu'], default="gpu", help="Select which device to predict, defaults to gpu.") parser.add_argument("--wav", type=str, required=True, help="Audio file to infer.") +parser.add_argument("--feat_backend", type=str, choices=['numpy', 'paddle'], default='numpy', help="Choose backend to extract features from audio files.") parser.add_argument("--top_k", type=int, default=1, help="Show top k predicted results") parser.add_argument("--checkpoint", type=str, required=True, help="Checkpoint of model.") args = parser.parse_args() # yapf: enable -def extract_features(file: str, **kwargs): +def extract_features(file: str, feat_backend: str='numpy', + **kwargs) -> paddle.Tensor: waveform, sr = load_audio(file, sr=None) - feat = melspectrogram(waveform, sr, **kwargs).transpose() + + if args.feat_backend == 'numpy': + feat = melspectrogram(waveform, sr, **kwargs).transpose() + feat = np.expand_dims(feat, 0) + feat = paddle.to_tensor(feat) + else: + feature_extractor = LogMelSpectrogram(sr=sr, **kwargs) + feat = feature_extractor(paddle.to_tensor(waveform).unsqueeze(0)) + feat = paddle.transpose(feat, [0, 2, 1]) return feat if __name__ == '__main__': - paddle.set_device(args.device) model = SoundClassifier( backbone=cnn14(pretrained=False, extract_embedding=True), @@ -47,8 +57,7 @@ if __name__ == '__main__': model.set_state_dict(paddle.load(args.checkpoint)) model.eval() - feat = np.expand_dims(extract_features(args.wav), 0) - feat = paddle.to_tensor(feat) + feat = extract_features(args.wav, args.feat_backend) logits = model(feat) probs = F.softmax(logits, axis=1).numpy() diff --git a/audio/examples/sound_classification/train.py b/paddlespeech/cls/exps/panns/train.py similarity index 80% rename from audio/examples/sound_classification/train.py rename to paddlespeech/cls/exps/panns/train.py index e3b5e2ae039595962e791f21fba327b78b44d57a..1213097899cee2594b5ecf4e91310c16b5f46841 100644 --- a/audio/examples/sound_classification/train.py +++ b/paddlespeech/cls/exps/panns/train.py @@ -15,16 +15,18 @@ import argparse import os import paddle -from model import SoundClassifier + from paddleaudio.datasets import ESC50 -from paddleaudio.models.panns import cnn14 +from paddleaudio.features import LogMelSpectrogram from paddleaudio.utils import logger from paddleaudio.utils import Timer +from paddlespeech.cls.models import cnn14 +from paddlespeech.cls.models import SoundClassifier # yapf: disable parser = argparse.ArgumentParser(__doc__) -parser.add_argument('--device', choices=['cpu', 'gpu'], default="gpu", help="Select which device to train model, defaults to gpu.") parser.add_argument("--epochs", type=int, default=50, help="Number of epoches for fine-tuning.") +parser.add_argument("--feat_backend", type=str, choices=['numpy', 'paddle'], default='numpy', help="Choose backend to extract features from audio files.") parser.add_argument("--learning_rate", type=float, default=5e-5, help="Learning rate used to train with warmup.") parser.add_argument("--batch_size", type=int, default=16, help="Total examples' number in batch for training.") parser.add_argument("--num_workers", type=int, default=0, help="Number of workers in dataloader.") @@ -35,7 +37,6 @@ args = parser.parse_args() # yapf: enable if __name__ == "__main__": - 
paddle.set_device(args.device) nranks = paddle.distributed.get_world_size() if paddle.distributed.get_world_size() > 1: paddle.distributed.init_parallel_env() @@ -48,8 +49,13 @@ if __name__ == "__main__": learning_rate=args.learning_rate, parameters=model.parameters()) criterion = paddle.nn.loss.CrossEntropyLoss() - train_ds = ESC50(mode='train', feat_type='melspectrogram') - dev_ds = ESC50(mode='dev', feat_type='melspectrogram') + if args.feat_backend == 'numpy': + train_ds = ESC50(mode='train', feat_type='melspectrogram') + dev_ds = ESC50(mode='dev', feat_type='melspectrogram') + else: + train_ds = ESC50(mode='train') + dev_ds = ESC50(mode='dev') + feature_extractor = LogMelSpectrogram(sr=16000) train_sampler = paddle.io.DistributedBatchSampler( train_ds, batch_size=args.batch_size, shuffle=True, drop_last=False) @@ -71,7 +77,16 @@ if __name__ == "__main__": num_corrects = 0 num_samples = 0 for batch_idx, batch in enumerate(train_loader): - feats, labels = batch + if args.feat_backend == 'numpy': + feats, labels = batch + else: + waveforms, labels = batch + feats = feature_extractor( + waveforms + ) # Need a padding when lengths of waveforms differ in a batch. + feats = paddle.transpose(feats, + [0, 2, 1]) # To [N, length, n_mels] + logits = model(feats) loss = criterion(logits, labels) @@ -126,7 +141,13 @@ if __name__ == "__main__": num_samples = 0 with logger.processing('Evaluation on validation dataset'): for batch_idx, batch in enumerate(dev_loader): - feats, labels = batch + if args.feat_backend == 'numpy': + feats, labels = batch + else: + waveforms, labels = batch + feats = feature_extractor(waveforms) + feats = paddle.transpose(feats, [0, 2, 1]) + logits = model(feats) preds = paddle.argmax(logits, axis=1) diff --git a/paddlespeech/cls/models/__init__.py b/paddlespeech/cls/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..4bfadda11968aa2262d810941761dca6838a0d79 --- /dev/null +++ b/paddlespeech/cls/models/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from .panns import * diff --git a/paddlespeech/cls/models/panns/__init__.py b/paddlespeech/cls/models/panns/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..638f772f9e99c4fa910cbcee9333c6772024ed8a --- /dev/null +++ b/paddlespeech/cls/models/panns/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +from .classifier import * +from .panns import * diff --git a/audio/examples/sound_classification/model.py b/paddlespeech/cls/models/panns/classifier.py similarity index 100% rename from audio/examples/sound_classification/model.py rename to paddlespeech/cls/models/panns/classifier.py diff --git a/audio/paddleaudio/models/panns.py b/paddlespeech/cls/models/panns/panns.py similarity index 99% rename from audio/paddleaudio/models/panns.py rename to paddlespeech/cls/models/panns/panns.py index 1c68f06f634747361d0929190afdc465ac75bbb1..6d2dac56ac23d9b3322e49703f98e15faf936fd0 100644 --- a/audio/paddleaudio/models/panns.py +++ b/paddlespeech/cls/models/panns/panns.py @@ -16,8 +16,8 @@ import os import paddle.nn as nn import paddle.nn.functional as F -from ..utils.download import load_state_dict_from_url -from ..utils.env import MODEL_HOME +from paddleaudio.utils.download import load_state_dict_from_url +from paddleaudio.utils.env import MODEL_HOME __all__ = ['CNN14', 'CNN10', 'CNN6', 'cnn14', 'cnn10', 'cnn6'] diff --git a/paddlespeech/s2t/decoders/recog.py b/paddlespeech/s2t/decoders/recog.py index d9324ca02ad5fff81f8c386282dfbce9683b0d5c..3e9939f02bc678217e1f1c61d06aa674ab403cf6 100644 --- a/paddlespeech/s2t/decoders/recog.py +++ b/paddlespeech/s2t/decoders/recog.py @@ -40,7 +40,6 @@ def get_config(config_path): def load_trained_model(args): - args.nprocs = args.ngpu confs = get_config(args.model_conf) class_obj = dynamic_import_tester(args.model_name) exp = class_obj(confs, args) diff --git a/paddlespeech/s2t/exps/deepspeech2/bin/test_hub.py b/paddlespeech/s2t/exps/deepspeech2/bin/test_hub.py index 8ab8fea2fc41422e38c8c4decb961120b0e2f6cc..b8544dc2bf4666cb62ff61c5fbc476c33be783a9 100644 --- a/paddlespeech/s2t/exps/deepspeech2/bin/test_hub.py +++ b/paddlespeech/s2t/exps/deepspeech2/bin/test_hub.py @@ -87,7 +87,7 @@ class DeepSpeech2Tester_hub(): def setup(self): """Setup the experiment. 
""" - paddle.set_device('gpu' if self.args.nprocs > 0 else 'cpu') + paddle.set_device('gpu' if self.args.ngpu > 0 else 'cpu') self.setup_output_dir() self.setup_checkpointer() @@ -110,8 +110,8 @@ class DeepSpeech2Tester_hub(): def setup_model(self): config = self.config.clone() with UpdateConfig(config): - config.model.feat_size = self.collate_fn_test.feature_size - config.model.dict_size = self.collate_fn_test.vocab_size + config.model.input_dim = self.collate_fn_test.feature_size + config.model.output_dim = self.collate_fn_test.vocab_size if self.args.model_type == 'offline': model = DeepSpeech2Model.from_config(config.model) diff --git a/paddlespeech/s2t/exps/deepspeech2/bin/train.py b/paddlespeech/s2t/exps/deepspeech2/bin/train.py index d9b610a06f787b197003f1ecd774753f518662ab..400538f9b2cd55ca1f2f1ed006e89f1437332c0e 100644 --- a/paddlespeech/s2t/exps/deepspeech2/bin/train.py +++ b/paddlespeech/s2t/exps/deepspeech2/bin/train.py @@ -27,8 +27,8 @@ def main_sp(config, args): def main(config, args): - if args.nprocs > 0: - dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs) + if args.ngpu > 1: + dist.spawn(main_sp, args=(config, args), nprocs=args.ngpu) else: main_sp(config, args) diff --git a/paddlespeech/s2t/exps/deepspeech2/model.py b/paddlespeech/s2t/exps/deepspeech2/model.py index e827414d3c67a5790a381f4877bf6a7618ff7d46..3e4ff1a8b10be88fc6990850ae1dda63fe1a2b21 100644 --- a/paddlespeech/s2t/exps/deepspeech2/model.py +++ b/paddlespeech/s2t/exps/deepspeech2/model.py @@ -154,11 +154,11 @@ class DeepSpeech2Trainer(Trainer): config = self.config.clone() with UpdateConfig(config): if self.train: - config.model.feat_size = self.train_loader.collate_fn.feature_size - config.model.dict_size = self.train_loader.collate_fn.vocab_size + config.model.input_dim = self.train_loader.collate_fn.feature_size + config.model.output_dim = self.train_loader.collate_fn.vocab_size else: - config.model.feat_size = self.test_loader.collate_fn.feature_size - config.model.dict_size = self.test_loader.collate_fn.vocab_size + config.model.input_dim = self.test_loader.collate_fn.feature_size + config.model.output_dim = self.test_loader.collate_fn.vocab_size if self.args.model_type == 'offline': model = DeepSpeech2Model.from_config(config.model) diff --git a/paddlespeech/s2t/exps/u2/bin/test_wav.py b/paddlespeech/s2t/exps/u2/bin/test_wav.py index e118b481dcb4fbaf4aa74cac347b37a00865e2c6..a9450129fb2d98c8d15acca8c996549d3673188e 100644 --- a/paddlespeech/s2t/exps/u2/bin/test_wav.py +++ b/paddlespeech/s2t/exps/u2/bin/test_wav.py @@ -47,7 +47,7 @@ class U2Infer(): vocab_filepath=config.collator.vocab_filepath, spm_model_prefix=config.collator.spm_model_prefix) - paddle.set_device('gpu' if self.args.nprocs > 0 else 'cpu') + paddle.set_device('gpu' if self.args.ngpu > 0 else 'cpu') # model model_conf = config.model diff --git a/paddlespeech/s2t/exps/u2/bin/train.py b/paddlespeech/s2t/exps/u2/bin/train.py index 127db521addddc55f362510a47dcdced3cce3367..d6ee8b30773544cbf3584ba8b4062769783355b3 100644 --- a/paddlespeech/s2t/exps/u2/bin/train.py +++ b/paddlespeech/s2t/exps/u2/bin/train.py @@ -32,8 +32,8 @@ def main_sp(config, args): def main(config, args): - if args.nprocs > 0: - dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs) + if args.ngpu > 1: + dist.spawn(main_sp, args=(config, args), nprocs=args.ngpu) else: main_sp(config, args) diff --git a/paddlespeech/s2t/exps/u2/model.py b/paddlespeech/s2t/exps/u2/model.py index 
27bc47d2baa537496bef0e3f1d1b18a23cf1d1f2..b6dbcf443e94d752615295c20c7aae2a95525896 100644 --- a/paddlespeech/s2t/exps/u2/model.py +++ b/paddlespeech/s2t/exps/u2/model.py @@ -257,7 +257,7 @@ class U2Trainer(Trainer): maxlen_in=float('inf'), maxlen_out=float('inf'), minibatches=0, - mini_batch_size=self.args.nprocs, + mini_batch_size=self.args.ngpu, batch_count='auto', batch_bins=0, batch_frames_in=0, @@ -277,7 +277,7 @@ class U2Trainer(Trainer): maxlen_in=float('inf'), maxlen_out=float('inf'), minibatches=0, - mini_batch_size=self.args.nprocs, + mini_batch_size=self.args.ngpu, batch_count='auto', batch_bins=0, batch_frames_in=0, diff --git a/paddlespeech/s2t/exps/u2_kaldi/bin/train.py b/paddlespeech/s2t/exps/u2_kaldi/bin/train.py index d3427eec1d08e4a000352949b56b211d96bbb11c..fcfc05a8aea553d154ebde5289560d9c818672ff 100644 --- a/paddlespeech/s2t/exps/u2_kaldi/bin/train.py +++ b/paddlespeech/s2t/exps/u2_kaldi/bin/train.py @@ -36,8 +36,8 @@ def main_sp(config, args): def main(config, args): - if args.nprocs > 0: - dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs) + if args.ngpu > 1: + dist.spawn(main_sp, args=(config, args), nprocs=args.ngpu) else: main_sp(config, args) diff --git a/paddlespeech/s2t/exps/u2_kaldi/model.py b/paddlespeech/s2t/exps/u2_kaldi/model.py index d82034c8234df7bb621f2756dd047f40e0475e5c..c23b4c24545c2dda21b4b366a8d1e76c3acdf7a6 100644 --- a/paddlespeech/s2t/exps/u2_kaldi/model.py +++ b/paddlespeech/s2t/exps/u2_kaldi/model.py @@ -239,7 +239,7 @@ class U2Trainer(Trainer): maxlen_in=float('inf'), maxlen_out=float('inf'), minibatches=0, - mini_batch_size=self.args.nprocs, + mini_batch_size=self.args.ngpu, batch_count='auto', batch_bins=0, batch_frames_in=0, @@ -258,7 +258,7 @@ class U2Trainer(Trainer): maxlen_in=float('inf'), maxlen_out=float('inf'), minibatches=0, - mini_batch_size=self.args.nprocs, + mini_batch_size=self.args.ngpu, batch_count='auto', batch_bins=0, batch_frames_in=0, diff --git a/paddlespeech/s2t/exps/u2_st/bin/train.py b/paddlespeech/s2t/exps/u2_st/bin/train.py index 3d823cc44d4a5c790ed60f85d90dfc8ca0460409..58496c8874b3dd433d006cdb4c40daa4be051ed7 100644 --- a/paddlespeech/s2t/exps/u2_st/bin/train.py +++ b/paddlespeech/s2t/exps/u2_st/bin/train.py @@ -30,8 +30,8 @@ def main_sp(config, args): def main(config, args): - if args.nprocs > 0: - dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs) + if args.ngpu > 1: + dist.spawn(main_sp, args=(config, args), nprocs=args.ngpu) else: main_sp(config, args) diff --git a/paddlespeech/s2t/models/ds2/deepspeech2.py b/paddlespeech/s2t/models/ds2/deepspeech2.py index 4a7a7c15e9b96edfe5ee4b3f29f406b4b6b62a64..317abc69e3d9dd0a919e692328764f1319b7b76d 100644 --- a/paddlespeech/s2t/models/ds2/deepspeech2.py +++ b/paddlespeech/s2t/models/ds2/deepspeech2.py @@ -249,8 +249,8 @@ class DeepSpeech2Model(nn.Layer): The model built from config. """ model = cls( - feat_size=config.feat_size, - dict_size=config.dict_size, + feat_size=config.input_dim, + dict_size=config.output_dim, num_conv_layers=config.num_conv_layers, num_rnn_layers=config.num_rnn_layers, rnn_size=config.rnn_layer_size, diff --git a/paddlespeech/s2t/models/ds2_online/deepspeech2.py b/paddlespeech/s2t/models/ds2_online/deepspeech2.py index da04d5c5de8f56ac17801b2871c6c2f89b16712c..d134239f295f3dc90e7cde55bdb211240b892d22 100644 --- a/paddlespeech/s2t/models/ds2_online/deepspeech2.py +++ b/paddlespeech/s2t/models/ds2_online/deepspeech2.py @@ -381,8 +381,8 @@ class DeepSpeech2ModelOnline(nn.Layer): The model built from config. 
""" model = cls( - feat_size=config.feat_size, - dict_size=config.dict_size, + feat_size=config.input_dim, + dict_size=config.output_dim, num_conv_layers=config.num_conv_layers, num_rnn_layers=config.num_rnn_layers, rnn_size=config.rnn_layer_size, diff --git a/paddlespeech/s2t/training/cli.py b/paddlespeech/s2t/training/cli.py index 55b010e98b8e4d50e54b487bc275861e63cb4413..3ef871c5df08386e5a1e6ef8798d607b304df237 100644 --- a/paddlespeech/s2t/training/cli.py +++ b/paddlespeech/s2t/training/cli.py @@ -51,7 +51,7 @@ def default_argument_parser(parser=None): The ``--checkpoint_path`` specifies the checkpoint to load from. - The ``--nprocs`` specifies how to run the training. + The ``--ngpu`` specifies how to run the training. See Also @@ -78,7 +78,7 @@ def default_argument_parser(parser=None): help="seed to use for paddle, np and random. None or 0 for random, else set seed." ) train_group.add_argument( - "--nprocs", + "--ngpu", type=int, default=1, help="number of parallel processes. 0 for cpu.") diff --git a/paddlespeech/s2t/training/trainer.py b/paddlespeech/s2t/training/trainer.py index e6328cdf73797fc57b012104acf80838cb6796f0..f5fb2db03be0090d6e99656db0be4836745783a2 100644 --- a/paddlespeech/s2t/training/trainer.py +++ b/paddlespeech/s2t/training/trainer.py @@ -88,8 +88,8 @@ class Trainer(): >>> config.merge_from_list(args.opts) >>> config.freeze() >>> - >>> if args.nprocs > 0: - >>> dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs) + >>> if args.ngpu > 1: + >>> dist.spawn(main_sp, args=(config, args), nprocs=args.ngpu) >>> else: >>> main_sp(config, args) """ @@ -112,7 +112,7 @@ class Trainer(): logger.info(f"Rank: {self.rank}/{self.world_size}") # set device - paddle.set_device('gpu' if self.args.nprocs > 0 else 'cpu') + paddle.set_device('gpu' if self.args.ngpu > 0 else 'cpu') if self.parallel: self.init_parallel() @@ -162,7 +162,7 @@ class Trainer(): """A flag indicating whether the experiment should run with multiprocessing. """ - return self.args.nprocs > 1 + return self.args.ngpu > 1 def init_parallel(self): """Init environment for multiprocess training. 
diff --git a/paddlespeech/t2s/exps/fastspeech2/inference.py b/paddlespeech/t2s/exps/fastspeech2/inference.py index 07e9ed7ee33cf2408f15ceb6a9b620d691614253..1d6ea667a28a8a8bb751f339ab25bd7e6edcff1b 100644 --- a/paddlespeech/t2s/exps/fastspeech2/inference.py +++ b/paddlespeech/t2s/exps/fastspeech2/inference.py @@ -82,7 +82,9 @@ def main(): with open(args.text, 'rt') as f: for line in f: - utt_id, sentence = line.strip().split() + items = line.strip().split() + utt_id = items[0] + sentence = "".join(items[1:]) sentences.append((utt_id, sentence)) for utt_id, sentence in sentences: diff --git a/paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e.py b/paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e.py index 1839415e978d84b7fbf83f3516f01176e4aabe6c..9dc3ab4b655badb41bcaf16b738bf9a1f3fa1faa 100644 --- a/paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e.py +++ b/paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e.py @@ -37,7 +37,9 @@ def evaluate(args, fastspeech2_config, pwg_config): sentences = [] with open(args.text, 'rt') as f: for line in f: - utt_id, sentence = line.strip().split() + items = line.strip().split() + utt_id = items[0] + sentence = "".join(items[1:]) sentences.append((utt_id, sentence)) with open(args.phones_dict, "r") as f: diff --git a/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e.py b/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e.py index ff9a41eabbb6b5073343f921fe5e4d205a84107f..47c8a5e7af57b0547a1baffe05d8de1d93782160 100644 --- a/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e.py +++ b/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e.py @@ -40,7 +40,9 @@ def evaluate(args, fastspeech2_config, pwg_config): sentences = [] with open(args.text, 'rt') as f: for line in f: - utt_id, sentence = line.strip().split() + items = line.strip().split() + utt_id = items[0] + sentence = "".join(items[1:]) sentences.append((utt_id, sentence)) with open(args.phones_dict, "r") as f: diff --git a/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e_melgan.py b/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e_melgan.py index f0ff5655dc1ad91470458bc1fff7fe592260accd..4d5d1ac413eb9ee512a5bfa4d0d9350744c3f15e 100644 --- a/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e_melgan.py +++ b/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e_melgan.py @@ -40,7 +40,9 @@ def evaluate(args, fastspeech2_config, melgan_config): sentences = [] with open(args.text, 'rt') as f: for line in f: - utt_id, sentence = line.strip().split() + items = line.strip().split() + utt_id = items[0] + sentence = "".join(items[1:]) sentences.append((utt_id, sentence)) with open(args.phones_dict, "r") as f: diff --git a/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/train.py b/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/train.py index ca3c0a1f1ba55d05bcb1bcb790388025bf16ba9d..a44d2d3c263b5ca6f78b40d06a87cafa36bc7798 100644 --- a/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/train.py +++ b/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/train.py @@ -36,10 +36,10 @@ from paddlespeech.t2s.models.melgan import MBMelGANEvaluator from paddlespeech.t2s.models.melgan import MBMelGANUpdater from paddlespeech.t2s.models.melgan import MelGANGenerator from paddlespeech.t2s.models.melgan import MelGANMultiScaleDiscriminator -from paddlespeech.t2s.modules.adversarial_loss import DiscriminatorAdversarialLoss -from paddlespeech.t2s.modules.adversarial_loss import GeneratorAdversarialLoss +from paddlespeech.t2s.modules.losses import DiscriminatorAdversarialLoss +from 
paddlespeech.t2s.modules.losses import GeneratorAdversarialLoss +from paddlespeech.t2s.modules.losses import MultiResolutionSTFTLoss from paddlespeech.t2s.modules.pqmf import PQMF -from paddlespeech.t2s.modules.stft_loss import MultiResolutionSTFTLoss from paddlespeech.t2s.training.extensions.snapshot import Snapshot from paddlespeech.t2s.training.extensions.visualizer import VisualDL from paddlespeech.t2s.training.seeding import seed_everything diff --git a/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/train.py b/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/train.py index 42ef88307a9a766092d8bc3f39cea6890ce73d7c..98b0ed717c89d2d5d7ccf2c910420bd03722beaa 100644 --- a/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/train.py +++ b/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/train.py @@ -36,7 +36,7 @@ from paddlespeech.t2s.models.parallel_wavegan import PWGDiscriminator from paddlespeech.t2s.models.parallel_wavegan import PWGEvaluator from paddlespeech.t2s.models.parallel_wavegan import PWGGenerator from paddlespeech.t2s.models.parallel_wavegan import PWGUpdater -from paddlespeech.t2s.modules.stft_loss import MultiResolutionSTFTLoss +from paddlespeech.t2s.modules.losses import MultiResolutionSTFTLoss from paddlespeech.t2s.training.extensions.snapshot import Snapshot from paddlespeech.t2s.training.extensions.visualizer import VisualDL from paddlespeech.t2s.training.seeding import seed_everything diff --git a/paddlespeech/t2s/exps/speedyspeech/inference.py b/paddlespeech/t2s/exps/speedyspeech/inference.py index 617848c58454b294d2ea8ab1e7bba1228dfe6078..0ed2e0bf104e30301ecb20fceace28f9ef26bcad 100644 --- a/paddlespeech/t2s/exps/speedyspeech/inference.py +++ b/paddlespeech/t2s/exps/speedyspeech/inference.py @@ -87,7 +87,9 @@ def main(): with open(args.text, 'rt') as f: for line in f: - utt_id, sentence = line.strip().split() + items = line.strip().split() + utt_id = items[0] + sentence = "".join(items[1:]) sentences.append((utt_id, sentence)) for utt_id, sentence in sentences: diff --git a/paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py b/paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py index 0e64088dcd135975e128f9d32f797397848e9489..403d35088276d6ac2270185149f361a5fa1de8ee 100644 --- a/paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py +++ b/paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py @@ -40,7 +40,9 @@ def evaluate(args, speedyspeech_config, pwg_config): sentences = [] with open(args.text, 'rt') as f: for line in f: - utt_id, sentence = line.strip().split() + items = line.strip().split() + utt_id = items[0] + sentence = "".join(items[1:]) sentences.append((utt_id, sentence)) with open(args.phones_dict, "r") as f: diff --git a/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/train.py b/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/train.py index 8af1d45e039bd04925d221e6ade3975b1e4fcba0..ea5f12da7bab17ff1bafc8a27df17955b3cb54af 100644 --- a/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/train.py +++ b/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/train.py @@ -241,7 +241,7 @@ def main_sp(config, args): def main(config, args): - if args.ngpu: + if args.ngpu > 1: dist.spawn(main_sp, args=(config, args), nprocs=args.ngpu) else: main_sp(config, args) diff --git a/paddlespeech/t2s/frontend/zh_frontend.py b/paddlespeech/t2s/frontend/zh_frontend.py index d49c09378a274fad28d71ba188362b8590f2caf9..84852b9ce05abf6f65f8b2ba93605598674f0089 100644 --- a/paddlespeech/t2s/frontend/zh_frontend.py +++ b/paddlespeech/t2s/frontend/zh_frontend.py @@ -129,6 
+129,8 @@ class Frontend(): # we discriminate i, ii and iii if c and c not in self.punc: phones.append(c) + if c and c in self.punc: + phones.append('sp') if v and v not in self.punc: phones.append(v) # add sp between sentence (replace the last punc with sp) @@ -149,9 +151,14 @@ class Frontend(): if word not in self.must_erhua and (word in self.not_erhua or pos in {"a", "j", "nr"}): return initials, finals + # "……" 等情况直接返回 + if len(finals) != len(word): + return initials, finals + + assert len(finals) == len(word) + new_initials = [] new_finals = [] - assert len(finals) == len(word) for i, phn in enumerate(finals): if i == len(finals) - 1 and word[i] == "儿" and phn in { "er2", "er5" diff --git a/paddlespeech/t2s/frontend/zh_normalization/chronology.py b/paddlespeech/t2s/frontend/zh_normalization/chronology.py index b8d711564c923ee5cf74613e4dd85a991739713e..8801baa0d5a59c09bb661fac7eb56dc8e4a8ed93 100644 --- a/paddlespeech/t2s/frontend/zh_normalization/chronology.py +++ b/paddlespeech/t2s/frontend/zh_normalization/chronology.py @@ -32,6 +32,15 @@ RE_TIME = re.compile(r'([0-1]?[0-9]|2[0-3])' r':([0-5][0-9])' r'(:([0-5][0-9]))?') +# 时间范围,如8:30-12:30 +RE_TIME_RANGE = re.compile(r'([0-1]?[0-9]|2[0-3])' + r':([0-5][0-9])' + r'(:([0-5][0-9]))?' + r'(~|-)' + r'([0-1]?[0-9]|2[0-3])' + r':([0-5][0-9])' + r'(:([0-5][0-9]))?') + def replace_time(match) -> str: """ @@ -42,15 +51,32 @@ def replace_time(match) -> str: ---------- str """ + + is_range = len(match.groups()) > 5 + hour = match.group(1) minute = match.group(2) second = match.group(4) + if is_range: + hour_2 = match.group(6) + minute_2 = match.group(7) + second_2 = match.group(9) + result = f"{num2str(hour)}点" if minute.lstrip('0'): result += f"{_time_num2str(minute)}分" if second and second.lstrip('0'): result += f"{_time_num2str(second)}秒" + + if is_range: + result += "至" + result += f"{num2str(hour_2)}点" + if minute_2.lstrip('0'): + result += f"{_time_num2str(minute_2)}分" + if second_2 and second_2.lstrip('0'): + result += f"{_time_num2str(second_2)}秒" + return result diff --git a/paddlespeech/t2s/frontend/zh_normalization/phonecode.py b/paddlespeech/t2s/frontend/zh_normalization/phonecode.py index be159c2395df53b13c62ce509a37569e2d6a8473..b7b69b41b223172e281f22773e745e579b9c8bba 100644 --- a/paddlespeech/t2s/frontend/zh_normalization/phonecode.py +++ b/paddlespeech/t2s/frontend/zh_normalization/phonecode.py @@ -26,16 +26,19 @@ RE_MOBILE_PHONE = re.compile( RE_TELEPHONE = re.compile( r"(? 
str: if mobile: sp_parts = phone_string.strip('+').split() - result = ''.join( + result = ','.join( [verbalize_digit(part, alt_one=True) for part in sp_parts]) return result else: sil_parts = phone_string.split('-') - result = ''.join( + result = ','.join( [verbalize_digit(part, alt_one=True) for part in sil_parts]) return result diff --git a/paddlespeech/t2s/frontend/zh_normalization/text_normlization.py b/paddlespeech/t2s/frontend/zh_normalization/text_normlization.py index e25e9901914dc0407a85ecaaec4452a39e0990b0..c68caeeb78e675a69bea501ac33c0c951c5673fa 100644 --- a/paddlespeech/t2s/frontend/zh_normalization/text_normlization.py +++ b/paddlespeech/t2s/frontend/zh_normalization/text_normlization.py @@ -18,6 +18,7 @@ from .char_convert import tranditional_to_simplified from .chronology import RE_DATE from .chronology import RE_DATE2 from .chronology import RE_TIME +from .chronology import RE_TIME_RANGE from .chronology import replace_date from .chronology import replace_date2 from .chronology import replace_time @@ -40,6 +41,7 @@ from .num import replace_percentage from .num import replace_positive_quantifier from .num import replace_range from .phonecode import RE_MOBILE_PHONE +from .phonecode import RE_NATIONAL_UNIFORM_NUMBER from .phonecode import RE_TELEPHONE from .phonecode import replace_mobile from .phonecode import replace_phone @@ -62,6 +64,8 @@ class TextNormalizer(): List[str] Sentences. """ + # Only for pure Chinese here + text = text.replace(" ", "") text = self.SENTENCE_SPLITOR.sub(r'\1\n', text) text = text.strip() sentences = [sentence.strip() for sentence in re.split(r'\n+', text)] @@ -76,12 +80,19 @@ class TextNormalizer(): # number related NSW verbalization sentence = RE_DATE.sub(replace_date, sentence) sentence = RE_DATE2.sub(replace_date2, sentence) + + # range first + sentence = RE_TIME_RANGE.sub(replace_time, sentence) sentence = RE_TIME.sub(replace_time, sentence) + sentence = RE_TEMPERATURE.sub(replace_temperature, sentence) sentence = RE_FRAC.sub(replace_frac, sentence) sentence = RE_PERCENTAGE.sub(replace_percentage, sentence) sentence = RE_MOBILE_PHONE.sub(replace_mobile, sentence) + sentence = RE_TELEPHONE.sub(replace_phone, sentence) + sentence = RE_NATIONAL_UNIFORM_NUMBER.sub(replace_phone, sentence) + sentence = RE_RANGE.sub(replace_range, sentence) sentence = RE_INTEGER.sub(replace_negative_num, sentence) sentence = RE_DECIMAL_NUM.sub(replace_number, sentence) @@ -94,5 +105,6 @@ class TextNormalizer(): def normalize(self, text: str) -> List[str]: sentences = self._split(text) + sentences = [self.normalize_sentence(sent) for sent in sentences] return sentences diff --git a/paddlespeech/t2s/models/__init__.py b/paddlespeech/t2s/models/__init__.py index 4ce90896d2e5d9421eb2cc922bd8dca24d48819c..66720649062343c018056a2271797e92e11e94a1 100644 --- a/paddlespeech/t2s/models/__init__.py +++ b/paddlespeech/t2s/models/__init__.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
from .fastspeech2 import * +from .melgan import * +from .parallel_wavegan import * from .tacotron2 import * from .transformer_tts import * from .waveflow import * diff --git a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py index cf957978eb20a6d3eab9a2f3b8cdb36c91213a53..cdec03abc15ef89a96bf2ae7df4932bb94f91437 100644 --- a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py +++ b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py @@ -24,17 +24,16 @@ import paddle.nn.functional as F from paddle import nn from typeguard import check_argument_types -from paddlespeech.t2s.modules.fastspeech2_predictor.duration_predictor import DurationPredictor -from paddlespeech.t2s.modules.fastspeech2_predictor.duration_predictor import DurationPredictorLoss -from paddlespeech.t2s.modules.fastspeech2_predictor.length_regulator import LengthRegulator -from paddlespeech.t2s.modules.fastspeech2_predictor.variance_predictor import VariancePredictor -from paddlespeech.t2s.modules.fastspeech2_transformer.embedding import PositionalEncoding -from paddlespeech.t2s.modules.fastspeech2_transformer.embedding import ScaledPositionalEncoding -from paddlespeech.t2s.modules.fastspeech2_transformer.encoder import Encoder as TransformerEncoder from paddlespeech.t2s.modules.nets_utils import initialize from paddlespeech.t2s.modules.nets_utils import make_non_pad_mask from paddlespeech.t2s.modules.nets_utils import make_pad_mask +from paddlespeech.t2s.modules.predictor.duration_predictor import DurationPredictor +from paddlespeech.t2s.modules.predictor.duration_predictor import DurationPredictorLoss +from paddlespeech.t2s.modules.predictor.length_regulator import LengthRegulator +from paddlespeech.t2s.modules.predictor.variance_predictor import VariancePredictor from paddlespeech.t2s.modules.tacotron2.decoder import Postnet +from paddlespeech.t2s.modules.transformer.encoder import ConformerEncoder +from paddlespeech.t2s.modules.transformer.encoder import TransformerEncoder class FastSpeech2(nn.Layer): @@ -66,6 +65,7 @@ class FastSpeech2(nn.Layer): postnet_layers: int=5, postnet_chans: int=512, postnet_filts: int=5, + postnet_dropout_rate: float=0.5, positionwise_layer_type: str="conv1d", positionwise_conv_kernel_size: int=1, use_scaled_pos_enc: bool=True, @@ -77,10 +77,27 @@ class FastSpeech2(nn.Layer): reduction_factor: int=1, encoder_type: str="transformer", decoder_type: str="transformer", + # for transformer + transformer_enc_dropout_rate: float=0.1, + transformer_enc_positional_dropout_rate: float=0.1, + transformer_enc_attn_dropout_rate: float=0.1, + transformer_dec_dropout_rate: float=0.1, + transformer_dec_positional_dropout_rate: float=0.1, + transformer_dec_attn_dropout_rate: float=0.1, + # for conformer + conformer_pos_enc_layer_type: str="rel_pos", + conformer_self_attn_layer_type: str="rel_selfattn", + conformer_activation_type: str="swish", + use_macaron_style_in_conformer: bool=True, + use_cnn_in_conformer: bool=True, + zero_triu: bool=False, + conformer_enc_kernel_size: int=7, + conformer_dec_kernel_size: int=31, # duration predictor duration_predictor_layers: int=2, duration_predictor_chans: int=384, duration_predictor_kernel_size: int=3, + duration_predictor_dropout_rate: float=0.1, # energy predictor energy_predictor_layers: int=2, energy_predictor_chans: int=384, @@ -101,25 +118,147 @@ class FastSpeech2(nn.Layer): spk_num: int=None, spk_embed_dim: int=None, spk_embed_integration_type: str="add", - # tone emb - num_tones: int=None, + # tone emb + 
tone_num: int=None, tone_embed_dim: int=None, tone_embed_integration_type: str="add", # training related - transformer_enc_dropout_rate: float=0.1, - transformer_enc_positional_dropout_rate: float=0.1, - transformer_enc_attn_dropout_rate: float=0.1, - transformer_dec_dropout_rate: float=0.1, - transformer_dec_positional_dropout_rate: float=0.1, - transformer_dec_attn_dropout_rate: float=0.1, - duration_predictor_dropout_rate: float=0.1, - postnet_dropout_rate: float=0.5, init_type: str="xavier_uniform", init_enc_alpha: float=1.0, - init_dec_alpha: float=1.0, - use_masking: bool=False, - use_weighted_masking: bool=False, ): - """Initialize FastSpeech2 module.""" + init_dec_alpha: float=1.0, ): + """Initialize FastSpeech2 module. + Parameters + ---------- + idim : int + Dimension of the inputs. + odim : int + Dimension of the outputs. + adim : int + Attention dimension. + aheads : int + Number of attention heads. + elayers : int + Number of encoder layers. + eunits : int + Number of encoder hidden units. + dlayers : int + Number of decoder layers. + dunits : int + Number of decoder hidden units. + postnet_layers : int + Number of postnet layers. + postnet_chans : int + Number of postnet channels. + postnet_filts : int + Kernel size of postnet. + postnet_dropout_rate : float + Dropout rate in postnet. + use_scaled_pos_enc : bool + Whether to use trainable scaled pos encoding. + use_batch_norm : bool + Whether to use batch normalization in encoder prenet. + encoder_normalize_before : bool + Whether to apply layernorm layer before encoder block. + decoder_normalize_before : bool + Whether to apply layernorm layer before + decoder block. + encoder_concat_after : bool + Whether to concatenate attention layer's input and output in encoder. + decoder_concat_after : bool + Whether to concatenate attention layer's input and output in decoder. + reduction_factor : int + Reduction factor. + encoder_type : str + Encoder type ("transformer" or "conformer"). + decoder_type : str + Decoder type ("transformer" or "conformer"). + transformer_enc_dropout_rate : float + Dropout rate in encoder except attention and positional encoding. + transformer_enc_positional_dropout_rate (float): Dropout rate after encoder + positional encoding. + transformer_enc_attn_dropout_rate (float): Dropout rate in encoder + self-attention module. + transformer_dec_dropout_rate (float): Dropout rate in decoder except + attention & positional encoding. + transformer_dec_positional_dropout_rate (float): Dropout rate after decoder + positional encoding. + transformer_dec_attn_dropout_rate (float): Dropout rate in decoder + self-attention module. + conformer_pos_enc_layer_type : str + Pos encoding layer type in conformer. + conformer_self_attn_layer_type : str + Self-attention layer type in conformer + conformer_activation_type : str + Activation function type in conformer. + use_macaron_style_in_conformer : bool + Whether to use macaron style FFN. + use_cnn_in_conformer : bool + Whether to use CNN in conformer. + zero_triu : bool + Whether to use zero triu in relative self-attention module. + conformer_enc_kernel_size : int + Kernel size of encoder conformer. + conformer_dec_kernel_size : int + Kernel size of decoder conformer. + duration_predictor_layers : int + Number of duration predictor layers. + duration_predictor_chans : int + Number of duration predictor channels. + duration_predictor_kernel_size : int + Kernel size of duration predictor. + duration_predictor_dropout_rate : float + Dropout rate in duration predictor. 
+ pitch_predictor_layers : int + Number of pitch predictor layers. + pitch_predictor_chans : int + Number of pitch predictor channels. + pitch_predictor_kernel_size : int + Kernel size of pitch predictor. + pitch_predictor_dropout_rate : float + Dropout rate in pitch predictor. + pitch_embed_kernel_size : float + Kernel size of pitch embedding. + pitch_embed_dropout_rate : float + Dropout rate for pitch embedding. + stop_gradient_from_pitch_predictor : bool + Whether to stop gradient from pitch predictor to encoder. + energy_predictor_layers : int + Number of energy predictor layers. + energy_predictor_chans : int + Number of energy predictor channels. + energy_predictor_kernel_size : int + Kernel size of energy predictor. + energy_predictor_dropout_rate : float + Dropout rate in energy predictor. + energy_embed_kernel_size : float + Kernel size of energy embedding. + energy_embed_dropout_rate : float + Dropout rate for energy embedding. + stop_gradient_from_energy_predictor : bool + Whether to stop gradient from energy predictor to encoder. + spk_num : Optional[int] + Number of speakers. If not None, assume that the spk_embed_dim is not None, + spk_ids will be provided as the input and use spk_embedding_table. + spk_embed_dim : Optional[int] + Speaker embedding dimension. If not None, + assume that spk_emb will be provided as the input or spk_num is not None. + spk_embed_integration_type : str + How to integrate speaker embedding. + tone_num : Optional[int] + Number of tones. If not None, assume that the + tone_ids will be provided as the input and use tone_embedding_table. + tone_embed_dim : Optional[int] + Tone embedding dimension. If not None, assume that tone_num is not None. + tone_embed_integration_type : str + How to integrate tone embedding. + init_type : str + How to initialize transformer parameters. + init_enc_alpha : float + Initial value of alpha in scaled pos encoding of the encoder. + init_dec_alpha : float + Initial value of alpha in scaled pos encoding of the decoder. 
+ + """ assert check_argument_types() super().__init__() @@ -156,13 +295,12 @@ class FastSpeech2(nn.Layer): if self.tone_embed_dim is not None: self.tone_embedding_table = nn.Embedding( - num_embeddings=num_tones, + num_embeddings=tone_num, embedding_dim=self.tone_embed_dim, padding_idx=self.padding_idx) - # get positional encoding class - pos_enc_class = (ScaledPositionalEncoding - if self.use_scaled_pos_enc else PositionalEncoding) + # get positional encoding layer type + transformer_pos_enc_layer_type = "scaled_abs_pos" if self.use_scaled_pos_enc else "abs_pos" # define encoder encoder_input_layer = nn.Embedding( @@ -171,6 +309,7 @@ class FastSpeech2(nn.Layer): padding_idx=self.padding_idx) if encoder_type == "transformer": + print("encoder_type is transformer") self.encoder = TransformerEncoder( idim=idim, attention_dim=adim, @@ -181,11 +320,34 @@ class FastSpeech2(nn.Layer): dropout_rate=transformer_enc_dropout_rate, positional_dropout_rate=transformer_enc_positional_dropout_rate, attention_dropout_rate=transformer_enc_attn_dropout_rate, - pos_enc_class=pos_enc_class, + pos_enc_layer_type=transformer_pos_enc_layer_type, normalize_before=encoder_normalize_before, concat_after=encoder_concat_after, positionwise_layer_type=positionwise_layer_type, positionwise_conv_kernel_size=positionwise_conv_kernel_size, ) + elif encoder_type == "conformer": + print("encoder_type is conformer") + self.encoder = ConformerEncoder( + idim=idim, + attention_dim=adim, + attention_heads=aheads, + linear_units=eunits, + num_blocks=elayers, + input_layer=encoder_input_layer, + dropout_rate=transformer_enc_dropout_rate, + positional_dropout_rate=transformer_enc_positional_dropout_rate, + attention_dropout_rate=transformer_enc_attn_dropout_rate, + normalize_before=encoder_normalize_before, + concat_after=encoder_concat_after, + positionwise_layer_type=positionwise_layer_type, + positionwise_conv_kernel_size=positionwise_conv_kernel_size, + macaron_style=use_macaron_style_in_conformer, + pos_enc_layer_type=conformer_pos_enc_layer_type, + selfattention_layer_type=conformer_self_attn_layer_type, + activation_type=conformer_activation_type, + use_cnn_module=use_cnn_in_conformer, + cnn_module_kernel=conformer_enc_kernel_size, + zero_triu=zero_triu, ) else: raise ValueError(f"{encoder_type} is not supported.") @@ -251,6 +413,7 @@ class FastSpeech2(nn.Layer): # NOTE: we use encoder as decoder # because fastspeech's decoder is the same as encoder if decoder_type == "transformer": + print("decoder_type is transformer") self.decoder = TransformerEncoder( idim=0, attention_dim=adim, @@ -262,11 +425,33 @@ class FastSpeech2(nn.Layer): dropout_rate=transformer_dec_dropout_rate, positional_dropout_rate=transformer_dec_positional_dropout_rate, attention_dropout_rate=transformer_dec_attn_dropout_rate, - pos_enc_class=pos_enc_class, + pos_enc_layer_type=transformer_pos_enc_layer_type, normalize_before=decoder_normalize_before, concat_after=decoder_concat_after, positionwise_layer_type=positionwise_layer_type, positionwise_conv_kernel_size=positionwise_conv_kernel_size, ) + elif decoder_type == "conformer": + print("decoder_type is conformer") + self.decoder = ConformerEncoder( + idim=0, + attention_dim=adim, + attention_heads=aheads, + linear_units=dunits, + num_blocks=dlayers, + input_layer=None, + dropout_rate=transformer_dec_dropout_rate, + positional_dropout_rate=transformer_dec_positional_dropout_rate, + attention_dropout_rate=transformer_dec_attn_dropout_rate, + normalize_before=decoder_normalize_before, + 
concat_after=decoder_concat_after, + positionwise_layer_type=positionwise_layer_type, + positionwise_conv_kernel_size=positionwise_conv_kernel_size, + macaron_style=use_macaron_style_in_conformer, + pos_enc_layer_type=conformer_pos_enc_layer_type, + selfattention_layer_type=conformer_self_attn_layer_type, + activation_type=conformer_activation_type, + use_cnn_module=use_cnn_in_conformer, + cnn_module_kernel=conformer_dec_kernel_size, ) else: raise ValueError(f"{decoder_type} is not supported.") diff --git a/paddlespeech/t2s/models/melgan/melgan.py b/paddlespeech/t2s/models/melgan/melgan.py index 80bb1c1b2c4e17fbcf0ff531e9504ff6ccad2332..809403f60f143c695ddf56b5b03b05fccec0babb 100644 --- a/paddlespeech/t2s/models/melgan/melgan.py +++ b/paddlespeech/t2s/models/melgan/melgan.py @@ -78,7 +78,7 @@ class MelGANGenerator(nn.Layer): Padding function module name before dilated convolution layer. pad_params : dict Hyperparameters for padding function. - use_final_nonlinear_activation : paddle.nn.Layer + use_final_nonlinear_activation : nn.Layer Activation function for the final layer. use_weight_norm : bool Whether to use weight norm. diff --git a/paddlespeech/t2s/models/speedyspeech/speedyspeech.py b/paddlespeech/t2s/models/speedyspeech/speedyspeech.py index 0689ec45337c869812646f0acd45f14fda956dee..ece5c279f2f82e76b873d2f0b80b78173fce098f 100644 --- a/paddlespeech/t2s/models/speedyspeech/speedyspeech.py +++ b/paddlespeech/t2s/models/speedyspeech/speedyspeech.py @@ -11,13 +11,34 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import numpy as np import paddle from paddle import nn -from paddlespeech.t2s.modules.expansion import expand from paddlespeech.t2s.modules.positional_encoding import sinusoid_position_encoding +def expand(encodings: paddle.Tensor, durations: paddle.Tensor) -> paddle.Tensor: + """ + encodings: (B, T, C) + durations: (B, T) + """ + batch_size, t_enc = durations.shape + durations = durations.numpy() + slens = np.sum(durations, -1) + t_dec = np.max(slens) + M = np.zeros([batch_size, t_dec, t_enc]) + for i in range(batch_size): + k = 0 + for j in range(t_enc): + d = durations[i, j] + M[i, k:k + d, j] = 1 + k += d + M = paddle.to_tensor(M, dtype=encodings.dtype) + encodings = paddle.matmul(M, encodings) + return encodings + + class ResidualBlock(nn.Layer): def __init__(self, channels, kernel_size, dilation, n=2): super().__init__() diff --git a/paddlespeech/t2s/models/speedyspeech/speedyspeech_updater.py b/paddlespeech/t2s/models/speedyspeech/speedyspeech_updater.py index 4883a87e53a527d90e205755a0e98b5257955139..6f9937a51845ff7c3962ba14efc1a53f0d15b94d 100644 --- a/paddlespeech/t2s/models/speedyspeech/speedyspeech_updater.py +++ b/paddlespeech/t2s/models/speedyspeech/speedyspeech_updater.py @@ -19,8 +19,8 @@ from paddle.fluid.layers import huber_loss from paddle.nn import functional as F from paddlespeech.t2s.modules.losses import masked_l1_loss +from paddlespeech.t2s.modules.losses import ssim from paddlespeech.t2s.modules.losses import weighted_mean -from paddlespeech.t2s.modules.ssim import ssim from paddlespeech.t2s.training.extensions.evaluator import StandardEvaluator from paddlespeech.t2s.training.reporter import report from paddlespeech.t2s.training.updaters.standard_updater import StandardUpdater diff --git a/paddlespeech/t2s/models/tacotron2.py b/paddlespeech/t2s/models/tacotron2.py index 
b0946a5ba0c42b3c895b782993f8c47ef5e7a14c..01ea4f7d235aa76eba3bc18d029e2e07e8ccbe5c 100644 --- a/paddlespeech/t2s/models/tacotron2.py +++ b/paddlespeech/t2s/models/tacotron2.py @@ -20,7 +20,6 @@ from paddle.nn import functional as F from paddle.nn import initializer as I from tqdm import trange -from paddlespeech.t2s.modules.attention import LocationSensitiveAttention from paddlespeech.t2s.modules.conv import Conv1dBatchNorm from paddlespeech.t2s.modules.losses import guided_attention_loss from paddlespeech.t2s.utils import checkpoint @@ -28,6 +27,99 @@ from paddlespeech.t2s.utils import checkpoint __all__ = ["Tacotron2", "Tacotron2Loss"] +class LocationSensitiveAttention(nn.Layer): + """Location Sensitive Attention module. + + Reference: `Attention-Based Models for Speech Recognition `_ + + Parameters + ----------- + d_query: int + The feature size of query. + d_key : int + The feature size of key. + d_attention : int + The feature size of dimension. + location_filters : int + Filter size of attention convolution. + location_kernel_size : int + Kernel size of attention convolution. + """ + + def __init__(self, + d_query: int, + d_key: int, + d_attention: int, + location_filters: int, + location_kernel_size: int): + super().__init__() + + self.query_layer = nn.Linear(d_query, d_attention, bias_attr=False) + self.key_layer = nn.Linear(d_key, d_attention, bias_attr=False) + self.value = nn.Linear(d_attention, 1, bias_attr=False) + + # Location Layer + self.location_conv = nn.Conv1D( + 2, + location_filters, + kernel_size=location_kernel_size, + padding=int((location_kernel_size - 1) / 2), + bias_attr=False, + data_format='NLC') + self.location_layer = nn.Linear( + location_filters, d_attention, bias_attr=False) + + def forward(self, + query, + processed_key, + value, + attention_weights_cat, + mask=None): + """Compute context vector and attention weights. + + Parameters + ----------- + query : Tensor [shape=(batch_size, d_query)] + The queries. + processed_key : Tensor [shape=(batch_size, time_steps_k, d_attention)] + The keys after linear layer. + value : Tensor [shape=(batch_size, time_steps_k, d_key)] + The values. + attention_weights_cat : Tensor [shape=(batch_size, time_step_k, 2)] + Attention weights concat. + mask : Tensor, optional + The mask. Shape should be (batch_size, times_steps_k, 1). + Defaults to None. + + Returns + ---------- + attention_context : Tensor [shape=(batch_size, d_attention)] + The context vector. + attention_weights : Tensor [shape=(batch_size, time_steps_k)] + The attention weights. + """ + + processed_query = self.query_layer(paddle.unsqueeze(query, axis=[1])) + processed_attention_weights = self.location_layer( + self.location_conv(attention_weights_cat)) + # (B, T_enc, 1) + alignment = self.value( + paddle.tanh(processed_attention_weights + processed_key + + processed_query)) + + if mask is not None: + alignment = alignment + (1.0 - mask) * -1e9 + + attention_weights = F.softmax(alignment, axis=1) + attention_context = paddle.matmul( + attention_weights, value, transpose_x=True) + + attention_weights = paddle.squeeze(attention_weights, axis=-1) + attention_context = paddle.squeeze(attention_context, axis=1) + + return attention_context, attention_weights + + class DecoderPreNet(nn.Layer): """Decoder prenet module for Tacotron2. 
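Below is a brief usage sketch of the LocationSensitiveAttention module that the hunk above inlines into tacotron2.py. The layer sizes are arbitrary example values, the import path assumes this diff has been applied, and the input shapes follow the docstring of the added class.

import paddle

from paddlespeech.t2s.models.tacotron2 import LocationSensitiveAttention

attn = LocationSensitiveAttention(
    d_query=1024,            # decoder state size (example value)
    d_key=512,               # encoder output size (example value)
    d_attention=128,
    location_filters=32,
    location_kernel_size=31)

B, T_k = 2, 100                                    # batch size, encoder steps
query = paddle.randn([B, 1024])                    # current decoder state
value = paddle.randn([B, T_k, 512])                # encoder outputs (the values)
processed_key = attn.key_layer(value)              # keys after the linear layer
attention_weights_cat = paddle.zeros([B, T_k, 2])  # previous + cumulative weights
mask = paddle.ones([B, T_k, 1])                    # 1 = valid frame, 0 = padding

context, weights = attn(query, processed_key, value, attention_weights_cat,
                        mask)
# context: [2, 512], weights: [2, 100]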
@@ -197,7 +289,7 @@ class Tacotron2Encoder(nn.Layer): super().__init__() k = math.sqrt(1.0 / (d_hidden * kernel_size)) - self.conv_batchnorms = paddle.nn.LayerList([ + self.conv_batchnorms = nn.LayerList([ Conv1dBatchNorm( d_hidden, d_hidden, @@ -903,7 +995,7 @@ class Tacotron2Loss(nn.Layer): self.use_stop_token_loss = use_stop_token_loss self.use_guided_attention_loss = use_guided_attention_loss self.attn_criterion = guided_attention_loss - self.stop_criterion = paddle.nn.BCEWithLogitsLoss() + self.stop_criterion = nn.BCEWithLogitsLoss() self.sigma = sigma def forward(self, diff --git a/paddlespeech/t2s/models/transformer_tts/transformer_tts.py b/paddlespeech/t2s/models/transformer_tts/transformer_tts.py index 5958a16609a1541ec10e27582a3acbf0e319eb8c..ae6d736559384b8951f02c09dbc8bf987625e1f5 100644 --- a/paddlespeech/t2s/models/transformer_tts/transformer_tts.py +++ b/paddlespeech/t2s/models/transformer_tts/transformer_tts.py @@ -23,12 +23,6 @@ import paddle.nn.functional as F from paddle import nn from typeguard import check_argument_types -from paddlespeech.t2s.modules.fastspeech2_transformer.attention import MultiHeadedAttention -from paddlespeech.t2s.modules.fastspeech2_transformer.decoder import Decoder -from paddlespeech.t2s.modules.fastspeech2_transformer.embedding import PositionalEncoding -from paddlespeech.t2s.modules.fastspeech2_transformer.embedding import ScaledPositionalEncoding -from paddlespeech.t2s.modules.fastspeech2_transformer.encoder import Encoder -from paddlespeech.t2s.modules.fastspeech2_transformer.mask import subsequent_mask from paddlespeech.t2s.modules.nets_utils import initialize from paddlespeech.t2s.modules.nets_utils import make_non_pad_mask from paddlespeech.t2s.modules.nets_utils import make_pad_mask @@ -36,6 +30,12 @@ from paddlespeech.t2s.modules.style_encoder import StyleEncoder from paddlespeech.t2s.modules.tacotron2.decoder import Postnet from paddlespeech.t2s.modules.tacotron2.decoder import Prenet as DecoderPrenet from paddlespeech.t2s.modules.tacotron2.encoder import Encoder as EncoderPrenet +from paddlespeech.t2s.modules.transformer.attention import MultiHeadedAttention +from paddlespeech.t2s.modules.transformer.decoder import Decoder +from paddlespeech.t2s.modules.transformer.embedding import PositionalEncoding +from paddlespeech.t2s.modules.transformer.embedding import ScaledPositionalEncoding +from paddlespeech.t2s.modules.transformer.encoder import TransformerEncoder +from paddlespeech.t2s.modules.transformer.mask import subsequent_mask class TransformerTTS(nn.Layer): @@ -257,9 +257,9 @@ class TransformerTTS(nn.Layer): self.padding_idx = 0 # set_global_initializer 会影响后面的全局,包括 create_parameter initialize(self, init_type) - # get positional encoding class - pos_enc_class = (ScaledPositionalEncoding - if self.use_scaled_pos_enc else PositionalEncoding) + + # get positional encoding layer type + transformer_pos_enc_layer_type = "scaled_abs_pos" if self.use_scaled_pos_enc else "abs_pos" # define transformer encoder if eprenet_conv_layers != 0: @@ -281,7 +281,7 @@ class TransformerTTS(nn.Layer): num_embeddings=idim, embedding_dim=adim, padding_idx=self.padding_idx) - self.encoder = Encoder( + self.encoder = TransformerEncoder( idim=idim, attention_dim=adim, attention_heads=aheads, @@ -291,7 +291,7 @@ class TransformerTTS(nn.Layer): dropout_rate=transformer_enc_dropout_rate, positional_dropout_rate=transformer_enc_positional_dropout_rate, attention_dropout_rate=transformer_enc_attn_dropout_rate, - pos_enc_class=pos_enc_class, + 
pos_enc_layer_type=transformer_pos_enc_layer_type, normalize_before=encoder_normalize_before, concat_after=encoder_concat_after, positionwise_layer_type=positionwise_layer_type, @@ -330,6 +330,9 @@ class TransformerTTS(nn.Layer): nn.Linear(dprenet_units, adim), ) else: decoder_input_layer = "linear" + # get positional encoding class + pos_enc_class = (ScaledPositionalEncoding + if self.use_scaled_pos_enc else PositionalEncoding) self.decoder = Decoder( odim=odim, # odim is needed when no prenet is used attention_dim=adim, diff --git a/paddlespeech/t2s/models/waveflow.py b/paddlespeech/t2s/models/waveflow.py index c57429db1fc3ea9587e8c3a0eaec95c09330da9f..e519e0c5050cc3edb0065c8f442525f709f21d00 100644 --- a/paddlespeech/t2s/models/waveflow.py +++ b/paddlespeech/t2s/models/waveflow.py @@ -329,7 +329,7 @@ class ResidualNet(nn.LayerList): if len(dilations_h) != n_layer: raise ValueError( "number of dilations_h should equals num of layers") - super(ResidualNet, self).__init__() + super().__init__() for i in range(n_layer): dilation = (dilations_h[i], 2**i) layer = ResidualBlock(residual_channels, condition_channels, diff --git a/paddlespeech/t2s/modules/__init__.py b/paddlespeech/t2s/modules/__init__.py index 664267895491c47ad0b3ecaaaae9412f3ce5110f..1e3312002cada4503b6c4d43f2ea5b30ba9d7efb 100644 --- a/paddlespeech/t2s/modules/__init__.py +++ b/paddlespeech/t2s/modules/__init__.py @@ -11,10 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from .attention import * from .conv import * from .geometry import * from .losses import * -from .masking import * from .positional_encoding import * -from .transformer import * diff --git a/paddlespeech/t2s/modules/glu.py b/paddlespeech/t2s/modules/activation.py similarity index 69% rename from paddlespeech/t2s/modules/glu.py rename to paddlespeech/t2s/modules/activation.py index 1669fb36738a47f06ad4b0350fe2b7182bca49ab..f5b0af6e901e619ad3133fb8db46daef47f6b78b 100644 --- a/paddlespeech/t2s/modules/glu.py +++ b/paddlespeech/t2s/modules/activation.py @@ -11,8 +11,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import paddle +import paddle.nn.functional as F from paddle import nn -from paddle.nn import functional as F class GLU(nn.Layer): @@ -24,3 +25,18 @@ class GLU(nn.Layer): def forward(self, xs): return F.glu(xs, axis=self.dim) + + +def get_activation(act): + """Return activation function.""" + + activation_funcs = { + "hardtanh": paddle.nn.Hardtanh, + "tanh": paddle.nn.Tanh, + "relu": paddle.nn.ReLU, + "selu": paddle.nn.SELU, + "swish": paddle.nn.Swish, + "glu": GLU + } + + return activation_funcs[act]() diff --git a/paddlespeech/t2s/modules/adversarial_loss.py b/paddlespeech/t2s/modules/adversarial_loss.py deleted file mode 100644 index d2c8f7a94ea89d5fe5975799e3cacad4f94b4cd0..0000000000000000000000000000000000000000 --- a/paddlespeech/t2s/modules/adversarial_loss.py +++ /dev/null @@ -1,125 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from espnet(https://github.com/espnet/espnet) -"""Adversarial loss modules.""" -import paddle -import paddle.nn.functional as F -from paddle import nn - - -class GeneratorAdversarialLoss(nn.Layer): - """Generator adversarial loss module.""" - - def __init__( - self, - average_by_discriminators=True, - loss_type="mse", ): - """Initialize GeneratorAversarialLoss module.""" - super().__init__() - self.average_by_discriminators = average_by_discriminators - assert loss_type in ["mse", "hinge"], f"{loss_type} is not supported." - if loss_type == "mse": - self.criterion = self._mse_loss - else: - self.criterion = self._hinge_loss - - def forward(self, outputs): - """Calcualate generator adversarial loss. - Parameters - ---------- - outputs: Tensor or List - Discriminator outputs or list of discriminator outputs. - Returns - ---------- - Tensor - Generator adversarial loss value. - """ - if isinstance(outputs, (tuple, list)): - adv_loss = 0.0 - for i, outputs_ in enumerate(outputs): - if isinstance(outputs_, (tuple, list)): - # case including feature maps - outputs_ = outputs_[-1] - adv_loss += self.criterion(outputs_) - if self.average_by_discriminators: - adv_loss /= i + 1 - else: - adv_loss = self.criterion(outputs) - - return adv_loss - - def _mse_loss(self, x): - return F.mse_loss(x, paddle.ones_like(x)) - - def _hinge_loss(self, x): - return -x.mean() - - -class DiscriminatorAdversarialLoss(nn.Layer): - """Discriminator adversarial loss module.""" - - def __init__( - self, - average_by_discriminators=True, - loss_type="mse", ): - """Initialize DiscriminatorAversarialLoss module.""" - super().__init__() - self.average_by_discriminators = average_by_discriminators - assert loss_type in ["mse"], f"{loss_type} is not supported." - if loss_type == "mse": - self.fake_criterion = self._mse_fake_loss - self.real_criterion = self._mse_real_loss - - def forward(self, outputs_hat, outputs): - """Calcualate discriminator adversarial loss. - Parameters - ---------- - outputs_hat : Tensor or list - Discriminator outputs or list of - discriminator outputs calculated from generator outputs. - outputs : Tensor or list - Discriminator outputs or list of - discriminator outputs calculated from groundtruth. - Returns - ---------- - Tensor - Discriminator real loss value. - Tensor - Discriminator fake loss value. 
- """ - if isinstance(outputs, (tuple, list)): - real_loss = 0.0 - fake_loss = 0.0 - for i, (outputs_hat_, - outputs_) in enumerate(zip(outputs_hat, outputs)): - if isinstance(outputs_hat_, (tuple, list)): - # case including feature maps - outputs_hat_ = outputs_hat_[-1] - outputs_ = outputs_[-1] - real_loss += self.real_criterion(outputs_) - fake_loss += self.fake_criterion(outputs_hat_) - if self.average_by_discriminators: - fake_loss /= i + 1 - real_loss /= i + 1 - else: - real_loss = self.real_criterion(outputs) - fake_loss = self.fake_criterion(outputs_hat) - - return real_loss, fake_loss - - def _mse_real_loss(self, x): - return F.mse_loss(x, paddle.ones_like(x)) - - def _mse_fake_loss(self, x): - return F.mse_loss(x, paddle.zeros_like(x)) diff --git a/paddlespeech/t2s/modules/attention.py b/paddlespeech/t2s/modules/attention.py deleted file mode 100644 index 154625cc3c1d9426ed2d21edc3064798b73ccd3a..0000000000000000000000000000000000000000 --- a/paddlespeech/t2s/modules/attention.py +++ /dev/null @@ -1,348 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import math - -import numpy as np -import paddle -from paddle import nn -from paddle.nn import functional as F - - -def scaled_dot_product_attention(q, k, v, mask=None, dropout=0.0, - training=True): - r"""Scaled dot product attention with masking. - - Assume that q, k, v all have the same leading dimensions (denoted as * in - descriptions below). Dropout is applied to attention weights before - weighted sum of values. - - Parameters - ----------- - q : Tensor [shape=(\*, T_q, d)] - the query tensor. - k : Tensor [shape=(\*, T_k, d)] - the key tensor. - v : Tensor [shape=(\*, T_k, d_v)] - the value tensor. - mask : Tensor, [shape=(\*, T_q, T_k) or broadcastable shape], optional - the mask tensor, zeros correspond to paddings. Defaults to None. - - Returns - ---------- - out : Tensor [shape=(\*, T_q, d_v)] - the context vector. - attn_weights : Tensor [shape=(\*, T_q, T_k)] - the attention weights. - """ - d = q.shape[-1] # we only support imperative execution - qk = paddle.matmul(q, k, transpose_y=True) - scaled_logit = paddle.scale(qk, 1.0 / math.sqrt(d)) - - if mask is not None: - scaled_logit += paddle.scale((1.0 - mask), -1e9) # hard coded here - - attn_weights = F.softmax(scaled_logit, axis=-1) - attn_weights = F.dropout(attn_weights, dropout, training=training) - out = paddle.matmul(attn_weights, v) - return out, attn_weights - - -def drop_head(x, drop_n_heads, training=True): - """Drop n context vectors from multiple ones. - - Parameters - ---------- - x : Tensor [shape=(batch_size, num_heads, time_steps, channels)] - The input, multiple context vectors. - drop_n_heads : int [0<= drop_n_heads <= num_heads] - Number of vectors to drop. - training : bool - A flag indicating whether it is in training. If `False`, no dropout is - applied. - - Returns - ------- - Tensor - The output. 
- """ - if not training or (drop_n_heads == 0): - return x - - batch_size, num_heads, _, _ = x.shape - # drop all heads - if num_heads == drop_n_heads: - return paddle.zeros_like(x) - - mask = np.ones([batch_size, num_heads]) - mask[:, :drop_n_heads] = 0 - for subarray in mask: - np.random.shuffle(subarray) - scale = float(num_heads) / (num_heads - drop_n_heads) - mask = scale * np.reshape(mask, [batch_size, num_heads, 1, 1]) - out = x * paddle.to_tensor(mask) - return out - - -def _split_heads(x, num_heads): - batch_size, time_steps, _ = x.shape - x = paddle.reshape(x, [batch_size, time_steps, num_heads, -1]) - x = paddle.transpose(x, [0, 2, 1, 3]) - return x - - -def _concat_heads(x): - batch_size, _, time_steps, _ = x.shape - x = paddle.transpose(x, [0, 2, 1, 3]) - x = paddle.reshape(x, [batch_size, time_steps, -1]) - return x - - -# Standard implementations of Monohead Attention & Multihead Attention -class MonoheadAttention(nn.Layer): - """Monohead Attention module. - - Parameters - ---------- - model_dim : int - Feature size of the query. - dropout : float, optional - Dropout probability of scaled dot product attention and final context - vector. Defaults to 0.0. - k_dim : int, optional - Feature size of the key of each scaled dot product attention. If not - provided, it is set to `model_dim / num_heads`. Defaults to None. - v_dim : int, optional - Feature size of the key of each scaled dot product attention. If not - provided, it is set to `model_dim / num_heads`. Defaults to None. - """ - - def __init__(self, - model_dim: int, - dropout: float=0.0, - k_dim: int=None, - v_dim: int=None): - super(MonoheadAttention, self).__init__() - k_dim = k_dim or model_dim - v_dim = v_dim or model_dim - self.affine_q = nn.Linear(model_dim, k_dim) - self.affine_k = nn.Linear(model_dim, k_dim) - self.affine_v = nn.Linear(model_dim, v_dim) - self.affine_o = nn.Linear(v_dim, model_dim) - - self.model_dim = model_dim - self.dropout = dropout - - def forward(self, q, k, v, mask): - """Compute context vector and attention weights. - - Parameters - ----------- - q : Tensor [shape=(batch_size, time_steps_q, model_dim)] - The queries. - k : Tensor [shape=(batch_size, time_steps_k, model_dim)] - The keys. - v : Tensor [shape=(batch_size, time_steps_k, model_dim)] - The values. - mask : Tensor [shape=(batch_size, times_steps_q, time_steps_k] or broadcastable shape - The mask. - - Returns - ---------- - out : Tensor [shape=(batch_size, time_steps_q, model_dim)] - The context vector. - attention_weights : Tensor [shape=(batch_size, times_steps_q, time_steps_k)] - The attention weights. - """ - q = self.affine_q(q) # (B, T, C) - k = self.affine_k(k) - v = self.affine_v(v) - - context_vectors, attention_weights = scaled_dot_product_attention( - q, k, v, mask, self.dropout, self.training) - - out = self.affine_o(context_vectors) - return out, attention_weights - - -class MultiheadAttention(nn.Layer): - """Multihead Attention module. - - Parameters - ----------- - model_dim: int - The feature size of query. - num_heads : int - The number of attention heads. - dropout : float, optional - Dropout probability of scaled dot product attention and final context - vector. Defaults to 0.0. - k_dim : int, optional - Feature size of the key of each scaled dot product attention. If not - provided, it is set to ``model_dim / num_heads``. Defaults to None. - v_dim : int, optional - Feature size of the key of each scaled dot product attention. If not - provided, it is set to ``model_dim / num_heads``. Defaults to None. 
- - Raises - --------- - ValueError - If ``model_dim`` is not divisible by ``num_heads``. - """ - - def __init__(self, - model_dim: int, - num_heads: int, - dropout: float=0.0, - k_dim: int=None, - v_dim: int=None): - super(MultiheadAttention, self).__init__() - if model_dim % num_heads != 0: - raise ValueError("model_dim must be divisible by num_heads") - depth = model_dim // num_heads - k_dim = k_dim or depth - v_dim = v_dim or depth - self.affine_q = nn.Linear(model_dim, num_heads * k_dim) - self.affine_k = nn.Linear(model_dim, num_heads * k_dim) - self.affine_v = nn.Linear(model_dim, num_heads * v_dim) - self.affine_o = nn.Linear(num_heads * v_dim, model_dim) - - self.num_heads = num_heads - self.model_dim = model_dim - self.dropout = dropout - - def forward(self, q, k, v, mask): - """Compute context vector and attention weights. - - Parameters - ----------- - q : Tensor [shape=(batch_size, time_steps_q, model_dim)] - The queries. - k : Tensor [shape=(batch_size, time_steps_k, model_dim)] - The keys. - v : Tensor [shape=(batch_size, time_steps_k, model_dim)] - The values. - mask : Tensor [shape=(batch_size, times_steps_q, time_steps_k] or broadcastable shape - The mask. - - Returns - ---------- - out : Tensor [shape=(batch_size, time_steps_q, model_dim)] - The context vector. - attention_weights : Tensor [shape=(batch_size, times_steps_q, time_steps_k)] - The attention weights. - """ - q = _split_heads(self.affine_q(q), self.num_heads) # (B, h, T, C) - k = _split_heads(self.affine_k(k), self.num_heads) - v = _split_heads(self.affine_v(v), self.num_heads) - mask = paddle.unsqueeze(mask, 1) # unsqueeze for the h dim - - context_vectors, attention_weights = scaled_dot_product_attention( - q, k, v, mask, self.dropout, self.training) - # NOTE: there is more sophisticated implementation: Scheduled DropHead - context_vectors = _concat_heads(context_vectors) # (B, T, h*C) - out = self.affine_o(context_vectors) - return out, attention_weights - - -class LocationSensitiveAttention(nn.Layer): - """Location Sensitive Attention module. - - Reference: `Attention-Based Models for Speech Recognition `_ - - Parameters - ----------- - d_query: int - The feature size of query. - d_key : int - The feature size of key. - d_attention : int - The feature size of dimension. - location_filters : int - Filter size of attention convolution. - location_kernel_size : int - Kernel size of attention convolution. - """ - - def __init__(self, - d_query: int, - d_key: int, - d_attention: int, - location_filters: int, - location_kernel_size: int): - super().__init__() - - self.query_layer = nn.Linear(d_query, d_attention, bias_attr=False) - self.key_layer = nn.Linear(d_key, d_attention, bias_attr=False) - self.value = nn.Linear(d_attention, 1, bias_attr=False) - - # Location Layer - self.location_conv = nn.Conv1D( - 2, - location_filters, - kernel_size=location_kernel_size, - padding=int((location_kernel_size - 1) / 2), - bias_attr=False, - data_format='NLC') - self.location_layer = nn.Linear( - location_filters, d_attention, bias_attr=False) - - def forward(self, - query, - processed_key, - value, - attention_weights_cat, - mask=None): - """Compute context vector and attention weights. - - Parameters - ----------- - query : Tensor [shape=(batch_size, d_query)] - The queries. - processed_key : Tensor [shape=(batch_size, time_steps_k, d_attention)] - The keys after linear layer. - value : Tensor [shape=(batch_size, time_steps_k, d_key)] - The values. 
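# --- Minimal usage sketch for the MultiheadAttention module defined above
# (illustrative only): model_dim must be divisible by num_heads, and the
# (B, T_q, T_k) mask is broadcast over the head dimension inside forward().
import paddle

attn = MultiheadAttention(model_dim=256, num_heads=4, dropout=0.1)
q = paddle.randn([2, 10, 256])          # queries  (B, T_q, model_dim)
kv = paddle.randn([2, 12, 256])         # keys/values (B, T_k, model_dim)
mask = paddle.ones([2, 10, 12])         # 1 = attend, 0 = masked out
out, weights = attn(q, kv, kv, mask)
# out: (2, 10, 256); weights: (2, 4, 10, 12), one map per head.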
- attention_weights_cat : Tensor [shape=(batch_size, time_step_k, 2)] - Attention weights concat. - mask : Tensor, optional - The mask. Shape should be (batch_size, times_steps_k, 1). - Defaults to None. - - Returns - ---------- - attention_context : Tensor [shape=(batch_size, d_attention)] - The context vector. - attention_weights : Tensor [shape=(batch_size, time_steps_k)] - The attention weights. - """ - - processed_query = self.query_layer(paddle.unsqueeze(query, axis=[1])) - processed_attention_weights = self.location_layer( - self.location_conv(attention_weights_cat)) - # (B, T_enc, 1) - alignment = self.value( - paddle.tanh(processed_attention_weights + processed_key + - processed_query)) - - if mask is not None: - alignment = alignment + (1.0 - mask) * -1e9 - - attention_weights = F.softmax(alignment, axis=1) - attention_context = paddle.matmul( - attention_weights, value, transpose_x=True) - - attention_weights = paddle.squeeze(attention_weights, axis=-1) - attention_context = paddle.squeeze(attention_context, axis=1) - - return attention_context, attention_weights diff --git a/paddlespeech/t2s/modules/audio.py b/paddlespeech/t2s/modules/audio.py deleted file mode 100644 index 926ce8f24b76d86e2956fc80300988cad27f1757..0000000000000000000000000000000000000000 --- a/paddlespeech/t2s/modules/audio.py +++ /dev/null @@ -1,229 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import librosa -import numpy as np -import paddle -from librosa.util import pad_center -from paddle import nn -from paddle.nn import functional as F -from scipy import signal - -__all__ = ["quantize", "dequantize", "STFT", "MelScale"] - - -def quantize(values, n_bands): - """Linearlly quantize a float Tensor in [-1, 1) to an interger Tensor in - [0, n_bands). - - Parameters - ----------- - values : Tensor [dtype: flaot32 or float64] - The floating point value. - - n_bands : int - The number of bands. The output integer Tensor's value is in the range - [0, n_bans). - - Returns - ---------- - Tensor [dtype: int 64] - The quantized tensor. - """ - quantized = paddle.cast((values + 1.0) / 2.0 * n_bands, "int64") - return quantized - - -def dequantize(quantized, n_bands, dtype=None): - """Linearlly dequantize an integer Tensor into a float Tensor in the range - [-1, 1). - - Parameters - ----------- - quantized : Tensor [dtype: int] - The quantized value in the range [0, n_bands). - - n_bands : int - Number of bands. The input integer Tensor's value is in the range - [0, n_bans). - - dtype : str, optional - Data type of the output. - - Returns - ----------- - Tensor - The dequantized tensor, dtype is specified by `dtype`. If `dtype` is - not specified, the default float data type is used. - """ - dtype = dtype or paddle.get_default_dtype() - value = (paddle.cast(quantized, dtype) + 0.5) * (2.0 / n_bands) - 1.0 - return value - - -class STFT(nn.Layer): - """A module for computing stft transformation in a differentiable way. 
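# --- Round-trip sketch for the quantize/dequantize helpers above: a float in
# [-1, 1) maps to an integer bin in [0, n_bands), and dequantize returns the
# bin center, so the reconstruction error is at most 1 / n_bands.
import paddle

x = paddle.to_tensor([-1.0, -0.5, 0.0, 0.5, 0.999])
q = quantize(x, n_bands=256)            # int64 values in [0, 256)
x_hat = dequantize(q, n_bands=256)      # bin centers; |x - x_hat| <= 1/256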
- - Parameters - ------------ - n_fft : int - Number of samples in a frame. - hop_length : int - Number of samples shifted between adjacent frames. - win_length : int - Length of the window. - window : str, optional - Name of window function, see `scipy.signal.get_window` for more - details. Defaults to "hanning". - center : bool - If True, the signal y is padded so that frame D[:, t] is centered - at y[t * hop_length]. If False, then D[:, t] begins at y[t * hop_length]. - Defaults to True. - pad_mode : string or function - If center=True, this argument is passed to np.pad for padding the edges - of the signal y. By default (pad_mode="reflect"), y is padded on both - sides with its own reflection, mirrored around its first and last - sample respectively. If center=False, this argument is ignored. - - Notes - ----------- - It behaves like ``librosa.core.stft``. See ``librosa.core.stft`` for more - details. - - Given a audio which ``T`` samples, it the STFT transformation outputs a - spectrum with (C, frames) and complex dtype, where ``C = 1 + n_fft / 2`` - and ``frames = 1 + T // hop_lenghth``. - - Ony ``center`` and ``reflect`` padding is supported now. - - """ - - def __init__(self, - n_fft, - hop_length=None, - win_length=None, - window="hanning", - center=True, - pad_mode="reflect"): - super().__init__() - # By default, use the entire frame - if win_length is None: - win_length = n_fft - - # Set the default hop, if it's not already specified - if hop_length is None: - hop_length = int(win_length // 4) - - self.hop_length = hop_length - self.n_bin = 1 + n_fft // 2 - self.n_fft = n_fft - self.center = center - self.pad_mode = pad_mode - - # calculate window - window = signal.get_window(window, win_length, fftbins=True) - - # pad window to n_fft size - if n_fft != win_length: - window = pad_center(window, n_fft, mode="constant") - # lpad = (n_fft - win_length) // 2 - # rpad = n_fft - win_length - lpad - # window = np.pad(window, ((lpad, pad), ), 'constant') - - # calculate weights - # r = np.arange(0, n_fft) - # M = np.expand_dims(r, -1) * np.expand_dims(r, 0) - # w_real = np.reshape(window * - # np.cos(2 * np.pi * M / n_fft)[:self.n_bin], - # (self.n_bin, 1, self.n_fft)) - # w_imag = np.reshape(window * - # np.sin(-2 * np.pi * M / n_fft)[:self.n_bin], - # (self.n_bin, 1, self.n_fft)) - weight = np.fft.fft(np.eye(n_fft))[:self.n_bin] - w_real = weight.real - w_imag = weight.imag - w = np.concatenate([w_real, w_imag], axis=0) - w = w * window - w = np.expand_dims(w, 1) - weight = paddle.cast(paddle.to_tensor(w), paddle.get_default_dtype()) - self.register_buffer("weight", weight) - - def forward(self, x): - """Compute the stft transform. - Parameters - ------------ - x : Tensor [shape=(B, T)] - The input waveform. - Returns - ------------ - real : Tensor [shape=(B, C, frames)] - The real part of the spectrogram. - - imag : Tensor [shape=(B, C, frames)] - The image part of the spectrogram. - """ - x = paddle.unsqueeze(x, axis=1) - if self.center: - x = F.pad( - x, [self.n_fft // 2, self.n_fft // 2], - data_format='NCL', - mode=self.pad_mode) - - # to BCT, C=1 - out = F.conv1d(x, self.weight, stride=self.hop_length) - real, imag = paddle.chunk(out, 2, axis=1) # BCT - return real, imag - - def power(self, x): - """Compute the power spectrum. - Parameters - ------------ - x : Tensor [shape=(B, T)] - The input waveform. - Returns - ------------ - Tensor [shape=(B, C, T)] - The power spectrum. 
- """ - real, imag = self.forward(x) - power = real**2 + imag**2 - return power - - def magnitude(self, x): - """Compute the magnitude of the spectrum. - Parameters - ------------ - x : Tensor [shape=(B, T)] - The input waveform. - Returns - ------------ - Tensor [shape=(B, C, T)] - The magnitude of the spectrum. - """ - power = self.power(x) - magnitude = paddle.sqrt(power) # TODO(chenfeiyu): maybe clipping - return magnitude - - -class MelScale(nn.Layer): - def __init__(self, sr, n_fft, n_mels, fmin, fmax): - super().__init__() - mel_basis = librosa.filters.mel(sr, n_fft, n_mels, fmin, fmax) - # self.weight = paddle.to_tensor(mel_basis) - weight = paddle.to_tensor(mel_basis, dtype=paddle.get_default_dtype()) - self.register_buffer("weight", weight) - - def forward(self, spec): - # (n_mels, n_freq) * (batch_size, n_freq, n_frames) - mel = paddle.matmul(self.weight, spec) - return mel diff --git a/paddlespeech/t2s/modules/causal_conv.py b/paddlespeech/t2s/modules/causal_conv.py index c0dd5b28c96ce8bfd4edd52a27d687ff5901f27f..c0d4f95598e3889db48f4e7113f4242908ba18e0 100644 --- a/paddlespeech/t2s/modules/causal_conv.py +++ b/paddlespeech/t2s/modules/causal_conv.py @@ -13,9 +13,10 @@ # limitations under the License. """Causal convolusion layer modules.""" import paddle +from paddle import nn -class CausalConv1D(paddle.nn.Layer): +class CausalConv1D(nn.Layer): """CausalConv1D module with customized initialization.""" def __init__( @@ -31,7 +32,7 @@ class CausalConv1D(paddle.nn.Layer): super().__init__() self.pad = getattr(paddle.nn, pad)((kernel_size - 1) * dilation, **pad_params) - self.conv = paddle.nn.Conv1D( + self.conv = nn.Conv1D( in_channels, out_channels, kernel_size, @@ -52,7 +53,7 @@ class CausalConv1D(paddle.nn.Layer): return self.conv(self.pad(x))[:, :, :x.shape[2]] -class CausalConv1DTranspose(paddle.nn.Layer): +class CausalConv1DTranspose(nn.Layer): """CausalConv1DTranspose module with customized initialization.""" def __init__(self, @@ -63,7 +64,7 @@ class CausalConv1DTranspose(paddle.nn.Layer): bias=True): """Initialize CausalConvTranspose1d module.""" super().__init__() - self.deconv = paddle.nn.Conv1DTranspose( + self.deconv = nn.Conv1DTranspose( in_channels, out_channels, kernel_size, stride, bias_attr=bias) self.stride = stride diff --git a/paddlespeech/t2s/modules/fastspeech2_transformer/__init__.py b/paddlespeech/t2s/modules/conformer/__init__.py similarity index 100% rename from paddlespeech/t2s/modules/fastspeech2_transformer/__init__.py rename to paddlespeech/t2s/modules/conformer/__init__.py diff --git a/paddlespeech/t2s/modules/conformer/convolution.py b/paddlespeech/t2s/modules/conformer/convolution.py new file mode 100644 index 0000000000000000000000000000000000000000..e4a6c8c67e70842feaf84160f273b083f86c012f --- /dev/null +++ b/paddlespeech/t2s/modules/conformer/convolution.py @@ -0,0 +1,86 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
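# --- Sketch (illustrative, not part of this patch) of the STFT and MelScale
# modules removed above, chained to turn a waveform batch into a mel
# spectrogram; the sample rate, FFT size, and mel settings are assumptions.
import paddle

stft = STFT(n_fft=1024, hop_length=256, win_length=1024, window="hann")
mel_scale = MelScale(sr=22050, n_fft=1024, n_mels=80, fmin=0, fmax=8000)

wav = paddle.randn([4, 22050])          # (B, T)
magnitude = stft.magnitude(wav)         # (B, 1 + n_fft // 2, frames)
mel = mel_scale(magnitude)              # (B, n_mels, frames)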
+# Modified from espnet(https://github.com/espnet/espnet) +"""ConvolutionModule definition.""" +from paddle import nn + + +class ConvolutionModule(nn.Layer): + """ConvolutionModule in Conformer model. + Parameters + ---------- + channels : int + The number of channels of conv layers. + kernel_size : int + Kernerl size of conv layers. + """ + + def __init__(self, channels, kernel_size, activation=nn.ReLU(), bias=True): + """Construct an ConvolutionModule object.""" + super().__init__() + # kernerl_size should be a odd number for 'SAME' padding + assert (kernel_size - 1) % 2 == 0 + + self.pointwise_conv1 = nn.Conv1D( + channels, + 2 * channels, + kernel_size=1, + stride=1, + padding=0, + bias_attr=bias, ) + self.depthwise_conv = nn.Conv1D( + channels, + channels, + kernel_size, + stride=1, + padding=(kernel_size - 1) // 2, + groups=channels, + bias_attr=bias, ) + self.norm = nn.BatchNorm1D(channels) + self.pointwise_conv2 = nn.Conv1D( + channels, + channels, + kernel_size=1, + stride=1, + padding=0, + bias_attr=bias, ) + self.activation = activation + + def forward(self, x): + """Compute convolution module. + Parameters + ---------- + x : paddle.Tensor + Input tensor (#batch, time, channels). + Returns + ---------- + paddle.Tensor + Output tensor (#batch, time, channels). + """ + # exchange the temporal dimension and the feature dimension + x = x.transpose([0, 2, 1]) + + # GLU mechanism + # (batch, 2*channel, time) + x = self.pointwise_conv1(x) + # (batch, channel, time) + x = nn.functional.glu(x, axis=1) + + # 1D Depthwise Conv + x = self.depthwise_conv(x) + x = self.activation(self.norm(x)) + + x = self.pointwise_conv2(x) + + return x.transpose([0, 2, 1]) diff --git a/paddlespeech/t2s/modules/conformer/encoder_layer.py b/paddlespeech/t2s/modules/conformer/encoder_layer.py new file mode 100644 index 0000000000000000000000000000000000000000..2949dc37634beb8332a8c231c41253bb1501ad6a --- /dev/null +++ b/paddlespeech/t2s/modules/conformer/encoder_layer.py @@ -0,0 +1,196 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from espnet(https://github.com/espnet/espnet) +"""Encoder self-attention layer definition.""" +import paddle +from paddle import nn + +from paddlespeech.t2s.modules.layer_norm import LayerNorm + + +class EncoderLayer(nn.Layer): + """Encoder layer module. + Parameters + ---------- + size : int + Input dimension. + self_attn : nn.Layer + Self-attention module instance. + `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` instance + can be used as the argument. + feed_forward : nn.Layer + Feed-forward module instance. + `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance + can be used as the argument. + feed_forward_macaron : nn.Layer + Additional feed-forward module instance. + `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance + can be used as the argument. + conv_module : nn.Layer + Convolution module instance. 
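# --- Small usage sketch for the ConvolutionModule above (illustrative only):
# kernel_size must be odd so the 'SAME' padding assertion holds, and both the
# input and the output are laid out as (#batch, time, channels).
import paddle

conv_module = ConvolutionModule(channels=256, kernel_size=31)
x = paddle.randn([2, 100, 256])
y = conv_module(x)                      # (2, 100, 256), time length preserved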
+ `ConvlutionModule` instance can be used as the argument. + dropout_rate : float + Dropout rate. + normalize_before : bool + Whether to use layer_norm before the first block. + concat_after : bool + Whether to concat attention layer's input and output. + if True, additional linear will be applied. + i.e. x -> x + linear(concat(x, att(x))) + if False, no additional linear will be applied. i.e. x -> x + att(x) + stochastic_depth_rate : float + Proability to skip this layer. + During training, the layer may skip residual computation and return input + as-is with given probability. + """ + + def __init__( + self, + size, + self_attn, + feed_forward, + feed_forward_macaron, + conv_module, + dropout_rate, + normalize_before=True, + concat_after=False, + stochastic_depth_rate=0.0, ): + """Construct an EncoderLayer object.""" + super().__init__() + self.self_attn = self_attn + self.feed_forward = feed_forward + self.feed_forward_macaron = feed_forward_macaron + self.conv_module = conv_module + self.norm_ff = LayerNorm(size) # for the FNN module + self.norm_mha = LayerNorm(size) # for the MHA module + if feed_forward_macaron is not None: + self.norm_ff_macaron = LayerNorm(size) + self.ff_scale = 0.5 + else: + self.ff_scale = 1.0 + if self.conv_module is not None: + self.norm_conv = LayerNorm(size) # for the CNN module + self.norm_final = LayerNorm( + size) # for the final output of the block + self.dropout = nn.Dropout(dropout_rate) + self.size = size + self.normalize_before = normalize_before + self.concat_after = concat_after + if self.concat_after: + self.concat_linear = nn.Linear(size + size, size) + self.stochastic_depth_rate = stochastic_depth_rate + + def forward(self, x_input, mask, cache=None): + """Compute encoded features. + Parameters + ---------- + x_input : Union[Tuple, paddle.Tensor] + Input tensor w/ or w/o pos emb. + - w/ pos emb: Tuple of tensors [(#batch, time, size), (1, time, size)]. + - w/o pos emb: Tensor (#batch, time, size). + mask : paddle.Tensor + Mask tensor for the input (#batch, time). + cache paddle.Tensor + Cache tensor of the input (#batch, time - 1, size). + Returns + ---------- + paddle.Tensor + Output tensor (#batch, time, size). + paddle.Tensor + Mask tensor (#batch, time). + """ + if isinstance(x_input, tuple): + x, pos_emb = x_input[0], x_input[1] + else: + x, pos_emb = x_input, None + + skip_layer = False + # with stochastic depth, residual connection `x + f(x)` becomes + # `x <- x + 1 / (1 - p) * f(x)` at training time. 
+ stoch_layer_coeff = 1.0 + if self.training and self.stochastic_depth_rate > 0: + skip_layer = paddle.rand(1).item() < self.stochastic_depth_rate + stoch_layer_coeff = 1.0 / (1 - self.stochastic_depth_rate) + + if skip_layer: + if cache is not None: + x = paddle.concat([cache, x], axis=1) + if pos_emb is not None: + return (x, pos_emb), mask + return x, mask + + # whether to use macaron style + if self.feed_forward_macaron is not None: + residual = x + if self.normalize_before: + x = self.norm_ff_macaron(x) + x = residual + stoch_layer_coeff * self.ff_scale * self.dropout( + self.feed_forward_macaron(x)) + if not self.normalize_before: + x = self.norm_ff_macaron(x) + + # multi-headed self-attention module + residual = x + if self.normalize_before: + x = self.norm_mha(x) + + if cache is None: + x_q = x + else: + assert cache.shape == (x.shape[0], x.shape[1] - 1, self.size) + x_q = x[:, -1:, :] + residual = residual[:, -1:, :] + mask = None if mask is None else mask[:, -1:, :] + + if pos_emb is not None: + x_att = self.self_attn(x_q, x, x, pos_emb, mask) + else: + x_att = self.self_attn(x_q, x, x, mask) + + if self.concat_after: + x_concat = paddle.concat((x, x_att), axis=-1) + x = residual + stoch_layer_coeff * self.concat_linear(x_concat) + else: + x = residual + stoch_layer_coeff * self.dropout(x_att) + if not self.normalize_before: + x = self.norm_mha(x) + + # convolution module + if self.conv_module is not None: + residual = x + if self.normalize_before: + x = self.norm_conv(x) + x = residual + stoch_layer_coeff * self.dropout(self.conv_module(x)) + if not self.normalize_before: + x = self.norm_conv(x) + + # feed forward module + residual = x + if self.normalize_before: + x = self.norm_ff(x) + x = residual + stoch_layer_coeff * self.ff_scale * self.dropout( + self.feed_forward(x)) + if not self.normalize_before: + x = self.norm_ff(x) + + if self.conv_module is not None: + x = self.norm_final(x) + + if cache is not None: + x = paddle.concat([cache, x], axis=1) + + if pos_emb is not None: + return (x, pos_emb), mask + + return x, mask diff --git a/paddlespeech/t2s/modules/conv.py b/paddlespeech/t2s/modules/conv.py index d9bd98df5f6dbed5ce1921225d47f9e8d3ba5eea..68766d5e5af0a52ea9e1243de8bf39531142e3e2 100644 --- a/paddlespeech/t2s/modules/conv.py +++ b/paddlespeech/t2s/modules/conv.py @@ -84,7 +84,7 @@ class Conv1dCell(nn.Conv1D): _kernel_size = kernel_size[0] if isinstance(kernel_size, ( tuple, list)) else kernel_size self._r = 1 + (_kernel_size - 1) * _dilation - super(Conv1dCell, self).__init__( + super().__init__( in_channels, out_channels, kernel_size, @@ -226,7 +226,7 @@ class Conv1dBatchNorm(nn.Layer): data_format="NCL", momentum=0.9, epsilon=1e-05): - super(Conv1dBatchNorm, self).__init__() + super().__init__() self.conv = nn.Conv1D( in_channels, out_channels, diff --git a/paddlespeech/t2s/modules/expansion.py b/paddlespeech/t2s/modules/expansion.py deleted file mode 100644 index e9d4b6fe8645468828179eb4d2420070237de470..0000000000000000000000000000000000000000 --- a/paddlespeech/t2s/modules/expansion.py +++ /dev/null @@ -1,37 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
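# --- Standalone sketch of the stochastic-depth rule used by EncoderLayer
# above (not the class itself): during training the whole block is skipped
# with probability p, otherwise its residual branch is scaled by 1 / (1 - p)
# so the expected contribution matches inference; f is any block callable.
import paddle

def stochastic_residual(x, f, p, training=True):
    if training and p > 0:
        if paddle.rand([1]).item() < p:
            # skip the block entirely for this step
            return x
        return x + (1.0 / (1 - p)) * f(x)
    return x + f(x)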
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import numpy as np -import paddle -from paddle import Tensor - - -def expand(encodings: Tensor, durations: Tensor) -> Tensor: - """ - encodings: (B, T, C) - durations: (B, T) - """ - batch_size, t_enc = durations.shape - durations = durations.numpy() - slens = np.sum(durations, -1) - t_dec = np.max(slens) - M = np.zeros([batch_size, t_dec, t_enc]) - for i in range(batch_size): - k = 0 - for j in range(t_enc): - d = durations[i, j] - M[i, k:k + d, j] = 1 - k += d - M = paddle.to_tensor(M, dtype=encodings.dtype) - encodings = paddle.matmul(M, encodings) - return encodings diff --git a/paddlespeech/t2s/modules/fastspeech2_transformer/encoder.py b/paddlespeech/t2s/modules/fastspeech2_transformer/encoder.py deleted file mode 100644 index f91c76b727e8af153ec82bf70410c3c6cae0f227..0000000000000000000000000000000000000000 --- a/paddlespeech/t2s/modules/fastspeech2_transformer/encoder.py +++ /dev/null @@ -1,224 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from espnet(https://github.com/espnet/espnet) -from paddle import nn - -from paddlespeech.t2s.modules.fastspeech2_transformer.attention import MultiHeadedAttention -from paddlespeech.t2s.modules.fastspeech2_transformer.embedding import PositionalEncoding -from paddlespeech.t2s.modules.fastspeech2_transformer.encoder_layer import EncoderLayer -from paddlespeech.t2s.modules.fastspeech2_transformer.multi_layer_conv import Conv1dLinear -from paddlespeech.t2s.modules.fastspeech2_transformer.multi_layer_conv import MultiLayeredConv1d -from paddlespeech.t2s.modules.fastspeech2_transformer.positionwise_feed_forward import PositionwiseFeedForward -from paddlespeech.t2s.modules.fastspeech2_transformer.repeat import repeat - - -class Encoder(nn.Layer): - """Transformer encoder module. - - Parameters - ---------- - idim : int - Input dimension. - attention_dim : int - Dimention of attention. - attention_heads : int - The number of heads of multi head attention. - linear_units : int - The number of units of position-wise feed forward. - num_blocks : int - The number of decoder blocks. - dropout_rate : float - Dropout rate. - positional_dropout_rate : float - Dropout rate after adding positional encoding. - attention_dropout_rate : float - Dropout rate in attention. - input_layer : Union[str, paddle.nn.Layer] - Input layer type. - pos_enc_class : paddle.nn.Layer - Positional encoding module class. - `PositionalEncoding `or `ScaledPositionalEncoding` - normalize_before : bool - Whether to use layer_norm before the first block. 
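# --- Tiny numeric sketch of the expand() helper removed above: each encoder
# frame is repeated according to its integer duration, so durations [2, 1, 3]
# turn 3 encoder frames into 6 decoder frames.
import paddle

encodings = paddle.arange(6, dtype="float32").reshape([1, 3, 2])  # (B=1, T=3, C=2)
durations = paddle.to_tensor([[2, 1, 3]], dtype="int64")
expanded = expand(encodings, durations)   # (1, 6, 2); frame i repeated durations[i] times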
- concat_after : bool - Whether to concat attention layer's input and output. - if True, additional linear will be applied. - i.e. x -> x + linear(concat(x, att(x))) - if False, no additional linear will be applied. i.e. x -> x + att(x) - positionwise_layer_type : str - "linear", "conv1d", or "conv1d-linear". - positionwise_conv_kernel_size : int - Kernel size of positionwise conv1d layer. - selfattention_layer_type : str - Encoder attention layer type. - padding_idx : int - Padding idx for input_layer=embed. - """ - - def __init__( - self, - idim, - attention_dim=256, - attention_heads=4, - linear_units=2048, - num_blocks=6, - dropout_rate=0.1, - positional_dropout_rate=0.1, - attention_dropout_rate=0.0, - input_layer="conv2d", - pos_enc_class=PositionalEncoding, - normalize_before=True, - concat_after=False, - positionwise_layer_type="linear", - positionwise_conv_kernel_size=1, - selfattention_layer_type="selfattn", - padding_idx=-1, ): - """Construct an Encoder object.""" - super(Encoder, self).__init__() - self.conv_subsampling_factor = 1 - if input_layer == "linear": - self.embed = nn.Sequential( - nn.Linear(idim, attention_dim, bias_attr=True), - nn.LayerNorm(attention_dim), - nn.Dropout(dropout_rate), - nn.ReLU(), - pos_enc_class(attention_dim, positional_dropout_rate), ) - elif input_layer == "embed": - self.embed = nn.Sequential( - nn.Embedding(idim, attention_dim, padding_idx=padding_idx), - pos_enc_class(attention_dim, positional_dropout_rate), ) - elif isinstance(input_layer, nn.Layer): - self.embed = nn.Sequential( - input_layer, - pos_enc_class(attention_dim, positional_dropout_rate), ) - elif input_layer is None: - self.embed = nn.Sequential( - pos_enc_class(attention_dim, positional_dropout_rate)) - else: - raise ValueError("unknown input_layer: " + input_layer) - - self.normalize_before = normalize_before - positionwise_layer, positionwise_layer_args = self.get_positionwise_layer( - positionwise_layer_type, - attention_dim, - linear_units, - dropout_rate, - positionwise_conv_kernel_size, ) - if selfattention_layer_type in [ - "selfattn", - "rel_selfattn", - "legacy_rel_selfattn", - ]: - encoder_selfattn_layer = MultiHeadedAttention - encoder_selfattn_layer_args = [ - (attention_heads, attention_dim, attention_dropout_rate, ) - ] * num_blocks - - else: - raise NotImplementedError(selfattention_layer_type) - - self.encoders = repeat( - num_blocks, - lambda lnum: EncoderLayer( - attention_dim, - encoder_selfattn_layer(*encoder_selfattn_layer_args[lnum]), - positionwise_layer(*positionwise_layer_args), - dropout_rate, - normalize_before, - concat_after, ), ) - if self.normalize_before: - self.after_norm = nn.LayerNorm(attention_dim) - - def get_positionwise_layer( - self, - positionwise_layer_type="linear", - attention_dim=256, - linear_units=2048, - dropout_rate=0.1, - positionwise_conv_kernel_size=1, ): - """Define positionwise layer.""" - if positionwise_layer_type == "linear": - positionwise_layer = PositionwiseFeedForward - positionwise_layer_args = (attention_dim, linear_units, - dropout_rate) - elif positionwise_layer_type == "conv1d": - positionwise_layer = MultiLayeredConv1d - positionwise_layer_args = (attention_dim, linear_units, - positionwise_conv_kernel_size, - dropout_rate, ) - elif positionwise_layer_type == "conv1d-linear": - positionwise_layer = Conv1dLinear - positionwise_layer_args = (attention_dim, linear_units, - positionwise_conv_kernel_size, - dropout_rate, ) - else: - raise NotImplementedError("Support only linear or conv1d.") - return 
positionwise_layer, positionwise_layer_args - - def forward(self, xs, masks): - """Encode input sequence. - - Parameters - ---------- - xs : paddle.Tensor - Input tensor (#batch, time, idim). - masks : paddle.Tensor - Mask tensor (#batch, time). - - Returns - ---------- - paddle.Tensor - Output tensor (#batch, time, attention_dim). - paddle.Tensor - Mask tensor (#batch, time). - """ - - xs = self.embed(xs) - xs, masks = self.encoders(xs, masks) - if self.normalize_before: - xs = self.after_norm(xs) - return xs, masks - - def forward_one_step(self, xs, masks, cache=None): - """Encode input frame. - - Parameters - ---------- - xs : paddle.Tensor - Input tensor. - masks : paddle.Tensor - Mask tensor. - cache : List[paddle.Tensor] - List of cache tensors. - - Returns - ---------- - paddle.Tensor - Output tensor. - paddle.Tensor - Mask tensor. - List[paddle.Tensor] - List of new cache tensors. - """ - - xs = self.embed(xs) - if cache is None: - cache = [None for _ in range(len(self.encoders))] - new_cache = [] - for c, e in zip(cache, self.encoders): - xs, masks = e(xs, masks, cache=c) - new_cache.append(xs) - if self.normalize_before: - xs = self.after_norm(xs) - return xs, masks, new_cache diff --git a/paddlespeech/t2s/modules/layer_norm.py b/paddlespeech/t2s/modules/layer_norm.py index a1c775fc8bfa6d1b720e58cf4db223f01c558c5d..4edd22c98302fd10e99184b53a64c6cd7a6ee855 100644 --- a/paddlespeech/t2s/modules/layer_norm.py +++ b/paddlespeech/t2s/modules/layer_norm.py @@ -13,9 +13,10 @@ # limitations under the License. """Layer normalization module.""" import paddle +from paddle import nn -class LayerNorm(paddle.nn.LayerNorm): +class LayerNorm(nn.LayerNorm): """Layer normalization module. Parameters @@ -28,7 +29,7 @@ class LayerNorm(paddle.nn.LayerNorm): def __init__(self, nout, dim=-1): """Construct an LayerNorm object.""" - super(LayerNorm, self).__init__(nout) + super().__init__(nout) self.dim = dim def forward(self, x): diff --git a/paddlespeech/t2s/modules/losses.py b/paddlespeech/t2s/modules/losses.py index ece9e04505e65c8e2b6779c117f8d21557c12f4d..6b0ab6b3342f39e2f9dc8e17d4d3f69ff82460a2 100644 --- a/paddlespeech/t2s/modules/losses.py +++ b/paddlespeech/t2s/modules/losses.py @@ -11,18 +11,16 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import math + import paddle +from paddle import nn from paddle.fluid.layers import sequence_mask from paddle.nn import functional as F - -__all__ = [ - "guided_attention_loss", - "weighted_mean", - "masked_l1_loss", - "masked_softmax_with_cross_entropy", -] +from scipy import signal +# Loss for Tacotron2 def attention_guide(dec_lens, enc_lens, N, T, g, dtype=None): """Build that W matrix. shape(B, T_dec, T_enc) W[i, n, t] = 1 - exp(-(n/dec_lens[i] - t/enc_lens[i])**2 / (2g**2)) @@ -57,6 +55,367 @@ def guided_attention_loss(attention_weight, dec_lens, enc_lens, g): return loss +# Losses for GAN Vocoder +def stft(x, + fft_size, + hop_length=None, + win_length=None, + window='hann', + center=True, + pad_mode='reflect'): + """Perform STFT and convert to magnitude spectrogram. + Parameters + ---------- + x : Tensor + Input signal tensor (B, T). + fft_size : int + FFT size. + hop_size : int + Hop size. + win_length : int + window : str, optional + window : str + Name of window function, see `scipy.signal.get_window` for more + details. Defaults to "hann". 
+ center : bool, optional + center (bool, optional): Whether to pad `x` to make that the + :math:`t \times hop\_length` at the center of :math:`t`-th frame. Default: `True`. + pad_mode : str, optional + Choose padding pattern when `center` is `True`. + Returns + ---------- + Tensor: + Magnitude spectrogram (B, #frames, fft_size // 2 + 1). + """ + # calculate window + window = signal.get_window(window, win_length, fftbins=True) + window = paddle.to_tensor(window) + x_stft = paddle.signal.stft( + x, + fft_size, + hop_length, + win_length, + window=window, + center=center, + pad_mode=pad_mode) + + real = x_stft.real() + imag = x_stft.imag() + + return paddle.sqrt(paddle.clip(real**2 + imag**2, min=1e-7)).transpose( + [0, 2, 1]) + + +class SpectralConvergenceLoss(nn.Layer): + """Spectral convergence loss module.""" + + def __init__(self): + """Initilize spectral convergence loss module.""" + super().__init__() + + def forward(self, x_mag, y_mag): + """Calculate forward propagation. + Parameters + ---------- + x_mag : Tensor + Magnitude spectrogram of predicted signal (B, #frames, #freq_bins). + y_mag : Tensor) + Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins). + Returns + ---------- + Tensor + Spectral convergence loss value. + """ + return paddle.norm( + y_mag - x_mag, p="fro") / paddle.clip( + paddle.norm(y_mag, p="fro"), min=1e-10) + + +class LogSTFTMagnitudeLoss(nn.Layer): + """Log STFT magnitude loss module.""" + + def __init__(self, epsilon=1e-7): + """Initilize los STFT magnitude loss module.""" + super().__init__() + self.epsilon = epsilon + + def forward(self, x_mag, y_mag): + """Calculate forward propagation. + Parameters + ---------- + x_mag : Tensor + Magnitude spectrogram of predicted signal (B, #frames, #freq_bins). + y_mag : Tensor + Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins). + Returns + ---------- + Tensor + Log STFT magnitude loss value. + """ + return F.l1_loss( + paddle.log(paddle.clip(y_mag, min=self.epsilon)), + paddle.log(paddle.clip(x_mag, min=self.epsilon))) + + +class STFTLoss(nn.Layer): + """STFT loss module.""" + + def __init__(self, + fft_size=1024, + shift_size=120, + win_length=600, + window="hann"): + """Initialize STFT loss module.""" + super().__init__() + self.fft_size = fft_size + self.shift_size = shift_size + self.win_length = win_length + self.window = window + self.spectral_convergence_loss = SpectralConvergenceLoss() + self.log_stft_magnitude_loss = LogSTFTMagnitudeLoss() + + def forward(self, x, y): + """Calculate forward propagation. + Parameters + ---------- + x : Tensor + Predicted signal (B, T). + y : Tensor + Groundtruth signal (B, T). + Returns + ---------- + Tensor + Spectral convergence loss value. + Tensor + Log STFT magnitude loss value. + """ + x_mag = stft(x, self.fft_size, self.shift_size, self.win_length, + self.window) + y_mag = stft(y, self.fft_size, self.shift_size, self.win_length, + self.window) + sc_loss = self.spectral_convergence_loss(x_mag, y_mag) + mag_loss = self.log_stft_magnitude_loss(x_mag, y_mag) + + return sc_loss, mag_loss + + +class MultiResolutionSTFTLoss(nn.Layer): + """Multi resolution STFT loss module.""" + + def __init__( + self, + fft_sizes=[1024, 2048, 512], + hop_sizes=[120, 240, 50], + win_lengths=[600, 1200, 240], + window="hann", ): + """Initialize Multi resolution STFT loss module. + Parameters + ---------- + fft_sizes : list + List of FFT sizes. + hop_sizes : list + List of hop sizes. + win_lengths : list + List of window lengths. 
+ window : str + Window function type. + """ + super().__init__() + assert len(fft_sizes) == len(hop_sizes) == len(win_lengths) + self.stft_losses = nn.LayerList() + for fs, ss, wl in zip(fft_sizes, hop_sizes, win_lengths): + self.stft_losses.append(STFTLoss(fs, ss, wl, window)) + + def forward(self, x, y): + """Calculate forward propagation. + Parameters + ---------- + x : Tensor + Predicted signal (B, T) or (B, #subband, T). + y : Tensor + Groundtruth signal (B, T) or (B, #subband, T). + Returns + ---------- + Tensor + Multi resolution spectral convergence loss value. + Tensor + Multi resolution log STFT magnitude loss value. + """ + if len(x.shape) == 3: + # (B, C, T) -> (B x C, T) + x = x.reshape([-1, x.shape[2]]) + # (B, C, T) -> (B x C, T) + y = y.reshape([-1, y.shape[2]]) + sc_loss = 0.0 + mag_loss = 0.0 + for f in self.stft_losses: + sc_l, mag_l = f(x, y) + sc_loss += sc_l + mag_loss += mag_l + sc_loss /= len(self.stft_losses) + mag_loss /= len(self.stft_losses) + + return sc_loss, mag_loss + + +class GeneratorAdversarialLoss(nn.Layer): + """Generator adversarial loss module.""" + + def __init__( + self, + average_by_discriminators=True, + loss_type="mse", ): + """Initialize GeneratorAversarialLoss module.""" + super().__init__() + self.average_by_discriminators = average_by_discriminators + assert loss_type in ["mse", "hinge"], f"{loss_type} is not supported." + if loss_type == "mse": + self.criterion = self._mse_loss + else: + self.criterion = self._hinge_loss + + def forward(self, outputs): + """Calcualate generator adversarial loss. + Parameters + ---------- + outputs: Tensor or List + Discriminator outputs or list of discriminator outputs. + Returns + ---------- + Tensor + Generator adversarial loss value. + """ + if isinstance(outputs, (tuple, list)): + adv_loss = 0.0 + for i, outputs_ in enumerate(outputs): + if isinstance(outputs_, (tuple, list)): + # case including feature maps + outputs_ = outputs_[-1] + adv_loss += self.criterion(outputs_) + if self.average_by_discriminators: + adv_loss /= i + 1 + else: + adv_loss = self.criterion(outputs) + + return adv_loss + + def _mse_loss(self, x): + return F.mse_loss(x, paddle.ones_like(x)) + + def _hinge_loss(self, x): + return -x.mean() + + +class DiscriminatorAdversarialLoss(nn.Layer): + """Discriminator adversarial loss module.""" + + def __init__( + self, + average_by_discriminators=True, + loss_type="mse", ): + """Initialize DiscriminatorAversarialLoss module.""" + super().__init__() + self.average_by_discriminators = average_by_discriminators + assert loss_type in ["mse"], f"{loss_type} is not supported." + if loss_type == "mse": + self.fake_criterion = self._mse_fake_loss + self.real_criterion = self._mse_real_loss + + def forward(self, outputs_hat, outputs): + """Calcualate discriminator adversarial loss. + Parameters + ---------- + outputs_hat : Tensor or list + Discriminator outputs or list of + discriminator outputs calculated from generator outputs. + outputs : Tensor or list + Discriminator outputs or list of + discriminator outputs calculated from groundtruth. + Returns + ---------- + Tensor + Discriminator real loss value. + Tensor + Discriminator fake loss value. 
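# --- Minimal sketch of MultiResolutionSTFTLoss above as the auxiliary
# reconstruction term of a GAN vocoder; the waveforms here are random
# placeholders standing in for generator output and ground truth.
import paddle

criterion_stft = MultiResolutionSTFTLoss()     # default FFT/hop/window sizes
wav_hat = paddle.randn([4, 8192])              # predicted waveform (B, T)
wav = paddle.randn([4, 8192])                  # ground-truth waveform (B, T)
sc_loss, mag_loss = criterion_stft(wav_hat, wav)
aux_loss = sc_loss + mag_loss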
+ """ + if isinstance(outputs, (tuple, list)): + real_loss = 0.0 + fake_loss = 0.0 + for i, (outputs_hat_, + outputs_) in enumerate(zip(outputs_hat, outputs)): + if isinstance(outputs_hat_, (tuple, list)): + # case including feature maps + outputs_hat_ = outputs_hat_[-1] + outputs_ = outputs_[-1] + real_loss += self.real_criterion(outputs_) + fake_loss += self.fake_criterion(outputs_hat_) + if self.average_by_discriminators: + fake_loss /= i + 1 + real_loss /= i + 1 + else: + real_loss = self.real_criterion(outputs) + fake_loss = self.fake_criterion(outputs_hat) + + return real_loss, fake_loss + + def _mse_real_loss(self, x): + return F.mse_loss(x, paddle.ones_like(x)) + + def _mse_fake_loss(self, x): + return F.mse_loss(x, paddle.zeros_like(x)) + + +# Losses for SpeedySpeech +# Structural Similarity Index Measure (SSIM) +def gaussian(window_size, sigma): + gauss = paddle.to_tensor([ + math.exp(-(x - window_size // 2)**2 / float(2 * sigma**2)) + for x in range(window_size) + ]) + return gauss / gauss.sum() + + +def create_window(window_size, channel): + _1D_window = gaussian(window_size, 1.5).unsqueeze(1) + _2D_window = paddle.matmul(_1D_window, paddle.transpose( + _1D_window, [1, 0])).unsqueeze([0, 1]) + window = paddle.expand(_2D_window, [channel, 1, window_size, window_size]) + return window + + +def _ssim(img1, img2, window, window_size, channel, size_average=True): + mu1 = F.conv2d(img1, window, padding=window_size // 2, groups=channel) + mu2 = F.conv2d(img2, window, padding=window_size // 2, groups=channel) + + mu1_sq = mu1.pow(2) + mu2_sq = mu2.pow(2) + mu1_mu2 = mu1 * mu2 + + sigma1_sq = F.conv2d( + img1 * img1, window, padding=window_size // 2, groups=channel) - mu1_sq + sigma2_sq = F.conv2d( + img2 * img2, window, padding=window_size // 2, groups=channel) - mu2_sq + sigma12 = F.conv2d( + img1 * img2, window, padding=window_size // 2, groups=channel) - mu1_mu2 + + C1 = 0.01**2 + C2 = 0.03**2 + + ssim_map = ((2 * mu1_mu2 + C1) * (2 * sigma12 + C2)) \ + / ((mu1_sq + mu2_sq + C1) * (sigma1_sq + sigma2_sq + C2)) + + if size_average: + return ssim_map.mean() + else: + return ssim_map.mean(1).mean(1).mean(1) + + +def ssim(img1, img2, window_size=11, size_average=True): + (_, channel, _, _) = img1.shape + window = create_window(window_size, channel) + return _ssim(img1, img2, window, window_size, channel, size_average) + + def weighted_mean(input, weight): """Weighted mean. It can also be used as masked mean. @@ -98,28 +457,3 @@ def masked_l1_loss(prediction, target, mask): abs_error = F.l1_loss(prediction, target, reduction='none') loss = weighted_mean(abs_error, mask) return loss - - -def masked_softmax_with_cross_entropy(logits, label, mask, axis=-1): - """Compute masked softmax with cross entropy loss. - - Parameters - ---------- - logits : Tensor - The logits. The ``axis``-th axis is the class dimension. - label : Tensor [dtype: int] - The label. The size of the ``axis``-th axis should be 1. - mask : Tensor - The mask. The shape should be broadcastable to ``label``. - axis : int, optional - The index of the class dimension in the shape of ``logits``, by default - -1. - - Returns - ------- - Tensor [shape=(1,)] - The masked softmax with cross entropy loss. 
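# --- Sketch of how the two adversarial losses above combine in GAN vocoder
# training; `disc`, `wav_hat`, and `wav` are assumed to come from the
# surrounding training loop (a multi-scale discriminator returning a list of
# outputs, the generator output, and the ground-truth waveform).
gen_adv_criterion = GeneratorAdversarialLoss(loss_type="mse")
dis_adv_criterion = DiscriminatorAdversarialLoss(loss_type="mse")

# generator step: push D(fake) towards the "real" target (ones)
outputs_hat = disc(wav_hat)
adv_loss = gen_adv_criterion(outputs_hat)

# discriminator step: real -> ones, fake -> zeros (detach the generator output)
real_loss, fake_loss = dis_adv_criterion(disc(wav_hat.detach()), disc(wav))
dis_loss = real_loss + fake_loss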
- """ - ce = F.softmax_with_cross_entropy(logits, label, axis=axis) - loss = weighted_mean(ce, mask) - return loss diff --git a/paddlespeech/t2s/modules/masking.py b/paddlespeech/t2s/modules/masking.py deleted file mode 100644 index 7cf37040a728e9fa8cf4c4ef11d818b444186358..0000000000000000000000000000000000000000 --- a/paddlespeech/t2s/modules/masking.py +++ /dev/null @@ -1,120 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import paddle - -__all__ = [ - "id_mask", - "feature_mask", - "combine_mask", - "future_mask", -] - - -def id_mask(input, padding_index=0, dtype="bool"): - """Generate mask with input ids. - - Those positions where the value equals ``padding_index`` correspond to 0 or - ``False``, otherwise, 1 or ``True``. - - Parameters - ---------- - input : Tensor [dtype: int] - The input tensor. It represents the ids. - padding_index : int, optional - The id which represents padding, by default 0. - dtype : str, optional - Data type of the returned mask, by default "bool". - - Returns - ------- - Tensor - The generate mask. It has the same shape as ``input`` does. - """ - return paddle.cast(input != padding_index, dtype) - - -def feature_mask(input, axis, dtype="bool"): - """Compute mask from input features. - - For a input features, represented as batched feature vectors, those vectors - which all zeros are considerd padding vectors. - - Parameters - ---------- - input : Tensor [dtype: float] - The input tensor which represents featues. - axis : int - The index of the feature dimension in ``input``. Other dimensions are - considered ``spatial`` dimensions. - dtype : str, optional - Data type of the generated mask, by default "bool" - Returns - ------- - Tensor - The geenrated mask with ``spatial`` shape as mentioned above. - - It has one less dimension than ``input`` does. - """ - feature_sum = paddle.sum(paddle.abs(input), axis) - return paddle.cast(feature_sum != 0, dtype) - - -def combine_mask(mask1, mask2): - """Combine two mask with multiplication or logical and. - - Parameters - ----------- - mask1 : Tensor - The first mask. - mask2 : Tensor - The second mask with broadcastable shape with ``mask1``. - Returns - -------- - Tensor - Combined mask. - - Notes - ------ - It is mainly used to combine the padding mask and no future mask for - transformer decoder. - - Padding mask is used to mask padding positions of the decoder inputs and - no future mask is used to prevent the decoder to see future information. - """ - if mask1.dtype == paddle.fluid.core.VarDesc.VarType.BOOL: - return paddle.logical_and(mask1, mask2) - else: - return mask1 * mask2 - - -def future_mask(time_steps, dtype="bool"): - """Generate lower triangular mask. - - It is used at transformer decoder to prevent the decoder to see future - information. - - Parameters - ---------- - time_steps : int - Decoder time steps. - dtype : str, optional - The data type of the generate mask, by default "bool". 
- - Returns - ------- - Tensor - The generated mask. - """ - mask = paddle.tril(paddle.ones([time_steps, time_steps])) - return paddle.cast(mask, dtype) diff --git a/paddlespeech/t2s/modules/nets_utils.py b/paddlespeech/t2s/modules/nets_utils.py index 30d3db86c885a56204562b82f7e0d709a96717e2..3822b33d074ce2343412a7eadd4294a0f2f9accb 100644 --- a/paddlespeech/t2s/modules/nets_utils.py +++ b/paddlespeech/t2s/modules/nets_utils.py @@ -129,7 +129,7 @@ def initialize(model: nn.Layer, init: str): Parameters ---------- - model : paddle.nn.Layer + model : nn.Layer Target. init : str Method of initialization. diff --git a/paddlespeech/t2s/modules/pqmf.py b/paddlespeech/t2s/modules/pqmf.py index c299fb5772070fa2e5a4fdc5201541b787b7d449..fb850a4d55548f8dfcc5262e919eb0ab1d91c8ec 100644 --- a/paddlespeech/t2s/modules/pqmf.py +++ b/paddlespeech/t2s/modules/pqmf.py @@ -16,6 +16,7 @@ import numpy as np import paddle import paddle.nn.functional as F +from paddle import nn from scipy.signal import kaiser @@ -56,7 +57,7 @@ def design_prototype_filter(taps=62, cutoff_ratio=0.142, beta=9.0): return h -class PQMF(paddle.nn.Layer): +class PQMF(nn.Layer): """PQMF module. This module is based on `Near-perfect-reconstruction pseudo-QMF banks`_. .. _`Near-perfect-reconstruction pseudo-QMF banks`: @@ -105,7 +106,7 @@ class PQMF(paddle.nn.Layer): self.updown_filter = updown_filter self.subbands = subbands # keep padding info - self.pad_fn = paddle.nn.Pad1D(taps // 2, mode='constant', value=0.0) + self.pad_fn = nn.Pad1D(taps // 2, mode='constant', value=0.0) def analysis(self, x): """Analysis with PQMF. diff --git a/paddlespeech/t2s/modules/predictor/__init__.py b/paddlespeech/t2s/modules/predictor/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..185a92b8d94d3426d616c0624f0f2ee04339349e --- /dev/null +++ b/paddlespeech/t2s/modules/predictor/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/paddlespeech/t2s/modules/fastspeech2_predictor/duration_predictor.py b/paddlespeech/t2s/modules/predictor/duration_predictor.py similarity index 98% rename from paddlespeech/t2s/modules/fastspeech2_predictor/duration_predictor.py rename to paddlespeech/t2s/modules/predictor/duration_predictor.py index b269b6866452f57296656f30e0b74f472737891c..6d7adf236d285db283b0a3558c1ad086c59a3108 100644 --- a/paddlespeech/t2s/modules/fastspeech2_predictor/duration_predictor.py +++ b/paddlespeech/t2s/modules/predictor/duration_predictor.py @@ -65,7 +65,7 @@ class DurationPredictor(nn.Layer): Offset value to avoid nan in log domain. """ - super(DurationPredictor, self).__init__() + super().__init__() self.offset = offset self.conv = nn.LayerList() for idx in range(n_layers): @@ -155,7 +155,7 @@ class DurationPredictorLoss(nn.Layer): reduction : str Reduction type in loss calculation. 
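# --- Sketch combining the (removed) masking helpers above for a transformer
# decoder: a padding mask built from token ids is merged with the
# lower-triangular future mask so each step only attends to valid past tokens.
import paddle

ids = paddle.to_tensor([[5, 3, 9, 0, 0]])         # 0 is the padding index
padding_mask = id_mask(ids)                       # (1, 5), False at paddings
no_future = future_mask(5)                        # (5, 5) lower-triangular bool
dec_mask = combine_mask(paddle.unsqueeze(padding_mask, 1), no_future)  # (1, 5, 5)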
""" - super(DurationPredictorLoss, self).__init__() + super().__init__() self.criterion = nn.MSELoss(reduction=reduction) self.offset = offset diff --git a/paddlespeech/t2s/modules/fastspeech2_predictor/length_regulator.py b/paddlespeech/t2s/modules/predictor/length_regulator.py similarity index 100% rename from paddlespeech/t2s/modules/fastspeech2_predictor/length_regulator.py rename to paddlespeech/t2s/modules/predictor/length_regulator.py diff --git a/paddlespeech/t2s/modules/fastspeech2_predictor/variance_predictor.py b/paddlespeech/t2s/modules/predictor/variance_predictor.py similarity index 100% rename from paddlespeech/t2s/modules/fastspeech2_predictor/variance_predictor.py rename to paddlespeech/t2s/modules/predictor/variance_predictor.py diff --git a/paddlespeech/t2s/modules/ssim.py b/paddlespeech/t2s/modules/ssim.py deleted file mode 100644 index c9899cd6bc60d937b909f0b5cf405f4b16194a04..0000000000000000000000000000000000000000 --- a/paddlespeech/t2s/modules/ssim.py +++ /dev/null @@ -1,80 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from math import exp - -import paddle -import paddle.nn.functional as F -from paddle import nn - - -def gaussian(window_size, sigma): - gauss = paddle.to_tensor([ - exp(-(x - window_size // 2)**2 / float(2 * sigma**2)) - for x in range(window_size) - ]) - return gauss / gauss.sum() - - -def create_window(window_size, channel): - _1D_window = gaussian(window_size, 1.5).unsqueeze(1) - _2D_window = paddle.matmul(_1D_window, paddle.transpose( - _1D_window, [1, 0])).unsqueeze([0, 1]) - window = paddle.expand(_2D_window, [channel, 1, window_size, window_size]) - return window - - -def _ssim(img1, img2, window, window_size, channel, size_average=True): - mu1 = F.conv2d(img1, window, padding=window_size // 2, groups=channel) - mu2 = F.conv2d(img2, window, padding=window_size // 2, groups=channel) - - mu1_sq = mu1.pow(2) - mu2_sq = mu2.pow(2) - mu1_mu2 = mu1 * mu2 - - sigma1_sq = F.conv2d( - img1 * img1, window, padding=window_size // 2, groups=channel) - mu1_sq - sigma2_sq = F.conv2d( - img2 * img2, window, padding=window_size // 2, groups=channel) - mu2_sq - sigma12 = F.conv2d( - img1 * img2, window, padding=window_size // 2, groups=channel) - mu1_mu2 - - C1 = 0.01**2 - C2 = 0.03**2 - - ssim_map = ((2 * mu1_mu2 + C1) * (2 * sigma12 + C2)) \ - / ((mu1_sq + mu2_sq + C1) * (sigma1_sq + sigma2_sq + C2)) - - if size_average: - return ssim_map.mean() - else: - return ssim_map.mean(1).mean(1).mean(1) - - -class SSIM(nn.Layer): - def __init__(self, window_size=11, size_average=True): - super().__init__() - self.window_size = window_size - self.size_average = size_average - self.channel = 1 - self.window = create_window(window_size, self.channel) - - def forward(self, img1, img2): - return _ssim(img1, img2, self.window, self.window_size, self.channel, - self.size_average) - - -def ssim(img1, img2, window_size=11, size_average=True): - (_, channel, _, _) = img1.shape - window = 
create_window(window_size, channel) - return _ssim(img1, img2, window, window_size, channel, size_average) diff --git a/paddlespeech/t2s/modules/stft_loss.py b/paddlespeech/t2s/modules/stft_loss.py deleted file mode 100644 index 31963e718de6b617d1067458ee00987ebbe5a1ad..0000000000000000000000000000000000000000 --- a/paddlespeech/t2s/modules/stft_loss.py +++ /dev/null @@ -1,220 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from espnet(https://github.com/espnet/espnet) -import paddle -from paddle import nn -from paddle.nn import functional as F -from scipy import signal - - -def stft(x, - fft_size, - hop_length=None, - win_length=None, - window='hann', - center=True, - pad_mode='reflect'): - """Perform STFT and convert to magnitude spectrogram. - Parameters - ---------- - x : Tensor - Input signal tensor (B, T). - fft_size : int - FFT size. - hop_size : int - Hop size. - win_length : int - window : str, optional - window : str - Name of window function, see `scipy.signal.get_window` for more - details. Defaults to "hann". - center : bool, optional - center (bool, optional): Whether to pad `x` to make that the - :math:`t \times hop\_length` at the center of :math:`t`-th frame. Default: `True`. - pad_mode : str, optional - Choose padding pattern when `center` is `True`. - Returns - ---------- - Tensor: - Magnitude spectrogram (B, #frames, fft_size // 2 + 1). - """ - # calculate window - window = signal.get_window(window, win_length, fftbins=True) - window = paddle.to_tensor(window) - x_stft = paddle.signal.stft( - x, - fft_size, - hop_length, - win_length, - window=window, - center=center, - pad_mode=pad_mode) - - real = x_stft.real() - imag = x_stft.imag() - - return paddle.sqrt(paddle.clip(real**2 + imag**2, min=1e-7)).transpose( - [0, 2, 1]) - - -class SpectralConvergenceLoss(nn.Layer): - """Spectral convergence loss module.""" - - def __init__(self): - """Initilize spectral convergence loss module.""" - super().__init__() - - def forward(self, x_mag, y_mag): - """Calculate forward propagation. - Parameters - ---------- - x_mag : Tensor - Magnitude spectrogram of predicted signal (B, #frames, #freq_bins). - y_mag : Tensor) - Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins). - Returns - ---------- - Tensor - Spectral convergence loss value. - """ - return paddle.norm( - y_mag - x_mag, p="fro") / paddle.clip( - paddle.norm(y_mag, p="fro"), min=1e-10) - - -class LogSTFTMagnitudeLoss(nn.Layer): - """Log STFT magnitude loss module.""" - - def __init__(self, epsilon=1e-7): - """Initilize los STFT magnitude loss module.""" - super().__init__() - self.epsilon = epsilon - - def forward(self, x_mag, y_mag): - """Calculate forward propagation. - Parameters - ---------- - x_mag : Tensor - Magnitude spectrogram of predicted signal (B, #frames, #freq_bins). - y_mag : Tensor - Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins). 
- Returns - ---------- - Tensor - Log STFT magnitude loss value. - """ - return F.l1_loss( - paddle.log(paddle.clip(y_mag, min=self.epsilon)), - paddle.log(paddle.clip(x_mag, min=self.epsilon))) - - -class STFTLoss(nn.Layer): - """STFT loss module.""" - - def __init__(self, - fft_size=1024, - shift_size=120, - win_length=600, - window="hann"): - """Initialize STFT loss module.""" - super().__init__() - self.fft_size = fft_size - self.shift_size = shift_size - self.win_length = win_length - self.window = window - self.spectral_convergence_loss = SpectralConvergenceLoss() - self.log_stft_magnitude_loss = LogSTFTMagnitudeLoss() - - def forward(self, x, y): - """Calculate forward propagation. - Parameters - ---------- - x : Tensor - Predicted signal (B, T). - y : Tensor - Groundtruth signal (B, T). - Returns - ---------- - Tensor - Spectral convergence loss value. - Tensor - Log STFT magnitude loss value. - """ - x_mag = stft(x, self.fft_size, self.shift_size, self.win_length, - self.window) - y_mag = stft(y, self.fft_size, self.shift_size, self.win_length, - self.window) - sc_loss = self.spectral_convergence_loss(x_mag, y_mag) - mag_loss = self.log_stft_magnitude_loss(x_mag, y_mag) - - return sc_loss, mag_loss - - -class MultiResolutionSTFTLoss(nn.Layer): - """Multi resolution STFT loss module.""" - - def __init__( - self, - fft_sizes=[1024, 2048, 512], - hop_sizes=[120, 240, 50], - win_lengths=[600, 1200, 240], - window="hann", ): - """Initialize Multi resolution STFT loss module. - Parameters - ---------- - fft_sizes : list - List of FFT sizes. - hop_sizes : list - List of hop sizes. - win_lengths : list - List of window lengths. - window : str - Window function type. - """ - super().__init__() - assert len(fft_sizes) == len(hop_sizes) == len(win_lengths) - self.stft_losses = nn.LayerList() - for fs, ss, wl in zip(fft_sizes, hop_sizes, win_lengths): - self.stft_losses.append(STFTLoss(fs, ss, wl, window)) - - def forward(self, x, y): - """Calculate forward propagation. - Parameters - ---------- - x : Tensor - Predicted signal (B, T) or (B, #subband, T). - y : Tensor - Groundtruth signal (B, T) or (B, #subband, T). - Returns - ---------- - Tensor - Multi resolution spectral convergence loss value. - Tensor - Multi resolution log STFT magnitude loss value. 
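Examples
----------
A minimal usage sketch of this multi-resolution criterion on synthetic
waveforms; it assumes the classes defined in this file are in scope and
only illustrates shapes:

>>> import paddle
>>> criterion = MultiResolutionSTFTLoss()    # default FFT sizes 1024/2048/512
>>> y_hat = paddle.randn([4, 8000])          # predicted waveforms (B, T)
>>> y = paddle.randn([4, 8000])              # ground-truth waveforms (B, T)
>>> sc_loss, mag_loss = criterion(y_hat, y)  # two scalar Tensors
>>> total = sc_loss + mag_loss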
- """ - if len(x.shape) == 3: - # (B, C, T) -> (B x C, T) - x = x.reshape([-1, x.shape[2]]) - # (B, C, T) -> (B x C, T) - y = y.reshape([-1, y.shape[2]]) - sc_loss = 0.0 - mag_loss = 0.0 - for f in self.stft_losses: - sc_l, mag_l = f(x, y) - sc_loss += sc_l - mag_loss += mag_l - sc_loss /= len(self.stft_losses) - mag_loss /= len(self.stft_losses) - - return sc_loss, mag_loss diff --git a/paddlespeech/t2s/modules/style_encoder.py b/paddlespeech/t2s/modules/style_encoder.py index 868a73a969edb6d3dc1affe6b0e401a88fb7d11b..e76226f3c587d22006bcf6d513046ec641e91a61 100644 --- a/paddlespeech/t2s/modules/style_encoder.py +++ b/paddlespeech/t2s/modules/style_encoder.py @@ -19,7 +19,7 @@ import paddle from paddle import nn from typeguard import check_argument_types -from paddlespeech.t2s.modules.fastspeech2_transformer.attention import MultiHeadedAttention as BaseMultiHeadedAttention +from paddlespeech.t2s.modules.transformer.attention import MultiHeadedAttention as BaseMultiHeadedAttention class StyleEncoder(nn.Layer): @@ -74,7 +74,7 @@ class StyleEncoder(nn.Layer): gru_units: int=128, ): """Initilize global style encoder module.""" assert check_argument_types() - super(StyleEncoder, self).__init__() + super().__init__() self.ref_enc = ReferenceEncoder( idim=idim, @@ -93,11 +93,15 @@ class StyleEncoder(nn.Layer): def forward(self, speech: paddle.Tensor) -> paddle.Tensor: """Calculate forward propagation. - Args: - speech (Tensor): Batch of padded target features (B, Lmax, odim). + Parameters + ---------- + speech : Tensor + Batch of padded target features (B, Lmax, odim). - Returns: - Tensor: Style token embeddings (B, token_dim). + Returns + ---------- + Tensor: + Style token embeddings (B, token_dim). """ ref_embs = self.ref_enc(speech) @@ -145,7 +149,7 @@ class ReferenceEncoder(nn.Layer): gru_units: int=128, ): """Initilize reference encoder module.""" assert check_argument_types() - super(ReferenceEncoder, self).__init__() + super().__init__() # check hyperparameters are valid assert conv_kernel_size % 2 == 1, "kernel size must be odd." @@ -249,7 +253,7 @@ class StyleTokenLayer(nn.Layer): dropout_rate: float=0.0, ): """Initilize style token layer module.""" assert check_argument_types() - super(StyleTokenLayer, self).__init__() + super().__init__() gst_embs = paddle.randn(shape=[gst_tokens, gst_token_dim // gst_heads]) self.gst_embs = paddle.create_parameter( diff --git a/paddlespeech/t2s/modules/tacotron2/encoder.py b/paddlespeech/t2s/modules/tacotron2/encoder.py index b95e3529ff7eda456863b519e8982580c066cc01..f1889061396b3ee6ce62026a0a0e039d61cfdba0 100644 --- a/paddlespeech/t2s/modules/tacotron2/encoder.py +++ b/paddlespeech/t2s/modules/tacotron2/encoder.py @@ -73,7 +73,7 @@ class Encoder(nn.Layer): Dropout rate. """ - super(Encoder, self).__init__() + super().__init__() # store the hyperparameters self.idim = idim self.use_residual = use_residual diff --git a/paddlespeech/t2s/modules/transformer.py b/paddlespeech/t2s/modules/transformer.py deleted file mode 100644 index e50d58d44bc6663414a7390589d3a8d7ad6f2c5b..0000000000000000000000000000000000000000 --- a/paddlespeech/t2s/modules/transformer.py +++ /dev/null @@ -1,208 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from paddle import nn -from paddle.nn import functional as F - -from paddlespeech.t2s.modules import attention as attn - -__all__ = [ - "PositionwiseFFN", - "TransformerEncoderLayer", - "TransformerDecoderLayer", -] - - -class PositionwiseFFN(nn.Layer): - """A faithful implementation of Position-wise Feed-Forward Network - in `Attention is All You Need `_. - It is basically a 2-layer MLP, with relu actication and dropout in between. - - Parameters - ---------- - input_size: int - The feature size of the intput. It is also the feature size of the - output. - hidden_size: int - The hidden size. - dropout: float - The probability of the Dropout applied to the output of the first - layer, by default 0. - """ - - def __init__(self, input_size: int, hidden_size: int, dropout=0.0): - super(PositionwiseFFN, self).__init__() - self.linear1 = nn.Linear(input_size, hidden_size) - self.linear2 = nn.Linear(hidden_size, input_size) - self.dropout = nn.Dropout(dropout) - - self.input_size = input_size - self.hidden_szie = hidden_size - - def forward(self, x): - r"""Forward pass of positionwise feed forward network. - - Parameters - ---------- - x : Tensor [shape=(\*, input_size)] - The input tensor, where ``\*`` means arbitary shape. - - Returns - ------- - Tensor [shape=(\*, input_size)] - The output tensor. - """ - l1 = self.dropout(F.relu(self.linear1(x))) - l2 = self.linear2(l1) - return l2 - - -class TransformerEncoderLayer(nn.Layer): - """A faithful implementation of Transformer encoder layer in - `Attention is All You Need `_. - - Parameters - ---------- - d_model :int - The feature size of the input. It is also the feature size of the - output. - n_heads : int - The number of heads of self attention (a ``MultiheadAttention`` - layer). - d_ffn : int - The hidden size of the positional feed forward network (a - ``PositionwiseFFN`` layer). - dropout : float, optional - The probability of the dropout in MultiHeadAttention and - PositionwiseFFN, by default 0. - - Notes - ------ - It uses the PostLN (post layer norm) scheme. - """ - - def __init__(self, d_model, n_heads, d_ffn, dropout=0.): - super(TransformerEncoderLayer, self).__init__() - self.self_mha = attn.MultiheadAttention(d_model, n_heads, dropout) - self.layer_norm1 = nn.LayerNorm([d_model], epsilon=1e-6) - - self.ffn = PositionwiseFFN(d_model, d_ffn, dropout) - self.layer_norm2 = nn.LayerNorm([d_model], epsilon=1e-6) - - self.dropout = dropout - - def forward(self, x, mask): - """Forward pass of TransformerEncoderLayer. - - Parameters - ---------- - x : Tensor [shape=(batch_size, time_steps, d_model)] - The input. - mask : Tensor - The padding mask. The shape is (batch_size, time_steps, - time_steps) or broadcastable shape. - - Returns - ------- - x :Tensor [shape=(batch_size, time_steps, d_model)] - The encoded output. - - attn_weights : Tensor [shape=(batch_size, n_heads, time_steps, time_steps)] - The attention weights of the self attention. 
- """ - context_vector, attn_weights = self.self_mha(x, x, x, mask) - x = self.layer_norm1( - F.dropout(x + context_vector, self.dropout, training=self.training)) - - x = self.layer_norm2( - F.dropout(x + self.ffn(x), self.dropout, training=self.training)) - return x, attn_weights - - -class TransformerDecoderLayer(nn.Layer): - """A faithful implementation of Transformer decoder layer in - `Attention is All You Need `_. - - Parameters - ---------- - d_model :int - The feature size of the input. It is also the feature size of the - output. - n_heads : int - The number of heads of attentions (``MultiheadAttention`` - layers). - d_ffn : int - The hidden size of the positional feed forward network (a - ``PositionwiseFFN`` layer). - dropout : float, optional - The probability of the dropout in MultiHeadAttention and - PositionwiseFFN, by default 0. - - Notes - ------ - It uses the PostLN (post layer norm) scheme. - """ - - def __init__(self, d_model, n_heads, d_ffn, dropout=0.): - super(TransformerDecoderLayer, self).__init__() - self.self_mha = attn.MultiheadAttention(d_model, n_heads, dropout) - self.layer_norm1 = nn.LayerNorm([d_model], epsilon=1e-6) - - self.cross_mha = attn.MultiheadAttention(d_model, n_heads, dropout) - self.layer_norm2 = nn.LayerNorm([d_model], epsilon=1e-6) - - self.ffn = PositionwiseFFN(d_model, d_ffn, dropout) - self.layer_norm3 = nn.LayerNorm([d_model], epsilon=1e-6) - - self.dropout = dropout - - def forward(self, q, k, v, encoder_mask, decoder_mask): - """Forward pass of TransformerEncoderLayer. - - Parameters - ---------- - q : Tensor [shape=(batch_size, time_steps_q, d_model)] - The decoder input. - k : Tensor [shape=(batch_size, time_steps_k, d_model)] - The keys. - v : Tensor [shape=(batch_size, time_steps_k, d_model)] - The values - encoder_mask : Tensor - Encoder padding mask, shape is ``(batch_size, time_steps_k, - time_steps_k)`` or broadcastable shape. - decoder_mask : Tensor - Decoder mask, shape is ``(batch_size, time_steps_q, time_steps_k)`` - or broadcastable shape. - - Returns - -------- - q : Tensor [shape=(batch_size, time_steps_q, d_model)] - The decoder output. - self_attn_weights : Tensor [shape=(batch_size, n_heads, time_steps_q, time_steps_q)] - Decoder self attention. - - cross_attn_weights : Tensor [shape=(batch_size, n_heads, time_steps_q, time_steps_k)] - Decoder-encoder cross attention. - """ - context_vector, self_attn_weights = self.self_mha(q, q, q, decoder_mask) - q = self.layer_norm1( - F.dropout(q + context_vector, self.dropout, training=self.training)) - - context_vector, cross_attn_weights = self.cross_mha(q, k, v, - encoder_mask) - q = self.layer_norm2( - F.dropout(q + context_vector, self.dropout, training=self.training)) - - q = self.layer_norm3( - F.dropout(q + self.ffn(q), self.dropout, training=self.training)) - return q, self_attn_weights, cross_attn_weights diff --git a/paddlespeech/t2s/modules/transformer/__init__.py b/paddlespeech/t2s/modules/transformer/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..abf198b97e6e818e1fbe59006f98492640bcee54 --- /dev/null +++ b/paddlespeech/t2s/modules/transformer/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/paddlespeech/t2s/modules/fastspeech2_transformer/attention.py b/paddlespeech/t2s/modules/transformer/attention.py similarity index 54% rename from paddlespeech/t2s/modules/fastspeech2_transformer/attention.py rename to paddlespeech/t2s/modules/transformer/attention.py index b11329b03144f3997f36ab1914cbec1612faf41b..34386f2a5914d9053fe8e731c9ce1789d1625979 100644 --- a/paddlespeech/t2s/modules/fastspeech2_transformer/attention.py +++ b/paddlespeech/t2s/modules/transformer/attention.py @@ -37,7 +37,7 @@ class MultiHeadedAttention(nn.Layer): def __init__(self, n_head, n_feat, dropout_rate): """Construct an MultiHeadedAttention object.""" - super(MultiHeadedAttention, self).__init__() + super().__init__() assert n_feat % n_head == 0 # We assume d_v always equals d_k self.d_k = n_feat // n_head @@ -70,7 +70,7 @@ class MultiHeadedAttention(nn.Layer): paddle.Tensor Transformed value tensor (#batch, n_head, time2, d_k). """ - n_batch = query.shape[0] + n_batch = paddle.shape(query)[0] q = paddle.reshape( self.linear_q(query), [n_batch, -1, self.h, self.d_k]) @@ -104,7 +104,7 @@ class MultiHeadedAttention(nn.Layer): Transformed value (#batch, time1, d_model) weighted by the attention score (#batch, time1, time2). """ - n_batch = value.shape[0] + n_batch = paddle.shape(value)[0] softmax = paddle.nn.Softmax(axis=-1) if mask is not None: mask = mask.unsqueeze(1) @@ -126,8 +126,8 @@ class MultiHeadedAttention(nn.Layer): # (batch, time1, d_model) x = (paddle.reshape( x.transpose((0, 2, 1, 3)), (n_batch, -1, self.h * self.d_k))) - - return self.linear_out(x) # (batch, time1, d_model) + # (batch, time1, d_model) + return self.linear_out(x) def forward(self, query, key, value, mask=None): """Compute scaled dot product attention. @@ -153,3 +153,113 @@ class MultiHeadedAttention(nn.Layer): (0, 1, 3, 2))) / math.sqrt(self.d_k) return self.forward_attention(v, scores, mask) + + +class RelPositionMultiHeadedAttention(MultiHeadedAttention): + """Multi-Head Attention layer with relative position encoding (new implementation). + Details can be found in https://github.com/espnet/espnet/pull/2816. + Paper: https://arxiv.org/abs/1901.02860 + Parameters + ---------- + n_head : int + The number of heads. + n_feat : int + The number of features. + dropout_rate : float + Dropout rate. + zero_triu : bool + Whether to zero the upper triangular part of attention matrix. 
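Notes
----------
With ``pos_bias_u`` / ``pos_bias_v`` (defined below), the attention score
follows the Transformer-XL decomposition from Section 3.3 of the paper
linked above:

    score(i, j) = ((q_i + pos_bias_u) . k_j + (q_i + pos_bias_v) . r_{i-j}) / sqrt(d_k)

where ``r_{i-j}`` is the relative positional embedding produced by
``linear_pos`` and aligned by ``rel_shift``.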
+ """ + + def __init__(self, n_head, n_feat, dropout_rate, zero_triu=False): + """Construct an RelPositionMultiHeadedAttention object.""" + super().__init__(n_head, n_feat, dropout_rate) + self.zero_triu = zero_triu + # linear transformation for positional encoding + self.linear_pos = nn.Linear(n_feat, n_feat, bias_attr=False) + # these two learnable bias are used in matrix c and matrix d + # as described in https://arxiv.org/abs/1901.02860 Section 3.3 + + self.pos_bias_u = paddle.create_parameter( + shape=(self.h, self.d_k), + dtype='float32', + default_initializer=paddle.nn.initializer.XavierUniform()) + self.pos_bias_v = paddle.create_parameter( + shape=(self.h, self.d_k), + dtype='float32', + default_initializer=paddle.nn.initializer.XavierUniform()) + + def rel_shift(self, x): + """Compute relative positional encoding. + Parameters + ---------- + x : paddle.Tensor + Input tensor (batch, head, time1, 2*time1-1). + time1 means the length of query vector. + Returns + ---------- + paddle.Tensor + Output tensor. + """ + b, h, t1, t2 = paddle.shape(x) + zero_pad = paddle.zeros((b, h, t1, 1)) + x_padded = paddle.concat([zero_pad, x], axis=-1) + x_padded = x_padded.reshape([b, h, t2 + 1, t1]) + # only keep the positions from 0 to time2 + x = x_padded[:, :, 1:].reshape([b, h, t1, t2])[:, :, :, :t2 // 2 + 1] + + if self.zero_triu: + ones = paddle.ones((t1, t2)) + x = x * paddle.tril(ones, t2 - 1)[None, None, :, :] + + return x + + def forward(self, query, key, value, pos_emb, mask): + """Compute 'Scaled Dot Product Attention' with rel. positional encoding. + Parameters + ---------- + query : paddle.Tensor + Query tensor (#batch, time1, size). + key : paddle.Tensor + Key tensor (#batch, time2, size). + value : paddle.Tensor + Value tensor (#batch, time2, size). + pos_emb : paddle.Tensor + Positional embedding tensor + (#batch, 2*time1-1, size). + mask : paddle.Tensor + Mask tensor (#batch, 1, time2) or + (#batch, time1, time2). + Returns + ---------- + paddle.Tensor + Output tensor (#batch, time1, d_model). 
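Examples
----------
A shape-only sketch of the relative-shift step used below; it simply
mirrors the ops in ``rel_shift`` to show how the 2*time1-1 relative
positions collapse into a (time1, time1) score matrix:

>>> import paddle
>>> t1 = 4
>>> x = paddle.randn([1, 2, t1, 2 * t1 - 1])    # (batch, head, time1, 2*time1-1)
>>> b, h, _, t2 = x.shape
>>> zero_pad = paddle.zeros([b, h, t1, 1])
>>> x_padded = paddle.concat([zero_pad, x], axis=-1).reshape([b, h, t2 + 1, t1])
>>> out = x_padded[:, :, 1:].reshape([b, h, t1, t2])[:, :, :, :t2 // 2 + 1]
>>> tuple(out.shape)                             # (batch, head, time1, time1)
(1, 2, 4, 4)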
+ """ + q, k, v = self.forward_qkv(query, key, value) + # (batch, time1, head, d_k) + q = q.transpose([0, 2, 1, 3]) + + n_batch_pos = paddle.shape(pos_emb)[0] + p = self.linear_pos(pos_emb).reshape( + [n_batch_pos, -1, self.h, self.d_k]) + # (batch, head, 2*time1-1, d_k) + p = p.transpose([0, 2, 1, 3]) + # (batch, head, time1, d_k) + q_with_bias_u = (q + self.pos_bias_u).transpose([0, 2, 1, 3]) + # (batch, head, time1, d_k) + q_with_bias_v = (q + self.pos_bias_v).transpose([0, 2, 1, 3]) + + # compute attention score + # first compute matrix a and matrix c + # as described in https://arxiv.org/abs/1901.02860 Section 3.3 + # (batch, head, time1, time2) + matrix_ac = paddle.matmul(q_with_bias_u, k.transpose([0, 1, 3, 2])) + + # compute matrix b and matrix d + # (batch, head, time1, 2*time1-1) + matrix_bd = paddle.matmul(q_with_bias_v, p.transpose([0, 1, 3, 2])) + matrix_bd = self.rel_shift(matrix_bd) + # (batch, head, time1, time2) + scores = (matrix_ac + matrix_bd) / math.sqrt(self.d_k) + + return self.forward_attention(v, scores, mask) diff --git a/paddlespeech/t2s/modules/fastspeech2_transformer/decoder.py b/paddlespeech/t2s/modules/transformer/decoder.py similarity index 92% rename from paddlespeech/t2s/modules/fastspeech2_transformer/decoder.py rename to paddlespeech/t2s/modules/transformer/decoder.py index 489fda12bc9d5708418ef2b8e3b96ea264f7101e..fe2949f47dc9a1b837ae6aa4a8294723a7b270b7 100644 --- a/paddlespeech/t2s/modules/fastspeech2_transformer/decoder.py +++ b/paddlespeech/t2s/modules/transformer/decoder.py @@ -23,14 +23,14 @@ import paddle import paddle.nn.functional as F from paddle import nn -from paddlespeech.t2s.modules.fastspeech2_transformer.attention import MultiHeadedAttention -from paddlespeech.t2s.modules.fastspeech2_transformer.decoder_layer import DecoderLayer -from paddlespeech.t2s.modules.fastspeech2_transformer.embedding import PositionalEncoding -from paddlespeech.t2s.modules.fastspeech2_transformer.lightconv import LightweightConvolution -from paddlespeech.t2s.modules.fastspeech2_transformer.mask import subsequent_mask -from paddlespeech.t2s.modules.fastspeech2_transformer.positionwise_feed_forward import PositionwiseFeedForward -from paddlespeech.t2s.modules.fastspeech2_transformer.repeat import repeat from paddlespeech.t2s.modules.layer_norm import LayerNorm +from paddlespeech.t2s.modules.transformer.attention import MultiHeadedAttention +from paddlespeech.t2s.modules.transformer.decoder_layer import DecoderLayer +from paddlespeech.t2s.modules.transformer.embedding import PositionalEncoding +from paddlespeech.t2s.modules.transformer.lightconv import LightweightConvolution +from paddlespeech.t2s.modules.transformer.mask import subsequent_mask +from paddlespeech.t2s.modules.transformer.positionwise_feed_forward import PositionwiseFeedForward +from paddlespeech.t2s.modules.transformer.repeat import repeat class Decoder(nn.Layer): @@ -67,11 +67,11 @@ class Decoder(nn.Layer): Dropout rate in self-attention. src_attention_dropout_rate : float Dropout rate in source-attention. - input_layer : (Union[str, paddle.nn.Layer]) + input_layer : (Union[str, nn.Layer]) Input layer type. use_output_layer : bool Whether to use output layer. - pos_enc_class : paddle.nn.Layer + pos_enc_class : nn.Layer Positional encoding module class. 
`PositionalEncoding `or `ScaledPositionalEncoding` normalize_before : bool @@ -122,8 +122,7 @@ class Decoder(nn.Layer): input_layer, pos_enc_class(attention_dim, positional_dropout_rate)) else: - raise NotImplementedError( - "only `embed` or paddle.nn.Layer is supported.") + raise NotImplementedError("only `embed` or nn.Layer is supported.") self.normalize_before = normalize_before # self-attention module definition diff --git a/paddlespeech/t2s/modules/fastspeech2_transformer/decoder_layer.py b/paddlespeech/t2s/modules/transformer/decoder_layer.py similarity index 98% rename from paddlespeech/t2s/modules/fastspeech2_transformer/decoder_layer.py rename to paddlespeech/t2s/modules/transformer/decoder_layer.py index 0310d83ea3933c185612d53db316457f24fb8020..44978f1e84c6ddd5d8f927838705c0c59f29a630 100644 --- a/paddlespeech/t2s/modules/fastspeech2_transformer/decoder_layer.py +++ b/paddlespeech/t2s/modules/transformer/decoder_layer.py @@ -26,13 +26,13 @@ class DecoderLayer(nn.Layer): ---------- size : int Input dimension. - self_attn : paddle.nn.Layer + self_attn : nn.Layer Self-attention module instance. `MultiHeadedAttention` instance can be used as the argument. - src_attn : paddle.nn.Layer + src_attn : nn.Layer Self-attention module instance. `MultiHeadedAttention` instance can be used as the argument. - feed_forward : paddle.nn.Layer + feed_forward : nn.Layer Feed-forward module instance. `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance can be used as the argument. dropout_rate : float diff --git a/paddlespeech/t2s/modules/fastspeech2_transformer/embedding.py b/paddlespeech/t2s/modules/transformer/embedding.py similarity index 52% rename from paddlespeech/t2s/modules/fastspeech2_transformer/embedding.py rename to paddlespeech/t2s/modules/transformer/embedding.py index f26c9dcbab659f3e3b73c61308d3f513c4f26739..40ab03ee7afa2c5a0eb490bf638884a40ae68c62 100644 --- a/paddlespeech/t2s/modules/fastspeech2_transformer/embedding.py +++ b/paddlespeech/t2s/modules/transformer/embedding.py @@ -43,7 +43,7 @@ class PositionalEncoding(nn.Layer): dtype="float32", reverse=False): """Construct an PositionalEncoding object.""" - super(PositionalEncoding, self).__init__() + super().__init__() self.d_model = d_model self.reverse = reverse self.xscale = math.sqrt(self.d_model) @@ -96,14 +96,14 @@ class ScaledPositionalEncoding(PositionalEncoding): Parameters ---------- - d_model : int - Embedding dimension. - dropout_rate : float - Dropout rate. - max_len : int - Maximum input length. - dtype : str - dtype of param + d_model : int + Embedding dimension. + dropout_rate : float + Dropout rate. + max_len : int + Maximum input length. + dtype : str + dtype of param """ def __init__(self, d_model, dropout_rate, max_len=5000, dtype="float32"): @@ -117,7 +117,7 @@ class ScaledPositionalEncoding(PositionalEncoding): self.alpha = paddle.create_parameter( shape=x.shape, dtype=self.dtype, - default_initializer=paddle.nn.initializer.Assign(x)) + default_initializer=nn.initializer.Assign(x)) def reset_parameters(self): """Reset parameters.""" @@ -128,14 +128,87 @@ class ScaledPositionalEncoding(PositionalEncoding): Parameters ---------- - x : paddle.Tensor - Input tensor (batch, time, `*`). + x : paddle.Tensor + Input tensor (batch, time, `*`). Returns ---------- - paddle.Tensor - Encoded tensor (batch, time, `*`). + paddle.Tensor + Encoded tensor (batch, time, `*`). 
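Notes
----------
The table built by ``extend_pe`` is assumed here to be the standard
Transformer sinusoid table (the construction itself is not shown in this
hunk):

    pe[pos, 2i]     = sin(pos / 10000**(2i / d_model))
    pe[pos, 2i + 1] = cos(pos / 10000**(2i / d_model))

and this scaled variant returns ``dropout(x + alpha * pe[:, :T])`` with a
learnable ``alpha``.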
""" self.extend_pe(x) T = paddle.shape(x)[1] x = x + self.alpha * self.pe[:, :T] return self.dropout(x) + + +class RelPositionalEncoding(nn.Layer): + """Relative positional encoding module (new implementation). + Details can be found in https://github.com/espnet/espnet/pull/2816. + See : Appendix B in https://arxiv.org/abs/1901.02860 + Parameters + ---------- + d_model : int + Embedding dimension. + dropout_rate : float + Dropout rate. + max_len : int + Maximum input length. + """ + + def __init__(self, d_model, dropout_rate, max_len=5000, dtype="float32"): + """Construct an PositionalEncoding object.""" + super().__init__() + self.d_model = d_model + self.xscale = math.sqrt(self.d_model) + self.dropout = nn.Dropout(p=dropout_rate) + self.pe = None + self.dtype = dtype + self.extend_pe(paddle.expand(paddle.zeros([1]), (1, max_len))) + + def extend_pe(self, x): + """Reset the positional encodings.""" + if self.pe is not None: + # self.pe contains both positive and negative parts + # the length of self.pe is 2 * input_len - 1 + if paddle.shape(self.pe)[1] >= paddle.shape(x)[1] * 2 - 1: + return + # Suppose `i` means to the position of query vecotr and `j` means the + # position of key vector. We use position relative positions when keys + # are to the left (i>j) and negative relative positions otherwise (i x + linear(concat(x, att(x))) + if False, no additional linear will be applied. i.e. x -> x + att(x) + positionwise_layer_type : str + "linear", "conv1d", or "conv1d-linear". + positionwise_conv_kernel_size : int + Kernel size of positionwise conv1d layer. + macaron_style : bool + Whether to use macaron style for positionwise layer. + pos_enc_layer_type : str + Encoder positional encoding layer type. + selfattention_layer_type : str + Encoder attention layer type. + activation_type : str + Encoder activation function type. + use_cnn_module : bool + Whether to use convolution module. + zero_triu : bool + Whether to zero the upper triangular part of attention matrix. + cnn_module_kernel : int + Kernerl size of convolution module. + padding_idx : int + Padding idx for input_layer=embed. + stochastic_depth_rate : float + Maximum probability to skip the encoder layer. + intermediate_layers : Union[List[int], None] + indices of intermediate CTC layer. + indices start from 1. + if not None, intermediate outputs are returned (which changes return type + signature.) + encoder_type: str + "transformer", or "conformer". 
+ """ + + def __init__(self, + idim: int, + attention_dim: int=256, + attention_heads: int=4, + linear_units: int=2048, + num_blocks: int=6, + dropout_rate: float=0.1, + positional_dropout_rate: float=0.1, + attention_dropout_rate: float=0.0, + input_layer: str="conv2d", + normalize_before: bool=True, + concat_after: bool=False, + positionwise_layer_type: str="linear", + positionwise_conv_kernel_size: int=1, + macaron_style: bool=False, + pos_enc_layer_type: str="abs_pos", + selfattention_layer_type: str="selfattn", + activation_type: str="swish", + use_cnn_module: bool=False, + zero_triu: bool=False, + cnn_module_kernel: int=31, + padding_idx: int=-1, + stochastic_depth_rate: float=0.0, + intermediate_layers: Union[List[int], None]=None, + encoder_type: str="transformer"): + """Construct an Base Encoder object.""" + super().__init__() + activation = get_activation(activation_type) + pos_enc_class = self.get_pos_enc_class(pos_enc_layer_type, + selfattention_layer_type) + self.encoder_type = encoder_type + + self.conv_subsampling_factor = 1 + self.embed = self.get_embed( + idim=idim, + input_layer=input_layer, + attention_dim=attention_dim, + pos_enc_class=pos_enc_class, + dropout_rate=dropout_rate, + positional_dropout_rate=positional_dropout_rate, + padding_idx=padding_idx) + + self.normalize_before = normalize_before + + # self-attention module definition + encoder_selfattn_layer, encoder_selfattn_layer_args = self.get_encoder_selfattn_layer( + selfattention_layer_type=selfattention_layer_type, + attention_heads=attention_heads, + attention_dim=attention_dim, + attention_dropout_rate=attention_dropout_rate, + zero_triu=zero_triu, + pos_enc_layer_type=pos_enc_layer_type) + # feed-forward module definition + positionwise_layer, positionwise_layer_args = self.get_positionwise_layer( + positionwise_layer_type, attention_dim, linear_units, dropout_rate, + positionwise_conv_kernel_size, activation) + + # convolution module definition + convolution_layer = ConvolutionModule + convolution_layer_args = (attention_dim, cnn_module_kernel, activation) + + if self.encoder_type == "transformer": + self.encoders = repeat( + num_blocks, + lambda lnum: EncoderLayer( + attention_dim, + encoder_selfattn_layer(*encoder_selfattn_layer_args), + positionwise_layer(*positionwise_layer_args), + dropout_rate, + normalize_before, + concat_after, ), ) + + elif self.encoder_type == "conformer": + self.encoders = repeat( + num_blocks, + lambda lnum: ConformerEncoderLayer( + attention_dim, + encoder_selfattn_layer(*encoder_selfattn_layer_args), + positionwise_layer(*positionwise_layer_args), + positionwise_layer(*positionwise_layer_args) if macaron_style else None, + convolution_layer(*convolution_layer_args) if use_cnn_module else None, + dropout_rate, + normalize_before, + concat_after, + stochastic_depth_rate * float(1 + lnum) / num_blocks, ), ) + self.intermediate_layers = intermediate_layers + else: + raise NotImplementedError("Support only linear or conv1d.") + + if self.normalize_before: + self.after_norm = LayerNorm(attention_dim) + + def get_positionwise_layer(self, + positionwise_layer_type: str="linear", + attention_dim: int=256, + linear_units: int=2048, + dropout_rate: float=0.1, + positionwise_conv_kernel_size: int=1, + activation: nn.Layer=nn.ReLU()): + """Define positionwise layer.""" + if positionwise_layer_type == "linear": + positionwise_layer = PositionwiseFeedForward + positionwise_layer_args = (attention_dim, linear_units, + dropout_rate, activation) + elif positionwise_layer_type == "conv1d": + 
positionwise_layer = MultiLayeredConv1d + positionwise_layer_args = (attention_dim, linear_units, + positionwise_conv_kernel_size, + dropout_rate, ) + elif positionwise_layer_type == "conv1d-linear": + positionwise_layer = Conv1dLinear + positionwise_layer_args = (attention_dim, linear_units, + positionwise_conv_kernel_size, + dropout_rate, ) + else: + raise NotImplementedError("Support only linear or conv1d.") + return positionwise_layer, positionwise_layer_args + + def get_encoder_selfattn_layer(self, + selfattention_layer_type: str="selfattn", + attention_heads: int=4, + attention_dim: int=256, + attention_dropout_rate: float=0.0, + zero_triu: bool=False, + pos_enc_layer_type: str="abs_pos"): + if selfattention_layer_type == "selfattn": + encoder_selfattn_layer = MultiHeadedAttention + encoder_selfattn_layer_args = (attention_heads, attention_dim, + attention_dropout_rate, ) + elif selfattention_layer_type == "rel_selfattn": + assert pos_enc_layer_type == "rel_pos" + encoder_selfattn_layer = RelPositionMultiHeadedAttention + encoder_selfattn_layer_args = (attention_heads, attention_dim, + attention_dropout_rate, zero_triu, ) + else: + raise ValueError("unknown encoder_attn_layer: " + + selfattention_layer_type) + return encoder_selfattn_layer, encoder_selfattn_layer_args + + def get_pos_enc_class(self, + pos_enc_layer_type: str="abs_pos", + selfattention_layer_type: str="selfattn"): + if pos_enc_layer_type == "abs_pos": + pos_enc_class = PositionalEncoding + elif pos_enc_layer_type == "scaled_abs_pos": + pos_enc_class = ScaledPositionalEncoding + elif pos_enc_layer_type == "rel_pos": + assert selfattention_layer_type == "rel_selfattn" + pos_enc_class = RelPositionalEncoding + else: + raise ValueError("unknown pos_enc_layer: " + pos_enc_layer_type) + return pos_enc_class + + def get_embed(self, + idim, + input_layer="conv2d", + attention_dim: int=256, + pos_enc_class=PositionalEncoding, + dropout_rate: int=0.1, + positional_dropout_rate: int=0.1, + padding_idx: int=-1): + + if input_layer == "linear": + embed = nn.Sequential( + nn.Linear(idim, attention_dim), + nn.LayerNorm(attention_dim), + nn.Dropout(dropout_rate), + nn.ReLU(), + pos_enc_class(attention_dim, positional_dropout_rate), ) + elif input_layer == "conv2d": + embed = Conv2dSubsampling( + idim, + attention_dim, + dropout_rate, + pos_enc_class(attention_dim, positional_dropout_rate), ) + self.conv_subsampling_factor = 4 + elif input_layer == "embed": + embed = nn.Sequential( + nn.Embedding(idim, attention_dim, padding_idx=padding_idx), + pos_enc_class(attention_dim, positional_dropout_rate), ) + elif isinstance(input_layer, nn.Layer): + embed = nn.Sequential( + input_layer, + pos_enc_class(attention_dim, positional_dropout_rate), ) + elif input_layer is None: + embed = nn.Sequential( + pos_enc_class(attention_dim, positional_dropout_rate)) + else: + raise ValueError("unknown input_layer: " + input_layer) + + return embed + + def forward(self, xs, masks): + """Encode input sequence. + + Parameters + ---------- + xs : paddle.Tensor + Input tensor (#batch, time, idim). + masks : paddle.Tensor + Mask tensor (#batch, 1, time). + + Returns + ---------- + paddle.Tensor + Output tensor (#batch, time, attention_dim). + paddle.Tensor + Mask tensor (#batch, 1, time). + """ + xs = self.embed(xs) + xs, masks = self.encoders(xs, masks) + if self.normalize_before: + xs = self.after_norm(xs) + return xs, masks + + +class TransformerEncoder(BaseEncoder): + """Transformer encoder module. + Parameters + ---------- + idim : int + Input dimension. 
+ attention_dim : int + Dimention of attention. + attention_heads : int + The number of heads of multi head attention. + linear_units : int + The number of units of position-wise feed forward. + num_blocks : int + The number of decoder blocks. + dropout_rate : float + Dropout rate. + positional_dropout_rate : float + Dropout rate after adding positional encoding. + attention_dropout_rate : float + Dropout rate in attention. + input_layer : Union[str, paddle.nn.Layer] + Input layer type. + pos_enc_layer_type : str + Encoder positional encoding layer type. + normalize_before : bool + Whether to use layer_norm before the first block. + concat_after : bool + Whether to concat attention layer's input and output. + if True, additional linear will be applied. + i.e. x -> x + linear(concat(x, att(x))) + if False, no additional linear will be applied. i.e. x -> x + att(x) + positionwise_layer_type : str + "linear", "conv1d", or "conv1d-linear". + positionwise_conv_kernel_size : int + Kernel size of positionwise conv1d layer. + selfattention_layer_type : str + Encoder attention layer type. + activation_type : str + Encoder activation function type. + padding_idx : int + Padding idx for input_layer=embed. + """ + + def __init__( + self, + idim, + attention_dim: int=256, + attention_heads: int=4, + linear_units: int=2048, + num_blocks: int=6, + dropout_rate: float=0.1, + positional_dropout_rate: float=0.1, + attention_dropout_rate: float=0.0, + input_layer: str="conv2d", + pos_enc_layer_type: str="abs_pos", + normalize_before: bool=True, + concat_after: bool=False, + positionwise_layer_type: str="linear", + positionwise_conv_kernel_size: int=1, + selfattention_layer_type: str="selfattn", + activation_type: str="relu", + padding_idx: int=-1, ): + """Construct an Transformer Encoder object.""" + super().__init__( + idim, + attention_dim=attention_dim, + attention_heads=attention_heads, + linear_units=linear_units, + num_blocks=num_blocks, + dropout_rate=dropout_rate, + positional_dropout_rate=positional_dropout_rate, + attention_dropout_rate=attention_dropout_rate, + input_layer=input_layer, + pos_enc_layer_type=pos_enc_layer_type, + normalize_before=normalize_before, + concat_after=concat_after, + positionwise_layer_type=positionwise_layer_type, + positionwise_conv_kernel_size=positionwise_conv_kernel_size, + selfattention_layer_type=selfattention_layer_type, + activation_type=activation_type, + padding_idx=padding_idx, + encoder_type="transformer") + + def forward(self, xs, masks): + """Encode input sequence. + + Parameters + ---------- + xs : paddle.Tensor + Input tensor (#batch, time, idim). + masks : paddle.Tensor + Mask tensor (#batch, 1, time). + + Returns + ---------- + paddle.Tensor + Output tensor (#batch, time, attention_dim). + paddle.Tensor + Mask tensor (#batch, 1, time). + """ + xs = self.embed(xs) + xs, masks = self.encoders(xs, masks) + if self.normalize_before: + xs = self.after_norm(xs) + return xs, masks + + def forward_one_step(self, xs, masks, cache=None): + """Encode input frame. + + Parameters + ---------- + xs : paddle.Tensor + Input tensor. + masks : paddle.Tensor + Mask tensor. + cache : List[paddle.Tensor] + List of cache tensors. + + Returns + ---------- + paddle.Tensor + Output tensor. + paddle.Tensor + Mask tensor. + List[paddle.Tensor] + List of new cache tensors. 
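Examples
----------
A minimal construction/usage sketch for this encoder (the import path
follows the new ``paddlespeech.t2s.modules.transformer`` layout and is an
assumption; the token ids are synthetic):

>>> import paddle
>>> from paddlespeech.t2s.modules.transformer.encoder import TransformerEncoder
>>> enc = TransformerEncoder(idim=100, attention_dim=256, input_layer="embed")
>>> token_ids = paddle.randint(0, 100, [2, 30])    # (batch, time)
>>> masks = paddle.ones([2, 1, 30], dtype="bool")
>>> hs, hs_masks = enc(token_ids, masks)           # hs: (batch, time, 256)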
+ """ + + xs = self.embed(xs) + if cache is None: + cache = [None for _ in range(len(self.encoders))] + new_cache = [] + for c, e in zip(cache, self.encoders): + xs, masks = e(xs, masks, cache=c) + new_cache.append(xs) + if self.normalize_before: + xs = self.after_norm(xs) + return xs, masks, new_cache + + +class ConformerEncoder(BaseEncoder): + """Conformer encoder module. + Parameters + ---------- + idim : int + Input dimension. + attention_dim : int + Dimention of attention. + attention_heads : int + The number of heads of multi head attention. + linear_units : int + The number of units of position-wise feed forward. + num_blocks : int + The number of decoder blocks. + dropout_rate : float + Dropout rate. + positional_dropout_rate : float + Dropout rate after adding positional encoding. + attention_dropout_rate : float + Dropout rate in attention. + input_layer : Union[str, nn.Layer] + Input layer type. + normalize_before : bool + Whether to use layer_norm before the first block. + concat_after : bool + Whether to concat attention layer's input and output. + if True, additional linear will be applied. + i.e. x -> x + linear(concat(x, att(x))) + if False, no additional linear will be applied. i.e. x -> x + att(x) + positionwise_layer_type : str + "linear", "conv1d", or "conv1d-linear". + positionwise_conv_kernel_size : int + Kernel size of positionwise conv1d layer. + macaron_style : bool + Whether to use macaron style for positionwise layer. + pos_enc_layer_type : str + Encoder positional encoding layer type. + selfattention_layer_type : str + Encoder attention layer type. + activation_type : str + Encoder activation function type. + use_cnn_module : bool + Whether to use convolution module. + zero_triu : bool + Whether to zero the upper triangular part of attention matrix. + cnn_module_kernel : int + Kernerl size of convolution module. + padding_idx : int + Padding idx for input_layer=embed. + stochastic_depth_rate : float + Maximum probability to skip the encoder layer. + intermediate_layers : Union[List[int], None] + indices of intermediate CTC layer. + indices start from 1. + if not None, intermediate outputs are returned (which changes return type + signature.) 
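Examples
----------
A minimal construction/usage sketch with conformer-specific options
enabled (the import path is the same assumption as above; shapes are
synthetic):

>>> import paddle
>>> from paddlespeech.t2s.modules.transformer.encoder import ConformerEncoder
>>> enc = ConformerEncoder(
...     idim=80, attention_dim=256, num_blocks=4,
...     use_cnn_module=True, macaron_style=True)
>>> xs = paddle.randn([2, 100, 80])                # (batch, time, idim)
>>> masks = paddle.ones([2, 1, 100], dtype="bool")
>>> hs, hs_masks = enc(xs, masks)                  # time is subsampled by ~4 (conv2d input layer)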
+ """ + + def __init__( + self, + idim: int, + attention_dim: int=256, + attention_heads: int=4, + linear_units: int=2048, + num_blocks: int=6, + dropout_rate: float=0.1, + positional_dropout_rate: float=0.1, + attention_dropout_rate: float=0.0, + input_layer: str="conv2d", + normalize_before: bool=True, + concat_after: bool=False, + positionwise_layer_type: str="linear", + positionwise_conv_kernel_size: int=1, + macaron_style: bool=False, + pos_enc_layer_type: str="rel_pos", + selfattention_layer_type: str="rel_selfattn", + activation_type: str="swish", + use_cnn_module: bool=False, + zero_triu: bool=False, + cnn_module_kernel: int=31, + padding_idx: int=-1, + stochastic_depth_rate: float=0.0, + intermediate_layers: Union[List[int], None]=None, ): + """Construct an Conformer Encoder object.""" + super().__init__( + idim=idim, + attention_dim=attention_dim, + attention_heads=attention_heads, + linear_units=linear_units, + num_blocks=num_blocks, + dropout_rate=dropout_rate, + positional_dropout_rate=positional_dropout_rate, + attention_dropout_rate=attention_dropout_rate, + input_layer=input_layer, + normalize_before=normalize_before, + concat_after=concat_after, + positionwise_layer_type=positionwise_layer_type, + positionwise_conv_kernel_size=positionwise_conv_kernel_size, + macaron_style=macaron_style, + pos_enc_layer_type=pos_enc_layer_type, + selfattention_layer_type=selfattention_layer_type, + activation_type=activation_type, + use_cnn_module=use_cnn_module, + zero_triu=zero_triu, + cnn_module_kernel=cnn_module_kernel, + padding_idx=padding_idx, + stochastic_depth_rate=stochastic_depth_rate, + intermediate_layers=intermediate_layers, + encoder_type="conformer") + + def forward(self, xs, masks): + """Encode input sequence. + Parameters + ---------- + xs : paddle.Tensor + Input tensor (#batch, time, idim). + masks : paddle.Tensor + Mask tensor (#batch, 1, time). + Returns + ---------- + paddle.Tensor + Output tensor (#batch, time, attention_dim). + paddle.Tensor + Mask tensor (#batch, 1, time). + """ + if isinstance(self.embed, (Conv2dSubsampling)): + xs, masks = self.embed(xs, masks) + else: + xs = self.embed(xs) + + if self.intermediate_layers is None: + xs, masks = self.encoders(xs, masks) + else: + intermediate_outputs = [] + for layer_idx, encoder_layer in enumerate(self.encoders): + xs, masks = encoder_layer(xs, masks) + + if (self.intermediate_layers is not None and + layer_idx + 1 in self.intermediate_layers): + # intermediate branches also require normalization. 
+ encoder_output = xs + if isinstance(encoder_output, tuple): + encoder_output = encoder_output[0] + if self.normalize_before: + encoder_output = self.after_norm(encoder_output) + intermediate_outputs.append(encoder_output) + + if isinstance(xs, tuple): + xs = xs[0] + + if self.normalize_before: + xs = self.after_norm(xs) + + if self.intermediate_layers is not None: + return xs, masks, intermediate_outputs + return xs, masks diff --git a/paddlespeech/t2s/modules/fastspeech2_transformer/encoder_layer.py b/paddlespeech/t2s/modules/transformer/encoder_layer.py similarity index 97% rename from paddlespeech/t2s/modules/fastspeech2_transformer/encoder_layer.py rename to paddlespeech/t2s/modules/transformer/encoder_layer.py index fb2c2e823a327ab0a531076df00d19f4ae9faa7b..f55ded3de65bdc644b5eab63354ccd3a120edd51 100644 --- a/paddlespeech/t2s/modules/fastspeech2_transformer/encoder_layer.py +++ b/paddlespeech/t2s/modules/transformer/encoder_layer.py @@ -24,10 +24,10 @@ class EncoderLayer(nn.Layer): ---------- size : int Input dimension. - self_attn : paddle.nn.Layer + self_attn : nn.Layer Self-attention module instance. `MultiHeadedAttention` instance can be used as the argument. - feed_forward : paddle.nn.Layer + feed_forward : nn.Layer Feed-forward module instance. `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance can be used as the argument. dropout_rate : float @@ -50,7 +50,7 @@ class EncoderLayer(nn.Layer): normalize_before=True, concat_after=False, ): """Construct an EncoderLayer object.""" - super(EncoderLayer, self).__init__() + super().__init__() self.self_attn = self_attn self.feed_forward = feed_forward self.norm1 = nn.LayerNorm(size) diff --git a/paddlespeech/t2s/modules/fastspeech2_transformer/lightconv.py b/paddlespeech/t2s/modules/transformer/lightconv.py similarity index 97% rename from paddlespeech/t2s/modules/fastspeech2_transformer/lightconv.py rename to paddlespeech/t2s/modules/transformer/lightconv.py index 1aeb6d6e19a8f7b0e0820829a3ed7f496693fd28..ccf84c8a3364ea872fd589091c6a7ef28f4ac928 100644 --- a/paddlespeech/t2s/modules/fastspeech2_transformer/lightconv.py +++ b/paddlespeech/t2s/modules/transformer/lightconv.py @@ -18,7 +18,7 @@ import paddle import paddle.nn.functional as F from paddle import nn -from paddlespeech.t2s.modules.glu import GLU +from paddlespeech.t2s.modules.activation import get_activation from paddlespeech.t2s.modules.masked_fill import masked_fill MIN_VALUE = float(numpy.finfo(numpy.float32).min) @@ -56,7 +56,7 @@ class LightweightConvolution(nn.Layer): use_kernel_mask=False, use_bias=False, ): """Construct Lightweight Convolution layer.""" - super(LightweightConvolution, self).__init__() + super().__init__() assert n_feat % wshare == 0 self.wshare = wshare @@ -68,7 +68,7 @@ class LightweightConvolution(nn.Layer): # linear -> GLU -> lightconv -> linear self.linear1 = nn.Linear(n_feat, n_feat * 2) self.linear2 = nn.Linear(n_feat, n_feat) - self.act = GLU() + self.act = get_activation("glu") # lightconv related self.uniform_ = nn.initializer.Uniform() diff --git a/paddlespeech/t2s/modules/fastspeech2_transformer/mask.py b/paddlespeech/t2s/modules/transformer/mask.py similarity index 100% rename from paddlespeech/t2s/modules/fastspeech2_transformer/mask.py rename to paddlespeech/t2s/modules/transformer/mask.py diff --git a/paddlespeech/t2s/modules/fastspeech2_transformer/multi_layer_conv.py b/paddlespeech/t2s/modules/transformer/multi_layer_conv.py similarity index 85% rename from 
paddlespeech/t2s/modules/fastspeech2_transformer/multi_layer_conv.py rename to paddlespeech/t2s/modules/transformer/multi_layer_conv.py index 8845b2a2b07f161a85b2fec254e9c4957e4614b4..df8929e306b0743899ec712ea470c792838f355c 100644 --- a/paddlespeech/t2s/modules/fastspeech2_transformer/multi_layer_conv.py +++ b/paddlespeech/t2s/modules/transformer/multi_layer_conv.py @@ -12,10 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. """Layer modules for FFT block in FastSpeech (Feed-forward Transformer).""" -import paddle +from paddle import nn -class MultiLayeredConv1d(paddle.nn.Layer): +class MultiLayeredConv1d(nn.Layer): """Multi-layered conv1d for Transformer block. This is a module of multi-leyered conv1d designed @@ -43,21 +43,21 @@ class MultiLayeredConv1d(paddle.nn.Layer): Dropout rate. """ - super(MultiLayeredConv1d, self).__init__() - self.w_1 = paddle.nn.Conv1D( + super().__init__() + self.w_1 = nn.Conv1D( in_chans, hidden_chans, kernel_size, stride=1, padding=(kernel_size - 1) // 2, ) - self.w_2 = paddle.nn.Conv1D( + self.w_2 = nn.Conv1D( hidden_chans, in_chans, kernel_size, stride=1, padding=(kernel_size - 1) // 2, ) - self.dropout = paddle.nn.Dropout(dropout_rate) - self.relu = paddle.nn.ReLU() + self.dropout = nn.Dropout(dropout_rate) + self.relu = nn.ReLU() def forward(self, x): """Calculate forward propagation. @@ -77,7 +77,7 @@ class MultiLayeredConv1d(paddle.nn.Layer): [0, 2, 1]) -class Conv1dLinear(paddle.nn.Layer): +class Conv1dLinear(nn.Layer): """Conv1D + Linear for Transformer block. A variant of MultiLayeredConv1d, which replaces second conv-layer to linear. @@ -98,16 +98,16 @@ class Conv1dLinear(paddle.nn.Layer): dropout_rate : float Dropout rate. """ - super(Conv1dLinear, self).__init__() - self.w_1 = paddle.nn.Conv1D( + super().__init__() + self.w_1 = nn.Conv1D( in_chans, hidden_chans, kernel_size, stride=1, padding=(kernel_size - 1) // 2, ) - self.w_2 = paddle.nn.Linear(hidden_chans, in_chans, bias_attr=True) - self.dropout = paddle.nn.Dropout(dropout_rate) - self.relu = paddle.nn.ReLU() + self.w_2 = nn.Linear(hidden_chans, in_chans, bias_attr=True) + self.dropout = nn.Dropout(dropout_rate) + self.relu = nn.ReLU() def forward(self, x): """Calculate forward propagation. diff --git a/paddlespeech/t2s/modules/fastspeech2_transformer/positionwise_feed_forward.py b/paddlespeech/t2s/modules/transformer/positionwise_feed_forward.py similarity index 93% rename from paddlespeech/t2s/modules/fastspeech2_transformer/positionwise_feed_forward.py rename to paddlespeech/t2s/modules/transformer/positionwise_feed_forward.py index 297a3b4fbb78bc121e7102f85a95272d4f91c8ef..28ed1c31b10fa0c499baf11ec55baf6bf93fff26 100644 --- a/paddlespeech/t2s/modules/fastspeech2_transformer/positionwise_feed_forward.py +++ b/paddlespeech/t2s/modules/transformer/positionwise_feed_forward.py @@ -14,9 +14,10 @@ # Modified from espnet(https://github.com/espnet/espnet) """Positionwise feed forward layer definition.""" import paddle +from paddle import nn -class PositionwiseFeedForward(paddle.nn.Layer): +class PositionwiseFeedForward(nn.Layer): """Positionwise feed forward layer. 
Parameters @@ -35,7 +36,7 @@ class PositionwiseFeedForward(paddle.nn.Layer): dropout_rate, activation=paddle.nn.ReLU()): """Construct an PositionwiseFeedForward object.""" - super(PositionwiseFeedForward, self).__init__() + super().__init__() self.w_1 = paddle.nn.Linear(idim, hidden_units, bias_attr=True) self.w_2 = paddle.nn.Linear(hidden_units, idim, bias_attr=True) self.dropout = paddle.nn.Dropout(dropout_rate) diff --git a/paddlespeech/t2s/modules/fastspeech2_transformer/repeat.py b/paddlespeech/t2s/modules/transformer/repeat.py similarity index 100% rename from paddlespeech/t2s/modules/fastspeech2_transformer/repeat.py rename to paddlespeech/t2s/modules/transformer/repeat.py diff --git a/paddlespeech/t2s/modules/transformer/subsampling.py b/paddlespeech/t2s/modules/transformer/subsampling.py new file mode 100644 index 0000000000000000000000000000000000000000..cf0fca8a1b0115f60445f52d164d822533873359 --- /dev/null +++ b/paddlespeech/t2s/modules/transformer/subsampling.py @@ -0,0 +1,83 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from espnet(https://github.com/espnet/espnet) +"""Subsampling layer definition.""" +import paddle +from paddle import nn + +from paddlespeech.t2s.modules.transformer.embedding import PositionalEncoding + + +class Conv2dSubsampling(nn.Layer): + """Convolutional 2D subsampling (to 1/4 length). + Parameters + ---------- + idim : int + Input dimension. + odim : int + Output dimension. + dropout_rate : float + Dropout rate. + pos_enc : nn.Layer + Custom position encoding layer. + """ + + def __init__(self, idim, odim, dropout_rate, pos_enc=None): + """Construct an Conv2dSubsampling object.""" + super().__init__() + self.conv = nn.Sequential( + nn.Conv2D(1, odim, 3, 2), + nn.ReLU(), + nn.Conv2D(odim, odim, 3, 2), + nn.ReLU(), ) + self.out = nn.Sequential( + nn.Linear(odim * (((idim - 1) // 2 - 1) // 2), odim), + pos_enc if pos_enc is not None else + PositionalEncoding(odim, dropout_rate), ) + + def forward(self, x, x_mask): + """Subsample x. + Parameters + ---------- + x : paddle.Tensor + Input tensor (#batch, time, idim). + x_mask : paddle.Tensor + Input mask (#batch, 1, time). + Returns + ---------- + paddle.Tensor + Subsampled tensor (#batch, time', odim), + where time' = time // 4. + paddle.Tensor + Subsampled mask (#batch, 1, time'), + where time' = time // 4. + """ + # (b, c, t, f) + x = x.unsqueeze(1) + x = self.conv(x) + b, c, t, f = paddle.shape(x) + x = self.out(x.transpose([0, 2, 1, 3]).reshape([b, t, c * f])) + if x_mask is None: + return x, None + return x, x_mask[:, :, :-2:2][:, :, :-2:2] + + def __getitem__(self, key): + """Get item. + When reset_parameters() is called, if use_scaled_pos_enc is used, + return the positioning encoding. 
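Examples
----------
A shape-only sketch of the subsampling this layer performs (it assumes the
class above is in scope; the numbers just follow the two stride-2 convs):

>>> import paddle
>>> sub = Conv2dSubsampling(idim=80, odim=256, dropout_rate=0.1)
>>> x = paddle.randn([2, 100, 80])                 # (batch, time, idim)
>>> mask = paddle.ones([2, 1, 100], dtype="bool")
>>> y, y_mask = sub(x, mask)
>>> # time' = ((100 - 1) // 2 - 1) // 2 = 24, so y is (2, 24, 256)
>>> pos_enc_layer = sub[-1]                        # __getitem__(-1) returns the positional-encoding layer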
+ """ + if key != -1: + raise NotImplementedError( + "Support only `-1` (for `reset_parameters`).") + return self.out[key] diff --git a/paddlespeech/t2s/training/cli.py b/paddlespeech/t2s/training/cli.py index a0710fd7253c080fa9813f4aa49f5eb296b5f36d..83dae11776e39f224ad6c02d46920bc21fdc9d7c 100644 --- a/paddlespeech/t2s/training/cli.py +++ b/paddlespeech/t2s/training/cli.py @@ -30,7 +30,7 @@ def default_argument_parser(): The ``--checkpoint_path`` specifies the checkpoint to load from. - The ``--device`` and ``--nprocs`` specifies how to run the training. + The ``--ngpu`` specifies how to run the training. See Also -------- diff --git a/paddlespeech/t2s/training/experiment.py b/paddlespeech/t2s/training/experiment.py index c9e7f4cc0a3fd9f661edc0c96be37d7d41a3a80e..de36db24b952b64d26944ee985fdeb2891646008 100644 --- a/paddlespeech/t2s/training/experiment.py +++ b/paddlespeech/t2s/training/experiment.py @@ -82,8 +82,8 @@ class ExperimentBase(object): >>> config.merge_from_list(args.opts) >>> config.freeze() >>> - >>> if args.nprocs > 1 and args.device == "gpu": - >>> dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs) + >>> if args.ngpu > 1: + >>> dist.spawn(main_sp, args=(config, args), nprocs=args.ngpu) >>> else: >>> main_sp(config, args) """ diff --git a/paddlespeech/t2s/training/optimizer.py b/paddlespeech/t2s/training/optimizer.py index c6a6944d10758e2c1d0e6fae79ae1db995b0ac32..907e3dafafb2ea2d78919f9d5042d7b8d402872c 100644 --- a/paddlespeech/t2s/training/optimizer.py +++ b/paddlespeech/t2s/training/optimizer.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. import paddle +from paddle import nn optim_classes = dict( adadelta=paddle.optimizer.Adadelta, @@ -25,7 +26,7 @@ optim_classes = dict( sgd=paddle.optimizer.SGD, ) -def build_optimizers(model: paddle.nn.Layer, +def build_optimizers(model: nn.Layer, optim='adadelta', max_grad_norm=None, learning_rate=0.01) -> paddle.optimizer: diff --git a/paddlespeech/text/examples/punctuation_restoration/chinese/local/test.sh b/paddlespeech/text/examples/punctuation_restoration/chinese/local/test.sh index 6db75ca2ad347f292443b1baf980430df00dd568..ee02246222b7ed6c7e780cde443abf537da7f14c 100644 --- a/paddlespeech/text/examples/punctuation_restoration/chinese/local/test.sh +++ b/paddlespeech/text/examples/punctuation_restoration/chinese/local/test.sh @@ -9,17 +9,11 @@ fi ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." -device=gpu -if [ ${ngpu} == 0 ];then - device=cpu -fi config_path=$1 ckpt_prefix=$2 - python3 -u ${BIN_DIR}/test.py \ ---device ${device} \ ---nproc 1 \ +--ngpu 1 \ --config ${config_path} \ --result_file ${ckpt_prefix}.rsl \ --checkpoint_path ${ckpt_prefix} diff --git a/paddlespeech/text/examples/punctuation_restoration/chinese/local/train.sh b/paddlespeech/text/examples/punctuation_restoration/chinese/local/train.sh index f6bd2c98359b67292f9654c69ed11fc1a6720046..fc345cc1a04c3842f090f8804fa936bae7a82e4a 100644 --- a/paddlespeech/text/examples/punctuation_restoration/chinese/local/train.sh +++ b/paddlespeech/text/examples/punctuation_restoration/chinese/local/train.sh @@ -11,16 +11,10 @@ echo "using $ngpu gpus..." 
config_path=$1 ckpt_name=$2 -device=gpu -if [ ${ngpu} == 0 ];then - device=cpu -fi - mkdir -p exp python3 -u ${BIN_DIR}/train.py \ ---device ${device} \ ---nproc ${ngpu} \ +--ngpu ${ngpu} \ --config ${config_path} \ --output exp/${ckpt_name} diff --git a/paddlespeech/text/examples/punctuation_restoration/english/local/test.sh b/paddlespeech/text/examples/punctuation_restoration/english/local/test.sh index 6db75ca2ad347f292443b1baf980430df00dd568..d8a58f34c7082665c626790e065b5c11b4bf234a 100644 --- a/paddlespeech/text/examples/punctuation_restoration/english/local/test.sh +++ b/paddlespeech/text/examples/punctuation_restoration/english/local/test.sh @@ -9,17 +9,12 @@ fi ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." -device=gpu -if [ ${ngpu} == 0 ];then - device=cpu -fi config_path=$1 ckpt_prefix=$2 python3 -u ${BIN_DIR}/test.py \ ---device ${device} \ ---nproc 1 \ +--ngpu 1 \ --config ${config_path} \ --result_file ${ckpt_prefix}.rsl \ --checkpoint_path ${ckpt_prefix} diff --git a/paddlespeech/text/examples/punctuation_restoration/english/local/train.sh b/paddlespeech/text/examples/punctuation_restoration/english/local/train.sh index f6bd2c98359b67292f9654c69ed11fc1a6720046..fc345cc1a04c3842f090f8804fa936bae7a82e4a 100644 --- a/paddlespeech/text/examples/punctuation_restoration/english/local/train.sh +++ b/paddlespeech/text/examples/punctuation_restoration/english/local/train.sh @@ -11,16 +11,10 @@ echo "using $ngpu gpus..." config_path=$1 ckpt_name=$2 -device=gpu -if [ ${ngpu} == 0 ];then - device=cpu -fi - mkdir -p exp python3 -u ${BIN_DIR}/train.py \ ---device ${device} \ ---nproc ${ngpu} \ +--ngpu ${ngpu} \ --config ${config_path} \ --output exp/${ckpt_name} diff --git a/paddlespeech/text/speechtask/punctuation_restoration/bin/train.py b/paddlespeech/text/speechtask/punctuation_restoration/bin/train.py index 1ffd79b7b192ce358d66104c1198212205a53040..c4b67265e47910a814654caa71469392b6649c7f 100644 --- a/paddlespeech/text/speechtask/punctuation_restoration/bin/train.py +++ b/paddlespeech/text/speechtask/punctuation_restoration/bin/train.py @@ -26,8 +26,8 @@ def main_sp(config, args): def main(config, args): - if args.device == "gpu" and args.nprocs > 1: - dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs) + if args.ngpu > 1: + dist.spawn(main_sp, args=(config, args), nprocs=args.ngpu) else: main_sp(config, args) diff --git a/paddlespeech/text/speechtask/punctuation_restoration/training/trainer.py b/paddlespeech/text/speechtask/punctuation_restoration/training/trainer.py index 2dce88a3f5d4f5916835654f5642cf86b8999eaf..d6b6eeb65ea338029b9a58d2dda1d3824f093866 100644 --- a/paddlespeech/text/speechtask/punctuation_restoration/training/trainer.py +++ b/paddlespeech/text/speechtask/punctuation_restoration/training/trainer.py @@ -106,8 +106,8 @@ class Trainer(): >>> config.merge_from_list(args.opts) >>> config.freeze() >>> - >>> if args.nprocs > 1 and args.device == "gpu": - >>> dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs) + >>> if args.ngpu > 1: + >>> dist.spawn(main_sp, args=(config, args), nprocs=args.ngpu) >>> else: >>> main_sp(config, args) """ @@ -147,7 +147,7 @@ class Trainer(): """A flag indicating whether the experiment should run with multiprocessing. """ - return self.args.device == "gpu" and self.args.nprocs > 1 + return self.args.ngpu > 1 def init_parallel(self): """Init environment for multiprocess training. 
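The hunks above converge on a single ``--ngpu`` flag: ``--ngpu 0`` runs on CPU, ``--ngpu 1`` uses one GPU, and anything larger launches one worker per GPU through ``paddle.distributed.spawn``. The following minimal sketch illustrates that launch convention in isolation; ``run_training`` and the surrounding argument handling are illustrative placeholders rather than code from this patch, and only public Paddle APIs (``paddle.set_device``, ``paddle.distributed.spawn``, ``paddle.distributed.init_parallel_env``) are assumed.

# A minimal sketch of the --ngpu launch convention adopted above.
# `run_training` is a placeholder for the per-process training entry point.
import argparse

import paddle
from paddle import distributed as dist


def run_training(args):
    # ngpu == 0 selects the CPU; any positive value selects the GPU backend.
    paddle.set_device("gpu" if args.ngpu > 0 else "cpu")
    if args.ngpu > 1:
        # each spawned worker joins the collective communication group
        dist.init_parallel_env()
    # ... build the model, data loaders and optimizer, then train ...


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--ngpu",
        type=int,
        default=1,
        help="number of parallel processes (GPUs) to use. If ngpu=0, run on CPU.")
    args = parser.parse_args()

    if args.ngpu > 1:
        # one process per GPU, mirroring the updated trainers above
        dist.spawn(run_training, args=(args, ), nprocs=args.ngpu)
    else:
        run_training(args)


if __name__ == "__main__":
    main()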
diff --git a/paddlespeech/text/speechtask/punctuation_restoration/utils/default_parser.py b/paddlespeech/text/speechtask/punctuation_restoration/utils/default_parser.py index 405b29a2b5f4cf925e59951cf385f54e06239025..de0d8aeffe7d813927effa9957b7f206fcf4a479 100644 --- a/paddlespeech/text/speechtask/punctuation_restoration/utils/default_parser.py +++ b/paddlespeech/text/speechtask/punctuation_restoration/utils/default_parser.py @@ -30,7 +30,7 @@ def default_argument_parser(): The ``--checkpoint_path`` specifies the checkpoint to load from. - The ``--device`` and ``--nprocs`` specifies how to run the training. + The ``--ngpu`` option specifies how many GPUs to use for training; ``--ngpu 0`` runs on CPU. See Also @@ -60,9 +60,7 @@ def default_argument_parser(): parser.add_argument("--result_file", type=str, help="path of save the asr result") # running - parser.add_argument("--device", type=str, default='gpu', choices=["cpu", "gpu"], - help="device type to use, cpu and gpu are supported.") - parser.add_argument("--nprocs", type=int, default=1, help="number of parallel processes to use.") + parser.add_argument("--ngpu", type=int, default=1, help="number of parallel processes (GPUs) to use. If ngpu=0, run on CPU.") # overwrite extra config and default config # parser.add_argument("--opts", nargs=argparse.REMAINDER, diff --git a/setup.py b/setup.py index 310eed1e75b1b6e489df467707dd60c959a1c4e1..a4ce181a9675fd470ce7f0da6cb74e0a2f454f2c 100644 --- a/setup.py +++ b/setup.py @@ -187,6 +187,9 @@ setup_info = dict( 'Programming Language :: Python :: 3.7', 'Programming Language :: Python :: 3.8', 'Programming Language :: Python :: 3.9', - ], ) + ], + entry_points={ + 'console_scripts': ['paddlespeech=paddlespeech.cli.entry:_execute'] + }) setup(**setup_info) diff --git a/audio/setup.py b/setup_audio.py similarity index 78% rename from audio/setup.py rename to setup_audio.py index e0ac98181ae9717c66b07e3b52d0f5cca90f7230..24c9bb9b98691bd7c9d8f0efe7e2cabdd4664399 100644 --- a/audio/setup.py +++ b/setup_audio.py @@ -16,19 +16,16 @@ import setuptools # set the version here version = '0.1.0a' -with open("README.md", "r") as fh: - long_description = fh.read() - setuptools.setup( name="paddleaudio", version=version, author="", author_email="", description="PaddleAudio, in development", - long_description=long_description, + long_description="", long_description_content_type="text/markdown", url="", - packages=setuptools.find_packages(exclude=["build*", "test*", "examples*"]), + packages=setuptools.find_packages(include=['paddleaudio*']), classifiers=[ "Programming Language :: Python :: 3", "License :: OSI Approved :: MIT License", @@ -41,8 +38,4 @@ setuptools.setup( 'resampy >= 0.2.2', 'soundfile >= 0.9.0', 'colorlog', - 'pathos', - ], - extras_require={'dev': ['pytest>=3.7', 'librosa>=0.7.2'] - } # for dev only, install: pip install -e .[dev] -) + ], ) diff --git a/tests/benchmark/conformer/run_benchmark.sh b/tests/benchmark/conformer/run_benchmark.sh index 56b63e76b1f23abf8f36c237dcd2232e20792d39..d4efe2b96a03d045040b37715abd9bb0844ae219 100644 --- a/tests/benchmark/conformer/run_benchmark.sh +++ b/tests/benchmark/conformer/run_benchmark.sh @@ -38,7 +38,7 @@ function _train(){ train_cmd="--config=${config_path} --output=${output} --seed=${seed} - --nproc=${ngpu} + --ngpu=${ngpu} --profiler-options "${profiler_options}" --benchmark-batch-size ${batch_size} --benchmark-max-step ${benchmark_max_step} " diff --git a/tests/chains/ds2/ds2_params_lite_train_infer.txt b/tests/chains/ds2/ds2_params_lite_train_infer.txt index 
c1cbfbb92fc325b598f056c26561a29f5b2c341c..b11872bd006d8264cbe8f652f719ed4f25d78a62 100644 --- a/tests/chains/ds2/ds2_params_lite_train_infer.txt +++ b/tests/chains/ds2/ds2_params_lite_train_infer.txt @@ -21,13 +21,13 @@ null:null null:null ## ===========================eval_params=========================== -eval: ../../../paddlespeech/s2t/exps/deepspeech2/bin/test.py --nproc 1 --config conf/deepspeech2.yaml --checkpoint_path exp/deepspeech_tiny/checkpoints/9 --result_file tests/9.rsl --model_type offline +eval: ../../../paddlespeech/s2t/exps/deepspeech2/bin/test.py --ngpu 1 --config conf/deepspeech2.yaml --checkpoint_path exp/deepspeech_tiny/checkpoints/9 --result_file tests/9.rsl --model_type offline null:null ## ===========================infer_params=========================== null:null null:null -norm_export: ../../../paddlespeech/s2t/exps/deepspeech2/bin/export.py --nproc 1 --config conf/deepspeech2.yaml --model_type offline --checkpoint_path exp/deepspeech_tiny/checkpoints/9 --export_path exp/deepspeech_tiny/checkpoints/9.jit +norm_export: ../../../paddlespeech/s2t/exps/deepspeech2/bin/export.py --ngpu 1 --config conf/deepspeech2.yaml --model_type offline --checkpoint_path exp/deepspeech_tiny/checkpoints/9 --export_path exp/deepspeech_tiny/checkpoints/9.jit quant_export:null fpgm_export:null distill_export:null diff --git a/tests/chains/ds2/ds2_params_whole_train_infer.txt b/tests/chains/ds2/ds2_params_whole_train_infer.txt index bfcb745f6c22027ded7fc7334567ae007eb463d4..875e3ccf9bb87fffd321452808c48ab692a379e2 100644 --- a/tests/chains/ds2/ds2_params_whole_train_infer.txt +++ b/tests/chains/ds2/ds2_params_whole_train_infer.txt @@ -21,13 +21,13 @@ null:null null:null ## ===========================eval_params=========================== -eval: ../../../paddlespeech/s2t/exps/deepspeech2/bin/test.py --nproc 1 --config conf/deepspeech2.yaml --result_file tests/49.rsl --checkpoint_path exp/deepspeech_whole/checkpoints/49 --model_type offline +eval: ../../../paddlespeech/s2t/exps/deepspeech2/bin/test.py --ngpu 1 --config conf/deepspeech2.yaml --result_file tests/49.rsl --checkpoint_path exp/deepspeech_whole/checkpoints/49 --model_type offline null:null ## ===========================infer_params=========================== null:null null:null -norm_export: ../../../paddlespeech/s2t/exps/deepspeech2/bin/export.py --nproc 1 --config conf/deepspeech2.yaml --model_type offline --checkpoint_path exp/deepspeech_whole/checkpoints/49 --export_path exp/deepspeech_whole/checkpoints/49.jit +norm_export: ../../../paddlespeech/s2t/exps/deepspeech2/bin/export.py --ngpu 1 --config conf/deepspeech2.yaml --model_type offline --checkpoint_path exp/deepspeech_whole/checkpoints/49 --export_path exp/deepspeech_whole/checkpoints/49.jit quant_export:null fpgm_export:null distill_export:null diff --git a/tests/chains/ds2/speedyspeech_params_lite.txt b/tests/chains/ds2/speedyspeech_params_lite.txt index c1cfb8f54b60a5b5c72998ffceccb2e8e0cd3acc..487b0b5e1150e801c14e030eca13ebeb913b4a61 100644 --- a/tests/chains/ds2/speedyspeech_params_lite.txt +++ b/tests/chains/ds2/speedyspeech_params_lite.txt @@ -21,7 +21,7 @@ null:null null:null ## ===========================eval_params=========================== -eval:../examples/speedyspeech/baker/synthesize_e2e.py --speedyspeech-config=../examples/speedyspeech/baker/conf/default.yaml --speedyspeech-checkpoint=exp/default/checkpoints/snapshot_iter_90.pdz --speedyspeech-stat=pretrain_models/speedyspeech_baker_ckpt_0.4/speedy_speech_stats.npy 
--pwg-config=../examples/parallelwave_gan/baker/conf/default.yaml --pwg-checkpoint=pretrain_models/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz --pwg-stat=pretrain_models/pwg_baker_ckpt_0.4/pwg_stats.npy --text=../examples/speedyspeech/baker/sentences.txt --output-dir=e2e --inference-dir=inference --device="gpu" --phones-dict=../examples/speedyspeech/baker/phones.txt --tones-dict=../examples/speedyspeech/baker/tones.txt +eval:../examples/speedyspeech/baker/synthesize_e2e.py --speedyspeech-config=../examples/speedyspeech/baker/conf/default.yaml --speedyspeech-checkpoint=exp/default/checkpoints/snapshot_iter_90.pdz --speedyspeech-stat=pretrain_models/speedyspeech_baker_ckpt_0.4/speedy_speech_stats.npy --pwg-config=../examples/parallelwave_gan/baker/conf/default.yaml --pwg-checkpoint=pretrain_models/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz --pwg-stat=pretrain_models/pwg_baker_ckpt_0.4/pwg_stats.npy --text=../examples/speedyspeech/baker/sentences.txt --output-dir=e2e --inference-dir=inference --ngpu=1 --phones-dict=../examples/speedyspeech/baker/phones.txt --tones-dict=../examples/speedyspeech/baker/tones.txt null:null ## ===========================infer_params=========================== diff --git a/tests/chains/ds2/test.sh b/tests/chains/ds2/test.sh index 0b2b4f581a27f29c6c07f0d82490039e49c4b961..c930782051e0b75f9be0d7a95674b2e16da4ba40 100644 --- a/tests/chains/ds2/test.sh +++ b/tests/chains/ds2/test.sh @@ -323,7 +323,7 @@ else elif [ ${#gpu} -le 15 ];then # train with multi-gpu gsu=${gpu//,/ } nump=`echo $gsu | wc -w` - cmd="${python} ${run_train} --nproc=$nump" + cmd="${python} ${run_train} --ngpu=$nump" else # train with multi-machine cmd="${python} -m paddle.distributed.launch --ips=${ips} --gpus=${gpu} ${run_train} ${set_save_model} ${set_pretrain} ${set_epoch} ${set_autocast} ${set_batchsize} ${set_train_params1}" fi diff --git a/tests/chains/speedyspeech/speedyspeech_params_lite_multi_gpu.txt b/tests/chains/speedyspeech/speedyspeech_params_lite_multi_gpu.txt index 0f64da271424942f93b669b97aecfb33ba8632e3..634c3a5afd71b7280d601a62fb64d2c4193d0c98 100644 --- a/tests/chains/speedyspeech/speedyspeech_params_lite_multi_gpu.txt +++ b/tests/chains/speedyspeech/speedyspeech_params_lite_multi_gpu.txt @@ -21,7 +21,7 @@ null:null null:null ## ===========================eval_params=========================== -eval:../../../paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py --speedyspeech-config=../../../examples/csmsc/tts2/conf/default.yaml --speedyspeech-checkpoint=exp/default/checkpoints/snapshot_iter_20.pdz --speedyspeech-stat=train_data/mini_BZNSYP/train/feats_stats.npy --pwg-config=pretrain_models/pwg_baker_ckpt_0.4/pwg_default.yaml --pwg-checkpoint=pretrain_models/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz --pwg-stat=pretrain_models/pwg_baker_ckpt_0.4/pwg_stats.npy --text=../../../paddlespeech/t2s/exps/sentences.txt --output-dir=e2e --inference-dir=inference --device="gpu" --phones-dict=train_data/mini_BZNSYP/phone_id_map.txt --tones-dict=train_data/mini_BZNSYP/tone_id_map.txt +eval:../../../paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py --speedyspeech-config=../../../examples/csmsc/tts2/conf/default.yaml --speedyspeech-checkpoint=exp/default/checkpoints/snapshot_iter_20.pdz --speedyspeech-stat=train_data/mini_BZNSYP/train/feats_stats.npy --pwg-config=pretrain_models/pwg_baker_ckpt_0.4/pwg_default.yaml --pwg-checkpoint=pretrain_models/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz --pwg-stat=pretrain_models/pwg_baker_ckpt_0.4/pwg_stats.npy 
--text=../../../paddlespeech/t2s/exps/sentences.txt --output-dir=e2e --inference-dir=inference --ngpu=1 --phones-dict=train_data/mini_BZNSYP/phone_id_map.txt --tones-dict=train_data/mini_BZNSYP/tone_id_map.txt null:null ## ===========================infer_params=========================== diff --git a/tests/chains/speedyspeech/speedyspeech_params_lite_single_gpu.txt b/tests/chains/speedyspeech/speedyspeech_params_lite_single_gpu.txt index beda4c04e06d5d055e407e3f6c67cbc35dbf5806..d187d4c672e07565c45a9e6d85f0fcbf078860bb 100644 --- a/tests/chains/speedyspeech/speedyspeech_params_lite_single_gpu.txt +++ b/tests/chains/speedyspeech/speedyspeech_params_lite_single_gpu.txt @@ -21,7 +21,7 @@ null:null null:null ## ===========================eval_params=========================== -eval:../../../paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py --speedyspeech-config=../../../examples/csmsc/tts2/conf/default.yaml --speedyspeech-checkpoint=exp/default/checkpoints/snapshot_iter_30.pdz --speedyspeech-stat=train_data/mini_BZNSYP/train/feats_stats.npy --pwg-config=pretrain_models/pwg_baker_ckpt_0.4/pwg_default.yaml --pwg-checkpoint=pretrain_models/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz --pwg-stat=pretrain_models/pwg_baker_ckpt_0.4/pwg_stats.npy --text=../../../paddlespeech/t2s/exps/sentences.txt --output-dir=e2e --inference-dir=inference --device="gpu" --phones-dict=train_data/mini_BZNSYP/phone_id_map.txt --tones-dict=train_data/mini_BZNSYP/tone_id_map.txt +eval:../../../paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py --speedyspeech-config=../../../examples/csmsc/tts2/conf/default.yaml --speedyspeech-checkpoint=exp/default/checkpoints/snapshot_iter_30.pdz --speedyspeech-stat=train_data/mini_BZNSYP/train/feats_stats.npy --pwg-config=pretrain_models/pwg_baker_ckpt_0.4/pwg_default.yaml --pwg-checkpoint=pretrain_models/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz --pwg-stat=pretrain_models/pwg_baker_ckpt_0.4/pwg_stats.npy --text=../../../paddlespeech/t2s/exps/sentences.txt --output-dir=e2e --inference-dir=inference --ngpu=1 --phones-dict=train_data/mini_BZNSYP/phone_id_map.txt --tones-dict=train_data/mini_BZNSYP/tone_id_map.txt null:null ## ===========================infer_params=========================== diff --git a/tests/chains/speedyspeech/speedyspeech_params_whole_multi_gpu.txt b/tests/chains/speedyspeech/speedyspeech_params_whole_multi_gpu.txt index ecdbf76dc0fce5463e69a3dae86fa030df866f9c..f8ebd499e9d101107d2924abd0f077fd5fa7b046 100644 --- a/tests/chains/speedyspeech/speedyspeech_params_whole_multi_gpu.txt +++ b/tests/chains/speedyspeech/speedyspeech_params_whole_multi_gpu.txt @@ -21,7 +21,7 @@ null:null null:null ## ===========================eval_params=========================== -eval:../../../paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py --speedyspeech-config=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/default.yaml --speedyspeech-checkpoint=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/snapshot_iter_11400.pdz --speedyspeech-stat=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/feats_stats.npy --pwg-config=pretrain_models/pwg_baker_ckpt_0.4/pwg_default.yaml --pwg-checkpoint=pretrain_models/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz --pwg-stat=pretrain_models/pwg_baker_ckpt_0.4/pwg_stats.npy --text=../../../paddlespeech/t2s/exps/sentences.txt --output-dir=e2e --inference-dir=inference --device="gpu" --phones-dict=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/phone_id_map.txt 
--tones-dict=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/tone_id_map.txt +eval:../../../paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py --speedyspeech-config=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/default.yaml --speedyspeech-checkpoint=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/snapshot_iter_11400.pdz --speedyspeech-stat=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/feats_stats.npy --pwg-config=pretrain_models/pwg_baker_ckpt_0.4/pwg_default.yaml --pwg-checkpoint=pretrain_models/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz --pwg-stat=pretrain_models/pwg_baker_ckpt_0.4/pwg_stats.npy --text=../../../paddlespeech/t2s/exps/sentences.txt --output-dir=e2e --inference-dir=inference --ngpu=1 --phones-dict=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/phone_id_map.txt --tones-dict=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/tone_id_map.txt null:null ## ===========================infer_params=========================== diff --git a/tests/chains/speedyspeech/speedyspeech_params_whole_single_gpu.txt b/tests/chains/speedyspeech/speedyspeech_params_whole_single_gpu.txt index 523b5c6e303db86c4f1f3c3597fbcf75a54c61c2..0e3c49cb8f43172d8d6597841fbb2f52e027ad80 100644 --- a/tests/chains/speedyspeech/speedyspeech_params_whole_single_gpu.txt +++ b/tests/chains/speedyspeech/speedyspeech_params_whole_single_gpu.txt @@ -21,7 +21,7 @@ null:null null:null ## ===========================eval_params=========================== -eval:../../../paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py --speedyspeech-config=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/default.yaml --speedyspeech-checkpoint=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/snapshot_iter_11400.pdz --speedyspeech-stat=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/feats_stats.npy --pwg-config=pretrain_models/pwg_baker_ckpt_0.4/pwg_default.yaml --pwg-checkpoint=pretrain_models/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz --pwg-stat=pretrain_models/pwg_baker_ckpt_0.4/pwg_stats.npy --text=../../../paddlespeech/t2s/exps/sentences.txt --output-dir=e2e --inference-dir=inference --device="gpu" --phones-dict=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/phone_id_map.txt --tones-dict=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/tone_id_map.txt +eval:../../../paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py --speedyspeech-config=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/default.yaml --speedyspeech-checkpoint=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/snapshot_iter_11400.pdz --speedyspeech-stat=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/feats_stats.npy --pwg-config=pretrain_models/pwg_baker_ckpt_0.4/pwg_default.yaml --pwg-checkpoint=pretrain_models/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz --pwg-stat=pretrain_models/pwg_baker_ckpt_0.4/pwg_stats.npy --text=../../../paddlespeech/t2s/exps/sentences.txt --output-dir=e2e --inference-dir=inference --ngpu=1 --phones-dict=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/phone_id_map.txt --tones-dict=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/tone_id_map.txt null:null ## ===========================infer_params=========================== diff --git a/tests/unit/tts/test_stft.py b/tests/unit/tts/test_stft.py index d2d56dca400ffd8da055509583944d0b660a5cb4..624226e941c07778af1716728e2a62256124a958 100644 --- a/tests/unit/tts/test_stft.py +++ b/tests/unit/tts/test_stft.py @@ -11,52 +11,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. -import librosa -import numpy as np import paddle import torch from parallel_wavegan.losses import stft_loss as sl -from scipy import signal -from paddlespeech.t2s.modules.stft_loss import MultiResolutionSTFTLoss -from paddlespeech.t2s.modules.stft_loss import STFT - - -def test_stft(): - stft = STFT(n_fft=1024, hop_length=256, win_length=1024) - x = paddle.uniform([4, 46080]) - S = stft.magnitude(x) - window = signal.get_window('hann', 1024, fftbins=True) - D2 = torch.stft( - torch.as_tensor(x.numpy()), - n_fft=1024, - hop_length=256, - win_length=1024, - window=torch.as_tensor(window)) - S2 = (D2**2).sum(-1).sqrt() - S3 = np.abs( - librosa.stft(x.numpy()[0], n_fft=1024, hop_length=256, win_length=1024)) - print(S2.shape) - print(S.numpy()[0]) - print(S2.data.cpu().numpy()[0]) - print(S3) - - -def test_torch_stft(): - # NOTE: torch.stft use no window by default - x = np.random.uniform(-1.0, 1.0, size=(46080, )) - window = signal.get_window('hann', 1024, fftbins=True) - D2 = torch.stft( - torch.as_tensor(x), - n_fft=1024, - hop_length=256, - win_length=1024, - window=torch.as_tensor(window)) - D3 = librosa.stft( - x, n_fft=1024, hop_length=256, win_length=1024, window='hann') - print(D2[:, :, 0].data.cpu().numpy()[:, 30:60]) - print(D3.real[:, 30:60]) - # print(D3.imag[:, 30:60]) +from paddlespeech.t2s.modules.losses import MultiResolutionSTFTLoss def test_multi_resolution_stft_loss(): diff --git a/tools/extras/install_kaldi.sh b/tools/extras/install_kaldi.sh index 3cdcd32d4d2aeda41f3ea21aac0518d7f70b7a8e..b93e7ecf6372c9f84391289ace2c4a1b3590b45e 100755 --- a/tools/extras/install_kaldi.sh +++ b/tools/extras/install_kaldi.sh @@ -9,6 +9,7 @@ apt-get install subversion -y KALDI_GIT="--depth 1 -b master https://github.com/kaldi-asr/kaldi.git" KALDI_DIR="$PWD/kaldi" +SHARED=false if [ ! -d "$KALDI_DIR" ]; then git clone $KALDI_GIT $KALDI_DIR @@ -23,17 +24,25 @@ git pull mkdir -p "python" touch "python/.use_default_python" +# check deps ./extras/check_dependencies.sh +# make tools make -j4 +# make src pushd ../src OPENBLAS_DIR=${KALDI_DIR}/../OpenBLAS mkdir -p ${OPENBLAS_DIR}/install -./configure --shared --use-cuda=no --static-math --mathlib=OPENBLAS --openblas-root=${OPENBLAS_DIR}/install +if [ $SHARED == true ]; then + ./configure --shared --use-cuda=no --static-math --mathlib=OPENBLAS --openblas-root=${OPENBLAS_DIR}/install +else + ./configure --static --use-cuda=no --static-math --mathlib=OPENBLAS --openblas-root=${OPENBLAS_DIR}/install +fi make clean -j && make depend -j && make -j4 -popd +popd # kaldi/src + -popd +popd # kaldi/tools echo "Done installing Kaldi."
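With ``test_stft.py`` trimmed down, the remaining ``test_multi_resolution_stft_loss`` compares the Paddle loss, now imported from ``paddlespeech.t2s.modules.losses``, against the ``parallel_wavegan`` reference. The sketch below shows what such a parity check can look like; it assumes both classes follow the ParallelWaveGAN convention of a default constructor and a ``forward(predicted, target)`` returning a spectral-convergence term and a log-STFT-magnitude term, so treat the exact signatures, the helper name ``check_multi_resolution_stft_loss``, and the tolerances as assumptions rather than documented API.

# Hedged sketch of a parity check between the Paddle and PyTorch
# multi-resolution STFT losses; constructor defaults and the
# (sc_loss, mag_loss) return convention are assumed, not verified here.
import numpy as np
import paddle
import torch
from parallel_wavegan.losses import stft_loss as sl

from paddlespeech.t2s.modules.losses import MultiResolutionSTFTLoss


def check_multi_resolution_stft_loss():
    # random "predicted" and "target" waveforms: batch of 4, ~1 s at 22.05 kHz
    x = np.random.uniform(-1.0, 1.0, size=(4, 22050)).astype("float32")
    y = np.random.uniform(-1.0, 1.0, size=(4, 22050)).astype("float32")

    sc_p, mag_p = MultiResolutionSTFTLoss()(paddle.to_tensor(x), paddle.to_tensor(y))
    sc_t, mag_t = sl.MultiResolutionSTFTLoss()(torch.from_numpy(x), torch.from_numpy(y))

    # the two implementations should agree up to floating-point noise
    np.testing.assert_allclose(sc_p.numpy(), sc_t.numpy(), rtol=1e-4, atol=1e-4)
    np.testing.assert_allclose(mag_p.numpy(), mag_t.numpy(), rtol=1e-4, atol=1e-4)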