Commit dcc23903 authored by huangyuxin

merge the develop branch and apply revisions

.ipynb_checkpoints/**
*.ipynb
nohup.out
__pycache__/
*.wav
*.m4a
obsolete/**
repos:
- repo: local
hooks:
- id: yapf
name: yapf
entry: yapf
language: system
args: [-i, --style=.style.yapf]
files: \.py$
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: a11d9314b22d8f8c7556443875b731ef05965464
hooks:
- id: check-merge-conflict
- id: check-symlinks
- id: end-of-file-fixer
- id: trailing-whitespace
- id: detect-private-key
- id: check-added-large-files
- repo: https://github.com/pycqa/isort
rev: 5.8.0
hooks:
- id: isort
name: isort (python)
- id: isort
name: isort (cython)
types: [cython]
- id: isort
name: isort (pyi)
types: [pyi]
- repo: local
hooks:
- id: flake8
name: flake8
entry: flake8
language: system
args:
- --count
- --select=E9,F63,F7,F82
- --show-source
- --statistics
files: \.py$
[style]
based_on_style = pep8
column_limit = 80
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
# PaddleAudio: The audio library for PaddlePaddle
## Introduction
PaddleAudio is an audio toolkit that speeds up your audio research and development loop in PaddlePaddle. It currently provides a collection of audio datasets, feature-extraction functions, audio transforms, and state-of-the-art pre-trained models for sound tagging/classification and anomalous sound detection. More models and features are on the roadmap.
## Features
- Spectrogram and related features are compatible with librosa (see the sketch after this list).
- State-of-the-art models for sound tagging on Audioset, sound classification on ESC-50, and more to come.
- Ready-to-use audio embeddings with one line of code; sound embeddings now, with more on the roadmap.
- Data loading support for common open-source audio datasets in multiple languages, including English and Mandarin.
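A quick way to check the librosa-compatibility claim is to compare features directly, mirroring the unit tests shipped in `test/unit_test/test_features.py` (`test.wav` below is a placeholder for any local audio file):

```python
import librosa
import numpy as np
import paddleaudio as pa

x, _ = librosa.load('test.wav', sr=16000)  # any local audio file

a = pa.melspectrogram(x, sr=16000, window_size=512, hop_length=320,
                      n_mels=64, fmin=50, to_db=False)
b = librosa.feature.melspectrogram(x, sr=16000, n_fft=512, win_length=512,
                                   hop_length=320, n_mels=64, fmin=50)
print(np.sum((a - b)**2) / (np.sum(a**2) + np.sum(b**2)))  # relative error, ~0
```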
## Install
```
git clone https://github.com/PaddlePaddle/models
cd models/PaddleAudio
pip install .
```
## Quick start
### Audio loading and feature extraction
```python
import paddleaudio as pa

s, r = pa.load(f)  # f: path to an audio file
mel_spect = pa.melspectrogram(s, sr=r)
```
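The pre-trained PANNs models can also return audio embeddings directly. A minimal sketch, following the preprocessing used in the audio-tagging example (`extract_embedding=True` is a parameter of the `paddleaudio.models.panns` constructors; `./cat.wav` is a hypothetical input file, and the exact embedding shape is an assumption):

```python
import paddle
import paddleaudio as pa
from paddleaudio.features import melspectrogram
from paddleaudio.models.panns import cnn14

s, r = pa.load('./cat.wav')                   # hypothetical input file
feat = melspectrogram(s, r).transpose()       # (num_frames, num_melbins)
feat = paddle.to_tensor([feat]).unsqueeze(1)  # (1, 1, num_frames, num_melbins)

model = cnn14(pretrained=True, extract_embedding=True)
model.eval()
embedding = model(feat)  # embedding dimension is 2048 for CNN14
```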
### Examples
We provide a set of examples to help you get started with PaddleAudio quickly.
- [PANNs: acoustic scene and events analysis using pre-trained models](./examples/panns)
- [Environmental Sound classification on ESC-50 dataset](./examples/sound_classification)
- [Training an audio-tagging network on Audioset](./examples/audioset_training)
Please refer to [example directory](./examples) for more details.
# Audio Tagging
Sound classification is a single-label classification task, yet a piece of audio can carry multiple labels. For example, a recording made in a typical office environment may contain people speaking, keyboard typing, mouse clicks, and various other background sounds. For general-purpose sound recognition and sound detection, predicting multiple labels for a piece of audio is therefore highly practical.
At IEEE ICASSP 2017, Google released the large-scale audio dataset [Audioset](https://research.google.com/audioset/). The dataset covers 632 audio classes and 2,084,320 human-labeled 10-second sound clips sourced from YouTube videos. It currently holds 2.1 million annotated videos and 5,800 hours of audio, with 527 label classes assigned to the annotated sound samples.
`PANNs` ([PANNs: Large-Scale Pretrained Audio Neural Networks for Audio Pattern Recognition](https://arxiv.org/pdf/1912.10211.pdf)) are sound classification/recognition models trained on Audioset. Because they are pre-trained on a multi-label sound recognition task, they can be used for real-time audio tagging.
This example uses a pre-trained `PANNs` model to tag input audio in real time against the Audioset label set, and writes out the top-k classes and their scores for each time step as text.
## Model introduction
PaddleAudio provides pre-trained PANNs models in three sizes, CNN14, CNN10, and CNN6, for users to choose from:
- CNN14: 12 convolutional layers and 2 fully connected layers; 79.6M parameters; embedding dimension 2048.
- CNN10: 8 convolutional layers and 2 fully connected layers; 4.9M parameters; embedding dimension 512.
- CNN6: 4 convolutional layers and 2 fully connected layers; 4.5M parameters; embedding dimension 512.
## Quick start
### Model inference
```shell
export CUDA_VISIBLE_DEVICES=0
python audio_tag.py --device gpu --wav ./cat.wav --sample_duration 2 --hop_duration 0.3 --output_dir ./output_dir
```
Supported arguments:
- `device`: device to run inference on, `cpu` or `gpu`; defaults to `gpu`. When using a GPU, select the card via `CUDA_VISIBLE_DEVICES` as in the example above.
- `wav`: audio file to predict on.
- `sample_duration`: length (in seconds) of each audio sample the model predicts on; defaults to 2s.
- `hop_duration`: interval (in seconds) between two consecutive samples; defaults to 0.3s.
- `output_dir`: directory where prediction results are saved; defaults to `./output_dir`.
The example code uses the pre-trained `CNN14` model. To switch to another pre-trained model, use:
```python
from paddleaudio.models.panns import cnn14, cnn10, cnn6
# CNN14
model = cnn14(pretrained=True, extract_embedding=False)
# CNN10
model = cnn10(pretrained=True, extract_embedding=False)
# CNN6
model = cnn6(pretrained=True, extract_embedding=False)
```
Output:
```
[2021-04-30 19:15:41,025] [ INFO] - Saved tagging results to ./output_dir/audioset_tagging_sr_44100.npz
```
After execution, the scores are saved to an `.npz` file under `output_dir`.
### Generating tagging label text
```shell
python parse_result.py --tagging_file ./output_dir/audioset_tagging_sr_44100.npz --top_k 10 --smooth True --smooth_size 5 --label_file ./assets/audioset_labels.txt --output_dir ./output_dir
```
Supported arguments:
- `tagging_file`: file of model prediction results.
- `top_k`: number of highest-scoring labels to keep from the predictions; defaults to 10.
- `smooth`: posterior smoothing of the prediction scores; defaults to True (smoothing applied). See the sketch after this list.
- `smooth_size`: number of samples in the smoothing window; defaults to 5.
- `label_file`: text file of the Audioset class names corresponding to the predictions.
- `output_dir`: directory where the label text is saved; defaults to `./output_dir`.
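The smoothing step is a causal moving average over the per-window score vectors. The sketch below mirrors the `smooth()` function defined in `parse_result.py`, where `scores` has shape `(num_windows, num_classes)`:

```python
import numpy as np

def smooth(results: np.ndarray, win_size: int):
    """Replace each row with the mean of itself and up to win_size - 1 preceding rows, in place."""
    for i in range(len(results) - 1, -1, -1):
        left = max(0, i + 1 - win_size)
        results[i] = results[left:i + 1].mean(axis=0)
```

Iterating from the last row backwards ensures each average reads only rows that have not yet been smoothed.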
Output:
```
[2021-04-30 19:26:58,743] [ INFO] - Posterior smoothing...
[2021-04-30 19:26:58,746] [ INFO] - Saved tagging labels to ./output_dir/audioset_tagging_sr_44100.txt
```
After execution, the text results are saved to a `.txt` file under `output_dir`.
## Tagging label text
The final text output looks like the sample below.
The top-k results for each time window are separated by blank lines. Within each block, the first line carries the time information: the number marks the start of the tagging window, expressed as the ratio of the current time `t` to the total audio length `T` (e.g. `0.0592` means the window starts at `0.0592 × T` seconds into the audio); the following k lines list the labels and their scores.
```
0.0
Cat: 0.9144676923751831
Animal: 0.8855036497116089
Domestic animals, pets: 0.804577112197876
Meow: 0.7422927021980286
Music: 0.19959309697151184
Inside, small room: 0.12550437450408936
Caterwaul: 0.021584441885352135
Purr: 0.020247288048267365
Speech: 0.018197158351540565
Vehicle: 0.007446660194545984
0.059197544398158296
Cat: 0.9250872135162354
Animal: 0.8957151174545288
Domestic animals, pets: 0.8228275775909424
Meow: 0.7650775909423828
Music: 0.20210561156272888
Inside, small room: 0.12290887534618378
Caterwaul: 0.029371455311775208
Purr: 0.018731823191046715
Speech: 0.017130598425865173
Vehicle: 0.007748497650027275
0.11839508879631659
Cat: 0.9336574673652649
Animal: 0.9111202359199524
Domestic animals, pets: 0.8349071145057678
Meow: 0.7761964797973633
Music: 0.20467285811901093
Inside, small room: 0.10709915310144424
Caterwaul: 0.05370649695396423
Purr: 0.018830426037311554
Speech: 0.017361722886562347
Vehicle: 0.006929398979991674
...
...
```
The following [Demo](https://bj.bcebos.com/paddleaudio/media/audio_tagging_demo.mp4) shows tagging labels rendered onto a video, with multi-label predictions made on the audio in real time.
![](https://bj.bcebos.com/paddleaudio/media/audio_tagging_demo.gif)
Speech
Male speech, man speaking
Female speech, woman speaking
Child speech, kid speaking
Conversation
Narration, monologue
Babbling
Speech synthesizer
Shout
Bellow
Whoop
Yell
Battle cry
Children shouting
Screaming
Whispering
Laughter
Baby laughter
Giggle
Snicker
Belly laugh
Chuckle, chortle
Crying, sobbing
Baby cry, infant cry
Whimper
Wail, moan
Sigh
Singing
Choir
Yodeling
Chant
Mantra
Male singing
Female singing
Child singing
Synthetic singing
Rapping
Humming
Groan
Grunt
Whistling
Breathing
Wheeze
Snoring
Gasp
Pant
Snort
Cough
Throat clearing
Sneeze
Sniff
Run
Shuffle
Walk, footsteps
Chewing, mastication
Biting
Gargling
Stomach rumble
Burping, eructation
Hiccup
Fart
Hands
Finger snapping
Clapping
Heart sounds, heartbeat
Heart murmur
Cheering
Applause
Chatter
Crowd
Hubbub, speech noise, speech babble
Children playing
Animal
Domestic animals, pets
Dog
Bark
Yip
Howl
Bow-wow
Growling
Whimper (dog)
Cat
Purr
Meow
Hiss
Caterwaul
Livestock, farm animals, working animals
Horse
Clip-clop
Neigh, whinny
Cattle, bovinae
Moo
Cowbell
Pig
Oink
Goat
Bleat
Sheep
Fowl
Chicken, rooster
Cluck
Crowing, cock-a-doodle-doo
Turkey
Gobble
Duck
Quack
Goose
Honk
Wild animals
Roaring cats (lions, tigers)
Roar
Bird
Bird vocalization, bird call, bird song
Chirp, tweet
Squawk
Pigeon, dove
Coo
Crow
Caw
Owl
Hoot
Bird flight, flapping wings
Canidae, dogs, wolves
Rodents, rats, mice
Mouse
Patter
Insect
Cricket
Mosquito
Fly, housefly
Buzz
Bee, wasp, etc.
Frog
Croak
Snake
Rattle
Whale vocalization
Music
Musical instrument
Plucked string instrument
Guitar
Electric guitar
Bass guitar
Acoustic guitar
Steel guitar, slide guitar
Tapping (guitar technique)
Strum
Banjo
Sitar
Mandolin
Zither
Ukulele
Keyboard (musical)
Piano
Electric piano
Organ
Electronic organ
Hammond organ
Synthesizer
Sampler
Harpsichord
Percussion
Drum kit
Drum machine
Drum
Snare drum
Rimshot
Drum roll
Bass drum
Timpani
Tabla
Cymbal
Hi-hat
Wood block
Tambourine
Rattle (instrument)
Maraca
Gong
Tubular bells
Mallet percussion
Marimba, xylophone
Glockenspiel
Vibraphone
Steelpan
Orchestra
Brass instrument
French horn
Trumpet
Trombone
Bowed string instrument
String section
Violin, fiddle
Pizzicato
Cello
Double bass
Wind instrument, woodwind instrument
Flute
Saxophone
Clarinet
Harp
Bell
Church bell
Jingle bell
Bicycle bell
Tuning fork
Chime
Wind chime
Change ringing (campanology)
Harmonica
Accordion
Bagpipes
Didgeridoo
Shofar
Theremin
Singing bowl
Scratching (performance technique)
Pop music
Hip hop music
Beatboxing
Rock music
Heavy metal
Punk rock
Grunge
Progressive rock
Rock and roll
Psychedelic rock
Rhythm and blues
Soul music
Reggae
Country
Swing music
Bluegrass
Funk
Folk music
Middle Eastern music
Jazz
Disco
Classical music
Opera
Electronic music
House music
Techno
Dubstep
Drum and bass
Electronica
Electronic dance music
Ambient music
Trance music
Music of Latin America
Salsa music
Flamenco
Blues
Music for children
New-age music
Vocal music
A capella
Music of Africa
Afrobeat
Christian music
Gospel music
Music of Asia
Carnatic music
Music of Bollywood
Ska
Traditional music
Independent music
Song
Background music
Theme music
Jingle (music)
Soundtrack music
Lullaby
Video game music
Christmas music
Dance music
Wedding music
Happy music
Funny music
Sad music
Tender music
Exciting music
Angry music
Scary music
Wind
Rustling leaves
Wind noise (microphone)
Thunderstorm
Thunder
Water
Rain
Raindrop
Rain on surface
Stream
Waterfall
Ocean
Waves, surf
Steam
Gurgling
Fire
Crackle
Vehicle
Boat, Water vehicle
Sailboat, sailing ship
Rowboat, canoe, kayak
Motorboat, speedboat
Ship
Motor vehicle (road)
Car
Vehicle horn, car horn, honking
Toot
Car alarm
Power windows, electric windows
Skidding
Tire squeal
Car passing by
Race car, auto racing
Truck
Air brake
Air horn, truck horn
Reversing beeps
Ice cream truck, ice cream van
Bus
Emergency vehicle
Police car (siren)
Ambulance (siren)
Fire engine, fire truck (siren)
Motorcycle
Traffic noise, roadway noise
Rail transport
Train
Train whistle
Train horn
Railroad car, train wagon
Train wheels squealing
Subway, metro, underground
Aircraft
Aircraft engine
Jet engine
Propeller, airscrew
Helicopter
Fixed-wing aircraft, airplane
Bicycle
Skateboard
Engine
Light engine (high frequency)
Dental drill, dentist's drill
Lawn mower
Chainsaw
Medium engine (mid frequency)
Heavy engine (low frequency)
Engine knocking
Engine starting
Idling
Accelerating, revving, vroom
Door
Doorbell
Ding-dong
Sliding door
Slam
Knock
Tap
Squeak
Cupboard open or close
Drawer open or close
Dishes, pots, and pans
Cutlery, silverware
Chopping (food)
Frying (food)
Microwave oven
Blender
Water tap, faucet
Sink (filling or washing)
Bathtub (filling or washing)
Hair dryer
Toilet flush
Toothbrush
Electric toothbrush
Vacuum cleaner
Zipper (clothing)
Keys jangling
Coin (dropping)
Scissors
Electric shaver, electric razor
Shuffling cards
Typing
Typewriter
Computer keyboard
Writing
Alarm
Telephone
Telephone bell ringing
Ringtone
Telephone dialing, DTMF
Dial tone
Busy signal
Alarm clock
Siren
Civil defense siren
Buzzer
Smoke detector, smoke alarm
Fire alarm
Foghorn
Whistle
Steam whistle
Mechanisms
Ratchet, pawl
Clock
Tick
Tick-tock
Gears
Pulleys
Sewing machine
Mechanical fan
Air conditioning
Cash register
Printer
Camera
Single-lens reflex camera
Tools
Hammer
Jackhammer
Sawing
Filing (rasp)
Sanding
Power tool
Drill
Explosion
Gunshot, gunfire
Machine gun
Fusillade
Artillery fire
Cap gun
Fireworks
Firecracker
Burst, pop
Eruption
Boom
Wood
Chop
Splinter
Crack
Glass
Chink, clink
Shatter
Liquid
Splash, splatter
Slosh
Squish
Drip
Pour
Trickle, dribble
Gush
Fill (with liquid)
Spray
Pump (liquid)
Stir
Boiling
Sonar
Arrow
Whoosh, swoosh, swish
Thump, thud
Thunk
Electronic tuner
Effects unit
Chorus effect
Basketball bounce
Bang
Slap, smack
Whack, thwack
Smash, crash
Breaking
Bouncing
Whip
Flap
Scratch
Scrape
Rub
Roll
Crushing
Crumpling, crinkling
Tearing
Beep, bleep
Ping
Ding
Clang
Squeal
Creak
Rustle
Whir
Clatter
Sizzle
Clicking
Clickety-clack
Rumble
Plop
Jingle, tinkle
Hum
Zing
Boing
Crunch
Silence
Sine wave
Harmonic
Chirp tone
Sound effect
Pulse
Inside, small room
Inside, large room or hall
Inside, public space
Outside, urban or manmade
Outside, rural or natural
Reverberation
Echo
Noise
Environmental noise
Static
Mains hum
Distortion
Sidetone
Cacophony
White noise
Pink noise
Throbbing
Vibration
Television
Radio
Field recording
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
from typing import List
import numpy as np
import paddle
from paddleaudio.backends import load as load_audio
from paddleaudio.features import melspectrogram
from paddleaudio.models.panns import cnn14
from paddleaudio.utils import logger
# yapf: disable
parser = argparse.ArgumentParser(__doc__)
parser.add_argument('--device', choices=['cpu', 'gpu'], default='gpu', help='Select which device to predict, defaults to gpu.')
parser.add_argument('--wav', type=str, required=True, help='Audio file to infer.')
parser.add_argument('--sample_duration', type=float, default=2.0, help='Duration(in seconds) of tagging samples to predict.')
parser.add_argument('--hop_duration', type=float, default=0.3, help='Duration(in seconds) between two samples.')
parser.add_argument('--output_dir', type=str, default='./output_dir', help='Directory to save tagging result.')
args = parser.parse_args()
# yapf: enable
def split(waveform: np.ndarray, win_size: int, hop_size: int):
"""
Split into N waveforms.
N is decided by win_size and hop_size.
"""
assert isinstance(waveform, np.ndarray)
time = []
data = []
for i in range(0, len(waveform), hop_size):
segment = waveform[i:i + win_size]
if len(segment) < win_size:
segment = np.pad(segment, (0, win_size - len(segment)))
data.append(segment)
time.append(i / len(waveform))
return time, data
def batchify(data: List[List[float]],
sample_rate: int,
batch_size: int,
**kwargs):
"""
Extract features from waveforms and create batches.
"""
examples = []
for waveform in data:
feats = melspectrogram(waveform, sample_rate, **kwargs).transpose()
examples.append(feats)
    # Separate data into batches.
one_batch = []
for example in examples:
one_batch.append(example)
if len(one_batch) == batch_size:
yield one_batch
one_batch = []
if one_batch:
yield one_batch
def predict(model, data: List[List[float]], sample_rate: int,
batch_size: int=1):
"""
Use pretrained model to make predictions.
"""
batches = batchify(data, sample_rate, batch_size)
results = None
model.eval()
for batch in batches:
        # (batch_size, num_frames, num_melbins) -> (batch_size, 1, num_frames, num_melbins)
        feats = paddle.to_tensor(batch).unsqueeze(1)
audioset_scores = model(feats)
if results is None:
results = audioset_scores.numpy()
else:
results = np.concatenate((results, audioset_scores.numpy()))
return results
if __name__ == '__main__':
paddle.set_device(args.device)
model = cnn14(pretrained=True, extract_embedding=False)
waveform, sr = load_audio(args.wav, sr=None)
time, data = split(waveform,
int(args.sample_duration * sr),
int(args.hop_duration * sr))
results = predict(model, data, sr, batch_size=8)
if not os.path.exists(args.output_dir):
os.makedirs(args.output_dir)
    # Normalized window start times (ratios of start position to total length).
    time = np.arange(0, 1, int(args.hop_duration * sr) / len(waveform))
output_file = os.path.join(args.output_dir, f'audioset_tagging_sr_{sr}.npz')
np.savez(output_file, time=time, scores=results)
logger.info(f'Saved tagging results to {output_file}')
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import ast
import os
from typing import Dict
import numpy as np
from paddleaudio.utils import logger
# yapf: disable
parser = argparse.ArgumentParser(__doc__)
parser.add_argument('--tagging_file', type=str, required=True, help='File of tagging results (*.npz) generated by audio_tag.py.')
parser.add_argument('--top_k', type=int, default=10, help='Get top k predicted results of audioset labels.')
parser.add_argument('--smooth', type=ast.literal_eval, default=True, help='Set "True" to apply posterior smoothing.')
parser.add_argument('--smooth_size', type=int, default=5, help='Window size of posterior smoothing.')
parser.add_argument('--label_file', type=str, default='./assets/audioset_labels.txt', help='File of audioset labels.')
parser.add_argument('--output_dir', type=str, default='./output_dir', help='Directory to save tagging labels.')
args = parser.parse_args()
# yapf: enable
def smooth(results: np.ndarray, win_size: int):
"""
Execute posterior smoothing in-place.
"""
for i in range(len(results) - 1, -1, -1):
if i < win_size - 1:
left = 0
else:
left = i + 1 - win_size
results[i] = np.sum(results[left:i + 1], axis=0) / (i - left + 1)
def generate_topk_label(k: int, label_map: Dict, result: np.ndarray):
"""
Return top k result.
"""
result = np.asarray(result)
topk_idx = (-result).argsort()[:k]
ret = ''
for idx in topk_idx:
label, score = label_map[idx], result[idx]
ret += f'{label}: {score}\n'
return ret
if __name__ == "__main__":
label_map = {}
with open(args.label_file, 'r') as f:
for i, l in enumerate(f.readlines()):
label_map[i] = l.strip()
results = np.load(args.tagging_file, allow_pickle=True)
times, scores = results['time'], results['scores']
if args.smooth:
logger.info('Posterior smoothing...')
smooth(scores, win_size=args.smooth_size)
if not os.path.exists(args.output_dir):
os.makedirs(args.output_dir)
output_file = os.path.join(
args.output_dir,
os.path.basename(args.tagging_file).split('.')[0] + '.txt')
with open(output_file, 'w') as f:
for time, score in zip(times, scores):
f.write(f'{time}\n')
f.write(generate_topk_label(args.top_k, label_map, score) + '\n')
logger.info(f'Saved tagging labels to {output_file}')
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import codecs
import collections
import json
import os
from typing import Dict
from paddle.io import Dataset
from tqdm import tqdm
from ..backends import load as load_audio
from ..utils.download import decompress
from ..utils.download import download_and_decompress
from ..utils.env import DATA_HOME
from ..utils.log import logger
from .dataset import feat_funcs
__all__ = ['AISHELL1']
class AISHELL1(Dataset):
"""
    This Open Source Mandarin Speech Corpus, AISHELL-ASR0009-OS1, is 178 hours long.
    It is a part of AISHELL-ASR0009, whose utterances cover 11 domains, including
    smart home, autonomous driving, and industrial production. All recordings were
    made in a quiet indoor environment, using 3 different devices at the same time:
    a high-fidelity microphone (44.1kHz, 16-bit), an Android-system mobile phone
    (16kHz, 16-bit), and an iOS-system mobile phone (16kHz, 16-bit). The
    high-fidelity audio was re-sampled to 16kHz to build AISHELL-ASR0009-OS1.
    400 speakers from different accent areas in China were invited to participate
    in the recording. Through professional speech annotation and strict quality
    inspection, the manual transcription accuracy is above 95%. The corpus is
    divided into training, development and testing sets.
Reference:
AISHELL-1: An Open-Source Mandarin Speech Corpus and A Speech Recognition Baseline
https://arxiv.org/abs/1709.05522
"""
archieves = [
{
'url': 'http://www.openslr.org/resources/33/data_aishell.tgz',
'md5': '2f494334227864a8a8fec932999db9d8',
},
]
text_meta = os.path.join('data_aishell', 'transcript',
'aishell_transcript_v0.8.txt')
utt_info = collections.namedtuple('META_INFO',
('file_path', 'utt_id', 'text'))
audio_path = os.path.join('data_aishell', 'wav')
manifest_path = os.path.join('data_aishell', 'manifest')
subset = ['train', 'dev', 'test']
def __init__(self, subset: str='train', feat_type: str='raw', **kwargs):
assert subset in self.subset, 'Dataset subset must be one in {}, but got {}'.format(
self.subset, subset)
self.subset = subset
self.feat_type = feat_type
self.feat_config = kwargs
self._data = self._get_data()
super(AISHELL1, self).__init__()
def _get_text_info(self) -> Dict[str, str]:
ret = {}
with open(os.path.join(DATA_HOME, self.text_meta), 'r') as rf:
for line in rf.readlines()[1:]:
utt_id, text = map(str.strip, line.split(' ',
1)) # utt_id, text
ret.update({utt_id: ''.join(text.split())})
return ret
def _get_data(self):
if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \
not os.path.isfile(os.path.join(DATA_HOME, self.text_meta)):
download_and_decompress(self.archieves, DATA_HOME)
# Extract *wav from *.tar.gz.
for root, _, files in os.walk(
os.path.join(DATA_HOME, self.audio_path)):
for file in files:
if file.endswith('.tar.gz'):
decompress(os.path.join(root, file))
os.remove(os.path.join(root, file))
text_info = self._get_text_info()
data = []
for root, _, files in os.walk(
os.path.join(DATA_HOME, self.audio_path, self.subset)):
for file in files:
if file.endswith('.wav'):
utt_id = os.path.splitext(file)[0]
                    if utt_id not in text_info:  # Skip utterances without transcription labels
continue
text = text_info[utt_id]
file_path = os.path.join(root, file)
data.append(self.utt_info(file_path, utt_id, text))
return data
def _convert_to_record(self, idx: int):
sample = self._data[idx]
record = {}
# To show all fields in a namedtuple: `type(sample)._fields`
for field in type(sample)._fields:
record[field] = getattr(sample, field)
waveform, sr = load_audio(
sample[0]) # The first element of sample is file path
feat_func = feat_funcs[self.feat_type]
feat = feat_func(
waveform, sample_rate=sr,
**self.feat_config) if feat_func else waveform
record.update({'feat': feat, 'duration': len(waveform) / sr})
return record
def create_manifest(self, prefix='manifest'):
if not os.path.isdir(os.path.join(DATA_HOME, self.manifest_path)):
os.makedirs(os.path.join(DATA_HOME, self.manifest_path))
manifest_file = os.path.join(DATA_HOME, self.manifest_path,
f'{prefix}.{self.subset}')
with codecs.open(manifest_file, 'w', 'utf-8') as f:
for idx in tqdm(range(len(self))):
record = self._convert_to_record(idx)
record_line = json.dumps(
{
'utt': record['utt_id'],
'feat': record['file_path'],
'feat_shape': (record['duration'], ),
'text': record['text']
},
ensure_ascii=False)
f.write(record_line + '\n')
logger.info(f'Manifest file {manifest_file} created.')
def __getitem__(self, idx):
record = self._convert_to_record(idx)
return tuple(record.values())
def __len__(self):
return len(self._data)
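A minimal usage sketch for the class above (the import path `paddleaudio.datasets.aishell` is an assumption based on the relative imports; the first instantiation downloads the corpus into `DATA_HOME`):

```python
from paddleaudio.datasets.aishell import AISHELL1

# feat_type='raw' returns waveforms; other values dispatch through feat_funcs.
dev_set = AISHELL1(subset='dev', feat_type='raw')
file_path, utt_id, text, feat, duration = dev_set[0]

# Write a manifest file with one JSON record per utterance.
dev_set.create_manifest(prefix='manifest')
```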
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import collections
import os
from typing import List
from typing import Tuple
from ..utils.download import download_and_decompress
from ..utils.env import DATA_HOME
from .dataset import AudioClassificationDataset
__all__ = ['UrbanAcousticScenes', 'UrbanAudioVisualScenes']
class UrbanAcousticScenes(AudioClassificationDataset):
"""
TAU Urban Acoustic Scenes 2020 Mobile Development dataset contains recordings from
12 European cities in 10 different acoustic scenes using 4 different devices.
Additionally, synthetic data for 11 mobile devices was created based on the original
recordings. Of the 12 cities, two are present only in the evaluation set.
Reference:
A multi-device dataset for urban acoustic scene classification
https://arxiv.org/abs/1807.09840
"""
source_url = 'https://zenodo.org/record/3819968/files/'
base_name = 'TAU-urban-acoustic-scenes-2020-mobile-development'
archieves = [
{
'url': source_url + base_name + '.meta.zip',
'md5': '6eae9db553ce48e4ea246e34e50a3cf5',
},
{
'url': source_url + base_name + '.audio.1.zip',
'md5': 'b1e85b8a908d3d6a6ab73268f385d5c8',
},
{
'url': source_url + base_name + '.audio.2.zip',
'md5': '4310a13cc2943d6ce3f70eba7ba4c784',
},
{
'url': source_url + base_name + '.audio.3.zip',
'md5': 'ed38956c4246abb56190c1e9b602b7b8',
},
{
'url': source_url + base_name + '.audio.4.zip',
'md5': '97ab8560056b6816808dedc044dcc023',
},
{
'url': source_url + base_name + '.audio.5.zip',
'md5': 'b50f5e0bfed33cd8e52cb3e7f815c6cb',
},
{
'url': source_url + base_name + '.audio.6.zip',
'md5': 'fbf856a3a86fff7520549c899dc94372',
},
{
'url': source_url + base_name + '.audio.7.zip',
'md5': '0dbffe7b6e45564da649378723284062',
},
{
'url': source_url + base_name + '.audio.8.zip',
'md5': 'bb6f77832bf0bd9f786f965beb251b2e',
},
{
'url': source_url + base_name + '.audio.9.zip',
'md5': 'a65596a5372eab10c78e08a0de797c9e',
},
{
'url': source_url + base_name + '.audio.10.zip',
'md5': '2ad595819ffa1d56d2de4c7ed43205a6',
},
{
'url': source_url + base_name + '.audio.11.zip',
'md5': '0ad29f7040a4e6a22cfd639b3a6738e5',
},
{
'url': source_url + base_name + '.audio.12.zip',
'md5': 'e5f4400c6b9697295fab4cf507155a2f',
},
{
'url': source_url + base_name + '.audio.13.zip',
'md5': '8855ab9f9896422746ab4c5d89d8da2f',
},
{
'url': source_url + base_name + '.audio.14.zip',
'md5': '092ad744452cd3e7de78f988a3d13020',
},
{
'url': source_url + base_name + '.audio.15.zip',
'md5': '4b5eb85f6592aebf846088d9df76b420',
},
{
'url': source_url + base_name + '.audio.16.zip',
'md5': '2e0a89723e58a3836be019e6996ae460',
},
]
label_list = [
'airport', 'shopping_mall', 'metro_station', 'street_pedestrian',
'public_square', 'street_traffic', 'tram', 'bus', 'metro', 'park'
]
meta = os.path.join(base_name, 'meta.csv')
meta_info = collections.namedtuple('META_INFO', (
'filename', 'scene_label', 'identifier', 'source_label'))
subset_meta = {
'train': os.path.join(base_name, 'evaluation_setup', 'fold1_train.csv'),
'dev':
os.path.join(base_name, 'evaluation_setup', 'fold1_evaluate.csv'),
'test': os.path.join(base_name, 'evaluation_setup', 'fold1_test.csv'),
}
subset_meta_info = collections.namedtuple('SUBSET_META_INFO',
('filename', 'scene_label'))
audio_path = os.path.join(base_name, 'audio')
def __init__(self, mode: str='train', feat_type: str='raw', **kwargs):
"""
        Args:
            mode (:obj:`str`, `optional`, defaults to `train`):
                It identifies the dataset mode (train or dev).
            feat_type (:obj:`str`, `optional`, defaults to `raw`):
                It identifies the feature type to extract from an audio file.
"""
files, labels = self._get_data(mode)
super(UrbanAcousticScenes, self).__init__(
files=files, labels=labels, feat_type=feat_type, **kwargs)
def _get_meta_info(self, subset: str=None,
skip_header: bool=True) -> List[collections.namedtuple]:
if subset is None:
meta_file = self.meta
meta_info = self.meta_info
else:
assert subset in self.subset_meta, f'Subset must be one in {list(self.subset_meta.keys())}, but got {subset}.'
meta_file = self.subset_meta[subset]
meta_info = self.subset_meta_info
ret = []
with open(os.path.join(DATA_HOME, meta_file), 'r') as rf:
lines = rf.readlines()[1:] if skip_header else rf.readlines()
for line in lines:
ret.append(meta_info(*line.strip().split('\t')))
return ret
def _get_data(self, mode: str) -> Tuple[List[str], List[int]]:
if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \
not os.path.isfile(os.path.join(DATA_HOME, self.meta)):
download_and_decompress(self.archieves, DATA_HOME)
meta_info = self._get_meta_info(subset=mode, skip_header=True)
files = []
labels = []
for sample in meta_info:
filename, label = sample[:2]
filename = os.path.basename(filename)
target = self.label_list.index(label)
files.append(os.path.join(DATA_HOME, self.audio_path, filename))
labels.append(int(target))
return files, labels
class UrbanAudioVisualScenes(AudioClassificationDataset):
"""
TAU Urban Audio Visual Scenes 2021 Development dataset contains synchronized audio
and video recordings from 12 European cities in 10 different scenes.
This dataset consists of 10-seconds audio and video segments from 10
acoustic scenes. The total amount of audio in the development set is 34 hours.
Reference:
A Curated Dataset of Urban Scenes for Audio-Visual Scene Analysis
https://arxiv.org/abs/2011.00030
"""
source_url = 'https://zenodo.org/record/4477542/files/'
base_name = 'TAU-urban-audio-visual-scenes-2021-development'
archieves = [
{
'url': source_url + base_name + '.meta.zip',
'md5': '76e3d7ed5291b118372e06379cb2b490',
},
{
'url': source_url + base_name + '.audio.1.zip',
'md5': '186f6273f8f69ed9dbdc18ad65ac234f',
},
{
'url': source_url + base_name + '.audio.2.zip',
'md5': '7fd6bb63127f5785874a55aba4e77aa5',
},
{
'url': source_url + base_name + '.audio.3.zip',
'md5': '61396bede29d7c8c89729a01a6f6b2e2',
},
{
'url': source_url + base_name + '.audio.4.zip',
'md5': '6ddac89717fcf9c92c451868eed77fe1',
},
{
'url': source_url + base_name + '.audio.5.zip',
'md5': 'af4820756cdf1a7d4bd6037dc034d384',
},
{
'url': source_url + base_name + '.audio.6.zip',
'md5': 'ebd11ec24411f2a17a64723bd4aa7fff',
},
{
'url': source_url + base_name + '.audio.7.zip',
'md5': '2be39a76aeed704d5929d020a2909efd',
},
{
'url': source_url + base_name + '.audio.8.zip',
'md5': '972d8afe0874720fc2f28086e7cb22a9',
},
]
label_list = [
'airport', 'shopping_mall', 'metro_station', 'street_pedestrian',
'public_square', 'street_traffic', 'tram', 'bus', 'metro', 'park'
]
meta_base_path = os.path.join(base_name, base_name + '.meta')
meta = os.path.join(meta_base_path, 'meta.csv')
meta_info = collections.namedtuple('META_INFO', (
'filename_audio', 'filename_video', 'scene_label', 'identifier'))
subset_meta = {
'train':
os.path.join(meta_base_path, 'evaluation_setup', 'fold1_train.csv'),
'dev':
os.path.join(meta_base_path, 'evaluation_setup', 'fold1_evaluate.csv'),
'test':
os.path.join(meta_base_path, 'evaluation_setup', 'fold1_test.csv'),
}
subset_meta_info = collections.namedtuple('SUBSET_META_INFO', (
'filename_audio', 'filename_video', 'scene_label'))
audio_path = os.path.join(base_name, 'audio')
def __init__(self, mode: str='train', feat_type: str='raw', **kwargs):
"""
        Args:
            mode (:obj:`str`, `optional`, defaults to `train`):
                It identifies the dataset mode (train or dev).
            feat_type (:obj:`str`, `optional`, defaults to `raw`):
                It identifies the feature type to extract from an audio file.
"""
files, labels = self._get_data(mode)
super(UrbanAudioVisualScenes, self).__init__(
files=files, labels=labels, feat_type=feat_type, **kwargs)
def _get_meta_info(self, subset: str=None,
skip_header: bool=True) -> List[collections.namedtuple]:
if subset is None:
meta_file = self.meta
meta_info = self.meta_info
else:
assert subset in self.subset_meta, f'Subset must be one in {list(self.subset_meta.keys())}, but got {subset}.'
meta_file = self.subset_meta[subset]
meta_info = self.subset_meta_info
ret = []
with open(os.path.join(DATA_HOME, meta_file), 'r') as rf:
lines = rf.readlines()[1:] if skip_header else rf.readlines()
for line in lines:
ret.append(meta_info(*line.strip().split('\t')))
return ret
def _get_data(self, mode: str) -> Tuple[List[str], List[int]]:
if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \
not os.path.isfile(os.path.join(DATA_HOME, self.meta)):
download_and_decompress(self.archieves,
os.path.join(DATA_HOME, self.base_name))
meta_info = self._get_meta_info(subset=mode, skip_header=True)
files = []
labels = []
for sample in meta_info:
filename, _, label = sample[:3]
filename = os.path.basename(filename)
target = self.label_list.index(label)
files.append(os.path.join(DATA_HOME, self.audio_path, filename))
labels.append(int(target))
return files, labels
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import codecs
import collections
import json
import os
from typing import Dict
from paddle.io import Dataset
from tqdm import tqdm
from ..backends import load as load_audio
from ..utils.download import download_and_decompress
from ..utils.env import DATA_HOME
from ..utils.log import logger
from .dataset import feat_funcs
__all__ = ['LIBRISPEECH']
class LIBRISPEECH(Dataset):
"""
LibriSpeech is a corpus of approximately 1000 hours of 16kHz read English speech,
prepared by Vassil Panayotov with the assistance of Daniel Povey. The data is
derived from read audiobooks from the LibriVox project, and has been carefully
segmented and aligned.
Reference:
LIBRISPEECH: AN ASR CORPUS BASED ON PUBLIC DOMAIN AUDIO BOOKS
http://www.danielpovey.com/files/2015_icassp_librispeech.pdf
"""
source_url = 'http://www.openslr.org/resources/12/'
archieves = [
{
'url': source_url + 'train-clean-100.tar.gz',
'md5': '2a93770f6d5c6c964bc36631d331a522',
},
{
'url': source_url + 'train-clean-360.tar.gz',
'md5': 'c0e676e450a7ff2f54aeade5171606fa',
},
{
'url': source_url + 'train-other-500.tar.gz',
'md5': 'd1a0fd59409feb2c614ce4d30c387708',
},
{
'url': source_url + 'dev-clean.tar.gz',
'md5': '42e2234ba48799c1f50f24a7926300a1',
},
{
'url': source_url + 'dev-other.tar.gz',
'md5': 'c8d0bcc9cca99d4f8b62fcc847357931',
},
{
'url': source_url + 'test-clean.tar.gz',
'md5': '32fa31d27d2e1cad72775fee3f4849a9',
},
{
'url': source_url + 'test-other.tar.gz',
'md5': 'fb5a50374b501bb3bac4815ee91d3135',
},
]
speaker_meta = os.path.join('LibriSpeech', 'SPEAKERS.TXT')
utt_info = collections.namedtuple('META_INFO', (
'file_path', 'utt_id', 'text', 'spk_id', 'spk_gender'))
audio_path = 'LibriSpeech'
manifest_path = os.path.join('LibriSpeech', 'manifest')
subset = [
        'train-clean-100', 'train-clean-360', 'train-other-500', 'dev-clean',
'dev-other', 'test-clean', 'test-other'
]
def __init__(self,
subset: str='train-clean-100',
feat_type: str='raw',
**kwargs):
assert subset in self.subset, 'Dataset subset must be one in {}, but got {}'.format(
self.subset, subset)
self.subset = subset
self.feat_type = feat_type
self.feat_config = kwargs
self._data = self._get_data()
super(LIBRISPEECH, self).__init__()
def _get_speaker_info(self) -> Dict[str, str]:
ret = {}
with open(os.path.join(DATA_HOME, self.speaker_meta), 'r') as rf:
for line in rf.readlines():
if ';' in line: # Skip dataset abstract
continue
spk_id, gender = map(str.strip,
line.split('|')[:2]) # spk_id, gender
ret.update({spk_id: gender})
return ret
def _get_text_info(self, trans_file) -> Dict[str, str]:
ret = {}
with open(trans_file, 'r') as rf:
for line in rf.readlines():
utt_id, text = map(str.strip, line.split(' ',
1)) # utt_id, text
ret.update({utt_id: text})
return ret
def _get_data(self):
if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \
not os.path.isfile(os.path.join(DATA_HOME, self.speaker_meta)):
download_and_decompress(self.archieves, DATA_HOME,
len(self.archieves))
# Speaker info
speaker_info = self._get_speaker_info()
# Text info
text_info = {}
for root, _, files in os.walk(
os.path.join(DATA_HOME, self.audio_path, self.subset)):
for file in files:
if file.endswith('.trans.txt'):
text_info.update(
self._get_text_info(os.path.join(root, file)))
data = []
for root, _, files in os.walk(
os.path.join(DATA_HOME, self.audio_path, self.subset)):
for file in files:
if file.endswith('.flac'):
utt_id = os.path.splitext(file)[0]
spk_id = utt_id.split('-')[0]
if utt_id not in text_info \
or spk_id not in speaker_info : # Skip samples with incomplete data
continue
file_path = os.path.join(root, file)
text = text_info[utt_id]
spk_gender = speaker_info[spk_id]
data.append(
self.utt_info(file_path, utt_id, text, spk_id,
spk_gender))
return data
def _convert_to_record(self, idx: int):
sample = self._data[idx]
record = {}
# To show all fields in a namedtuple: `type(sample)._fields`
for field in type(sample)._fields:
record[field] = getattr(sample, field)
waveform, sr = load_audio(
sample[0]) # The first element of sample is file path
feat_func = feat_funcs[self.feat_type]
feat = feat_func(
waveform, sample_rate=sr,
**self.feat_config) if feat_func else waveform
record.update({'feat': feat, 'duration': len(waveform) / sr})
return record
def create_manifest(self, prefix='manifest'):
if not os.path.isdir(os.path.join(DATA_HOME, self.manifest_path)):
os.makedirs(os.path.join(DATA_HOME, self.manifest_path))
manifest_file = os.path.join(DATA_HOME, self.manifest_path,
f'{prefix}.{self.subset}')
with codecs.open(manifest_file, 'w', 'utf-8') as f:
for idx in tqdm(range(len(self))):
record = self._convert_to_record(idx)
record_line = json.dumps(
{
'utt': record['utt_id'],
'feat': record['file_path'],
'feat_shape': (record['duration'], ),
'text': record['text'],
'spk': record['spk_id'],
'gender': record['spk_gender'],
},
ensure_ascii=False)
f.write(record_line + '\n')
logger.info(f'Manifest file {manifest_file} created.')
def __getitem__(self, idx):
record = self._convert_to_record(idx)
return tuple(record.values())
def __len__(self):
return len(self._data)
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import collections
import os
import random
from typing import List
from typing import Tuple
from ..utils.download import download_and_decompress
from ..utils.env import DATA_HOME
from .dataset import AudioClassificationDataset
__all__ = ['RAVDESS']
class RAVDESS(AudioClassificationDataset):
"""
    The RAVDESS contains recordings of 24 professional actors (12 female, 12 male),
    vocalizing two lexically-matched statements in a neutral North American accent.
    Speech emotions include calm, happy, sad, angry, fearful, surprise, and disgust
    expressions. Each expression is produced at two levels of emotional intensity
    (normal, strong), with an additional neutral expression.
Reference:
The Ryerson Audio-Visual Database of Emotional Speech and Song (RAVDESS):
A dynamic, multimodal set of facial and vocal expressions in North American English
https://doi.org/10.1371/journal.pone.0196391
"""
archieves = [
{
'url':
'https://zenodo.org/record/1188976/files/Audio_Song_Actors_01-24.zip',
'md5':
'5411230427d67a21e18aa4d466e6d1b9',
},
{
'url':
'https://zenodo.org/record/1188976/files/Audio_Speech_Actors_01-24.zip',
'md5':
'bc696df654c87fed845eb13823edef8a',
},
]
label_list = [
'neutral', 'calm', 'happy', 'sad', 'angry', 'fearful', 'disgust',
'surprised'
]
meta_info = collections.namedtuple(
'META_INFO', ('modality', 'vocal_channel', 'emotion',
                      'emotion_intensity', 'statement', 'repetition', 'actor'))
speech_path = os.path.join(DATA_HOME, 'Audio_Speech_Actors_01-24')
song_path = os.path.join(DATA_HOME, 'Audio_Song_Actors_01-24')
def __init__(self,
mode='train',
seed=0,
n_folds=5,
split=1,
feat_type='raw',
**kwargs):
"""
        Args:
            mode (:obj:`str`, `optional`, defaults to `train`):
                It identifies the dataset mode (train or dev).
            seed (:obj:`int`, `optional`, defaults to 0):
                Random seed used to shuffle the samples.
            n_folds (:obj:`int`, `optional`, defaults to 5):
                Split the dataset into n folds: 1 fold for the dev dataset and n-1 for the train dataset.
            split (:obj:`int`, `optional`, defaults to 1):
                It specifies which fold is used as the dev dataset.
            feat_type (:obj:`str`, `optional`, defaults to `raw`):
                It identifies the feature type to extract from an audio file.
"""
        assert split <= n_folds, f'The selected split should not be larger than n_folds, but got {split} > {n_folds}'
files, labels = self._get_data(mode, seed, n_folds, split)
super(RAVDESS, self).__init__(
files=files, labels=labels, feat_type=feat_type, **kwargs)
def _get_meta_info(self, files) -> List[collections.namedtuple]:
ret = []
for file in files:
basename_without_extend = os.path.basename(file)[:-4]
ret.append(self.meta_info(*basename_without_extend.split('-')))
return ret
def _get_data(self, mode, seed, n_folds,
split) -> Tuple[List[str], List[int]]:
if not os.path.isdir(self.speech_path) and not os.path.isdir(
self.song_path):
download_and_decompress(self.archieves, DATA_HOME)
wav_files = []
for root, _, files in os.walk(self.speech_path):
for file in files:
if file.endswith('.wav'):
wav_files.append(os.path.join(root, file))
for root, _, files in os.walk(self.song_path):
for file in files:
if file.endswith('.wav'):
wav_files.append(os.path.join(root, file))
random.seed(seed) # shuffle samples to split data
random.shuffle(
wav_files
) # make sure using the same seed to create train and dev dataset
meta_info = self._get_meta_info(wav_files)
files = []
labels = []
n_samples_per_fold = len(meta_info) // n_folds
for idx, sample in enumerate(meta_info):
_, _, emotion, _, _, _, _ = sample
target = int(emotion) - 1
fold = idx // n_samples_per_fold + 1
if mode == 'train' and int(fold) != split:
files.append(wav_files[idx])
labels.append(target)
if mode != 'train' and int(fold) == split:
files.append(wav_files[idx])
labels.append(target)
return files, labels
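A minimal sketch of the fold-based split implemented above (the import path `paddleaudio.datasets.ravdess` is an assumption, as with the other dataset modules): with `n_folds=5` and `split=1`, fold 1 is held out as the dev set and the remaining folds form the train set. Both sides must use the same `seed` so the shuffled file order matches:

```python
from paddleaudio.datasets.ravdess import RAVDESS

train_set = RAVDESS(mode='train', seed=0, n_folds=5, split=1)
dev_set = RAVDESS(mode='dev', seed=0, n_folds=5, split=1)
print(len(train_set), len(dev_set))  # roughly a 4:1 split
```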
# PaddleAudio Testing Guide
# Testing
First, clone the project:
```
git clone https://github.com/PaddlePaddle/models.git
```
Then install the project in your virtual environment.
```
cd models/PaddleAudio
python setup.py bdist_wheel
pip install -e .[dev]
```
The requirements for testing will be installed along with PaddleAudio.
Now run
```
pytest test
```
If it goes well, you will see output like this:
```
platform linux -- Python 3.7.10, pytest-6.2.4, py-1.10.0, pluggy-0.13.1
rootdir: ./models/PaddleAudio
plugins: hydra-core-1.0.6
collected 16 items
test/unit_test/test_backend.py ........... [ 68%]
test/unit_test/test_features.py ..... [100%]
==================================================== warnings summary ====================================================
.
.
.
-- Docs: https://docs.pytest.org/en/stable/warnings.html
============================================ 16 passed, 11 warnings in 6.76s =============================================
```
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import librosa
import numpy as np
import paddleaudio
import pytest
TEST_FILE = './test/data/test_audio.wav'
def relative_err(a, b, real=True):
"""compute relative error of two matrices or vectors"""
if real:
return np.sum((a - b)**2) / (EPS + np.sum(a**2) + np.sum(b**2))
else:
err = np.sum((a.real - b.real)**2) / \
(EPS + np.sum(a.real**2) + np.sum(b.real**2))
err += np.sum((a.imag - b.imag)**2) / \
(EPS + np.sum(a.imag**2) + np.sum(b.imag**2))
return err
@pytest.mark.filterwarnings("ignore::DeprecationWarning")
def load_audio():
    x, r = librosa.load(TEST_FILE, sr=16000)
    print(f'librosa: mean: {np.mean(x)}, std:{np.std(x)}')
    return x, r


# start testing
x, r = load_audio()
EPS = 1e-8


def test_load():
    s, r = paddleaudio.load(TEST_FILE, sr=16000)
    assert r == 16000
    assert s.dtype == 'float32'
    s, r = paddleaudio.load(
        TEST_FILE, sr=16000, offset=1, duration=2, dtype='int16')
    assert len(s) / r == 2.0
    assert r == 16000
    assert s.dtype == 'int16'


def test_depth_convert():
    y = paddleaudio.depth_convert(x, 'int16')
    assert len(y) == len(x)
    assert y.dtype == 'int16'
    assert np.max(y) <= 32767
    assert np.min(y) >= -32768
    assert np.std(y) > EPS
    y = paddleaudio.depth_convert(x, 'int8')
    assert len(y) == len(x)
    assert y.dtype == 'int8'
    assert np.max(y) <= 127
    assert np.min(y) >= -128
    assert np.std(y) > EPS


# test case for resample
rs_test_data = [
    (32000, 'kaiser_fast'),
    (16000, 'kaiser_fast'),
    (8000, 'kaiser_fast'),
    (32000, 'kaiser_best'),
    (16000, 'kaiser_best'),
    (8000, 'kaiser_best'),
    (22050, 'kaiser_best'),
    (44100, 'kaiser_best'),
]


@pytest.mark.parametrize('sr,mode', rs_test_data)
def test_resample(sr, mode):
    y = paddleaudio.resample(x, 16000, sr, mode=mode)
    factor = sr / 16000
    err = relative_err(len(y), len(x) * factor)
    print('err:', err)
    assert err < EPS


def test_normalize():
    y = paddleaudio.normalize(x, norm_type='linear', mul_factor=0.5)
    assert np.max(y) < 0.5 + EPS
    y = paddleaudio.normalize(x, norm_type='linear', mul_factor=2.0)
    assert np.max(y) <= 2.0 + EPS
    y = paddleaudio.normalize(x, norm_type='gaussian', mul_factor=1.0)
    print('np.std(y):', np.std(y))
    assert np.abs(np.std(y) - 1.0) < EPS


if __name__ == '__main__':
    test_load()
    test_depth_convert()
    test_resample(22050, 'kaiser_fast')
    test_normalize()
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import librosa
import numpy as np
import paddleaudio as pa
import pytest
@pytest.mark.filterwarnings("ignore::DeprecationWarning")
def load_audio():
    x, r = librosa.load('./test/data/test_audio.wav')
    #x, r = librosa.load('../data/test_audio.wav', sr=16000)
    return x, r


# start testing
x, r = load_audio()
EPS = 1e-8


def relative_err(a, b, real=True):
    """compute relative error of two matrices or vectors"""
    if real:
        return np.sum((a - b)**2) / (EPS + np.sum(a**2) + np.sum(b**2))
    else:
        err = np.sum((a.real - b.real)**2) / (
            EPS + np.sum(a.real**2) + np.sum(b.real**2))
        err += np.sum((a.imag - b.imag)**2) / (
            EPS + np.sum(a.imag**2) + np.sum(b.imag**2))
        return err
@pytest.mark.filterwarnings("ignore::DeprecationWarning")
def test_melspectrogram():
    a = pa.melspectrogram(
        x,
        window_size=512,
        sr=16000,
        hop_length=320,
        n_mels=64,
        fmin=50,
        to_db=False, )
    b = librosa.feature.melspectrogram(
        x,
        sr=16000,
        n_fft=512,
        win_length=512,
        hop_length=320,
        n_mels=64,
        fmin=50)
    assert relative_err(a, b) < EPS


@pytest.mark.filterwarnings("ignore::DeprecationWarning")
def test_melspectrogram_db():
    a = pa.melspectrogram(
        x,
        window_size=512,
        sr=16000,
        hop_length=320,
        n_mels=64,
        fmin=50,
        to_db=True,
        ref=1.0,
        amin=1e-10,
        top_db=None)
    b = librosa.feature.melspectrogram(
        x,
        sr=16000,
        n_fft=512,
        win_length=512,
        hop_length=320,
        n_mels=64,
        fmin=50)
    b = pa.power_to_db(b, ref=1.0, amin=1e-10, top_db=None)
    assert relative_err(a, b) < EPS


@pytest.mark.filterwarnings("ignore::DeprecationWarning")
def test_stft():
    a = pa.stft(x, n_fft=1024, hop_length=320, win_length=512)
    b = librosa.stft(x, n_fft=1024, hop_length=320, win_length=512)
    assert a.shape == b.shape
    assert relative_err(a, b, real=False) < EPS


@pytest.mark.filterwarnings("ignore::DeprecationWarning")
def test_split_frames():
    a = librosa.util.frame(x, frame_length=512, hop_length=320)
    b = pa.split_frames(x, frame_length=512, hop_length=320)
    assert relative_err(a, b) < EPS


@pytest.mark.filterwarnings("ignore::DeprecationWarning")
def test_mfcc():
    kwargs = {
        'window_size': 512,
        'hop_length': 320,
        'n_mels': 64,
        'fmin': 50,
        'to_db': False
    }
    a = pa.mfcc(
        x,
        #sample_rate=16000,
        spect=None,
        n_mfcc=20,
        dct_type=2,
        norm='ortho',
        lifter=0,
        **kwargs)
    S = librosa.feature.melspectrogram(
        x,
        sr=16000,
        n_fft=512,
        win_length=512,
        hop_length=320,
        n_mels=64,
        fmin=50)
    b = librosa.feature.mfcc(
        x, sr=16000, S=S, n_mfcc=20, dct_type=2, norm='ortho', lifter=0)
    assert relative_err(a, b) < EPS


if __name__ == '__main__':
    test_melspectrogram()
    test_melspectrogram_db()
    test_stft()
    test_split_frames()
    test_mfcc()
#! /usr/bin/env bash
TARGET_DIR=${MAIN_ROOT}/dataset/voxforge
mkdir -p ${TARGET_DIR}
# download data, generate manifests
python ${MAIN_ROOT}/dataset/voxforge/voxforge.py \
--manifest_prefix="${TARGET_DIR}/manifest" \
--target_dir="${TARGET_DIR}" \
--is_merge_dialect=True \
......
@@ -34,7 +34,9 @@ def evaluate(args, fastspeech2_config, pwg_config):
    sentences = []
    with open(args.text, 'rt') as f:
        for line in f:
            items = line.strip().split()
            utt_id = items[0]
            sentence = "".join(items[1:])
            sentences.append((utt_id, sentence))
    with open(args.phones_dict, "r") as f:
......
@@ -6,7 +6,7 @@ We borrowed a lot of code from these repos to build `model` and `engine`, thanks
- Apache-2.0 License
- python/shell `utils`
- kaldi feat preprocessing
- data pipeline and `transformer`
- some tts models, like `fastspeech2` and GAN-based `vocoder`
* [wenet](https://github.com/wenet-e2e/wenet/blob/main/LICENSE)
......
# Released Models
## Speech-to-Text Models
### Acoustic Model Released in paddle 2.X
Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER | Hours of speech | example link
:-------------:| :------------:| :-----: | -----: | :----------------- |:--------- | :---------- | :--------- | :-----------
[Ds2 Online Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/aishell_ds2_online_cer8.00_release.tar.gz) | Aishell Dataset | Char-based | 345 MB | 2 Conv + 5 LSTM layers with only forward direction | 0.080 |-| 151 h | [Ds2 Online Aishell S0 Example](../../examples/aishell/asr0)
[Ds2 Offline Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/ds2.model.tar.gz)| Aishell Dataset | Char-based | 306 MB | 2 Conv + 3 bidirectional GRU layers| 0.064 |-| 151 h | [Ds2 Offline Aishell S0 Example](../../examples/aishell/asr0)
[Conformer Online Aishell ASR1 Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s1/aishell.chunk.release.tar.gz) | Aishell Dataset | Char-based | 283 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0594 |-| 151 h | [Conformer Online Aishell S1 Example](../../examples/aishell/s1)
[Conformer Offline Aishell ASR1 Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s1/aishell.release.tar.gz) | Aishell Dataset | Char-based | 284 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0547 |-| 151 h | [Conformer Offline Aishell S1 Example](../../examples/aishell/s1)
[Conformer Librispeech ASR1 Model](https://deepspeech.bj.bcebos.com/release2.1/librispeech/s1/conformer.release.tar.gz) | Librispeech Dataset | subword-based | 287 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring |-| 0.0325 | 960 h | [Conformer Librispeech S1 example](../../example/librispeech/s1)
[Transformer Librispeech ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr1/transformer.model.tar.gz) | Librispeech Dataset | subword-based | 131 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: Attention rescoring |-| 0.0410 | 960 h | [Transformer Librispeech S1 example](../../example/librispeech/s1)
[Transformer Librispeech ASR2 Model](https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr2/transformer.model.tar.gz) | Librispeech Dataset | subword-based | 131 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: JoinCTC w/ LM |-| 0.024 | 960 h | [Transformer Librispeech S2 example](../../example/librispeech/s2)
### Acoustic Model Transformed from paddle 1.8
Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER | Hours of speech
@@ -20,14 +22,15 @@ Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER |
[Ds2 Offline Baidu en8k model](https://deepspeech.bj.bcebos.com/eng_models/baidu_en8k_v1.8_to_v2.x.tar.gz)|Baidu Internal English Dataset| Word-based| 273 MB| 2 Conv + 3 bidirectional GRU layers |-| 0.0541 | 8628 h|
### Language Model Released
Language Model | Training Data | Token-based | Size | Descriptions
:-------------:| :------------:| :-----: | -----: | :-----------------
[English LM](https://deepspeech.bj.bcebos.com/en_lm/common_crawl_00.prune01111.trie.klm) | [CommonCrawl(en.00)](http://web-language-models.s3-website-us-east-1.amazonaws.com/ngrams/en/deduped/en.00.deduped.xz) | Word-based | 8.3 GB | Pruned with 0 1 1 1 1; <br/> About 1.85 billion n-grams; <br/> 'trie' binary with '-a 22 -q 8 -b 8'
[Mandarin LM Small](https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm) | Baidu Internal Corpus | Char-based | 2.8 GB | Pruned with 0 1 2 4 4; <br/> About 0.13 billion n-grams; <br/> 'probing' binary with default settings
[Mandarin LM Large](https://deepspeech.bj.bcebos.com/zh_lm/zhidao_giga.klm) | Baidu Internal Corpus | Char-based | 70.4 GB | No Pruning; <br/> About 3.7 billion n-grams; <br/> 'probing' binary with default settings
## Text-to-Speech Models
### Acoustic Models
Model Type | Dataset| Example Link | Pretrained Models|Static Models|Size(static)
:-------------:| :------------:| :-----: | :-----:| :-----:| :-----:
@@ -40,7 +43,6 @@ FastSpeech2| LJSpeech |[fastspeech2-ljspeech](https://github.com/PaddlePaddle/Pa
FastSpeech2| VCTK |[fastspeech2-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/tts3)|[fastspeech2_nosil_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_vctk_ckpt_0.5.zip)|||
### Vocoders
Model Type | Dataset| Example Link | Pretrained Models| Static Models|Size(static)
:-------------:| :------------:| :-----: | :-----:| :-----:| :-----:
WaveFlow| LJSpeech |[waveflow-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc0)|[waveflow_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/waveflow/waveflow_ljspeech_ckpt_0.3.zip)|||
......
@@ -5,20 +5,6 @@ Parakeet aims to provide a flexible, efficient and state-of-the-art text-to-spee
<img src="../../images/logo.png" width=300 /> <br>
</div>
## News <img src="../../images/news_icon.png" width="40"/>
- Oct-12-2021, Refactored examples code.
- Oct-12-2021, Parallel WaveGAN with LJSpeech. Check [examples/GANVocoder/parallelwave_gan/ljspeech](./examples/GANVocoder/parallelwave_gan/ljspeech).
- Oct-12-2021, FastSpeech2/FastPitch with LJSpeech. Check [examples/fastspeech2/ljspeech](./examples/fastspeech2/ljspeech).
- Sep-14-2021, Reconstruction of TransformerTTS. Check [examples/transformer_tts/ljspeech](./examples/transformer_tts/ljspeech).
- Aug-31-2021, Chinese Text Frontend. Check [examples/text_frontend](./examples/text_frontend).
- Aug-23-2021, FastSpeech2/FastPitch with AISHELL-3. Check [examples/fastspeech2/aishell3](./examples/fastspeech2/aishell3).
- Aug-03-2021, FastSpeech2/FastPitch with CSMSC. Check [examples/fastspeech2/baker](./examples/fastspeech2/baker).
- Jul-19-2021, SpeedySpeech with CSMSC. Check [examples/speedyspeech/baker](./examples/speedyspeech/baker).
- Jul-01-2021, Parallel WaveGAN with CSMSC. Check [examples/GANVocoder/parallelwave_gan/baker](./examples/GANVocoder/parallelwave_gan/baker).
- Jul-01-2021, Montreal-Forced-Aligner. Check [examples/use_mfa](./examples/use_mfa).
- May-07-2021, Voice Cloning in Chinese. Check [examples/tacotron2_aishell3](./examples/tacotron2_aishell3).
## Overview
In order to facilitate exploiting the existing TTS models directly and developing new ones, Parakeet selects typical models and provides their reference implementations in PaddlePaddle. Furthermore, Parakeet abstracts the TTS pipeline and standardizes the procedure of data preprocessing, common module sharing, model configuration, and the process of training and synthesis. The models supported here include Text Frontend, end-to-end Acoustic models and Vocoders:
@@ -38,50 +24,11 @@ In order to facilitate exploiting the existing TTS models directly and developin
- [Transfer Learning from Speaker Verification to Multispeaker Text-to-Speech Synthesis](https://arxiv.org/pdf/1806.04558v4.pdf)
- [【GE2E】Generalized End-to-End Loss for Speaker Verification](https://arxiv.org/abs/1710.10467)
## Setup
It is difficult to install some of this repo's dependent libraries on Windows, so we recommend that you **DO NOT** use Windows; please use `Linux`.
Make sure the library `libsndfile1` is installed, e.g., on Ubuntu:
```bash
sudo apt-get install libsndfile1
```
### Install PaddlePaddle
See [install](https://www.paddlepaddle.org.cn/install/quick) for more details. This repo requires PaddlePaddle **2.1.2** or above.
### Install Parakeet
```bash
git clone https://github.com/PaddlePaddle/Parakeet
cd Parakeet
pip install -e .
```
If some Python dependencies cannot be installed successfully, you can run the following script first
(replace `python3.6` with your own Python version):
```bash
sudo apt install -y python3.6-dev
```
See [install](https://paddle-parakeet.readthedocs.io/en/latest/install.html) for more details.
## Examples
Entry points to the introduction, and to launching training and synthesis, for the different example models:
- [>>> Chinese Text Frontend](./examples/text_frontend)
- [>>> FastSpeech2/FastPitch](./examples/fastspeech2)
- [>>> Montreal-Forced-Aligner](./examples/use_mfa)
- [>>> Parallel WaveGAN](./examples/GANVocoder/parallelwave_gan)
- [>>> SpeedySpeech](./examples/speedyspeech)
- [>>> Tacotron2_AISHELL3](./examples/tacotron2_aishell3)
- [>>> GE2E](./examples/ge2e)
- [>>> WaveFlow](./examples/waveflow)
- [>>> TransformerTTS](./examples/transformer_tts)
- [>>> Tacotron2](./examples/tacotron2)
## Audio samples
### TTS models (Acoustic Model + Neural Vocoder)
Check our [website](https://paddlespeech.readthedocs.io/en/latest/tts/demo.html) for audio samples.
## Released Model
......
@@ -290,7 +290,7 @@ The following is the basic `ArgumentParser`:
1. `--config` is used to support configuration file parsing, and the configuration file itself handles the unique options of each experiment.
2. `--train-metadata` is the path to the training data.
3. `--output-dir` is the directory in which to save the training results (if there are checkpoints in `checkpoints/` of `--output-dir`, it defaults to reloading the newest checkpoint to continue training).
4. `--ngpu` determines the operation mode: it refers to the number of training processes. If `ngpu` > 0, the GPU is used; otherwise the CPU is used.
Developers can refer to the examples in `examples` to write the default configuration file when adding new experiments.
......
@@ -343,6 +343,16 @@
" $$"
]
},
{
"cell_type": "markdown",
"id": "41637c03",
"metadata": {},
"source": [
"## Source Code\n",
"I added comments to [warp-ctc](https://github.com/zh794390558/warp-ctc) and adjusted the way the indices are organized, to make the code easier to understand.\n",
"Comparing the derivation above and the lattice diagram against the code gives a quick understanding of the CTC implementation."
]
},
{
"cell_type": "markdown",
"id": "coordinated-music",
@@ -372,7 +382,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
@@ -386,7 +396,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.0"
},
"toc": {
"base_numbering": 1,
......
@@ -9,7 +9,7 @@ dict_dir=data/lang_char
mkdir -p data
mkdir -p ${dict_dir}
TARGET_DIR=${MAIN_ROOT}/dataset
mkdir -p ${TARGET_DIR}
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
......
@@ -14,7 +14,7 @@ jit_model_export_path=$3
model_type=$4
python3 -u ${BIN_DIR}/export.py \
--ngpu ${ngpu} \
--config ${config_path} \
--checkpoint_path ${ckpt_path_prefix} \
--export_path ${jit_model_export_path} \
......
@@ -19,7 +19,7 @@ if [ $? -ne 0 ]; then
fi
python3 -u ${BIN_DIR}/test.py \
--ngpu ${ngpu} \
--config ${config_path} \
--result_file ${ckpt_prefix}.rsl \
--checkpoint_path ${ckpt_prefix} \
......
@@ -19,7 +19,7 @@ if [ $? -ne 0 ]; then
fi
python3 -u ${BIN_DIR}/test_export.py \
--ngpu ${ngpu} \
--config ${config_path} \
--result_file ${jit_model_export_path}.rsl \
--export_path ${jit_model_export_path} \
......
@@ -13,6 +13,17 @@ ckpt_prefix=$2
model_type=$3
audio_file=$4
mkdir -p data
wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/zh/demo_01_03.wav -P data/
if [ $? -ne 0 ]; then
exit 1
fi
if [ ! -f ${audio_file} ]; then
echo "Please input the right audio_file path"
exit 1
fi
# download language model
bash local/download_lm_ch.sh
if [ $? -ne 0 ]; then
@@ -20,7 +31,7 @@ if [ $? -ne 0 ]; then
fi
python3 -u ${BIN_DIR}/test_hub.py \
--ngpu ${ngpu} \
--config ${config_path} \
--result_file ${ckpt_prefix}.rsl \
--checkpoint_path ${ckpt_prefix} \
......
@@ -21,7 +21,7 @@ if [ ${seed} != 0 ]; then
fi
python3 -u ${BIN_DIR}/train.py \
--ngpu ${ngpu} \
--config ${config_path} \
--output exp/${ckpt_name} \
--model_type ${model_type} \
......
@@ -8,6 +8,7 @@ stop_stage=100
conf_path=conf/deepspeech2.yaml #conf/deepspeech2.yaml or conf/deepspeech2_online.yaml
avg_num=1
model_type=offline # offline or online
audio_file=data/demo_01_03.wav
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
@@ -15,7 +16,6 @@ avg_ckpt=avg_${avg_num}
ckpt=$(basename ${conf_path} | awk -F'.' '{print $1}')
echo "checkpoint name ${ckpt}"
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# prepare data
......
@@ -4,7 +4,7 @@ This example contains code used to train a Transformer or [Conformer](http://arx
## Overview
All the scripts you need are in ```run.sh```. There are several stages in ```run.sh```, and each stage has its own function.
| Stage | Function |
| :---- | :----------------------------------------------------------- |
@@ -16,7 +16,7 @@ All the scirpts you need are in the ```run.sh```. There are several stages in th
| 5 | Infer the single audio file |
You can choose to run a range of stages by setting ```stage``` and ```stop_stage```.
For example, if you want to execute the code in stage 2 and stage 3, you can run this script:
@@ -33,19 +33,17 @@ bash run.sh --stage 0 --stop_stage 0
The document below describes the scripts in ```run.sh``` in detail.
## The Environment Variables
The path.sh file contains the environment variables.
```bash
source path.sh
```
This script needs to be run first. And another script is also needed:
```bash
source ${MAIN_ROOT}/utils/parse_options.sh
@@ -57,10 +55,10 @@ It will support the way of using```--varibale value``` in the shell scripts.
## The Local Variables
Some local variables are set in ```run.sh```.
```gpus``` denotes the GPU numbers you want to use. If you set ```gpus=```, it means you only use the CPU.
```stage``` denotes the number of the stage you want to start from in the experiments.
```stop_stage``` denotes the number of the stage you want to end at in the experiments.
```conf_path``` denotes the config path of the model.
@@ -71,7 +69,7 @@ Some local variables are set in the ```run.sh```.
```ckpt``` denotes the checkpoint prefix of the model, e.g. "conformer"
You can set the local variables (except ```ckpt```) when you use ```run.sh```.
For example, you can set ```gpus``` and ```avg_num``` on the command line:
@@ -83,7 +81,7 @@ bash run.sh --gpus 0,1 --avg_num 20
## Stage 0: Data Processing
To use this example, you need to process the data first, and you can use stage 0 in ```run.sh``` to do this. The code is shown below:
```bash
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
@@ -129,12 +127,12 @@ data/
## Stage 1: Model Training
If you want to train the model, you can use stage 1 in ```run.sh```. The code is shown below.
```bash
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# train model, all `ckpt` under `exp` dir
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt}
fi
```
@@ -149,14 +147,14 @@ or you can run these scripts in the command line (only use CPU).
```bash
source path.sh
bash ./local/data.sh
CUDA_VISIBLE_DEVICES= ./local/train.sh conf/conformer.yaml conformer
```
## Stage 2: Top-k Models Averaging
After training the model, we need to get the final model for testing and inference. In every epoch, the model checkpoint is saved, so we can choose the best model from them based on the validation loss, or we can sort them and average the parameters of the top-k models to get the final model (a sketch of this averaging follows below). We can use stage 2 to do this, and the code is shown below:
```bash
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
@@ -166,7 +164,7 @@ After training the model, we need to get the final model for testing and infere
```
The ```avg.sh``` script is in ```../../../utils/```, which is defined in ```path.sh```.
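For intuition, the averaging is just an element-wise mean over the parameter tensors of the selected checkpoints. A minimal sketch of the idea (assuming the checkpoints are plain `.pdparams` state dicts; the paths here are hypothetical):
```python
import paddle

def average_checkpoints(ckpt_paths, out_path):
    """Average parameter tensors across several checkpoints."""
    avg = None
    for path in ckpt_paths:
        state = paddle.load(path)
        if avg is None:
            avg = {k: v.astype('float64') for k, v in state.items()}
        else:
            for k, v in state.items():
                avg[k] += v.astype('float64')
    # divide by the number of checkpoints and restore the original dtype
    avg = {k: (v / len(ckpt_paths)).astype('float32') for k, v in avg.items()}
    paddle.save(avg, out_path)
```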
If you want to get the final model, you can use the script below to execute stage 0, stage 1, and stage 2:
```bash
bash run.sh --stage 0 --stop_stage 2
@@ -185,7 +183,7 @@ avg.sh best exp/conformer/checkpoints 20
## Stage 3: Model Testing
The test stage evaluates the model performance. The code of the test stage is shown below:
```bash
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
@@ -194,7 +192,7 @@ The test stage is to evaluate the model performance.. The code of test stage is
fi
```
If you want to train a model and test it, you can use the script below to execute stage 0, stage 1, stage 2, and stage 3:
```bash
bash run.sh --stage 0 --stop_stage 3
@@ -228,7 +226,7 @@ wget https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/transformer.model.tar.g
```
Use the ```tar``` command to unpack the model, and then you can use the script to test the model.
For example:
@@ -238,7 +236,7 @@ tar xzvf transformer.model.tar.gz
source path.sh
# If you have processed the data and obtained the manifest file, you can skip the following 2 steps
bash local/data.sh --stage -1 --stop_stage -1
bash local/data.sh --stage 2 --stop_stage 2
CUDA_VISIBLE_DEVICES= ./local/test.sh conf/transformer.yaml exp/transformer/checkpoints/avg_20
```
@@ -291,7 +289,7 @@ If you want to get the alignment between the audio and the text, you can use the
fi
```
If you want to train the model, test it and do the alignment, you can use the script below to execute stages 0 through 4:
```bash
bash run.sh --stage 0 --stop_stage 4
@@ -320,26 +318,26 @@ CUDA_VISIBLE_DEVICES= ./local/align.sh conf/conformer.yaml exp/conformer/checkpo
## Stage 5: Single Audio File Inference
In some situations, you want to use the trained model to run inference on a single audio file. You can use stage 5 for this. The code is shown below:
```bash
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
# test a single .wav file
CUDA_VISIBLE_DEVICES=0 ./local/test_hub.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1
fi
```
You can train the model by yourself using ```bash run.sh --stage 0 --stop_stage 3```, or you can download the pretrained model through the script below:
```bash
wget https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/transformer.model.tar.gz
tar xzvf transformer.model.tar.gz
```
You need to prepare an audio file; please confirm that the sample rate of the audio is 16 kHz (a quick check is sketched after the command below). Assuming the path of the audio file is ```data/test_audio.wav```, you can get the result by running the script below.
```bash
CUDA_VISIBLE_DEVICES= ./local/test_hub.sh conf/transformer.yaml exp/transformer/checkpoints/avg_20 data/test_audio.wav
```
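A quick way to verify the sample rate before running inference (a sketch using librosa, which this repo already depends on):
```python
import librosa

wav, sr = librosa.load('data/test_audio.wav', sr=None)  # sr=None keeps the native rate
assert sr == 16000, f'expected 16 kHz audio, got {sr} Hz'
```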
......
@@ -18,7 +18,7 @@ mkdir -p ${output_dir}
# align dump in `result_file`
# .tier, .TextGrid dump in `dir of result_file`
python3 -u ${BIN_DIR}/alignment.py \
--ngpu ${ngpu} \
--config ${config_path} \
--result_file ${output_dir}/${type}.align \
--checkpoint_path ${ckpt_prefix} \
......
@@ -8,7 +8,7 @@ dict_dir=data/lang_char
mkdir -p data
mkdir -p ${dict_dir}
TARGET_DIR=${MAIN_ROOT}/dataset
mkdir -p ${TARGET_DIR}
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
......
@@ -13,7 +13,7 @@ ckpt_path_prefix=$2
jit_model_export_path=$3
python3 -u ${BIN_DIR}/export.py \
--ngpu ${ngpu} \
--config ${config_path} \
--checkpoint_path ${ckpt_path_prefix} \
--export_path ${jit_model_export_path}
......
@@ -34,7 +34,7 @@ for type in attention ctc_greedy_search; do
output_dir=${ckpt_prefix}
mkdir -p ${output_dir}
python3 -u ${BIN_DIR}/test.py \
--ngpu ${ngpu} \
--config ${config_path} \
--result_file ${output_dir}/${type}.rsl \
--checkpoint_path ${ckpt_prefix} \
@@ -53,7 +53,7 @@ for type in ctc_prefix_beam_search attention_rescoring; do
output_dir=${ckpt_prefix}
mkdir -p ${output_dir}
python3 -u ${BIN_DIR}/test.py \
--ngpu ${ngpu} \
--config ${config_path} \
--result_file ${output_dir}/${type}.rsl \
--checkpoint_path ${ckpt_prefix} \
......
@@ -12,6 +12,17 @@ config_path=$1
ckpt_prefix=$2
audio_file=$3
mkdir -p data
wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/zh/demo_01_03.wav -P data/
if [ $? -ne 0 ]; then
exit 1
fi
if [ ! -f ${audio_file} ]; then
echo "Please input the right audio_file path"
exit 1
fi
chunk_mode=false
if [[ ${config_path} =~ ^.*chunk_.*yaml$ ]];then
chunk_mode=true
@@ -29,7 +40,7 @@ for type in attention_rescoring; do
output_dir=${ckpt_prefix}
mkdir -p ${output_dir}
python3 -u ${BIN_DIR}/test_hub.py \
--ngpu ${ngpu} \
--config ${config_path} \
--result_file ${output_dir}/${type}.rsl \
--checkpoint_path ${ckpt_prefix} \
......
@@ -9,7 +9,7 @@ lmtype=srilm
source utils/parse_options.sh
data=${MAIN_ROOT}/dataset/${corpus}
lexicon=$data/resource_aishell/lexicon.txt
text=$data/data_aishell/transcript/aishell_transcript_v0.8.txt
......
@@ -29,7 +29,7 @@ mkdir -p exp
python3 -u ${BIN_DIR}/train.py \
--seed ${seed} \
--ngpu ${ngpu} \
--config ${config_path} \
--output exp/${ckpt_name} \
--profiler-options "${profiler_options}" \
......
@@ -7,6 +7,7 @@ stage=0
stop_stage=50
conf_path=conf/conformer.yaml
avg_num=20
audio_file=data/demo_01_03.wav
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
@@ -14,8 +15,6 @@ avg_ckpt=avg_${avg_num}
ckpt=$(basename ${conf_path} | awk -F'.' '{print $1}')
echo "checkpoint name ${ckpt}"
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# prepare data
bash ./local/data.sh || exit -1
......
@@ -17,7 +17,7 @@ tar zxvf data_aishell3.tgz -C data_aishell3
```
### Get MFA Result and Extract
We use [MFA2.x](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for aishell3_fastspeech2.
You can download from here [aishell3_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/with_tone/aishell3_alignment_tone.tar.gz), or train your own MFA model with reference to the [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) (which uses MFA1.x for now) in our repo.
## Get Started
Assume the path to the dataset is `~/datasets/data_aishell3`.
......
@@ -45,7 +45,6 @@ model:
postnet_layers: 5 # number of layers of postnet
postnet_filts: 5 # filter size of conv layers in postnet
postnet_chans: 256 # number of channels of conv layers in postnet
use_masking: True # whether to apply masking for padded part in loss calculation
use_scaled_pos_enc: True # whether to use scaled positional encoding
encoder_normalize_before: True # whether to perform layer normalization before the input
decoder_normalize_before: True # whether to perform layer normalization before the input
......
@@ -45,7 +45,8 @@ We use Montreal Force Aligner 1.0. The label in aishell3 include pinyin,so th
We use [lexicon.txt](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/lexicon.txt) as the lexicon.
You can download the alignment results from here [alignment_aishell3.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/alignment_aishell3.tar.gz), or train your own MFA model with reference to the [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) (which uses MFA1.x for now) in our repo.
```bash
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
......
@@ -18,7 +18,7 @@ tar zxvf data_aishell3.tgz -C data_aishell3
```
### Get MFA Result and Extract
We use [MFA2.x](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for aishell3_fastspeech2.
You can download from here [aishell3_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/with_tone/aishell3_alignment_tone.tar.gz), or train your own MFA model with reference to the [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) (which uses MFA1.x for now) in our repo.
## Pretrained GE2E Model
We use a pretrained GE2E model to generate a speaker embedding for each sentence.
......
@@ -45,7 +45,6 @@ model:
postnet_layers: 5 # number of layers of postnet
postnet_filts: 5 # filter size of conv layers in postnet
postnet_chans: 256 # number of channels of conv layers in postnet
use_masking: True # whether to apply masking for padded part in loss calculation
use_scaled_pos_enc: True # whether to use scaled positional encoding
encoder_normalize_before: True # whether to perform layer normalization before the input
decoder_normalize_before: True # whether to perform layer normalization before the input
......
@@ -15,7 +15,7 @@ tar zxvf data_aishell3.tgz -C data_aishell3
```
### Get MFA Result and Extract
We use [MFA2.x](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for aishell3_fastspeech2.
You can download from here [aishell3_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/with_tone/aishell3_alignment_tone.tar.gz), or train your own MFA model with reference to the [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) (which uses MFA1.x for now) in our repo.
## Get Started
Assume the path to the dataset is `~/datasets/data_aishell3`.
......
@@ -23,7 +23,7 @@ mkdir -p ${output_dir}
# align dump in `result_file`
# .tier, .TextGrid dump in `dir of result_file`
python3 -u ${BIN_DIR}/alignment.py \
--ngpu ${ngpu} \
--config ${config_path} \
--result_file ${output_dir}/${type}.align \
--checkpoint_path ${ckpt_prefix} \
......
@@ -13,7 +13,7 @@ ckpt_path_prefix=$2
jit_model_export_path=$3
python3 -u ${BIN_DIR}/export.py \
--ngpu ${ngpu} \
--config ${config_path} \
--checkpoint_path ${ckpt_path_prefix} \
--export_path ${jit_model_export_path}
......
@@ -28,7 +28,7 @@ for type in attention ctc_greedy_search; do
output_dir=${ckpt_prefix}
mkdir -p ${output_dir}
python3 -u ${BIN_DIR}/test.py \
--ngpu ${ngpu} \
--config ${config_path} \
--result_file ${output_dir}/${type}.rsl \
--checkpoint_path ${ckpt_prefix} \
@@ -47,7 +47,7 @@ for type in ctc_prefix_beam_search attention_rescoring; do
output_dir=${ckpt_prefix}
mkdir -p ${output_dir}
python3 -u ${BIN_DIR}/test.py \
--ngpu ${ngpu} \
--config ${config_path} \
--result_file ${output_dir}/${type}.rsl \
--checkpoint_path ${ckpt_prefix} \
......
@@ -22,7 +22,7 @@ if [ ${seed} != 0 ]; then
fi
python3 -u ${BIN_DIR}/train.py \
--ngpu ${ngpu} \
--config ${config_path} \
--output exp/${ckpt_name} \
--seed ${seed}
......
@@ -7,7 +7,7 @@ Download CSMSC from it's [Official Website](https://test.data-baker.com/data/ind
### Get MFA Result and Extract
We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for SPEEDYSPEECH.
You can download from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model with reference to the [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) in our repo.
## Get Started
Assume the path to the dataset is `~/datasets/BZNSYP`.
......
@@ -7,7 +7,7 @@ Download CSMSC from it's [Official Website](https://test.data-baker.com/data/ind
### Get MFA Result and Extract
We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for fastspeech2.
You can download from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model with reference to the [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) in our repo.
## Get Started
Assume the path to the dataset is `~/datasets/BZNSYP`.
@@ -209,8 +209,8 @@ Static model can be downloaded here [fastspeech2_nosil_baker_static_0.4.zip](htt
Model | Step | eval/loss | eval/l1_loss | eval/duration_loss | eval/pitch_loss| eval/energy_loss
:-------------:| :------------:| :-----: | :-----: | :--------: |:--------:|:---------:
default| 2(gpu) x 76000|1.0991|0.59132|0.035815|0.31915|0.15287|
conformer| 2(gpu) x 76000|1.0675|0.56103|0.035869|0.31553|0.15509|
The FastSpeech2 checkpoint contains the files listed below.
```text
......
###########################################################
# FEATURE EXTRACTION SETTING #
###########################################################
fs: 24000 # sr
n_fft: 2048 # FFT size.
n_shift: 300 # Hop size.
win_length: 1200 # Window length.
# If set to null, it will be the same as n_fft.
window: "hann" # Window function.
# Only used for feats_type != raw
fmin: 80 # Minimum frequency of Mel basis.
fmax: 7600 # Maximum frequency of Mel basis.
n_mels: 80 # The number of mel basis.
# Only used for the model using pitch features (e.g. FastSpeech2)
f0min: 80 # Minimum f0 for pitch extraction.
f0max: 400 # Maximum f0 for pitch extraction.
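# Note: with fs=24000, n_shift=300 corresponds to a 12.5 ms hop
# and win_length=1200 to a 50 ms analysis window.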
###########################################################
# DATA SETTING #
###########################################################
batch_size: 64
num_workers: 4
###########################################################
# MODEL SETTING #
###########################################################
model:
adim: 384 # attention dimension
aheads: 2 # number of attention heads
elayers: 4 # number of encoder layers
eunits: 1536 # number of encoder ff units
dlayers: 4 # number of decoder layers
dunits: 1536 # number of decoder ff units
positionwise_layer_type: conv1d # type of position-wise layer
positionwise_conv_kernel_size: 3 # kernel size of position wise conv layer
duration_predictor_layers: 2 # number of layers of duration predictor
duration_predictor_chans: 256 # number of channels of duration predictor
duration_predictor_kernel_size: 3 # filter size of duration predictor
postnet_layers: 5 # number of layers of postnet
postnet_filts: 5 # filter size of conv layers in postnet
postnet_chans: 256 # number of channels of conv layers in postnet
encoder_normalize_before: True # whether to perform layer normalization before the input
decoder_normalize_before: True # whether to perform layer normalization before the input
reduction_factor: 1 # reduction factor
encoder_type: conformer # encoder type
decoder_type: conformer # decoder type
conformer_pos_enc_layer_type: rel_pos # conformer positional encoding type
conformer_self_attn_layer_type: rel_selfattn # conformer self-attention type
conformer_activation_type: swish # conformer activation type
use_macaron_style_in_conformer: true # whether to use macaron style in conformer
use_cnn_in_conformer: true # whether to use CNN in conformer
conformer_enc_kernel_size: 7 # kernel size in CNN module of conformer-based encoder
conformer_dec_kernel_size: 31 # kernel size in CNN module of conformer-based decoder
init_type: xavier_uniform # initialization type
transformer_enc_dropout_rate: 0.2 # dropout rate for transformer encoder layer
transformer_enc_positional_dropout_rate: 0.2 # dropout rate for transformer encoder positional encoding
transformer_enc_attn_dropout_rate: 0.2 # dropout rate for transformer encoder attention layer
transformer_dec_dropout_rate: 0.2 # dropout rate for transformer decoder layer
transformer_dec_positional_dropout_rate: 0.2 # dropout rate for transformer decoder positional encoding
transformer_dec_attn_dropout_rate: 0.2 # dropout rate for transformer decoder attention layer
pitch_predictor_layers: 5 # number of conv layers in pitch predictor
pitch_predictor_chans: 256 # number of channels of conv layers in pitch predictor
pitch_predictor_kernel_size: 5 # kernel size of conv layers in pitch predictor
pitch_predictor_dropout: 0.5 # dropout rate in pitch predictor
pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch
pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch
stop_gradient_from_pitch_predictor: true # whether to stop the gradient from pitch predictor to encoder
energy_predictor_layers: 2 # number of conv layers in energy predictor
energy_predictor_chans: 256 # number of channels of conv layers in energy predictor
energy_predictor_kernel_size: 3 # kernel size of conv layers in energy predictor
energy_predictor_dropout: 0.5 # dropout rate in energy predictor
energy_embed_kernel_size: 1 # kernel size of conv embedding layer for energy
energy_embed_dropout: 0.0 # dropout rate after conv embedding layer for energy
stop_gradient_from_energy_predictor: false # whether to stop the gradient from energy predictor to encoder
###########################################################
# UPDATER SETTING #
###########################################################
updater:
use_masking: True # whether to apply masking for padded part in loss calculation
###########################################################
# OPTIMIZER SETTING #
###########################################################
optimizer:
optim: adam # optimizer type
learning_rate: 0.001 # learning rate
###########################################################
# TRAINING SETTING #
###########################################################
max_epoch: 1000
num_snapshots: 5
###########################################################
# OTHER SETTING #
###########################################################
seed: 10086
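As a quick orientation, the sketch below loads a config like the one above with PyYAML and derives a few frame-level values from it. The path `conf/default.yaml` is an assumed placeholder, and PaddleSpeech's own training scripts may parse configs differently.
```python
# Minimal sketch: read the YAML config above; the path is an assumed placeholder.
import yaml

with open("conf/default.yaml") as f:
    cfg = yaml.safe_load(f)

hop_ms = 1000 * cfg["n_shift"] / cfg["fs"]     # 300 / 24000 -> 12.5 ms
win_ms = 1000 * cfg["win_length"] / cfg["fs"]  # 1200 / 24000 -> 50.0 ms
print(f"hop: {hop_ms} ms, window: {win_ms} ms, mel bins: {cfg['n_mels']}")
print("encoder type:", cfg["model"]["encoder_type"])  # conformer
```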
...@@ -45,7 +45,6 @@ model: ...@@ -45,7 +45,6 @@ model:
postnet_layers: 5 # number of layers of postnet postnet_layers: 5 # number of layers of postnet
postnet_filts: 5 # filter size of conv layers in postnet postnet_filts: 5 # filter size of conv layers in postnet
postnet_chans: 256 # number of channels of conv layers in postnet postnet_chans: 256 # number of channels of conv layers in postnet
use_masking: True # whether to apply masking for padded part in loss calculation
use_scaled_pos_enc: True # whether to use scaled positional encoding use_scaled_pos_enc: True # whether to use scaled positional encoding
encoder_normalize_before: True # whether to perform layer normalization before the input encoder_normalize_before: True # whether to perform layer normalization before the input
decoder_normalize_before: True # whether to perform layer normalization before the input decoder_normalize_before: True # whether to perform layer normalization before the input
......
...@@ -5,8 +5,8 @@ This example contains code used to train a [parallel wavegan](http://arxiv.org/a ...@@ -5,8 +5,8 @@ This example contains code used to train a [parallel wavegan](http://arxiv.org/a
Download CSMSC from the [official website](https://www.data-baker.com/data/index/source) and extract it to `~/datasets`. Then the dataset is in directory `~/datasets/BZNSYP`. Download CSMSC from the [official website](https://www.data-baker.com/data/index/source) and extract it to `~/datasets`. Then the dataset is in directory `~/datasets/BZNSYP`.
### Get MFA Result and Extract ### Get MFA Result and Extract
We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) results to cut the silence at the edges of audio. We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) results to cut the silence at the edges of audio.
You can download it from here: [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model by referring to the [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) in our repo. You can download it from here: [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model by referring to the [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) in our repo.
## Get Started ## Get Started
Assume the path to the dataset is `~/datasets/BZNSYP`. Assume the path to the dataset is `~/datasets/BZNSYP`.
......
...@@ -5,8 +5,8 @@ This example contains code used to train a [Multi Band MelGAN](https://arxiv.org ...@@ -5,8 +5,8 @@ This example contains code used to train a [Multi Band MelGAN](https://arxiv.org
Download CSMSC from the [official website](https://www.data-baker.com/data/index/source) and extract it to `~/datasets`. Then the dataset is in directory `~/datasets/BZNSYP`. Download CSMSC from the [official website](https://www.data-baker.com/data/index/source) and extract it to `~/datasets`. Then the dataset is in directory `~/datasets/BZNSYP`.
### Get MFA Result and Extract ### Get MFA Result and Extract
We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) results to cut the silence at the edges of audio. We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) results to cut the silence at the edges of audio.
You can download it from here: [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model by referring to the [use_mfa example](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/use_mfa) in our repo. You can download it from here: [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model by referring to the [mfa example](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/mfa) in our repo.
## Get Started ## Get Started
Assume the path to the dataset is `~/datasets/BZNSYP`. Assume the path to the dataset is `~/datasets/BZNSYP`.
......
...@@ -21,22 +21,17 @@ PaddleAudio provides pretrained CNN14, CNN10 and CNN6 models of PANNs for users ...@@ -21,22 +21,17 @@ PaddleAudio provides pretrained CNN14, CNN10 and CNN6 models of PANNs for users
### Model Training ### Model Training
Taking the environmental sound classification dataset `ESC50` as an example, run the commands below to finetune the model on the training set; both single-GPU and multi-GPU training on a single machine are supported. For how to launch multi-GPU training with `paddle.distributed.launch`, see [single-node multi-GPU training](https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/02_paddle2.0_develop/06_device_cn.html) Taking the environmental sound classification dataset `ESC50` as an example, run the command below to finetune the model on the training set; both single-GPU and multi-GPU training on a single machine are supported.
Single-GPU training: Start training:
```shell ```shell
$ python train.py --epochs 50 --batch_size 16 --checkpoint_dir ./checkpoint --save_freq 10 $ CUDA_VISIBLE_DEVICES=0 ./run.sh 1
``` ```
Multi-GPU training: Configurable parameters of the `paddlespeech/cls/exps/panns/train.py` script:
```shell
$ unset CUDA_VISIBLE_DEVICES
$ python -m paddle.distributed.launch --gpus "0,1" train.py --epochs 50 --batch_size 16 --num_worker 4 --checkpoint_dir ./checkpoint --save_freq 10
```
Configurable parameters: - `device`: specifies the device used for model prediction.
- `feat_backend`: the backend for feature extraction, one of `'numpy'` or `'paddle'`; defaults to `'numpy'`.
- `device`: the device to train on, `cpu` or `gpu`; defaults to `gpu`. When training on GPU, the `gpus` parameter specifies the GPU card ids.
- `epochs`: number of training epochs; defaults to 50. - `epochs`: number of training epochs; defaults to 50.
- `learning_rate`: learning rate for fine-tuning; defaults to 5e-5. - `learning_rate`: learning rate for fine-tuning; defaults to 5e-5.
- `batch_size`: batch size; adjust it to the available GPU memory, and lower it if memory runs out; defaults to 16. - `batch_size`: batch size; adjust it to the available GPU memory, and lower it if memory runs out; defaults to 16.
...@@ -47,9 +42,9 @@ $ python -m paddle.distributed.launch --gpus "0,1" train.py --epochs 50 --batch_ ...@@ -47,9 +42,9 @@ $ python -m paddle.distributed.launch --gpus "0,1" train.py --epochs 50 --batch_
The pretrained model used in the example code is `CNN14`. To switch to another pretrained model, you can do the following: The pretrained model used in the example code is `CNN14`. To switch to another pretrained model, you can do the following:
```python ```python
from model import SoundClassifier
from paddleaudio.datasets import ESC50 from paddleaudio.datasets import ESC50
from paddleaudio.models.panns import cnn14, cnn10, cnn6 from paddlespeech.cls.models import SoundClassifier
from paddlespeech.cls.models import cnn14, cnn10, cnn6
# CNN14 # CNN14
backbone = cnn14(pretrained=True, extract_embedding=True) backbone = cnn14(pretrained=True, extract_embedding=True)
...@@ -67,12 +62,14 @@ model = SoundClassifier(backbone, num_class=len(ESC50.label_list)) ...@@ -67,12 +62,14 @@ model = SoundClassifier(backbone, num_class=len(ESC50.label_list))
### Model Prediction ### Model Prediction
```shell ```shell
python -u predict.py --wav ./dog.wav --top_k 3 --checkpoint ./checkpoint/epoch_50/model.pdparams $ CUDA_VISIBLE_DEVICES=0 ./run.sh 2
``` ```
Configurable parameters: Configurable parameters of the `paddlespeech/cls/exps/panns/predict.py` script:
- `device`: the device to train on, `cpu` or `gpu`; defaults to `gpu`. When training on GPU, the `gpus` parameter specifies the GPU card ids.
- `device`: specifies the device used for model prediction.
- `wav`: specifies the audio file to predict on. - `wav`: specifies the audio file to predict on.
- `feat_backend`: the backend for feature extraction, one of `'numpy'` or `'paddle'`; defaults to `'numpy'`.
- `top_k`: show the scores of the top-k predicted labels; defaults to 1. - `top_k`: show the scores of the top-k predicted labels; defaults to 1.
- `checkpoint`: the checkpoint file of model parameters. - `checkpoint`: the checkpoint file of model parameters.
...@@ -91,10 +88,10 @@ Cat: 6.579841738130199e-06 ...@@ -91,10 +88,10 @@ Cat: 6.579841738130199e-06
After training, the saved dynamic-graph parameters can be exported as a static-graph model and parameters for static-graph deployment. After training, the saved dynamic-graph parameters can be exported as a static-graph model and parameters for static-graph deployment.
```shell ```shell
python -u export_model.py --checkpoint ./checkpoint/epoch_50/model.pdparams --output_dir ./export $ CUDA_VISIBLE_DEVICES=0 ./run.sh 3
``` ```
Configurable parameters: Configurable parameters of the `paddlespeech/cls/exps/panns/export_model.py` script:
- `checkpoint`: the checkpoint file of model parameters. - `checkpoint`: the checkpoint file of model parameters.
- `output_dir`: the directory where the exported static-graph model and parameter files are saved. - `output_dir`: the directory where the exported static-graph model and parameter files are saved.
...@@ -109,8 +106,13 @@ export ...@@ -109,8 +106,13 @@ export
#### 2. Model Deployment and Prediction #### 2. Model Deployment and Prediction
The `deploy/python/predict.py` script uses the APIs of the `paddle.inference` module and provides an example of Python-side deployment: The `paddlespeech/cls/exps/panns/deploy/predict.py` script uses the APIs of the `paddle.inference` module and provides an example of Python-side deployment (a minimal sketch is given after the parameter list below):
```sh ```shell
python deploy/python/predict.py --model_dir ./export --device gpu $ CUDA_VISIBLE_DEVICES=0 ./run.sh 4
``` ```
Main configurable parameters of the `paddlespeech/cls/exps/panns/deploy/predict.py` script:
- `device`: specifies the device used for model prediction.
- `model_dir`: the directory where the exported static-graph model and parameter files are saved.
- `wav`: specifies the audio file to predict on.
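For orientation, here is a minimal sketch of what such a `paddle.inference` deployment looks like. The file names `inference.pdmodel`/`inference.pdiparams` under `./export`, as well as the placeholder feature shape, are assumptions for illustration only; the actual logic lives in `paddlespeech/cls/exps/panns/deploy/predict.py`.
```python
# Minimal paddle.inference sketch; file names and feature shape are assumed.
import numpy as np
from paddle.inference import Config, create_predictor

# Assumed output names of the export stage under ./export.
config = Config("./export/inference.pdmodel", "./export/inference.pdiparams")
config.disable_gpu()  # or config.enable_use_gpu(100, 0) to run on GPU 0

predictor = create_predictor(config)

# Feed a dummy feature batch; a real deployment first extracts
# (log-)mel features from the input wav.
input_handle = predictor.get_input_handle(predictor.get_input_names()[0])
feats = np.random.randn(1, 64, 64).astype("float32")  # placeholder shape
input_handle.copy_from_cpu(feats)

predictor.run()

output_handle = predictor.get_output_handle(predictor.get_output_names()[0])
logits = output_handle.copy_to_cpu()
print(logits.shape)
```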
#!/bin/bash
ckpt_dir=$1
output_dir=$2
python3 ${BIN_DIR}/export_model.py \
--checkpoint ${ckpt_dir}/model.pdparams \
--output_dir ${output_dir}
#!/bin/bash
audio_file=$1
ckpt_dir=$2
feat_backend=$3
python3 ${BIN_DIR}/predict.py \
--wav ${audio_file} \
--feat_backend ${feat_backend} \
--top_k 10 \
--checkpoint ${ckpt_dir}/model.pdparams
#!/bin/bash
device=$1
model_dir=$2
audio_file=$3
python3 ${BIN_DIR}/deploy/predict.py \
--device ${device} \
--model_dir ${model_dir} \
--wav ${audio_file}
#!/bin/bash
ngpu=$1
feat_backend=$2
num_epochs=50
batch_size=16
ckpt_dir=./checkpoint
save_freq=10
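# If ngpu > 0, launch distributed training on the GPUs listed in
# CUDA_VISIBLE_DEVICES via paddle.distributed.launch; otherwise fall
# back to a plain single-process run.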
if [ ${ngpu} -gt 0 ]; then
python3 -m paddle.distributed.launch --gpus $CUDA_VISIBLE_DEVICES ${BIN_DIR}/train.py \
--epochs ${num_epochs} \
--feat_backend ${feat_backend} \
--batch_size ${batch_size} \
--checkpoint_dir ${ckpt_dir} \
--save_freq ${save_freq}
else
python3 ${BIN_DIR}/train.py \
--epochs ${num_epochs} \
--feat_backend ${feat_backend} \
--batch_size ${batch_size} \
--checkpoint_dir ${ckpt_dir} \
--save_freq ${save_freq}
fi
#!/bin/bash
export MAIN_ROOT=`realpath ${PWD}/../../../`
export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
export LC_ALL=C
export PYTHONDONTWRITEBYTECODE=1
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
MODEL=panns
export BIN_DIR=${MAIN_ROOT}/paddlespeech/cls/exps/${MODEL}
\ No newline at end of file
#!/bin/bash
set -e
source path.sh
ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
stage=$1
stop_stage=100
feat_backend=numpy
audio_file=~/cat.wav
ckpt_dir=./checkpoint/epoch_50
output_dir=./export
infer_device=cpu
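# Stage map (each stage exits after it finishes):
#   1: finetune training           -> ./local/train.sh
#   2: inference with a checkpoint -> ./local/infer.sh
#   3: export the static model     -> ./local/export.sh
#   4: static-graph inference      -> ./local/static_model_infer.sh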
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
./local/train.sh ${ngpu} ${feat_backend} || exit -1
exit 0
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
./local/infer.sh ${audio_file} ${ckpt_dir} ${feat_backend} || exit -1
exit 0
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
./local/export.sh ${ckpt_dir} ${output_dir} || exit -1
exit 0
fi
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
./local/static_model_infer.sh ${infer_device} ${output_dir} ${audio_file} || exit -1
exit 0
fi
...@@ -10,7 +10,7 @@ source ${MAIN_ROOT}/utils/parse_options.sh ...@@ -10,7 +10,7 @@ source ${MAIN_ROOT}/utils/parse_options.sh
mkdir -p data mkdir -p data
mkdir -p ${dict_dir} mkdir -p ${dict_dir}
TARGET_DIR=${MAIN_ROOT}/examples/dataset TARGET_DIR=${MAIN_ROOT}/dataset
mkdir -p ${TARGET_DIR} mkdir -p ${TARGET_DIR}
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
......
...@@ -14,7 +14,7 @@ jit_model_export_path=$3 ...@@ -14,7 +14,7 @@ jit_model_export_path=$3
model_type=$4 model_type=$4
python3 -u ${BIN_DIR}/export.py \ python3 -u ${BIN_DIR}/export.py \
--nproc ${ngpu} \ --ngpu ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--checkpoint_path ${ckpt_path_prefix} \ --checkpoint_path ${ckpt_path_prefix} \
--export_path ${jit_model_export_path} \ --export_path ${jit_model_export_path} \
......
...@@ -19,7 +19,7 @@ if [ $? -ne 0 ]; then ...@@ -19,7 +19,7 @@ if [ $? -ne 0 ]; then
fi fi
python3 -u ${BIN_DIR}/test.py \ python3 -u ${BIN_DIR}/test.py \
--nproc ${ngpu} \ --ngpu ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--result_file ${ckpt_prefix}.rsl \ --result_file ${ckpt_prefix}.rsl \
--checkpoint_path ${ckpt_prefix} \ --checkpoint_path ${ckpt_prefix} \
......
...@@ -13,6 +13,17 @@ ckpt_prefix=$2 ...@@ -13,6 +13,17 @@ ckpt_prefix=$2
model_type=$3 model_type=$3
audio_file=$4 audio_file=$4
mkdir -p data
wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/en/demo_002_en.wav -P data/
if [ $? -ne 0 ]; then
exit 1
fi
if [ ! -f ${audio_file} ]; then
echo "Plase input the right audio_file path"
exit 1
fi
# download language model # download language model
bash local/download_lm_en.sh bash local/download_lm_en.sh
if [ $? -ne 0 ]; then if [ $? -ne 0 ]; then
...@@ -20,7 +31,7 @@ if [ $? -ne 0 ]; then ...@@ -20,7 +31,7 @@ if [ $? -ne 0 ]; then
fi fi
python3 -u ${BIN_DIR}/test_hub.py \ python3 -u ${BIN_DIR}/test_hub.py \
--nproc ${ngpu} \ --ngpu ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--result_file ${ckpt_prefix}.rsl \ --result_file ${ckpt_prefix}.rsl \
--checkpoint_path ${ckpt_prefix} \ --checkpoint_path ${ckpt_prefix} \
......
...@@ -21,7 +21,7 @@ if [ ${seed} != 0 ]; then ...@@ -21,7 +21,7 @@ if [ ${seed} != 0 ]; then
fi fi
python3 -u ${BIN_DIR}/train.py \ python3 -u ${BIN_DIR}/train.py \
--nproc ${ngpu} \ --ngpu ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--output exp/${ckpt_name} \ --output exp/${ckpt_name} \
--model_type ${model_type} \ --model_type ${model_type} \
......
...@@ -8,13 +8,14 @@ stop_stage=100 ...@@ -8,13 +8,14 @@ stop_stage=100
conf_path=conf/deepspeech2.yaml conf_path=conf/deepspeech2.yaml
avg_num=30 avg_num=30
model_type=offline model_type=offline
audio_file=data/demo_002_en.wav
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1; source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
avg_ckpt=avg_${avg_num} avg_ckpt=avg_${avg_num}
ckpt=$(basename ${conf_path} | awk -F'.' '{print $1}') ckpt=$(basename ${conf_path} | awk -F'.' '{print $1}')
echo "checkpoint name ${ckpt}" echo "checkpoint name ${ckpt}"
audio_file="data/tmp.flac"
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# prepare data # prepare data
bash ./local/data.sh || exit -1 bash ./local/data.sh || exit -1
......
...@@ -21,7 +21,7 @@ ...@@ -21,7 +21,7 @@
## Transformer ## Transformer
| Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER | | Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER |
| --- | --- | --- | --- | --- | --- | --- | --- | | --- | --- | --- | --- | --- | --- | --- | --- |
| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | attention | 6.733129533131917 | 0.047874 | | transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | attention | 6.725063021977743 | 0.047417 |
| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | ctc_greedy_search | 6.733129533131917 | 0.053922 | | transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | ctc_greedy_search | 6.725063021977743 | 0.053922 |
| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | ctc_prefix_beam_search | 6.733129533131917 | 0.053427 | | transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | ctc_prefix_beam_search | 6.725063021977743 | 0.053180 |
| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | attention_rescoring | 6.733129533131917 | 0.041369 | | transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | attention_rescoring | 6.725063021977743 | 0.041026 |
...@@ -18,7 +18,7 @@ mkdir -p ${output_dir} ...@@ -18,7 +18,7 @@ mkdir -p ${output_dir}
# align dump in `result_file` # align dump in `result_file`
# .tier, .TextGrid dump in `dir of result_file` # .tier, .TextGrid dump in `dir of result_file`
python3 -u ${BIN_DIR}/alignment.py \ python3 -u ${BIN_DIR}/alignment.py \
--nproc ${ngpu} \ --ngpu ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--result_file ${output_dir}/${type}.align \ --result_file ${output_dir}/${type}.align \
--checkpoint_path ${ckpt_prefix} \ --checkpoint_path ${ckpt_prefix} \
......
...@@ -19,7 +19,7 @@ source ${MAIN_ROOT}/utils/parse_options.sh ...@@ -19,7 +19,7 @@ source ${MAIN_ROOT}/utils/parse_options.sh
mkdir -p data mkdir -p data
mkdir -p ${dict_dir} mkdir -p ${dict_dir}
TARGET_DIR=${MAIN_ROOT}/examples/dataset TARGET_DIR=${MAIN_ROOT}/dataset
mkdir -p ${TARGET_DIR} mkdir -p ${TARGET_DIR}
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
......
...@@ -13,7 +13,7 @@ ckpt_path_prefix=$2 ...@@ -13,7 +13,7 @@ ckpt_path_prefix=$2
jit_model_export_path=$3 jit_model_export_path=$3
python3 -u ${BIN_DIR}/export.py \ python3 -u ${BIN_DIR}/export.py \
--nproc ${ngpu} \ --ngpu ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--checkpoint_path ${ckpt_path_prefix} \ --checkpoint_path ${ckpt_path_prefix} \
--export_path ${jit_model_export_path} --export_path ${jit_model_export_path}
......
...@@ -50,7 +50,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then ...@@ -50,7 +50,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
batch_size=64 batch_size=64
fi fi
python3 -u ${BIN_DIR}/test.py \ python3 -u ${BIN_DIR}/test.py \
--nproc ${ngpu} \ --ngpu ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--result_file ${ckpt_prefix}.${type}.rsl \ --result_file ${ckpt_prefix}.${type}.rsl \
--checkpoint_path ${ckpt_prefix} \ --checkpoint_path ${ckpt_prefix} \
...@@ -74,7 +74,7 @@ for type in ctc_greedy_search; do ...@@ -74,7 +74,7 @@ for type in ctc_greedy_search; do
batch_size=64 batch_size=64
fi fi
python3 -u ${BIN_DIR}/test.py \ python3 -u ${BIN_DIR}/test.py \
--nproc ${ngpu} \ --ngpu ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--result_file ${ckpt_prefix}.${type}.rsl \ --result_file ${ckpt_prefix}.${type}.rsl \
--checkpoint_path ${ckpt_prefix} \ --checkpoint_path ${ckpt_prefix} \
...@@ -94,7 +94,7 @@ for type in ctc_prefix_beam_search attention_rescoring; do ...@@ -94,7 +94,7 @@ for type in ctc_prefix_beam_search attention_rescoring; do
echo "decoding ${type}" echo "decoding ${type}"
batch_size=1 batch_size=1
python3 -u ${BIN_DIR}/test.py \ python3 -u ${BIN_DIR}/test.py \
--nproc ${ngpu} \ --ngpu ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--result_file ${ckpt_prefix}.${type}.rsl \ --result_file ${ckpt_prefix}.${type}.rsl \
--checkpoint_path ${ckpt_prefix} \ --checkpoint_path ${ckpt_prefix} \
......
...@@ -12,6 +12,17 @@ config_path=$1 ...@@ -12,6 +12,17 @@ config_path=$1
ckpt_prefix=$2 ckpt_prefix=$2
audio_file=$3 audio_file=$3
mkdir -p data
wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/en/demo_002_en.wav -P data/
if [ $? -ne 0 ]; then
exit 1
fi
if [ ! -f ${audio_file} ]; then
echo "Plase input the right audio_file path"
exit 1
fi
# bpemode (unigram or bpe) # bpemode (unigram or bpe)
nbpe=5000 nbpe=5000
bpemode=unigram bpemode=unigram
...@@ -36,7 +47,7 @@ for type in attention_rescoring; do ...@@ -36,7 +47,7 @@ for type in attention_rescoring; do
output_dir=${ckpt_prefix} output_dir=${ckpt_prefix}
mkdir -p ${output_dir} mkdir -p ${output_dir}
python3 -u ${BIN_DIR}/test_hub.py \ python3 -u ${BIN_DIR}/test_hub.py \
--nproc ${ngpu} \ --ngpu ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--result_file ${output_dir}/${type}.rsl \ --result_file ${output_dir}/${type}.rsl \
--checkpoint_path ${ckpt_prefix} \ --checkpoint_path ${ckpt_prefix} \
......
...@@ -23,7 +23,7 @@ fi ...@@ -23,7 +23,7 @@ fi
# export FLAGS_conv_workspace_size_limit=4000 # export FLAGS_conv_workspace_size_limit=4000
python3 -u ${BIN_DIR}/train.py \ python3 -u ${BIN_DIR}/train.py \
--nproc ${ngpu} \ --ngpu ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--output exp/${ckpt_name} \ --output exp/${ckpt_name} \
--seed ${seed} --seed ${seed}
......
...@@ -9,6 +9,7 @@ stage=0 ...@@ -9,6 +9,7 @@ stage=0
stop_stage=100 stop_stage=100
conf_path=conf/transformer.yaml conf_path=conf/transformer.yaml
avg_num=30 avg_num=30
audio_file=data/demo_002_en.wav
. ${MAIN_ROOT}/utils/parse_options.sh || exit 1; . ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
...@@ -16,8 +17,6 @@ avg_ckpt=avg_${avg_num} ...@@ -16,8 +17,6 @@ avg_ckpt=avg_${avg_num}
ckpt=$(basename ${conf_path} | awk -F'.' '{print $1}') ckpt=$(basename ${conf_path} | awk -F'.' '{print $1}')
echo "checkpoint name ${ckpt}" echo "checkpoint name ${ckpt}"
audio_file="data/tmp.flac"
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# prepare data # prepare data
bash ./local/data.sh || exit -1 bash ./local/data.sh || exit -1
......
...@@ -22,7 +22,7 @@ python3 -u ${BIN_DIR}/test.py \ ...@@ -22,7 +22,7 @@ python3 -u ${BIN_DIR}/test.py \
--model-name 'u2_kaldi' \ --model-name 'u2_kaldi' \
--run-mode 'align' \ --run-mode 'align' \
--dict-path ${dict_path} \ --dict-path ${dict_path} \
--nproc ${ngpu} \ --ngpu ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--result-file ${output_dir}/${type}.align \ --result-file ${output_dir}/${type}.align \
--checkpoint_path ${ckpt_prefix} \ --checkpoint_path ${ckpt_prefix} \
......
...@@ -15,7 +15,7 @@ do_delta=false ...@@ -15,7 +15,7 @@ do_delta=false
# Set this to somewhere where you want to put your data, or where # Set this to somewhere where you want to put your data, or where
# someone else has already put it. You'll want to change this # someone else has already put it. You'll want to change this
# if you're not on the CLSP grid. # if you're not on the CLSP grid.
datadir=${MAIN_ROOT}/examples/dataset/ datadir=${MAIN_ROOT}/dataset/
# bpemode (unigram or bpe) # bpemode (unigram or bpe)
nbpe=5000 nbpe=5000
...@@ -36,7 +36,7 @@ recog_set="test_clean test_other dev_clean dev_other" ...@@ -36,7 +36,7 @@ recog_set="test_clean test_other dev_clean dev_other"
mkdir -p data mkdir -p data
TARGET_DIR=${MAIN_ROOT}/examples/dataset TARGET_DIR=${MAIN_ROOT}/dataset
mkdir -p ${TARGET_DIR} mkdir -p ${TARGET_DIR}
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
# download data, generate manifests # download data, generate manifests
......
...@@ -15,7 +15,7 @@ jit_model_export_path=$3 ...@@ -15,7 +15,7 @@ jit_model_export_path=$3
python3 -u ${BIN_DIR}/test.py \ python3 -u ${BIN_DIR}/test.py \
--model-name 'u2_kaldi' \ --model-name 'u2_kaldi' \
--run-mode 'export' \ --run-mode 'export' \
--nproc ${ngpu} \ --ngpu ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--checkpoint_path ${ckpt_path_prefix} \ --checkpoint_path ${ckpt_path_prefix} \
--export_path ${jit_model_export_path} --export_path ${jit_model_export_path}
......
...@@ -76,7 +76,7 @@ for dmethd in attention ctc_greedy_search ctc_prefix_beam_search attention_resco ...@@ -76,7 +76,7 @@ for dmethd in attention ctc_greedy_search ctc_prefix_beam_search attention_resco
python3 -u ${BIN_DIR}/test.py \ python3 -u ${BIN_DIR}/test.py \
--model-name u2_kaldi \ --model-name u2_kaldi \
--run-mode test \ --run-mode test \
--nproc ${ngpu} \ --ngpu ${ngpu} \
--dict-path ${dict} \ --dict-path ${dict} \
--config ${config_path} \ --config ${config_path} \
--checkpoint_path ${ckpt_prefix} \ --checkpoint_path ${ckpt_prefix} \
......
...@@ -21,7 +21,7 @@ fi ...@@ -21,7 +21,7 @@ fi
python3 -u ${BIN_DIR}/train.py \ python3 -u ${BIN_DIR}/train.py \
--model-name u2_kaldi \ --model-name u2_kaldi \
--nproc ${ngpu} \ --ngpu ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--output exp/${ckpt_name} \ --output exp/${ckpt_name} \
--seed ${seed} --seed ${seed}
......
...@@ -7,7 +7,7 @@ Download LJSpeech-1.1 from the [official website](https://keithito.com/LJ-Speech ...@@ -7,7 +7,7 @@ Download LJSpeech-1.1 from the [official website](https://keithito.com/LJ-Speech
### Get MFA Result and Extract ### Get MFA Result and Extract
We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for fastspeech2. We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for fastspeech2.
You can download it from here: [ljspeech_alignment.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/LJSpeech-1.1/ljspeech_alignment.tar.gz), or train your own MFA model by referring to the [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) in our repo. You can download it from here: [ljspeech_alignment.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/LJSpeech-1.1/ljspeech_alignment.tar.gz), or train your own MFA model by referring to the [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) in our repo.
## Get Started ## Get Started
Assume the path to the dataset is `~/datasets/LJSpeech-1.1`. Assume the path to the dataset is `~/datasets/LJSpeech-1.1`.
......
...@@ -45,7 +45,6 @@ model: ...@@ -45,7 +45,6 @@ model:
postnet_layers: 5 # number of layers of postnet postnet_layers: 5 # number of layers of postnet
postnet_filts: 5 # filter size of conv layers in postnet postnet_filts: 5 # filter size of conv layers in postnet
postnet_chans: 256 # number of channels of conv layers in postnet postnet_chans: 256 # number of channels of conv layers in postnet
use_masking: True # whether to apply masking for padded part in loss calculation
use_scaled_pos_enc: True # whether to use scaled positional encoding use_scaled_pos_enc: True # whether to use scaled positional encoding
encoder_normalize_before: True # whether to perform layer normalization before the input encoder_normalize_before: True # whether to perform layer normalization before the input
decoder_normalize_before: True # whether to perform layer normalization before the input decoder_normalize_before: True # whether to perform layer normalization before the input
......
...@@ -5,7 +5,7 @@ This example contains code used to train a [parallel wavegan](http://arxiv.org/a ...@@ -5,7 +5,7 @@ This example contains code used to train a [parallel wavegan](http://arxiv.org/a
Download LJSpeech-1.1 from the [official website](https://keithito.com/LJ-Speech-Dataset/). Download LJSpeech-1.1 from the [official website](https://keithito.com/LJ-Speech-Dataset/).
### Get MFA Result and Extract ### Get MFA Result and Extract
We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) results to cut the silence at the edges of audio. We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) results to cut the silence at the edges of audio.
You can download it from here: [ljspeech_alignment.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/LJSpeech-1.1/ljspeech_alignment.tar.gz), or train your own MFA model by referring to the [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) in our repo. You can download it from here: [ljspeech_alignment.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/LJSpeech-1.1/ljspeech_alignment.tar.gz), or train your own MFA model by referring to the [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) in our repo.
## Get Started ## Get Started
Assume the path to the dataset is `~/datasets/LJSpeech-1.1`. Assume the path to the dataset is `~/datasets/LJSpeech-1.1`.
......
...@@ -12,7 +12,7 @@ stop_stage=100 ...@@ -12,7 +12,7 @@ stop_stage=100
source ${MAIN_ROOT}/utils/parse_options.sh source ${MAIN_ROOT}/utils/parse_options.sh
mkdir -p data mkdir -p data
TARGET_DIR=${MAIN_ROOT}/examples/dataset TARGET_DIR=${MAIN_ROOT}/dataset
mkdir -p ${TARGET_DIR} mkdir -p ${TARGET_DIR}
bash local/download_model.sh ${ckpt_dir} bash local/download_model.sh ${ckpt_dir}
......
...@@ -19,7 +19,7 @@ if [ $? -ne 0 ]; then ...@@ -19,7 +19,7 @@ if [ $? -ne 0 ]; then
fi fi
python3 -u ${BIN_DIR}/test.py \ python3 -u ${BIN_DIR}/test.py \
--nproc ${ngpu} \ --ngpu ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--result_file ${ckpt_prefix}.rsl \ --result_file ${ckpt_prefix}.rsl \
--checkpoint_path ${ckpt_prefix} \ --checkpoint_path ${ckpt_prefix} \
......
...@@ -13,7 +13,7 @@ unit_type=char ...@@ -13,7 +13,7 @@ unit_type=char
source ${MAIN_ROOT}/utils/parse_options.sh source ${MAIN_ROOT}/utils/parse_options.sh
mkdir -p data mkdir -p data
TARGET_DIR=${MAIN_ROOT}/examples/dataset TARGET_DIR=${MAIN_ROOT}/dataset
mkdir -p ${TARGET_DIR} mkdir -p ${TARGET_DIR}
......
...@@ -19,7 +19,7 @@ if [ $? -ne 0 ]; then ...@@ -19,7 +19,7 @@ if [ $? -ne 0 ]; then
fi fi
python3 -u ${BIN_DIR}/test.py \ python3 -u ${BIN_DIR}/test.py \
--nproc ${ngpu} \ --ngpu ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--result_file ${ckpt_prefix}.rsl \ --result_file ${ckpt_prefix}.rsl \
--checkpoint_path ${ckpt_prefix} \ --checkpoint_path ${ckpt_prefix} \
......
...@@ -14,7 +14,7 @@ unit_type=char ...@@ -14,7 +14,7 @@ unit_type=char
source ${MAIN_ROOT}/utils/parse_options.sh source ${MAIN_ROOT}/utils/parse_options.sh
mkdir -p data mkdir -p data
TARGET_DIR=${MAIN_ROOT}/examples/dataset TARGET_DIR=${MAIN_ROOT}/dataset
mkdir -p ${TARGET_DIR} mkdir -p ${TARGET_DIR}
bash local/download_model.sh ${ckpt_dir} bash local/download_model.sh ${ckpt_dir}
......
...@@ -19,7 +19,7 @@ if [ $? -ne 0 ]; then ...@@ -19,7 +19,7 @@ if [ $? -ne 0 ]; then
fi fi
python3 -u ${BIN_DIR}/test.py \ python3 -u ${BIN_DIR}/test.py \
--nproc ${ngpu} \ --ngpu ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--result_file ${ckpt_prefix}.rsl \ --result_file ${ckpt_prefix}.rsl \
--checkpoint_path ${ckpt_prefix} \ --checkpoint_path ${ckpt_prefix} \
......
...@@ -403,7 +403,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): ...@@ -403,7 +403,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
def setup(self): def setup(self):
"""Setup the experiment. """Setup the experiment.
""" """
paddle.set_device('gpu' if self.args.nprocs > 0 else 'cpu') paddle.set_device('gpu' if self.args.ngpu > 0 else 'cpu')
self.setup_output_dir() self.setup_output_dir()
self.setup_checkpointer() self.setup_checkpointer()
......
...@@ -16,7 +16,7 @@ data_dir=./TED-En-Zh ...@@ -16,7 +16,7 @@ data_dir=./TED-En-Zh
. ${MAIN_ROOT}/utils/parse_options.sh || exit -1; . ${MAIN_ROOT}/utils/parse_options.sh || exit -1;
TARGET_DIR=${MAIN_ROOT}/examples/dataset TARGET_DIR=${MAIN_ROOT}/dataset
mkdir -p ${TARGET_DIR} mkdir -p ${TARGET_DIR}
mkdir -p data mkdir -p data
mkdir -p ${dict_dir} mkdir -p ${dict_dir}
......
...@@ -15,7 +15,7 @@ for type in fullsentence; do ...@@ -15,7 +15,7 @@ for type in fullsentence; do
echo "decoding ${type}" echo "decoding ${type}"
batch_size=32 batch_size=32
python3 -u ${BIN_DIR}/test.py \ python3 -u ${BIN_DIR}/test.py \
--nproc ${ngpu} \ --ngpu ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--result_file ${ckpt_prefix}.${type}.rsl \ --result_file ${ckpt_prefix}.${type}.rsl \
--checkpoint_path ${ckpt_prefix} \ --checkpoint_path ${ckpt_prefix} \
......
...@@ -20,7 +20,7 @@ if [ ${seed} != 0 ]; then ...@@ -20,7 +20,7 @@ if [ ${seed} != 0 ]; then
fi fi
python3 -u ${BIN_DIR}/train.py \ python3 -u ${BIN_DIR}/train.py \
--nproc ${ngpu} \ --ngpu ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--output exp/${ckpt_name} \ --output exp/${ckpt_name} \
--seed ${seed} --seed ${seed}
......
...@@ -15,7 +15,7 @@ data_dir=./TED_EnZh ...@@ -15,7 +15,7 @@ data_dir=./TED_EnZh
source ${MAIN_ROOT}/utils/parse_options.sh source ${MAIN_ROOT}/utils/parse_options.sh
TARGET_DIR=${MAIN_ROOT}/examples/dataset TARGET_DIR=${MAIN_ROOT}/dataset
mkdir -p ${TARGET_DIR} mkdir -p ${TARGET_DIR}
mkdir -p data mkdir -p data
mkdir -p ${dict_dir} mkdir -p ${dict_dir}
......
...@@ -15,7 +15,7 @@ for type in fullsentence; do ...@@ -15,7 +15,7 @@ for type in fullsentence; do
echo "decoding ${type}" echo "decoding ${type}"
batch_size=32 batch_size=32
python3 -u ${BIN_DIR}/test.py \ python3 -u ${BIN_DIR}/test.py \
--nproc ${ngpu} \ --ngpu ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--result_file ${ckpt_prefix}.${type}.rsl \ --result_file ${ckpt_prefix}.${type}.rsl \
--checkpoint_path ${ckpt_prefix} \ --checkpoint_path ${ckpt_prefix} \
......
...@@ -21,7 +21,7 @@ if [ ${seed} != 0 ]; then ...@@ -21,7 +21,7 @@ if [ ${seed} != 0 ]; then
fi fi
python3 -u ${BIN_DIR}/train.py \ python3 -u ${BIN_DIR}/train.py \
--nproc ${ngpu} \ --ngpu ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--output exp/${ckpt_name} \ --output exp/${ckpt_name} \
--checkpoint_path ${ckpt_path} \ --checkpoint_path ${ckpt_path} \
......
...@@ -6,7 +6,7 @@ stop_stage=100 ...@@ -6,7 +6,7 @@ stop_stage=100
source ${MAIN_ROOT}/utils/parse_options.sh source ${MAIN_ROOT}/utils/parse_options.sh
mkdir -p data mkdir -p data
TARGET_DIR=${MAIN_ROOT}/examples/dataset TARGET_DIR=${MAIN_ROOT}/dataset
mkdir -p ${TARGET_DIR} mkdir -p ${TARGET_DIR}
LEXICON_NAME=$1 LEXICON_NAME=$1
......
...@@ -18,7 +18,7 @@ mkdir -p ${output_dir} ...@@ -18,7 +18,7 @@ mkdir -p ${output_dir}
# align dump in `result_file` # align dump in `result_file`
# .tier, .TextGrid dump in `dir of result_file` # .tier, .TextGrid dump in `dir of result_file`
python3 -u ${BIN_DIR}/alignment.py \ python3 -u ${BIN_DIR}/alignment.py \
--nproc ${ngpu} \ --ngpu ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--result_file ${output_dir}/${type}.align \ --result_file ${output_dir}/${type}.align \
--checkpoint_path ${ckpt_prefix} \ --checkpoint_path ${ckpt_prefix} \
......
...@@ -12,7 +12,7 @@ TIMIT_path= ...@@ -12,7 +12,7 @@ TIMIT_path=
mkdir -p data mkdir -p data
mkdir -p ${dict_dir} mkdir -p ${dict_dir}
TARGET_DIR=${MAIN_ROOT}/examples/dataset TARGET_DIR=${MAIN_ROOT}/dataset
mkdir -p ${TARGET_DIR} mkdir -p ${TARGET_DIR}
......
...@@ -13,7 +13,7 @@ ckpt_path_prefix=$2 ...@@ -13,7 +13,7 @@ ckpt_path_prefix=$2
jit_model_export_path=$3 jit_model_export_path=$3
python3 -u ${BIN_DIR}/export.py \ python3 -u ${BIN_DIR}/export.py \
--nproc ${ngpu} \ --ngpu ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--checkpoint_path ${ckpt_path_prefix} \ --checkpoint_path ${ckpt_path_prefix} \
--export_path ${jit_model_export_path} --export_path ${jit_model_export_path}
......
...@@ -41,7 +41,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then ...@@ -41,7 +41,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
batch_size=64 batch_size=64
fi fi
python3 -u ${BIN_DIR}/test.py \ python3 -u ${BIN_DIR}/test.py \
--nproc ${ngpu} \ --ngpu ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--result_file ${ckpt_prefix}.${type}.rsl \ --result_file ${ckpt_prefix}.${type}.rsl \
--checkpoint_path ${ckpt_prefix} \ --checkpoint_path ${ckpt_prefix} \
...@@ -61,7 +61,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then ...@@ -61,7 +61,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
echo "decoding ${type}" echo "decoding ${type}"
batch_size=1 batch_size=1
python3 -u ${BIN_DIR}/test.py \ python3 -u ${BIN_DIR}/test.py \
--nproc ${ngpu} \ --ngpu ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--result_file ${ckpt_prefix}.${type}.rsl \ --result_file ${ckpt_prefix}.${type}.rsl \
--checkpoint_path ${ckpt_prefix} \ --checkpoint_path ${ckpt_prefix} \
...@@ -80,7 +80,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then ...@@ -80,7 +80,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
echo "decoding ${type}" echo "decoding ${type}"
batch_size=1 batch_size=1
python3 -u ${BIN_DIR}/test.py \ python3 -u ${BIN_DIR}/test.py \
--nproc ${ngpu} \ --ngpu ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--result_file ${ckpt_prefix}.${type}.rsl \ --result_file ${ckpt_prefix}.${type}.rsl \
--checkpoint_path ${ckpt_prefix} \ --checkpoint_path ${ckpt_prefix} \
......
...@@ -20,7 +20,7 @@ if [ ${seed} != 0 ]; then ...@@ -20,7 +20,7 @@ if [ ${seed} != 0 ]; then
fi fi
python3 -u ${BIN_DIR}/train.py \ python3 -u ${BIN_DIR}/train.py \
--nproc ${ngpu} \ --ngpu ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--output exp/${ckpt_name} \ --output exp/${ckpt_name} \
--seed ${seed} --seed ${seed}
......
...@@ -10,7 +10,7 @@ dict_dir=data/lang_char ...@@ -10,7 +10,7 @@ dict_dir=data/lang_char
mkdir -p data mkdir -p data
mkdir -p ${dict_dir} mkdir -p ${dict_dir}
TARGET_DIR=${MAIN_ROOT}/examples/dataset TARGET_DIR=${MAIN_ROOT}/dataset
mkdir -p ${TARGET_DIR} mkdir -p ${TARGET_DIR}
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
......
...@@ -14,7 +14,7 @@ jit_model_export_path=$3 ...@@ -14,7 +14,7 @@ jit_model_export_path=$3
model_type=$4 model_type=$4
python3 -u ${BIN_DIR}/export.py \ python3 -u ${BIN_DIR}/export.py \
--nproc ${ngpu} \ --ngpu ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--checkpoint_path ${ckpt_path_prefix} \ --checkpoint_path ${ckpt_path_prefix} \
--export_path ${jit_model_export_path} \ --export_path ${jit_model_export_path} \
......
...@@ -19,7 +19,7 @@ if [ $? -ne 0 ]; then ...@@ -19,7 +19,7 @@ if [ $? -ne 0 ]; then
fi fi
python3 -u ${BIN_DIR}/test.py \ python3 -u ${BIN_DIR}/test.py \
--nproc ${ngpu} \ --ngpu ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--result_file ${ckpt_prefix}.rsl \ --result_file ${ckpt_prefix}.rsl \
--checkpoint_path ${ckpt_prefix} \ --checkpoint_path ${ckpt_prefix} \
......
...@@ -27,7 +27,7 @@ model_type=$3 ...@@ -27,7 +27,7 @@ model_type=$3
mkdir -p exp mkdir -p exp
python3 -u ${BIN_DIR}/train.py \ python3 -u ${BIN_DIR}/train.py \
--nproc ${ngpu} \ --ngpu ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--output exp/${ckpt_name} \ --output exp/${ckpt_name} \
--model_type ${model_type} \ --model_type ${model_type} \
......
...@@ -18,7 +18,7 @@ mkdir -p ${output_dir} ...@@ -18,7 +18,7 @@ mkdir -p ${output_dir}
# align dump in `result_file` # align dump in `result_file`
# .tier, .TextGrid dump in `dir of result_file` # .tier, .TextGrid dump in `dir of result_file`
python3 -u ${BIN_DIR}/alignment.py \ python3 -u ${BIN_DIR}/alignment.py \
--nproc ${ngpu} \ --ngpu ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--result_file ${output_dir}/${type}.align \ --result_file ${output_dir}/${type}.align \
--checkpoint_path ${ckpt_prefix} \ --checkpoint_path ${ckpt_prefix} \
......
...@@ -14,7 +14,7 @@ bpeprefix="${dict_dir}/bpe_${bpemode}_${nbpe}" ...@@ -14,7 +14,7 @@ bpeprefix="${dict_dir}/bpe_${bpemode}_${nbpe}"
mkdir -p data mkdir -p data
mkdir -p ${dict_dir} mkdir -p ${dict_dir}
TARGET_DIR=${MAIN_ROOT}/examples/dataset TARGET_DIR=${MAIN_ROOT}/dataset
mkdir -p ${TARGET_DIR} mkdir -p ${TARGET_DIR}
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
......
...@@ -13,7 +13,7 @@ ckpt_path_prefix=$2 ...@@ -13,7 +13,7 @@ ckpt_path_prefix=$2
jit_model_export_path=$3 jit_model_export_path=$3
python3 -u ${BIN_DIR}/export.py \ python3 -u ${BIN_DIR}/export.py \
--nproc ${ngpu} \ --ngpu ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--checkpoint_path ${ckpt_path_prefix} \ --checkpoint_path ${ckpt_path_prefix} \
--export_path ${jit_model_export_path} --export_path ${jit_model_export_path}
......
...@@ -31,7 +31,7 @@ for type in attention ctc_greedy_search; do ...@@ -31,7 +31,7 @@ for type in attention ctc_greedy_search; do
batch_size=64 batch_size=64
fi fi
python3 -u ${BIN_DIR}/test.py \ python3 -u ${BIN_DIR}/test.py \
--nproc ${ngpu} \ --ngpu ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--result_file ${ckpt_prefix}.${type}.rsl \ --result_file ${ckpt_prefix}.${type}.rsl \
--checkpoint_path ${ckpt_prefix} \ --checkpoint_path ${ckpt_prefix} \
...@@ -48,7 +48,7 @@ for type in ctc_prefix_beam_search attention_rescoring; do ...@@ -48,7 +48,7 @@ for type in ctc_prefix_beam_search attention_rescoring; do
echo "decoding ${type}" echo "decoding ${type}"
batch_size=1 batch_size=1
python3 -u ${BIN_DIR}/test.py \ python3 -u ${BIN_DIR}/test.py \
--nproc ${ngpu} \ --ngpu ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--result_file ${ckpt_prefix}.${type}.rsl \ --result_file ${ckpt_prefix}.${type}.rsl \
--checkpoint_path ${ckpt_prefix} \ --checkpoint_path ${ckpt_prefix} \
......
...@@ -29,7 +29,7 @@ mkdir -p exp ...@@ -29,7 +29,7 @@ mkdir -p exp
python3 -u ${BIN_DIR}/train.py \ python3 -u ${BIN_DIR}/train.py \
--seed ${seed} \ --seed ${seed} \
--nproc ${ngpu} \ --ngpu ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--output exp/${ckpt_name} \ --output exp/${ckpt_name} \
--profiler-options "${profiler_options}" \ --profiler-options "${profiler_options}" \
......
...@@ -7,8 +7,8 @@ Download VCTK-0.92 from the [official website](https://datashare.ed.ac.uk/handle ...@@ -7,8 +7,8 @@ Download VCTK-0.92 from the [official website](https://datashare.ed.ac.uk/handle
### Get MFA Result and Extract ### Get MFA Result and Extract
We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for fastspeech2. We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for fastspeech2.
You can download it from here: [vctk_alignment.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/VCTK-Corpus-0.92/vctk_alignment.tar.gz), or train your own MFA model by referring to the [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) in our repo. You can download it from here: [vctk_alignment.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/VCTK-Corpus-0.92/vctk_alignment.tar.gz), or train your own MFA model by referring to the [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) in our repo.
PS: we remove three speakers in VCTK-0.92 (see [reorganize_vctk.py](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/examples/other/use_mfa/local/reorganize_vctk.py)): PS: we remove three speakers in VCTK-0.92 (see [reorganize_vctk.py](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/examples/other/mfa/local/reorganize_vctk.py)):
1. `p315`, because there is no txt for it. 1. `p315`, because there is no txt for it.
2. `p280` and `p362`, because there are no *_mic2.flac files (which are better than *_mic1.flac) for them. 2. `p280` and `p362`, because there are no *_mic2.flac files (which are better than *_mic1.flac) for them.
......
...@@ -45,7 +45,6 @@ model: ...@@ -45,7 +45,6 @@ model:
postnet_layers: 5 # number of layers of postnet postnet_layers: 5 # number of layers of postnet
postnet_filts: 5 # filter size of conv layers in postnet postnet_filts: 5 # filter size of conv layers in postnet
postnet_chans: 256 # number of channels of conv layers in postnet postnet_chans: 256 # number of channels of conv layers in postnet
use_masking: True # whether to apply masking for padded part in loss calculation
use_scaled_pos_enc: True # whether to use scaled positional encoding use_scaled_pos_enc: True # whether to use scaled positional encoding
encoder_normalize_before: True # whether to perform layer normalization before the input encoder_normalize_before: True # whether to perform layer normalization before the input
decoder_normalize_before: True # whether to perform layer normalization before the input decoder_normalize_before: True # whether to perform layer normalization before the input
......
...@@ -6,9 +6,9 @@ This example contains code used to train a [parallel wavegan](http://arxiv.org/a ...@@ -6,9 +6,9 @@ This example contains code used to train a [parallel wavegan](http://arxiv.org/a
Download VCTK-0.92 from the [official website](https://datashare.ed.ac.uk/handle/10283/3443) and extract it to `~/datasets`. Then the dataset is in directory `~/datasets/VCTK-Corpus-0.92`. Download VCTK-0.92 from the [official website](https://datashare.ed.ac.uk/handle/10283/3443) and extract it to `~/datasets`. Then the dataset is in directory `~/datasets/VCTK-Corpus-0.92`.
### Get MFA Result and Extract ### Get MFA Result and Extract
We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) results to cut the silence at the edges of audio. We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) results to cut the silence at the edges of audio.
You can download it from here: [vctk_alignment.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/VCTK-Corpus-0.92/vctk_alignment.tar.gz), or train your own MFA model by referring to the [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) in our repo. You can download it from here: [vctk_alignment.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/VCTK-Corpus-0.92/vctk_alignment.tar.gz), or train your own MFA model by referring to the [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) in our repo.
PS: we remove three speakers in VCTK-0.92 (see [reorganize_vctk.py](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/examples/other/use_mfa/local/reorganize_vctk.py)): PS: we remove three speakers in VCTK-0.92 (see [reorganize_vctk.py](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/examples/other/mfa/local/reorganize_vctk.py)):
1. `p315`, because there is no txt for it. 1. `p315`, because there is no txt for it.
2. `p280` and `p362`, because there are no *_mic2.flac files (which are better than *_mic1.flac) for them. 2. `p280` and `p362`, because there are no *_mic2.flac files (which are better than *_mic1.flac) for them.
......
...@@ -27,7 +27,7 @@ set -o pipefail ...@@ -27,7 +27,7 @@ set -o pipefail
mkdir -p data mkdir -p data
TARGET_DIR=${MAIN_ROOT}/examples/dataset TARGET_DIR=${MAIN_ROOT}/dataset
mkdir -p ${TARGET_DIR} mkdir -p ${TARGET_DIR}
if [ ${stage} -le -2 ] && [ ${stop_stage} -ge -2 ]; then if [ ${stage} -le -2 ] && [ ${stop_stage} -ge -2 ]; then
......
...@@ -34,7 +34,7 @@ for type in attention ctc_greedy_search; do ...@@ -34,7 +34,7 @@ for type in attention ctc_greedy_search; do
output_dir=${ckpt_prefix} output_dir=${ckpt_prefix}
mkdir -p ${output_dir} mkdir -p ${output_dir}
python3 -u ${BIN_DIR}/test.py \ python3 -u ${BIN_DIR}/test.py \
--nproc ${ngpu} \ --ngpu ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--result_file ${output_dir}/${type}.rsl \ --result_file ${output_dir}/${type}.rsl \
--checkpoint_path ${ckpt_prefix} \ --checkpoint_path ${ckpt_prefix} \
...@@ -53,7 +53,7 @@ for type in ctc_prefix_beam_search attention_rescoring; do ...@@ -53,7 +53,7 @@ for type in ctc_prefix_beam_search attention_rescoring; do
output_dir=${ckpt_prefix} output_dir=${ckpt_prefix}
mkdir -p ${output_dir} mkdir -p ${output_dir}
python3 -u ${BIN_DIR}/test.py \ python3 -u ${BIN_DIR}/test.py \
--nproc ${ngpu} \ --ngpu ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--result_file ${output_dir}/${type}.rsl \ --result_file ${output_dir}/${type}.rsl \
--checkpoint_path ${ckpt_prefix} \ --checkpoint_path ${ckpt_prefix} \
......
...@@ -29,7 +29,7 @@ for type in attention_rescoring; do ...@@ -29,7 +29,7 @@ for type in attention_rescoring; do
output_dir=${ckpt_prefix} output_dir=${ckpt_prefix}
mkdir -p ${output_dir} mkdir -p ${output_dir}
python3 -u ${BIN_DIR}/test_wav.py \ python3 -u ${BIN_DIR}/test_wav.py \
--nproc ${ngpu} \ --ngpu ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--result_file ${output_dir}/${type}.rsl \ --result_file ${output_dir}/${type}.rsl \
--checkpoint_path ${ckpt_prefix} \ --checkpoint_path ${ckpt_prefix} \
......
...@@ -11,24 +11,14 @@ ...@@ -11,24 +11,14 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from .aishell import AISHELL1
from .dcase import UrbanAcousticScenes
from .dcase import UrbanAudioVisualScenes
from .esc50 import ESC50 from .esc50 import ESC50
from .gtzan import GTZAN from .gtzan import GTZAN
from .librispeech import LIBRISPEECH
from .ravdess import RAVDESS
from .tess import TESS from .tess import TESS
from .urban_sound import UrbanSound8K from .urban_sound import UrbanSound8K
__all__ = [ __all__ = [
'AISHELL1',
'LIBRISPEECH',
'ESC50', 'ESC50',
'UrbanSound8K', 'UrbanSound8K',
'GTZAN', 'GTZAN',
'UrbanAcousticScenes',
'UrbanAudioVisualScenes',
'RAVDESS',
'TESS', 'TESS',
] ]
...@@ -13,3 +13,4 @@ ...@@ -13,3 +13,4 @@
# limitations under the License. # limitations under the License.
from .augment import * from .augment import *
from .core import * from .core import *
from .spectrum import *
...@@ -15,8 +15,9 @@ from typing import List ...@@ -15,8 +15,9 @@ from typing import List
import numpy as np import numpy as np
from numpy import ndarray as array from numpy import ndarray as array
from paddleaudio.backends import depth_convert
from paddleaudio.utils import ParameterError from ..backends import depth_convert
from ..utils import ParameterError
__all__ = [ __all__ = [
'depth_augment', 'depth_augment',
......
...@@ -21,9 +21,10 @@ import numpy as np ...@@ -21,9 +21,10 @@ import numpy as np
import scipy import scipy
from numpy import ndarray as array from numpy import ndarray as array
from numpy.lib.stride_tricks import as_strided from numpy.lib.stride_tricks import as_strided
from paddleaudio.utils import ParameterError
from scipy.signal import get_window from scipy.signal import get_window
from ..utils import ParameterError
__all__ = [ __all__ = [
'stft', 'stft',
'mfcc', 'mfcc',
...@@ -293,6 +294,7 @@ def stft(x: array, ...@@ -293,6 +294,7 @@ def stft(x: array,
This function is aligned with librosa. This function is aligned with librosa.
""" """
_check_audio(x) _check_audio(x)
# By default, use the entire frame # By default, use the entire frame
if win_length is None: if win_length is None:
win_length = n_fft win_length = n_fft
...@@ -397,7 +399,7 @@ def mfcc(x, ...@@ -397,7 +399,7 @@ def mfcc(x,
This function is NOT strictly aligned with librosa. The following example shows how to get the This function is NOT strictly aligned with librosa. The following example shows how to get the
same result with librosa: same result with librosa:
# paddleaudioe mfcc: # mfcc:
kwargs = { kwargs = {
'window_size':512, 'window_size':512,
'hop_length':320, 'hop_length':320,
......
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
from functools import partial
from typing import Optional
from typing import Union
import paddle
import paddle.nn as nn
from .window import get_window
__all__ = [
'Spectrogram',
'MelSpectrogram',
'LogMelSpectrogram',
]
def hz_to_mel(freq: Union[paddle.Tensor, float],
htk: bool=False) -> Union[paddle.Tensor, float]:
"""Convert Hz to Mels.
Parameters:
freq: the input tensor of arbitrary shape, or a single floating point number.
htk: use HTK formula to do the conversion.
The default value is False.
Returns:
The frequencies represented in Mel-scale.
"""
if htk:
if isinstance(freq, paddle.Tensor):
return 2595.0 * paddle.log10(1.0 + freq / 700.0)
else:
return 2595.0 * math.log10(1.0 + freq / 700.0)
# Fill in the linear part
f_min = 0.0
f_sp = 200.0 / 3
mels = (freq - f_min) / f_sp
# Fill in the log-scale part
min_log_hz = 1000.0 # beginning of log region (Hz)
min_log_mel = (min_log_hz - f_min) / f_sp # same (Mels)
logstep = math.log(6.4) / 27.0 # step size for log region
if isinstance(freq, paddle.Tensor):
target = min_log_mel + paddle.log(
freq / min_log_hz + 1e-10) / logstep # prevent nan with 1e-10
mask = (freq > min_log_hz).astype(freq.dtype)
mels = target * mask + mels * (
1 - mask) # will replace by masked_fill OP in future
else:
if freq >= min_log_hz:
mels = min_log_mel + math.log(freq / min_log_hz + 1e-10) / logstep
return mels
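# Sanity check: on the Slaney scale, hz_to_mel(1000.0) ~= 15.0, since 1000 Hz
# is exactly the boundary of the linear part ((1000 - 0) / (200 / 3) = 15).
# With htk=True, hz_to_mel(1000.0) ~= 1000.0 by construction of the HTK
# formula mel = 2595 * log10(1 + f / 700).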
def mel_to_hz(mel: Union[float, paddle.Tensor],
htk: bool=False) -> Union[float, paddle.Tensor]:
"""Convert mel bin numbers to frequencies.
Parameters:
mel: the mel frequency represented as a tensor of arbitrary shape, or a floating point number.
htk: use HTK formula to do the conversion.
Returns:
The frequencies represented in hz.
"""
if htk:
return 700.0 * (10.0**(mel / 2595.0) - 1.0)
f_min = 0.0
f_sp = 200.0 / 3
freqs = f_min + f_sp * mel
# And now the nonlinear scale
min_log_hz = 1000.0 # beginning of log region (Hz)
min_log_mel = (min_log_hz - f_min) / f_sp # same (Mels)
logstep = math.log(6.4) / 27.0 # step size for log region
if isinstance(mel, paddle.Tensor):
target = min_log_hz * paddle.exp(logstep * (mel - min_log_mel))
mask = (mel > min_log_mel).astype(mel.dtype)
freqs = target * mask + freqs * (
1 - mask) # will replace by masked_fill OP in future
else:
if mel >= min_log_mel:
freqs = min_log_hz * math.exp(logstep * (mel - min_log_mel))
return freqs
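# A quick sanity check of the two conversions above (a sketch using plain
# floats on the Slaney scale; values are approximate):
#
#     hz_to_mel(1000.0)            # -> 15.0, the boundary of the log region
#     mel_to_hz(hz_to_mel(440.0))  # -> 440.0, the round trip is lossless in the linear region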
def mel_frequencies(n_mels: int=64,
f_min: float=0.0,
f_max: float=11025.0,
htk: bool=False,
dtype: str=paddle.float32):
"""Compute mel frequencies.
Parameters:
n_mels(int): number of Mel bins.
f_min(float): the lower cut-off frequency, below which the filter response is zero.
f_max(float): the upper cut-off frequency, above which the filter response is zero.
htk(bool): whether to use htk formula.
dtype(str): the datatype of the return frequencies.
Returns:
The frequencies represented in Mel-scale
"""
# 'Center freqs' of mel bands - uniformly spaced between limits
min_mel = hz_to_mel(f_min, htk=htk)
max_mel = hz_to_mel(f_max, htk=htk)
mels = paddle.linspace(min_mel, max_mel, n_mels, dtype=dtype)
freqs = mel_to_hz(mels, htk=htk)
return freqs
def fft_frequencies(sr: int, n_fft: int, dtype: str=paddle.float32):
"""Compute fourier frequencies.
Parameters:
sr(int): the audio sample rate.
        n_fft(int): the number of fft bins.
dtype(str): the datatype of the return frequencies.
Returns:
The frequencies represented in hz.
"""
return paddle.linspace(0, float(sr) / 2, int(1 + n_fft // 2), dtype=dtype)
def compute_fbank_matrix(sr: int,
n_fft: int,
n_mels: int=64,
f_min: float=0.0,
f_max: Optional[float]=None,
htk: bool=False,
norm: Union[str, float]='slaney',
dtype: str=paddle.float32):
"""Compute fbank matrix.
Parameters:
sr(int): the audio sample rate.
n_fft(int): the number of fft bins.
n_mels(int): the number of Mel bins.
f_min(float): the lower cut-off frequency, below which the filter response is zero.
f_max(float): the upper cut-off frequency, above which the filter response is zero.
        htk(bool): whether to use htk formula.
        norm(str|float): the normalization type in computing fbank matrix. Slaney-style is
            used by default. You can specify norm=1.0/2.0 to use customized p-norm normalization.
dtype(str): the datatype of the returned fbank matrix.
Returns:
The fbank matrix of shape (n_mels, int(1+n_fft//2)).
Shape:
output: (n_mels, int(1+n_fft//2))
"""
if f_max is None:
f_max = float(sr) / 2
# Initialize the weights
weights = paddle.zeros((n_mels, int(1 + n_fft // 2)), dtype=dtype)
# Center freqs of each FFT bin
fftfreqs = fft_frequencies(sr=sr, n_fft=n_fft, dtype=dtype)
# 'Center freqs' of mel bands - uniformly spaced between limits
mel_f = mel_frequencies(
n_mels + 2, f_min=f_min, f_max=f_max, htk=htk, dtype=dtype)
fdiff = mel_f[1:] - mel_f[:-1] #np.diff(mel_f)
ramps = mel_f.unsqueeze(1) - fftfreqs.unsqueeze(0)
#ramps = np.subtract.outer(mel_f, fftfreqs)
for i in range(n_mels):
# lower and upper slopes for all bins
lower = -ramps[i] / fdiff[i]
upper = ramps[i + 2] / fdiff[i + 1]
# .. then intersect them with each other and zero
weights[i] = paddle.maximum(
paddle.zeros_like(lower), paddle.minimum(lower, upper))
# Slaney-style mel is scaled to be approx constant energy per channel
if norm == 'slaney':
enorm = 2.0 / (mel_f[2:n_mels + 2] - mel_f[:n_mels])
weights *= enorm.unsqueeze(1)
elif isinstance(norm, int) or isinstance(norm, float):
weights = paddle.nn.functional.normalize(weights, p=norm, axis=-1)
return weights
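# Example usage (a sketch): build a 64-band, Slaney-normalized filter bank for
# 16 kHz audio and apply it to a power spectrogram `spec` of shape
# (1 + n_fft//2, n_frames); `spec` is a placeholder for any such tensor.
#
#     fbank = compute_fbank_matrix(sr=16000, n_fft=512, n_mels=64)  # (64, 257)
#     mel_spec = paddle.matmul(fbank, spec)                         # (64, n_frames)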
def power_to_db(magnitude: paddle.Tensor,
ref_value: float=1.0,
amin: float=1e-10,
top_db: Optional[float]=None) -> paddle.Tensor:
"""Convert a power spectrogram (amplitude squared) to decibel (dB) units.
The function computes the scaling ``10 * log10(x / ref)`` in a numerically
stable way.
Parameters:
magnitude(Tensor): the input magnitude tensor of any shape.
ref_value(float): the reference value. If smaller than 1.0, the db level
of the signal will be pulled up accordingly. Otherwise, the db level
is pushed down.
amin(float): the minimum value of input magnitude, below which the input
magnitude is clipped(to amin).
top_db(float): the maximum db value of resulting spectrum, above which the
spectrum is clipped(to top_db).
Returns:
The spectrogram in log-scale.
shape:
input: any shape
output: same as input
"""
if amin <= 0:
raise Exception("amin must be strictly positive")
if ref_value <= 0:
raise Exception("ref_value must be strictly positive")
ones = paddle.ones_like(magnitude)
log_spec = 10.0 * paddle.log10(paddle.maximum(ones * amin, magnitude))
log_spec -= 10.0 * math.log10(max(ref_value, amin))
if top_db is not None:
if top_db < 0:
raise Exception("top_db must be non-negative")
log_spec = paddle.maximum(log_spec, ones * (log_spec.max() - top_db))
return log_spec
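# Example usage (a sketch): convert a power spectrogram to dB, keeping at most
# 80 dB of dynamic range below the peak; `mel_spec` is a placeholder tensor.
#
#     log_spec = power_to_db(mel_spec, ref_value=1.0, amin=1e-10, top_db=80.0)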
class Spectrogram(nn.Layer):
def __init__(self,
n_fft: int=512,
hop_length: Optional[int]=None,
win_length: Optional[int]=None,
window: str='hann',
center: bool=True,
pad_mode: str='reflect',
dtype: str=paddle.float32):
"""Compute spectrogram of a given signal, typically an audio waveform.
The spectorgram is defined as the complex norm of the short-time
Fourier transformation.
Parameters:
n_fft(int): the number of frequency components of the discrete Fourier transform.
The default value is 2048,
hop_length(int|None): the hop length of the short time FFT. If None, it is set to win_length//4.
The default value is None.
win_length: the window length of the short time FFt. If None, it is set to same as n_fft.
The default value is None.
window(str): the name of the window function applied to the single before the Fourier transform.
The folllowing window names are supported: 'hamming','hann','kaiser','gaussian',
'exponential','triang','bohman','blackman','cosine','tukey','taylor'.
The default value is 'hann'
center(bool): if True, the signal is padded so that frame t is centered at x[t * hop_length].
If False, frame t begins at x[t * hop_length]
The default value is True
pad_mode(str): the mode to pad the signal if necessary. The supported modes are 'reflect'
and 'constant'. The default value is 'reflect'.
dtype(str): the data type of input and window.
Notes:
The Spectrogram transform relies on STFT transform to compute the spectrogram.
By default, the weights are not learnable. To fine-tune the Fourier coefficients,
set stop_gradient=False before training.
For more information, see STFT().
"""
super(Spectrogram, self).__init__()
if win_length is None:
win_length = n_fft
fft_window = get_window(window, win_length, fftbins=True, dtype=dtype)
self._stft = partial(
paddle.signal.stft,
n_fft=n_fft,
hop_length=hop_length,
win_length=win_length,
window=fft_window,
center=center,
pad_mode=pad_mode)
def forward(self, x):
stft = self._stft(x)
spectrogram = paddle.square(paddle.abs(stft))
return spectrogram
class MelSpectrogram(nn.Layer):
def __init__(self,
sr: int=22050,
n_fft: int=512,
hop_length: Optional[int]=None,
win_length: Optional[int]=None,
window: str='hann',
center: bool=True,
pad_mode: str='reflect',
n_mels: int=64,
f_min: float=50.0,
f_max: Optional[float]=None,
htk: bool=False,
norm: Union[str, float]='slaney',
dtype: str=paddle.float32):
"""Compute the melspectrogram of a given signal, typically an audio waveform.
The melspectrogram is also known as filterbank or fbank feature in audio community.
It is computed by multiplying spectrogram with Mel filter bank matrix.
Parameters:
sr(int): the audio sample rate.
The default value is 22050.
            n_fft(int): the number of frequency components of the discrete Fourier transform.
                The default value is 512.
            hop_length(int|None): the hop length of the short time FFT. If None, it is set to win_length//4.
                The default value is None.
            win_length: the window length of the short time FFT. If None, it is set to same as n_fft.
                The default value is None.
            window(str): the name of the window function applied to the signal before the Fourier transform.
                The following window names are supported: 'hamming','hann','kaiser','gaussian',
                'exponential','triang','bohman','blackman','cosine','tukey','taylor'.
                The default value is 'hann'.
            center(bool): if True, the signal is padded so that frame t is centered at x[t * hop_length].
                If False, frame t begins at x[t * hop_length].
                The default value is True.
            pad_mode(str): the mode to pad the signal if necessary. The supported modes are 'reflect'
                and 'constant'.
                The default value is 'reflect'.
            n_mels(int): the number of mel bins.
            f_min(float): the lower cut-off frequency, below which the filter response is zero.
            f_max(float): the upper cut-off frequency, above which the filter response is zero.
htk(bool): whether to use HTK formula in computing fbank matrix.
norm(str|float): the normalization type in computing fbank matrix. Slaney-style is used by default.
You can specify norm=1.0/2.0 to use customized p-norm normalization.
dtype(str): the datatype of fbank matrix used in the transform. Use float64 to increase numerical
accuracy. Note that the final transform will be conducted in float32 regardless of dtype of fbank matrix.
"""
super(MelSpectrogram, self).__init__()
self._spectrogram = Spectrogram(
n_fft=n_fft,
hop_length=hop_length,
win_length=win_length,
window=window,
center=center,
pad_mode=pad_mode,
dtype=dtype)
self.n_mels = n_mels
self.f_min = f_min
self.f_max = f_max
self.htk = htk
self.norm = norm
if f_max is None:
f_max = sr // 2
self.fbank_matrix = compute_fbank_matrix(
sr=sr,
n_fft=n_fft,
n_mels=n_mels,
f_min=f_min,
f_max=f_max,
htk=htk,
norm=norm,
dtype=dtype) # float64 for better numerical results
self.register_buffer('fbank_matrix', self.fbank_matrix)
def forward(self, x):
spect_feature = self._spectrogram(x)
mel_feature = paddle.matmul(self.fbank_matrix, spect_feature)
return mel_feature
class LogMelSpectrogram(nn.Layer):
def __init__(self,
sr: int=22050,
n_fft: int=512,
hop_length: Optional[int]=None,
win_length: Optional[int]=None,
window: str='hann',
center: bool=True,
pad_mode: str='reflect',
n_mels: int=64,
f_min: float=50.0,
f_max: Optional[float]=None,
htk: bool=False,
norm: Union[str, float]='slaney',
ref_value: float=1.0,
amin: float=1e-10,
top_db: Optional[float]=None,
dtype: str=paddle.float32):
"""Compute log-mel-spectrogram(also known as LogFBank) feature of a given signal,
typically an audio waveform.
Parameters:
sr(int): the audio sample rate.
The default value is 22050.
            n_fft(int): the number of frequency components of the discrete Fourier transform.
                The default value is 512.
            hop_length(int|None): the hop length of the short time FFT. If None, it is set to win_length//4.
                The default value is None.
            win_length: the window length of the short time FFT. If None, it is set to same as n_fft.
                The default value is None.
            window(str): the name of the window function applied to the signal before the Fourier transform.
                The following window names are supported: 'hamming','hann','kaiser','gaussian',
                'exponential','triang','bohman','blackman','cosine','tukey','taylor'.
                The default value is 'hann'.
            center(bool): if True, the signal is padded so that frame t is centered at x[t * hop_length].
                If False, frame t begins at x[t * hop_length].
                The default value is True.
            pad_mode(str): the mode to pad the signal if necessary. The supported modes are 'reflect'
                and 'constant'.
                The default value is 'reflect'.
            n_mels(int): the number of mel bins.
            f_min(float): the lower cut-off frequency, below which the filter response is zero.
            f_max(float): the upper cut-off frequency, above which the filter response is zero.
            htk(bool): whether to use HTK formula in computing fbank matrix.
            norm(str|float): the normalization type in computing fbank matrix. Slaney-style is used by default.
                You can specify norm=1.0/2.0 to use customized p-norm normalization.
            ref_value(float): the reference value. If smaller than 1.0, the db level of the signal
                will be pulled up accordingly. Otherwise, the db level is pushed down.
            amin(float): the minimum value of input magnitude, below which the input magnitude is
                clipped(to amin). For numerical stability, set amin to a larger value, e.g., 1e-3.
            top_db(float): the maximum db value of the resulting spectrum, above which the
                spectrum is clipped(to top_db).
            dtype(str): the datatype of fbank matrix used in the transform. Use float64 to increase numerical
                accuracy. Note that the final transform will be conducted in float32 regardless of dtype of fbank matrix.
"""
super(LogMelSpectrogram, self).__init__()
self._melspectrogram = MelSpectrogram(
sr=sr,
n_fft=n_fft,
hop_length=hop_length,
win_length=win_length,
window=window,
center=center,
pad_mode=pad_mode,
n_mels=n_mels,
f_min=f_min,
f_max=f_max,
htk=htk,
norm=norm,
dtype=dtype)
self.ref_value = ref_value
self.amin = amin
self.top_db = top_db
def forward(self, x):
mel_feature = self._melspectrogram(x)
log_mel_feature = power_to_db(
mel_feature,
ref_value=self.ref_value,
amin=self.amin,
top_db=self.top_db)
return log_mel_feature
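# End-to-end usage of the transforms above (a sketch; 'waveform.wav' and the
# 16 kHz sample rate are illustrative assumptions):
#
#     import paddle
#     from paddleaudio.backends import load as load_audio
#
#     waveform, sr = load_audio('waveform.wav', sr=16000)
#     x = paddle.to_tensor(waveform).unsqueeze(0)         # (batch, time)
#     extractor = LogMelSpectrogram(sr=sr, n_fft=512, n_mels=64, top_db=80.0)
#     log_fbank = extractor(x)                            # (batch, n_mels, n_frames)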
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
from typing import List
from typing import Tuple
from typing import Union
import paddle
from paddle import Tensor
__all__ = [
'get_window',
]
def _cat(a: List[Tensor], data_type: str) -> Tensor:
l = [paddle.to_tensor(_a, data_type) for _a in a]
return paddle.concat(l)
def _acosh(x: Union[Tensor, float]) -> Tensor:
if isinstance(x, float):
return math.log(x + math.sqrt(x**2 - 1))
return paddle.log(x + paddle.sqrt(paddle.square(x) - 1))
def _extend(M: int, sym: bool) -> Tuple[int, bool]:
"""Extend window by 1 sample if needed for DFT-even symmetry"""
if not sym:
return M + 1, True
else:
return M, False
def _len_guards(M: int) -> bool:
"""Handle small or incorrect window lengths"""
if int(M) != M or M < 0:
raise ValueError('Window length M must be a non-negative integer')
return M <= 1
def _truncate(w: Tensor, needed: bool) -> Tensor:
"""Truncate window by 1 sample if needed for DFT-even symmetry"""
if needed:
return w[:-1]
else:
return w
def general_gaussian(M: int, p, sig, sym: bool=True,
dtype: str='float64') -> Tensor:
"""Compute a window with a generalized Gaussian shape.
This function is consistent with scipy.signal.windows.general_gaussian().
"""
if _len_guards(M):
return paddle.ones((M, ), dtype=dtype)
M, needs_trunc = _extend(M, sym)
n = paddle.arange(0, M, dtype=dtype) - (M - 1.0) / 2.0
w = paddle.exp(-0.5 * paddle.abs(n / sig)**(2 * p))
return _truncate(w, needs_trunc)
def general_hamming(M: int, alpha: float, sym: bool=True,
dtype: str='float64') -> Tensor:
"""Compute a generalized Hamming window.
This function is consistent with scipy.signal.windows.general_hamming()
"""
return general_cosine(M, [alpha, 1. - alpha], sym, dtype=dtype)
def taylor(M: int,
nbar=4,
sll=30,
norm=True,
sym: bool=True,
dtype: str='float64') -> Tensor:
"""Compute a Taylor window.
The Taylor window taper function approximates the Dolph-Chebyshev window's
constant sidelobe level for a parameterized number of near-in sidelobes.
Parameters:
M(int): window size
        nbar, sll, norm: the window-specific parameters.
sym(bool):whether to return symmetric window.
The default value is True
dtype(str): the datatype of returned tensor.
Returns:
Tensor: the window tensor
"""
if _len_guards(M):
return paddle.ones((M, ), dtype=dtype)
M, needs_trunc = _extend(M, sym)
# Original text uses a negative sidelobe level parameter and then negates
# it in the calculation of B. To keep consistent with other methods we
# assume the sidelobe level parameter to be positive.
B = 10**(sll / 20)
A = _acosh(B) / math.pi
s2 = nbar**2 / (A**2 + (nbar - 0.5)**2)
ma = paddle.arange(1, nbar, dtype=dtype)
Fm = paddle.empty((nbar - 1, ), dtype=dtype)
signs = paddle.empty_like(ma)
signs[::2] = 1
signs[1::2] = -1
m2 = ma * ma
for mi in range(len(ma)):
numer = signs[mi] * paddle.prod(1 - m2[mi] / s2 / (A**2 + (ma - 0.5)**2
))
if mi == 0:
denom = 2 * paddle.prod(1 - m2[mi] / m2[mi + 1:])
elif mi == len(ma) - 1:
denom = 2 * paddle.prod(1 - m2[mi] / m2[:mi])
else:
denom = 2 * paddle.prod(1 - m2[mi] / m2[:mi]) * paddle.prod(1 - m2[
mi] / m2[mi + 1:])
Fm[mi] = numer / denom
def W(n):
return 1 + 2 * paddle.matmul(
Fm.unsqueeze(0),
paddle.cos(2 * math.pi * ma.unsqueeze(1) * (n - M / 2. + 0.5) / M))
w = W(paddle.arange(0, M, dtype=dtype))
# normalize (Note that this is not described in the original text [1])
if norm:
scale = 1.0 / W((M - 1) / 2)
w *= scale
w = w.squeeze()
return _truncate(w, needs_trunc)
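# For reference, the same window in scipy (an assumption that scipy is
# installed; the implementation above mirrors it):
#
#     from scipy.signal import windows
#     w_ref = windows.taylor(512, nbar=4, sll=30, norm=True, sym=True)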
def general_cosine(M: int, a: List[float], sym: bool=True,
dtype: str='float64') -> Tensor:
"""Compute a generic weighted sum of cosine terms window.
This function is consistent with scipy.signal.windows.general_cosine().
"""
if _len_guards(M):
return paddle.ones((M, ), dtype=dtype)
M, needs_trunc = _extend(M, sym)
fac = paddle.linspace(-math.pi, math.pi, M, dtype=dtype)
w = paddle.zeros((M, ), dtype=dtype)
for k in range(len(a)):
w += a[k] * paddle.cos(k * fac)
return _truncate(w, needs_trunc)
def hamming(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
"""Compute a Hamming window.
The Hamming window is a taper formed by using a raised cosine with
non-zero endpoints, optimized to minimize the nearest side lobe.
Parameters:
M(int): window size
sym(bool):whether to return symmetric window.
The default value is True
dtype(str): the datatype of returned tensor.
Returns:
Tensor: the window tensor
"""
return general_hamming(M, 0.54, sym, dtype=dtype)
def hann(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
"""Compute a Hann window.
The Hann window is a taper formed by using a raised cosine or sine-squared
with ends that touch zero.
Parameters:
M(int): window size
sym(bool):whether to return symmetric window.
The default value is True
dtype(str): the datatype of returned tensor.
Returns:
Tensor: the window tensor
"""
return general_hamming(M, 0.5, sym, dtype=dtype)
def tukey(M: int, alpha=0.5, sym: bool=True, dtype: str='float64') -> Tensor:
"""Compute a Tukey window.
The Tukey window is also known as a tapered cosine window.
    Parameters:
        M(int): window size.
        alpha(float): the fraction of the window inside the cosine tapered region.
        sym(bool): whether to return symmetric window.
            The default value is True.
        dtype(str): the datatype of returned tensor.
Returns:
Tensor: the window tensor
"""
if _len_guards(M):
return paddle.ones((M, ), dtype=dtype)
if alpha <= 0:
return paddle.ones((M, ), dtype=dtype)
    elif alpha >= 1.0:
        return hann(M, sym=sym, dtype=dtype)
M, needs_trunc = _extend(M, sym)
n = paddle.arange(0, M, dtype=dtype)
width = int(alpha * (M - 1) / 2.0)
n1 = n[0:width + 1]
n2 = n[width + 1:M - width - 1]
n3 = n[M - width - 1:]
w1 = 0.5 * (1 + paddle.cos(math.pi * (-1 + 2.0 * n1 / alpha / (M - 1))))
w2 = paddle.ones(n2.shape, dtype=dtype)
w3 = 0.5 * (1 + paddle.cos(math.pi * (-2.0 / alpha + 1 + 2.0 * n3 / alpha /
(M - 1))))
w = paddle.concat([w1, w2, w3])
return _truncate(w, needs_trunc)
def kaiser(M: int, beta: float, sym: bool=True, dtype: str='float64') -> Tensor:
"""Compute a Kaiser window.
The Kaiser window is a taper formed by using a Bessel function.
Parameters:
M(int): window size.
beta(float): the window-specific parameter.
        sym(bool): whether to return symmetric window.
            The default value is True.
        dtype(str): the datatype of returned tensor.
    Returns:
        Tensor: the window tensor.
    Notes:
        Not implemented yet; calling this function raises NotImplementedError.
    """
raise NotImplementedError()
def gaussian(M: int, std: float, sym: bool=True,
dtype: str='float64') -> Tensor:
"""Compute a Gaussian window.
    The Gaussian window has a Gaussian shape defined by the standard deviation (std).
Parameters:
M(int): window size.
std(float): the window-specific parameter.
sym(bool):whether to return symmetric window.
The default value is True
dtype(str): the datatype of returned tensor.
Returns:
Tensor: the window tensor
"""
if _len_guards(M):
return paddle.ones((M, ), dtype=dtype)
M, needs_trunc = _extend(M, sym)
n = paddle.arange(0, M, dtype=dtype) - (M - 1.0) / 2.0
sig2 = 2 * std * std
w = paddle.exp(-n**2 / sig2)
return _truncate(w, needs_trunc)
def exponential(M: int,
center=None,
tau=1.,
sym: bool=True,
dtype: str='float64') -> Tensor:
"""Compute an exponential (or Poisson) window.
Parameters:
        M(int): window size.
        center(float): the center of the window; must be None when sym is True.
        tau(float): the window-specific decay parameter.
sym(bool):whether to return symmetric window.
The default value is True
dtype(str): the datatype of returned tensor.
Returns:
Tensor: the window tensor
"""
if sym and center is not None:
raise ValueError("If sym==True, center must be None.")
if _len_guards(M):
return paddle.ones((M, ), dtype=dtype)
M, needs_trunc = _extend(M, sym)
if center is None:
center = (M - 1) / 2
n = paddle.arange(0, M, dtype=dtype)
w = paddle.exp(-paddle.abs(n - center) / tau)
return _truncate(w, needs_trunc)
def triang(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
"""Compute a triangular window.
Parameters:
M(int): window size.
sym(bool):whether to return symmetric window.
The default value is True
dtype(str): the datatype of returned tensor.
Returns:
Tensor: the window tensor
"""
if _len_guards(M):
return paddle.ones((M, ), dtype=dtype)
M, needs_trunc = _extend(M, sym)
n = paddle.arange(1, (M + 1) // 2 + 1, dtype=dtype)
if M % 2 == 0:
w = (2 * n - 1.0) / M
w = paddle.concat([w, w[::-1]])
else:
w = 2 * n / (M + 1.0)
w = paddle.concat([w, w[-2::-1]])
return _truncate(w, needs_trunc)
def bohman(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
"""Compute a Bohman window.
The Bohman window is the autocorrelation of a cosine window.
Parameters:
M(int): window size.
sym(bool):whether to return symmetric window.
The default value is True
dtype(str): the datatype of returned tensor.
Returns:
Tensor: the window tensor
"""
if _len_guards(M):
return paddle.ones((M, ), dtype=dtype)
M, needs_trunc = _extend(M, sym)
fac = paddle.abs(paddle.linspace(-1, 1, M, dtype=dtype)[1:-1])
w = (1 - fac) * paddle.cos(math.pi * fac) + 1.0 / math.pi * paddle.sin(
math.pi * fac)
w = _cat([0, w, 0], dtype)
return _truncate(w, needs_trunc)
def blackman(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
"""Compute a Blackman window.
The Blackman window is a taper formed by using the first three terms of
a summation of cosines. It was designed to have close to the minimal
leakage possible. It is close to optimal, only slightly worse than a
Kaiser window.
Parameters:
M(int): window size.
sym(bool):whether to return symmetric window.
The default value is True
dtype(str): the datatype of returned tensor.
Returns:
Tensor: the window tensor
"""
return general_cosine(M, [0.42, 0.50, 0.08], sym, dtype=dtype)
def cosine(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
"""Compute a window with a simple cosine shape.
Parameters:
M(int): window size.
sym(bool):whether to return symmetric window.
The default value is True
dtype(str): the datatype of returned tensor.
Returns:
Tensor: the window tensor
"""
if _len_guards(M):
return paddle.ones((M, ), dtype=dtype)
M, needs_trunc = _extend(M, sym)
w = paddle.sin(math.pi / M * (paddle.arange(0, M, dtype=dtype) + .5))
return _truncate(w, needs_trunc)
def get_window(window: Union[str, Tuple[str, float]],
win_length: int,
fftbins: bool=True,
dtype: str='float64') -> Tensor:
"""Return a window of a given length and type.
    Parameters:
        window(str|(str,float)): the type of window to create.
        win_length(int): the number of samples in the window.
        fftbins(bool): If True, create a "periodic" window. Otherwise,
            create a "symmetric" window, for use in filter design.
        dtype(str): the datatype of the returned tensor.
    Returns:
        The window represented as a tensor.
"""
sym = not fftbins
args = ()
if isinstance(window, tuple):
winstr = window[0]
if len(window) > 1:
args = window[1:]
elif isinstance(window, str):
if window in ['gaussian', 'exponential']:
raise ValueError("The '" + window + "' window needs one or "
"more parameters -- pass a tuple.")
else:
winstr = window
else:
raise ValueError("%s as window type is not supported." %
str(type(window)))
    try:
        winfunc = eval(winstr)
    except NameError as e:
        raise ValueError("Unknown window type.") from e
params = (win_length, ) + args
kwargs = {'sym': sym}
return winfunc(*params, dtype=dtype, **kwargs)
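# Example usage (a sketch): parameter-free windows are requested by name, while
# parameterized windows are passed as a (name, parameter) tuple:
#
#     hann_win = get_window('hann', 512)              # periodic Hann window of length 512
#     gauss_win = get_window(('gaussian', 7.0), 512)  # Gaussian window with std=7.0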
@@ -17,7 +17,6 @@ from typing import List
 from paddle.framework import load as load_state_dict
 from paddle.utils import download
-from pathos.multiprocessing import ProcessPool
 from .log import logger
@@ -32,27 +31,18 @@ def decompress(file: str):
     download._decompress(file)
-def download_and_decompress(archives: List[Dict[str, str]],
-                            path: str,
-                            n_workers: int=0):
+def download_and_decompress(archives: List[Dict[str, str]], path: str):
     """
     Download archives and decompress to specific path.
     """
     if not os.path.isdir(path):
         os.makedirs(path)
-    if n_workers <= 0:
-        for archive in archives:
-            assert 'url' in archive and 'md5' in archive, \
-                'Dictionary keys of "url" and "md5" are required in the archive, but got: {list(archieve.keys())}'
-            download.get_path_from_url(archive['url'], path, archive['md5'])
-    else:
-        pool = ProcessPool(nodes=n_workers)
-        pool.imap(download.get_path_from_url, [_['url'] for _ in archives],
-                  [path] * len(archives), [_['md5'] for _ in archives])
-        pool.close()
-        pool.join()
+    for archive in archives:
+        assert 'url' in archive and 'md5' in archive, \
+            f'Dictionary keys of "url" and "md5" are required in the archive, but got: {list(archive.keys())}'
+        download.get_path_from_url(archive['url'], path, archive['md5'])
 def load_state_dict_from_url(url: str, path: str, md5: str=None):
......
# PaddleSpeech Command Line
The simplest way to use PaddleSpeech models.
## Help
`paddlespeech help`
## S2T
`paddlespeech s2t --config ./s2t.yaml --input ./zh.wav --device gpu`
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .base_commands import BaseCommand
from .base_commands import HelpCommand
from .s2t import S2TExecutor
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List
from .entry import commands
from .utils import cli_register
from .utils import get_command
__all__ = [
'BaseCommand',
'HelpCommand',
]
@cli_register(name='paddlespeech')
class BaseCommand:
def execute(self, argv: List[str]) -> bool:
help = get_command('paddlespeech.help')
return help().execute(argv)
@cli_register(name='paddlespeech.help', description='Show help for commands.')
class HelpCommand:
def execute(self, argv: List[str]) -> bool:
msg = 'Usage:\n'
msg += ' paddlespeech <command> <options>\n\n'
msg += 'Commands:\n'
for command, detail in commands['paddlespeech'].items():
if command.startswith('_'):
continue
if '_description' not in detail:
continue
msg += ' {:<15} {}\n'.format(command,
detail['_description'])
print(msg)
return True
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
from collections import defaultdict
__all__ = ['commands']
def _CommandDict():
return defaultdict(_CommandDict)
def _execute():
com = commands
for idx, _argv in enumerate(['paddlespeech'] + sys.argv[1:]):
if _argv not in com:
break
com = com[_argv]
    # The method 'execute' of a command instance returns 'True' on success
    # and 'False' on failure. Convert this result into an exit status as in
    # bash: 0 for a success and 1 for a failure.
status = 0 if com['_entry']().execute(sys.argv[idx:]) else 1
return status
commands = _CommandDict()
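# How registration and dispatch fit together (a sketch): cli_register fills the
# nested 'commands' dict, and _execute walks the argv prefix down that dict to
# the deepest registered command. For `paddlespeech s2t --input zh.wav`, the
# walk stops at commands['paddlespeech']['s2t'], so _execute() is roughly:
#
#     status = 0 if get_command('paddlespeech.s2t')().execute(
#         ['--input', 'zh.wav']) else 1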
@@ -11,27 +11,57 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import numpy as np
+import os
+from abc import ABC
+from abc import abstractmethod
+from typing import Optional
+from typing import Union
 import paddle
-from paddle import Tensor
-def expand(encodings: Tensor, durations: Tensor) -> Tensor:
+class BaseExecutor(ABC):
     """
-    encodings: (B, T, C)
-    durations: (B, T)
+    An abstract executor of paddlespeech tasks.
     """
-    batch_size, t_enc = durations.shape
-    durations = durations.numpy()
-    slens = np.sum(durations, -1)
-    t_dec = np.max(slens)
-    M = np.zeros([batch_size, t_dec, t_enc])
-    for i in range(batch_size):
-        k = 0
-        for j in range(t_enc):
-            d = durations[i, j]
-            M[i, k:k + d, j] = 1
-            k += d
-    M = paddle.to_tensor(M, dtype=encodings.dtype)
-    encodings = paddle.matmul(M, encodings)
-    return encodings
+    def __init__(self):
+        self.input = None
+        self.output = None
+    @abstractmethod
+    def _get_default_cfg_path(self):
+        """
+        Returns a default config file path of current task.
+        """
+        pass
+    @abstractmethod
+    def _init_from_cfg(self, cfg_path: Optional[os.PathLike]=None):
+        """
+        Init model from a specific config file.
+        """
+        pass
+    @abstractmethod
+    def preprocess(self, input: Union[str, os.PathLike]):
+        """
+        Input preprocess and return paddle.Tensor stored in self.input.
+        Input content can be a text(t2s), a file(s2t, cls) or a streaming(not supported yet).
+        """
+        pass
+    @paddle.no_grad()
+    @abstractmethod
+    def infer(self, device: str):
+        """
+        Model inference and result stored in self.output.
+        """
+        pass
+    @abstractmethod
+    def postprocess(self) -> Union[str, os.PathLike]:
+        """
+        Output postprocess and return human-readable results such as texts and audio files.
+        """
+        pass
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .infer import S2TExecutor
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
from typing import List
from typing import Optional
from typing import Union
import paddle
from ..executor import BaseExecutor
from ..utils import cli_register
__all__ = ['S2TExecutor']
@cli_register(
name='paddlespeech.s2t', description='Speech to text infer command.')
class S2TExecutor(BaseExecutor):
def __init__(self):
super(S2TExecutor, self).__init__()
self.parser = argparse.ArgumentParser(
prog='paddlespeech.s2t', add_help=True)
self.parser.add_argument(
'--config',
type=str,
default=None,
            help='Config of s2t task. Use default config when it is None.')
self.parser.add_argument(
'--input', type=str, help='Audio file to recognize.')
self.parser.add_argument(
'--device',
type=str,
default='cpu',
help='Choose device to execute model inference.')
def _get_default_cfg_path(self):
"""
Returns a default config file path of current task.
"""
pass
def _init_from_cfg(self, cfg_path: Optional[os.PathLike]=None):
"""
Init model from a specific config file.
"""
pass
def preprocess(self, input: Union[str, os.PathLike]):
"""
Input preprocess and return paddle.Tensor stored in self.input.
Input content can be a text(t2s), a file(s2t, cls) or a streaming(not supported yet).
"""
pass
@paddle.no_grad()
def infer(self):
"""
Model inference and result stored in self.output.
"""
pass
def postprocess(self) -> Union[str, os.PathLike]:
"""
Output postprocess and return human-readable results such as texts and audio files.
"""
pass
def execute(self, argv: List[str]) -> bool:
parser_args = self.parser.parse_args(argv)
print(parser_args)
config = parser_args.config
audio_file = parser_args.input
device = parser_args.device
if config is not None:
assert os.path.isfile(config), 'Config file is not valid.'
else:
config = self._get_default_cfg_path()
try:
self._init_from_cfg(config)
self.preprocess(audio_file)
self.infer()
res = self.postprocess() # Retrieve result of s2t.
print(res)
return True
except Exception as e:
print(e)
return False
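# Example usage (a sketch; 'zh.wav' is a placeholder path): the executor can be
# driven programmatically with the same argv it receives from the CLI:
#
#     executor = S2TExecutor()
#     success = executor.execute(['--input', 'zh.wav', '--device', 'cpu'])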
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from typing import Any
from typing import Dict
from typing import List
from paddle.framework import load
from paddle.utils import download
from .entry import commands
__all__ = [
'cli_register',
'get_command',
'download_and_decompress',
'load_state_dict_from_url',
]
def cli_register(name: str, description: str='') -> Any:
    def _wrapper(command):
        items = name.split('.')
        com = commands
        for item in items:
            com = com[item]
        com['_entry'] = command
        if description:
            com['_description'] = description
        return command
    return _wrapper
def get_command(name: str) -> Any:
items = name.split('.')
com = commands
for item in items:
com = com[item]
return com['_entry']
def decompress(file: str):
"""
Extracts all files from a compressed file.
"""
    assert os.path.isfile(file), "File: {} does not exist.".format(file)
download._decompress(file)
def download_and_decompress(archives: List[Dict[str, str]], path: str):
"""
Download archieves and decompress to specific path.
"""
if not os.path.isdir(path):
os.makedirs(path)
for archive in archives:
assert 'url' in archive and 'md5' in archive, \
'Dictionary keys of "url" and "md5" are required in the archive, but got: {list(archieve.keys())}'
download.get_path_from_url(archive['url'], path, archive['md5'])
def load_state_dict_from_url(url: str, path: str, md5: str=None):
"""
Download and load a state dict from url
"""
if not os.path.isdir(path):
os.makedirs(path)
download.get_path_from_url(url, path, md5)
return load(os.path.join(path, os.path.basename(url)))
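# Example usage (a sketch; the URL and md5 below are placeholders, not real
# assets):
#
#     archives = [{'url': 'https://example.com/esc50.tar.gz', 'md5': '<md5sum>'}]
#     download_and_decompress(archives, path='./data')
#     state_dict = load_state_dict_from_url(
#         'https://example.com/cnn14.pdparams', path='./ckpt')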
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
@@ -16,16 +16,18 @@ import os
 import numpy as np
 from paddle import inference
-from scipy.special import softmax
 from paddleaudio.backends import load as load_audio
 from paddleaudio.datasets import ESC50
 from paddleaudio.features import melspectrogram
+from scipy.special import softmax
 # yapf: disable
 parser = argparse.ArgumentParser()
 parser.add_argument("--model_dir", type=str, required=True, default="./export", help="The directory to static model.")
-parser.add_argument("--batch_size", type=int, default=2, help="Batch size per GPU/CPU for training.")
 parser.add_argument('--device', choices=['cpu', 'gpu', 'xpu'], default="gpu", help="Select which device to train model, defaults to gpu.")
+parser.add_argument("--wav", type=str, required=True, help="Audio file to infer.")
+parser.add_argument("--batch_size", type=int, default=1, help="Batch size per GPU/CPU for training.")
 parser.add_argument('--use_tensorrt', type=eval, default=False, choices=[True, False], help='Enable to use tensorrt to speed up.')
 parser.add_argument("--precision", type=str, default="fp32", choices=["fp32", "fp16"], help='The tensorrt precision.')
 parser.add_argument('--cpu_threads', type=int, default=10, help='Number of threads to predict when using cpu.')
@@ -131,10 +133,7 @@ if __name__ == "__main__":
         args.use_tensorrt, args.precision, args.cpu_threads,
         args.enable_mkldnn)
-    wavs = [
-        '~/audio_demo_resource/cat.wav',
-        '~/audio_demo_resource/dog.wav',
-    ]
+    wavs = [args.wav]
     for i in range(len(wavs)):
         wavs[i] = os.path.abspath(os.path.expanduser(wavs[i]))
......
@@ -15,9 +15,10 @@ import argparse
 import os
 import paddle
-from model import SoundClassifier
 from paddleaudio.datasets import ESC50
-from paddleaudio.models.panns import cnn14
+from paddlespeech.cls.models import cnn14
+from paddlespeech.cls.models import SoundClassifier
 # yapf: disable
 parser = argparse.ArgumentParser(__doc__)
......
@@ -16,30 +16,40 @@ import argparse
 import numpy as np
 import paddle
 import paddle.nn.functional as F
-from model import SoundClassifier
 from paddleaudio.backends import load as load_audio
 from paddleaudio.datasets import ESC50
+from paddleaudio.features import LogMelSpectrogram
 from paddleaudio.features import melspectrogram
-from paddleaudio.models.panns import cnn14
+from paddlespeech.cls.models import cnn14
+from paddlespeech.cls.models import SoundClassifier
 # yapf: disable
 parser = argparse.ArgumentParser(__doc__)
+parser.add_argument('--device', choices=['cpu', 'gpu'], default="gpu", help="Select which device to predict, defaults to gpu.")
 parser.add_argument("--wav", type=str, required=True, help="Audio file to infer.")
+parser.add_argument("--feat_backend", type=str, choices=['numpy', 'paddle'], default='numpy', help="Choose backend to extract features from audio files.")
 parser.add_argument("--top_k", type=int, default=1, help="Show top k predicted results")
 parser.add_argument("--checkpoint", type=str, required=True, help="Checkpoint of model.")
 args = parser.parse_args()
 # yapf: enable
-def extract_features(file: str, **kwargs):
+def extract_features(file: str, feat_backend: str='numpy',
+                     **kwargs) -> paddle.Tensor:
     waveform, sr = load_audio(file, sr=None)
-    feat = melspectrogram(waveform, sr, **kwargs).transpose()
+    if args.feat_backend == 'numpy':
+        feat = melspectrogram(waveform, sr, **kwargs).transpose()
+        feat = np.expand_dims(feat, 0)
+        feat = paddle.to_tensor(feat)
+    else:
+        feature_extractor = LogMelSpectrogram(sr=sr, **kwargs)
+        feat = feature_extractor(paddle.to_tensor(waveform).unsqueeze(0))
+        feat = paddle.transpose(feat, [0, 2, 1])
     return feat
 if __name__ == '__main__':
+    paddle.set_device(args.device)
     model = SoundClassifier(
         backbone=cnn14(pretrained=False, extract_embedding=True),
@@ -47,8 +57,7 @@ if __name__ == '__main__':
     model.set_state_dict(paddle.load(args.checkpoint))
     model.eval()
-    feat = np.expand_dims(extract_features(args.wav), 0)
-    feat = paddle.to_tensor(feat)
+    feat = extract_features(args.wav, args.feat_backend)
     logits = model(feat)
     probs = F.softmax(logits, axis=1).numpy()
......
@@ -15,16 +15,18 @@ import argparse
 import os
 import paddle
-from model import SoundClassifier
 from paddleaudio.datasets import ESC50
-from paddleaudio.models.panns import cnn14
+from paddleaudio.features import LogMelSpectrogram
 from paddleaudio.utils import logger
 from paddleaudio.utils import Timer
+from paddlespeech.cls.models import cnn14
+from paddlespeech.cls.models import SoundClassifier
 # yapf: disable
 parser = argparse.ArgumentParser(__doc__)
-parser.add_argument('--device', choices=['cpu', 'gpu'], default="gpu", help="Select which device to train model, defaults to gpu.")
 parser.add_argument("--epochs", type=int, default=50, help="Number of epoches for fine-tuning.")
+parser.add_argument("--feat_backend", type=str, choices=['numpy', 'paddle'], default='numpy', help="Choose backend to extract features from audio files.")
 parser.add_argument("--learning_rate", type=float, default=5e-5, help="Learning rate used to train with warmup.")
 parser.add_argument("--batch_size", type=int, default=16, help="Total examples' number in batch for training.")
 parser.add_argument("--num_workers", type=int, default=0, help="Number of workers in dataloader.")
@@ -35,7 +37,6 @@ args = parser.parse_args()
 # yapf: enable
 if __name__ == "__main__":
-    paddle.set_device(args.device)
     nranks = paddle.distributed.get_world_size()
     if paddle.distributed.get_world_size() > 1:
         paddle.distributed.init_parallel_env()
@@ -48,8 +49,13 @@ if __name__ == "__main__":
         learning_rate=args.learning_rate, parameters=model.parameters())
     criterion = paddle.nn.loss.CrossEntropyLoss()
-    train_ds = ESC50(mode='train', feat_type='melspectrogram')
-    dev_ds = ESC50(mode='dev', feat_type='melspectrogram')
+    if args.feat_backend == 'numpy':
+        train_ds = ESC50(mode='train', feat_type='melspectrogram')
+        dev_ds = ESC50(mode='dev', feat_type='melspectrogram')
+    else:
+        train_ds = ESC50(mode='train')
+        dev_ds = ESC50(mode='dev')
+        feature_extractor = LogMelSpectrogram(sr=16000)
     train_sampler = paddle.io.DistributedBatchSampler(
         train_ds, batch_size=args.batch_size, shuffle=True, drop_last=False)
@@ -71,7 +77,16 @@ if __name__ == "__main__":
         num_corrects = 0
         num_samples = 0
         for batch_idx, batch in enumerate(train_loader):
-            feats, labels = batch
+            if args.feat_backend == 'numpy':
+                feats, labels = batch
+            else:
+                waveforms, labels = batch
+                feats = feature_extractor(
+                    waveforms
+                )  # Need a padding when lengths of waveforms differ in a batch.
+                feats = paddle.transpose(feats,
+                                         [0, 2, 1])  # To [N, length, n_mels]
             logits = model(feats)
             loss = criterion(logits, labels)
@@ -126,7 +141,13 @@ if __name__ == "__main__":
             num_samples = 0
             with logger.processing('Evaluation on validation dataset'):
                 for batch_idx, batch in enumerate(dev_loader):
-                    feats, labels = batch
+                    if args.feat_backend == 'numpy':
+                        feats, labels = batch
+                    else:
+                        waveforms, labels = batch
+                        feats = feature_extractor(waveforms)
+                        feats = paddle.transpose(feats, [0, 2, 1])
                     logits = model(feats)
                     preds = paddle.argmax(logits, axis=1)
......
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .panns import *
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .classifier import *
from .panns import *
@@ -16,8 +16,8 @@ import os
 import paddle.nn as nn
 import paddle.nn.functional as F
-from ..utils.download import load_state_dict_from_url
-from ..utils.env import MODEL_HOME
+from paddleaudio.utils.download import load_state_dict_from_url
+from paddleaudio.utils.env import MODEL_HOME
 __all__ = ['CNN14', 'CNN10', 'CNN6', 'cnn14', 'cnn10', 'cnn6']
......
...@@ -40,7 +40,6 @@ def get_config(config_path): ...@@ -40,7 +40,6 @@ def get_config(config_path):
def load_trained_model(args): def load_trained_model(args):
args.nprocs = args.ngpu
confs = get_config(args.model_conf) confs = get_config(args.model_conf)
class_obj = dynamic_import_tester(args.model_name) class_obj = dynamic_import_tester(args.model_name)
exp = class_obj(confs, args) exp = class_obj(confs, args)
......
...@@ -87,7 +87,7 @@ class DeepSpeech2Tester_hub(): ...@@ -87,7 +87,7 @@ class DeepSpeech2Tester_hub():
def setup(self): def setup(self):
"""Setup the experiment. """Setup the experiment.
""" """
paddle.set_device('gpu' if self.args.nprocs > 0 else 'cpu') paddle.set_device('gpu' if self.args.ngpu > 0 else 'cpu')
self.setup_output_dir() self.setup_output_dir()
self.setup_checkpointer() self.setup_checkpointer()
...@@ -110,8 +110,8 @@ class DeepSpeech2Tester_hub(): ...@@ -110,8 +110,8 @@ class DeepSpeech2Tester_hub():
def setup_model(self): def setup_model(self):
config = self.config.clone() config = self.config.clone()
with UpdateConfig(config): with UpdateConfig(config):
config.model.feat_size = self.collate_fn_test.feature_size config.model.input_dim = self.collate_fn_test.feature_size
config.model.dict_size = self.collate_fn_test.vocab_size config.model.output_dim = self.collate_fn_test.vocab_size
if self.args.model_type == 'offline': if self.args.model_type == 'offline':
model = DeepSpeech2Model.from_config(config.model) model = DeepSpeech2Model.from_config(config.model)
......
...@@ -27,8 +27,8 @@ def main_sp(config, args): ...@@ -27,8 +27,8 @@ def main_sp(config, args):
def main(config, args): def main(config, args):
if args.nprocs > 0: if args.ngpu > 1:
dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs) dist.spawn(main_sp, args=(config, args), nprocs=args.ngpu)
else: else:
main_sp(config, args) main_sp(config, args)
......
...@@ -154,11 +154,11 @@ class DeepSpeech2Trainer(Trainer): ...@@ -154,11 +154,11 @@ class DeepSpeech2Trainer(Trainer):
config = self.config.clone() config = self.config.clone()
with UpdateConfig(config): with UpdateConfig(config):
if self.train: if self.train:
config.model.feat_size = self.train_loader.collate_fn.feature_size config.model.input_dim = self.train_loader.collate_fn.feature_size
config.model.dict_size = self.train_loader.collate_fn.vocab_size config.model.output_dim = self.train_loader.collate_fn.vocab_size
else: else:
config.model.feat_size = self.test_loader.collate_fn.feature_size config.model.input_dim = self.test_loader.collate_fn.feature_size
config.model.dict_size = self.test_loader.collate_fn.vocab_size config.model.output_dim = self.test_loader.collate_fn.vocab_size
if self.args.model_type == 'offline': if self.args.model_type == 'offline':
model = DeepSpeech2Model.from_config(config.model) model = DeepSpeech2Model.from_config(config.model)
......
...@@ -47,7 +47,7 @@ class U2Infer(): ...@@ -47,7 +47,7 @@ class U2Infer():
vocab_filepath=config.collator.vocab_filepath, vocab_filepath=config.collator.vocab_filepath,
spm_model_prefix=config.collator.spm_model_prefix) spm_model_prefix=config.collator.spm_model_prefix)
paddle.set_device('gpu' if self.args.nprocs > 0 else 'cpu') paddle.set_device('gpu' if self.args.ngpu > 0 else 'cpu')
# model # model
model_conf = config.model model_conf = config.model
......
...@@ -32,8 +32,8 @@ def main_sp(config, args): ...@@ -32,8 +32,8 @@ def main_sp(config, args):
def main(config, args): def main(config, args):
if args.nprocs > 0: if args.ngpu > 1:
dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs) dist.spawn(main_sp, args=(config, args), nprocs=args.ngpu)
else: else:
main_sp(config, args) main_sp(config, args)
......
...@@ -257,7 +257,7 @@ class U2Trainer(Trainer): ...@@ -257,7 +257,7 @@ class U2Trainer(Trainer):
maxlen_in=float('inf'), maxlen_in=float('inf'),
maxlen_out=float('inf'), maxlen_out=float('inf'),
minibatches=0, minibatches=0,
mini_batch_size=self.args.nprocs, mini_batch_size=self.args.ngpu,
batch_count='auto', batch_count='auto',
batch_bins=0, batch_bins=0,
batch_frames_in=0, batch_frames_in=0,
...@@ -277,7 +277,7 @@ class U2Trainer(Trainer): ...@@ -277,7 +277,7 @@ class U2Trainer(Trainer):
maxlen_in=float('inf'), maxlen_in=float('inf'),
maxlen_out=float('inf'), maxlen_out=float('inf'),
minibatches=0, minibatches=0,
mini_batch_size=self.args.nprocs, mini_batch_size=self.args.ngpu,
batch_count='auto', batch_count='auto',
batch_bins=0, batch_bins=0,
batch_frames_in=0, batch_frames_in=0,
......
...@@ -36,8 +36,8 @@ def main_sp(config, args): ...@@ -36,8 +36,8 @@ def main_sp(config, args):
def main(config, args): def main(config, args):
if args.nprocs > 0: if args.ngpu > 1:
dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs) dist.spawn(main_sp, args=(config, args), nprocs=args.ngpu)
else: else:
main_sp(config, args) main_sp(config, args)
......
...@@ -239,7 +239,7 @@ class U2Trainer(Trainer): ...@@ -239,7 +239,7 @@ class U2Trainer(Trainer):
maxlen_in=float('inf'), maxlen_in=float('inf'),
maxlen_out=float('inf'), maxlen_out=float('inf'),
minibatches=0, minibatches=0,
mini_batch_size=self.args.nprocs, mini_batch_size=self.args.ngpu,
batch_count='auto', batch_count='auto',
batch_bins=0, batch_bins=0,
batch_frames_in=0, batch_frames_in=0,
...@@ -258,7 +258,7 @@ class U2Trainer(Trainer): ...@@ -258,7 +258,7 @@ class U2Trainer(Trainer):
maxlen_in=float('inf'), maxlen_in=float('inf'),
maxlen_out=float('inf'), maxlen_out=float('inf'),
minibatches=0, minibatches=0,
mini_batch_size=self.args.nprocs, mini_batch_size=self.args.ngpu,
batch_count='auto', batch_count='auto',
batch_bins=0, batch_bins=0,
batch_frames_in=0, batch_frames_in=0,
......
...@@ -30,8 +30,8 @@ def main_sp(config, args): ...@@ -30,8 +30,8 @@ def main_sp(config, args):
def main(config, args): def main(config, args):
if args.nprocs > 0: if args.ngpu > 1:
dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs) dist.spawn(main_sp, args=(config, args), nprocs=args.ngpu)
else: else:
main_sp(config, args) main_sp(config, args)
......
...@@ -249,8 +249,8 @@ class DeepSpeech2Model(nn.Layer): ...@@ -249,8 +249,8 @@ class DeepSpeech2Model(nn.Layer):
The model built from config. The model built from config.
""" """
model = cls( model = cls(
feat_size=config.feat_size, feat_size=config.input_dim,
dict_size=config.dict_size, dict_size=config.output_dim,
num_conv_layers=config.num_conv_layers, num_conv_layers=config.num_conv_layers,
num_rnn_layers=config.num_rnn_layers, num_rnn_layers=config.num_rnn_layers,
rnn_size=config.rnn_layer_size, rnn_size=config.rnn_layer_size,
......
...@@ -381,8 +381,8 @@ class DeepSpeech2ModelOnline(nn.Layer): ...@@ -381,8 +381,8 @@ class DeepSpeech2ModelOnline(nn.Layer):
The model built from config. The model built from config.
""" """
model = cls( model = cls(
feat_size=config.feat_size, feat_size=config.input_dim,
dict_size=config.dict_size, dict_size=config.output_dim,
num_conv_layers=config.num_conv_layers, num_conv_layers=config.num_conv_layers,
num_rnn_layers=config.num_rnn_layers, num_rnn_layers=config.num_rnn_layers,
rnn_size=config.rnn_layer_size, rnn_size=config.rnn_layer_size,
......
...@@ -51,7 +51,7 @@ def default_argument_parser(parser=None): ...@@ -51,7 +51,7 @@ def default_argument_parser(parser=None):
The ``--checkpoint_path`` specifies the checkpoint to load from. The ``--checkpoint_path`` specifies the checkpoint to load from.
The ``--nprocs`` specifies how to run the training. The ``--ngpu`` specifies how to run the training.
See Also See Also
...@@ -78,7 +78,7 @@ def default_argument_parser(parser=None): ...@@ -78,7 +78,7 @@ def default_argument_parser(parser=None):
help="seed to use for paddle, np and random. None or 0 for random, else set seed." help="seed to use for paddle, np and random. None or 0 for random, else set seed."
) )
train_group.add_argument( train_group.add_argument(
"--nprocs", "--ngpu",
type=int, type=int,
default=1, default=1,
help="number of parallel processes. 0 for cpu.") help="number of parallel processes. 0 for cpu.")
......
...@@ -88,8 +88,8 @@ class Trainer(): ...@@ -88,8 +88,8 @@ class Trainer():
>>> config.merge_from_list(args.opts) >>> config.merge_from_list(args.opts)
>>> config.freeze() >>> config.freeze()
>>> >>>
>>> if args.nprocs > 0: >>> if args.ngpu > 1:
>>> dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs) >>> dist.spawn(main_sp, args=(config, args), nprocs=args.ngpu)
>>> else: >>> else:
>>> main_sp(config, args) >>> main_sp(config, args)
""" """
...@@ -112,7 +112,7 @@ class Trainer(): ...@@ -112,7 +112,7 @@ class Trainer():
logger.info(f"Rank: {self.rank}/{self.world_size}") logger.info(f"Rank: {self.rank}/{self.world_size}")
# set device # set device
paddle.set_device('gpu' if self.args.nprocs > 0 else 'cpu') paddle.set_device('gpu' if self.args.ngpu > 0 else 'cpu')
if self.parallel: if self.parallel:
self.init_parallel() self.init_parallel()
...@@ -162,7 +162,7 @@ class Trainer(): ...@@ -162,7 +162,7 @@ class Trainer():
"""A flag indicating whether the experiment should run with """A flag indicating whether the experiment should run with
multiprocessing. multiprocessing.
""" """
return self.args.nprocs > 1 return self.args.ngpu > 1
def init_parallel(self): def init_parallel(self):
"""Init environment for multiprocess training. """Init environment for multiprocess training.
......
...@@ -82,7 +82,9 @@ def main(): ...@@ -82,7 +82,9 @@ def main():
with open(args.text, 'rt') as f: with open(args.text, 'rt') as f:
for line in f: for line in f:
utt_id, sentence = line.strip().split() items = line.strip().split()
utt_id = items[0]
sentence = "".join(items[1:])
sentences.append((utt_id, sentence)) sentences.append((utt_id, sentence))
for utt_id, sentence in sentences: for utt_id, sentence in sentences:
......
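The split change above guards against sentences that themselves contain spaces, which made the old two-value unpacking raise. A quick illustration with an invented line (note that `"".join` also squashes any spaces inside the sentence, which is harmless for the pure-Chinese inputs these scripts target):
line = "001 你好 世界"
items = line.strip().split()
utt_id = items[0]              # '001'
sentence = "".join(items[1:])  # '你好世界'
# the old form raised ValueError: too many values to unpack
# utt_id, sentence = line.strip().split()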
...@@ -37,7 +37,9 @@ def evaluate(args, fastspeech2_config, pwg_config): ...@@ -37,7 +37,9 @@ def evaluate(args, fastspeech2_config, pwg_config):
sentences = [] sentences = []
with open(args.text, 'rt') as f: with open(args.text, 'rt') as f:
for line in f: for line in f:
utt_id, sentence = line.strip().split() items = line.strip().split()
utt_id = items[0]
sentence = "".join(items[1:])
sentences.append((utt_id, sentence)) sentences.append((utt_id, sentence))
with open(args.phones_dict, "r") as f: with open(args.phones_dict, "r") as f:
......
...@@ -40,7 +40,9 @@ def evaluate(args, fastspeech2_config, pwg_config): ...@@ -40,7 +40,9 @@ def evaluate(args, fastspeech2_config, pwg_config):
sentences = [] sentences = []
with open(args.text, 'rt') as f: with open(args.text, 'rt') as f:
for line in f: for line in f:
utt_id, sentence = line.strip().split() items = line.strip().split()
utt_id = items[0]
sentence = "".join(items[1:])
sentences.append((utt_id, sentence)) sentences.append((utt_id, sentence))
with open(args.phones_dict, "r") as f: with open(args.phones_dict, "r") as f:
......
...@@ -40,7 +40,9 @@ def evaluate(args, fastspeech2_config, melgan_config): ...@@ -40,7 +40,9 @@ def evaluate(args, fastspeech2_config, melgan_config):
sentences = [] sentences = []
with open(args.text, 'rt') as f: with open(args.text, 'rt') as f:
for line in f: for line in f:
utt_id, sentence = line.strip().split() items = line.strip().split()
utt_id = items[0]
sentence = "".join(items[1:])
sentences.append((utt_id, sentence)) sentences.append((utt_id, sentence))
with open(args.phones_dict, "r") as f: with open(args.phones_dict, "r") as f:
......
...@@ -36,10 +36,10 @@ from paddlespeech.t2s.models.melgan import MBMelGANEvaluator ...@@ -36,10 +36,10 @@ from paddlespeech.t2s.models.melgan import MBMelGANEvaluator
from paddlespeech.t2s.models.melgan import MBMelGANUpdater from paddlespeech.t2s.models.melgan import MBMelGANUpdater
from paddlespeech.t2s.models.melgan import MelGANGenerator from paddlespeech.t2s.models.melgan import MelGANGenerator
from paddlespeech.t2s.models.melgan import MelGANMultiScaleDiscriminator from paddlespeech.t2s.models.melgan import MelGANMultiScaleDiscriminator
from paddlespeech.t2s.modules.adversarial_loss import DiscriminatorAdversarialLoss from paddlespeech.t2s.modules.losses import DiscriminatorAdversarialLoss
from paddlespeech.t2s.modules.adversarial_loss import GeneratorAdversarialLoss from paddlespeech.t2s.modules.losses import GeneratorAdversarialLoss
from paddlespeech.t2s.modules.losses import MultiResolutionSTFTLoss
from paddlespeech.t2s.modules.pqmf import PQMF from paddlespeech.t2s.modules.pqmf import PQMF
from paddlespeech.t2s.modules.stft_loss import MultiResolutionSTFTLoss
from paddlespeech.t2s.training.extensions.snapshot import Snapshot from paddlespeech.t2s.training.extensions.snapshot import Snapshot
from paddlespeech.t2s.training.extensions.visualizer import VisualDL from paddlespeech.t2s.training.extensions.visualizer import VisualDL
from paddlespeech.t2s.training.seeding import seed_everything from paddlespeech.t2s.training.seeding import seed_everything
......
...@@ -36,7 +36,7 @@ from paddlespeech.t2s.models.parallel_wavegan import PWGDiscriminator ...@@ -36,7 +36,7 @@ from paddlespeech.t2s.models.parallel_wavegan import PWGDiscriminator
from paddlespeech.t2s.models.parallel_wavegan import PWGEvaluator from paddlespeech.t2s.models.parallel_wavegan import PWGEvaluator
from paddlespeech.t2s.models.parallel_wavegan import PWGGenerator from paddlespeech.t2s.models.parallel_wavegan import PWGGenerator
from paddlespeech.t2s.models.parallel_wavegan import PWGUpdater from paddlespeech.t2s.models.parallel_wavegan import PWGUpdater
from paddlespeech.t2s.modules.stft_loss import MultiResolutionSTFTLoss from paddlespeech.t2s.modules.losses import MultiResolutionSTFTLoss
from paddlespeech.t2s.training.extensions.snapshot import Snapshot from paddlespeech.t2s.training.extensions.snapshot import Snapshot
from paddlespeech.t2s.training.extensions.visualizer import VisualDL from paddlespeech.t2s.training.extensions.visualizer import VisualDL
from paddlespeech.t2s.training.seeding import seed_everything from paddlespeech.t2s.training.seeding import seed_everything
......
...@@ -87,7 +87,9 @@ def main(): ...@@ -87,7 +87,9 @@ def main():
with open(args.text, 'rt') as f: with open(args.text, 'rt') as f:
for line in f: for line in f:
utt_id, sentence = line.strip().split() items = line.strip().split()
utt_id = items[0]
sentence = "".join(items[1:])
sentences.append((utt_id, sentence)) sentences.append((utt_id, sentence))
for utt_id, sentence in sentences: for utt_id, sentence in sentences:
......
...@@ -40,7 +40,9 @@ def evaluate(args, speedyspeech_config, pwg_config): ...@@ -40,7 +40,9 @@ def evaluate(args, speedyspeech_config, pwg_config):
sentences = [] sentences = []
with open(args.text, 'rt') as f: with open(args.text, 'rt') as f:
for line in f: for line in f:
utt_id, sentence = line.strip().split() items = line.strip().split()
utt_id = items[0]
sentence = "".join(items[1:])
sentences.append((utt_id, sentence)) sentences.append((utt_id, sentence))
with open(args.phones_dict, "r") as f: with open(args.phones_dict, "r") as f:
......
...@@ -241,7 +241,7 @@ def main_sp(config, args): ...@@ -241,7 +241,7 @@ def main_sp(config, args):
def main(config, args): def main(config, args):
if args.ngpu: if args.ngpu > 1:
dist.spawn(main_sp, args=(config, args), nprocs=args.ngpu) dist.spawn(main_sp, args=(config, args), nprocs=args.ngpu)
else: else:
main_sp(config, args) main_sp(config, args)
......
...@@ -129,6 +129,8 @@ class Frontend(): ...@@ -129,6 +129,8 @@ class Frontend():
# we discriminate i, ii and iii # we discriminate i, ii and iii
if c and c not in self.punc: if c and c not in self.punc:
phones.append(c) phones.append(c)
if c and c in self.punc:
phones.append('sp')
if v and v not in self.punc: if v and v not in self.punc:
phones.append(v) phones.append(v)
# add sp between sentence (replace the last punc with sp) # add sp between sentence (replace the last punc with sp)
...@@ -149,9 +151,14 @@ class Frontend(): ...@@ -149,9 +151,14 @@ class Frontend():
if word not in self.must_erhua and (word in self.not_erhua or if word not in self.must_erhua and (word in self.not_erhua or
pos in {"a", "j", "nr"}): pos in {"a", "j", "nr"}):
return initials, finals return initials, finals
# "……" 等情况直接返回
if len(finals) != len(word):
return initials, finals
assert len(finals) == len(word)
new_initials = [] new_initials = []
new_finals = [] new_finals = []
assert len(finals) == len(word)
for i, phn in enumerate(finals): for i, phn in enumerate(finals):
if i == len(finals) - 1 and word[i] == "儿" and phn in { if i == len(finals) - 1 and word[i] == "儿" and phn in {
"er2", "er5" "er2", "er5"
......
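The new branch maps punctuation that lands in the initial slot to a short-pause phone instead of dropping it silently. A toy rehearsal of the loop's behavior (assuming `,` is in `self.punc`):
punc = ",。!?"
phones = []
for c, v in [("n", "i3"), (",", ""), ("h", "ao3")]:
    if c and c not in punc:
        phones.append(c)
    if c and c in punc:
        phones.append('sp')   # punctuation becomes a short pause
    if v and v not in punc:
        phones.append(v)
print(phones)  # ['n', 'i3', 'sp', 'h', 'ao3']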
...@@ -32,6 +32,15 @@ RE_TIME = re.compile(r'([0-1]?[0-9]|2[0-3])' ...@@ -32,6 +32,15 @@ RE_TIME = re.compile(r'([0-1]?[0-9]|2[0-3])'
r':([0-5][0-9])' r':([0-5][0-9])'
r'(:([0-5][0-9]))?') r'(:([0-5][0-9]))?')
# time range, e.g. 8:30-12:30
RE_TIME_RANGE = re.compile(r'([0-1]?[0-9]|2[0-3])'
r':([0-5][0-9])'
r'(:([0-5][0-9]))?'
r'(~|-)'
r'([0-1]?[0-9]|2[0-3])'
r':([0-5][0-9])'
r'(:([0-5][0-9]))?')
def replace_time(match) -> str: def replace_time(match) -> str:
""" """
...@@ -42,15 +51,32 @@ def replace_time(match) -> str: ...@@ -42,15 +51,32 @@ def replace_time(match) -> str:
---------- ----------
str str
""" """
is_range = len(match.groups()) > 5
hour = match.group(1) hour = match.group(1)
minute = match.group(2) minute = match.group(2)
second = match.group(4) second = match.group(4)
if is_range:
hour_2 = match.group(6)
minute_2 = match.group(7)
second_2 = match.group(9)
result = f"{num2str(hour)}点" result = f"{num2str(hour)}点"
if minute.lstrip('0'): if minute.lstrip('0'):
result += f"{_time_num2str(minute)}分" result += f"{_time_num2str(minute)}分"
if second and second.lstrip('0'): if second and second.lstrip('0'):
result += f"{_time_num2str(second)}秒" result += f"{_time_num2str(second)}秒"
if is_range:
result += "至"
result += f"{num2str(hour_2)}点"
if minute_2.lstrip('0'):
result += f"{_time_num2str(minute_2)}分"
if second_2 and second_2.lstrip('0'):
result += f"{_time_num2str(second_2)}秒"
return result return result
......
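RE_TIME_RANGE chains two copies of the RE_TIME pattern with a `~` or `-` separator, so a range match carries nine groups while a plain time carries four; that is what the `len(match.groups()) > 5` test keys on, and it is also why the normalizer below substitutes the range pattern before the plain one (otherwise RE_TIME would consume the first endpoint). A quick check of the grouping (pattern copied from above):
import re

RE_TIME_RANGE = re.compile(r'([0-1]?[0-9]|2[0-3])'
                           r':([0-5][0-9])'
                           r'(:([0-5][0-9]))?'
                           r'(~|-)'
                           r'([0-1]?[0-9]|2[0-3])'
                           r':([0-5][0-9])'
                           r'(:([0-5][0-9]))?')

m = RE_TIME_RANGE.search('8:30-12:30')
print(m.groups())
# ('8', '30', None, None, '-', '12', '30', None, None)
# groups 6, 7 and 9 hold the second endpoint, matching the
# hour_2 / minute_2 / second_2 reads in replace_time above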
...@@ -26,16 +26,19 @@ RE_MOBILE_PHONE = re.compile( ...@@ -26,16 +26,19 @@ RE_MOBILE_PHONE = re.compile(
RE_TELEPHONE = re.compile( RE_TELEPHONE = re.compile(
r"(?<!\d)((0(10|2[1-3]|[3-9]\d{2})-?)?[1-9]\d{7,8})(?!\d)") r"(?<!\d)((0(10|2[1-3]|[3-9]\d{2})-?)?[1-9]\d{7,8})(?!\d)")
# nationwide unified service numbers starting with 400
RE_NATIONAL_UNIFORM_NUMBER = re.compile(r"(400)(-)?\d{3}(-)?\d{4}")
def phone2str(phone_string: str, mobile=True) -> str: def phone2str(phone_string: str, mobile=True) -> str:
if mobile: if mobile:
sp_parts = phone_string.strip('+').split() sp_parts = phone_string.strip('+').split()
result = ''.join( result = ''.join(
[verbalize_digit(part, alt_one=True) for part in sp_parts]) [verbalize_digit(part, alt_one=True) for part in sp_parts])
return result return result
else: else:
sil_parts = phone_string.split('-') sil_parts = phone_string.split('-')
result = ''.join( result = ''.join(
[verbalize_digit(part, alt_one=True) for part in sil_parts]) [verbalize_digit(part, alt_one=True) for part in sil_parts])
return result return result
......
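The 400 pattern feeds the same `replace_phone` path as landlines: the match is split on `-` and each digit run is verbalized digit by digit. A hedged check of the match itself (pattern copied from above; the exact verbalized reading depends on `verbalize_digit`):
import re

RE_NATIONAL_UNIFORM_NUMBER = re.compile(r"(400)(-)?\d{3}(-)?\d{4}")

m = RE_NATIONAL_UNIFORM_NUMBER.search('订票请拨400-666-8800')
print(m.group(0))  # '400-666-8800'
# replace_phone then splits on '-' into ['400', '666', '8800'] and reads
# each group digit by digit (with 幺 for 1 when alt_one=True)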
...@@ -18,6 +18,7 @@ from .char_convert import tranditional_to_simplified ...@@ -18,6 +18,7 @@ from .char_convert import tranditional_to_simplified
from .chronology import RE_DATE from .chronology import RE_DATE
from .chronology import RE_DATE2 from .chronology import RE_DATE2
from .chronology import RE_TIME from .chronology import RE_TIME
from .chronology import RE_TIME_RANGE
from .chronology import replace_date from .chronology import replace_date
from .chronology import replace_date2 from .chronology import replace_date2
from .chronology import replace_time from .chronology import replace_time
...@@ -40,6 +41,7 @@ from .num import replace_percentage ...@@ -40,6 +41,7 @@ from .num import replace_percentage
from .num import replace_positive_quantifier from .num import replace_positive_quantifier
from .num import replace_range from .num import replace_range
from .phonecode import RE_MOBILE_PHONE from .phonecode import RE_MOBILE_PHONE
from .phonecode import RE_NATIONAL_UNIFORM_NUMBER
from .phonecode import RE_TELEPHONE from .phonecode import RE_TELEPHONE
from .phonecode import replace_mobile from .phonecode import replace_mobile
from .phonecode import replace_phone from .phonecode import replace_phone
...@@ -62,6 +64,8 @@ class TextNormalizer(): ...@@ -62,6 +64,8 @@ class TextNormalizer():
List[str] List[str]
Sentences. Sentences.
""" """
# Only for pure Chinese here
text = text.replace(" ", "")
text = self.SENTENCE_SPLITOR.sub(r'\1\n', text) text = self.SENTENCE_SPLITOR.sub(r'\1\n', text)
text = text.strip() text = text.strip()
sentences = [sentence.strip() for sentence in re.split(r'\n+', text)] sentences = [sentence.strip() for sentence in re.split(r'\n+', text)]
...@@ -76,12 +80,19 @@ class TextNormalizer(): ...@@ -76,12 +80,19 @@ class TextNormalizer():
# number related NSW verbalization # number related NSW verbalization
sentence = RE_DATE.sub(replace_date, sentence) sentence = RE_DATE.sub(replace_date, sentence)
sentence = RE_DATE2.sub(replace_date2, sentence) sentence = RE_DATE2.sub(replace_date2, sentence)
# range first
sentence = RE_TIME_RANGE.sub(replace_time, sentence)
sentence = RE_TIME.sub(replace_time, sentence) sentence = RE_TIME.sub(replace_time, sentence)
sentence = RE_TEMPERATURE.sub(replace_temperature, sentence) sentence = RE_TEMPERATURE.sub(replace_temperature, sentence)
sentence = RE_FRAC.sub(replace_frac, sentence) sentence = RE_FRAC.sub(replace_frac, sentence)
sentence = RE_PERCENTAGE.sub(replace_percentage, sentence) sentence = RE_PERCENTAGE.sub(replace_percentage, sentence)
sentence = RE_MOBILE_PHONE.sub(replace_mobile, sentence) sentence = RE_MOBILE_PHONE.sub(replace_mobile, sentence)
sentence = RE_TELEPHONE.sub(replace_phone, sentence) sentence = RE_TELEPHONE.sub(replace_phone, sentence)
sentence = RE_NATIONAL_UNIFORM_NUMBER.sub(replace_phone, sentence)
sentence = RE_RANGE.sub(replace_range, sentence) sentence = RE_RANGE.sub(replace_range, sentence)
sentence = RE_INTEGER.sub(replace_negative_num, sentence) sentence = RE_INTEGER.sub(replace_negative_num, sentence)
sentence = RE_DECIMAL_NUM.sub(replace_number, sentence) sentence = RE_DECIMAL_NUM.sub(replace_number, sentence)
...@@ -94,5 +105,6 @@ class TextNormalizer(): ...@@ -94,5 +105,6 @@ class TextNormalizer():
def normalize(self, text: str) -> List[str]: def normalize(self, text: str) -> List[str]:
sentences = self._split(text) sentences = self._split(text)
sentences = [self.normalize_sentence(sent) for sent in sentences] sentences = [self.normalize_sentence(sent) for sent in sentences]
return sentences return sentences
...@@ -12,6 +12,8 @@ ...@@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from .fastspeech2 import * from .fastspeech2 import *
from .melgan import *
from .parallel_wavegan import *
from .tacotron2 import * from .tacotron2 import *
from .transformer_tts import * from .transformer_tts import *
from .waveflow import * from .waveflow import *
...@@ -24,17 +24,16 @@ import paddle.nn.functional as F ...@@ -24,17 +24,16 @@ import paddle.nn.functional as F
from paddle import nn from paddle import nn
from typeguard import check_argument_types from typeguard import check_argument_types
from paddlespeech.t2s.modules.fastspeech2_predictor.duration_predictor import DurationPredictor
from paddlespeech.t2s.modules.fastspeech2_predictor.duration_predictor import DurationPredictorLoss
from paddlespeech.t2s.modules.fastspeech2_predictor.length_regulator import LengthRegulator
from paddlespeech.t2s.modules.fastspeech2_predictor.variance_predictor import VariancePredictor
from paddlespeech.t2s.modules.fastspeech2_transformer.embedding import PositionalEncoding
from paddlespeech.t2s.modules.fastspeech2_transformer.embedding import ScaledPositionalEncoding
from paddlespeech.t2s.modules.fastspeech2_transformer.encoder import Encoder as TransformerEncoder
from paddlespeech.t2s.modules.nets_utils import initialize from paddlespeech.t2s.modules.nets_utils import initialize
from paddlespeech.t2s.modules.nets_utils import make_non_pad_mask from paddlespeech.t2s.modules.nets_utils import make_non_pad_mask
from paddlespeech.t2s.modules.nets_utils import make_pad_mask from paddlespeech.t2s.modules.nets_utils import make_pad_mask
from paddlespeech.t2s.modules.predictor.duration_predictor import DurationPredictor
from paddlespeech.t2s.modules.predictor.duration_predictor import DurationPredictorLoss
from paddlespeech.t2s.modules.predictor.length_regulator import LengthRegulator
from paddlespeech.t2s.modules.predictor.variance_predictor import VariancePredictor
from paddlespeech.t2s.modules.tacotron2.decoder import Postnet from paddlespeech.t2s.modules.tacotron2.decoder import Postnet
from paddlespeech.t2s.modules.transformer.encoder import ConformerEncoder
from paddlespeech.t2s.modules.transformer.encoder import TransformerEncoder
class FastSpeech2(nn.Layer): class FastSpeech2(nn.Layer):
...@@ -66,6 +65,7 @@ class FastSpeech2(nn.Layer): ...@@ -66,6 +65,7 @@ class FastSpeech2(nn.Layer):
postnet_layers: int=5, postnet_layers: int=5,
postnet_chans: int=512, postnet_chans: int=512,
postnet_filts: int=5, postnet_filts: int=5,
postnet_dropout_rate: float=0.5,
positionwise_layer_type: str="conv1d", positionwise_layer_type: str="conv1d",
positionwise_conv_kernel_size: int=1, positionwise_conv_kernel_size: int=1,
use_scaled_pos_enc: bool=True, use_scaled_pos_enc: bool=True,
...@@ -77,10 +77,27 @@ class FastSpeech2(nn.Layer): ...@@ -77,10 +77,27 @@ class FastSpeech2(nn.Layer):
reduction_factor: int=1, reduction_factor: int=1,
encoder_type: str="transformer", encoder_type: str="transformer",
decoder_type: str="transformer", decoder_type: str="transformer",
# for transformer
transformer_enc_dropout_rate: float=0.1,
transformer_enc_positional_dropout_rate: float=0.1,
transformer_enc_attn_dropout_rate: float=0.1,
transformer_dec_dropout_rate: float=0.1,
transformer_dec_positional_dropout_rate: float=0.1,
transformer_dec_attn_dropout_rate: float=0.1,
# for conformer
conformer_pos_enc_layer_type: str="rel_pos",
conformer_self_attn_layer_type: str="rel_selfattn",
conformer_activation_type: str="swish",
use_macaron_style_in_conformer: bool=True,
use_cnn_in_conformer: bool=True,
zero_triu: bool=False,
conformer_enc_kernel_size: int=7,
conformer_dec_kernel_size: int=31,
# duration predictor # duration predictor
duration_predictor_layers: int=2, duration_predictor_layers: int=2,
duration_predictor_chans: int=384, duration_predictor_chans: int=384,
duration_predictor_kernel_size: int=3, duration_predictor_kernel_size: int=3,
duration_predictor_dropout_rate: float=0.1,
# energy predictor # energy predictor
energy_predictor_layers: int=2, energy_predictor_layers: int=2,
energy_predictor_chans: int=384, energy_predictor_chans: int=384,
...@@ -101,25 +118,147 @@ class FastSpeech2(nn.Layer): ...@@ -101,25 +118,147 @@ class FastSpeech2(nn.Layer):
spk_num: int=None, spk_num: int=None,
spk_embed_dim: int=None, spk_embed_dim: int=None,
spk_embed_integration_type: str="add", spk_embed_integration_type: str="add",
# tone emb # tone emb
num_tones: int=None, tone_num: int=None,
tone_embed_dim: int=None, tone_embed_dim: int=None,
tone_embed_integration_type: str="add", tone_embed_integration_type: str="add",
# training related # training related
transformer_enc_dropout_rate: float=0.1,
transformer_enc_positional_dropout_rate: float=0.1,
transformer_enc_attn_dropout_rate: float=0.1,
transformer_dec_dropout_rate: float=0.1,
transformer_dec_positional_dropout_rate: float=0.1,
transformer_dec_attn_dropout_rate: float=0.1,
duration_predictor_dropout_rate: float=0.1,
postnet_dropout_rate: float=0.5,
init_type: str="xavier_uniform", init_type: str="xavier_uniform",
init_enc_alpha: float=1.0, init_enc_alpha: float=1.0,
init_dec_alpha: float=1.0, init_dec_alpha: float=1.0, ):
use_masking: bool=False, """Initialize FastSpeech2 module.
use_weighted_masking: bool=False, ): Parameters
"""Initialize FastSpeech2 module.""" ----------
idim : int
Dimension of the inputs.
odim : int
Dimension of the outputs.
adim : int
Attention dimension.
aheads : int
Number of attention heads.
elayers : int
Number of encoder layers.
eunits : int
Number of encoder hidden units.
dlayers : int
Number of decoder layers.
dunits : int
Number of decoder hidden units.
postnet_layers : int
Number of postnet layers.
postnet_chans : int
Number of postnet channels.
postnet_filts : int
Kernel size of postnet.
postnet_dropout_rate : float
Dropout rate in postnet.
use_scaled_pos_enc : bool
Whether to use trainable scaled pos encoding.
use_batch_norm : bool
Whether to use batch normalization in encoder prenet.
encoder_normalize_before : bool
Whether to apply layernorm layer before encoder block.
decoder_normalize_before : bool
Whether to apply layernorm layer before
decoder block.
encoder_concat_after : bool
Whether to concatenate attention layer's input and output in encoder.
decoder_concat_after : bool
Whether to concatenate attention layer's input and output in decoder.
reduction_factor : int
Reduction factor.
encoder_type : str
Encoder type ("transformer" or "conformer").
decoder_type : str
Decoder type ("transformer" or "conformer").
transformer_enc_dropout_rate : float
Dropout rate in encoder except attention and positional encoding.
transformer_enc_positional_dropout_rate : float
Dropout rate after encoder positional encoding.
transformer_enc_attn_dropout_rate : float
Dropout rate in encoder self-attention module.
transformer_dec_dropout_rate : float
Dropout rate in decoder except attention & positional encoding.
transformer_dec_positional_dropout_rate : float
Dropout rate after decoder positional encoding.
transformer_dec_attn_dropout_rate : float
Dropout rate in decoder self-attention module.
conformer_pos_enc_layer_type : str
Pos encoding layer type in conformer.
conformer_self_attn_layer_type : str
Self-attention layer type in conformer.
conformer_activation_type : str
Activation function type in conformer.
use_macaron_style_in_conformer : bool
Whether to use macaron style FFN.
use_cnn_in_conformer : bool
Whether to use CNN in conformer.
zero_triu : bool
Whether to use zero triu in relative self-attention module.
conformer_enc_kernel_size : int
Kernel size of encoder conformer.
conformer_dec_kernel_size : int
Kernel size of decoder conformer.
duration_predictor_layers : int
Number of duration predictor layers.
duration_predictor_chans : int
Number of duration predictor channels.
duration_predictor_kernel_size : int
Kernel size of duration predictor.
duration_predictor_dropout_rate : float
Dropout rate in duration predictor.
pitch_predictor_layers : int
Number of pitch predictor layers.
pitch_predictor_chans : int
Number of pitch predictor channels.
pitch_predictor_kernel_size : int
Kernel size of pitch predictor.
pitch_predictor_dropout_rate : float
Dropout rate in pitch predictor.
pitch_embed_kernel_size : int
Kernel size of pitch embedding.
pitch_embed_dropout_rate : float
Dropout rate for pitch embedding.
stop_gradient_from_pitch_predictor : bool
Whether to stop gradient from pitch predictor to encoder.
energy_predictor_layers : int
Number of energy predictor layers.
energy_predictor_chans : int
Number of energy predictor channels.
energy_predictor_kernel_size : int
Kernel size of energy predictor.
energy_predictor_dropout_rate : float
Dropout rate in energy predictor.
energy_embed_kernel_size : int
Kernel size of energy embedding.
energy_embed_dropout_rate : float
Dropout rate for energy embedding.
stop_gradient_from_energy_predictor : bool
Whether to stop gradient from energy predictor to encoder.
spk_num : Optional[int]
Number of speakers. If not None, assume that the spk_embed_dim is not None,
spk_ids will be provided as the input and use spk_embedding_table.
spk_embed_dim : Optional[int]
Speaker embedding dimension. If not None,
assume that spk_emb will be provided as the input or spk_num is not None.
spk_embed_integration_type : str
How to integrate speaker embedding.
tone_num : Optional[int]
Number of tones. If not None, assume that the
tone_ids will be provided as the input and use tone_embedding_table.
tone_embed_dim : Optional[int]
Tone embedding dimension. If not None, assume that tone_num is not None.
tone_embed_integration_type : str
How to integrate tone embedding.
init_type : str
How to initialize transformer parameters.
init_enc_alpha : float
Initial value of alpha in scaled pos encoding of the encoder.
init_dec_alpha : float
Initial value of alpha in scaled pos encoding of the decoder.
"""
assert check_argument_types() assert check_argument_types()
super().__init__() super().__init__()
...@@ -156,13 +295,12 @@ class FastSpeech2(nn.Layer): ...@@ -156,13 +295,12 @@ class FastSpeech2(nn.Layer):
if self.tone_embed_dim is not None: if self.tone_embed_dim is not None:
self.tone_embedding_table = nn.Embedding( self.tone_embedding_table = nn.Embedding(
num_embeddings=num_tones, num_embeddings=tone_num,
embedding_dim=self.tone_embed_dim, embedding_dim=self.tone_embed_dim,
padding_idx=self.padding_idx) padding_idx=self.padding_idx)
# get positional encoding class # get positional encoding layer type
pos_enc_class = (ScaledPositionalEncoding transformer_pos_enc_layer_type = "scaled_abs_pos" if self.use_scaled_pos_enc else "abs_pos"
if self.use_scaled_pos_enc else PositionalEncoding)
# define encoder # define encoder
encoder_input_layer = nn.Embedding( encoder_input_layer = nn.Embedding(
...@@ -171,6 +309,7 @@ class FastSpeech2(nn.Layer): ...@@ -171,6 +309,7 @@ class FastSpeech2(nn.Layer):
padding_idx=self.padding_idx) padding_idx=self.padding_idx)
if encoder_type == "transformer": if encoder_type == "transformer":
print("encoder_type is transformer")
self.encoder = TransformerEncoder( self.encoder = TransformerEncoder(
idim=idim, idim=idim,
attention_dim=adim, attention_dim=adim,
...@@ -181,11 +320,34 @@ class FastSpeech2(nn.Layer): ...@@ -181,11 +320,34 @@ class FastSpeech2(nn.Layer):
dropout_rate=transformer_enc_dropout_rate, dropout_rate=transformer_enc_dropout_rate,
positional_dropout_rate=transformer_enc_positional_dropout_rate, positional_dropout_rate=transformer_enc_positional_dropout_rate,
attention_dropout_rate=transformer_enc_attn_dropout_rate, attention_dropout_rate=transformer_enc_attn_dropout_rate,
pos_enc_class=pos_enc_class, pos_enc_layer_type=transformer_pos_enc_layer_type,
normalize_before=encoder_normalize_before, normalize_before=encoder_normalize_before,
concat_after=encoder_concat_after, concat_after=encoder_concat_after,
positionwise_layer_type=positionwise_layer_type, positionwise_layer_type=positionwise_layer_type,
positionwise_conv_kernel_size=positionwise_conv_kernel_size, ) positionwise_conv_kernel_size=positionwise_conv_kernel_size, )
elif encoder_type == "conformer":
print("encoder_type is conformer")
self.encoder = ConformerEncoder(
idim=idim,
attention_dim=adim,
attention_heads=aheads,
linear_units=eunits,
num_blocks=elayers,
input_layer=encoder_input_layer,
dropout_rate=transformer_enc_dropout_rate,
positional_dropout_rate=transformer_enc_positional_dropout_rate,
attention_dropout_rate=transformer_enc_attn_dropout_rate,
normalize_before=encoder_normalize_before,
concat_after=encoder_concat_after,
positionwise_layer_type=positionwise_layer_type,
positionwise_conv_kernel_size=positionwise_conv_kernel_size,
macaron_style=use_macaron_style_in_conformer,
pos_enc_layer_type=conformer_pos_enc_layer_type,
selfattention_layer_type=conformer_self_attn_layer_type,
activation_type=conformer_activation_type,
use_cnn_module=use_cnn_in_conformer,
cnn_module_kernel=conformer_enc_kernel_size,
zero_triu=zero_triu, )
else: else:
raise ValueError(f"{encoder_type} is not supported.") raise ValueError(f"{encoder_type} is not supported.")
...@@ -251,6 +413,7 @@ class FastSpeech2(nn.Layer): ...@@ -251,6 +413,7 @@ class FastSpeech2(nn.Layer):
# NOTE: we use encoder as decoder # NOTE: we use encoder as decoder
# because fastspeech's decoder is the same as encoder # because fastspeech's decoder is the same as encoder
if decoder_type == "transformer": if decoder_type == "transformer":
print("decoder_type is transformer")
self.decoder = TransformerEncoder( self.decoder = TransformerEncoder(
idim=0, idim=0,
attention_dim=adim, attention_dim=adim,
...@@ -262,11 +425,33 @@ class FastSpeech2(nn.Layer): ...@@ -262,11 +425,33 @@ class FastSpeech2(nn.Layer):
dropout_rate=transformer_dec_dropout_rate, dropout_rate=transformer_dec_dropout_rate,
positional_dropout_rate=transformer_dec_positional_dropout_rate, positional_dropout_rate=transformer_dec_positional_dropout_rate,
attention_dropout_rate=transformer_dec_attn_dropout_rate, attention_dropout_rate=transformer_dec_attn_dropout_rate,
pos_enc_class=pos_enc_class, pos_enc_layer_type=transformer_pos_enc_layer_type,
normalize_before=decoder_normalize_before, normalize_before=decoder_normalize_before,
concat_after=decoder_concat_after, concat_after=decoder_concat_after,
positionwise_layer_type=positionwise_layer_type, positionwise_layer_type=positionwise_layer_type,
positionwise_conv_kernel_size=positionwise_conv_kernel_size, ) positionwise_conv_kernel_size=positionwise_conv_kernel_size, )
elif decoder_type == "conformer":
print("decoder_type is conformer")
self.decoder = ConformerEncoder(
idim=0,
attention_dim=adim,
attention_heads=aheads,
linear_units=dunits,
num_blocks=dlayers,
input_layer=None,
dropout_rate=transformer_dec_dropout_rate,
positional_dropout_rate=transformer_dec_positional_dropout_rate,
attention_dropout_rate=transformer_dec_attn_dropout_rate,
normalize_before=decoder_normalize_before,
concat_after=decoder_concat_after,
positionwise_layer_type=positionwise_layer_type,
positionwise_conv_kernel_size=positionwise_conv_kernel_size,
macaron_style=use_macaron_style_in_conformer,
pos_enc_layer_type=conformer_pos_enc_layer_type,
selfattention_layer_type=conformer_self_attn_layer_type,
activation_type=conformer_activation_type,
use_cnn_module=use_cnn_in_conformer,
cnn_module_kernel=conformer_dec_kernel_size, )
else: else:
raise ValueError(f"{decoder_type} is not supported.") raise ValueError(f"{decoder_type} is not supported.")
......
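With the new arguments, the encoder/decoder block type becomes a pure config choice. A hedged construction sketch (idim/odim values invented; only the argument names and defaults come from the signature above):
model = FastSpeech2(
    idim=200,                       # vocabulary size (hypothetical)
    odim=80,                        # mel bins (hypothetical)
    encoder_type="conformer",
    decoder_type="conformer",
    conformer_enc_kernel_size=7,    # defaults from the new signature
    conformer_dec_kernel_size=31)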
...@@ -78,7 +78,7 @@ class MelGANGenerator(nn.Layer): ...@@ -78,7 +78,7 @@ class MelGANGenerator(nn.Layer):
Padding function module name before dilated convolution layer. Padding function module name before dilated convolution layer.
pad_params : dict pad_params : dict
Hyperparameters for padding function. Hyperparameters for padding function.
use_final_nonlinear_activation : paddle.nn.Layer use_final_nonlinear_activation : nn.Layer
Activation function for the final layer. Activation function for the final layer.
use_weight_norm : bool use_weight_norm : bool
Whether to use weight norm. Whether to use weight norm.
......
...@@ -11,13 +11,34 @@ ...@@ -11,13 +11,34 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import numpy as np
import paddle import paddle
from paddle import nn from paddle import nn
from paddlespeech.t2s.modules.expansion import expand
from paddlespeech.t2s.modules.positional_encoding import sinusoid_position_encoding from paddlespeech.t2s.modules.positional_encoding import sinusoid_position_encoding
def expand(encodings: paddle.Tensor, durations: paddle.Tensor) -> paddle.Tensor:
"""
encodings: (B, T, C)
durations: (B, T)
"""
batch_size, t_enc = durations.shape
durations = durations.numpy()
slens = np.sum(durations, -1)
t_dec = np.max(slens)
M = np.zeros([batch_size, t_dec, t_enc])
for i in range(batch_size):
k = 0
for j in range(t_enc):
d = durations[i, j]
M[i, k:k + d, j] = 1
k += d
M = paddle.to_tensor(M, dtype=encodings.dtype)
encodings = paddle.matmul(M, encodings)
return encodings
class ResidualBlock(nn.Layer): class ResidualBlock(nn.Layer):
def __init__(self, channels, kernel_size, dilation, n=2): def __init__(self, channels, kernel_size, dilation, n=2):
super().__init__() super().__init__()
......
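The inlined `expand` builds a (B, t_dec, t_enc) alignment matrix from the durations and left-multiplies it onto the encodings, so each encoder frame is repeated `duration` times. A self-contained numpy rehearsal of the matrix construction (logic copied from the function above):
import numpy as np

durations = np.array([[2, 0, 1]])          # (B=1, t_enc=3)
batch_size, t_enc = durations.shape
t_dec = np.max(np.sum(durations, -1))      # 3 decoder frames
M = np.zeros([batch_size, t_dec, t_enc])
for i in range(batch_size):
    k = 0
    for j in range(t_enc):
        d = durations[i, j]
        M[i, k:k + d, j] = 1
        k += d
print(M[0])
# [[1. 0. 0.]     frames 0-1 copy encoding 0,
#  [1. 0. 0.]     encoding 1 (duration 0) is skipped,
#  [0. 0. 1.]]    frame 2 copies encoding 2
# matmul(M, encodings) then yields the length-regulated sequence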
...@@ -19,8 +19,8 @@ from paddle.fluid.layers import huber_loss ...@@ -19,8 +19,8 @@ from paddle.fluid.layers import huber_loss
from paddle.nn import functional as F from paddle.nn import functional as F
from paddlespeech.t2s.modules.losses import masked_l1_loss from paddlespeech.t2s.modules.losses import masked_l1_loss
from paddlespeech.t2s.modules.losses import ssim
from paddlespeech.t2s.modules.losses import weighted_mean from paddlespeech.t2s.modules.losses import weighted_mean
from paddlespeech.t2s.modules.ssim import ssim
from paddlespeech.t2s.training.extensions.evaluator import StandardEvaluator from paddlespeech.t2s.training.extensions.evaluator import StandardEvaluator
from paddlespeech.t2s.training.reporter import report from paddlespeech.t2s.training.reporter import report
from paddlespeech.t2s.training.updaters.standard_updater import StandardUpdater from paddlespeech.t2s.training.updaters.standard_updater import StandardUpdater
......
...@@ -20,7 +20,6 @@ from paddle.nn import functional as F ...@@ -20,7 +20,6 @@ from paddle.nn import functional as F
from paddle.nn import initializer as I from paddle.nn import initializer as I
from tqdm import trange from tqdm import trange
from paddlespeech.t2s.modules.attention import LocationSensitiveAttention
from paddlespeech.t2s.modules.conv import Conv1dBatchNorm from paddlespeech.t2s.modules.conv import Conv1dBatchNorm
from paddlespeech.t2s.modules.losses import guided_attention_loss from paddlespeech.t2s.modules.losses import guided_attention_loss
from paddlespeech.t2s.utils import checkpoint from paddlespeech.t2s.utils import checkpoint
...@@ -28,6 +27,99 @@ from paddlespeech.t2s.utils import checkpoint ...@@ -28,6 +27,99 @@ from paddlespeech.t2s.utils import checkpoint
__all__ = ["Tacotron2", "Tacotron2Loss"] __all__ = ["Tacotron2", "Tacotron2Loss"]
class LocationSensitiveAttention(nn.Layer):
"""Location Sensitive Attention module.
Reference: `Attention-Based Models for Speech Recognition <https://arxiv.org/pdf/1506.07503.pdf>`_
Parameters
-----------
d_query: int
The feature size of query.
d_key : int
The feature size of key.
d_attention : int
The feature size of the attention representation.
location_filters : int
Filter size of attention convolution.
location_kernel_size : int
Kernel size of attention convolution.
"""
def __init__(self,
d_query: int,
d_key: int,
d_attention: int,
location_filters: int,
location_kernel_size: int):
super().__init__()
self.query_layer = nn.Linear(d_query, d_attention, bias_attr=False)
self.key_layer = nn.Linear(d_key, d_attention, bias_attr=False)
self.value = nn.Linear(d_attention, 1, bias_attr=False)
# Location Layer
self.location_conv = nn.Conv1D(
2,
location_filters,
kernel_size=location_kernel_size,
padding=int((location_kernel_size - 1) / 2),
bias_attr=False,
data_format='NLC')
self.location_layer = nn.Linear(
location_filters, d_attention, bias_attr=False)
def forward(self,
query,
processed_key,
value,
attention_weights_cat,
mask=None):
"""Compute context vector and attention weights.
Parameters
-----------
query : Tensor [shape=(batch_size, d_query)]
The queries.
processed_key : Tensor [shape=(batch_size, time_steps_k, d_attention)]
The keys after linear layer.
value : Tensor [shape=(batch_size, time_steps_k, d_key)]
The values.
attention_weights_cat : Tensor [shape=(batch_size, time_step_k, 2)]
Concatenation of the previous and cumulative attention weights.
mask : Tensor, optional
The mask. Shape should be (batch_size, times_steps_k, 1).
Defaults to None.
Returns
----------
attention_context : Tensor [shape=(batch_size, d_attention)]
The context vector.
attention_weights : Tensor [shape=(batch_size, time_steps_k)]
The attention weights.
"""
processed_query = self.query_layer(paddle.unsqueeze(query, axis=[1]))
processed_attention_weights = self.location_layer(
self.location_conv(attention_weights_cat))
# (B, T_enc, 1)
alignment = self.value(
paddle.tanh(processed_attention_weights + processed_key +
processed_query))
if mask is not None:
alignment = alignment + (1.0 - mask) * -1e9
attention_weights = F.softmax(alignment, axis=1)
attention_context = paddle.matmul(
attention_weights, value, transpose_x=True)
attention_weights = paddle.squeeze(attention_weights, axis=-1)
attention_context = paddle.squeeze(attention_context, axis=1)
return attention_context, attention_weights
class DecoderPreNet(nn.Layer): class DecoderPreNet(nn.Layer):
"""Decoder prenet module for Tacotron2. """Decoder prenet module for Tacotron2.
...@@ -197,7 +289,7 @@ class Tacotron2Encoder(nn.Layer): ...@@ -197,7 +289,7 @@ class Tacotron2Encoder(nn.Layer):
super().__init__() super().__init__()
k = math.sqrt(1.0 / (d_hidden * kernel_size)) k = math.sqrt(1.0 / (d_hidden * kernel_size))
self.conv_batchnorms = paddle.nn.LayerList([ self.conv_batchnorms = nn.LayerList([
Conv1dBatchNorm( Conv1dBatchNorm(
d_hidden, d_hidden,
d_hidden, d_hidden,
...@@ -903,7 +995,7 @@ class Tacotron2Loss(nn.Layer): ...@@ -903,7 +995,7 @@ class Tacotron2Loss(nn.Layer):
self.use_stop_token_loss = use_stop_token_loss self.use_stop_token_loss = use_stop_token_loss
self.use_guided_attention_loss = use_guided_attention_loss self.use_guided_attention_loss = use_guided_attention_loss
self.attn_criterion = guided_attention_loss self.attn_criterion = guided_attention_loss
self.stop_criterion = paddle.nn.BCEWithLogitsLoss() self.stop_criterion = nn.BCEWithLogitsLoss()
self.sigma = sigma self.sigma = sigma
def forward(self, def forward(self,
......
...@@ -23,12 +23,6 @@ import paddle.nn.functional as F ...@@ -23,12 +23,6 @@ import paddle.nn.functional as F
from paddle import nn from paddle import nn
from typeguard import check_argument_types from typeguard import check_argument_types
from paddlespeech.t2s.modules.fastspeech2_transformer.attention import MultiHeadedAttention
from paddlespeech.t2s.modules.fastspeech2_transformer.decoder import Decoder
from paddlespeech.t2s.modules.fastspeech2_transformer.embedding import PositionalEncoding
from paddlespeech.t2s.modules.fastspeech2_transformer.embedding import ScaledPositionalEncoding
from paddlespeech.t2s.modules.fastspeech2_transformer.encoder import Encoder
from paddlespeech.t2s.modules.fastspeech2_transformer.mask import subsequent_mask
from paddlespeech.t2s.modules.nets_utils import initialize from paddlespeech.t2s.modules.nets_utils import initialize
from paddlespeech.t2s.modules.nets_utils import make_non_pad_mask from paddlespeech.t2s.modules.nets_utils import make_non_pad_mask
from paddlespeech.t2s.modules.nets_utils import make_pad_mask from paddlespeech.t2s.modules.nets_utils import make_pad_mask
...@@ -36,6 +30,12 @@ from paddlespeech.t2s.modules.style_encoder import StyleEncoder ...@@ -36,6 +30,12 @@ from paddlespeech.t2s.modules.style_encoder import StyleEncoder
from paddlespeech.t2s.modules.tacotron2.decoder import Postnet from paddlespeech.t2s.modules.tacotron2.decoder import Postnet
from paddlespeech.t2s.modules.tacotron2.decoder import Prenet as DecoderPrenet from paddlespeech.t2s.modules.tacotron2.decoder import Prenet as DecoderPrenet
from paddlespeech.t2s.modules.tacotron2.encoder import Encoder as EncoderPrenet from paddlespeech.t2s.modules.tacotron2.encoder import Encoder as EncoderPrenet
from paddlespeech.t2s.modules.transformer.attention import MultiHeadedAttention
from paddlespeech.t2s.modules.transformer.decoder import Decoder
from paddlespeech.t2s.modules.transformer.embedding import PositionalEncoding
from paddlespeech.t2s.modules.transformer.embedding import ScaledPositionalEncoding
from paddlespeech.t2s.modules.transformer.encoder import TransformerEncoder
from paddlespeech.t2s.modules.transformer.mask import subsequent_mask
class TransformerTTS(nn.Layer): class TransformerTTS(nn.Layer):
...@@ -257,9 +257,9 @@ class TransformerTTS(nn.Layer): ...@@ -257,9 +257,9 @@ class TransformerTTS(nn.Layer):
self.padding_idx = 0 self.padding_idx = 0
# set_global_initializer affects all later layers globally, including create_parameter # set_global_initializer affects all later layers globally, including create_parameter
initialize(self, init_type) initialize(self, init_type)
# get positional encoding class
pos_enc_class = (ScaledPositionalEncoding # get positional encoding layer type
if self.use_scaled_pos_enc else PositionalEncoding) transformer_pos_enc_layer_type = "scaled_abs_pos" if self.use_scaled_pos_enc else "abs_pos"
# define transformer encoder # define transformer encoder
if eprenet_conv_layers != 0: if eprenet_conv_layers != 0:
...@@ -281,7 +281,7 @@ class TransformerTTS(nn.Layer): ...@@ -281,7 +281,7 @@ class TransformerTTS(nn.Layer):
num_embeddings=idim, num_embeddings=idim,
embedding_dim=adim, embedding_dim=adim,
padding_idx=self.padding_idx) padding_idx=self.padding_idx)
self.encoder = Encoder( self.encoder = TransformerEncoder(
idim=idim, idim=idim,
attention_dim=adim, attention_dim=adim,
attention_heads=aheads, attention_heads=aheads,
...@@ -291,7 +291,7 @@ class TransformerTTS(nn.Layer): ...@@ -291,7 +291,7 @@ class TransformerTTS(nn.Layer):
dropout_rate=transformer_enc_dropout_rate, dropout_rate=transformer_enc_dropout_rate,
positional_dropout_rate=transformer_enc_positional_dropout_rate, positional_dropout_rate=transformer_enc_positional_dropout_rate,
attention_dropout_rate=transformer_enc_attn_dropout_rate, attention_dropout_rate=transformer_enc_attn_dropout_rate,
pos_enc_class=pos_enc_class, pos_enc_layer_type=transformer_pos_enc_layer_type,
normalize_before=encoder_normalize_before, normalize_before=encoder_normalize_before,
concat_after=encoder_concat_after, concat_after=encoder_concat_after,
positionwise_layer_type=positionwise_layer_type, positionwise_layer_type=positionwise_layer_type,
...@@ -330,6 +330,9 @@ class TransformerTTS(nn.Layer): ...@@ -330,6 +330,9 @@ class TransformerTTS(nn.Layer):
nn.Linear(dprenet_units, adim), ) nn.Linear(dprenet_units, adim), )
else: else:
decoder_input_layer = "linear" decoder_input_layer = "linear"
# get positional encoding class
pos_enc_class = (ScaledPositionalEncoding
if self.use_scaled_pos_enc else PositionalEncoding)
self.decoder = Decoder( self.decoder = Decoder(
odim=odim, # odim is needed when no prenet is used odim=odim, # odim is needed when no prenet is used
attention_dim=adim, attention_dim=adim,
......
...@@ -329,7 +329,7 @@ class ResidualNet(nn.LayerList): ...@@ -329,7 +329,7 @@ class ResidualNet(nn.LayerList):
if len(dilations_h) != n_layer: if len(dilations_h) != n_layer:
raise ValueError( raise ValueError(
"number of dilations_h should equals num of layers") "number of dilations_h should equals num of layers")
super(ResidualNet, self).__init__() super().__init__()
for i in range(n_layer): for i in range(n_layer):
dilation = (dilations_h[i], 2**i) dilation = (dilations_h[i], 2**i)
layer = ResidualBlock(residual_channels, condition_channels, layer = ResidualBlock(residual_channels, condition_channels,
......
...@@ -11,10 +11,7 @@ ...@@ -11,10 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from .attention import *
from .conv import * from .conv import *
from .geometry import * from .geometry import *
from .losses import * from .losses import *
from .masking import *
from .positional_encoding import * from .positional_encoding import *
from .transformer import *
...@@ -11,8 +11,9 @@ ...@@ -11,8 +11,9 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import paddle
import paddle.nn.functional as F
from paddle import nn from paddle import nn
from paddle.nn import functional as F
class GLU(nn.Layer): class GLU(nn.Layer):
...@@ -24,3 +25,18 @@ class GLU(nn.Layer): ...@@ -24,3 +25,18 @@ class GLU(nn.Layer):
def forward(self, xs): def forward(self, xs):
return F.glu(xs, axis=self.dim) return F.glu(xs, axis=self.dim)
def get_activation(act):
"""Return activation function."""
activation_funcs = {
"hardtanh": paddle.nn.Hardtanh,
"tanh": paddle.nn.Tanh,
"relu": paddle.nn.ReLU,
"selu": paddle.nn.SELU,
"swish": paddle.nn.Swish,
"glu": GLU
}
return activation_funcs[act]()
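`get_activation` is a plain name-to-module lookup; each call returns a fresh layer instance. A small usage sketch (assuming the function as defined above):
import paddle

act = get_activation("swish")                # paddle.nn.Swish()
y = act(paddle.to_tensor([-1.0, 0.0, 1.0]))
# unknown names raise KeyError, so config typos fail fast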
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified from espnet(https://github.com/espnet/espnet)
"""Adversarial loss modules."""
import paddle
import paddle.nn.functional as F
from paddle import nn
class GeneratorAdversarialLoss(nn.Layer):
"""Generator adversarial loss module."""
def __init__(
self,
average_by_discriminators=True,
loss_type="mse", ):
"""Initialize GeneratorAversarialLoss module."""
super().__init__()
self.average_by_discriminators = average_by_discriminators
assert loss_type in ["mse", "hinge"], f"{loss_type} is not supported."
if loss_type == "mse":
self.criterion = self._mse_loss
else:
self.criterion = self._hinge_loss
def forward(self, outputs):
"""Calcualate generator adversarial loss.
Parameters
----------
outputs: Tensor or List
Discriminator outputs or list of discriminator outputs.
Returns
----------
Tensor
Generator adversarial loss value.
"""
if isinstance(outputs, (tuple, list)):
adv_loss = 0.0
for i, outputs_ in enumerate(outputs):
if isinstance(outputs_, (tuple, list)):
# case including feature maps
outputs_ = outputs_[-1]
adv_loss += self.criterion(outputs_)
if self.average_by_discriminators:
adv_loss /= i + 1
else:
adv_loss = self.criterion(outputs)
return adv_loss
def _mse_loss(self, x):
return F.mse_loss(x, paddle.ones_like(x))
def _hinge_loss(self, x):
return -x.mean()
class DiscriminatorAdversarialLoss(nn.Layer):
"""Discriminator adversarial loss module."""
def __init__(
self,
average_by_discriminators=True,
loss_type="mse", ):
"""Initialize DiscriminatorAversarialLoss module."""
super().__init__()
self.average_by_discriminators = average_by_discriminators
assert loss_type in ["mse"], f"{loss_type} is not supported."
if loss_type == "mse":
self.fake_criterion = self._mse_fake_loss
self.real_criterion = self._mse_real_loss
def forward(self, outputs_hat, outputs):
"""Calcualate discriminator adversarial loss.
Parameters
----------
outputs_hat : Tensor or list
Discriminator outputs or list of
discriminator outputs calculated from generator outputs.
outputs : Tensor or list
Discriminator outputs or list of
discriminator outputs calculated from groundtruth.
Returns
----------
Tensor
Discriminator real loss value.
Tensor
Discriminator fake loss value.
"""
if isinstance(outputs, (tuple, list)):
real_loss = 0.0
fake_loss = 0.0
for i, (outputs_hat_,
outputs_) in enumerate(zip(outputs_hat, outputs)):
if isinstance(outputs_hat_, (tuple, list)):
# case including feature maps
outputs_hat_ = outputs_hat_[-1]
outputs_ = outputs_[-1]
real_loss += self.real_criterion(outputs_)
fake_loss += self.fake_criterion(outputs_hat_)
if self.average_by_discriminators:
fake_loss /= i + 1
real_loss /= i + 1
else:
real_loss = self.real_criterion(outputs)
fake_loss = self.fake_criterion(outputs_hat)
return real_loss, fake_loss
def _mse_real_loss(self, x):
return F.mse_loss(x, paddle.ones_like(x))
def _mse_fake_loss(self, x):
return F.mse_loss(x, paddle.zeros_like(x))
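A smoke test of the two modules above, using random tensors in place of discriminator scores (shapes invented; a list of outputs exercises the multi-discriminator averaging branch):
import paddle

gen_loss_fn = GeneratorAdversarialLoss()
dis_loss_fn = DiscriminatorAdversarialLoss()

scores_fake = [paddle.randn([4, 1, 100]) for _ in range(3)]  # from generated audio
scores_real = [paddle.randn([4, 1, 100]) for _ in range(3)]  # from real audio

adv = gen_loss_fn(scores_fake)                       # MSE against ones
real, fake = dis_loss_fn(scores_fake, scores_real)   # ones for real, zeros for fake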
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import numpy as np
import paddle
from paddle import nn
from paddle.nn import functional as F
def scaled_dot_product_attention(q, k, v, mask=None, dropout=0.0,
training=True):
r"""Scaled dot product attention with masking.
Assume that q, k, v all have the same leading dimensions (denoted as * in
descriptions below). Dropout is applied to attention weights before
weighted sum of values.
Parameters
-----------
q : Tensor [shape=(\*, T_q, d)]
the query tensor.
k : Tensor [shape=(\*, T_k, d)]
the key tensor.
v : Tensor [shape=(\*, T_k, d_v)]
the value tensor.
mask : Tensor, [shape=(\*, T_q, T_k) or broadcastable shape], optional
the mask tensor, zeros correspond to paddings. Defaults to None.
Returns
----------
out : Tensor [shape=(\*, T_q, d_v)]
the context vector.
attn_weights : Tensor [shape=(\*, T_q, T_k)]
the attention weights.
"""
d = q.shape[-1] # we only support imperative execution
qk = paddle.matmul(q, k, transpose_y=True)
scaled_logit = paddle.scale(qk, 1.0 / math.sqrt(d))
if mask is not None:
scaled_logit += paddle.scale((1.0 - mask), -1e9) # hard coded here
attn_weights = F.softmax(scaled_logit, axis=-1)
attn_weights = F.dropout(attn_weights, dropout, training=training)
out = paddle.matmul(attn_weights, v)
return out, attn_weights
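# A quick shape rehearsal for scaled_dot_product_attention above
# (illustrative only, not part of the original module):
#
#   q = paddle.randn([2, 5, 64])    # (B, T_q, d)
#   k = paddle.randn([2, 7, 64])    # (B, T_k, d)
#   v = paddle.randn([2, 7, 32])    # (B, T_k, d_v)
#   out, w = scaled_dot_product_attention(q, k, v)
#   # out: [2, 5, 32], w: [2, 5, 7]; a zero in `mask` pushes the logit
#   # towards -1e9, i.e. a (near-)zero attention weight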
def drop_head(x, drop_n_heads, training=True):
"""Drop n context vectors from multiple ones.
Parameters
----------
x : Tensor [shape=(batch_size, num_heads, time_steps, channels)]
The input, multiple context vectors.
drop_n_heads : int [0<= drop_n_heads <= num_heads]
Number of vectors to drop.
training : bool
A flag indicating whether it is in training. If `False`, no dropout is
applied.
Returns
-------
Tensor
The output.
"""
if not training or (drop_n_heads == 0):
return x
batch_size, num_heads, _, _ = x.shape
# drop all heads
if num_heads == drop_n_heads:
return paddle.zeros_like(x)
mask = np.ones([batch_size, num_heads])
mask[:, :drop_n_heads] = 0
for subarray in mask:
np.random.shuffle(subarray)
scale = float(num_heads) / (num_heads - drop_n_heads)
mask = scale * np.reshape(mask, [batch_size, num_heads, 1, 1])
out = x * paddle.to_tensor(mask)
return out
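# drop_head in a nutshell (illustrative): with 4 heads and drop_n_heads=2,
# two randomly chosen head slots per sample are zeroed and the survivors
# are rescaled by 4 / (4 - 2) = 2.0 to keep the expected magnitude;
# with training=False or drop_n_heads=0 the input passes through unchanged:
#
#   x = paddle.ones([2, 4, 3, 8])          # (B, heads, T, C)
#   y = drop_head(x, 2, training=True)     # half the heads zeroed, rest doubled
#   assert (drop_head(x, 2, training=False) == x).all()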
def _split_heads(x, num_heads):
batch_size, time_steps, _ = x.shape
x = paddle.reshape(x, [batch_size, time_steps, num_heads, -1])
x = paddle.transpose(x, [0, 2, 1, 3])
return x
def _concat_heads(x):
batch_size, _, time_steps, _ = x.shape
x = paddle.transpose(x, [0, 2, 1, 3])
x = paddle.reshape(x, [batch_size, time_steps, -1])
return x
# Standard implementations of Monohead Attention & Multihead Attention
class MonoheadAttention(nn.Layer):
"""Monohead Attention module.
Parameters
----------
model_dim : int
Feature size of the query.
dropout : float, optional
Dropout probability of scaled dot product attention and final context
vector. Defaults to 0.0.
k_dim : int, optional
Feature size of the key in scaled dot product attention. If not
provided, it is set to `model_dim`. Defaults to None.
v_dim : int, optional
Feature size of the value in scaled dot product attention. If not
provided, it is set to `model_dim`. Defaults to None.
"""
def __init__(self,
model_dim: int,
dropout: float=0.0,
k_dim: int=None,
v_dim: int=None):
super(MonoheadAttention, self).__init__()
k_dim = k_dim or model_dim
v_dim = v_dim or model_dim
self.affine_q = nn.Linear(model_dim, k_dim)
self.affine_k = nn.Linear(model_dim, k_dim)
self.affine_v = nn.Linear(model_dim, v_dim)
self.affine_o = nn.Linear(v_dim, model_dim)
self.model_dim = model_dim
self.dropout = dropout
def forward(self, q, k, v, mask):
"""Compute context vector and attention weights.
Parameters
-----------
q : Tensor [shape=(batch_size, time_steps_q, model_dim)]
The queries.
k : Tensor [shape=(batch_size, time_steps_k, model_dim)]
The keys.
v : Tensor [shape=(batch_size, time_steps_k, model_dim)]
The values.
mask : Tensor [shape=(batch_size, times_steps_q, time_steps_k] or broadcastable shape
The mask.
Returns
----------
out : Tensor [shape=(batch_size, time_steps_q, model_dim)]
The context vector.
attention_weights : Tensor [shape=(batch_size, times_steps_q, time_steps_k)]
The attention weights.
"""
q = self.affine_q(q) # (B, T, C)
k = self.affine_k(k)
v = self.affine_v(v)
context_vectors, attention_weights = scaled_dot_product_attention(
q, k, v, mask, self.dropout, self.training)
out = self.affine_o(context_vectors)
return out, attention_weights
class MultiheadAttention(nn.Layer):
"""Multihead Attention module.
Parameters
-----------
model_dim: int
The feature size of query.
num_heads : int
The number of attention heads.
dropout : float, optional
Dropout probability of scaled dot product attention and final context
vector. Defaults to 0.0.
k_dim : int, optional
Feature size of the key of each scaled dot product attention. If not
provided, it is set to ``model_dim / num_heads``. Defaults to None.
v_dim : int, optional
Feature size of the value of each scaled dot product attention. If not
provided, it is set to ``model_dim / num_heads``. Defaults to None.
Raises
---------
ValueError
If ``model_dim`` is not divisible by ``num_heads``.
"""
def __init__(self,
model_dim: int,
num_heads: int,
dropout: float=0.0,
k_dim: int=None,
v_dim: int=None):
super(MultiheadAttention, self).__init__()
if model_dim % num_heads != 0:
raise ValueError("model_dim must be divisible by num_heads")
depth = model_dim // num_heads
k_dim = k_dim or depth
v_dim = v_dim or depth
self.affine_q = nn.Linear(model_dim, num_heads * k_dim)
self.affine_k = nn.Linear(model_dim, num_heads * k_dim)
self.affine_v = nn.Linear(model_dim, num_heads * v_dim)
self.affine_o = nn.Linear(num_heads * v_dim, model_dim)
self.num_heads = num_heads
self.model_dim = model_dim
self.dropout = dropout
def forward(self, q, k, v, mask):
"""Compute context vector and attention weights.
Parameters
-----------
q : Tensor [shape=(batch_size, time_steps_q, model_dim)]
The queries.
k : Tensor [shape=(batch_size, time_steps_k, model_dim)]
The keys.
v : Tensor [shape=(batch_size, time_steps_k, model_dim)]
The values.
mask : Tensor [shape=(batch_size, times_steps_q, time_steps_k] or broadcastable shape
The mask.
Returns
----------
out : Tensor [shape=(batch_size, time_steps_q, model_dim)]
The context vector.
attention_weights : Tensor [shape=(batch_size, times_steps_q, time_steps_k)]
The attention weights.
"""
q = _split_heads(self.affine_q(q), self.num_heads) # (B, h, T, C)
k = _split_heads(self.affine_k(k), self.num_heads)
v = _split_heads(self.affine_v(v), self.num_heads)
mask = paddle.unsqueeze(mask, 1) # unsqueeze for the h dim
context_vectors, attention_weights = scaled_dot_product_attention(
q, k, v, mask, self.dropout, self.training)
# NOTE: there is more sophisticated implementation: Scheduled DropHead
context_vectors = _concat_heads(context_vectors) # (B, T, h*C)
out = self.affine_o(context_vectors)
return out, attention_weights
class LocationSensitiveAttention(nn.Layer):
"""Location Sensitive Attention module.
Reference: `Attention-Based Models for Speech Recognition <https://arxiv.org/pdf/1506.07503.pdf>`_
Parameters
-----------
d_query: int
The feature size of query.
d_key : int
The feature size of key.
d_attention : int
The feature size of the attention representation.
location_filters : int
Filter size of attention convolution.
location_kernel_size : int
Kernel size of attention convolution.
"""
def __init__(self,
d_query: int,
d_key: int,
d_attention: int,
location_filters: int,
location_kernel_size: int):
super().__init__()
self.query_layer = nn.Linear(d_query, d_attention, bias_attr=False)
self.key_layer = nn.Linear(d_key, d_attention, bias_attr=False)
self.value = nn.Linear(d_attention, 1, bias_attr=False)
# Location Layer
self.location_conv = nn.Conv1D(
2,
location_filters,
kernel_size=location_kernel_size,
padding=int((location_kernel_size - 1) / 2),
bias_attr=False,
data_format='NLC')
self.location_layer = nn.Linear(
location_filters, d_attention, bias_attr=False)
def forward(self,
query,
processed_key,
value,
attention_weights_cat,
mask=None):
"""Compute context vector and attention weights.
Parameters
-----------
query : Tensor [shape=(batch_size, d_query)]
The queries.
processed_key : Tensor [shape=(batch_size, time_steps_k, d_attention)]
The keys after linear layer.
value : Tensor [shape=(batch_size, time_steps_k, d_key)]
The values.
attention_weights_cat : Tensor [shape=(batch_size, time_steps_k, 2)]
Concatenation of the previous and cumulative attention weights.
mask : Tensor, optional
The mask. Shape should be (batch_size, time_steps_k, 1).
Defaults to None.
Returns
----------
attention_context : Tensor [shape=(batch_size, d_key)]
The context vector.
attention_weights : Tensor [shape=(batch_size, time_steps_k)]
The attention weights.
"""
processed_query = self.query_layer(paddle.unsqueeze(query, axis=[1]))
processed_attention_weights = self.location_layer(
self.location_conv(attention_weights_cat))
# (B, T_enc, 1)
alignment = self.value(
paddle.tanh(processed_attention_weights + processed_key +
processed_query))
if mask is not None:
alignment = alignment + (1.0 - mask) * -1e9
attention_weights = F.softmax(alignment, axis=1)
attention_context = paddle.matmul(
attention_weights, value, transpose_x=True)
attention_weights = paddle.squeeze(attention_weights, axis=-1)
attention_context = paddle.squeeze(attention_context, axis=1)
return attention_context, attention_weights
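# Usage sketch (illustrative; dimensions are arbitrary assumptions). The keys
# are pre-projected once with ``key_layer`` so the projection is not repeated
# at every decoder step:
#
#     attn = LocationSensitiveAttention(
#         d_query=1024, d_key=512, d_attention=128,
#         location_filters=32, location_kernel_size=31)
#     query = paddle.randn([2, 1024])            # decoder state
#     keys = paddle.randn([2, 20, 512])          # encoder outputs
#     processed_key = attn.key_layer(keys)       # (2, 20, 128)
#     weights_cat = paddle.zeros([2, 20, 2])     # [previous, cumulative]
#     context, weights = attn(query, processed_key, keys, weights_cat)
#     # context: (2, 512); weights: (2, 20)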
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import librosa
import numpy as np
import paddle
from librosa.util import pad_center
from paddle import nn
from paddle.nn import functional as F
from scipy import signal
__all__ = ["quantize", "dequantize", "STFT", "MelScale"]
def quantize(values, n_bands):
"""Linearlly quantize a float Tensor in [-1, 1) to an interger Tensor in
[0, n_bands).
Parameters
-----------
values : Tensor [dtype: flaot32 or float64]
The floating point value.
n_bands : int
The number of bands. The output integer Tensor's value is in the range
[0, n_bans).
Returns
----------
Tensor [dtype: int 64]
The quantized tensor.
"""
quantized = paddle.cast((values + 1.0) / 2.0 * n_bands, "int64")
return quantized
def dequantize(quantized, n_bands, dtype=None):
"""Linearlly dequantize an integer Tensor into a float Tensor in the range
[-1, 1).
Parameters
-----------
quantized : Tensor [dtype: int]
The quantized value in the range [0, n_bands).
n_bands : int
Number of bands. The input integer Tensor's value is in the range
[0, n_bans).
dtype : str, optional
Data type of the output.
Returns
-----------
Tensor
The dequantized tensor, dtype is specified by `dtype`. If `dtype` is
not specified, the default float data type is used.
"""
dtype = dtype or paddle.get_default_dtype()
value = (paddle.cast(quantized, dtype) + 0.5) * (2.0 / n_bands) - 1.0
return value
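# Round-trip sketch (illustrative): quantize a waveform to 256 bands and map
# it back; since dequantize returns the band center, the reconstruction error
# is bounded by half the band width, i.e. 1/256 here.
#
#     wav = paddle.uniform([4, 16000], min=-1.0, max=1.0)  # values in [-1, 1)
#     q = quantize(wav, n_bands=256)       # int64 values in [0, 256)
#     recon = dequantize(q, n_bands=256)   # float values in [-1, 1)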
class STFT(nn.Layer):
"""A module for computing stft transformation in a differentiable way.
Parameters
------------
n_fft : int
Number of samples in a frame.
hop_length : int
Number of samples shifted between adjacent frames.
win_length : int
Length of the window.
window : str, optional
Name of window function, see `scipy.signal.get_window` for more
details. Defaults to "hanning".
center : bool
If True, the signal y is padded so that frame D[:, t] is centered
at y[t * hop_length]. If False, then D[:, t] begins at y[t * hop_length].
Defaults to True.
pad_mode : string or function
If center=True, this argument is passed to np.pad for padding the edges
of the signal y. By default (pad_mode="reflect"), y is padded on both
sides with its own reflection, mirrored around its first and last
sample respectively. If center=False, this argument is ignored.
Notes
-----------
It behaves like ``librosa.core.stft``. See ``librosa.core.stft`` for more
details.
Given an audio signal with ``T`` samples, the STFT transformation outputs a
spectrum of shape (C, frames) with complex dtype, where ``C = 1 + n_fft / 2``
and ``frames = 1 + T // hop_length``.
Only ``center`` mode with ``reflect`` padding is supported now.
"""
def __init__(self,
n_fft,
hop_length=None,
win_length=None,
window="hanning",
center=True,
pad_mode="reflect"):
super().__init__()
# By default, use the entire frame
if win_length is None:
win_length = n_fft
# Set the default hop, if it's not already specified
if hop_length is None:
hop_length = int(win_length // 4)
self.hop_length = hop_length
self.n_bin = 1 + n_fft // 2
self.n_fft = n_fft
self.center = center
self.pad_mode = pad_mode
# calculate window
window = signal.get_window(window, win_length, fftbins=True)
# pad window to n_fft size
if n_fft != win_length:
window = pad_center(window, n_fft, mode="constant")
# lpad = (n_fft - win_length) // 2
# rpad = n_fft - win_length - lpad
# window = np.pad(window, ((lpad, pad), ), 'constant')
# calculate weights
# r = np.arange(0, n_fft)
# M = np.expand_dims(r, -1) * np.expand_dims(r, 0)
# w_real = np.reshape(window *
# np.cos(2 * np.pi * M / n_fft)[:self.n_bin],
# (self.n_bin, 1, self.n_fft))
# w_imag = np.reshape(window *
# np.sin(-2 * np.pi * M / n_fft)[:self.n_bin],
# (self.n_bin, 1, self.n_fft))
weight = np.fft.fft(np.eye(n_fft))[:self.n_bin]
w_real = weight.real
w_imag = weight.imag
w = np.concatenate([w_real, w_imag], axis=0)
w = w * window
w = np.expand_dims(w, 1)
weight = paddle.cast(paddle.to_tensor(w), paddle.get_default_dtype())
self.register_buffer("weight", weight)
def forward(self, x):
"""Compute the stft transform.
Parameters
------------
x : Tensor [shape=(B, T)]
The input waveform.
Returns
------------
real : Tensor [shape=(B, C, frames)]
The real part of the spectrogram.
imag : Tensor [shape=(B, C, frames)]
The imaginary part of the spectrogram.
"""
x = paddle.unsqueeze(x, axis=1)
if self.center:
x = F.pad(
x, [self.n_fft // 2, self.n_fft // 2],
data_format='NCL',
mode=self.pad_mode)
# to BCT, C=1
out = F.conv1d(x, self.weight, stride=self.hop_length)
real, imag = paddle.chunk(out, 2, axis=1) # BCT
return real, imag
def power(self, x):
"""Compute the power spectrum.
Parameters
------------
x : Tensor [shape=(B, T)]
The input waveform.
Returns
------------
Tensor [shape=(B, C, frames)]
The power spectrum.
"""
real, imag = self.forward(x)
power = real**2 + imag**2
return power
def magnitude(self, x):
"""Compute the magnitude of the spectrum.
Parameters
------------
x : Tensor [shape=(B, T)]
The input waveform.
Returns
------------
Tensor [shape=(B, C, frames)]
The magnitude of the spectrum.
"""
power = self.power(x)
magnitude = paddle.sqrt(power) # TODO(chenfeiyu): maybe clipping
return magnitude
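# Usage sketch (illustrative; parameter values are arbitrary assumptions):
#
#     stft = STFT(n_fft=1024, hop_length=256)
#     x = paddle.randn([2, 16000])
#     real, imag = stft(x)         # each (2, 513, num_frames)
#     mag = stft.magnitude(x)      # (2, 513, num_frames)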
class MelScale(nn.Layer):
def __init__(self, sr, n_fft, n_mels, fmin, fmax):
super().__init__()
mel_basis = librosa.filters.mel(sr, n_fft, n_mels, fmin, fmax)
# self.weight = paddle.to_tensor(mel_basis)
weight = paddle.to_tensor(mel_basis, dtype=paddle.get_default_dtype())
self.register_buffer("weight", weight)
def forward(self, spec):
# (n_mels, n_freq) * (batch_size, n_freq, n_frames)
mel = paddle.matmul(self.weight, spec)
return mel
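# Usage sketch (illustrative) chaining STFT and MelScale; the sample rate and
# filter-bank settings are arbitrary assumptions:
#
#     mel_scale = MelScale(sr=22050, n_fft=1024, n_mels=80, fmin=0, fmax=8000)
#     spec = STFT(n_fft=1024, hop_length=256).magnitude(x)  # (B, 513, frames)
#     mel = mel_scale(spec)                                 # (B, 80, frames)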
@@ -13,9 +13,10 @@
# limitations under the License.
"""Causal convolution layer modules."""
import paddle
from paddle import nn
class CausalConv1D(nn.Layer):
"""CausalConv1D module with customized initialization."""
def __init__(
@@ -31,7 +32,7 @@ class CausalConv1D(paddle.nn.Layer):
super().__init__()
self.pad = getattr(paddle.nn, pad)((kernel_size - 1) * dilation,
**pad_params)
self.conv = nn.Conv1D(
in_channels,
out_channels,
kernel_size,
@@ -52,7 +53,7 @@ class CausalConv1D(paddle.nn.Layer):
return self.conv(self.pad(x))[:, :, :x.shape[2]]
class CausalConv1DTranspose(nn.Layer):
"""CausalConv1DTranspose module with customized initialization."""
def __init__(self,
@@ -63,7 +64,7 @@ class CausalConv1DTranspose(paddle.nn.Layer):
bias=True):
"""Initialize CausalConvTranspose1d module."""
super().__init__()
self.deconv = nn.Conv1DTranspose(
in_channels, out_channels, kernel_size, stride, bias_attr=bias)
self.stride = stride
......
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified from espnet(https://github.com/espnet/espnet)
"""ConvolutionModule definition."""
from paddle import nn
class ConvolutionModule(nn.Layer):
"""ConvolutionModule in Conformer model.
Parameters
----------
channels : int
The number of channels of conv layers.
kernel_size : int
Kernel size of conv layers.
"""
def __init__(self, channels, kernel_size, activation=nn.ReLU(), bias=True):
"""Construct an ConvolutionModule object."""
super().__init__()
# kernerl_size should be a odd number for 'SAME' padding
assert (kernel_size - 1) % 2 == 0
self.pointwise_conv1 = nn.Conv1D(
channels,
2 * channels,
kernel_size=1,
stride=1,
padding=0,
bias_attr=bias, )
self.depthwise_conv = nn.Conv1D(
channels,
channels,
kernel_size,
stride=1,
padding=(kernel_size - 1) // 2,
groups=channels,
bias_attr=bias, )
self.norm = nn.BatchNorm1D(channels)
self.pointwise_conv2 = nn.Conv1D(
channels,
channels,
kernel_size=1,
stride=1,
padding=0,
bias_attr=bias, )
self.activation = activation
def forward(self, x):
"""Compute convolution module.
Parameters
----------
x : paddle.Tensor
Input tensor (#batch, time, channels).
Returns
----------
paddle.Tensor
Output tensor (#batch, time, channels).
"""
# exchange the temporal dimension and the feature dimension
x = x.transpose([0, 2, 1])
# GLU mechanism
# (batch, 2*channel, time)
x = self.pointwise_conv1(x)
# (batch, channel, time)
x = nn.functional.glu(x, axis=1)
# 1D Depthwise Conv
x = self.depthwise_conv(x)
x = self.activation(self.norm(x))
x = self.pointwise_conv2(x)
return x.transpose([0, 2, 1])
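# Usage sketch (illustrative; channel count and kernel size are arbitrary
# assumptions, the kernel size only has to be odd):
#
#     conv = ConvolutionModule(channels=256, kernel_size=31)
#     x = paddle.randn([2, 100, 256])    # (batch, time, channels)
#     y = conv(x)                        # (2, 100, 256)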
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified from espnet(https://github.com/espnet/espnet)
"""Encoder self-attention layer definition."""
import paddle
from paddle import nn
from paddlespeech.t2s.modules.layer_norm import LayerNorm
class EncoderLayer(nn.Layer):
"""Encoder layer module.
Parameters
----------
size : int
Input dimension.
self_attn : nn.Layer
Self-attention module instance.
`MultiHeadedAttention` or `RelPositionMultiHeadedAttention` instance
can be used as the argument.
feed_forward : nn.Layer
Feed-forward module instance.
`PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance
can be used as the argument.
feed_forward_macaron : nn.Layer
Additional feed-forward module instance.
`PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance
can be used as the argument.
conv_module : nn.Layer
Convolution module instance.
`ConvolutionModule` instance can be used as the argument.
dropout_rate : float
Dropout rate.
normalize_before : bool
Whether to use layer_norm before the first block.
concat_after : bool
Whether to concat attention layer's input and output.
if True, additional linear will be applied.
i.e. x -> x + linear(concat(x, att(x)))
if False, no additional linear will be applied. i.e. x -> x + att(x)
stochastic_depth_rate : float
Probability to skip this layer.
During training, the layer may skip residual computation and return input
as-is with given probability.
"""
def __init__(
self,
size,
self_attn,
feed_forward,
feed_forward_macaron,
conv_module,
dropout_rate,
normalize_before=True,
concat_after=False,
stochastic_depth_rate=0.0, ):
"""Construct an EncoderLayer object."""
super().__init__()
self.self_attn = self_attn
self.feed_forward = feed_forward
self.feed_forward_macaron = feed_forward_macaron
self.conv_module = conv_module
self.norm_ff = LayerNorm(size) # for the FNN module
self.norm_mha = LayerNorm(size) # for the MHA module
if feed_forward_macaron is not None:
self.norm_ff_macaron = LayerNorm(size)
self.ff_scale = 0.5
else:
self.ff_scale = 1.0
if self.conv_module is not None:
self.norm_conv = LayerNorm(size) # for the CNN module
self.norm_final = LayerNorm(
size) # for the final output of the block
self.dropout = nn.Dropout(dropout_rate)
self.size = size
self.normalize_before = normalize_before
self.concat_after = concat_after
if self.concat_after:
self.concat_linear = nn.Linear(size + size, size)
self.stochastic_depth_rate = stochastic_depth_rate
def forward(self, x_input, mask, cache=None):
"""Compute encoded features.
Parameters
----------
x_input : Union[Tuple, paddle.Tensor]
Input tensor w/ or w/o pos emb.
- w/ pos emb: Tuple of tensors [(#batch, time, size), (1, time, size)].
- w/o pos emb: Tensor (#batch, time, size).
mask : paddle.Tensor
Mask tensor for the input (#batch, time).
cache : paddle.Tensor
Cache tensor of the input (#batch, time - 1, size).
Returns
----------
paddle.Tensor
Output tensor (#batch, time, size).
paddle.Tensor
Mask tensor (#batch, time).
"""
if isinstance(x_input, tuple):
x, pos_emb = x_input[0], x_input[1]
else:
x, pos_emb = x_input, None
skip_layer = False
# with stochastic depth, residual connection `x + f(x)` becomes
# `x <- x + 1 / (1 - p) * f(x)` at training time.
stoch_layer_coeff = 1.0
if self.training and self.stochastic_depth_rate > 0:
skip_layer = paddle.rand([1]).item() < self.stochastic_depth_rate
stoch_layer_coeff = 1.0 / (1 - self.stochastic_depth_rate)
if skip_layer:
if cache is not None:
x = paddle.concat([cache, x], axis=1)
if pos_emb is not None:
return (x, pos_emb), mask
return x, mask
# whether to use macaron style
if self.feed_forward_macaron is not None:
residual = x
if self.normalize_before:
x = self.norm_ff_macaron(x)
x = residual + stoch_layer_coeff * self.ff_scale * self.dropout(
self.feed_forward_macaron(x))
if not self.normalize_before:
x = self.norm_ff_macaron(x)
# multi-headed self-attention module
residual = x
if self.normalize_before:
x = self.norm_mha(x)
if cache is None:
x_q = x
else:
assert cache.shape == (x.shape[0], x.shape[1] - 1, self.size)
x_q = x[:, -1:, :]
residual = residual[:, -1:, :]
mask = None if mask is None else mask[:, -1:, :]
if pos_emb is not None:
x_att = self.self_attn(x_q, x, x, pos_emb, mask)
else:
x_att = self.self_attn(x_q, x, x, mask)
if self.concat_after:
x_concat = paddle.concat((x, x_att), axis=-1)
x = residual + stoch_layer_coeff * self.concat_linear(x_concat)
else:
x = residual + stoch_layer_coeff * self.dropout(x_att)
if not self.normalize_before:
x = self.norm_mha(x)
# convolution module
if self.conv_module is not None:
residual = x
if self.normalize_before:
x = self.norm_conv(x)
x = residual + stoch_layer_coeff * self.dropout(self.conv_module(x))
if not self.normalize_before:
x = self.norm_conv(x)
# feed forward module
residual = x
if self.normalize_before:
x = self.norm_ff(x)
x = residual + stoch_layer_coeff * self.ff_scale * self.dropout(
self.feed_forward(x))
if not self.normalize_before:
x = self.norm_ff(x)
if self.conv_module is not None:
x = self.norm_final(x)
if cache is not None:
x = paddle.concat([cache, x], axis=1)
if pos_emb is not None:
return (x, pos_emb), mask
return x, mask
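# Wiring sketch (illustrative): ``attn`` and ``ffn`` stand for instances of
# the attention and feed-forward classes named in the docstring, which live
# in sibling modules; the macaron branch and the convolution module are
# optional and disabled here:
#
#     layer = EncoderLayer(size=256, self_attn=attn, feed_forward=ffn,
#                          feed_forward_macaron=None, conv_module=None,
#                          dropout_rate=0.1)
#     x = paddle.randn([2, 50, 256])
#     mask = paddle.ones([2, 1, 50])
#     y, mask = layer(x, mask)    # y: (2, 50, 256)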
@@ -84,7 +84,7 @@ class Conv1dCell(nn.Conv1D):
_kernel_size = kernel_size[0] if isinstance(kernel_size, (
tuple, list)) else kernel_size
self._r = 1 + (_kernel_size - 1) * _dilation
super().__init__(
in_channels,
out_channels,
kernel_size,
@@ -226,7 +226,7 @@ class Conv1dBatchNorm(nn.Layer):
data_format="NCL",
momentum=0.9,
epsilon=1e-05):
super().__init__()
self.conv = nn.Conv1D(
in_channels,
out_channels,
......
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified from espnet(https://github.com/espnet/espnet)
from paddle import nn
from paddlespeech.t2s.modules.fastspeech2_transformer.attention import MultiHeadedAttention
from paddlespeech.t2s.modules.fastspeech2_transformer.embedding import PositionalEncoding
from paddlespeech.t2s.modules.fastspeech2_transformer.encoder_layer import EncoderLayer
from paddlespeech.t2s.modules.fastspeech2_transformer.multi_layer_conv import Conv1dLinear
from paddlespeech.t2s.modules.fastspeech2_transformer.multi_layer_conv import MultiLayeredConv1d
from paddlespeech.t2s.modules.fastspeech2_transformer.positionwise_feed_forward import PositionwiseFeedForward
from paddlespeech.t2s.modules.fastspeech2_transformer.repeat import repeat
class Encoder(nn.Layer):
"""Transformer encoder module.
Parameters
----------
idim : int
Input dimension.
attention_dim : int
Dimension of attention.
attention_heads : int
The number of heads of multi head attention.
linear_units : int
The number of units of position-wise feed forward.
num_blocks : int
The number of encoder blocks.
dropout_rate : float
Dropout rate.
positional_dropout_rate : float
Dropout rate after adding positional encoding.
attention_dropout_rate : float
Dropout rate in attention.
input_layer : Union[str, paddle.nn.Layer]
Input layer type.
pos_enc_class : paddle.nn.Layer
Positional encoding module class.
`PositionalEncoding` or `ScaledPositionalEncoding`
normalize_before : bool
Whether to use layer_norm before the first block.
concat_after : bool
Whether to concat attention layer's input and output.
if True, additional linear will be applied.
i.e. x -> x + linear(concat(x, att(x)))
if False, no additional linear will be applied. i.e. x -> x + att(x)
positionwise_layer_type : str
"linear", "conv1d", or "conv1d-linear".
positionwise_conv_kernel_size : int
Kernel size of positionwise conv1d layer.
selfattention_layer_type : str
Encoder attention layer type.
padding_idx : int
Padding idx for input_layer=embed.
"""
def __init__(
self,
idim,
attention_dim=256,
attention_heads=4,
linear_units=2048,
num_blocks=6,
dropout_rate=0.1,
positional_dropout_rate=0.1,
attention_dropout_rate=0.0,
input_layer="conv2d",
pos_enc_class=PositionalEncoding,
normalize_before=True,
concat_after=False,
positionwise_layer_type="linear",
positionwise_conv_kernel_size=1,
selfattention_layer_type="selfattn",
padding_idx=-1, ):
"""Construct an Encoder object."""
super(Encoder, self).__init__()
self.conv_subsampling_factor = 1
if input_layer == "linear":
self.embed = nn.Sequential(
nn.Linear(idim, attention_dim, bias_attr=True),
nn.LayerNorm(attention_dim),
nn.Dropout(dropout_rate),
nn.ReLU(),
pos_enc_class(attention_dim, positional_dropout_rate), )
elif input_layer == "embed":
self.embed = nn.Sequential(
nn.Embedding(idim, attention_dim, padding_idx=padding_idx),
pos_enc_class(attention_dim, positional_dropout_rate), )
elif isinstance(input_layer, nn.Layer):
self.embed = nn.Sequential(
input_layer,
pos_enc_class(attention_dim, positional_dropout_rate), )
elif input_layer is None:
self.embed = nn.Sequential(
pos_enc_class(attention_dim, positional_dropout_rate))
else:
raise ValueError("unknown input_layer: " + input_layer)
self.normalize_before = normalize_before
positionwise_layer, positionwise_layer_args = self.get_positionwise_layer(
positionwise_layer_type,
attention_dim,
linear_units,
dropout_rate,
positionwise_conv_kernel_size, )
if selfattention_layer_type in [
"selfattn",
"rel_selfattn",
"legacy_rel_selfattn",
]:
encoder_selfattn_layer = MultiHeadedAttention
encoder_selfattn_layer_args = [
(attention_heads, attention_dim, attention_dropout_rate, )
] * num_blocks
else:
raise NotImplementedError(selfattention_layer_type)
self.encoders = repeat(
num_blocks,
lambda lnum: EncoderLayer(
attention_dim,
encoder_selfattn_layer(*encoder_selfattn_layer_args[lnum]),
positionwise_layer(*positionwise_layer_args),
dropout_rate,
normalize_before,
concat_after, ), )
if self.normalize_before:
self.after_norm = nn.LayerNorm(attention_dim)
def get_positionwise_layer(
self,
positionwise_layer_type="linear",
attention_dim=256,
linear_units=2048,
dropout_rate=0.1,
positionwise_conv_kernel_size=1, ):
"""Define positionwise layer."""
if positionwise_layer_type == "linear":
positionwise_layer = PositionwiseFeedForward
positionwise_layer_args = (attention_dim, linear_units,
dropout_rate)
elif positionwise_layer_type == "conv1d":
positionwise_layer = MultiLayeredConv1d
positionwise_layer_args = (attention_dim, linear_units,
positionwise_conv_kernel_size,
dropout_rate, )
elif positionwise_layer_type == "conv1d-linear":
positionwise_layer = Conv1dLinear
positionwise_layer_args = (attention_dim, linear_units,
positionwise_conv_kernel_size,
dropout_rate, )
else:
raise NotImplementedError("Support only linear or conv1d.")
return positionwise_layer, positionwise_layer_args
def forward(self, xs, masks):
"""Encode input sequence.
Parameters
----------
xs : paddle.Tensor
Input tensor (#batch, time, idim).
masks : paddle.Tensor
Mask tensor (#batch, time).
Returns
----------
paddle.Tensor
Output tensor (#batch, time, attention_dim).
paddle.Tensor
Mask tensor (#batch, time).
"""
xs = self.embed(xs)
xs, masks = self.encoders(xs, masks)
if self.normalize_before:
xs = self.after_norm(xs)
return xs, masks
def forward_one_step(self, xs, masks, cache=None):
"""Encode input frame.
Parameters
----------
xs : paddle.Tensor
Input tensor.
masks : paddle.Tensor
Mask tensor.
cache : List[paddle.Tensor]
List of cache tensors.
Returns
----------
paddle.Tensor
Output tensor.
paddle.Tensor
Mask tensor.
List[paddle.Tensor]
List of new cache tensors.
"""
xs = self.embed(xs)
if cache is None:
cache = [None for _ in range(len(self.encoders))]
new_cache = []
for c, e in zip(cache, self.encoders):
xs, masks = e(xs, masks, cache=c)
new_cache.append(xs)
if self.normalize_before:
xs = self.after_norm(xs)
return xs, masks, new_cache
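# Usage sketch (illustrative; the sizes below and the espnet-style mask
# layout (#batch, 1, time) are assumptions):
#
#     encoder = Encoder(idim=80, attention_dim=256, attention_heads=4,
#                       linear_units=1024, num_blocks=4, input_layer="linear")
#     xs = paddle.randn([2, 50, 80])
#     masks = paddle.ones([2, 1, 50], dtype="bool")
#     hs, hs_masks = encoder(xs, masks)    # hs: (2, 50, 256)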
@@ -13,9 +13,10 @@
# limitations under the License.
"""Layer normalization module."""
import paddle
from paddle import nn
class LayerNorm(nn.LayerNorm):
"""Layer normalization module.
Parameters
@@ -28,7 +29,7 @@ class LayerNorm(paddle.nn.LayerNorm):
def __init__(self, nout, dim=-1):
"""Construct a LayerNorm object."""
super().__init__(nout)
self.dim = dim
def forward(self, x):
......
@@ -11,18 +11,16 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import paddle import paddle
from paddle import nn
from paddle.fluid.layers import sequence_mask from paddle.fluid.layers import sequence_mask
from paddle.nn import functional as F from paddle.nn import functional as F
from scipy import signal
__all__ = [
"guided_attention_loss",
"weighted_mean",
"masked_l1_loss",
"masked_softmax_with_cross_entropy",
]
# Loss for Tacotron2
def attention_guide(dec_lens, enc_lens, N, T, g, dtype=None):
"""Build that W matrix. shape(B, T_dec, T_enc)
W[i, n, t] = 1 - exp(-(n/dec_lens[i] - t/enc_lens[i])**2 / (2g**2))
@@ -57,6 +55,367 @@ def guided_attention_loss(attention_weight, dec_lens, enc_lens, g):
return loss
# Losses for GAN Vocoder
def stft(x,
fft_size,
hop_length=None,
win_length=None,
window='hann',
center=True,
pad_mode='reflect'):
"""Perform STFT and convert to magnitude spectrogram.
Parameters
----------
x : Tensor
Input signal tensor (B, T).
fft_size : int
FFT size.
hop_length : int
Number of samples shifted between adjacent frames.
win_length : int
Length of the window.
window : str, optional
Name of window function, see `scipy.signal.get_window` for more
details. Defaults to "hann".
center : bool, optional
Whether to pad `x` on both sides so that the :math:`t`-th frame is
centered at :math:`t \times hop\_length`. Defaults to `True`.
pad_mode : str, optional
Padding pattern used when `center` is `True`. Defaults to "reflect".
Returns
----------
Tensor:
Magnitude spectrogram (B, #frames, fft_size // 2 + 1).
"""
# calculate window
window = signal.get_window(window, win_length, fftbins=True)
window = paddle.to_tensor(window)
x_stft = paddle.signal.stft(
x,
fft_size,
hop_length,
win_length,
window=window,
center=center,
pad_mode=pad_mode)
real = x_stft.real()
imag = x_stft.imag()
return paddle.sqrt(paddle.clip(real**2 + imag**2, min=1e-7)).transpose(
[0, 2, 1])
class SpectralConvergenceLoss(nn.Layer):
"""Spectral convergence loss module."""
def __init__(self):
"""Initilize spectral convergence loss module."""
super().__init__()
def forward(self, x_mag, y_mag):
"""Calculate forward propagation.
Parameters
----------
x_mag : Tensor
Magnitude spectrogram of predicted signal (B, #frames, #freq_bins).
y_mag : Tensor
Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins).
Returns
----------
Tensor
Spectral convergence loss value.
"""
return paddle.norm(
y_mag - x_mag, p="fro") / paddle.clip(
paddle.norm(y_mag, p="fro"), min=1e-10)
class LogSTFTMagnitudeLoss(nn.Layer):
"""Log STFT magnitude loss module."""
def __init__(self, epsilon=1e-7):
"""Initilize los STFT magnitude loss module."""
super().__init__()
self.epsilon = epsilon
def forward(self, x_mag, y_mag):
"""Calculate forward propagation.
Parameters
----------
x_mag : Tensor
Magnitude spectrogram of predicted signal (B, #frames, #freq_bins).
y_mag : Tensor
Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins).
Returns
----------
Tensor
Log STFT magnitude loss value.
"""
return F.l1_loss(
paddle.log(paddle.clip(y_mag, min=self.epsilon)),
paddle.log(paddle.clip(x_mag, min=self.epsilon)))
class STFTLoss(nn.Layer):
"""STFT loss module."""
def __init__(self,
fft_size=1024,
shift_size=120,
win_length=600,
window="hann"):
"""Initialize STFT loss module."""
super().__init__()
self.fft_size = fft_size
self.shift_size = shift_size
self.win_length = win_length
self.window = window
self.spectral_convergence_loss = SpectralConvergenceLoss()
self.log_stft_magnitude_loss = LogSTFTMagnitudeLoss()
def forward(self, x, y):
"""Calculate forward propagation.
Parameters
----------
x : Tensor
Predicted signal (B, T).
y : Tensor
Groundtruth signal (B, T).
Returns
----------
Tensor
Spectral convergence loss value.
Tensor
Log STFT magnitude loss value.
"""
x_mag = stft(x, self.fft_size, self.shift_size, self.win_length,
self.window)
y_mag = stft(y, self.fft_size, self.shift_size, self.win_length,
self.window)
sc_loss = self.spectral_convergence_loss(x_mag, y_mag)
mag_loss = self.log_stft_magnitude_loss(x_mag, y_mag)
return sc_loss, mag_loss
class MultiResolutionSTFTLoss(nn.Layer):
"""Multi resolution STFT loss module."""
def __init__(
self,
fft_sizes=[1024, 2048, 512],
hop_sizes=[120, 240, 50],
win_lengths=[600, 1200, 240],
window="hann", ):
"""Initialize Multi resolution STFT loss module.
Parameters
----------
fft_sizes : list
List of FFT sizes.
hop_sizes : list
List of hop sizes.
win_lengths : list
List of window lengths.
window : str
Window function type.
"""
super().__init__()
assert len(fft_sizes) == len(hop_sizes) == len(win_lengths)
self.stft_losses = nn.LayerList()
for fs, ss, wl in zip(fft_sizes, hop_sizes, win_lengths):
self.stft_losses.append(STFTLoss(fs, ss, wl, window))
def forward(self, x, y):
"""Calculate forward propagation.
Parameters
----------
x : Tensor
Predicted signal (B, T) or (B, #subband, T).
y : Tensor
Groundtruth signal (B, T) or (B, #subband, T).
Returns
----------
Tensor
Multi resolution spectral convergence loss value.
Tensor
Multi resolution log STFT magnitude loss value.
"""
if len(x.shape) == 3:
# (B, C, T) -> (B x C, T)
x = x.reshape([-1, x.shape[2]])
# (B, C, T) -> (B x C, T)
y = y.reshape([-1, y.shape[2]])
sc_loss = 0.0
mag_loss = 0.0
for f in self.stft_losses:
sc_l, mag_l = f(x, y)
sc_loss += sc_l
mag_loss += mag_l
sc_loss /= len(self.stft_losses)
mag_loss /= len(self.stft_losses)
return sc_loss, mag_loss
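# Usage sketch (illustrative): the default settings average STFT losses over
# three resolutions; the waveform lengths below are arbitrary assumptions.
#
#     criterion = MultiResolutionSTFTLoss()
#     y_hat = paddle.randn([4, 8192])    # generated waveform
#     y = paddle.randn([4, 8192])        # ground-truth waveform
#     sc_loss, mag_loss = criterion(y_hat, y)    # two scalar tensors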
class GeneratorAdversarialLoss(nn.Layer):
"""Generator adversarial loss module."""
def __init__(
self,
average_by_discriminators=True,
loss_type="mse", ):
"""Initialize GeneratorAversarialLoss module."""
super().__init__()
self.average_by_discriminators = average_by_discriminators
assert loss_type in ["mse", "hinge"], f"{loss_type} is not supported."
if loss_type == "mse":
self.criterion = self._mse_loss
else:
self.criterion = self._hinge_loss
def forward(self, outputs):
"""Calcualate generator adversarial loss.
Parameters
----------
outputs: Tensor or List
Discriminator outputs or list of discriminator outputs.
Returns
----------
Tensor
Generator adversarial loss value.
"""
if isinstance(outputs, (tuple, list)):
adv_loss = 0.0
for i, outputs_ in enumerate(outputs):
if isinstance(outputs_, (tuple, list)):
# case including feature maps
outputs_ = outputs_[-1]
adv_loss += self.criterion(outputs_)
if self.average_by_discriminators:
adv_loss /= i + 1
else:
adv_loss = self.criterion(outputs)
return adv_loss
def _mse_loss(self, x):
return F.mse_loss(x, paddle.ones_like(x))
def _hinge_loss(self, x):
return -x.mean()
class DiscriminatorAdversarialLoss(nn.Layer):
"""Discriminator adversarial loss module."""
def __init__(
self,
average_by_discriminators=True,
loss_type="mse", ):
"""Initialize DiscriminatorAversarialLoss module."""
super().__init__()
self.average_by_discriminators = average_by_discriminators
assert loss_type in ["mse"], f"{loss_type} is not supported."
if loss_type == "mse":
self.fake_criterion = self._mse_fake_loss
self.real_criterion = self._mse_real_loss
def forward(self, outputs_hat, outputs):
"""Calcualate discriminator adversarial loss.
Parameters
----------
outputs_hat : Tensor or list
Discriminator outputs or list of
discriminator outputs calculated from generator outputs.
outputs : Tensor or list
Discriminator outputs or list of
discriminator outputs calculated from groundtruth.
Returns
----------
Tensor
Discriminator real loss value.
Tensor
Discriminator fake loss value.
"""
if isinstance(outputs, (tuple, list)):
real_loss = 0.0
fake_loss = 0.0
for i, (outputs_hat_,
outputs_) in enumerate(zip(outputs_hat, outputs)):
if isinstance(outputs_hat_, (tuple, list)):
# case including feature maps
outputs_hat_ = outputs_hat_[-1]
outputs_ = outputs_[-1]
real_loss += self.real_criterion(outputs_)
fake_loss += self.fake_criterion(outputs_hat_)
if self.average_by_discriminators:
fake_loss /= i + 1
real_loss /= i + 1
else:
real_loss = self.real_criterion(outputs)
fake_loss = self.fake_criterion(outputs_hat)
return real_loss, fake_loss
def _mse_real_loss(self, x):
return F.mse_loss(x, paddle.ones_like(x))
def _mse_fake_loss(self, x):
return F.mse_loss(x, paddle.zeros_like(x))
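# Usage sketch (illustrative) for a multi-discriminator setup; the list of
# three dummy discriminator outputs is an arbitrary assumption:
#
#     gen_adv = GeneratorAdversarialLoss()
#     dis_adv = DiscriminatorAdversarialLoss()
#     d_fake = [paddle.randn([4, 1, 100]) for _ in range(3)]
#     d_real = [paddle.randn([4, 1, 100]) for _ in range(3)]
#     adv_loss = gen_adv(d_fake)                        # generator side
#     real_loss, fake_loss = dis_adv(d_fake, d_real)    # discriminator side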
# Losses for SpeedySpeech
# Structural Similarity Index Measure (SSIM)
def gaussian(window_size, sigma):
gauss = paddle.to_tensor([
math.exp(-(x - window_size // 2)**2 / float(2 * sigma**2))
for x in range(window_size)
])
return gauss / gauss.sum()
def create_window(window_size, channel):
_1D_window = gaussian(window_size, 1.5).unsqueeze(1)
_2D_window = paddle.matmul(_1D_window, paddle.transpose(
_1D_window, [1, 0])).unsqueeze([0, 1])
window = paddle.expand(_2D_window, [channel, 1, window_size, window_size])
return window
def _ssim(img1, img2, window, window_size, channel, size_average=True):
mu1 = F.conv2d(img1, window, padding=window_size // 2, groups=channel)
mu2 = F.conv2d(img2, window, padding=window_size // 2, groups=channel)
mu1_sq = mu1.pow(2)
mu2_sq = mu2.pow(2)
mu1_mu2 = mu1 * mu2
sigma1_sq = F.conv2d(
img1 * img1, window, padding=window_size // 2, groups=channel) - mu1_sq
sigma2_sq = F.conv2d(
img2 * img2, window, padding=window_size // 2, groups=channel) - mu2_sq
sigma12 = F.conv2d(
img1 * img2, window, padding=window_size // 2, groups=channel) - mu1_mu2
C1 = 0.01**2
C2 = 0.03**2
ssim_map = ((2 * mu1_mu2 + C1) * (2 * sigma12 + C2)) \
/ ((mu1_sq + mu2_sq + C1) * (sigma1_sq + sigma2_sq + C2))
if size_average:
return ssim_map.mean()
else:
return ssim_map.mean(1).mean(1).mean(1)
def ssim(img1, img2, window_size=11, size_average=True):
(_, channel, _, _) = img1.shape
window = create_window(window_size, channel)
return _ssim(img1, img2, window, window_size, channel, size_average)
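# Usage sketch (illustrative): the SSIM of two nearly identical
# single-channel "images" (e.g. spectrograms as NCHW tensors) is close to 1.
#
#     img1 = paddle.rand([1, 1, 64, 64])
#     img2 = img1 + 0.01 * paddle.randn([1, 1, 64, 64])
#     score = ssim(img1, img2)    # scalar tensor near 1.0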
def weighted_mean(input, weight):
"""Weighted mean. It can also be used as masked mean.
@@ -98,28 +457,3 @@ def masked_l1_loss(prediction, target, mask):
abs_error = F.l1_loss(prediction, target, reduction='none')
loss = weighted_mean(abs_error, mask)
return loss
def masked_softmax_with_cross_entropy(logits, label, mask, axis=-1):
"""Compute masked softmax with cross entropy loss.
Parameters
----------
logits : Tensor
The logits. The ``axis``-th axis is the class dimension.
label : Tensor [dtype: int]
The label. The size of the ``axis``-th axis should be 1.
mask : Tensor
The mask. The shape should be broadcastable to ``label``.
axis : int, optional
The index of the class dimension in the shape of ``logits``, by default
-1.
Returns
-------
Tensor [shape=(1,)]
The masked softmax with cross entropy loss.
"""
ce = F.softmax_with_cross_entropy(logits, label, axis=axis)
loss = weighted_mean(ce, mask)
return loss
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
__all__ = [
"id_mask",
"feature_mask",
"combine_mask",
"future_mask",
]
def id_mask(input, padding_index=0, dtype="bool"):
"""Generate mask with input ids.
Those positions where the value equals ``padding_index`` correspond to 0 or
``False``, otherwise, 1 or ``True``.
Parameters
----------
input : Tensor [dtype: int]
The input tensor. It represents the ids.
padding_index : int, optional
The id which represents padding, by default 0.
dtype : str, optional
Data type of the returned mask, by default "bool".
Returns
-------
Tensor
The generated mask. It has the same shape as ``input``.
"""
return paddle.cast(input != padding_index, dtype)
def feature_mask(input, axis, dtype="bool"):
"""Compute mask from input features.
For input features represented as batched feature vectors, vectors that
are all zeros are considered padding vectors.
Parameters
----------
input : Tensor [dtype: float]
The input tensor which represents features.
axis : int
The index of the feature dimension in ``input``. Other dimensions are
considered ``spatial`` dimensions.
dtype : str, optional
Data type of the generated mask, by default "bool"
Returns
-------
Tensor
The generated mask with ``spatial`` shape as mentioned above.
It has one less dimension than ``input`` does.
"""
feature_sum = paddle.sum(paddle.abs(input), axis)
return paddle.cast(feature_sum != 0, dtype)
def combine_mask(mask1, mask2):
"""Combine two mask with multiplication or logical and.
Parameters
-----------
mask1 : Tensor
The first mask.
mask2 : Tensor
The second mask with broadcastable shape with ``mask1``.
Returns
--------
Tensor
Combined mask.
Notes
------
It is mainly used to combine the padding mask and no future mask for
transformer decoder.
Padding mask is used to mask padding positions of the decoder inputs and
no future mask is used to prevent the decoder to see future information.
"""
if mask1.dtype == paddle.fluid.core.VarDesc.VarType.BOOL:
return paddle.logical_and(mask1, mask2)
else:
return mask1 * mask2
def future_mask(time_steps, dtype="bool"):
"""Generate lower triangular mask.
It is used at transformer decoder to prevent the decoder to see future
information.
Parameters
----------
time_steps : int
Decoder time steps.
dtype : str, optional
The data type of the generated mask, by default "bool".
Returns
-------
Tensor
The generated mask.
"""
mask = paddle.tril(paddle.ones([time_steps, time_steps]))
return paddle.cast(mask, dtype)
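# Usage sketch (illustrative): building a decoder mask by combining a padding
# mask with the no-future mask; the ids below are arbitrary assumptions.
#
#     ids = paddle.to_tensor([[2, 3, 0, 0], [5, 0, 0, 0]])
#     pad_mask = id_mask(ids)        # (2, 4), False at padding positions
#     no_future = future_mask(4)     # (4, 4) lower-triangular mask
#     dec_mask = combine_mask(pad_mask.unsqueeze(1), no_future)  # (2, 4, 4)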
@@ -129,7 +129,7 @@ def initialize(model: nn.Layer, init: str):
Parameters
----------
model : nn.Layer
Target.
init : str
Method of initialization.
......
@@ -16,6 +16,7 @@
import numpy as np
import paddle
import paddle.nn.functional as F
from paddle import nn
from scipy.signal import kaiser
@@ -56,7 +57,7 @@ def design_prototype_filter(taps=62, cutoff_ratio=0.142, beta=9.0):
return h
class PQMF(nn.Layer):
"""PQMF module.
This module is based on `Near-perfect-reconstruction pseudo-QMF banks`_.
.. _`Near-perfect-reconstruction pseudo-QMF banks`:
@@ -105,7 +106,7 @@ class PQMF(paddle.nn.Layer):
self.updown_filter = updown_filter
self.subbands = subbands
# keep padding info
self.pad_fn = nn.Pad1D(taps // 2, mode='constant', value=0.0)
def analysis(self, x):
"""Analysis with PQMF.
......
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
@@ -65,7 +65,7 @@ class DurationPredictor(nn.Layer):
Offset value to avoid nan in log domain.
"""
super().__init__()
self.offset = offset
self.conv = nn.LayerList()
for idx in range(n_layers):
@@ -155,7 +155,7 @@ class DurationPredictorLoss(nn.Layer):
reduction : str
Reduction type in loss calculation.
"""
super().__init__()
self.criterion = nn.MSELoss(reduction=reduction)
self.offset = offset
......
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from math import exp
import paddle
import paddle.nn.functional as F
from paddle import nn
def gaussian(window_size, sigma):
gauss = paddle.to_tensor([
exp(-(x - window_size // 2)**2 / float(2 * sigma**2))
for x in range(window_size)
])
return gauss / gauss.sum()
def create_window(window_size, channel):
_1D_window = gaussian(window_size, 1.5).unsqueeze(1)
_2D_window = paddle.matmul(_1D_window, paddle.transpose(
_1D_window, [1, 0])).unsqueeze([0, 1])
window = paddle.expand(_2D_window, [channel, 1, window_size, window_size])
return window
def _ssim(img1, img2, window, window_size, channel, size_average=True):
mu1 = F.conv2d(img1, window, padding=window_size // 2, groups=channel)
mu2 = F.conv2d(img2, window, padding=window_size // 2, groups=channel)
mu1_sq = mu1.pow(2)
mu2_sq = mu2.pow(2)
mu1_mu2 = mu1 * mu2
sigma1_sq = F.conv2d(
img1 * img1, window, padding=window_size // 2, groups=channel) - mu1_sq
sigma2_sq = F.conv2d(
img2 * img2, window, padding=window_size // 2, groups=channel) - mu2_sq
sigma12 = F.conv2d(
img1 * img2, window, padding=window_size // 2, groups=channel) - mu1_mu2
C1 = 0.01**2
C2 = 0.03**2
ssim_map = ((2 * mu1_mu2 + C1) * (2 * sigma12 + C2)) \
/ ((mu1_sq + mu2_sq + C1) * (sigma1_sq + sigma2_sq + C2))
if size_average:
return ssim_map.mean()
else:
return ssim_map.mean(1).mean(1).mean(1)
class SSIM(nn.Layer):
def __init__(self, window_size=11, size_average=True):
super().__init__()
self.window_size = window_size
self.size_average = size_average
self.channel = 1
self.window = create_window(window_size, self.channel)
def forward(self, img1, img2):
return _ssim(img1, img2, self.window, self.window_size, self.channel,
self.size_average)
def ssim(img1, img2, window_size=11, size_average=True):
(_, channel, _, _) = img1.shape
window = create_window(window_size, channel)
return _ssim(img1, img2, window, window_size, channel, size_average)
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified from espnet(https://github.com/espnet/espnet)
import paddle
from paddle import nn
from paddle.nn import functional as F
from scipy import signal
def stft(x,
fft_size,
hop_length=None,
win_length=None,
window='hann',
center=True,
pad_mode='reflect'):
"""Perform STFT and convert to magnitude spectrogram.
Parameters
----------
x : Tensor
Input signal tensor (B, T).
fft_size : int
FFT size.
hop_length : int
Number of samples shifted between adjacent frames.
win_length : int
Length of the window.
window : str, optional
Name of window function, see `scipy.signal.get_window` for more
details. Defaults to "hann".
center : bool, optional
Whether to pad `x` on both sides so that the :math:`t`-th frame is
centered at :math:`t \times hop\_length`. Defaults to `True`.
pad_mode : str, optional
Padding pattern used when `center` is `True`. Defaults to "reflect".
Returns
----------
Tensor:
Magnitude spectrogram (B, #frames, fft_size // 2 + 1).
"""
# calculate window
window = signal.get_window(window, win_length, fftbins=True)
window = paddle.to_tensor(window)
x_stft = paddle.signal.stft(
x,
fft_size,
hop_length,
win_length,
window=window,
center=center,
pad_mode=pad_mode)
real = x_stft.real()
imag = x_stft.imag()
return paddle.sqrt(paddle.clip(real**2 + imag**2, min=1e-7)).transpose(
[0, 2, 1])
class SpectralConvergenceLoss(nn.Layer):
"""Spectral convergence loss module."""
def __init__(self):
"""Initilize spectral convergence loss module."""
super().__init__()
def forward(self, x_mag, y_mag):
"""Calculate forward propagation.
Parameters
----------
x_mag : Tensor
Magnitude spectrogram of predicted signal (B, #frames, #freq_bins).
y_mag : Tensor
Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins).
Returns
----------
Tensor
Spectral convergence loss value.
"""
return paddle.norm(
y_mag - x_mag, p="fro") / paddle.clip(
paddle.norm(y_mag, p="fro"), min=1e-10)
class LogSTFTMagnitudeLoss(nn.Layer):
"""Log STFT magnitude loss module."""
def __init__(self, epsilon=1e-7):
"""Initilize los STFT magnitude loss module."""
super().__init__()
self.epsilon = epsilon
def forward(self, x_mag, y_mag):
"""Calculate forward propagation.
Parameters
----------
x_mag : Tensor
Magnitude spectrogram of predicted signal (B, #frames, #freq_bins).
y_mag : Tensor
Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins).
Returns
----------
Tensor
Log STFT magnitude loss value.
"""
return F.l1_loss(
paddle.log(paddle.clip(y_mag, min=self.epsilon)),
paddle.log(paddle.clip(x_mag, min=self.epsilon)))
class STFTLoss(nn.Layer):
"""STFT loss module."""
def __init__(self,
fft_size=1024,
shift_size=120,
win_length=600,
window="hann"):
"""Initialize STFT loss module."""
super().__init__()
self.fft_size = fft_size
self.shift_size = shift_size
self.win_length = win_length
self.window = window
self.spectral_convergence_loss = SpectralConvergenceLoss()
self.log_stft_magnitude_loss = LogSTFTMagnitudeLoss()
def forward(self, x, y):
"""Calculate forward propagation.
Parameters
----------
x : Tensor
Predicted signal (B, T).
y : Tensor
Groundtruth signal (B, T).
Returns
----------
Tensor
Spectral convergence loss value.
Tensor
Log STFT magnitude loss value.
"""
x_mag = stft(x, self.fft_size, self.shift_size, self.win_length,
self.window)
y_mag = stft(y, self.fft_size, self.shift_size, self.win_length,
self.window)
sc_loss = self.spectral_convergence_loss(x_mag, y_mag)
mag_loss = self.log_stft_magnitude_loss(x_mag, y_mag)
return sc_loss, mag_loss
class MultiResolutionSTFTLoss(nn.Layer):
"""Multi resolution STFT loss module."""
def __init__(
self,
fft_sizes=[1024, 2048, 512],
hop_sizes=[120, 240, 50],
win_lengths=[600, 1200, 240],
window="hann", ):
"""Initialize Multi resolution STFT loss module.
Parameters
----------
fft_sizes : list
List of FFT sizes.
hop_sizes : list
List of hop sizes.
win_lengths : list
List of window lengths.
window : str
Window function type.
"""
super().__init__()
assert len(fft_sizes) == len(hop_sizes) == len(win_lengths)
self.stft_losses = nn.LayerList()
for fs, ss, wl in zip(fft_sizes, hop_sizes, win_lengths):
self.stft_losses.append(STFTLoss(fs, ss, wl, window))
def forward(self, x, y):
"""Calculate forward propagation.
Parameters
----------
x : Tensor
Predicted signal (B, T) or (B, #subband, T).
y : Tensor
Groundtruth signal (B, T) or (B, #subband, T).
Returns
----------
Tensor
Multi resolution spectral convergence loss value.
Tensor
Multi resolution log STFT magnitude loss value.
"""
if len(x.shape) == 3:
# (B, C, T) -> (B x C, T)
x = x.reshape([-1, x.shape[2]])
# (B, C, T) -> (B x C, T)
y = y.reshape([-1, y.shape[2]])
sc_loss = 0.0
mag_loss = 0.0
for f in self.stft_losses:
sc_l, mag_l = f(x, y)
sc_loss += sc_l
mag_loss += mag_l
sc_loss /= len(self.stft_losses)
mag_loss /= len(self.stft_losses)
return sc_loss, mag_loss
@@ -19,7 +19,7 @@
import paddle
from paddle import nn
from typeguard import check_argument_types
from paddlespeech.t2s.modules.transformer.attention import MultiHeadedAttention as BaseMultiHeadedAttention
class StyleEncoder(nn.Layer):
@@ -74,7 +74,7 @@ class StyleEncoder(nn.Layer):
gru_units: int=128, ):
"""Initialize global style encoder module."""
assert check_argument_types()
super().__init__()
self.ref_enc = ReferenceEncoder(
idim=idim,
@@ -93,11 +93,15 @@ class StyleEncoder(nn.Layer):
def forward(self, speech: paddle.Tensor) -> paddle.Tensor:
"""Calculate forward propagation.
Parameters
----------
speech : Tensor
Batch of padded target features (B, Lmax, odim).
Returns
----------
Tensor:
Style token embeddings (B, token_dim).
"""
ref_embs = self.ref_enc(speech)
@@ -145,7 +149,7 @@ class ReferenceEncoder(nn.Layer):
gru_units: int=128, ):
"""Initialize reference encoder module."""
assert check_argument_types()
super().__init__()
# check hyperparameters are valid
assert conv_kernel_size % 2 == 1, "kernel size must be odd."
@@ -249,7 +253,7 @@ class StyleTokenLayer(nn.Layer):
dropout_rate: float=0.0, ):
"""Initialize style token layer module."""
assert check_argument_types()
super().__init__()
gst_embs = paddle.randn(shape=[gst_tokens, gst_token_dim // gst_heads])
self.gst_embs = paddle.create_parameter(
......
@@ -73,7 +73,7 @@ class Encoder(nn.Layer):
Dropout rate.
"""
super().__init__()
# store the hyperparameters
self.idim = idim
self.use_residual = use_residual
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle import nn
from paddle.nn import functional as F
from paddlespeech.t2s.modules import attention as attn
__all__ = [
"PositionwiseFFN",
"TransformerEncoderLayer",
"TransformerDecoderLayer",
]
class PositionwiseFFN(nn.Layer):
"""A faithful implementation of Position-wise Feed-Forward Network
in `Attention is All You Need <https://arxiv.org/abs/1706.03762>`_.
It is basically a 2-layer MLP, with ReLU activation and dropout in between.
Parameters
----------
input_size: int
The feature size of the input. It is also the feature size of the
output.
hidden_size: int
The hidden size.
dropout: float
The probability of the Dropout applied to the output of the first
layer, by default 0.
"""
def __init__(self, input_size: int, hidden_size: int, dropout=0.0):
super(PositionwiseFFN, self).__init__()
self.linear1 = nn.Linear(input_size, hidden_size)
self.linear2 = nn.Linear(hidden_size, input_size)
self.dropout = nn.Dropout(dropout)
self.input_size = input_size
self.hidden_size = hidden_size
def forward(self, x):
r"""Forward pass of positionwise feed forward network.
Parameters
----------
x : Tensor [shape=(\*, input_size)]
The input tensor, where ``\*`` means arbitrary shape.
Returns
-------
Tensor [shape=(\*, input_size)]
The output tensor.
"""
l1 = self.dropout(F.relu(self.linear1(x)))
l2 = self.linear2(l1)
return l2
class TransformerEncoderLayer(nn.Layer):
"""A faithful implementation of Transformer encoder layer in
`Attention is All You Need <https://arxiv.org/abs/1706.03762>`_.
Parameters
----------
d_model :int
The feature size of the input. It is also the feature size of the
output.
n_heads : int
The number of heads of self attention (a ``MultiheadAttention``
layer).
d_ffn : int
The hidden size of the positional feed forward network (a
``PositionwiseFFN`` layer).
dropout : float, optional
The probability of the dropout in MultiHeadAttention and
PositionwiseFFN, by default 0.
Notes
------
It uses the PostLN (post layer norm) scheme.
"""
def __init__(self, d_model, n_heads, d_ffn, dropout=0.):
super().__init__()
self.self_mha = attn.MultiheadAttention(d_model, n_heads, dropout)
self.layer_norm1 = nn.LayerNorm([d_model], epsilon=1e-6)
self.ffn = PositionwiseFFN(d_model, d_ffn, dropout)
self.layer_norm2 = nn.LayerNorm([d_model], epsilon=1e-6)
self.dropout = dropout
def forward(self, x, mask):
"""Forward pass of TransformerEncoderLayer.
Parameters
----------
x : Tensor [shape=(batch_size, time_steps, d_model)]
The input.
mask : Tensor
The padding mask. The shape is (batch_size, time_steps,
time_steps) or broadcastable shape.
Returns
-------
x : Tensor [shape=(batch_size, time_steps, d_model)]
The encoded output.
attn_weights : Tensor [shape=(batch_size, n_heads, time_steps, time_steps)]
The attention weights of the self attention.
"""
context_vector, attn_weights = self.self_mha(x, x, x, mask)
x = self.layer_norm1(
F.dropout(x + context_vector, self.dropout, training=self.training))
x = self.layer_norm2(
F.dropout(x + self.ffn(x), self.dropout, training=self.training))
return x, attn_weights
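A usage sketch for TransformerEncoderLayer (illustrative, not part of the commit; it assumes ``attn.MultiheadAttention`` accepts the (batch_size, time_steps, time_steps) mask described in the docstring):

import paddle

layer = TransformerEncoderLayer(d_model=256, n_heads=4, d_ffn=1024, dropout=0.1)
x = paddle.randn([2, 50, 256])
mask = paddle.ones([2, 50, 50])  # all-ones mask: no positions are padded
out, attn_weights = layer(x, mask)  # out: (2, 50, 256); attn_weights: (2, 4, 50, 50)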
class TransformerDecoderLayer(nn.Layer):
"""A faithful implementation of Transformer decoder layer in
`Attention is All You Need <https://arxiv.org/abs/1706.03762>`_.
Parameters
----------
d_model : int
The feature size of the input. It is also the feature size of the
output.
n_heads : int
The number of heads of attentions (``MultiheadAttention``
layers).
d_ffn : int
The hidden size of the positional feed forward network (a
``PositionwiseFFN`` layer).
dropout : float, optional
The probability of the dropout in MultiHeadAttention and
PositionwiseFFN, by default 0.
Notes
------
It uses the PostLN (post layer norm) scheme.
"""
def __init__(self, d_model, n_heads, d_ffn, dropout=0.):
super().__init__()
self.self_mha = attn.MultiheadAttention(d_model, n_heads, dropout)
self.layer_norm1 = nn.LayerNorm([d_model], epsilon=1e-6)
self.cross_mha = attn.MultiheadAttention(d_model, n_heads, dropout)
self.layer_norm2 = nn.LayerNorm([d_model], epsilon=1e-6)
self.ffn = PositionwiseFFN(d_model, d_ffn, dropout)
self.layer_norm3 = nn.LayerNorm([d_model], epsilon=1e-6)
self.dropout = dropout
def forward(self, q, k, v, encoder_mask, decoder_mask):
"""Forward pass of TransformerEncoderLayer.
Parameters
----------
q : Tensor [shape=(batch_size, time_steps_q, d_model)]
The decoder input.
k : Tensor [shape=(batch_size, time_steps_k, d_model)]
The keys.
v : Tensor [shape=(batch_size, time_steps_k, d_model)]
The values.
encoder_mask : Tensor
Encoder padding mask, shape is ``(batch_size, time_steps_k,
time_steps_k)`` or broadcastable shape.
decoder_mask : Tensor
Decoder mask, shape is ``(batch_size, time_steps_q, time_steps_k)``
or broadcastable shape.
Returns
--------
q : Tensor [shape=(batch_size, time_steps_q, d_model)]
The decoder output.
self_attn_weights : Tensor [shape=(batch_size, n_heads, time_steps_q, time_steps_q)]
Decoder self attention.
cross_attn_weights : Tensor [shape=(batch_size, n_heads, time_steps_q, time_steps_k)]
Decoder-encoder cross attention.
"""
context_vector, self_attn_weights = self.self_mha(q, q, q, decoder_mask)
q = self.layer_norm1(
F.dropout(q + context_vector, self.dropout, training=self.training))
context_vector, cross_attn_weights = self.cross_mha(q, k, v,
encoder_mask)
q = self.layer_norm2(
F.dropout(q + context_vector, self.dropout, training=self.training))
q = self.layer_norm3(
F.dropout(q + self.ffn(q), self.dropout, training=self.training))
return q, self_attn_weights, cross_attn_weights
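And a matching sketch for TransformerDecoderLayer (same caveats; the causal mask uses the broadcastable-shape option from the docstring):

import paddle

layer = TransformerDecoderLayer(d_model=256, n_heads=4, d_ffn=1024, dropout=0.1)
q = paddle.randn([2, 30, 256])    # decoder input
enc = paddle.randn([2, 50, 256])  # encoder output, used as keys and values
encoder_mask = paddle.ones([2, 1, 50])                          # broadcastable padding mask
decoder_mask = paddle.tril(paddle.ones([30, 30])).unsqueeze(0)  # causal: attend to self and past
out, self_w, cross_w = layer(q, enc, enc, encoder_mask, decoder_mask)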
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
...@@ -37,7 +37,7 @@ class MultiHeadedAttention(nn.Layer): ...@@ -37,7 +37,7 @@ class MultiHeadedAttention(nn.Layer):
def __init__(self, n_head, n_feat, dropout_rate): def __init__(self, n_head, n_feat, dropout_rate):
"""Construct an MultiHeadedAttention object.""" """Construct an MultiHeadedAttention object."""
super(MultiHeadedAttention, self).__init__() super().__init__()
assert n_feat % n_head == 0 assert n_feat % n_head == 0
# We assume d_v always equals d_k # We assume d_v always equals d_k
self.d_k = n_feat // n_head self.d_k = n_feat // n_head
...@@ -70,7 +70,7 @@ class MultiHeadedAttention(nn.Layer): ...@@ -70,7 +70,7 @@ class MultiHeadedAttention(nn.Layer):
paddle.Tensor paddle.Tensor
Transformed value tensor (#batch, n_head, time2, d_k). Transformed value tensor (#batch, n_head, time2, d_k).
""" """
n_batch = query.shape[0] n_batch = paddle.shape(query)[0]
q = paddle.reshape( q = paddle.reshape(
self.linear_q(query), [n_batch, -1, self.h, self.d_k]) self.linear_q(query), [n_batch, -1, self.h, self.d_k])
...@@ -104,7 +104,7 @@ class MultiHeadedAttention(nn.Layer): ...@@ -104,7 +104,7 @@ class MultiHeadedAttention(nn.Layer):
Transformed value (#batch, time1, d_model) Transformed value (#batch, time1, d_model)
weighted by the attention score (#batch, time1, time2). weighted by the attention score (#batch, time1, time2).
""" """
n_batch = value.shape[0] n_batch = paddle.shape(value)[0]
softmax = paddle.nn.Softmax(axis=-1) softmax = paddle.nn.Softmax(axis=-1)
if mask is not None: if mask is not None:
mask = mask.unsqueeze(1) mask = mask.unsqueeze(1)
...@@ -126,8 +126,8 @@ class MultiHeadedAttention(nn.Layer): ...@@ -126,8 +126,8 @@ class MultiHeadedAttention(nn.Layer):
# (batch, time1, d_model) # (batch, time1, d_model)
x = (paddle.reshape( x = (paddle.reshape(
x.transpose((0, 2, 1, 3)), (n_batch, -1, self.h * self.d_k))) x.transpose((0, 2, 1, 3)), (n_batch, -1, self.h * self.d_k)))
# (batch, time1, d_model)
return self.linear_out(x) # (batch, time1, d_model) return self.linear_out(x)
def forward(self, query, key, value, mask=None): def forward(self, query, key, value, mask=None):
"""Compute scaled dot product attention. """Compute scaled dot product attention.
...@@ -153,3 +153,113 @@ class MultiHeadedAttention(nn.Layer): ...@@ -153,3 +153,113 @@ class MultiHeadedAttention(nn.Layer):
(0, 1, 3, 2))) / math.sqrt(self.d_k) (0, 1, 3, 2))) / math.sqrt(self.d_k)
return self.forward_attention(v, scores, mask) return self.forward_attention(v, scores, mask)
class RelPositionMultiHeadedAttention(MultiHeadedAttention):
"""Multi-Head Attention layer with relative position encoding (new implementation).
Details can be found in https://github.com/espnet/espnet/pull/2816.
Paper: https://arxiv.org/abs/1901.02860
Parameters
----------
n_head : int
The number of heads.
n_feat : int
The number of features.
dropout_rate : float
Dropout rate.
zero_triu : bool
Whether to zero the upper triangular part of attention matrix.
"""
def __init__(self, n_head, n_feat, dropout_rate, zero_triu=False):
"""Construct an RelPositionMultiHeadedAttention object."""
super().__init__(n_head, n_feat, dropout_rate)
self.zero_triu = zero_triu
# linear transformation for positional encoding
self.linear_pos = nn.Linear(n_feat, n_feat, bias_attr=False)
# these two learnable biases are used in matrix c and matrix d
# as described in https://arxiv.org/abs/1901.02860 Section 3.3
self.pos_bias_u = paddle.create_parameter(
shape=(self.h, self.d_k),
dtype='float32',
default_initializer=paddle.nn.initializer.XavierUniform())
self.pos_bias_v = paddle.create_parameter(
shape=(self.h, self.d_k),
dtype='float32',
default_initializer=paddle.nn.initializer.XavierUniform())
def rel_shift(self, x):
"""Compute relative positional encoding.
Parameters
----------
x : paddle.Tensor
Input tensor (batch, head, time1, 2*time1-1).
time1 means the length of query vector.
Returns
----------
paddle.Tensor
Output tensor.
"""
b, h, t1, t2 = paddle.shape(x)
zero_pad = paddle.zeros((b, h, t1, 1))
x_padded = paddle.concat([zero_pad, x], axis=-1)
x_padded = x_padded.reshape([b, h, t2 + 1, t1])
# only keep the positions from 0 to time2
x = x_padded[:, :, 1:].reshape([b, h, t1, t2])[:, :, :, :t2 // 2 + 1]
if self.zero_triu:
ones = paddle.ones((t1, t2))
x = x * paddle.tril(ones, t2 - 1)[None, None, :, :]
return x
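The pad-reshape-slice sequence above implements the standard relative-shift trick: output row i ends up holding x[..., t1-1-i : 2*t1-1-i], so each query position is aligned with the relative offsets of its keys. A toy check (illustrative sizes, not part of the commit):

import paddle

attn_rel = RelPositionMultiHeadedAttention(n_head=1, n_feat=4, dropout_rate=0.0)
t1 = 3
# one score per relative offset, shared by all t1 query positions
x = paddle.arange(2 * t1 - 1, dtype='float32').tile([1, 1, t1, 1])  # (1, 1, 3, 5)
y = attn_rel.rel_shift(x)
# y is (1, 1, 3, 3): [[2, 3, 4], [1, 2, 3], [0, 1, 2]]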
def forward(self, query, key, value, pos_emb, mask):
"""Compute 'Scaled Dot Product Attention' with rel. positional encoding.
Parameters
----------
query : paddle.Tensor
Query tensor (#batch, time1, size).
key : paddle.Tensor
Key tensor (#batch, time2, size).
value : paddle.Tensor
Value tensor (#batch, time2, size).
pos_emb : paddle.Tensor
Positional embedding tensor
(#batch, 2*time1-1, size).
mask : paddle.Tensor
Mask tensor (#batch, 1, time2) or
(#batch, time1, time2).
Returns
----------
paddle.Tensor
Output tensor (#batch, time1, d_model).
"""
q, k, v = self.forward_qkv(query, key, value)
# (batch, time1, head, d_k)
q = q.transpose([0, 2, 1, 3])
n_batch_pos = paddle.shape(pos_emb)[0]
p = self.linear_pos(pos_emb).reshape(
[n_batch_pos, -1, self.h, self.d_k])
# (batch, head, 2*time1-1, d_k)
p = p.transpose([0, 2, 1, 3])
# (batch, head, time1, d_k)
q_with_bias_u = (q + self.pos_bias_u).transpose([0, 2, 1, 3])
# (batch, head, time1, d_k)
q_with_bias_v = (q + self.pos_bias_v).transpose([0, 2, 1, 3])
# compute attention score
# first compute matrix a and matrix c
# as described in https://arxiv.org/abs/1901.02860 Section 3.3
# (batch, head, time1, time2)
matrix_ac = paddle.matmul(q_with_bias_u, k.transpose([0, 1, 3, 2]))
# compute matrix b and matrix d
# (batch, head, time1, 2*time1-1)
matrix_bd = paddle.matmul(q_with_bias_v, p.transpose([0, 1, 3, 2]))
matrix_bd = self.rel_shift(matrix_bd)
# (batch, head, time1, time2)
scores = (matrix_ac + matrix_bd) / math.sqrt(self.d_k)
return self.forward_attention(v, scores, mask)
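A usage sketch for the full forward pass (illustrative; the random pos_emb stands in for the output of RelPositionalEncoding, which appears later in this commit):

import paddle

mha = RelPositionMultiHeadedAttention(n_head=4, n_feat=256, dropout_rate=0.1)
x = paddle.randn([2, 50, 256])
pos_emb = paddle.randn([1, 2 * 50 - 1, 256])  # normally from RelPositionalEncoding
mask = paddle.ones([2, 1, 50])
out = mha(x, x, x, pos_emb, mask)  # (2, 50, 256)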
...@@ -23,14 +23,14 @@ import paddle ...@@ -23,14 +23,14 @@ import paddle
import paddle.nn.functional as F import paddle.nn.functional as F
from paddle import nn from paddle import nn
from paddlespeech.t2s.modules.fastspeech2_transformer.attention import MultiHeadedAttention
from paddlespeech.t2s.modules.fastspeech2_transformer.decoder_layer import DecoderLayer
from paddlespeech.t2s.modules.fastspeech2_transformer.embedding import PositionalEncoding
from paddlespeech.t2s.modules.fastspeech2_transformer.lightconv import LightweightConvolution
from paddlespeech.t2s.modules.fastspeech2_transformer.mask import subsequent_mask
from paddlespeech.t2s.modules.fastspeech2_transformer.positionwise_feed_forward import PositionwiseFeedForward
from paddlespeech.t2s.modules.fastspeech2_transformer.repeat import repeat
from paddlespeech.t2s.modules.layer_norm import LayerNorm from paddlespeech.t2s.modules.layer_norm import LayerNorm
from paddlespeech.t2s.modules.transformer.attention import MultiHeadedAttention
from paddlespeech.t2s.modules.transformer.decoder_layer import DecoderLayer
from paddlespeech.t2s.modules.transformer.embedding import PositionalEncoding
from paddlespeech.t2s.modules.transformer.lightconv import LightweightConvolution
from paddlespeech.t2s.modules.transformer.mask import subsequent_mask
from paddlespeech.t2s.modules.transformer.positionwise_feed_forward import PositionwiseFeedForward
from paddlespeech.t2s.modules.transformer.repeat import repeat
class Decoder(nn.Layer): class Decoder(nn.Layer):
...@@ -67,11 +67,11 @@ class Decoder(nn.Layer): ...@@ -67,11 +67,11 @@ class Decoder(nn.Layer):
Dropout rate in self-attention. Dropout rate in self-attention.
src_attention_dropout_rate : float src_attention_dropout_rate : float
Dropout rate in source-attention. Dropout rate in source-attention.
input_layer : (Union[str, paddle.nn.Layer]) input_layer : (Union[str, nn.Layer])
Input layer type. Input layer type.
use_output_layer : bool use_output_layer : bool
Whether to use output layer. Whether to use output layer.
pos_enc_class : paddle.nn.Layer pos_enc_class : nn.Layer
Positional encoding module class. Positional encoding module class.
`PositionalEncoding `or `ScaledPositionalEncoding` `PositionalEncoding `or `ScaledPositionalEncoding`
normalize_before : bool normalize_before : bool
...@@ -122,8 +122,7 @@ class Decoder(nn.Layer): ...@@ -122,8 +122,7 @@ class Decoder(nn.Layer):
input_layer, input_layer,
pos_enc_class(attention_dim, positional_dropout_rate)) pos_enc_class(attention_dim, positional_dropout_rate))
else: else:
raise NotImplementedError( raise NotImplementedError("only `embed` or nn.Layer is supported.")
"only `embed` or paddle.nn.Layer is supported.")
self.normalize_before = normalize_before self.normalize_before = normalize_before
# self-attention module definition # self-attention module definition
......
...@@ -26,13 +26,13 @@ class DecoderLayer(nn.Layer): ...@@ -26,13 +26,13 @@ class DecoderLayer(nn.Layer):
---------- ----------
size : int size : int
Input dimension. Input dimension.
self_attn : paddle.nn.Layer self_attn : nn.Layer
Self-attention module instance. Self-attention module instance.
`MultiHeadedAttention` instance can be used as the argument. `MultiHeadedAttention` instance can be used as the argument.
src_attn : paddle.nn.Layer src_attn : nn.Layer
Self-attention module instance. Self-attention module instance.
`MultiHeadedAttention` instance can be used as the argument. `MultiHeadedAttention` instance can be used as the argument.
feed_forward : paddle.nn.Layer feed_forward : nn.Layer
Feed-forward module instance. Feed-forward module instance.
`PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance can be used as the argument. `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance can be used as the argument.
dropout_rate : float dropout_rate : float
......
...@@ -43,7 +43,7 @@ class PositionalEncoding(nn.Layer): ...@@ -43,7 +43,7 @@ class PositionalEncoding(nn.Layer):
dtype="float32", dtype="float32",
reverse=False): reverse=False):
"""Construct an PositionalEncoding object.""" """Construct an PositionalEncoding object."""
super(PositionalEncoding, self).__init__() super().__init__()
self.d_model = d_model self.d_model = d_model
self.reverse = reverse self.reverse = reverse
self.xscale = math.sqrt(self.d_model) self.xscale = math.sqrt(self.d_model)
...@@ -96,14 +96,14 @@ class ScaledPositionalEncoding(PositionalEncoding): ...@@ -96,14 +96,14 @@ class ScaledPositionalEncoding(PositionalEncoding):
Parameters Parameters
---------- ----------
d_model : int d_model : int
Embedding dimension. Embedding dimension.
dropout_rate : float dropout_rate : float
Dropout rate. Dropout rate.
max_len : int max_len : int
Maximum input length. Maximum input length.
dtype : str dtype : str
dtype of param dtype of param
""" """
def __init__(self, d_model, dropout_rate, max_len=5000, dtype="float32"): def __init__(self, d_model, dropout_rate, max_len=5000, dtype="float32"):
...@@ -117,7 +117,7 @@ class ScaledPositionalEncoding(PositionalEncoding): ...@@ -117,7 +117,7 @@ class ScaledPositionalEncoding(PositionalEncoding):
self.alpha = paddle.create_parameter( self.alpha = paddle.create_parameter(
shape=x.shape, shape=x.shape,
dtype=self.dtype, dtype=self.dtype,
default_initializer=paddle.nn.initializer.Assign(x)) default_initializer=nn.initializer.Assign(x))
def reset_parameters(self): def reset_parameters(self):
"""Reset parameters.""" """Reset parameters."""
...@@ -128,14 +128,87 @@ class ScaledPositionalEncoding(PositionalEncoding): ...@@ -128,14 +128,87 @@ class ScaledPositionalEncoding(PositionalEncoding):
Parameters Parameters
---------- ----------
x : paddle.Tensor x : paddle.Tensor
Input tensor (batch, time, `*`). Input tensor (batch, time, `*`).
Returns Returns
---------- ----------
paddle.Tensor paddle.Tensor
Encoded tensor (batch, time, `*`). Encoded tensor (batch, time, `*`).
""" """
self.extend_pe(x) self.extend_pe(x)
T = paddle.shape(x)[1] T = paddle.shape(x)[1]
x = x + self.alpha * self.pe[:, :T] x = x + self.alpha * self.pe[:, :T]
return self.dropout(x) return self.dropout(x)
class RelPositionalEncoding(nn.Layer):
"""Relative positional encoding module (new implementation).
Details can be found in https://github.com/espnet/espnet/pull/2816.
See : Appendix B in https://arxiv.org/abs/1901.02860
Parameters
----------
d_model : int
Embedding dimension.
dropout_rate : float
Dropout rate.
max_len : int
Maximum input length.
"""
def __init__(self, d_model, dropout_rate, max_len=5000, dtype="float32"):
"""Construct an PositionalEncoding object."""
super().__init__()
self.d_model = d_model
self.xscale = math.sqrt(self.d_model)
self.dropout = nn.Dropout(p=dropout_rate)
self.pe = None
self.dtype = dtype
self.extend_pe(paddle.expand(paddle.zeros([1]), (1, max_len)))
def extend_pe(self, x):
"""Reset the positional encodings."""
if self.pe is not None:
# self.pe contains both positive and negative parts
# the length of self.pe is 2 * input_len - 1
if paddle.shape(self.pe)[1] >= paddle.shape(x)[1] * 2 - 1:
return
# Suppose `i` is the position of the query vector and `j` is the
# position of the key vector. We use positive relative positions when keys
# are to the left (i>j) and negative relative positions otherwise (i<j).
x_shape = paddle.shape(x)
pe_positive = paddle.zeros([x_shape[1], self.d_model])
pe_negative = paddle.zeros([x_shape[1], self.d_model])
position = paddle.arange(0, x_shape[1], dtype=self.dtype).unsqueeze(1)
div_term = paddle.exp(
paddle.arange(0, self.d_model, 2, dtype=self.dtype) *
-(math.log(10000.0) / self.d_model))
pe_positive[:, 0::2] = paddle.sin(position * div_term)
pe_positive[:, 1::2] = paddle.cos(position * div_term)
pe_negative[:, 0::2] = paddle.sin(-1 * position * div_term)
pe_negative[:, 1::2] = paddle.cos(-1 * position * div_term)
# Reverse the order of positive indices and concat both positive and
# negative indices. This is used to support the shifting trick
# as in https://arxiv.org/abs/1901.02860
pe_positive = paddle.flip(pe_positive, [0]).unsqueeze(0)
pe_negative = pe_negative[1:].unsqueeze(0)
pe = paddle.concat([pe_positive, pe_negative], axis=1)
self.pe = pe
def forward(self, x: paddle.Tensor):
"""Add positional encoding.
Parameters
----------
x : paddle.Tensor
Input tensor (batch, time, `*`).
Returns
----------
paddle.Tensor
Encoded tensor (batch, time, `*`).
"""
self.extend_pe(x)
x = x * self.xscale
T = paddle.shape(x)[1]
pe_size = paddle.shape(self.pe)
pos_emb = self.pe[:, pe_size[1] // 2 - T + 1:pe_size[1] // 2 + T, ]
return self.dropout(x), self.dropout(pos_emb)
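A usage sketch (illustrative, not part of the commit); the pos_emb output is exactly what RelPositionMultiHeadedAttention.forward expects:

import paddle

pos_enc = RelPositionalEncoding(d_model=256, dropout_rate=0.1)
x = paddle.randn([2, 50, 256])
x_scaled, pos_emb = pos_enc(x)
# x_scaled: (2, 50, 256), scaled by sqrt(d_model); pos_emb: (1, 2*50-1, 256)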
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified from espnet(https://github.com/espnet/espnet)
from typing import List
from typing import Union
from paddle import nn
from paddlespeech.t2s.modules.activation import get_activation
from paddlespeech.t2s.modules.conformer.convolution import ConvolutionModule
from paddlespeech.t2s.modules.conformer.encoder_layer import EncoderLayer as ConformerEncoderLayer
from paddlespeech.t2s.modules.layer_norm import LayerNorm
from paddlespeech.t2s.modules.transformer.attention import MultiHeadedAttention
from paddlespeech.t2s.modules.transformer.attention import RelPositionMultiHeadedAttention
from paddlespeech.t2s.modules.transformer.embedding import PositionalEncoding
from paddlespeech.t2s.modules.transformer.embedding import RelPositionalEncoding
from paddlespeech.t2s.modules.transformer.embedding import ScaledPositionalEncoding
from paddlespeech.t2s.modules.transformer.encoder_layer import EncoderLayer
from paddlespeech.t2s.modules.transformer.multi_layer_conv import Conv1dLinear
from paddlespeech.t2s.modules.transformer.multi_layer_conv import MultiLayeredConv1d
from paddlespeech.t2s.modules.transformer.positionwise_feed_forward import PositionwiseFeedForward
from paddlespeech.t2s.modules.transformer.repeat import repeat
from paddlespeech.t2s.modules.transformer.subsampling import Conv2dSubsampling
class BaseEncoder(nn.Layer):
"""Base Encoder module.
Parameters
----------
idim : int
Input dimension.
attention_dim : int
Dimension of attention.
attention_heads : int
The number of heads of multi head attention.
linear_units : int
The number of units of position-wise feed forward.
num_blocks : int
The number of decoder blocks.
dropout_rate : float
Dropout rate.
positional_dropout_rate : float
Dropout rate after adding positional encoding.
attention_dropout_rate : float
Dropout rate in attention.
input_layer : Union[str, nn.Layer]
Input layer type.
normalize_before : bool
Whether to use layer_norm before the first block.
concat_after : bool
Whether to concat attention layer's input and output.
if True, additional linear will be applied.
i.e. x -> x + linear(concat(x, att(x)))
if False, no additional linear will be applied. i.e. x -> x + att(x)
positionwise_layer_type : str
"linear", "conv1d", or "conv1d-linear".
positionwise_conv_kernel_size : int
Kernel size of positionwise conv1d layer.
macaron_style : bool
Whether to use macaron style for positionwise layer.
pos_enc_layer_type : str
Encoder positional encoding layer type.
selfattention_layer_type : str
Encoder attention layer type.
activation_type : str
Encoder activation function type.
use_cnn_module : bool
Whether to use convolution module.
zero_triu : bool
Whether to zero the upper triangular part of attention matrix.
cnn_module_kernel : int
Kernel size of convolution module.
padding_idx : int
Padding idx for input_layer=embed.
stochastic_depth_rate : float
Maximum probability to skip the encoder layer.
intermediate_layers : Union[List[int], None]
Indices of intermediate CTC layers (indices start from 1).
If not None, intermediate outputs are returned, which changes the
return type signature.
encoder_type: str
"transformer", or "conformer".
"""
def __init__(self,
idim: int,
attention_dim: int=256,
attention_heads: int=4,
linear_units: int=2048,
num_blocks: int=6,
dropout_rate: float=0.1,
positional_dropout_rate: float=0.1,
attention_dropout_rate: float=0.0,
input_layer: str="conv2d",
normalize_before: bool=True,
concat_after: bool=False,
positionwise_layer_type: str="linear",
positionwise_conv_kernel_size: int=1,
macaron_style: bool=False,
pos_enc_layer_type: str="abs_pos",
selfattention_layer_type: str="selfattn",
activation_type: str="swish",
use_cnn_module: bool=False,
zero_triu: bool=False,
cnn_module_kernel: int=31,
padding_idx: int=-1,
stochastic_depth_rate: float=0.0,
intermediate_layers: Union[List[int], None]=None,
encoder_type: str="transformer"):
"""Construct an Base Encoder object."""
super().__init__()
activation = get_activation(activation_type)
pos_enc_class = self.get_pos_enc_class(pos_enc_layer_type,
selfattention_layer_type)
self.encoder_type = encoder_type
self.conv_subsampling_factor = 1
self.embed = self.get_embed(
idim=idim,
input_layer=input_layer,
attention_dim=attention_dim,
pos_enc_class=pos_enc_class,
dropout_rate=dropout_rate,
positional_dropout_rate=positional_dropout_rate,
padding_idx=padding_idx)
self.normalize_before = normalize_before
# self-attention module definition
encoder_selfattn_layer, encoder_selfattn_layer_args = self.get_encoder_selfattn_layer(
selfattention_layer_type=selfattention_layer_type,
attention_heads=attention_heads,
attention_dim=attention_dim,
attention_dropout_rate=attention_dropout_rate,
zero_triu=zero_triu,
pos_enc_layer_type=pos_enc_layer_type)
# feed-forward module definition
positionwise_layer, positionwise_layer_args = self.get_positionwise_layer(
positionwise_layer_type, attention_dim, linear_units, dropout_rate,
positionwise_conv_kernel_size, activation)
# convolution module definition
convolution_layer = ConvolutionModule
convolution_layer_args = (attention_dim, cnn_module_kernel, activation)
if self.encoder_type == "transformer":
self.encoders = repeat(
num_blocks,
lambda lnum: EncoderLayer(
attention_dim,
encoder_selfattn_layer(*encoder_selfattn_layer_args),
positionwise_layer(*positionwise_layer_args),
dropout_rate,
normalize_before,
concat_after, ), )
elif self.encoder_type == "conformer":
self.encoders = repeat(
num_blocks,
lambda lnum: ConformerEncoderLayer(
attention_dim,
encoder_selfattn_layer(*encoder_selfattn_layer_args),
positionwise_layer(*positionwise_layer_args),
positionwise_layer(*positionwise_layer_args) if macaron_style else None,
convolution_layer(*convolution_layer_args) if use_cnn_module else None,
dropout_rate,
normalize_before,
concat_after,
stochastic_depth_rate * float(1 + lnum) / num_blocks, ), )
self.intermediate_layers = intermediate_layers
else:
raise NotImplementedError("Support only linear or conv1d.")
if self.normalize_before:
self.after_norm = LayerNorm(attention_dim)
def get_positionwise_layer(self,
positionwise_layer_type: str="linear",
attention_dim: int=256,
linear_units: int=2048,
dropout_rate: float=0.1,
positionwise_conv_kernel_size: int=1,
activation: nn.Layer=nn.ReLU()):
"""Define positionwise layer."""
if positionwise_layer_type == "linear":
positionwise_layer = PositionwiseFeedForward
positionwise_layer_args = (attention_dim, linear_units,
dropout_rate, activation)
elif positionwise_layer_type == "conv1d":
positionwise_layer = MultiLayeredConv1d
positionwise_layer_args = (attention_dim, linear_units,
positionwise_conv_kernel_size,
dropout_rate, )
elif positionwise_layer_type == "conv1d-linear":
positionwise_layer = Conv1dLinear
positionwise_layer_args = (attention_dim, linear_units,
positionwise_conv_kernel_size,
dropout_rate, )
else:
raise NotImplementedError("Support only linear or conv1d.")
return positionwise_layer, positionwise_layer_args
def get_encoder_selfattn_layer(self,
selfattention_layer_type: str="selfattn",
attention_heads: int=4,
attention_dim: int=256,
attention_dropout_rate: float=0.0,
zero_triu: bool=False,
pos_enc_layer_type: str="abs_pos"):
if selfattention_layer_type == "selfattn":
encoder_selfattn_layer = MultiHeadedAttention
encoder_selfattn_layer_args = (attention_heads, attention_dim,
attention_dropout_rate, )
elif selfattention_layer_type == "rel_selfattn":
assert pos_enc_layer_type == "rel_pos"
encoder_selfattn_layer = RelPositionMultiHeadedAttention
encoder_selfattn_layer_args = (attention_heads, attention_dim,
attention_dropout_rate, zero_triu, )
else:
raise ValueError("unknown encoder_attn_layer: " +
selfattention_layer_type)
return encoder_selfattn_layer, encoder_selfattn_layer_args
def get_pos_enc_class(self,
pos_enc_layer_type: str="abs_pos",
selfattention_layer_type: str="selfattn"):
if pos_enc_layer_type == "abs_pos":
pos_enc_class = PositionalEncoding
elif pos_enc_layer_type == "scaled_abs_pos":
pos_enc_class = ScaledPositionalEncoding
elif pos_enc_layer_type == "rel_pos":
assert selfattention_layer_type == "rel_selfattn"
pos_enc_class = RelPositionalEncoding
else:
raise ValueError("unknown pos_enc_layer: " + pos_enc_layer_type)
return pos_enc_class
def get_embed(self,
idim,
input_layer="conv2d",
attention_dim: int=256,
pos_enc_class=PositionalEncoding,
dropout_rate: int=0.1,
positional_dropout_rate: int=0.1,
padding_idx: int=-1):
if input_layer == "linear":
embed = nn.Sequential(
nn.Linear(idim, attention_dim),
nn.LayerNorm(attention_dim),
nn.Dropout(dropout_rate),
nn.ReLU(),
pos_enc_class(attention_dim, positional_dropout_rate), )
elif input_layer == "conv2d":
embed = Conv2dSubsampling(
idim,
attention_dim,
dropout_rate,
pos_enc_class(attention_dim, positional_dropout_rate), )
self.conv_subsampling_factor = 4
elif input_layer == "embed":
embed = nn.Sequential(
nn.Embedding(idim, attention_dim, padding_idx=padding_idx),
pos_enc_class(attention_dim, positional_dropout_rate), )
elif isinstance(input_layer, nn.Layer):
embed = nn.Sequential(
input_layer,
pos_enc_class(attention_dim, positional_dropout_rate), )
elif input_layer is None:
embed = nn.Sequential(
pos_enc_class(attention_dim, positional_dropout_rate))
else:
raise ValueError("unknown input_layer: " + input_layer)
return embed
def forward(self, xs, masks):
"""Encode input sequence.
Parameters
----------
xs : paddle.Tensor
Input tensor (#batch, time, idim).
masks : paddle.Tensor
Mask tensor (#batch, 1, time).
Returns
----------
paddle.Tensor
Output tensor (#batch, time, attention_dim).
paddle.Tensor
Mask tensor (#batch, 1, time).
"""
xs = self.embed(xs)
xs, masks = self.encoders(xs, masks)
if self.normalize_before:
xs = self.after_norm(xs)
return xs, masks
class TransformerEncoder(BaseEncoder):
"""Transformer encoder module.
Parameters
----------
idim : int
Input dimension.
attention_dim : int
Dimension of attention.
attention_heads : int
The number of heads of multi head attention.
linear_units : int
The number of units of position-wise feed forward.
num_blocks : int
The number of decoder blocks.
dropout_rate : float
Dropout rate.
positional_dropout_rate : float
Dropout rate after adding positional encoding.
attention_dropout_rate : float
Dropout rate in attention.
input_layer : Union[str, paddle.nn.Layer]
Input layer type.
pos_enc_layer_type : str
Encoder positional encoding layer type.
normalize_before : bool
Whether to use layer_norm before the first block.
concat_after : bool
Whether to concat attention layer's input and output.
if True, additional linear will be applied.
i.e. x -> x + linear(concat(x, att(x)))
if False, no additional linear will be applied. i.e. x -> x + att(x)
positionwise_layer_type : str
"linear", "conv1d", or "conv1d-linear".
positionwise_conv_kernel_size : int
Kernel size of positionwise conv1d layer.
selfattention_layer_type : str
Encoder attention layer type.
activation_type : str
Encoder activation function type.
padding_idx : int
Padding idx for input_layer=embed.
"""
def __init__(
self,
idim,
attention_dim: int=256,
attention_heads: int=4,
linear_units: int=2048,
num_blocks: int=6,
dropout_rate: float=0.1,
positional_dropout_rate: float=0.1,
attention_dropout_rate: float=0.0,
input_layer: str="conv2d",
pos_enc_layer_type: str="abs_pos",
normalize_before: bool=True,
concat_after: bool=False,
positionwise_layer_type: str="linear",
positionwise_conv_kernel_size: int=1,
selfattention_layer_type: str="selfattn",
activation_type: str="relu",
padding_idx: int=-1, ):
"""Construct an Transformer Encoder object."""
super().__init__(
idim,
attention_dim=attention_dim,
attention_heads=attention_heads,
linear_units=linear_units,
num_blocks=num_blocks,
dropout_rate=dropout_rate,
positional_dropout_rate=positional_dropout_rate,
attention_dropout_rate=attention_dropout_rate,
input_layer=input_layer,
pos_enc_layer_type=pos_enc_layer_type,
normalize_before=normalize_before,
concat_after=concat_after,
positionwise_layer_type=positionwise_layer_type,
positionwise_conv_kernel_size=positionwise_conv_kernel_size,
selfattention_layer_type=selfattention_layer_type,
activation_type=activation_type,
padding_idx=padding_idx,
encoder_type="transformer")
def forward(self, xs, masks):
"""Encode input sequence.
Parameters
----------
xs : paddle.Tensor
Input tensor (#batch, time, idim).
masks : paddle.Tensor
Mask tensor (#batch, 1, time).
Returns
----------
paddle.Tensor
Output tensor (#batch, time, attention_dim).
paddle.Tensor
Mask tensor (#batch, 1, time).
"""
xs = self.embed(xs)
xs, masks = self.encoders(xs, masks)
if self.normalize_before:
xs = self.after_norm(xs)
return xs, masks
def forward_one_step(self, xs, masks, cache=None):
"""Encode input frame.
Parameters
----------
xs : paddle.Tensor
Input tensor.
masks : paddle.Tensor
Mask tensor.
cache : List[paddle.Tensor]
List of cache tensors.
Returns
----------
paddle.Tensor
Output tensor.
paddle.Tensor
Mask tensor.
List[paddle.Tensor]
List of new cache tensors.
"""
xs = self.embed(xs)
if cache is None:
cache = [None for _ in range(len(self.encoders))]
new_cache = []
for c, e in zip(cache, self.encoders):
xs, masks = e(xs, masks, cache=c)
new_cache.append(xs)
if self.normalize_before:
xs = self.after_norm(xs)
return xs, masks, new_cache
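A usage sketch for TransformerEncoder (illustrative, not part of the commit). The "linear" input layer is chosen because TransformerEncoder.forward calls self.embed(xs) without the mask; the mask-aware "conv2d" path is exercised by ConformerEncoder below:

import paddle

encoder = TransformerEncoder(idim=80, attention_dim=256, attention_heads=4,
                             linear_units=1024, num_blocks=3, input_layer="linear")
xs = paddle.randn([2, 100, 80])
masks = paddle.ones([2, 1, 100], dtype='bool')
out, out_masks = encoder(xs, masks)  # out: (2, 100, 256)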
class ConformerEncoder(BaseEncoder):
"""Conformer encoder module.
Parameters
----------
idim : int
Input dimension.
attention_dim : int
Dimension of attention.
attention_heads : int
The number of heads of multi head attention.
linear_units : int
The number of units of position-wise feed forward.
num_blocks : int
The number of decoder blocks.
dropout_rate : float
Dropout rate.
positional_dropout_rate : float
Dropout rate after adding positional encoding.
attention_dropout_rate : float
Dropout rate in attention.
input_layer : Union[str, nn.Layer]
Input layer type.
normalize_before : bool
Whether to use layer_norm before the first block.
concat_after : bool
Whether to concat attention layer's input and output.
if True, additional linear will be applied.
i.e. x -> x + linear(concat(x, att(x)))
if False, no additional linear will be applied. i.e. x -> x + att(x)
positionwise_layer_type : str
"linear", "conv1d", or "conv1d-linear".
positionwise_conv_kernel_size : int
Kernel size of positionwise conv1d layer.
macaron_style : bool
Whether to use macaron style for positionwise layer.
pos_enc_layer_type : str
Encoder positional encoding layer type.
selfattention_layer_type : str
Encoder attention layer type.
activation_type : str
Encoder activation function type.
use_cnn_module : bool
Whether to use convolution module.
zero_triu : bool
Whether to zero the upper triangular part of attention matrix.
cnn_module_kernel : int
Kernel size of convolution module.
padding_idx : int
Padding idx for input_layer=embed.
stochastic_depth_rate : float
Maximum probability to skip the encoder layer.
intermediate_layers : Union[List[int], None]
Indices of intermediate CTC layers (indices start from 1).
If not None, intermediate outputs are returned, which changes the
return type signature.
"""
def __init__(
self,
idim: int,
attention_dim: int=256,
attention_heads: int=4,
linear_units: int=2048,
num_blocks: int=6,
dropout_rate: float=0.1,
positional_dropout_rate: float=0.1,
attention_dropout_rate: float=0.0,
input_layer: str="conv2d",
normalize_before: bool=True,
concat_after: bool=False,
positionwise_layer_type: str="linear",
positionwise_conv_kernel_size: int=1,
macaron_style: bool=False,
pos_enc_layer_type: str="rel_pos",
selfattention_layer_type: str="rel_selfattn",
activation_type: str="swish",
use_cnn_module: bool=False,
zero_triu: bool=False,
cnn_module_kernel: int=31,
padding_idx: int=-1,
stochastic_depth_rate: float=0.0,
intermediate_layers: Union[List[int], None]=None, ):
"""Construct an Conformer Encoder object."""
super().__init__(
idim=idim,
attention_dim=attention_dim,
attention_heads=attention_heads,
linear_units=linear_units,
num_blocks=num_blocks,
dropout_rate=dropout_rate,
positional_dropout_rate=positional_dropout_rate,
attention_dropout_rate=attention_dropout_rate,
input_layer=input_layer,
normalize_before=normalize_before,
concat_after=concat_after,
positionwise_layer_type=positionwise_layer_type,
positionwise_conv_kernel_size=positionwise_conv_kernel_size,
macaron_style=macaron_style,
pos_enc_layer_type=pos_enc_layer_type,
selfattention_layer_type=selfattention_layer_type,
activation_type=activation_type,
use_cnn_module=use_cnn_module,
zero_triu=zero_triu,
cnn_module_kernel=cnn_module_kernel,
padding_idx=padding_idx,
stochastic_depth_rate=stochastic_depth_rate,
intermediate_layers=intermediate_layers,
encoder_type="conformer")
def forward(self, xs, masks):
"""Encode input sequence.
Parameters
----------
xs : paddle.Tensor
Input tensor (#batch, time, idim).
masks : paddle.Tensor
Mask tensor (#batch, 1, time).
Returns
----------
paddle.Tensor
Output tensor (#batch, time, attention_dim).
paddle.Tensor
Mask tensor (#batch, 1, time).
"""
if isinstance(self.embed, Conv2dSubsampling):
xs, masks = self.embed(xs, masks)
else:
xs = self.embed(xs)
if self.intermediate_layers is None:
xs, masks = self.encoders(xs, masks)
else:
intermediate_outputs = []
for layer_idx, encoder_layer in enumerate(self.encoders):
xs, masks = encoder_layer(xs, masks)
if (self.intermediate_layers is not None and
layer_idx + 1 in self.intermediate_layers):
# intermediate branches also require normalization.
encoder_output = xs
if isinstance(encoder_output, tuple):
encoder_output = encoder_output[0]
if self.normalize_before:
encoder_output = self.after_norm(encoder_output)
intermediate_outputs.append(encoder_output)
if isinstance(xs, tuple):
xs = xs[0]
if self.normalize_before:
xs = self.after_norm(xs)
if self.intermediate_layers is not None:
return xs, masks, intermediate_outputs
return xs, masks
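A matching sketch for ConformerEncoder (illustrative hyperparameters, not part of the commit; the defaults already pair rel_pos encoding with rel_selfattn):

import paddle

encoder = ConformerEncoder(idim=80, attention_dim=256, attention_heads=4,
                           linear_units=1024, num_blocks=2,
                           macaron_style=True, use_cnn_module=True)
xs = paddle.randn([2, 100, 80])
masks = paddle.ones([2, 1, 100], dtype='bool')
out, out_masks = encoder(xs, masks)
# the default "conv2d" input layer subsamples time: ((100 - 1) // 2 - 1) // 2 = 24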
...@@ -24,10 +24,10 @@ class EncoderLayer(nn.Layer): ...@@ -24,10 +24,10 @@ class EncoderLayer(nn.Layer):
---------- ----------
size : int size : int
Input dimension. Input dimension.
self_attn : paddle.nn.Layer self_attn : nn.Layer
Self-attention module instance. Self-attention module instance.
`MultiHeadedAttention` instance can be used as the argument. `MultiHeadedAttention` instance can be used as the argument.
feed_forward : paddle.nn.Layer feed_forward : nn.Layer
Feed-forward module instance. Feed-forward module instance.
`PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance can be used as the argument. `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance can be used as the argument.
dropout_rate : float dropout_rate : float
...@@ -50,7 +50,7 @@ class EncoderLayer(nn.Layer): ...@@ -50,7 +50,7 @@ class EncoderLayer(nn.Layer):
normalize_before=True, normalize_before=True,
concat_after=False, ): concat_after=False, ):
"""Construct an EncoderLayer object.""" """Construct an EncoderLayer object."""
super(EncoderLayer, self).__init__() super().__init__()
self.self_attn = self_attn self.self_attn = self_attn
self.feed_forward = feed_forward self.feed_forward = feed_forward
self.norm1 = nn.LayerNorm(size) self.norm1 = nn.LayerNorm(size)
......
...@@ -18,7 +18,7 @@ import paddle ...@@ -18,7 +18,7 @@ import paddle
import paddle.nn.functional as F import paddle.nn.functional as F
from paddle import nn from paddle import nn
from paddlespeech.t2s.modules.glu import GLU from paddlespeech.t2s.modules.activation import get_activation
from paddlespeech.t2s.modules.masked_fill import masked_fill from paddlespeech.t2s.modules.masked_fill import masked_fill
MIN_VALUE = float(numpy.finfo(numpy.float32).min) MIN_VALUE = float(numpy.finfo(numpy.float32).min)
...@@ -56,7 +56,7 @@ class LightweightConvolution(nn.Layer): ...@@ -56,7 +56,7 @@ class LightweightConvolution(nn.Layer):
use_kernel_mask=False, use_kernel_mask=False,
use_bias=False, ): use_bias=False, ):
"""Construct Lightweight Convolution layer.""" """Construct Lightweight Convolution layer."""
super(LightweightConvolution, self).__init__() super().__init__()
assert n_feat % wshare == 0 assert n_feat % wshare == 0
self.wshare = wshare self.wshare = wshare
...@@ -68,7 +68,7 @@ class LightweightConvolution(nn.Layer): ...@@ -68,7 +68,7 @@ class LightweightConvolution(nn.Layer):
# linear -> GLU -> lightconv -> linear # linear -> GLU -> lightconv -> linear
self.linear1 = nn.Linear(n_feat, n_feat * 2) self.linear1 = nn.Linear(n_feat, n_feat * 2)
self.linear2 = nn.Linear(n_feat, n_feat) self.linear2 = nn.Linear(n_feat, n_feat)
self.act = GLU() self.act = get_activation("glu")
# lightconv related # lightconv related
self.uniform_ = nn.initializer.Uniform() self.uniform_ = nn.initializer.Uniform()
......
...@@ -12,10 +12,10 @@ ...@@ -12,10 +12,10 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
"""Layer modules for FFT block in FastSpeech (Feed-forward Transformer).""" """Layer modules for FFT block in FastSpeech (Feed-forward Transformer)."""
import paddle from paddle import nn
class MultiLayeredConv1d(paddle.nn.Layer): class MultiLayeredConv1d(nn.Layer):
"""Multi-layered conv1d for Transformer block. """Multi-layered conv1d for Transformer block.
This is a module of multi-layered conv1d designed This is a module of multi-layered conv1d designed
...@@ -43,21 +43,21 @@ class MultiLayeredConv1d(paddle.nn.Layer): ...@@ -43,21 +43,21 @@ class MultiLayeredConv1d(paddle.nn.Layer):
Dropout rate. Dropout rate.
""" """
super(MultiLayeredConv1d, self).__init__() super().__init__()
self.w_1 = paddle.nn.Conv1D( self.w_1 = nn.Conv1D(
in_chans, in_chans,
hidden_chans, hidden_chans,
kernel_size, kernel_size,
stride=1, stride=1,
padding=(kernel_size - 1) // 2, ) padding=(kernel_size - 1) // 2, )
self.w_2 = paddle.nn.Conv1D( self.w_2 = nn.Conv1D(
hidden_chans, hidden_chans,
in_chans, in_chans,
kernel_size, kernel_size,
stride=1, stride=1,
padding=(kernel_size - 1) // 2, ) padding=(kernel_size - 1) // 2, )
self.dropout = paddle.nn.Dropout(dropout_rate) self.dropout = nn.Dropout(dropout_rate)
self.relu = paddle.nn.ReLU() self.relu = nn.ReLU()
def forward(self, x): def forward(self, x):
"""Calculate forward propagation. """Calculate forward propagation.
...@@ -77,7 +77,7 @@ class MultiLayeredConv1d(paddle.nn.Layer): ...@@ -77,7 +77,7 @@ class MultiLayeredConv1d(paddle.nn.Layer):
[0, 2, 1]) [0, 2, 1])
class Conv1dLinear(paddle.nn.Layer): class Conv1dLinear(nn.Layer):
"""Conv1D + Linear for Transformer block. """Conv1D + Linear for Transformer block.
A variant of MultiLayeredConv1d, which replaces second conv-layer to linear. A variant of MultiLayeredConv1d, which replaces second conv-layer to linear.
...@@ -98,16 +98,16 @@ class Conv1dLinear(paddle.nn.Layer): ...@@ -98,16 +98,16 @@ class Conv1dLinear(paddle.nn.Layer):
dropout_rate : float dropout_rate : float
Dropout rate. Dropout rate.
""" """
super(Conv1dLinear, self).__init__() super().__init__()
self.w_1 = paddle.nn.Conv1D( self.w_1 = nn.Conv1D(
in_chans, in_chans,
hidden_chans, hidden_chans,
kernel_size, kernel_size,
stride=1, stride=1,
padding=(kernel_size - 1) // 2, ) padding=(kernel_size - 1) // 2, )
self.w_2 = paddle.nn.Linear(hidden_chans, in_chans, bias_attr=True) self.w_2 = nn.Linear(hidden_chans, in_chans, bias_attr=True)
self.dropout = paddle.nn.Dropout(dropout_rate) self.dropout = nn.Dropout(dropout_rate)
self.relu = paddle.nn.ReLU() self.relu = nn.ReLU()
def forward(self, x): def forward(self, x):
"""Calculate forward propagation. """Calculate forward propagation.
......
...@@ -14,9 +14,10 @@ ...@@ -14,9 +14,10 @@
# Modified from espnet(https://github.com/espnet/espnet) # Modified from espnet(https://github.com/espnet/espnet)
"""Positionwise feed forward layer definition.""" """Positionwise feed forward layer definition."""
import paddle import paddle
from paddle import nn
class PositionwiseFeedForward(paddle.nn.Layer): class PositionwiseFeedForward(nn.Layer):
"""Positionwise feed forward layer. """Positionwise feed forward layer.
Parameters Parameters
...@@ -35,7 +36,7 @@ class PositionwiseFeedForward(paddle.nn.Layer): ...@@ -35,7 +36,7 @@ class PositionwiseFeedForward(paddle.nn.Layer):
dropout_rate, dropout_rate,
activation=paddle.nn.ReLU()): activation=paddle.nn.ReLU()):
"""Construct an PositionwiseFeedForward object.""" """Construct an PositionwiseFeedForward object."""
super(PositionwiseFeedForward, self).__init__() super().__init__()
self.w_1 = paddle.nn.Linear(idim, hidden_units, bias_attr=True) self.w_1 = paddle.nn.Linear(idim, hidden_units, bias_attr=True)
self.w_2 = paddle.nn.Linear(hidden_units, idim, bias_attr=True) self.w_2 = paddle.nn.Linear(hidden_units, idim, bias_attr=True)
self.dropout = paddle.nn.Dropout(dropout_rate) self.dropout = paddle.nn.Dropout(dropout_rate)
......
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified from espnet(https://github.com/espnet/espnet)
"""Subsampling layer definition."""
import paddle
from paddle import nn
from paddlespeech.t2s.modules.transformer.embedding import PositionalEncoding
class Conv2dSubsampling(nn.Layer):
"""Convolutional 2D subsampling (to 1/4 length).
Parameters
----------
idim : int
Input dimension.
odim : int
Output dimension.
dropout_rate : float
Dropout rate.
pos_enc : nn.Layer
Custom position encoding layer.
"""
def __init__(self, idim, odim, dropout_rate, pos_enc=None):
"""Construct an Conv2dSubsampling object."""
super().__init__()
self.conv = nn.Sequential(
nn.Conv2D(1, odim, 3, 2),
nn.ReLU(),
nn.Conv2D(odim, odim, 3, 2),
nn.ReLU(), )
self.out = nn.Sequential(
nn.Linear(odim * (((idim - 1) // 2 - 1) // 2), odim),
pos_enc if pos_enc is not None else
PositionalEncoding(odim, dropout_rate), )
def forward(self, x, x_mask):
"""Subsample x.
Parameters
----------
x : paddle.Tensor
Input tensor (#batch, time, idim).
x_mask : paddle.Tensor
Input mask (#batch, 1, time).
Returns
----------
paddle.Tensor
Subsampled tensor (#batch, time', odim),
where time' = time // 4.
paddle.Tensor
Subsampled mask (#batch, 1, time'),
where time' = time // 4.
"""
# (b, c, t, f)
x = x.unsqueeze(1)
x = self.conv(x)
b, c, t, f = paddle.shape(x)
x = self.out(x.transpose([0, 2, 1, 3]).reshape([b, t, c * f]))
if x_mask is None:
return x, None
return x, x_mask[:, :, :-2:2][:, :, :-2:2]
def __getitem__(self, key):
"""Get item.
When reset_parameters() is called, if use_scaled_pos_enc is used,
return the positioning encoding.
"""
if key != -1:
raise NotImplementedError(
"Support only `-1` (for `reset_parameters`).")
return self.out[key]
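A usage sketch showing the 1/4 subsampling arithmetic (illustrative, not part of the commit):

import paddle

sub = Conv2dSubsampling(idim=80, odim=256, dropout_rate=0.1)
x = paddle.randn([2, 100, 80])
x_mask = paddle.ones([2, 1, 100], dtype='bool')
y, y_mask = sub(x, x_mask)
# y: (2, 24, 256) and y_mask: (2, 1, 24), since ((100 - 1) // 2 - 1) // 2 = 24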
...@@ -30,7 +30,7 @@ def default_argument_parser(): ...@@ -30,7 +30,7 @@ def default_argument_parser():
The ``--checkpoint_path`` specifies the checkpoint to load from. The ``--checkpoint_path`` specifies the checkpoint to load from.
The ``--device`` and ``--nprocs`` specifies how to run the training. The ``--ngpu`` option specifies how to run the training.
See Also See Also
-------- --------
......
...@@ -82,8 +82,8 @@ class ExperimentBase(object): ...@@ -82,8 +82,8 @@ class ExperimentBase(object):
>>> config.merge_from_list(args.opts) >>> config.merge_from_list(args.opts)
>>> config.freeze() >>> config.freeze()
>>> >>>
>>> if args.nprocs > 1 and args.device == "gpu": >>> if args.ngpu > 1:
>>> dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs) >>> dist.spawn(main_sp, args=(config, args), nprocs=args.ngpu)
>>> else: >>> else:
>>> main_sp(config, args) >>> main_sp(config, args)
""" """
......
...@@ -12,6 +12,7 @@ ...@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import paddle import paddle
from paddle import nn
optim_classes = dict( optim_classes = dict(
adadelta=paddle.optimizer.Adadelta, adadelta=paddle.optimizer.Adadelta,
...@@ -25,7 +26,7 @@ optim_classes = dict( ...@@ -25,7 +26,7 @@ optim_classes = dict(
sgd=paddle.optimizer.SGD, ) sgd=paddle.optimizer.SGD, )
def build_optimizers(model: paddle.nn.Layer, def build_optimizers(model: nn.Layer,
optim='adadelta', optim='adadelta',
max_grad_norm=None, max_grad_norm=None,
learning_rate=0.01) -> paddle.optimizer: learning_rate=0.01) -> paddle.optimizer:
......
...@@ -9,17 +9,11 @@ fi ...@@ -9,17 +9,11 @@ fi
ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..." echo "using $ngpu gpus..."
device=gpu
if [ ${ngpu} == 0 ];then
device=cpu
fi
config_path=$1 config_path=$1
ckpt_prefix=$2 ckpt_prefix=$2
python3 -u ${BIN_DIR}/test.py \ python3 -u ${BIN_DIR}/test.py \
--device ${device} \ --ngpu 1 \
--nproc 1 \
--config ${config_path} \ --config ${config_path} \
--result_file ${ckpt_prefix}.rsl \ --result_file ${ckpt_prefix}.rsl \
--checkpoint_path ${ckpt_prefix} --checkpoint_path ${ckpt_prefix}
......
...@@ -11,16 +11,10 @@ echo "using $ngpu gpus..." ...@@ -11,16 +11,10 @@ echo "using $ngpu gpus..."
config_path=$1 config_path=$1
ckpt_name=$2 ckpt_name=$2
device=gpu
if [ ${ngpu} == 0 ];then
device=cpu
fi
mkdir -p exp mkdir -p exp
python3 -u ${BIN_DIR}/train.py \ python3 -u ${BIN_DIR}/train.py \
--device ${device} \ --ngpu ${ngpu} \
--nproc ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--output exp/${ckpt_name} --output exp/${ckpt_name}
......
...@@ -9,17 +9,12 @@ fi ...@@ -9,17 +9,12 @@ fi
ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..." echo "using $ngpu gpus..."
device=gpu
if [ ${ngpu} == 0 ];then
device=cpu
fi
config_path=$1 config_path=$1
ckpt_prefix=$2 ckpt_prefix=$2
python3 -u ${BIN_DIR}/test.py \ python3 -u ${BIN_DIR}/test.py \
--device ${device} \ --ngpu 1 \
--nproc 1 \
--config ${config_path} \ --config ${config_path} \
--result_file ${ckpt_prefix}.rsl \ --result_file ${ckpt_prefix}.rsl \
--checkpoint_path ${ckpt_prefix} --checkpoint_path ${ckpt_prefix}
......
...@@ -11,16 +11,10 @@ echo "using $ngpu gpus..." ...@@ -11,16 +11,10 @@ echo "using $ngpu gpus..."
config_path=$1 config_path=$1
ckpt_name=$2 ckpt_name=$2
device=gpu
if [ ${ngpu} == 0 ];then
device=cpu
fi
mkdir -p exp mkdir -p exp
python3 -u ${BIN_DIR}/train.py \ python3 -u ${BIN_DIR}/train.py \
--device ${device} \ --ngpu ${ngpu} \
--nproc ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--output exp/${ckpt_name} --output exp/${ckpt_name}
......
...@@ -26,8 +26,8 @@ def main_sp(config, args): ...@@ -26,8 +26,8 @@ def main_sp(config, args):
def main(config, args): def main(config, args):
if args.device == "gpu" and args.nprocs > 1: if args.ngpu > 1:
dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs) dist.spawn(main_sp, args=(config, args), nprocs=args.ngpu)
else: else:
main_sp(config, args) main_sp(config, args)
......
...@@ -106,8 +106,8 @@ class Trainer(): ...@@ -106,8 +106,8 @@ class Trainer():
>>> config.merge_from_list(args.opts) >>> config.merge_from_list(args.opts)
>>> config.freeze() >>> config.freeze()
>>> >>>
>>> if args.nprocs > 1 and args.device == "gpu": >>> if args.ngpu > 1:
>>> dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs) >>> dist.spawn(main_sp, args=(config, args), nprocs=args.ngpu)
>>> else: >>> else:
>>> main_sp(config, args) >>> main_sp(config, args)
""" """
...@@ -147,7 +147,7 @@ class Trainer(): ...@@ -147,7 +147,7 @@ class Trainer():
"""A flag indicating whether the experiment should run with """A flag indicating whether the experiment should run with
multiprocessing. multiprocessing.
""" """
return self.args.device == "gpu" and self.args.nprocs > 1 return self.args.ngpu > 1
def init_parallel(self): def init_parallel(self):
"""Init environment for multiprocess training. """Init environment for multiprocess training.
......
...@@ -30,7 +30,7 @@ def default_argument_parser(): ...@@ -30,7 +30,7 @@ def default_argument_parser():
The ``--checkpoint_path`` specifies the checkpoint to load from. The ``--checkpoint_path`` specifies the checkpoint to load from.
The ``--device`` and ``--nprocs`` specifies how to run the training. The ``--ngpu`` option specifies how to run the training.
See Also See Also
...@@ -60,9 +60,7 @@ def default_argument_parser(): ...@@ -60,9 +60,7 @@ def default_argument_parser():
parser.add_argument("--result_file", type=str, help="path of save the asr result") parser.add_argument("--result_file", type=str, help="path of save the asr result")
# running # running
parser.add_argument("--device", type=str, default='gpu', choices=["cpu", "gpu"], parser.add_argument("--ngpu", type=int, default=1, help="number of parallel processes to use. if ngpu=0, using cpu.")
help="device type to use, cpu and gpu are supported.")
parser.add_argument("--nprocs", type=int, default=1, help="number of parallel processes to use.")
# overwrite extra config and default config # overwrite extra config and default config
# parser.add_argument("--opts", nargs=argparse.REMAINDER, # parser.add_argument("--opts", nargs=argparse.REMAINDER,
......
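The diffs above collapse the old ``--device``/``--nprocs`` pair into a single ``--ngpu`` flag. A hedged sketch of the convention the scripts now assume (a hypothetical helper, not the repo's actual code): ngpu=0 runs on CPU, ngpu=1 on a single GPU, and ngpu>1 spawns one training process per GPU.

import paddle
from paddle import distributed as dist

def launch(main_sp, config, args):
    # hypothetical illustration of the --ngpu convention
    if args.ngpu == 0:
        paddle.set_device("cpu")
        main_sp(config, args)
    elif args.ngpu == 1:
        paddle.set_device("gpu")
        main_sp(config, args)
    else:
        # one training process per GPU
        dist.spawn(main_sp, args=(config, args), nprocs=args.ngpu)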
...@@ -187,6 +187,9 @@ setup_info = dict( ...@@ -187,6 +187,9 @@ setup_info = dict(
'Programming Language :: Python :: 3.7', 'Programming Language :: Python :: 3.7',
'Programming Language :: Python :: 3.8', 'Programming Language :: Python :: 3.8',
'Programming Language :: Python :: 3.9', 'Programming Language :: Python :: 3.9',
], ) ],
entry_points={
'console_scripts': ['paddlespeech=paddlespeech.cli.entry:_execute']
})
setup(**setup_info) setup(**setup_info)
...@@ -16,19 +16,16 @@ import setuptools ...@@ -16,19 +16,16 @@ import setuptools
# set the version here # set the version here
version = '0.1.0a' version = '0.1.0a'
with open("README.md", "r") as fh:
long_description = fh.read()
setuptools.setup( setuptools.setup(
name="paddleaudio", name="paddleaudio",
version=version, version=version,
author="", author="",
author_email="", author_email="",
description="PaddleAudio, in development", description="PaddleAudio, in development",
long_description=long_description, long_description="",
long_description_content_type="text/markdown", long_description_content_type="text/markdown",
url="", url="",
packages=setuptools.find_packages(exclude=["build*", "test*", "examples*"]), packages=setuptools.find_packages(include=['paddleaudio*']),
classifiers=[ classifiers=[
"Programming Language :: Python :: 3", "Programming Language :: Python :: 3",
"License :: OSI Approved :: MIT License", "License :: OSI Approved :: MIT License",
...@@ -41,8 +38,4 @@ setuptools.setup( ...@@ -41,8 +38,4 @@ setuptools.setup(
'resampy >= 0.2.2', 'resampy >= 0.2.2',
'soundfile >= 0.9.0', 'soundfile >= 0.9.0',
'colorlog', 'colorlog',
'pathos', ], )
],
extras_require={'dev': ['pytest>=3.7', 'librosa>=0.7.2']
} # for dev only, install: pip install -e .[dev]
)
...@@ -38,7 +38,7 @@ function _train(){ ...@@ -38,7 +38,7 @@ function _train(){
train_cmd="--config=${config_path} train_cmd="--config=${config_path}
--output=${output} --output=${output}
--seed=${seed} --seed=${seed}
--nproc=${ngpu} --ngpu=${ngpu}
--profiler-options "${profiler_options}" --profiler-options "${profiler_options}"
--benchmark-batch-size ${batch_size} --benchmark-batch-size ${batch_size}
--benchmark-max-step ${benchmark_max_step} " --benchmark-max-step ${benchmark_max_step} "
......
...@@ -21,13 +21,13 @@ null:null ...@@ -21,13 +21,13 @@ null:null
null:null null:null
## ##
===========================eval_params=========================== ===========================eval_params===========================
eval: ../../../paddlespeech/s2t/exps/deepspeech2/bin/test.py --nproc 1 --config conf/deepspeech2.yaml --checkpoint_path exp/deepspeech_tiny/checkpoints/9 --result_file tests/9.rsl --model_type offline eval: ../../../paddlespeech/s2t/exps/deepspeech2/bin/test.py --ngpu 1 --config conf/deepspeech2.yaml --checkpoint_path exp/deepspeech_tiny/checkpoints/9 --result_file tests/9.rsl --model_type offline
null:null null:null
## ##
===========================infer_params=========================== ===========================infer_params===========================
null:null null:null
null:null null:null
norm_export: ../../../paddlespeech/s2t/exps/deepspeech2/bin/export.py --nproc 1 --config conf/deepspeech2.yaml --model_type offline --checkpoint_path exp/deepspeech_tiny/checkpoints/9 --export_path exp/deepspeech_tiny/checkpoints/9.jit norm_export: ../../../paddlespeech/s2t/exps/deepspeech2/bin/export.py --ngpu 1 --config conf/deepspeech2.yaml --model_type offline --checkpoint_path exp/deepspeech_tiny/checkpoints/9 --export_path exp/deepspeech_tiny/checkpoints/9.jit
quant_export:null quant_export:null
fpgm_export:null fpgm_export:null
distill_export:null distill_export:null
......
@@ -21,13 +21,13 @@ null:null
 null:null
 ##
 ===========================eval_params===========================
-eval: ../../../paddlespeech/s2t/exps/deepspeech2/bin/test.py --nproc 1 --config conf/deepspeech2.yaml --result_file tests/49.rsl --checkpoint_path exp/deepspeech_whole/checkpoints/49 --model_type offline
+eval: ../../../paddlespeech/s2t/exps/deepspeech2/bin/test.py --ngpu 1 --config conf/deepspeech2.yaml --result_file tests/49.rsl --checkpoint_path exp/deepspeech_whole/checkpoints/49 --model_type offline
 null:null
 ##
 ===========================infer_params===========================
 null:null
 null:null
-norm_export: ../../../paddlespeech/s2t/exps/deepspeech2/bin/export.py --nproc 1 --config conf/deepspeech2.yaml --model_type offline --checkpoint_path exp/deepspeech_whole/checkpoints/49 --export_path exp/deepspeech_whole/checkpoints/49.jit
+norm_export: ../../../paddlespeech/s2t/exps/deepspeech2/bin/export.py --ngpu 1 --config conf/deepspeech2.yaml --model_type offline --checkpoint_path exp/deepspeech_whole/checkpoints/49 --export_path exp/deepspeech_whole/checkpoints/49.jit
 quant_export:null
 fpgm_export:null
 distill_export:null
...
@@ -21,7 +21,7 @@ null:null
 null:null
 ##
 ===========================eval_params===========================
-eval:../examples/speedyspeech/baker/synthesize_e2e.py --speedyspeech-config=../examples/speedyspeech/baker/conf/default.yaml --speedyspeech-checkpoint=exp/default/checkpoints/snapshot_iter_90.pdz --speedyspeech-stat=pretrain_models/speedyspeech_baker_ckpt_0.4/speedy_speech_stats.npy --pwg-config=../examples/parallelwave_gan/baker/conf/default.yaml --pwg-checkpoint=pretrain_models/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz --pwg-stat=pretrain_models/pwg_baker_ckpt_0.4/pwg_stats.npy --text=../examples/speedyspeech/baker/sentences.txt --output-dir=e2e --inference-dir=inference --device="gpu" --phones-dict=../examples/speedyspeech/baker/phones.txt --tones-dict=../examples/speedyspeech/baker/tones.txt
+eval:../examples/speedyspeech/baker/synthesize_e2e.py --speedyspeech-config=../examples/speedyspeech/baker/conf/default.yaml --speedyspeech-checkpoint=exp/default/checkpoints/snapshot_iter_90.pdz --speedyspeech-stat=pretrain_models/speedyspeech_baker_ckpt_0.4/speedy_speech_stats.npy --pwg-config=../examples/parallelwave_gan/baker/conf/default.yaml --pwg-checkpoint=pretrain_models/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz --pwg-stat=pretrain_models/pwg_baker_ckpt_0.4/pwg_stats.npy --text=../examples/speedyspeech/baker/sentences.txt --output-dir=e2e --inference-dir=inference --ngpu=1 --phones-dict=../examples/speedyspeech/baker/phones.txt --tones-dict=../examples/speedyspeech/baker/tones.txt
 null:null
 ##
 ===========================infer_params===========================
...
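Here the TTS chains replace --device="gpu" with --ngpu=1, folding device selection into the GPU count. A hedged sketch of how the count could map onto a paddle device (assuming ngpu == 0 selects CPU):

# Sketch under the stated assumption; the synthesize_e2e script may differ.
import paddle

def set_device_from_ngpu(ngpu: int) -> None:
    # One flag now answers both "which device" and "how many".
    paddle.set_device("gpu" if ngpu > 0 else "cpu")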
@@ -323,7 +323,7 @@ else
 elif [ ${#gpu} -le 15 ];then # train with multi-gpu
     gsu=${gpu//,/ }
     nump=`echo $gsu | wc -w`
-    cmd="${python} ${run_train} --nproc=$nump"
+    cmd="${python} ${run_train} --ngpu=$nump"
 else # train with multi-machine
     cmd="${python} -m paddle.distributed.launch --ips=${ips} --gpus=${gpu} ${run_train} ${set_save_model} ${set_pretrain} ${set_epoch} ${set_autocast} ${set_batchsize} ${set_train_params1}"
 fi
...
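For reference, the bash above derives the GPU count from the comma-separated id list (gsu=${gpu//,/ }; wc -w) and passes it to the renamed flag. The same logic in Python, for illustration only:

# Python equivalent of: gsu=${gpu//,/ }; nump=$(echo $gsu | wc -w)
gpu = "0,1,2,3"                     # comma-separated GPU ids, as in the script
nump = len(gpu.split(","))          # -> 4
print(f"python train.py --ngpu={nump}")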
@@ -21,7 +21,7 @@ null:null
 null:null
 ##
 ===========================eval_params===========================
-eval:../../../paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py --speedyspeech-config=../../../examples/csmsc/tts2/conf/default.yaml --speedyspeech-checkpoint=exp/default/checkpoints/snapshot_iter_20.pdz --speedyspeech-stat=train_data/mini_BZNSYP/train/feats_stats.npy --pwg-config=pretrain_models/pwg_baker_ckpt_0.4/pwg_default.yaml --pwg-checkpoint=pretrain_models/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz --pwg-stat=pretrain_models/pwg_baker_ckpt_0.4/pwg_stats.npy --text=../../../paddlespeech/t2s/exps/sentences.txt --output-dir=e2e --inference-dir=inference --device="gpu" --phones-dict=train_data/mini_BZNSYP/phone_id_map.txt --tones-dict=train_data/mini_BZNSYP/tone_id_map.txt
+eval:../../../paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py --speedyspeech-config=../../../examples/csmsc/tts2/conf/default.yaml --speedyspeech-checkpoint=exp/default/checkpoints/snapshot_iter_20.pdz --speedyspeech-stat=train_data/mini_BZNSYP/train/feats_stats.npy --pwg-config=pretrain_models/pwg_baker_ckpt_0.4/pwg_default.yaml --pwg-checkpoint=pretrain_models/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz --pwg-stat=pretrain_models/pwg_baker_ckpt_0.4/pwg_stats.npy --text=../../../paddlespeech/t2s/exps/sentences.txt --output-dir=e2e --inference-dir=inference --ngpu=1 --phones-dict=train_data/mini_BZNSYP/phone_id_map.txt --tones-dict=train_data/mini_BZNSYP/tone_id_map.txt
 null:null
 ##
 ===========================infer_params===========================
...
@@ -21,7 +21,7 @@ null:null
 null:null
 ##
 ===========================eval_params===========================
-eval:../../../paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py --speedyspeech-config=../../../examples/csmsc/tts2/conf/default.yaml --speedyspeech-checkpoint=exp/default/checkpoints/snapshot_iter_30.pdz --speedyspeech-stat=train_data/mini_BZNSYP/train/feats_stats.npy --pwg-config=pretrain_models/pwg_baker_ckpt_0.4/pwg_default.yaml --pwg-checkpoint=pretrain_models/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz --pwg-stat=pretrain_models/pwg_baker_ckpt_0.4/pwg_stats.npy --text=../../../paddlespeech/t2s/exps/sentences.txt --output-dir=e2e --inference-dir=inference --device="gpu" --phones-dict=train_data/mini_BZNSYP/phone_id_map.txt --tones-dict=train_data/mini_BZNSYP/tone_id_map.txt
+eval:../../../paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py --speedyspeech-config=../../../examples/csmsc/tts2/conf/default.yaml --speedyspeech-checkpoint=exp/default/checkpoints/snapshot_iter_30.pdz --speedyspeech-stat=train_data/mini_BZNSYP/train/feats_stats.npy --pwg-config=pretrain_models/pwg_baker_ckpt_0.4/pwg_default.yaml --pwg-checkpoint=pretrain_models/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz --pwg-stat=pretrain_models/pwg_baker_ckpt_0.4/pwg_stats.npy --text=../../../paddlespeech/t2s/exps/sentences.txt --output-dir=e2e --inference-dir=inference --ngpu=1 --phones-dict=train_data/mini_BZNSYP/phone_id_map.txt --tones-dict=train_data/mini_BZNSYP/tone_id_map.txt
 null:null
 ##
 ===========================infer_params===========================
...
@@ -21,7 +21,7 @@ null:null
 null:null
 ##
 ===========================eval_params===========================
-eval:../../../paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py --speedyspeech-config=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/default.yaml --speedyspeech-checkpoint=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/snapshot_iter_11400.pdz --speedyspeech-stat=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/feats_stats.npy --pwg-config=pretrain_models/pwg_baker_ckpt_0.4/pwg_default.yaml --pwg-checkpoint=pretrain_models/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz --pwg-stat=pretrain_models/pwg_baker_ckpt_0.4/pwg_stats.npy --text=../../../paddlespeech/t2s/exps/sentences.txt --output-dir=e2e --inference-dir=inference --device="gpu" --phones-dict=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/phone_id_map.txt --tones-dict=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/tone_id_map.txt
+eval:../../../paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py --speedyspeech-config=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/default.yaml --speedyspeech-checkpoint=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/snapshot_iter_11400.pdz --speedyspeech-stat=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/feats_stats.npy --pwg-config=pretrain_models/pwg_baker_ckpt_0.4/pwg_default.yaml --pwg-checkpoint=pretrain_models/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz --pwg-stat=pretrain_models/pwg_baker_ckpt_0.4/pwg_stats.npy --text=../../../paddlespeech/t2s/exps/sentences.txt --output-dir=e2e --inference-dir=inference --ngpu=1 --phones-dict=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/phone_id_map.txt --tones-dict=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/tone_id_map.txt
 null:null
 ##
 ===========================infer_params===========================
...
@@ -21,7 +21,7 @@ null:null
 null:null
 ##
 ===========================eval_params===========================
-eval:../../../paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py --speedyspeech-config=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/default.yaml --speedyspeech-checkpoint=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/snapshot_iter_11400.pdz --speedyspeech-stat=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/feats_stats.npy --pwg-config=pretrain_models/pwg_baker_ckpt_0.4/pwg_default.yaml --pwg-checkpoint=pretrain_models/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz --pwg-stat=pretrain_models/pwg_baker_ckpt_0.4/pwg_stats.npy --text=../../../paddlespeech/t2s/exps/sentences.txt --output-dir=e2e --inference-dir=inference --device="gpu" --phones-dict=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/phone_id_map.txt --tones-dict=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/tone_id_map.txt
+eval:../../../paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py --speedyspeech-config=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/default.yaml --speedyspeech-checkpoint=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/snapshot_iter_11400.pdz --speedyspeech-stat=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/feats_stats.npy --pwg-config=pretrain_models/pwg_baker_ckpt_0.4/pwg_default.yaml --pwg-checkpoint=pretrain_models/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz --pwg-stat=pretrain_models/pwg_baker_ckpt_0.4/pwg_stats.npy --text=../../../paddlespeech/t2s/exps/sentences.txt --output-dir=e2e --inference-dir=inference --ngpu=1 --phones-dict=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/phone_id_map.txt --tones-dict=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/tone_id_map.txt
 null:null
 ##
 ===========================infer_params===========================
...
@@ -11,52 +11,11 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import librosa
-import numpy as np
 import paddle
 import torch
 from parallel_wavegan.losses import stft_loss as sl
-from scipy import signal
-from paddlespeech.t2s.modules.stft_loss import MultiResolutionSTFTLoss
+from paddlespeech.t2s.modules.losses import MultiResolutionSTFTLoss
-from paddlespeech.t2s.modules.stft_loss import STFT
-def test_stft():
-    stft = STFT(n_fft=1024, hop_length=256, win_length=1024)
-    x = paddle.uniform([4, 46080])
-    S = stft.magnitude(x)
-    window = signal.get_window('hann', 1024, fftbins=True)
-    D2 = torch.stft(
-        torch.as_tensor(x.numpy()),
-        n_fft=1024,
-        hop_length=256,
-        win_length=1024,
-        window=torch.as_tensor(window))
-    S2 = (D2**2).sum(-1).sqrt()
-    S3 = np.abs(
-        librosa.stft(x.numpy()[0], n_fft=1024, hop_length=256, win_length=1024))
-    print(S2.shape)
-    print(S.numpy()[0])
-    print(S2.data.cpu().numpy()[0])
-    print(S3)
-def test_torch_stft():
-    # NOTE: torch.stft use no window by default
-    x = np.random.uniform(-1.0, 1.0, size=(46080, ))
-    window = signal.get_window('hann', 1024, fftbins=True)
-    D2 = torch.stft(
-        torch.as_tensor(x),
-        n_fft=1024,
-        hop_length=256,
-        win_length=1024,
-        window=torch.as_tensor(window))
-    D3 = librosa.stft(
-        x, n_fft=1024, hop_length=256, win_length=1024, window='hann')
-    print(D2[:, :, 0].data.cpu().numpy()[:, 30:60])
-    print(D3.real[:, 30:60])
-    # print(D3.imag[:, 30:60])
 def test_multi_resolution_stft_loss():
...
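After the trim, the file keeps only the parity test between parallel_wavegan's torch loss and the paddle port, which this commit re-exports from paddlespeech.t2s.modules.losses. A minimal sketch of such a check, assuming both classes take default resolutions and return a (spectral_convergence, log_magnitude) loss pair:

# Sketch under the stated assumptions; the real test may compare differently.
import numpy as np
import paddle
import torch
from parallel_wavegan.losses import stft_loss as sl
from paddlespeech.t2s.modules.losses import MultiResolutionSTFTLoss

x = np.random.uniform(-1.0, 1.0, size=(4, 46080)).astype("float32")
y = np.random.uniform(-1.0, 1.0, size=(4, 46080)).astype("float32")
sc_pd, mag_pd = MultiResolutionSTFTLoss()(paddle.to_tensor(x), paddle.to_tensor(y))
sc_pt, mag_pt = sl.MultiResolutionSTFTLoss()(torch.as_tensor(x), torch.as_tensor(y))
np.testing.assert_allclose(sc_pd.numpy(), sc_pt.numpy(), rtol=1e-4)
np.testing.assert_allclose(mag_pd.numpy(), mag_pt.numpy(), rtol=1e-4)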
@@ -9,6 +9,7 @@ apt-get install subversion -y
 KALDI_GIT="--depth 1 -b master https://github.com/kaldi-asr/kaldi.git"
 KALDI_DIR="$PWD/kaldi"
+SHARED=false
 if [ ! -d "$KALDI_DIR" ]; then
     git clone $KALDI_GIT $KALDI_DIR
@@ -23,17 +24,25 @@ git pull
 mkdir -p "python"
 touch "python/.use_default_python"
+# check deps
 ./extras/check_dependencies.sh
+# make tools
 make -j4
+# make src
 pushd ../src
 OPENBLAS_DIR=${KALDI_DIR}/../OpenBLAS
 mkdir -p ${OPENBLAS_DIR}/install
-./configure --shared --use-cuda=no --static-math --mathlib=OPENBLAS --openblas-root=${OPENBLAS_DIR}/install
+if [ $SHARED == true ]; then
+    ./configure --shared --use-cuda=no --static-math --mathlib=OPENBLAS --openblas-root=${OPENBLAS_DIR}/install
+else
+    ./configure --static --use-cuda=no --static-math --mathlib=OPENBLAS --openblas-root=${OPENBLAS_DIR}/install
+fi
 make clean -j && make depend -j && make -j4
-popd
-popd
+popd # kaldi/src
+popd # kaldi/tools
 echo "Done installing Kaldi."