Merge branch 'develop' into datapipe

2ba3f00b · Hui Zhang · b57b8659 · d2a05df0 · b57b8659 · 2ba3f00b
280 changed file
--- a/.bashrc
+++ b/.bashrc
-unset GREP_OPTIONS
-
-# https://zhuanlan.zhihu.com/p/33050965
-alias nvs='nvidia-smi'
-alias his='history'
-alias jobs='jobs -l'
-alias ports='netstat -tulanp'
-alias wget='wget -c'
-
-## Colorize the grep command output for ease of use (good for log files)##
-alias grep='grep --color=auto'
-alias egrep='egrep --color=auto'
-alias fgrep='fgrep --color=auto'
-
-
--- a/.mergify.yml
+++ b/.mergify.yml
@@ -53,7 +53,7 @@ pull_request_rules:
        add: ["T2S"]
  - name: "auto add label=Audio"
    conditions:
-      - files~=^paddleaudio/
+      - files~=^audio/
    actions:
      label:
        add: ["Audio"]
@@ -69,6 +69,12 @@ pull_request_rules:
    actions:
      label:
        add: ["Example"]
+  - name: "auto add label=Demo"
+    conditions:
+      - files~=^demos/
+    actions:
+      label:
+        add: ["Demo"]
  - name: "auto add label=README"
    conditions:
      - files~=README.md
@@ -77,7 +83,7 @@ pull_request_rules:
        add: ["README"]
  - name: "auto add label=Documentation"
    conditions:
-      - files~=^doc/
+      - files~=^docs/
    actions:
      label:
        add: ["Documentation"]

--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -13,8 +13,8 @@
        files: (?!.*paddle)^.*$
    -   id: end-of-file-fixer
        files: \.md$
-    -   id: trailing-whitespace
-        files: \.md$
+    #-   id: trailing-whitespace
+    #    files: \.md$
    -   id: requirements-txt-fixer
        exclude: (?=third_party).*$
    -   id: check-yaml

--- a/.vimrc
+++ b/.vimrc
-"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
-" Maintainer: 
-"       Amir Salihefendic — @amix3k
-"
-" Awesome_version:
-"       Get this config, nice color schemes and lots of plugins!
-"
-"       Install the awesome version from:
-"
-"           https://github.com/amix/vimrc
-"
-" Sections:
-"    -> General
-"    -> VIM user interface
-"    -> Colors and Fonts
-"    -> Files and backups
-"    -> Text, tab and indent related
-"    -> Visual mode related
-"    -> Moving around, tabs and buffers
-"    -> Status line
-"    -> Editing mappings
-"    -> vimgrep searching and cope displaying
-"    -> Spell checking
-"    -> Misc
-"    -> Helper functions
-"
-"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
-
-
-"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
-" => General
-"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
-" Sets how many lines of history VIM has to remember
-set history=500
-
-" Enable filetype plugins
-filetype plugin on
-filetype indent on
-
-" Set to auto read when a file is changed from the outside
-set autoread
-au FocusGained,BufEnter * checktime
-
-" With a map leader it's possible to do extra key combinations
-" like <leader>w saves the current file
-let mapleader = ","
-
-" Fast saving
-nmap <leader>w :w!<cr>
-
-" :W sudo saves the file 
-" (useful for handling the permission-denied error)
-command! W execute 'w !sudo tee % > /dev/null' <bar> edit!
-
-
-"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
-" => VIM user interface
-"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
-" Set 7 lines to the cursor - when moving vertically using j/k
-set so=7
-
-" Avoid garbled characters in Chinese language windows OS
-let $LANG='en' 
-set langmenu=en
-source $VIMRUNTIME/delmenu.vim
-source $VIMRUNTIME/menu.vim
-
-" Turn on the Wild menu
-set wildmenu
-
-" Ignore compiled files
-set wildignore=*.o,*~,*.pyc
-if has("win16") || has("win32")
-    set wildignore+=.git\*,.hg\*,.svn\*
-else
-    set wildignore+=*/.git/*,*/.hg/*,*/.svn/*,*/.DS_Store
-endif
-
-"Always show current position
-set ruler
-
-" Height of the command bar
-set cmdheight=1
-
-" A buffer becomes hidden when it is abandoned
-set hid
-
-" Configure backspace so it acts as it should act
-set backspace=eol,start,indent
-set whichwrap+=<,>,h,l
-
-" Ignore case when searching
-set ignorecase
-
-" When searching try to be smart about cases 
-set smartcase
-
-" Highlight search results
-set hlsearch
-
-" Makes search act like search in modern browsers
-set incsearch 
-
-" Don't redraw while executing macros (good performance config)
-set lazyredraw 
-
-" For regular expressions turn magic on
-set magic
-
-" Show matching brackets when text indicator is over them
-set showmatch 
-" How many tenths of a second to blink when matching brackets
-set mat=2
-
-" No annoying sound on errors
-set noerrorbells
-set novisualbell
-set t_vb=
-set tm=500
-
-" Properly disable sound on errors on MacVim
-if has("gui_macvim")
-    autocmd GUIEnter * set vb t_vb=
-endif
-
-
-" Add a bit extra margin to the left
-set foldcolumn=1
-
-
-"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
-" => Colors and Fonts
-"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
-" Enable syntax highlighting
-syntax enable 
-
-" Enable 256 colors palette in Gnome Terminal
-if $COLORTERM == 'gnome-terminal'
-    set t_Co=256
-endif
-
-try
-    colorscheme desert
-catch
-endtry
-
-set background=dark
-
-" Set extra options when running in GUI mode
-if has("gui_running")
-    set guioptions-=T
-    set guioptions-=e
-    set t_Co=256
-    set guitablabel=%M\ %t
-endif
-
-" Set utf8 as standard encoding and en_US as the standard language
-set encoding=utf8
-set fileencodings=ucs-bom,utf-8,cp936
-set fileencoding=gb2312
-set termencoding=utf-8
-
-" Use Unix as the standard file type
-set ffs=unix,dos,mac
-
-
-"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
-" => Files, backups and undo
-"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
-" Turn backup off, since most stuff is in SVN, git etc. anyway...
-set nobackup
-set nowb
-set noswapfile
-
-
-"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
-" => Text, tab and indent related
-"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
-" Use spaces instead of tabs
-set expandtab
-
-" Be smart when using tabs ;)
-set smarttab
-
-" 1 tab == 4 spaces
-set shiftwidth=4
-set tabstop=4
-
-" Linebreak on 500 characters
-set lbr
-set tw=500
-
-set ai "Auto indent
-set si "Smart indent
-set wrap "Wrap lines
-
-
-""""""""""""""""""""""""""""""
-" => Visual mode related
-""""""""""""""""""""""""""""""
-" Visual mode pressing * or # searches for the current selection
-" Super useful! From an idea by Michael Naumann
-vnoremap <silent> * :<C-u>call VisualSelection('', '')<CR>/<C-R>=@/<CR><CR>
-vnoremap <silent> # :<C-u>call VisualSelection('', '')<CR>?<C-R>=@/<CR><CR>
-
-
-"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
-" => Moving around, tabs, windows and buffers
-"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
-" Map <Space> to / (search) and Ctrl-<Space> to ? (backwards search)
-map <space> /
-map <C-space> ?
-
-" Disable highlight when <leader><cr> is pressed
-map <silent> <leader><cr> :noh<cr>
-
-" Smart way to move between windows
-map <C-j> <C-W>j
-map <C-k> <C-W>k
-map <C-h> <C-W>h
-map <C-l> <C-W>l
-
-" Close the current buffer
-map <leader>bd :Bclose<cr>:tabclose<cr>gT
-
-" Close all the buffers
-map <leader>ba :bufdo bd<cr>
-
-map <leader>l :bnext<cr>
-map <leader>h :bprevious<cr>
-
-" Useful mappings for managing tabs
-map <leader>tn :tabnew<cr>
-map <leader>to :tabonly<cr>
-map <leader>tc :tabclose<cr>
-map <leader>tm :tabmove 
-map <leader>t<leader> :tabnext 
-
-" Let 'tl' toggle between this and the last accessed tab
-let g:lasttab = 1
-nmap <Leader>tl :exe "tabn ".g:lasttab<CR>
-au TabLeave * let g:lasttab = tabpagenr()
-
-
-" Opens a new tab with the current buffer's path
-" Super useful when editing files in the same directory
-map <leader>te :tabedit <C-r>=expand("%:p:h")<cr>/
-
-" Switch CWD to the directory of the open buffer
-map <leader>cd :cd %:p:h<cr>:pwd<cr>
-
-" Specify the behavior when switching between buffers 
-try
-  set switchbuf=useopen,usetab,newtab
-  set stal=2
-catch
-endtry
-
-" Return to last edit position when opening files (You want this!)
-au BufReadPost * if line("'\"") > 1 && line("'\"") <= line("$") | exe "normal! g'\"" | endif
-
-
-""""""""""""""""""""""""""""""
-" => Status line
-""""""""""""""""""""""""""""""
-" Always show the status line
-set laststatus=2
-
-" Format the status line
-set statusline=\ %{HasPaste()}%F%m%r%h\ %w\ \ CWD:\ %r%{getcwd()}%h\ \ \ Line:\ %l\ \ Column:\ %c
-
-
-"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
-" => Editing mappings
-"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
-" Remap VIM 0 to first non-blank character
-map 0 ^
-
-" Move a line of text using ALT+[jk] or Command+[jk] on mac
-nmap <M-j> mz:m+<cr>`z
-nmap <M-k> mz:m-2<cr>`z
-vmap <M-j> :m'>+<cr>`<my`>mzgv`yo`z
-vmap <M-k> :m'<-2<cr>`>my`<mzgv`yo`z
-
-if has("mac") || has("macunix")
-  nmap <D-j> <M-j>
-  nmap <D-k> <M-k>
-  vmap <D-j> <M-j>
-  vmap <D-k> <M-k>
-endif
-
-" Delete trailing white space on save, useful for some filetypes ;)
-fun! CleanExtraSpaces()
-    let save_cursor = getpos(".")
-    let old_query = getreg('/')
-    silent! %s/\s\+$//e
-    call setpos('.', save_cursor)
-    call setreg('/', old_query)
-endfun
-
-if has("autocmd")
-    autocmd BufWritePre *.txt,*.js,*.py,*.wiki,*.sh,*.coffee :call CleanExtraSpaces()
-endif
-
-
-"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
-" => Spell checking
-"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
-" Pressing ,ss will toggle and untoggle spell checking
-map <leader>ss :setlocal spell!<cr>
-
-" Shortcuts using <leader>
-map <leader>sn ]s
-map <leader>sp [s
-map <leader>sa zg
-map <leader>s? z=
-
-
-"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
-" => Misc
-"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
-" Remove the Windows ^M - when the encodings gets messed up
-noremap <Leader>m mmHmt:%s/<C-V><cr>//ge<cr>'tzt'm
-
-" Quickly open a buffer for scribble
-map <leader>q :e ~/buffer<cr>
-
-" Quickly open a markdown buffer for scribble
-map <leader>x :e ~/buffer.md<cr>
-
-" Toggle paste mode on and off
-map <leader>pp :setlocal paste!<cr>
-
-
-"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
-" => Helper functions
-"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
-" Returns true if paste mode is enabled
-function! HasPaste()
-    if &paste
-        return 'PASTE MODE  '
-    endif
-    return ''
-endfunction
-
-" Don't close window, when deleting a buffer
-command! Bclose call <SID>BufcloseCloseIt()
-function! <SID>BufcloseCloseIt()
-    let l:currentBufNum = bufnr("%")
-    let l:alternateBufNum = bufnr("#")
-
-    if buflisted(l:alternateBufNum)
-        buffer #
-    else
-        bnext
-    endif
-
-    if bufnr("%") == l:currentBufNum
-        new
-    endif
-
-    if buflisted(l:currentBufNum)
-        execute("bdelete! ".l:currentBufNum)
-    endif
-endfunction
-
-function! CmdLine(str)
-    call feedkeys(":" . a:str)
-endfunction 
-
-function! VisualSelection(direction, extra_filter) range
-    let l:saved_reg = @"
-    execute "normal! vgvy"
-
-    let l:pattern = escape(@", "\\/.*'$^~[]")
-    let l:pattern = substitute(l:pattern, "\n$", "", "")
-
-    if a:direction == 'gv'
-        call CmdLine("Ack '" . l:pattern . "' " )
-    elseif a:direction == 'replace'
-        call CmdLine("%s" . '/'. l:pattern . '/')
-    endif
-
-    let @/ = l:pattern
-    let @" = l:saved_reg
-endfunction
-
-
-""""""""""""""""""""""""""""""
-" => Python section
-""""""""""""""""""""""""""""""
-let python_highlight_all = 1
-au FileType python syn keyword pythonDecorator True None False self
-
-au BufNewFile,BufRead *.jinja set syntax=htmljinja
-au BufNewFile,BufRead *.mako set ft=mako
-
-au FileType python map <buffer> F :set foldmethod=indent<cr>
-
-au FileType python inoremap <buffer> $r return 
-au FileType python inoremap <buffer> $i import 
-au FileType python inoremap <buffer> $p print 
-au FileType python inoremap <buffer> $f # --- <esc>a
-au FileType python map <buffer> <leader>1 /class 
-au FileType python map <buffer> <leader>2 /def 
-au FileType python map <buffer> <leader>C ?class 
-au FileType python map <buffer> <leader>D ?def 
-
-
-""""""""""""""""""""""""""""""
-" => JavaScript section
-"""""""""""""""""""""""""""""""
-au FileType javascript call JavaScriptFold()
-au FileType javascript setl fen
-au FileType javascript setl nocindent
-
-au FileType javascript imap <C-t> $log();<esc>hi
-au FileType javascript imap <C-a> alert();<esc>hi
-
-au FileType javascript inoremap <buffer> $r return 
-au FileType javascript inoremap <buffer> $f // --- PH<esc>FP2xi
-
-function! JavaScriptFold() 
-    setl foldmethod=syntax
-    setl foldlevelstart=1
-    syn region foldBraces start=/{/ end=/}/ transparent fold keepend extend
-
-    function! FoldText()
-        return substitute(getline(v:foldstart), '{.*', '{...}', '')
-    endfunction
-    setl foldtext=FoldText()
-endfunction
-
-
-""""""""""""""""""""""""""""""
-" => CoffeeScript section
-"""""""""""""""""""""""""""""""
-function! CoffeeScriptFold()
-    setl foldmethod=indent
-    setl foldlevelstart=1
-endfunction
-au FileType coffee call CoffeeScriptFold()
-
-au FileType gitcommit call setpos('.', [0, 1, 1, 0])
-
-
-""""""""""""""""""""""""""""""
-" => Shell section
-""""""""""""""""""""""""""""""
-if exists('$TMUX') 
-    if has('nvim')
-        set termguicolors
-    else
-        set term=screen-256color 
-    endif
-endif
-
-
-""""""""""""""""""""""""""""""
-" => Twig section
-""""""""""""""""""""""""""""""
-autocmd BufRead *.twig set syntax=html filetype=html
-
-
-""""""""""""""""""""""""""""""
-" => Markdown
-""""""""""""""""""""""""""""""
-let vim_markdown_folding_disabled = 1
--- a/README.md
+++ b/README.md
--- a/demos/asr_hub/README.md
+++ b/demos/asr_hub/README.md
+# ASR
+
+```shell
+CUDA_VISIBLE_DEVICES=0 ./run.sh
+```
--- a/demos/asr_hub/hub_infer.py
+++ b/demos/asr_hub/hub_infer.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import os
+
+import paddle
+import paddlehub as hub
+
+# yapf: disable
+# Command-line interface. Both wav paths are required: they are passed straight
+# to os.path.expanduser() below, which raises TypeError when given None.
+parser = argparse.ArgumentParser(__doc__)
+parser.add_argument("--device", type=str, default='gpu', choices=['cpu', 'gpu'])
+parser.add_argument("--wav_en", type=str, required=True)
+parser.add_argument("--wav_zh", type=str, required=True)
+args = parser.parse_args()
+# yapf: enable
+
+if __name__ == '__main__':
+    paddle.set_device(args.device)
+
+    args.wav_en = os.path.abspath(os.path.expanduser(args.wav_en))
+    args.wav_zh = os.path.abspath(os.path.expanduser(args.wav_zh))
+
+    # Validate the inputs before loading the models, and with an explicit
+    # exception rather than `assert`, which is removed under `python -O`.
+    for wav in (args.wav_en, args.wav_zh):
+        if not os.path.isfile(wav):
+            raise FileNotFoundError('Wav file does not exist: {}'.format(wav))
+
+    s2t_en_model = hub.Module(name='u2_conformer_librispeech')
+    s2t_zh_model = hub.Module(name='u2_conformer_aishell')
+
+    print('[S2T][en]Wav: {}'.format(args.wav_en))
+    text_en = s2t_en_model.speech_recognize(args.wav_en)
+    print('[S2T][en]Text: {}'.format(text_en))
+
+    print('[S2T][zh]Wav: {}'.format(args.wav_zh))
+    text_zh = s2t_zh_model.speech_recognize(args.wav_zh)
+    print('[S2T][zh]Text: {}'.format(text_zh))
--- a/demos/asr_hub/run.sh
+++ b/demos/asr_hub/run.sh
+#!/bin/bash
+# Demo: fetch one English and one Mandarin sample wav into ./data and
+# transcribe both with hub_infer.py.
+# Usage: CUDA_VISIBLE_DEVICES=0 ./run.sh
+
+# Probe and install with the same interpreter that runs hub_infer.py below,
+# so the check and the inference see the same environment.
+if python3 -c "import paddlehub" &> /dev/null; then
+    echo 'PaddleHub has already been installed.'
+else
+    echo 'Installing PaddleHub...'
+    python3 -m pip install paddlehub -U || exit 1
+fi
+
+mkdir -p data
+wav_en=data/en.wav
+wav_zh=data/zh.wav
+# Download each sample only when it is missing; stop if a download fails.
+test -e "${wav_en}" || wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav -P data || exit 1
+test -e "${wav_zh}" || wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav -P data || exit 1
+
+# Count the comma-separated entries of CUDA_VISIBLE_DEVICES; an unset or empty
+# value yields 0 fields, which selects the CPU.
+ngpu=$(echo "${CUDA_VISIBLE_DEVICES}" | awk -F "," '{print NF}')
+if [ "${ngpu}" -eq 0 ]; then
+    device=cpu
+else
+    device=gpu
+fi
+
+echo "using ${device}..."
+
+# Last command of the script: its exit status becomes the script's status, so
+# a failed inference is no longer reported as success.
+python3 -u hub_infer.py \
+    --device "${device}" \
+    --wav_en "${wav_en}" \
+    --wav_zh "${wav_zh}"
--- a/demos/echo_hub/.gitignore
+++ b/demos/echo_hub/.gitignore
+data
--- a/demos/echo_hub/README.md
+++ b/demos/echo_hub/README.md
+# echo system
+
+ASR + TTS
+
+中文：
+```shell
+CUDA_VISIBLE_DEVICES=0 ./run.sh 用科技让复杂的世界更简单 . zh
+```
+
+英文：
+```shell
+CUDA_VISIBLE_DEVICES=0 ./run.sh "Text to speech system converts normal language text into speech." . en
+```
--- a/demos/echo_hub/hub_infer.py
+++ b/demos/echo_hub/hub_infer.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import os
+
+import librosa
+import paddle
+import paddlehub as hub
+import soundfile as sf
+
+# yapf: disable
+# Command-line interface. --text and --output_dir are required: the code below
+# joins the text words and expands the output path, and neither works on None.
+parser = argparse.ArgumentParser(__doc__)
+parser.add_argument("--lang", type=str, default='zh', choices=['zh', 'en'])
+parser.add_argument("--device", type=str, default='gpu', choices=['cpu', 'gpu'])
+parser.add_argument("--text", type=str, nargs='+', required=True)
+parser.add_argument("--output_dir", type=str, required=True)
+args = parser.parse_args()
+# yapf: enable
+
+if __name__ == '__main__':
+    paddle.set_device(args.device)
+
+    output_dir = os.path.abspath(os.path.expanduser(args.output_dir))
+    if args.lang == 'zh':
+        t2s_model = hub.Module(name='fastspeech2_baker', output_dir=output_dir)
+        s2t_model = hub.Module(name='u2_conformer_aishell')
+    else:
+        t2s_model = hub.Module(
+            name='fastspeech2_ljspeech', output_dir=output_dir)
+        s2t_model = hub.Module(name='u2_conformer_librispeech')
+
+    # nargs='+' yields a list of words; rebuild the sentence.
+    if isinstance(args.text, list):
+        args.text = ' '.join(args.text)
+
+    wavs = t2s_model.generate([args.text], device=args.device)
+    print('[T2S]Wav file has been generated: {}'.format(wavs[0]))
+    # Convert to 16 kHz in one step while loading. This avoids resampling
+    # twice (load default rate, then 16 kHz) and the positional
+    # librosa.resample() call that newer librosa releases reject.
+    y, _ = librosa.load(wavs[0], sr=16000)
+    # Build the output name from the extension only, so a '.wav' occurring
+    # elsewhere in the path is left untouched.
+    root, ext = os.path.splitext(wavs[0])
+    wav_16k = root + '_16k' + ext
+    sf.write(wav_16k, y, 16000)
+    print('[S2T]Resample to 16k: {}'.format(wav_16k))
+    text = s2t_model.speech_recognize(wav_16k)
+    print('[S2T]Text recognized from wav file: {}'.format(text))
--- a/demos/echo_hub/run.sh
+++ b/demos/echo_hub/run.sh
+#!/bin/bash
+# Demo: synthesize TEXT to speech, then transcribe the generated audio back
+# to text (see hub_infer.py).
+# Usage: CUDA_VISIBLE_DEVICES=0 ./run.sh text output_dir [lang]
+#   lang defaults to zh.
+
+# Validate the arguments first so a wrong invocation fails before any install.
+if [ $# -ne 2 ] && [ $# -ne 3 ]; then
+    echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} text output_dir [lang]" >&2
+    exit 1
+fi
+
+# Probe and install with the same interpreter that runs hub_infer.py below.
+if python3 -c "import paddlehub" &> /dev/null; then
+    echo 'PaddleHub has already been installed.'
+else
+    echo 'Installing PaddleHub...'
+    python3 -m pip install paddlehub -U || exit 1
+fi
+
+# Count the comma-separated entries of CUDA_VISIBLE_DEVICES; an unset or empty
+# value yields 0 fields, which selects the CPU.
+ngpu=$(echo "${CUDA_VISIBLE_DEVICES}" | awk -F "," '{print NF}')
+if [ "${ngpu}" -eq 0 ]; then
+    device=cpu
+else
+    device=gpu
+fi
+
+echo "using ${device}..."
+
+text=$1
+output_dir=$2
+lang=${3:-zh}
+
+mkdir -p "${output_dir}"
+
+# Pass the text as one properly quoted argument. The previous \"${text}\"
+# form word-split the text and handed literal quote characters to the
+# synthesizer as part of the first and last word.
+# Last command of the script: its exit status becomes the script's status.
+python3 -u hub_infer.py \
+    --lang "${lang}" \
+    --device "${device}" \
+    --text "${text}" \
+    --output_dir "${output_dir}"
--- a/demos/metaverse/Lamarr.png
+++ b/demos/metaverse/Lamarr.png
--- a/speechnn/env.sh
+++ b/speechnn/env.sh
-export MAIN_ROOT=${PWD}
+#!/bin/bash
+export MAIN_ROOT=`realpath ${PWD}/../../`

-export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}:/usr/local/bin/
+export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
 export LC_ALL=C

+export PYTHONDONTWRITEBYTECODE=1
 # Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
 export PYTHONIOENCODING=UTF-8
 export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}

-export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/
+MODEL=fastspeech2
+export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL}
--- a/demos/metaverse/run.sh
+++ b/demos/metaverse/run.sh
+#!/bin/bash
+# Demo pipeline: synthesize speech for sentences.txt, then lip-sync Lamarr.png
+# to the first generated wav with PaddleGAN's wav2lip tool.
+source path.sh
+
+gpus=0
+stage=0
+stop_stage=100
+
+# With the following command you can choose the stage range you want to run,
+# such as `./run.sh --stage 0 --stop-stage 0`.
+# This cannot be mixed with positional arguments `$1`, `$2` ...
+source "${MAIN_ROOT}/utils/parse_options.sh" || exit 1
+
+mkdir -p download
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    # install PaddleGAN; skip the clone when a previous run already fetched it,
+    # since `git clone` refuses to write into an existing non-empty directory
+    [ -d PaddleGAN ] || git clone https://github.com/PaddlePaddle/PaddleGAN.git
+    pip install -e PaddleGAN/
+fi
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    # download pretrained PaddleGAN model
+    # (-c resumes/reuses an existing file instead of saving a numbered copy)
+    wget -c -P download https://paddlegan.bj.bcebos.com/models/wav2lip_hq.pdparams
+fi
+
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    # download pretrained tts models and unzip
+    # (unzip -o overwrites without prompting so reruns stay non-interactive)
+    wget -c -P download https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip
+    unzip -o -d download download/pwg_baker_ckpt_0.4.zip
+    wget -c -P download https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_ckpt_0.4.zip
+    unzip -o -d download download/fastspeech2_nosil_baker_ckpt_0.4.zip
+fi
+
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+    # run tts
+    CUDA_VISIBLE_DEVICES=${gpus} \
+    python3 "${BIN_DIR}/synthesize_e2e.py" \
+        --fastspeech2-config=download/fastspeech2_nosil_baker_ckpt_0.4/default.yaml \
+        --fastspeech2-checkpoint=download/fastspeech2_nosil_baker_ckpt_0.4/snapshot_iter_76000.pdz \
+        --fastspeech2-stat=download/fastspeech2_nosil_baker_ckpt_0.4/speech_stats.npy \
+        --pwg-config=download/pwg_baker_ckpt_0.4/pwg_default.yaml \
+        --pwg-checkpoint=download/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
+        --pwg-stat=download/pwg_baker_ckpt_0.4/pwg_stats.npy \
+        --text=sentences.txt \
+        --output-dir=output/wavs \
+        --inference-dir=output/inference \
+        --phones-dict=download/fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt
+    # output/inference only holds the exported static models, which this demo
+    # does not need
+    rm -rf output/inference
+fi
+
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+    # We only test one audio here, because it's slow
+    CUDA_VISIBLE_DEVICES=${gpus} \
+    python3 PaddleGAN/applications/tools/wav2lip.py \
+        --checkpoint_path download/wav2lip_hq.pdparams \
+        --face Lamarr.png \
+        --audio output/wavs/000.wav \
+        --outfile output/tts_lips.mp4 \
+        --face_enhancement
+fi
--- a/demos/metaverse/sentences.txt
+++ b/demos/metaverse/sentences.txt
+000 谁知青蛙一落地，竟变成了一位英俊的王子。于是遵照国王的意思，他做了公主的亲密伴侣。
--- a/demos/story_talker/imgs/000.jpg
+++ b/demos/story_talker/imgs/000.jpg
--- a/demos/story_talker/ocr.py
+++ b/demos/story_talker/ocr.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import os
+import re
+from pathlib import Path
+
+import paddle
+from paddleocr import draw_ocr
+from paddleocr import PaddleOCR
+from PIL import Image
+
+
+def evaluate(args, ocr):
+    """Run OCR over every image in ``args.img_dir`` and collect the text.
+
+    For each image, an annotated copy is saved under ``<output_dir>/imgs`` and
+    one ``"<id> <sentence>"`` line is written to ``<output_dir>/sentences.txt``,
+    where ``<id>`` is the file name up to its first dot.
+
+    Args:
+        args: Parsed CLI namespace providing ``img_dir``, ``output_dir`` and
+            ``font_path``.
+        ocr: OCR engine exposing ``ocr(path, cls=True)``; each returned line is
+            read as ``line[0]`` (box) and ``line[1]`` (text, score).
+    """
+    img_dir = Path(args.img_dir)
+    output_dir = Path(args.output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+    img_out_dir = output_dir / "imgs"
+    img_out_dir.mkdir(parents=True, exist_ok=True)
+    # Matches everything except CJK ideographs and the listed Chinese
+    # punctuation. The earlier pattern also listed '(', ')' and '+' inside the
+    # character class, where they are literals, so those characters leaked
+    # into the sentences. Compiled once instead of once per image.
+    non_chinese = re.compile(r'[^\u4e00-\u9fa5，。？、]')
+    # The sentences are Chinese text: pin UTF-8 rather than relying on the
+    # locale's default encoding.
+    with open(output_dir / "sentences.txt", "w", encoding="utf-8") as wf:
+        # os.listdir() returns entries in arbitrary order; sort so the order
+        # of the sentences is reproducible.
+        for name in sorted(os.listdir(img_dir)):
+            utt_id = name.split(".")[0]
+            img_path = img_dir / name
+            result = ocr.ocr(str(img_path), cls=True)
+            # draw result
+            image = Image.open(img_path).convert('RGB')
+            boxes = [line[0] for line in result]
+            txts = [line[1][0] for line in result]
+            scores = [line[1][1] for line in result]
+            im_show = draw_ocr(
+                image, boxes, txts, scores, font_path=args.font_path)
+            im_show = Image.fromarray(im_show)
+            paragraph = "".join(txts)
+            # keep only the Chinese part of the recognized text
+            sentence = non_chinese.sub('', paragraph)
+            im_show.save(img_out_dir / name)
+            wf.write(utt_id + " " + sentence + "\n")
+
+
+def main():
+    """Parse the CLI options, build the PaddleOCR engine and run ``evaluate``."""
+    # The description previously advertised a TTS synthesis script (copy-paste
+    # leftover); this script only performs OCR.
+    parser = argparse.ArgumentParser(
+        description="Recognize Chinese text in images with PaddleOCR.")
+    parser.add_argument("--img-dir", default="imgs", type=str, help="img_dir.")
+    parser.add_argument(
+        "--output-dir",
+        type=str,
+        default="output",
+        help="output sentences path.")
+    parser.add_argument(
+        "--font-path", type=str, default="simfang.ttf", help="font path")
+    args = parser.parse_args()
+
+    paddle.set_device("gpu")
+    # need to run only once to download and load model into memory
+    ocr = PaddleOCR(use_angle_cls=True, lang='ch')
+
+    evaluate(args, ocr)
+
+
+if __name__ == "__main__":
+    main()
--- a/demos/story_talker/path.sh
+++ b/demos/story_talker/path.sh
+#!/bin/bash
+# Environment for this demo; meant to be sourced from the demo directory,
+# because MAIN_ROOT is derived from ${PWD}.
+
+# Two directory levels above the current directory. Quoted so that a checkout
+# path containing spaces is not word-split.
+MAIN_ROOT="$(realpath "${PWD}/../../")"
+export MAIN_ROOT
+
+export PATH="${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}"
+export LC_ALL=C
+
+export PYTHONDONTWRITEBYTECODE=1
+# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
+export PYTHONIOENCODING=UTF-8
+export PYTHONPATH="${MAIN_ROOT}:${PYTHONPATH}"
+
+# BIN_DIR points at the experiment scripts of the selected model.
+MODEL=fastspeech2
+export BIN_DIR="${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL}"
--- a/demos/story_talker/run.sh
+++ b/demos/story_talker/run.sh
+#!/bin/bash
+# Demo pipeline: OCR the images in imgs/ into output/sentences.txt, then
+# synthesize speech for those sentences.
+source path.sh
+
+gpus=0
+stage=0
+stop_stage=100
+
+# With the following command you can choose the stage range you want to run,
+# such as `./run.sh --stage 0 --stop-stage 0`.
+# This cannot be mixed with positional arguments `$1`, `$2` ...
+source "${MAIN_ROOT}/utils/parse_options.sh" || exit 1
+
+mkdir -p download
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    # install PaddleOCR
+    pip install "paddleocr>=2.0.1"
+fi
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    # download pretrained tts models and unzip
+    # (-c reuses an existing file instead of saving a numbered copy on rerun;
+    #  unzip -o overwrites without prompting so reruns stay non-interactive)
+    wget -c -P download https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip
+    unzip -o -d download download/pwg_baker_ckpt_0.4.zip
+    wget -c -P download https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_ckpt_0.4.zip
+    unzip -o -d download download/fastspeech2_nosil_baker_ckpt_0.4.zip
+fi
+
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    # run ocr
+    CUDA_VISIBLE_DEVICES=${gpus} \
+    python3 ocr.py --img-dir=imgs --output-dir=output --font-path=simfang.ttf
+fi
+
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+    # run tts
+    CUDA_VISIBLE_DEVICES=${gpus} \
+    python3 "${BIN_DIR}/synthesize_e2e.py" \
+        --fastspeech2-config=download/fastspeech2_nosil_baker_ckpt_0.4/default.yaml \
+        --fastspeech2-checkpoint=download/fastspeech2_nosil_baker_ckpt_0.4/snapshot_iter_76000.pdz \
+        --fastspeech2-stat=download/fastspeech2_nosil_baker_ckpt_0.4/speech_stats.npy \
+        --pwg-config=download/pwg_baker_ckpt_0.4/pwg_default.yaml \
+        --pwg-checkpoint=download/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
+        --pwg-stat=download/pwg_baker_ckpt_0.4/pwg_stats.npy \
+        --text=output/sentences.txt \
+        --output-dir=output/wavs \
+        --inference-dir=output/inference \
+        --phones-dict=download/fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt
+    # output/inference only holds the exported static models, which this demo
+    # does not need
+    rm -rf output/inference
+fi
--- a/demos/story_talker/simfang.ttf
+++ b/demos/story_talker/simfang.ttf
--- a/demos/style_fs2/path.sh
+++ b/demos/style_fs2/path.sh
+#!/bin/bash
+# Environment for this demo; meant to be sourced from the demo directory,
+# because MAIN_ROOT is derived from ${PWD}.
+
+# Two directory levels above the current directory. Quoted so that a checkout
+# path containing spaces is not word-split.
+MAIN_ROOT="$(realpath "${PWD}/../../")"
+export MAIN_ROOT
+
+export PATH="${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}"
+export LC_ALL=C
+
+export PYTHONDONTWRITEBYTECODE=1
+# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
+export PYTHONIOENCODING=UTF-8
+export PYTHONPATH="${MAIN_ROOT}:${PYTHONPATH}"
+# BIN_DIR points at the experiment scripts of the selected model.
+MODEL=fastspeech2
+export BIN_DIR="${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL}"
--- a/demos/style_fs2/run.sh
+++ b/demos/style_fs2/run.sh
+#!/bin/bash
+# Demo pipeline: download the pretrained FastSpeech2 and Parallel WaveGAN
+# checkpoints, then run style_syn.py to synthesize into ./output.
+source path.sh
+
+gpus=0
+stage=0
+stop_stage=100
+
+# With the following command you can choose the stage range you want to run,
+# such as `./run.sh --stage 0 --stop-stage 0`.
+# This cannot be mixed with positional arguments `$1`, `$2` ...
+source "${MAIN_ROOT}/utils/parse_options.sh" || exit 1
+
+mkdir -p download
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    # download pretrained tts models and unzip
+    # (-c reuses an existing file instead of saving a numbered copy on rerun;
+    #  unzip -o overwrites without prompting so reruns stay non-interactive)
+    wget -c -P download https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip
+    unzip -o -d download download/pwg_baker_ckpt_0.4.zip
+    wget -c -P download https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_ckpt_0.4.zip
+    unzip -o -d download download/fastspeech2_nosil_baker_ckpt_0.4.zip
+fi
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    # run tts
+    CUDA_VISIBLE_DEVICES=${gpus} \
+    python3 style_syn.py \
+        --fastspeech2-config=download/fastspeech2_nosil_baker_ckpt_0.4/default.yaml \
+        --fastspeech2-checkpoint=download/fastspeech2_nosil_baker_ckpt_0.4/snapshot_iter_76000.pdz \
+        --fastspeech2-stat=download/fastspeech2_nosil_baker_ckpt_0.4/speech_stats.npy \
+        --fastspeech2-pitch-stat=download/fastspeech2_nosil_baker_ckpt_0.4/pitch_stats.npy \
+        --fastspeech2-energy-stat=download/fastspeech2_nosil_baker_ckpt_0.4/energy_stats.npy \
+        --pwg-config=download/pwg_baker_ckpt_0.4/pwg_default.yaml \
+        --pwg-checkpoint=download/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
+        --pwg-stat=download/pwg_baker_ckpt_0.4/pwg_stats.npy \
+        --text="${BIN_DIR}/../sentences.txt" \
+        --output-dir=output \
+        --phones-dict=download/fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt
+fi
--- a/demos/style_fs2/sentences.txt
+++ b/demos/style_fs2/sentences.txt
+000 谁知青蛙一落地，竟变成了一位英俊的王子。于是遵照国王的意思，他做了公主的亲密伴侣。
--- a/demos/style_fs2/style_syn.py
+++ b/demos/style_fs2/style_syn.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+from pathlib import Path
+
+import numpy as np
+import paddle
+import soundfile as sf
+import yaml
+from yacs.config import CfgNode
+
+from paddlespeech.t2s.frontend.zh_frontend import Frontend
+from paddlespeech.t2s.models.fastspeech2 import FastSpeech2
+from paddlespeech.t2s.models.fastspeech2 import StyleFastSpeech2Inference
+from paddlespeech.t2s.models.parallel_wavegan import PWGGenerator
+from paddlespeech.t2s.models.parallel_wavegan import PWGInference
+from paddlespeech.t2s.modules.normalizer import ZScore
+
+
+def evaluate(args, fastspeech2_config, pwg_config):
+    """Synthesize every input sentence once per predefined style.
+
+    Builds the FastSpeech2 acoustic model and the Parallel WaveGAN vocoder
+    from the checkpoints named in ``args``. For each style in ``normal``,
+    ``robot``, ``1.2xspeed``, ``0.8xspeed`` and ``child_voice`` it writes
+    ``<output_dir>/<style>/<utt_id>.wav`` for every input sentence.
+
+    Args:
+        args: parsed command-line namespace. Reads ``text``, ``phones_dict``,
+            ``fastspeech2_checkpoint``, ``fastspeech2_stat``,
+            ``fastspeech2_pitch_stat``, ``fastspeech2_energy_stat``,
+            ``pwg_checkpoint``, ``pwg_stat`` and ``output_dir``.
+        fastspeech2_config: FastSpeech2 config. Reads ``n_mels``, ``fs`` and
+            the ``model`` section.
+        pwg_config: Parallel WaveGAN config. Reads ``generator_params``.
+
+    Raises:
+        ValueError: if a non-empty line of ``args.text`` does not contain
+            both an utterance id and a sentence.
+    """
+    # Read the 'utt_id sentence' pairs. Only the first run of whitespace
+    # separates the id from the sentence, so the sentence itself may contain
+    # spaces. Blank lines are ignored.
+    sentences = []
+    with open(args.text, 'rt', encoding='utf-8') as f:
+        for line_no, line in enumerate(f, 1):
+            line = line.strip()
+            if not line:
+                continue
+            fields = line.split(maxsplit=1)
+            if len(fields) != 2:
+                raise ValueError(
+                    f"{args.text}:{line_no}: expected 'utt_id sentence', "
+                    f"got {line!r}")
+            utt_id, sentence = fields
+            sentences.append((utt_id, sentence))
+
+    # The vocabulary size is the number of lines in the phone dictionary.
+    with open(args.phones_dict, "r", encoding='utf-8') as f:
+        phn_id = [line.strip().split() for line in f]
+    vocab_size = len(phn_id)
+    print("vocab_size:", vocab_size)
+
+    odim = fastspeech2_config.n_mels
+    model = FastSpeech2(
+        idim=vocab_size, odim=odim, **fastspeech2_config["model"])
+
+    model.set_state_dict(
+        paddle.load(args.fastspeech2_checkpoint)["main_params"])
+    model.eval()
+
+    vocoder = PWGGenerator(**pwg_config["generator_params"])
+    vocoder.set_state_dict(paddle.load(args.pwg_checkpoint)["generator_params"])
+    vocoder.remove_weight_norm()
+    vocoder.eval()
+    print("model done!")
+
+    frontend = Frontend(phone_vocab_path=args.phones_dict)
+    print("frontend done!")
+
+    # Each stat file unpacks into a (mean, std) pair used for z-score
+    # normalization.
+    stat = np.load(args.fastspeech2_stat)
+    mu, std = stat
+    mu = paddle.to_tensor(mu)
+    std = paddle.to_tensor(std)
+    fastspeech2_normalizer = ZScore(mu, std)
+
+    stat = np.load(args.pwg_stat)
+    mu, std = stat
+    mu = paddle.to_tensor(mu)
+    std = paddle.to_tensor(std)
+    pwg_normalizer = ZScore(mu, std)
+
+    fastspeech2_inference = StyleFastSpeech2Inference(
+        fastspeech2_normalizer, model, args.fastspeech2_pitch_stat,
+        args.fastspeech2_energy_stat)
+    fastspeech2_inference.eval()
+
+    pwg_inference = PWGInference(pwg_normalizer, vocoder)
+    pwg_inference.eval()
+
+    output_dir = Path(args.output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    # Per-style overrides of the inference controls. Any control a style
+    # does not list keeps its neutral default (None, or False for `robot`).
+    style_overrides = {
+        "normal": {},
+        # all tones in phones be `1`
+        # all pitch should be the same, we use mean here
+        "robot": {"robot": True},
+        "1.2xspeed": {"durations_scale": 1 / 1.2},
+        "0.8xspeed": {"durations_scale": 1 / 0.8},
+        "child_voice": {"pitch_scale": 1.3},
+    }
+    for style, overrides in style_overrides.items():
+        controls = {
+            "durations": None,
+            "durations_scale": None,
+            "durations_bias": None,
+            "pitch": None,
+            "pitch_scale": None,
+            "pitch_bias": None,
+            "energy": None,
+            "energy_scale": None,
+            "energy_bias": None,
+            "robot": False,
+        }
+        controls.update(overrides)
+
+        sub_output_dir = output_dir / style
+        sub_output_dir.mkdir(parents=True, exist_ok=True)
+        for utt_id, sentence in sentences:
+            input_ids = frontend.get_input_ids(
+                sentence, merge_sentences=True, robot=controls["robot"])
+            phone_ids = input_ids["phone_ids"][0]
+
+            with paddle.no_grad():
+                mel = fastspeech2_inference(phone_ids, **controls)
+                wav = pwg_inference(mel)
+
+            sf.write(
+                str(sub_output_dir / (utt_id + ".wav")),
+                wav.numpy(),
+                samplerate=fastspeech2_config.fs)
+            print(f"{style}_{utt_id} done!")
+
+
+def main():
+    """Parse the command line, load both configs and run `evaluate`.
+
+    Exits through ``parser.error`` (usage message, status 2) when a required
+    path argument is missing or ``--ngpu`` is negative, instead of failing
+    later with an unrelated exception.
+    """
+    parser = argparse.ArgumentParser(
+        description="Synthesize with fastspeech2 & parallel wavegan.")
+    parser.add_argument(
+        "--fastspeech2-config",
+        type=str,
+        required=True,
+        help="fastspeech2 config file.")
+    parser.add_argument(
+        "--fastspeech2-checkpoint",
+        type=str,
+        required=True,
+        help="fastspeech2 checkpoint to load.")
+    parser.add_argument(
+        "--fastspeech2-stat",
+        type=str,
+        required=True,
+        help="mean and standard deviation used to normalize spectrogram when training fastspeech2."
+    )
+    parser.add_argument(
+        "--fastspeech2-pitch-stat",
+        type=str,
+        required=True,
+        help="mean and standard deviation used to normalize pitch when training fastspeech2"
+    )
+    parser.add_argument(
+        "--fastspeech2-energy-stat",
+        type=str,
+        required=True,
+        help="mean and standard deviation used to normalize energy when training fastspeech2."
+    )
+    parser.add_argument(
+        "--pwg-config",
+        type=str,
+        required=True,
+        help="parallel wavegan config file.")
+    parser.add_argument(
+        "--pwg-checkpoint",
+        type=str,
+        required=True,
+        help="parallel wavegan generator parameters to load.")
+    parser.add_argument(
+        "--pwg-stat",
+        type=str,
+        required=True,
+        help="mean and standard deviation used to normalize spectrogram when training parallel wavegan."
+    )
+    parser.add_argument(
+        "--phones-dict",
+        type=str,
+        default="phone_id_map.txt",
+        help="phone vocabulary file.")
+    parser.add_argument(
+        "--text",
+        type=str,
+        required=True,
+        help="text to synthesize, a 'utt_id sentence' pair per line.")
+    parser.add_argument(
+        "--output-dir", type=str, required=True, help="output dir.")
+    parser.add_argument(
+        "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.")
+    parser.add_argument("--verbose", type=int, default=1, help="verbose.")
+
+    args = parser.parse_args()
+
+    # A negative device count is a usage error: stop here rather than
+    # continuing with no device selected.
+    if args.ngpu < 0:
+        parser.error("ngpu should >= 0 !")
+    paddle.set_device("cpu" if args.ngpu == 0 else "gpu")
+
+    with open(args.fastspeech2_config) as f:
+        fastspeech2_config = CfgNode(yaml.safe_load(f))
+    with open(args.pwg_config) as f:
+        pwg_config = CfgNode(yaml.safe_load(f))
+
+    print("========Args========")
+    print(yaml.safe_dump(vars(args)))
+    print("========Config========")
+    print(fastspeech2_config)
+    print(pwg_config)
+
+    evaluate(args, fastspeech2_config, pwg_config)
+
+
+if __name__ == "__main__":
+    main()
--- a/demos/tts_hub/README.md
+++ b/demos/tts_hub/README.md
+# TTS
+
+中文：
+```shell
+CUDA_VISIBLE_DEVICES=0 ./run.sh 用科技让复杂的世界更简单 . zh
+```
+
+英文：
+```shell
+CUDA_VISIBLE_DEVICES=0 ./run.sh "Text to speech system converts normal language text into speech." . en
+```
--- a/demos/tts_hub/hub_infer.py
+++ b/demos/tts_hub/hub_infer.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import os
+
+import paddle
+import paddlehub as hub
+
+# yapf: disable
+# Command-line interface. Parsing happens at module level, so `args` is a
+# module global that the __main__ section below reads.
+parser = argparse.ArgumentParser(
+    description="Synthesize speech from text with a PaddleHub fastspeech2 module.")
+parser.add_argument("--lang", type=str, default='zh', choices=['zh', 'en'])
+parser.add_argument("--device", type=str, default='gpu', choices=['cpu', 'gpu'])
+# nargs='+' lets an unquoted multi-word text arrive as several arguments.
+parser.add_argument("--text", type=str, nargs='+', required=True)
+parser.add_argument("--output_dir", type=str, required=True)
+args = parser.parse_args()
+# yapf: enable
+
+if __name__ == '__main__':
+    paddle.set_device(args.device)
+
+    output_dir = os.path.abspath(os.path.expanduser(args.output_dir))
+    # 'zh' selects the fastspeech2_baker module, 'en' selects fastspeech2_ljspeech.
+    if args.lang == 'zh':
+        module_name = 'fastspeech2_baker'
+    else:
+        module_name = 'fastspeech2_ljspeech'
+    t2s_model = hub.Module(name=module_name, output_dir=output_dir)
+
+    # --text is collected as a list of words; join them back into one string.
+    if isinstance(args.text, list):
+        args.text = ' '.join(args.text)
+
+    wavs = t2s_model.generate([args.text], device=args.device)
+    print('[T2S]Wav file has been generated: {}'.format(wavs[0]))
--- a/demos/tts_hub/run.sh
+++ b/demos/tts_hub/run.sh
+#!/bin/bash
+#
+# Synthesize speech for one piece of text through hub_infer.py.
+#
+# Usage: CUDA_VISIBLE_DEVICES=0 ./run.sh text output_dir [lang]
+#   text        text to synthesize (quote it when it contains spaces)
+#   output_dir  directory handed to hub_infer.py; created when missing
+#   lang        zh (default) or en
+#
+# The device is gpu when CUDA_VISIBLE_DEVICES lists at least one id and cpu
+# otherwise. The exit status is that of hub_infer.py.
+
+# Validate the arguments before doing anything with side effects.
+if [[ $# -ne 2 && $# -ne 3 ]]; then
+    echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} text output_dir [lang]" >&2
+    exit 1
+fi
+
+text=$1
+output_dir=$2
+lang=${3:-zh}
+
+# Use the same interpreter for the import check, the install and the run.
+if python3 -c "import paddlehub" &> /dev/null; then
+    echo 'PaddleHub has already been installed.'
+else
+    echo 'Installing PaddleHub...'
+    python3 -m pip install paddlehub -U || exit 1
+fi
+
+# Count the comma-separated device ids; an empty or unset variable gives 0.
+ngpu=$(printf '%s\n' "${CUDA_VISIBLE_DEVICES:-}" | awk -F "," '{print NF}')
+if [[ "${ngpu}" -eq 0 ]]; then
+    device=cpu
+else
+    device=gpu
+fi
+
+echo "using ${device}..."
+
+mkdir -p -- "${output_dir}" || exit 1
+
+# Locate hub_infer.py next to this script so the current directory does not
+# matter.
+script_dir=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd) || exit 1
+
+# --text=VALUE passes the text as exactly one argument, with no literal quote
+# characters added, and also works for text that starts with '-'.
+python3 -u "${script_dir}/hub_infer.py" \
+    --lang "${lang}" \
+    --device "${device}" \
+    --text="${text}" \
+    --output_dir "${output_dir}"
--- a/docs/images/PaddleSpeech_log.png
+++ b/docs/images/PaddleSpeech_log.png
--- a/docs/source/asr/models_introduction.md
+++ b/docs/source/asr/models_introduction.md
@@ -13,7 +13,7 @@ In addition, the training process and the testing process are also introduced.
 The arcitecture of the model is shown in Fig.1.

 <p align="center">
-    <img src="https://raw.githubusercontent.com/PaddlePaddle/DeepSpeech/develop/docs/images/ds2onlineModel.png" width=800>
+    <img src="https://raw.githubusercontent.com/PaddlePaddle/PaddleSpeech/develop/docs/images/ds2onlineModel.png" width=800>
    <br/>Fig.1 The Arcitecture of deepspeech2 online model
 </p>

@@ -160,7 +160,7 @@ The deepspeech2 offline model is similarity to the deepspeech2 online model. The

 The arcitecture of the model is shown in Fig.2.
 <p align="center">
-    <img src="https://raw.githubusercontent.com/PaddlePaddle/DeepSpeech/develop/docs/images/ds2offlineModel.png" width=800>
+    <img src="https://raw.githubusercontent.com/PaddlePaddle/PaddleSpeech/develop/docs/images/ds2offlineModel.png" width=800>
    <br/>Fig.2 The Arcitecture of deepspeech2 offline model
 </p>


--- a/docs/source/dependencies.md
+++ b/docs/source/dependencies.md
+# The Dependencies
+
+## By apt-get
+
+### The base dependencies:
+
+```
+bc flac jq vim tig tree pkg-config libsndfile1 libflac-dev libvorbis-dev libboost-dev swig python3-dev
+```
+
+### The dependencies of kenlm:
+
+```  
+build-essential cmake libboost-system-dev libboost-thread-dev libboost-program-options-dev libboost-test-dev libeigen3-dev zlib1g-dev libbz2-dev liblzma-dev gcc-5 g++-5
+```
+
+### The dependencies of sox:
+
+```
+libvorbis-dev libmp3lame-dev libmad-ocaml-dev
+```  
+
+
+## By make or setup
+
+```  
+kenlm
+sox
+mfa
+openblas
+kaldi
+sctk
+AutoLog
+swig-decoder
+python_kaldi_features
+```
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -23,7 +23,7 @@ Contents
   
 .. toctree::
   :maxdepth: 1
-   :caption: Speech-To-Text
+   :caption: Speech-to-Text

   asr/models_introduction
   asr/data_preparation
@@ -33,7 +33,7 @@ Contents

 .. toctree::
   :maxdepth: 1
-   :caption: Text-To-Speech
+   :caption: Text-to-Speech

   tts/basic_usage
   tts/advanced_usage

--- a/docs/source/install.md
+++ b/docs/source/install.md
@@ -6,52 +6,38 @@ To avoid the trouble of environment setup, [running in Docker container](#runnin
 - Python >= 3.7
 - PaddlePaddle latest version (please refer to the [Installation Guide](https://www.paddlepaddle.org.cn/documentation/docs/en/beginners_guide/index_en.html))

-## Setup (Important)
-
- Make sure these libraries or tools installed: `pkg-config`, `flac`, `ogg`, `vorbis`, `boost`, `sox`, and `swig`, e.g. installing them via `apt-get`:
-
-```bash
-sudo apt-get install -y sox pkg-config libflac-dev libogg-dev libvorbis-dev libboost-dev swig python3-dev
-```
-The version of `swig` should >= 3.0
-
-or, installing them via `yum`:
-
-```bash
-sudo yum install pkgconfig libogg-devel libvorbis-devel boost-devel python3-devel
-wget https://ftp.osuosl.org/pub/xiph/releases/flac/flac-1.3.1.tar.xz
-xz -d flac-1.3.1.tar.xz
-tar -xvf flac-1.3.1.tar
-cd flac-1.3.1
-./configure
-make
-make install
-```
+## Simple Setup

- Run the setup script for the remaining dependencies
+For users working on `Ubuntu` with `root` privilege.

-```bash
+```bash
 git clone https://github.com/PaddlePaddle/DeepSpeech.git
 cd DeepSpeech
-pushd tools; make virtualenv.done:; popd
-source tools/venv/bin/activate
 pip install -e .
 ```

- Source venv before do experiment.
+For users who only need the basic functions of paddlespeech, installing with conda is recommended.
+You can go to [miniconda](https://docs.conda.io/en/latest/miniconda.html) to select a version and install it yourself, or you can use the script below to install the latest miniconda version.

-```bash
-source tools/venv/bin/activate
+```bash
+pushd tools
+bash extras/install_miniconda.sh
+popd
+bash
 ```

-## Simple Setup
-
+After installing conda, run `setup.sh` to complete the installation.
 ```python
-git clone https://github.com/PaddlePaddle/DeepSpeech.git
-cd DeepSpeech
-pip install -e .
+bash setup.sh
 ```

+
+## Setup (Other Platform)
+
+- Make sure the libraries and tools listed in [dependencies](./dependencies.md) are installed. For more information, see `setup.py` and `tools/Makefile`.
+- The version of `swig` should be >= 3.0.
+- We will do more to simplify the installation process.
+
 ## Running in Docker Container (optional)

 Docker is an open source tool to build, ship, and run distributed applications in an isolated environment. A Docker image for this project has been provided in [hub.docker.com](https://hub.docker.com) with all the dependencies installed. This Docker image requires the support of NVIDIA GPU, so please make sure its availiability and the [nvidia-docker](https://github.com/NVIDIA/nvidia-docker) has been installed.

--- a/docs/source/introduction.md
+++ b/docs/source/introduction.md
 # PaddleSpeech

 ## What is PaddleSpeech?
-PaddleSpeech is an open-source toolkit on PaddlePaddle platform for two critical tasks in Speech -  Speech-To-Text (Automatic Speech Recognition, ASR) and Text-To-Speech Synthesis (TTS), with modules involving state-of-art and influential models.
+PaddleSpeech is an open-source toolkit on PaddlePaddle platform for two critical tasks in Speech -  Speech-to-Text (Automatic Speech Recognition, ASR) and Text-to-Speech Synthesis (TTS), with modules involving state-of-art and influential models.

 ## What can PaddleSpeech do?

-### Speech-To-Text
-(An introduce of ASR in PaddleSpeech is needed here!)
+### Speech-to-Text
+PaddleSpeech ASR mainly consists of components below:
+- Implementation of models and commonly used neural network layers.
+- Dataset abstraction and common data preprocessing pipelines.
+- Ready-to-run experiments.
+
+PaddleSpeech ASR provides you with a complete ASR pipeline, including:
+- Data Preparation
+    - Build vocabulary
+    - Compute Cepstral mean and variance normalization (CMVN)
+    - Feature extraction
+        - linear
+        - fbank (also support kaldi feature)
+        - mfcc
+- Acoustic Models
+    - Deepspeech2 (Streaming and Non-Streaming)
+    - Transformer (Streaming and Non-Streaming)
+    - Conformer (Streaming and Non-Streaming)
+- Decoder
+    - ctc greedy search (used in DeepSpeech2, Transformer and Conformer)
+    - ctc beam search (used in DeepSpeech2, Transformer and Conformer)
+    - attention decoding (used in Transformer and Conformer)
+    - attention rescoring (used in Transformer and Conformer)
+
+Speech-to-Text helps you train ASR models very simply.

-### Text-To-Speech
+### Text-to-Speech
 TTS mainly consists of components below:
 - Implementation of models and commonly used neural network layers.
 - Dataset abstraction and common data preprocessing pipelines.
@@ -30,4 +53,4 @@ PaddleSpeech TTS provides you with a complete TTS pipeline, including:
    - Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis
    - GE2E

-Text-To-Speech  helps you to train TTS models with simple commands.
+Text-to-Speech  helps you to train TTS models with simple commands.
--- a/docs/source/reference.md
+++ b/docs/source/reference.md
 # Reference

-We borrowed a lot of code from these repos to build `model` and `engine`, thank for these great work:
+We borrowed a lot of code from these repos to build `model` and `engine`, thanks to these great works and the open-source community!

 * [espnet](https://github.com/espnet/espnet/blob/master/LICENSE)
 - Apache-2.0 License

--- a/docs/source/released_model.md
+++ b/docs/source/released_model.md
+
 # Released Models

-## Speech-To-Text Models
+## Speech-to-Text Models
 ### Acoustic Model Released in paddle 2.X
-Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER | Hours of speech
-:-------------:| :------------:| :-----: | -----: | :----------------- |:--------- | :---------- | :---------
-[Ds2 Online Aishell Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s0/aishell.s0.ds_online.5rnn.debug.tar.gz) | Aishell Dataset | Char-based | 345 MB  | 2 Conv + 5 LSTM layers with only forward direction | 0.0824 |-| 151 h
-[Ds2 Offline Aishell Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s0/aishell.s0.ds2.offline.cer6p65.release.tar.gz)| Aishell Dataset | Char-based | 306 MB | 2 Conv + 3 bidirectional GRU layers| 0.065 |-| 151 h
-[Conformer Online Aishell Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s1/aishell.chunk.release.tar.gz) | Aishell Dataset | Char-based | 283 MB  | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention + CTC | 0.0594 |-| 151 h
-[Conformer Offline Aishell Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s1/aishell.release.tar.gz) | Aishell Dataset | Char-based | 284 MB  | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention | 0.0547 |-| 151 h
-[Conformer Librispeech Model](https://deepspeech.bj.bcebos.com/release2.1/librispeech/s1/conformer.release.tar.gz) | Librispeech Dataset | Word-based | 287 MB  | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention |-| 0.0325 | 960 h
-[Transformer Librispeech Model](https://deepspeech.bj.bcebos.com/release2.1/librispeech/s1/transformer.release.tar.gz) | Librispeech Dataset | Word-based | 195 MB  | Encoder:Transformer, Decoder:Transformer, Decoding method: Attention |-| 0.0544 | 960 h
+Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER | Hours of speech | example link
+:-------------:| :------------:| :-----: | -----: | :----------------- |:--------- | :---------- | :--------- | :-----------
+[Ds2 Online Aishell S0 Model](https://deepspeech.bj.bcebos.com/release2.2/aishell/s0/ds2_online_aishll_CER8.02_release.tar.gz) | Aishell Dataset | Char-based | 345 MB  | 2 Conv + 5 LSTM layers with only forward direction | 0.080218 |-| 151 h | [Ds2 Online Aishell S0 Example](../../examples/aishell/s0)
+[Ds2 Offline Aishell S0 Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s0/aishell.s0.ds2.offline.cer6p65.release.tar.gz)| Aishell Dataset | Char-based | 306 MB | 2 Conv + 3 bidirectional GRU layers| 0.065 |-| 151 h | [Ds2 Offline Aishell S0 Example](../../examples/aishell/s0)
+[Conformer Online Aishell S1 Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s1/aishell.chunk.release.tar.gz) | Aishell Dataset | Char-based | 283 MB  | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0594 |-| 151 h | [Conformer Online Aishell S1 Example](../../examples/aishell/s1)
+[Conformer Offline Aishell S1 Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s1/aishell.release.tar.gz) | Aishell Dataset | Char-based | 284 MB  | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0547 |-| 151 h | [Conformer Offline Aishell S1 Example](../../examples/aishell/s1)
+[Conformer Librispeech S1 Model](https://deepspeech.bj.bcebos.com/release2.1/librispeech/s1/conformer.release.tar.gz) | Librispeech Dataset | subword-based | 287 MB  | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring |-| 0.0325 | 960 h | [Conformer Librispeech S1 example](../../examples/librispeech/s1)
+[Transformer Librispeech S1 Model](https://deepspeech.bj.bcebos.com/release2.2/librispeech/s1/librispeech.s1.transformer.all.wer5p62.release.tar.gz) | Librispeech Dataset | subword-based | 131 MB  | Encoder:Transformer, Decoder:Transformer, Decoding method: Attention rescoring |-| 0.0456 | 960 h | [Transformer Librispeech S1 example](../../examples/librispeech/s1)
+[Transformer Librispeech S2 Model](https://deepspeech.bj.bcebos.com/release2.2/librispeech/s2/libri_transformer_espnet_wer3p84.release.tar.gz) | Librispeech Dataset | subword-based | 131 MB  | Encoder:Transformer, Decoder:Transformer, Decoding method: Attention |-| 0.0384 | 960 h | [Transformer Librispeech S2 example](../../examples/librispeech/s2)

 ### Acoustic Model Transformed from paddle 1.8
 Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER | Hours of speech
@@ -26,30 +28,31 @@ Language Model | Training Data | Token-based | Size | Descriptions
 [Mandarin LM Small](https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm) | Baidu Internal Corpus | Char-based | 2.8 GB | Pruned with 0 1 2 4 4; <br/> About 0.13 billion n-grams; <br/> 'probing' binary with default settings
 [Mandarin LM Large](https://deepspeech.bj.bcebos.com/zh_lm/zhidao_giga.klm) | Baidu Internal Corpus | Char-based | 70.4 GB | No Pruning; <br/> About 3.7 billion n-grams; <br/> 'probing' binary with default settings

-## Text-To-Speech Models
+## Text-to-Speech Models
 ### Acoustic Models
-Model Type | Dataset| Example Link | Pretrained Models
-:-------------:| :------------:| :-----: | :-----
-Tacotron2|LJSpeech|[tacotron2-vctk](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/ljspeech/tts0)|[tacotron2_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_ckpt_0.3.zip)
-TransformerTTS| LJSpeech| [transformer-ljspeech](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/ljspeech/tts1)|[transformer_tts_ljspeech_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/transformer_tts_ljspeech_ckpt_0.4.zip)
-SpeedySpeech| CSMSC | [speedyspeech-csmsc](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/csmsc/tts2) |[speedyspeech_nosil_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/speedyspeech_nosil_baker_ckpt_0.5.zip)
-FastSpeech2| CSMSC |[fastspeech2-csmsc](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/csmsc/tts3)|[fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_ckpt_0.4.zip)
-FastSpeech2| AISHELL-3 |[fastspeech2-aishell3](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/aishell3/tts3)|[fastspeech2_nosil_aishell3_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_aishell3_ckpt_0.4.zip)
-FastSpeech2| LJSpeech |[fastspeech2-ljspeech](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/ljspeech/tts3)|[fastspeech2_nosil_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_ljspeech_ckpt_0.5.zip)
-FastSpeech2| VCTK |[fastspeech2-csmsc](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/vctk/tts3)|[fastspeech2_nosil_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_vctk_ckpt_0.5.zip)
-
+Model Type | Dataset| Example Link | Pretrained Models|Static Models|Size(static)
+:-------------:| :------------:| :-----: | :-----:| :-----:| :-----:
+Tacotron2|LJSpeech|[tacotron2-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts0)|[tacotron2_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_ckpt_0.3.zip)|||
+TransformerTTS| LJSpeech| [transformer-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts1)|[transformer_tts_ljspeech_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/transformer_tts_ljspeech_ckpt_0.4.zip)|||
+SpeedySpeech| CSMSC | [speedyspeech-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts2) |[speedyspeech_nosil_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/speedyspeech_nosil_baker_ckpt_0.5.zip)|[speedyspeech_nosil_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/speedyspeech_nosil_baker_static_0.5.zip)|12MB|
+FastSpeech2| CSMSC |[fastspeech2-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts3)|[fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_ckpt_0.4.zip)|[fastspeech2_nosil_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_static_0.4.zip)|157MB|
+FastSpeech2| AISHELL-3 |[fastspeech2-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/tts3)|[fastspeech2_nosil_aishell3_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_aishell3_ckpt_0.4.zip)|||
+FastSpeech2| LJSpeech |[fastspeech2-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts3)|[fastspeech2_nosil_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_ljspeech_ckpt_0.5.zip)|||
+FastSpeech2| VCTK |[fastspeech2-vctk](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/tts3)|[fastspeech2_nosil_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_vctk_ckpt_0.5.zip)|||

 ### Vocoders

-Model Type | Dataset| Example Link | Pretrained Models
-:-------------:| :------------:| :-----: | :-----
-WaveFlow| LJSpeech |[waveflow-ljspeech](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/ljspeech/voc0)|[waveflow_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_ljspeech_ckpt_0.3.zip)
-Parallel WaveGAN| CSMSC |[PWGAN-csmsc](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/csmsc/voc1)|[pwg_baker_ckpt_0.4.zip.](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip)
-Parallel WaveGAN| LJSpeech |[PWGAN-ljspeech](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/ljspeech/voc1)|[pwg_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_ljspeech_ckpt_0.5.zip)
-Parallel WaveGAN| VCTK |[PWGAN-vctk](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/vctk/voc1)|[pwg_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_vctk_ckpt_0.5.zip)
+Model Type | Dataset| Example Link | Pretrained Models| Static Models|Size(static)
+:-------------:| :------------:| :-----: | :-----:| :-----:| :-----:
+WaveFlow| LJSpeech |[waveflow-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc0)|[waveflow_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_ljspeech_ckpt_0.3.zip)|||
+Parallel WaveGAN| CSMSC |[PWGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc1)|[pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip)|[pwg_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_static_0.4.zip)|5.1MB|
+Parallel WaveGAN| LJSpeech |[PWGAN-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc1)|[pwg_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_ljspeech_ckpt_0.5.zip)|||
+Parallel WaveGAN|AISHELL-3 |[PWGAN-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/voc1)|[pwg_aishell3_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_aishell3_ckpt_0.5.zip)|||
+Parallel WaveGAN| VCTK |[PWGAN-vctk](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/voc1)|[pwg_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_vctk_ckpt_0.5.zip)|||
+|Multi Band MelGAN |CSMSC|[MB MelGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc3) | [mb_melgan_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/mb_melgan_baker_ckpt_0.5.zip)|[mb_melgan_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/mb_melgan_baker_static_0.5.zip) |8.2MB|

 ### Voice Cloning
 Model Type | Dataset| Example Link | Pretrained Models
-:-------------:| :------------:| :-----: | :-----
-GE2E| AISHELL-3, etc. |[ge2e](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/other/ge2e)|[ge2e_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/ge2e_ckpt_0.3.zip)
-GE2E + Tactron2| AISHELL-3 |[ge2e-tactron2-aishell3](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/aishell3/vc0)|[tacotron2_aishell3_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_aishell3_ckpt_0.3.zip)
+:-------------:| :------------:| :-----: | :-----:
+GE2E| AISHELL-3, etc. |[ge2e](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/ge2e)|[ge2e_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/ge2e_ckpt_0.3.zip)
+GE2E + Tacotron2| AISHELL-3 |[ge2e-tacotron2-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/vc0)|[tacotron2_aishell3_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_aishell3_ckpt_0.3.zip)
--- a/docs/source/tts/demo.rst
+++ b/docs/source/tts/demo.rst
--- a/docs/source/tts/gan_vocoder.md
+++ b/docs/source/tts/gan_vocoder.md
@@ -6,4 +6,4 @@ Model  | Generator Loss |Discriminator Loss
 Parallel Wave GAN| adversial loss <br> Feature Matching  | Multi-Scale Discriminator |
 Mel GAN |adversial loss <br> Multi-resolution STFT loss  | adversial loss|
 Multi-Band Mel GAN | adversial loss <br> full band Multi-resolution STFT loss <br> sub band Multi-resolution STFT loss |Multi-Scale Discriminator|
-HiFi GAN |adversial loss <br> Feature Matching <br>  Mel-Spectrogram Loss | Multi-Scale Discriminator <br> Multi-Period Discriminato  |
+HiFi GAN |adversial loss <br> Feature Matching <br>  Mel-Spectrogram Loss | Multi-Scale Discriminator <br> Multi-Period Discriminator|
--- a/docs/source/tts/models_introduction.md
+++ b/docs/source/tts/models_introduction.md
@@ -27,14 +27,14 @@ At present, there are two mainstream acoustic model structures.
   - Acoustic decoder (N Frames - > N Frames).

 <div align="left">
-  <img src="https://raw.githubusercontent.com/PaddlePaddle/DeepSpeech/develop/docs/images/frame_level_am.png" width=500 /> <br>
+  <img src="https://raw.githubusercontent.com/PaddlePaddle/PaddleSpeech/develop/docs/images/frame_level_am.png" width=500 /> <br>
 </div>

 - Sequence to sequence acoustic model:
    - M Tokens - > N Frames.

 <div align="left">
-  <img src="https://raw.githubusercontent.com/PaddlePaddle/DeepSpeech/develop/docs/images/seq2seq_am.png" width=500 /> <br>
+  <img src="https://raw.githubusercontent.com/PaddlePaddle/PaddleSpeech/develop/docs/images/seq2seq_am.png" width=500 /> <br>
 </div>

 ### Tacotron2
@@ -54,7 +54,7 @@ At present, there are two mainstream acoustic model structures.
    - CBHG postprocess.
    - Vocoder: Griffin-Lim.
 <div align="left">
-  <img src="https://raw.githubusercontent.com/PaddlePaddle/DeepSpeech/develop/docs/images/tacotron.png" width=700 /> <br>
+  <img src="https://raw.githubusercontent.com/PaddlePaddle/PaddleSpeech/develop/docs/images/tacotron.png" width=700 /> <br>
 </div>

 **Advantage of Tacotron:**
@@ -89,10 +89,10 @@ At present, there are two mainstream acoustic model structures.
   - The alignment matrix of previous time is considered at the step `t` of decoder.

 <div align="left">
-  <img src="https://raw.githubusercontent.com/PaddlePaddle/DeepSpeech/develop/docs/images/tacotron2.png" width=500 /> <br>
+  <img src="https://raw.githubusercontent.com/PaddlePaddle/PaddleSpeech/develop/docs/images/tacotron2.png" width=500 /> <br>
 </div>

-You can find PaddleSpeech TTS's tacotron2 with LJSpeech dataset example at [examples/ljspeech/tts0](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/ljspeech/tts0).
+You can find PaddleSpeech TTS's tacotron2 with LJSpeech dataset example at [examples/ljspeech/tts0](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts0).

 ### TransformerTTS
 **Disadvantages of the Tacotrons:**
@@ -118,7 +118,7 @@ Transformer TTS is a combination of Tacotron2 and Transformer.
    - Positional Encoding.

 <div align="left">
-  <img src="https://raw.githubusercontent.com/PaddlePaddle/DeepSpeech/develop/docs/images/transformer.png" width=500 /> <br>
+  <img src="https://raw.githubusercontent.com/PaddlePaddle/PaddleSpeech/develop/docs/images/transformer.png" width=500 /> <br>
 </div>

 #### Transformer TTS
@@ -138,7 +138,7 @@ Transformer TTS is a seq2seq acoustic model based on Transformer and Tacotron2.
    - Uniform scale position encoding may have a negative impact on input or output sequences.

 <div align="left">
-  <img src="https://raw.githubusercontent.com/PaddlePaddle/DeepSpeech/develop/docs/images/transformer_tts.png" width=500 /> <br>
+  <img src="https://raw.githubusercontent.com/PaddlePaddle/PaddleSpeech/develop/docs/images/transformer_tts.png" width=500 /> <br>
 </div>

 **Disadvantages of Transformer TTS:**
@@ -146,7 +146,7 @@ Transformer TTS is a seq2seq acoustic model based on Transformer and Tacotron2.
 - The ability to perceive local information is weak, and local information is more related to pronunciation.
 - Stability is worse than Tacotron2.

-You can find PaddleSpeech TTS's Transformer TTS with LJSpeech dataset example at [examples/ljspeech/tts1](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/ljspeech/tts1).
+You can find PaddleSpeech TTS's Transformer TTS with LJSpeech dataset example at [examples/ljspeech/tts1](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts1).


 ### FastSpeech2
@@ -184,14 +184,14 @@ Instead of using the encoder-attention-decoder based architecture as adopted by
 • Can be generated in parallel (decoding time is less affected by sequence length)

 <div align="left">
-  <img src="https://raw.githubusercontent.com/PaddlePaddle/DeepSpeech/develop/docs/images/fastspeech.png" width=800 /> <br>
+  <img src="https://raw.githubusercontent.com/PaddlePaddle/PaddleSpeech/develop/docs/images/fastspeech.png" width=800 /> <br>
 </div>

 #### FastPitch
 [FastPitch](https://arxiv.org/abs/2006.06873) follows FastSpeech. A single pitch value is predicted for every temporal location, which improves the overall quality of synthesized speech.

 <div align="left">
-  <img src="https://raw.githubusercontent.com/PaddlePaddle/DeepSpeech/develop/docs/images/fastpitch.png" width=500 /> <br>
+  <img src="https://raw.githubusercontent.com/PaddlePaddle/PaddleSpeech/develop/docs/images/fastpitch.png" width=500 /> <br>
 </div>

 #### FastSpeech2
@@ -209,10 +209,10 @@ Instead of using the encoder-attention-decoder based architecture as adopted by
 FastSpeech2 is similar to FastPitch but introduces more variation information of speech.

 <div align="left">
-  <img src="https://raw.githubusercontent.com/PaddlePaddle/DeepSpeech/develop/docs/images/fastspeech2.png" width=800 /> <br>
+  <img src="https://raw.githubusercontent.com/PaddlePaddle/PaddleSpeech/develop/docs/images/fastspeech2.png" width=800 /> <br>
 </div>

-You can find PaddleSpeech TTS's FastSpeech2/FastPitch with CSMSC dataset example at [examples/csmsc/tts3](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/csmsc/tts3), We use token-averaged pitch and energy values introduced in FastPitch rather than frame level ones in FastSpeech2.
+You can find PaddleSpeech TTS's FastSpeech2/FastPitch with CSMSC dataset example at [examples/csmsc/tts3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts3), We use token-averaged pitch and energy values introduced in FastPitch rather than frame level ones in FastSpeech2.

 ### SpeedySpeech
 [SpeedySpeech](https://arxiv.org/abs/2008.03802) simplify the teacher-student architecture of FastSpeech and provide a fast and stable training procedure.
@@ -223,10 +223,10 @@ You can find PaddleSpeech TTS's FastSpeech2/FastPitch with CSMSC dataset example
 - Describe a simple data augmentation technique that can be used early in the training to make the teacher network robust to sequential error propagation.

 <div align="left">
-  <img src="https://raw.githubusercontent.com/PaddlePaddle/DeepSpeech/develop/docs/images/speedyspeech.png" width=500 /> <br>
+  <img src="https://raw.githubusercontent.com/PaddlePaddle/PaddleSpeech/develop/docs/images/speedyspeech.png" width=500 /> <br>
 </div>

-You can find PaddleSpeech TTS's SpeedySpeech with CSMSC dataset example at [examples/csmsc/tts2](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/csmsc/tts2).
+You can find PaddleSpeech TTS's SpeedySpeech with CSMSC dataset example at [examples/csmsc/tts2](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts2).

 ## Vocoders
 In speech synthesis, the main task of the vocoder is to convert the spectral parameters predicted by the acoustic model into the final speech waveform.
@@ -276,7 +276,7 @@ Here, we introduce a Flow-based vocoder WaveFlow and a GAN-based vocoder Paralle
 - It is a small-footprint flow-based model for raw audio. It has only 5.9M parameters, which is 15x smalller than WaveGlow (87.9M).
 - It is directly trained with maximum likelihood without probability density distillation and auxiliary losses as used in [Parallel WaveNet](https://arxiv.org/abs/1711.10433) and [ClariNet](https://openreview.net/pdf?id=HklY120cYm), which simplifies the training pipeline and reduces the cost of development.

-You can find PaddleSpeech TTS's WaveFlow with LJSpeech dataset example at [examples/ljspeech/voc0](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/ljspeech/voc0).
+You can find PaddleSpeech TTS's WaveFlow with LJSpeech dataset example at [examples/ljspeech/voc0](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc0).

 ### Parallel WaveGAN
 [Parallel WaveGAN](https://arxiv.org/abs/1910.11480) trains a non-autoregressive WaveNet variant as a generator in a GAN based training method.
@@ -289,7 +289,7 @@ You can find PaddleSpeech TTS's WaveFlow with LJSpeech dataset example at [examp
 - Multi-resolution STFT loss.

 <div align="left">
-  <img src="https://raw.githubusercontent.com/PaddlePaddle/DeepSpeech/develop/docs/images/pwg.png" width=600 /> <br>
+  <img src="https://raw.githubusercontent.com/PaddlePaddle/PaddleSpeech/develop/docs/images/pwg.png" width=600 /> <br>
 </div>

-You can find PaddleSpeech TTS's Parallel WaveGAN with CSMSC example at [examples/csmsc/voc1](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/csmsc/voc1).
+You can find PaddleSpeech TTS's Parallel WaveGAN with CSMSC example at [examples/csmsc/voc1](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc1).
--- a/docs/source/tts/quick_start.md
+++ b/docs/source/tts/quick_start.md
@@ -18,7 +18,7 @@ The models in PaddleSpeech TTS have the following mapping relationship:

 ## Quick Start

-Let's take a FastSpeech2 + Parallel WaveGAN with CSMSC dataset for instance. (./examples/csmsc/)(https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/csmsc)
+Let's take a FastSpeech2 + Parallel WaveGAN with CSMSC dataset for instance. [./examples/csmsc/](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc)

 ### Train Parallel WaveGAN with CSMSC
 - Go to directory

--- a/docs/source/tts/zh_text_frontend.md
+++ b/docs/source/tts/zh_text_frontend.md
 # Chinese Rule Based Text Frontend
-A TTS system mainly includes three modules: `Text Frontend`, `Acoustic model` and `Vocoder`. We provide a complete Chinese text frontend module in PaddleSpeech TTS, see exapmle in [examples/other/text_frontend/](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/other/text_frontend).
+A TTS system mainly includes three modules: `Text Frontend`, `Acoustic model` and `Vocoder`. We provide a complete Chinese text frontend module in PaddleSpeech TTS, see examples in [examples/other/tn](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/tn) and [examples/other/g2p](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/g2p).

 A text frontend module mainly includes:
 - Text Segmentation

--- a/docs/topic/ctc/ctc_loss.ipynb
+++ b/docs/topic/ctc/ctc_loss.ipynb
--- a/docs/topic/ctc/img/ctc_loss_alpha_definition.png
+++ b/docs/topic/ctc/img/ctc_loss_alpha_definition.png
--- a/docs/topic/ctc/img/ctc_loss_alpha_recurse.png
+++ b/docs/topic/ctc/img/ctc_loss_alpha_recurse.png
--- a/docs/topic/ctc/img/ctc_loss_alpha_recurse_2.png
+++ b/docs/topic/ctc/img/ctc_loss_alpha_recurse_2.png
--- a/docs/topic/ctc/img/ctc_loss_backward_1.png
+++ b/docs/topic/ctc/img/ctc_loss_backward_1.png
--- a/docs/topic/ctc/img/ctc_loss_backward_2.png
+++ b/docs/topic/ctc/img/ctc_loss_backward_2.png
--- a/docs/topic/ctc/img/ctc_loss_backward_3.png
+++ b/docs/topic/ctc/img/ctc_loss_backward_3.png
--- a/docs/topic/ctc/img/ctc_loss_backward_recurse.png
+++ b/docs/topic/ctc/img/ctc_loss_backward_recurse.png
--- a/docs/topic/ctc/img/ctc_loss_cat_lattice.png
+++ b/docs/topic/ctc/img/ctc_loss_cat_lattice.png
--- a/docs/topic/ctc/img/ctc_loss_forward_backward.png
+++ b/docs/topic/ctc/img/ctc_loss_forward_backward.png
--- a/docs/topic/ctc/img/ctc_loss_forward_backward_to_loss.png
+++ b/docs/topic/ctc/img/ctc_loss_forward_backward_to_loss.png
--- a/docs/topic/ctc/img/ctc_loss_forward_loss.png
+++ b/docs/topic/ctc/img/ctc_loss_forward_loss.png
--- a/docs/topic/ctc/img/ctc_loss_gradient_of_y_hat.png
+++ b/docs/topic/ctc/img/ctc_loss_gradient_of_y_hat.png
--- a/docs/topic/ctc/img/ctc_loss_gradient_with_y.png
+++ b/docs/topic/ctc/img/ctc_loss_gradient_with_y.png
--- a/docs/topic/ctc/img/ctc_loss_prob_l_x.png
+++ b/docs/topic/ctc/img/ctc_loss_prob_l_x.png
--- a/docs/topic/ctc/img/ctc_loss_prob_pi_x.png
+++ b/docs/topic/ctc/img/ctc_loss_prob_pi_x.png
--- a/docs/topic/ctc/img/ctc_loss_rescale_loss.png
+++ b/docs/topic/ctc/img/ctc_loss_rescale_loss.png
--- a/docs/tutorial/tts/source/fastpitch.png
+++ b/docs/tutorial/tts/source/fastpitch.png
--- a/docs/tutorial/tts/source/fastspeech2.png
+++ b/docs/tutorial/tts/source/fastspeech2.png
--- a/docs/tutorial/tts/source/frog_prince.jpg
+++ b/docs/tutorial/tts/source/frog_prince.jpg
--- a/docs/tutorial/tts/source/ocr.wav
+++ b/docs/tutorial/tts/source/ocr.wav
--- a/docs/tutorial/tts/source/ocr_result.jpg
+++ b/docs/tutorial/tts/source/ocr_result.jpg
--- a/docs/tutorial/tts/source/pwgan.png
+++ b/docs/tutorial/tts/source/pwgan.png
--- a/docs/tutorial/tts/source/signal_pipeline.png
+++ b/docs/tutorial/tts/source/signal_pipeline.png
--- a/docs/tutorial/tts/source/text_frontend_struct.png
+++ b/docs/tutorial/tts/source/text_frontend_struct.png
--- a/docs/tutorial/tts/source/tts_lips.mp4
+++ b/docs/tutorial/tts/source/tts_lips.mp4
--- a/docs/tutorial/tts/source/tts_pipeline.png
+++ b/docs/tutorial/tts/source/tts_pipeline.png
--- a/docs/tutorial/tts/tts_tutorial.ipynb
+++ b/docs/tutorial/tts/tts_tutorial.ipynb
--- a/examples/aishell/s0/conf/deepspeech2_online.yaml
+++ b/examples/aishell/s0/conf/deepspeech2_online.yaml
@@ -46,10 +46,10 @@ model:
  ctc_grad_norm_type: null
  
 training:
-  n_epoch: 50
+  n_epoch: 65
  accum_grad: 1
-  lr: 2e-3
-  lr_decay: 0.9  # 0.83
+  lr: 5e-4
+  lr_decay: 0.93
  weight_decay: 1e-06
  global_grad_clip: 3.0
  log_interval: 100
@@ -63,7 +63,7 @@ decoding:
  decoding_method: ctc_beam_search
  lang_model_path: data/lm/zh_giga.no_cna_cmn.prune01244.klm
  alpha: 2.2 #1.9
-  beta: 5.0
+  beta: 4.3
  beam_size: 300
  cutoff_prob: 0.99
  cutoff_top_n: 40

--- a/examples/aishell/s0/local/download_lm_ch.sh
+++ b/examples/aishell/s0/local/download_lm_ch.sh
@@ -9,12 +9,13 @@ URL='https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm'
 MD5="29e02312deb2e59b3c8686c7966d4fe3"
 TARGET=${DIR}/zh_giga.no_cna_cmn.prune01244.klm

-
-echo "Download language model ..."
-download $URL $MD5 $TARGET
+echo "Start downloading the language model. The language model is large, please wait for a moment ..."
+download $URL $MD5 $TARGET > /dev/null 2>&1
 if [ $? -ne 0 ]; then
    echo "Fail to download the language model!"
    exit 1
+else
+    echo "Download the language model sucessfully"
 fi



--- a/examples/aishell/s0/local/test.sh
+++ b/examples/aishell/s0/local/test.sh
@@ -13,7 +13,7 @@ ckpt_prefix=$2
 model_type=$3

 # download language model
-bash local/download_lm_ch.sh > /dev/null 2>&1
+bash local/download_lm_ch.sh
 if [ $? -ne 0 ]; then
   exit 1
 fi

--- a/examples/aishell/s0/run.sh
+++ b/examples/aishell/s0/run.sh
@@ -5,9 +5,9 @@ source path.sh
 gpus=0,1,2,3
 stage=0
 stop_stage=100
-conf_path=conf/deepspeech2.yaml
+conf_path=conf/deepspeech2.yaml    #conf/deepspeech2.yaml or conf/deepspeech2_online.yaml
 avg_num=1
-model_type=offline
+model_type=offline    # offline or online

 source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;


--- a/examples/aishell/s1/run.sh
+++ b/examples/aishell/s1/run.sh
@@ -2,6 +2,7 @@
 source path.sh
 set -e

+gpus=0,1,2,3
 stage=0
 stop_stage=100
 conf_path=conf/conformer.yaml
@@ -22,7 +23,7 @@ fi

 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    # train model, all `ckpt` under `exp` dir
-    CUDA_VISIBLE_DEVICES=0,1,2,3 ./local/train.sh ${conf_path}  ${ckpt}
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path}  ${ckpt}
 fi

 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
@@ -40,18 +41,19 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
    CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
 fi

-if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
-    # export ckpt avg_n
-    CUDA_VISIBLE_DEVICES=0 ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
+# if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
+#     # export ckpt avg_n
+#     CUDA_VISIBLE_DEVICES=0 ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
+# fi
+
+# Optionally, you can add LM and test it with runtime.
+if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
+    # test a single .wav file
+    CUDA_VISIBLE_DEVICES=0 ./local/test_hub.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1
 fi

- # Optionally, you can add LM and test it with runtime.
- if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
+if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then
+    echo "warning: deps on kaldi and srilm, please make sure installed."
    # train lm and build TLG
    ./local/tlg.sh --corpus aishell --lmtype srilm
- fi
-
-if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then
-    # test a single .wav file
-    CUDA_VISIBLE_DEVICES=0 ./local/test_hub.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1
 fi
--- a/examples/aishell3/tts3/README.md
+++ b/examples/aishell3/tts3/README.md
@@ -17,7 +17,7 @@ tar zxvf data_aishell3.tgz -C data_aishell3
 ```
 ### Get MFA result of AISHELL-3 and Extract it
 We use [MFA2.x](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for aishell3_fastspeech2.
-You can download from here [aishell3_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/with_tone/aishell3_alignment_tone.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/other/use_mfa) (use MFA1.x now) of our repo.
+You can download from here [aishell3_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/with_tone/aishell3_alignment_tone.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) (use MFA1.x now) of our repo.

 ## Get Started
 Assume the path to the dataset is `~/datasets/data_aishell3`.
@@ -67,8 +67,8 @@ Here's the complete help message.
 ```text
 usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA]
                [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR]
-                [--device DEVICE] [--nprocs NPROCS] [--verbose VERBOSE]
-                [--phones-dict PHONES_DICT] [--speaker-dict SPEAKER_DICT]
+                [--ngpu NGPU] [--verbose VERBOSE] [--phones-dict PHONES_DICT]
+                [--speaker-dict SPEAKER_DICT]

 Train a FastSpeech2 model.

@@ -81,8 +81,7 @@ optional arguments:
                        dev data.
  --output-dir OUTPUT_DIR
                        output dir.
-  --device DEVICE       device type to use.
-  --nprocs NPROCS       number of processes.
+  --ngpu NGPU           if ngpu=0, use cpu.
  --verbose VERBOSE     verbose.
  --phones-dict PHONES_DICT
                        phone vocabulary file.
@@ -92,23 +91,22 @@ optional arguments:
 1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`.
 2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder.
 3. `--output-dir` is the directory to save the results of the experiment. Checkpoints are save in `checkpoints/` inside this directory.
-4. `--device` is the type of the device to run the experiment, 'cpu' or 'gpu' are supported.
-5. `--nprocs` is the number of processes to run in parallel, note that nprocs > 1 is only supported when `--device` is 'gpu'.
-6. `--phones-dict` is the path of the phone vocabulary file.
-7. `--speaker-dict`is the path of the  speaker id map file when training a multi-speaker FastSpeech2.
+4. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu.
+5. `--phones-dict` is the path of the phone vocabulary file.
+6. `--speaker-dict` is the path of the speaker id map file when training a multi-speaker FastSpeech2.

 ### Synthesize
-We use [parallel wavegan](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/csmsc/voc1) as the neural vocoder.
-Download pretrained parallel wavegan model from [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip) and unzip it.
+We use [parallel wavegan](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/voc1) as the neural vocoder.
+Download pretrained parallel wavegan model from [pwg_aishell3_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_aishell3_ckpt_0.5.zip) and unzip it.
 ```bash
-unzip pwg_baker_ckpt_0.4.zip
+unzip pwg_aishell3_ckpt_0.5.zip
 ```
 Parallel WaveGAN checkpoint contains files listed below.
 ```text
-pwg_baker_ckpt_0.4
-├── pwg_default.yaml               # default config used to train parallel wavegan
-├── pwg_snapshot_iter_400000.pdz   # model parameters of parallel wavegan
-└── pwg_stats.npy                  # statistics used to normalize spectrogram when training parallel wavegan
+pwg_aishell3_ckpt_0.5
+├── default.yaml                   # default config used to train parallel wavegan
+├── feats_stats.npy                # statistics used to normalize spectrogram when training parallel wavegan
+└── snapshot_iter_1000000.pdz      # generator parameters of parallel wavegan
 ```
 `./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
 ```bash
@@ -122,7 +120,7 @@ usage: synthesize.py [-h] [--fastspeech2-config FASTSPEECH2_CONFIG]
                     [--pwg-checkpoint PWG_CHECKPOINT] [--pwg-stat PWG_STAT]
                     [--phones-dict PHONES_DICT] [--speaker-dict SPEAKER_DICT]
                     [--test-metadata TEST_METADATA] [--output-dir OUTPUT_DIR]
-                     [--device DEVICE] [--verbose VERBOSE]
+                     [--ngpu NGPU] [--verbose VERBOSE]

 Synthesize with fastspeech2 & parallel wavegan.

@@ -149,8 +147,8 @@ optional arguments:
                        test metadata.
  --output-dir OUTPUT_DIR
                        output dir.
-  --device DEVICE       device type to use.
-  --verbose VERBOSE     verbose.
+  --ngpu NGPU           if ngpu == 0, use cpu.
+  --verbose VERBOSE     verbose
 ```
 `./local/synthesize_e2e.sh` calls `${BIN_DIR}/multi_spk_synthesize_e2e.py`, which can synthesize waveform from text file.
 ```bash
@@ -166,7 +164,7 @@ usage: multi_spk_synthesize_e2e.py [-h]
                                   [--pwg-stat PWG_STAT]
                                   [--phones-dict PHONES_DICT]
                                   [--speaker-dict SPEAKER_DICT] [--text TEXT]
-                                   [--output-dir OUTPUT_DIR] [--device DEVICE]
+                                   [--output-dir OUTPUT_DIR] [--ngpu NGPU]
                                   [--verbose VERBOSE]

 Synthesize with fastspeech2 & parallel wavegan.
@@ -193,7 +191,7 @@ optional arguments:
  --text TEXT           text to synthesize, a 'utt_id sentence' pair per line.
  --output-dir OUTPUT_DIR
                        output dir.
-  --device DEVICE       device type to use.
+  --ngpu NGPU           if ngpu == 0, use cpu.
  --verbose VERBOSE     verbose.
 ```
 1. `--fastspeech2-config`, `--fastspeech2-checkpoint`, `--fastspeech2-stat`, `--phones-dict` and `--speaker-dict` are arguments for fastspeech2, which correspond to the 5 files in the fastspeech2 pretrained model.
@@ -201,7 +199,7 @@ optional arguments:
 3. `--test-metadata` should be the metadata file in the normalized subfolder of `test`  in the `dump` folder.
 4. `--text` is the text file, which contains sentences to synthesize.
 5. `--output-dir` is the directory to save synthesized audio files.
-6. `--device` is the type of device to run synthesis, 'cpu' and 'gpu' are supported. 'gpu' is recommended for faster synthesis.
+6. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu.

 ## Pretrained Model
 Pretrained FastSpeech2 model with no silence in the edge of audios. [fastspeech2_nosil_aishell3_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_aishell3_ckpt_0.4.zip)
@@ -226,15 +224,12 @@ python3 ${BIN_DIR}/multi_spk_synthesize_e2e.py \
  --fastspeech2-config=fastspeech2_nosil_aishell3_ckpt_0.4/default.yaml \
  --fastspeech2-checkpoint=fastspeech2_nosil_aishell3_ckpt_0.4/snapshot_iter_96400.pdz \
  --fastspeech2-stat=fastspeech2_nosil_aishell3_ckpt_0.4/speech_stats.npy \
-  --pwg-config=pwg_baker_ckpt_0.4/pwg_default.yaml \
-  --pwg-checkpoint=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
-  --pwg-stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
+  --pwg-config=pwg_aishell3_ckpt_0.5/default.yaml \
+  --pwg-checkpoint=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \
+  --pwg-stat=pwg_aishell3_ckpt_0.5/feats_stats.npy  \
  --text=${BIN_DIR}/../sentences.txt \
  --output-dir=exp/default/test_e2e \
-  --device="gpu" \
  --phones-dict=fastspeech2_nosil_aishell3_ckpt_0.4/phone_id_map.txt \
  --speaker-dict=fastspeech2_nosil_aishell3_ckpt_0.4/speaker_id_map.txt

 ```
-## Future work
-A multi-speaker  vocoder is needed.
--- a/examples/aishell3/tts3/conf/default.yaml
+++ b/examples/aishell3/tts3/conf/default.yaml
@@ -24,7 +24,7 @@ f0max: 400         # Minimum f0 for pitch extraction.
 #                       DATA SETTING                      #
 ###########################################################
 batch_size: 64
-num_workers: 4
+num_workers: 2


 ###########################################################

--- a/examples/aishell3/tts3/local/synthesize.sh
+++ b/examples/aishell3/tts3/local/synthesize.sh
@@ -10,11 +10,10 @@ python3 ${BIN_DIR}/synthesize.py \
  --fastspeech2-config=${config_path} \
  --fastspeech2-checkpoint=${train_output_path}/checkpoints/${ckpt_name} \
  --fastspeech2-stat=dump/train/speech_stats.npy \
-  --pwg-config=pwg_baker_ckpt_0.4/pwg_default.yaml \
-  --pwg-checkpoint=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
-  --pwg-stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
+  --pwg-config=pwg_aishell3_ckpt_0.5/default.yaml \
+  --pwg-checkpoint=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \
+  --pwg-stat=pwg_aishell3_ckpt_0.5/feats_stats.npy  \
  --test-metadata=dump/test/norm/metadata.jsonl \
  --output-dir=${train_output_path}/test \
-  --device="gpu" \
  --phones-dict=dump/phone_id_map.txt \
  --speaker-dict=dump/speaker_id_map.txt
--- a/examples/aishell3/tts3/local/synthesize_e2e.sh
+++ b/examples/aishell3/tts3/local/synthesize_e2e.sh
@@ -10,11 +10,10 @@ python3 ${BIN_DIR}/multi_spk_synthesize_e2e.py \
  --fastspeech2-config=${config_path} \
  --fastspeech2-checkpoint=${train_output_path}/checkpoints/${ckpt_name} \
  --fastspeech2-stat=dump/train/speech_stats.npy \
-  --pwg-config=pwg_baker_ckpt_0.4/pwg_default.yaml \
-  --pwg-checkpoint=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
-  --pwg-stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
+  --pwg-config=pwg_aishell3_ckpt_0.5/default.yaml \
+  --pwg-checkpoint=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \
+  --pwg-stat=pwg_aishell3_ckpt_0.5/feats_stats.npy  \
  --text=${BIN_DIR}/../sentences.txt \
  --output-dir=${train_output_path}/test_e2e \
-  --device="gpu" \
  --phones-dict=dump/phone_id_map.txt \
  --speaker-dict=dump/speaker_id_map.txt
--- a/examples/aishell3/tts3/local/train.sh
+++ b/examples/aishell3/tts3/local/train.sh
@@ -8,6 +8,6 @@ python3 ${BIN_DIR}/train.py \
    --dev-metadata=dump/dev/norm/metadata.jsonl \
    --config=${config_path} \
    --output-dir=${train_output_path} \
-    --nprocs=2 \
+    --ngpu=2 \
    --phones-dict=dump/phone_id_map.txt \
    --speaker-dict=dump/speaker_id_map.txt
--- a/examples/aishell3/tts3/run.sh
+++ b/examples/aishell3/tts3/run.sh
@@ -7,7 +7,6 @@ gpus=0,1
 stage=0
 stop_stage=100

-
 conf_path=conf/default.yaml
 train_output_path=exp/default
 ckpt_name=snapshot_iter_482.pdz

--- a/examples/aishell3/vc0/README.md
+++ b/examples/aishell3/vc0/README.md
 # Tacotron2 + AISHELL-3 Voice Cloning
 This example contains code used to train a [Tacotron2 ](https://arxiv.org/abs/1712.05884) model with [AISHELL-3](http://www.aishelltech.com/aishell_3). The trained model can be used in Voice Cloning Task, We refer to the model structure of  [Transfer Learning from Speaker Veriﬁcation to Multispeaker Text-To-Speech Synthesis](https://arxiv.org/pdf/1806.04558.pdf) . The general steps are as follows:
-1. Speaker Encoder: We  use a Speaker Verification to train a speaker encoder. Datasets used in this task are different from those used in Tacotron2, because the  transcriptions are not needed, we use more datasets, refer to  [ge2e](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/other/ge2e).
+1. Speaker Encoder: We  use a Speaker Verification to train a speaker encoder. Datasets used in this task are different from those used in Tacotron2, because the  transcriptions are not needed, we use more datasets, refer to  [ge2e](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/ge2e).
 2. Synthesizer: Then, we use the trained speaker encoder to generate utterance embedding for each  sentence in AISHELL-3. This embedding is a extra input of  Tacotron2 which will be concated with encoder outputs.
-3. Vocoder: We use WaveFlow as the neural Vocoder, refer to [waveflow](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/ljspeech/voc0).
+3. Vocoder: We use WaveFlow as the neural Vocoder, refer to [waveflow](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc0).

 ## Get Started
 Assume the path to the dataset is `~/datasets/data_aishell3`.
@@ -28,7 +28,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    python3 ${BIN_DIR}/../ge2e/inference.py \
        --input=${input} \
        --output=${preprocess_path}/embed \
-        --device="gpu" \
+        --ngpu=1 \
        --checkpoint_path=${ge2e_ckpt_path}
 fi
 ```
@@ -39,9 +39,9 @@ There are silence in the edge of AISHELL-3's wavs, and the audio amplitude is ve

 We use Montreal Force Aligner 1.0. The label in  aishell3 include pinyin，so the lexicon we provided to MFA is pinyin rather than Chinese characters. And the prosody marks(`$`  and `%`) need to be removed. You shoud preprocess the dataset into the format  which MFA needs, the texts have the same name with wavs and have the suffix `.lab`.

-We use [lexicon.txt](https://github.com/PaddlePaddle/DeepSpeech/blob/develop/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/lexicon.txt) as the lexicon.
+We use [lexicon.txt](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/lexicon.txt) as the lexicon.

-You can download the alignment results from here [alignment_aishell3.tar.gz](https://paddlespeech.bj.bcebos.com/Parakeet/alignment_aishell3.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/other/use_mfa) (use MFA1.x now) of our repo.
+You can download the alignment results from here [alignment_aishell3.tar.gz](https://paddlespeech.bj.bcebos.com/Parakeet/alignment_aishell3.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) (use MFA1.x now) of our repo.

 ```bash
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then

--- a/examples/aishell3/vc0/local/preprocess.sh
+++ b/examples/aishell3/vc0/local/preprocess.sh
@@ -9,10 +9,9 @@ alignment=$3
 ge2e_ckpt_path=$4

 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
-    python3 ${BIN_DIR}/../../ge2e/inference.py \
-        --input=${input} \
+    python3 ${MAIN_ROOT}/paddlespeech/vector/exps/ge2e/inference.py \
+        --input=${input}/wav \
        --output=${preprocess_path}/embed \
-        --device="gpu" \
        --checkpoint_path=${ge2e_ckpt_path}
 fi


--- a/examples/aishell3/vc0/local/train.sh
+++ b/examples/aishell3/vc0/local/train.sh
@@ -6,4 +6,4 @@ train_output_path=$2
 python3 ${BIN_DIR}/train.py \
    --data=${preprocess_path} \
    --output=${train_output_path} \
-    --device="gpu"
\ No newline at end of file
+    --ngpu=1
\ No newline at end of file
--- a/examples/aishell3/vc1/README.md
+++ b/examples/aishell3/vc1/README.md
+# FastSpeech2 + AISHELL-3 Voice Cloning
+This example contains code used to train a [FastSpeech2](https://arxiv.org/abs/2006.04558) model with [AISHELL-3](http://www.aishelltech.com/aishell_3). The trained model can be used for the Voice Cloning task. We refer to the model structure of [Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis](https://arxiv.org/pdf/1806.04558.pdf). The general steps are as follows:
+1. Speaker Encoder: We use the Speaker Verification task to train a speaker encoder. The datasets used for this task are different from those used for FastSpeech2; because transcriptions are not needed, we can use more datasets. Refer to [ge2e](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/ge2e).
+2. Synthesizer: We use the trained speaker encoder to generate an utterance embedding for each sentence in AISHELL-3. This embedding is an extra input of FastSpeech2, which is concatenated with the encoder outputs.
+3. Vocoder: We use Parallel WaveGAN as the neural vocoder, refer to [voc1](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/voc1).
+
+## Get Started
+Assume the path to the dataset is `~/datasets/data_aishell3`.
+Assume the path to the MFA result of AISHELL-3 is `./aishell3_alignment_tone`.
+Assume the path to the pretrained ge2e model is `ge2e_ckpt_path=./ge2e_ckpt_0.3/step-3000000`
+Run the command below to
+1. **source path**.
+2. preprocess the dataset,
+3. train the model.
+4. start a voice cloning inference.
+```bash
+./run.sh
+```
+### Preprocess the dataset
+```bash
+CUDA_VISIBLE_DEVICES=${gpus} ./local/preprocess.sh ${input} ${preprocess_path} ${alignment} ${ge2e_ckpt_path}
+```
+#### generate utterance embedding
+Use the pretrained GE2E (speaker encoder) to generate an utterance embedding for each sentence in AISHELL-3. The embeddings have the same file structure as the wav files and are stored in `.npy` format.
+
+```bash
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    python3 ${BIN_DIR}/../ge2e/inference.py \
+        --input=${input} \
+        --output=${preprocess_path}/embed \
+        --ngpu=1 \
+        --checkpoint_path=${ge2e_ckpt_path}
+fi
+```
+
+The computing time of  utterance embedding can be x hours.
+####  process wav
+The AISHELL-3 wavs have silence at both ends and a very small amplitude, so we need to remove the silence and normalize the audio. You can use a silence removal method based on volume or energy, but the effect is not very good. Instead, we use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get the alignment of text and speech, then utilize the alignment results to remove the silence.
+
+We use Montreal Forced Aligner 1.0. The labels in AISHELL-3 include pinyin, so the lexicon we provide to MFA is pinyin rather than Chinese characters, and the prosody marks (`$` and `%`) need to be removed. You should preprocess the dataset into the format that MFA needs: the text files have the same names as the wavs, with the suffix `.lab`.
+
+We use [lexicon.txt](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/lexicon.txt) as the lexicon.
+
+You can download the alignment results from here [alignment_aishell3.tar.gz](https://paddlespeech.bj.bcebos.com/Parakeet/alignment_aishell3.tar.gz), or train your own MFA model by referring to the [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) (use MFA1.x now) in our repo.
+
+```bash
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    echo "Process wav ..."
+    python3 ${BIN_DIR}/process_wav.py \
+        --input=${input}/wav \
+        --output=${preprocess_path}/normalized_wav \
+        --alignment=${alignment}
+fi
+```
+
+#### preprocess transcription
+We convert the transcription into `phones` and `tones`. Note that our processing here is different from that used for MFA: we separate the tones. This is only one possible processing method; you can also segment only initials and vowels.
+
+```bash
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    python3 ${BIN_DIR}/preprocess_transcription.py \
+        --input=${input} \
+        --output=${preprocess_path}
+fi
+```
+The default input is `~/datasets/data_aishell3/train`, which contains `label_train-set.txt`. The processed results are `metadata.yaml` and `metadata.pickle`: the former is a text format for easy viewing, and the latter is a binary format for direct reading.
+#### extract mel
+```bash
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+    python3 ${BIN_DIR}/extract_mel.py \
+        --input=${preprocess_path}/normalized_wav \
+        --output=${preprocess_path}/mel
+fi
+```
+
+###  Train the model
+```bash
+CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${preprocess_path} ${train_output_path}
+```
+
+Our model removes the stop token prediction in Tacotron2, because the proportion of positive and negative samples for stop token prediction is extremely unbalanced, and it is very sensitive to the clipping of audio silence. We use the last symbol from the highest point of attention to the encoder side as the termination condition.
+
+In addition, in order to accelerate the convergence of the model, we add `guided attention loss` to induce the alignment between encoder and decoder to show diagonal lines faster.
+### Inference
+```bash
+CUDA_VISIBLE_DEVICES=${gpus} ./local/voice_cloning.sh ${ge2e_params_path} ${tacotron2_params_path} ${waveflow_params_path} ${vc_input} ${vc_output}
+```
+## Pretrained Model
+[tacotron2_aishell3_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_aishell3_ckpt_0.3.zip).
--- a/examples/aishell3/vc1/conf/default.yaml
+++ b/examples/aishell3/vc1/conf/default.yaml
+###########################################################
+#                FEATURE EXTRACTION SETTING               #
+###########################################################
+
+fs: 24000          # sr
+n_fft: 2048        # FFT size.
+n_shift: 300       # Hop size.
+win_length: 1200   # Window length.
+                   # If set to null, it will be the same as fft_size.
+window: "hann"     # Window function.
+
+# Only used for feats_type != raw
+
+fmin: 80           # Minimum frequency of Mel basis.
+fmax: 7600         # Maximum frequency of Mel basis.
+n_mels: 80         # The number of mel basis.
+
+# Only used for the model using pitch features (e.g. FastSpeech2)
+f0min: 80          # Minimum f0 for pitch extraction.
+f0max: 400         # Maximum f0 for pitch extraction.
+
+
+###########################################################
+#                       DATA SETTING                      #
+###########################################################
+batch_size: 64
+num_workers: 2
+
+
+###########################################################
+#                       MODEL SETTING                     #
+###########################################################
+model:
+    adim: 384         # attention dimension
+    aheads: 2         # number of attention heads
+    elayers: 4        # number of encoder layers
+    eunits: 1536      # number of encoder ff units
+    dlayers: 4        # number of decoder layers
+    dunits: 1536      # number of decoder ff units
+    positionwise_layer_type: conv1d   # type of position-wise layer
+    positionwise_conv_kernel_size: 3  # kernel size of position wise conv layer
+    duration_predictor_layers: 2      # number of layers of duration predictor
+    duration_predictor_chans: 256     # number of channels of duration predictor
+    duration_predictor_kernel_size: 3 # filter size of duration predictor
+    postnet_layers: 5                 # number of layers of postnet
+    postnet_filts: 5                  # filter size of conv layers in postnet
+    postnet_chans: 256                # number of channels of conv layers in postnet
+    use_masking: True                 # whether to apply masking for padded part in loss calculation
+    use_scaled_pos_enc: True          # whether to use scaled positional encoding
+    encoder_normalize_before: True    # whether to perform layer normalization before the input
+    decoder_normalize_before: True    # whether to perform layer normalization before the input
+    reduction_factor: 1               # reduction factor
+    init_type: xavier_uniform         # initialization type
+    init_enc_alpha: 1.0               # initial value of alpha of encoder scaled position encoding
+    init_dec_alpha: 1.0               # initial value of alpha of decoder scaled position encoding
+    transformer_enc_dropout_rate: 0.2            # dropout rate for transformer encoder layer
+    transformer_enc_positional_dropout_rate: 0.2 # dropout rate for transformer encoder positional encoding
+    transformer_enc_attn_dropout_rate: 0.2       # dropout rate for transformer encoder attention layer
+    transformer_dec_dropout_rate: 0.2            # dropout rate for transformer decoder layer
+    transformer_dec_positional_dropout_rate: 0.2 # dropout rate for transformer decoder positional encoding
+    transformer_dec_attn_dropout_rate: 0.2       # dropout rate for transformer decoder attention layer
+    pitch_predictor_layers: 5                  # number of conv layers in pitch predictor
+    pitch_predictor_chans: 256                 # number of channels of conv layers in pitch predictor
+    pitch_predictor_kernel_size: 5             # kernel size of conv layers in pitch predictor
+    pitch_predictor_dropout: 0.5               # dropout rate in pitch predictor
+    pitch_embed_kernel_size: 1                 # kernel size of conv embedding layer for pitch
+    pitch_embed_dropout: 0.0                   # dropout rate after conv embedding layer for pitch
+    stop_gradient_from_pitch_predictor: true   # whether to stop the gradient from pitch predictor to encoder
+    energy_predictor_layers: 2                 # number of conv layers in energy predictor
+    energy_predictor_chans: 256                # number of channels of conv layers in energy predictor
+    energy_predictor_kernel_size: 3            # kernel size of conv layers in energy predictor
+    energy_predictor_dropout: 0.5              # dropout rate in energy predictor
+    energy_embed_kernel_size: 1                # kernel size of conv embedding layer for energy
+    energy_embed_dropout: 0.0                  # dropout rate after conv embedding layer for energy
+    stop_gradient_from_energy_predictor: false # whether to stop the gradient from energy predictor to encoder
+    spk_embed_dim: 256                         # speaker embedding dimension
+    spk_embed_integration_type: concat         # speaker embedding integration type
+
+
+
+###########################################################
+#                       UPDATER SETTING                   #
+###########################################################
+updater:
+    use_masking: True                 # whether to apply masking for padded part in loss calculation
+
+
+###########################################################
+#                     OPTIMIZER SETTING                   #
+###########################################################
+optimizer:
+  optim: adam               # optimizer type
+  learning_rate: 0.001     # learning rate
+
+###########################################################
+#                     TRAINING SETTING                    #
+###########################################################
+max_epoch: 200
+num_snapshots: 5
+
+
+###########################################################
+#                       OTHER SETTING                     #
+###########################################################
+seed: 10086
--- a/examples/aishell3/vc1/local/preprocess.sh
+++ b/examples/aishell3/vc1/local/preprocess.sh
+#!/bin/bash
+
+stage=0
+stop_stage=100
+
+config_path=$1
+ge2e_ckpt_path=$2
+
+# gen speaker embedding
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    python3 ${MAIN_ROOT}/paddlespeech/vector/exps/ge2e/inference.py \
+        --input=~/datasets/data_aishell3/train/wav/ \
+        --output=dump/embed \
+        --checkpoint_path=${ge2e_ckpt_path}
+fi
+
+# copy from tts3/preprocess
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    # get durations from MFA's result
+    echo "Generate durations.txt from MFA results ..."
+    python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \
+        --inputdir=./aishell3_alignment_tone \
+        --output durations.txt \
+        --config=${config_path}
+fi
+
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    # extract features
+    echo "Extract features ..."
+    python3 ${BIN_DIR}/preprocess.py \
+        --dataset=aishell3 \
+        --rootdir=~/datasets/data_aishell3/ \
+        --dumpdir=dump \
+        --dur-file=durations.txt \
+        --config=${config_path} \
+        --num-cpu=20 \
+        --cut-sil=True \
+        --spk_emb_dir=dump/embed
+fi
+
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+    # get features' stats(mean and std)
+    echo "Get features' stats ..."
+    python3 ${MAIN_ROOT}/utils/compute_statistics.py \
+        --metadata=dump/train/raw/metadata.jsonl \
+        --field-name="speech"
+
+    python3 ${MAIN_ROOT}/utils/compute_statistics.py \
+        --metadata=dump/train/raw/metadata.jsonl \
+        --field-name="pitch"
+
+    python3 ${MAIN_ROOT}/utils/compute_statistics.py \
+        --metadata=dump/train/raw/metadata.jsonl \
+        --field-name="energy"
+fi
+
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+    # normalize and convert phone/speaker to id, dev and test should use train's stats
+    echo "Normalize ..."
+    python3 ${BIN_DIR}/normalize.py \
+        --metadata=dump/train/raw/metadata.jsonl \
+        --dumpdir=dump/train/norm \
+        --speech-stats=dump/train/speech_stats.npy \
+        --pitch-stats=dump/train/pitch_stats.npy \
+        --energy-stats=dump/train/energy_stats.npy \
+        --phones-dict=dump/phone_id_map.txt \
+        --speaker-dict=dump/speaker_id_map.txt
+
+    python3 ${BIN_DIR}/normalize.py \
+        --metadata=dump/dev/raw/metadata.jsonl \
+        --dumpdir=dump/dev/norm \
+        --speech-stats=dump/train/speech_stats.npy \
+        --pitch-stats=dump/train/pitch_stats.npy \
+        --energy-stats=dump/train/energy_stats.npy \
+        --phones-dict=dump/phone_id_map.txt \
+        --speaker-dict=dump/speaker_id_map.txt
+
+    python3 ${BIN_DIR}/normalize.py \
+        --metadata=dump/test/raw/metadata.jsonl \
+        --dumpdir=dump/test/norm \
+        --speech-stats=dump/train/speech_stats.npy \
+        --pitch-stats=dump/train/pitch_stats.npy \
+        --energy-stats=dump/train/energy_stats.npy \
+        --phones-dict=dump/phone_id_map.txt \
+        --speaker-dict=dump/speaker_id_map.txt
+fi
--- a/examples/aishell3/vc1/local/synthesize.sh
+++ b/examples/aishell3/vc1/local/synthesize.sh
+#!/bin/bash
+
+config_path=$1
+train_output_path=$2
+ckpt_name=$3
+
+FLAGS_allocator_strategy=naive_best_fit \
+FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+python3 ${BIN_DIR}/synthesize.py \
+  --fastspeech2-config=${config_path} \
+  --fastspeech2-checkpoint=${train_output_path}/checkpoints/${ckpt_name} \
+  --fastspeech2-stat=dump/train/speech_stats.npy \
+  --pwg-config=pwg_aishell3_ckpt_0.5/default.yaml \
+  --pwg-checkpoint=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \
+  --pwg-stat=pwg_aishell3_ckpt_0.5/feats_stats.npy  \
+  --test-metadata=dump/test/norm/metadata.jsonl \
+  --output-dir=${train_output_path}/test \
+  --phones-dict=dump/phone_id_map.txt \
+  --voice-cloning=True
--- a/examples/aishell3/vc1/local/train.sh
+++ b/examples/aishell3/vc1/local/train.sh
+#!/bin/bash
+
+config_path=$1
+train_output_path=$2
+
+python3 ${BIN_DIR}/train.py \
+    --train-metadata=dump/train/norm/metadata.jsonl \
+    --dev-metadata=dump/dev/norm/metadata.jsonl \
+    --config=${config_path} \
+    --output-dir=${train_output_path} \
+    --ngpu=2 \
+    --phones-dict=dump/phone_id_map.txt \
+    --voice-cloning=True
\ No newline at end of file
--- a/examples/aishell3/vc1/local/voice_cloning.sh
+++ b/examples/aishell3/vc1/local/voice_cloning.sh
+#!/bin/bash
+
+config_path=$1
+train_output_path=$2
+ckpt_name=$3
+ge2e_params_path=$4
+ref_audio_dir=$5
+
+FLAGS_allocator_strategy=naive_best_fit \
+FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+python3 ${BIN_DIR}/voice_cloning.py \
+  --fastspeech2-config=${config_path} \
+  --fastspeech2-checkpoint=${train_output_path}/checkpoints/${ckpt_name} \
+  --fastspeech2-stat=dump/train/speech_stats.npy \
+  --pwg-config=pwg_aishell3_ckpt_0.5/default.yaml \
+  --pwg-checkpoint=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \
+  --pwg-stat=pwg_aishell3_ckpt_0.5/feats_stats.npy \
+  --ge2e_params_path=${ge2e_params_path} \
+  --text="凯莫瑞安联合体的经济崩溃迫在眉睫。" \
+  --input-dir=${ref_audio_dir} \
+  --output-dir=${train_output_path}/vc_syn \
+  --phones-dict=dump/phone_id_map.txt
--- a/examples/aishell3/vc1/path.sh
+++ b/examples/aishell3/vc1/path.sh
+#!/bin/bash
+export MAIN_ROOT=`realpath ${PWD}/../../../`
+
+export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
+export LC_ALL=C
+
+export PYTHONDONTWRITEBYTECODE=1
+# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
+export PYTHONIOENCODING=UTF-8
+export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
+
+MODEL=fastspeech2
+export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL}
--- a/examples/aishell3/vc1/run.sh
+++ b/examples/aishell3/vc1/run.sh
+#!/bin/bash
+
+set -e
+source path.sh
+
+gpus=0,1
+stage=0
+stop_stage=100
+
+conf_path=conf/default.yaml
+train_output_path=exp/default
+ckpt_name=snapshot_iter_482.pdz
+ref_audio_dir=ref_audio
+
+# not include ".pdparams" here
+ge2e_ckpt_path=./ge2e_ckpt_0.3/step-3000000
+
+# include ".pdparams" here
+ge2e_params_path=${ge2e_ckpt_path}.pdparams
+
+# with the following command, you can choose the stage range you want to run
+# such as `./run.sh --stage 0 --stop-stage 0`
+# this cannot be mixed with positional arguments `$1`, `$2` ...
+source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    # prepare data
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/preprocess.sh ${conf_path} ${ge2e_ckpt_path} || exit -1
+fi
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    # train model, all `ckpt` under `train_output_path/checkpoints/` dir
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1
+fi
+
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    # synthesize, vocoder is pwgan
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
+fi
+
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+    # voice cloning, vocoder is pwgan
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/voice_cloning.sh ${conf_path} ${train_output_path} ${ckpt_name} ${ge2e_params_path} ${ref_audio_dir} || exit -1
+fi
--- a/examples/aishell3/voc1/README.md
+++ b/examples/aishell3/voc1/README.md
+# Parallel WaveGAN with AISHELL-3
+This example contains code used to train a [parallel wavegan](http://arxiv.org/abs/1910.11480) model with [AISHELL-3](http://www.aishelltech.com/aishell_3).
+
+AISHELL-3 is a large-scale and high-fidelity multi-speaker Mandarin speech corpus which could be used to train multi-speaker Text-to-Speech (TTS) systems.
+## Dataset
+### Download and Extract the dataset
+Download AISHELL-3.
+```bash
+wget https://www.openslr.org/resources/93/data_aishell3.tgz
+```
+Extract AISHELL-3.
+```bash
+mkdir data_aishell3
+tar zxvf data_aishell3.tgz -C data_aishell3
+```
+### Get MFA result of AISHELL-3 and Extract it
+We use [MFA2.x](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for aishell3_fastspeech2.
+You can download it from here [aishell3_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/with_tone/aishell3_alignment_tone.tar.gz), or train your own MFA model by referring to the [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) (use MFA1.x now) in our repo.
+
+## Get Started
+Assume the path to the dataset is `~/datasets/data_aishell3`.
+Assume the path to the MFA result of AISHELL-3 is `./aishell3_alignment_tone`.
+Run the command below to
+1. **source path**.
+2. preprocess the dataset,
+3. train the model.
+4. synthesize wavs.
+    - synthesize waveform from `metadata.jsonl`.
+```bash
+./run.sh
+```
+### Preprocess the dataset
+```bash
+./local/preprocess.sh ${conf_path}
+```
+When it is done. A `dump` folder is created in the current directory. The structure of the dump folder is listed below.
+
+```text
+dump
+├── dev
+│   ├── norm
+│   └── raw
+├── test
+│   ├── norm
+│   └── raw
+└── train
+    ├── norm
+    ├── raw
+    └── feats_stats.npy
+```
+
+The dataset is split into 3 parts, namely `train`, `dev` and `test`, each of which contains a `norm` and `raw` subfolder. The `raw` folder contains the log magnitude of the mel spectrogram of each utterance, while the `norm` folder contains the normalized spectrogram. The statistics used to normalize the spectrogram are computed from the training set and are located in `dump/train/feats_stats.npy`.
+
+Also there is a `metadata.jsonl` in each subfolder. It is a table-like file which contains the id and path to the spectrogram of each utterance.
+
+### Train the model
+```bash
+CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path}
+```
+`./local/train.sh` calls `${BIN_DIR}/train.py`.
+Here's the complete help message.
+
+```text
+usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA]
+                [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR]
+                [--ngpu NGPU] [--verbose VERBOSE] [--batch-size BATCH_SIZE]
+                [--max-iter MAX_ITER] [--run-benchmark RUN_BENCHMARK]
+                [--profiler_options PROFILER_OPTIONS]
+
+Train a ParallelWaveGAN model.
+
+optional arguments:
+  -h, --help            show this help message and exit
+  --config CONFIG       config file to overwrite default config.
+  --train-metadata TRAIN_METADATA
+                        training data.
+  --dev-metadata DEV_METADATA
+                        dev data.
+  --output-dir OUTPUT_DIR
+                        output dir.
+  --ngpu NGPU           if ngpu == 0, use cpu.
+  --verbose VERBOSE     verbose.
+
+benchmark:
+  arguments related to benchmark.
+
+  --batch-size BATCH_SIZE
+                        batch size.
+  --max-iter MAX_ITER   train max steps.
+  --run-benchmark RUN_BENCHMARK
+                        runing benchmark or not, if True, use the --batch-size
+                        and --max-iter.
+  --profiler_options PROFILER_OPTIONS
+                        The option of profiler, which should be in format
+                        "key1=value1;key2=value2;key3=value3".
+```
+
+1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`.
+2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder.
+3. `--output-dir` is the directory to save the results of the experiment. Checkpoints are saved in `checkpoints/` inside this directory.
+4. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu.
+
+### Synthesize
+`./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
+```bash
+CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name}
+```
+```text
+usage: synthesize.py [-h] [--config CONFIG] [--checkpoint CHECKPOINT]
+                     [--test-metadata TEST_METADATA] [--output-dir OUTPUT_DIR]
+                     [--ngpu NGPU] [--verbose VERBOSE]
+
+Synthesize with parallel wavegan.
+
+optional arguments:
+  -h, --help            show this help message and exit
+  --config CONFIG       parallel wavegan config file.
+  --checkpoint CHECKPOINT
+                        snapshot to load.
+  --test-metadata TEST_METADATA
+                        dev data.
+  --output-dir OUTPUT_DIR
+                        output dir.
+  --ngpu NGPU           if ngpu == 0, use cpu.
+  --verbose VERBOSE     verbose.
+```
+
+1. `--config` parallel wavegan config file. You should use the same config with which the model is trained.
+2. `--checkpoint` is the checkpoint to load. Pick one of the checkpoints from `checkpoints` inside the training output directory. If you use the pretrained model, use the `snapshot_iter_1000000.pdz`.
+3. `--test-metadata` is the metadata of the test dataset. Use the `metadata.jsonl` in the `test/norm` subfolder from the processed directory.
+4. `--output-dir` is the directory to save the synthesized audio files.
+5. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu.
+
+## Pretrained Models
+Pretrained models can be downloaded here [pwg_aishell3_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_aishell3_ckpt_0.5.zip).
+
+Parallel WaveGAN checkpoint contains files listed below.
+
+```text
+pwg_aishell3_ckpt_0.5
+├── default.yaml                   # default config used to train parallel wavegan
+├── feats_stats.npy                # statistics used to normalize spectrogram when training parallel wavegan
+└── snapshot_iter_1000000.pdz      # generator parameters of parallel wavegan
+```
+## Acknowledgement
+We adapted some code from https://github.com/kan-bayashi/ParallelWaveGAN.
--- a/examples/aishell3/voc1/conf/default.yaml
+++ b/examples/aishell3/voc1/conf/default.yaml
--- a/examples/aishell3/voc1/local/preprocess.sh
+++ b/examples/aishell3/voc1/local/preprocess.sh
--- a/examples/aishell3/voc1/local/synthesize.sh
+++ b/examples/aishell3/voc1/local/synthesize.sh
+#!/bin/bash
+
+config_path=$1
+train_output_path=$2
+ckpt_name=$3
+
+FLAGS_allocator_strategy=naive_best_fit \
+FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+python3 ${BIN_DIR}/synthesize.py \
+  --config=${config_path} \
+  --checkpoint=${train_output_path}/checkpoints/${ckpt_name} \
+  --test-metadata=dump/test/norm/metadata.jsonl \
+  --output-dir=${train_output_path}/test
--- a/examples/aishell3/voc1/local/train.sh
+++ b/examples/aishell3/voc1/local/train.sh
+#!/bin/bash
+
+config_path=$1
+train_output_path=$2
+
+FLAGS_cudnn_exhaustive_search=true \
+FLAGS_conv_workspace_size_limit=4000 \
+python ${BIN_DIR}/train.py \
+    --train-metadata=dump/train/norm/metadata.jsonl \
+    --dev-metadata=dump/dev/norm/metadata.jsonl \
+    --config=${config_path} \
+    --output-dir=${train_output_path} \
+    --ngpu=1
--- a/examples/aishell3/voc1/path.sh
+++ b/examples/aishell3/voc1/path.sh
--- a/examples/aishell3/voc1/run.sh
+++ b/examples/aishell3/voc1/run.sh
--- a/examples/callcenter/asr1/run.sh
+++ b/examples/callcenter/asr1/run.sh
--- a/examples/csmsc/tts2/README.md
+++ b/examples/csmsc/tts2/README.md
--- a/examples/csmsc/tts2/local/synthesize.sh
+++ b/examples/csmsc/tts2/local/synthesize.sh
--- a/examples/csmsc/tts2/local/synthesize_e2e.sh
+++ b/examples/csmsc/tts2/local/synthesize_e2e.sh
--- a/examples/csmsc/tts2/local/train.sh
+++ b/examples/csmsc/tts2/local/train.sh
--- a/examples/csmsc/tts3/README.md
+++ b/examples/csmsc/tts3/README.md
--- a/examples/csmsc/tts3/local/synthesize.sh
+++ b/examples/csmsc/tts3/local/synthesize.sh
--- a/examples/csmsc/tts3/local/synthesize_e2e.sh
+++ b/examples/csmsc/tts3/local/synthesize_e2e.sh
--- a/examples/csmsc/tts3/local/train.sh
+++ b/examples/csmsc/tts3/local/train.sh
--- a/examples/csmsc/voc1/README.md
+++ b/examples/csmsc/voc1/README.md
--- a/examples/csmsc/voc1/conf/default.yaml
+++ b/examples/csmsc/voc1/conf/default.yaml
--- a/examples/csmsc/voc1/local/train.sh
+++ b/examples/csmsc/voc1/local/train.sh
--- a/examples/csmsc/voc3/README.md
+++ b/examples/csmsc/voc3/README.md
--- a/examples/csmsc/voc3/conf/finetune.yaml
+++ b/examples/csmsc/voc3/conf/finetune.yaml
--- a/examples/csmsc/voc3/finetune.sh
+++ b/examples/csmsc/voc3/finetune.sh
--- a/examples/csmsc/voc3/local/link_wav.py
+++ b/examples/csmsc/voc3/local/link_wav.py
--- a/examples/csmsc/voc3/local/train.sh
+++ b/examples/csmsc/voc3/local/train.sh
--- a/examples/librispeech/s0/local/download_lm_en.sh
+++ b/examples/librispeech/s0/local/download_lm_en.sh
--- a/examples/librispeech/s0/local/test.sh
+++ b/examples/librispeech/s0/local/test.sh
--- a/examples/librispeech/s0/run.sh
+++ b/examples/librispeech/s0/run.sh
--- a/examples/librispeech/s1/run.sh
+++ b/examples/librispeech/s1/run.sh
--- a/examples/librispeech/s2/conf/transformer.yaml
+++ b/examples/librispeech/s2/conf/transformer.yaml
--- a/examples/librispeech/s2/path.sh
+++ b/examples/librispeech/s2/path.sh
--- a/examples/librispeech/s2/run.sh
+++ b/examples/librispeech/s2/run.sh
--- a/examples/ljspeech/tts0/README.md
+++ b/examples/ljspeech/tts0/README.md
--- a/examples/ljspeech/tts0/local/synthesize.sh
+++ b/examples/ljspeech/tts0/local/synthesize.sh
--- a/examples/ljspeech/tts0/local/train.sh
+++ b/examples/ljspeech/tts0/local/train.sh
--- a/examples/ljspeech/tts1/README.md
+++ b/examples/ljspeech/tts1/README.md
--- a/examples/ljspeech/tts1/local/synthesize.sh
+++ b/examples/ljspeech/tts1/local/synthesize.sh
--- a/examples/ljspeech/tts1/local/synthesize_e2e.sh
+++ b/examples/ljspeech/tts1/local/synthesize_e2e.sh
--- a/examples/ljspeech/tts1/local/train.sh
+++ b/examples/ljspeech/tts1/local/train.sh
--- a/examples/ljspeech/tts3/README.md
+++ b/examples/ljspeech/tts3/README.md
--- a/examples/ljspeech/tts3/local/synthesize.sh
+++ b/examples/ljspeech/tts3/local/synthesize.sh
--- a/examples/ljspeech/tts3/local/synthesize_e2e.sh
+++ b/examples/ljspeech/tts3/local/synthesize_e2e.sh
--- a/examples/ljspeech/tts3/local/train.sh
+++ b/examples/ljspeech/tts3/local/train.sh
--- a/examples/ljspeech/voc0/README.md
+++ b/examples/ljspeech/voc0/README.md
--- a/examples/ljspeech/voc0/local/synthesize.sh
+++ b/examples/ljspeech/voc0/local/synthesize.sh
--- a/examples/ljspeech/voc0/local/train.sh
+++ b/examples/ljspeech/voc0/local/train.sh
--- a/examples/ljspeech/voc1/README.md
+++ b/examples/ljspeech/voc1/README.md
--- a/examples/ljspeech/voc1/local/train.sh
+++ b/examples/ljspeech/voc1/local/train.sh
--- a/examples/other/1xt2x/aishell/local/download_lm_ch.sh
+++ b/examples/other/1xt2x/aishell/local/download_lm_ch.sh
--- a/examples/other/1xt2x/aishell/local/test.sh
+++ b/examples/other/1xt2x/aishell/local/test.sh
--- a/examples/other/1xt2x/baidu_en8k/local/download_lm_en.sh
+++ b/examples/other/1xt2x/baidu_en8k/local/download_lm_en.sh
--- a/examples/other/1xt2x/baidu_en8k/local/test.sh
+++ b/examples/other/1xt2x/baidu_en8k/local/test.sh
--- a/examples/other/1xt2x/librispeech/local/download_lm_en.sh
+++ b/examples/other/1xt2x/librispeech/local/download_lm_en.sh
--- a/examples/other/1xt2x/librispeech/local/test.sh
+++ b/examples/other/1xt2x/librispeech/local/test.sh
--- a/examples/other/text_frontend/README.md
+++ b/examples/other/text_frontend/README.md
--- a/examples/other/text_frontend/get_g2p_data.py
+++ b/examples/other/text_frontend/get_g2p_data.py
--- a/examples/other/g2p/run.sh
+++ b/examples/other/g2p/run.sh
--- a/examples/other/text_frontend/test_g2p.py
+++ b/examples/other/text_frontend/test_g2p.py
--- a/examples/other/ge2e/README.md
+++ b/examples/other/ge2e/README.md
--- a/examples/other/ge2e/local/inference.sh
+++ b/examples/other/ge2e/local/inference.sh
--- a/examples/other/ge2e/local/train.sh
+++ b/examples/other/ge2e/local/train.sh
--- a/examples/other/ge2e/path.sh
+++ b/examples/other/ge2e/path.sh
--- a/examples/other/ngram_lm/READEME.md
+++ b/examples/other/ngram_lm/READEME.md
--- a/examples/other/text_frontend/make_sclite.sh
+++ b/examples/other/text_frontend/make_sclite.sh
--- a/examples/other/tn/README.md
+++ b/examples/other/tn/README.md
--- a/examples/other/text_frontend/data/textnorm_test_cases.txt
+++ b/examples/other/text_frontend/data/textnorm_test_cases.txt
--- a/examples/other/text_frontend/get_textnorm_data.py
+++ b/examples/other/text_frontend/get_textnorm_data.py
--- a/examples/other/text_frontend/run.sh
+++ b/examples/other/text_frontend/run.sh
--- a/examples/other/text_frontend/test_textnorm.py
+++ b/examples/other/text_frontend/test_textnorm.py
--- a/examples/ted_en_zh/t0/conf/transformer_joint_noam.yaml
+++ b/examples/ted_en_zh/t0/conf/transformer_joint_noam.yaml
--- a/examples/ted_en_zh/t0/run.sh
+++ b/examples/ted_en_zh/t0/run.sh
--- a/examples/timit/asr1/run.sh
+++ b/examples/timit/asr1/run.sh
--- a/examples/tiny/s0/local/download_lm_en.sh
+++ b/examples/tiny/s0/local/download_lm_en.sh
--- a/examples/tiny/s0/local/test.sh
+++ b/examples/tiny/s0/local/test.sh
--- a/examples/tiny/s1/run.sh
+++ b/examples/tiny/s1/run.sh
--- a/examples/vctk/tts3/README.md
+++ b/examples/vctk/tts3/README.md
--- a/examples/vctk/tts3/local/synthesize.sh
+++ b/examples/vctk/tts3/local/synthesize.sh
--- a/examples/vctk/tts3/local/synthesize_e2e.sh
+++ b/examples/vctk/tts3/local/synthesize_e2e.sh
--- a/examples/vctk/tts3/local/train.sh
+++ b/examples/vctk/tts3/local/train.sh
--- a/examples/vctk/voc1/README.md
+++ b/examples/vctk/voc1/README.md
--- a/examples/vctk/voc1/local/train.sh
+++ b/examples/vctk/voc1/local/train.sh
--- a/paddlespeech/cls/__init__.py
+++ b/paddlespeech/cls/__init__.py
--- a/paddlespeech/s2t/decoders/ctcdecoder/swig/setup.py
+++ b/paddlespeech/s2t/decoders/ctcdecoder/swig/setup.py
--- a/paddlespeech/s2t/exps/deepspeech2/bin/test_export.py
+++ b/paddlespeech/s2t/exps/deepspeech2/bin/test_export.py
--- a/paddlespeech/s2t/exps/deepspeech2/model.py
+++ b/paddlespeech/s2t/exps/deepspeech2/model.py
--- a/paddlespeech/s2t/exps/lm/transformer/lm_cacu_perplexity.py
+++ b/paddlespeech/s2t/exps/lm/transformer/lm_cacu_perplexity.py
--- a/paddlespeech/s2t/exps/u2/model.py
+++ b/paddlespeech/s2t/exps/u2/model.py
--- a/paddlespeech/s2t/exps/u2_st/model.py
+++ b/paddlespeech/s2t/exps/u2_st/model.py
--- a/paddlespeech/s2t/io/collator.py
+++ b/paddlespeech/s2t/io/collator.py
--- a/paddlespeech/s2t/io/dataset.py
+++ b/paddlespeech/s2t/io/dataset.py
--- a/paddlespeech/s2t/models/lm/dataset.py
+++ b/paddlespeech/s2t/models/lm/dataset.py
--- a/paddlespeech/s2t/models/u2_st/u2_st.py
+++ b/paddlespeech/s2t/models/u2_st/u2_st.py
--- a/paddlespeech/t2s/datasets/am_batch_fn.py
+++ b/paddlespeech/t2s/datasets/am_batch_fn.py
--- a/paddlespeech/t2s/datasets/vocoder_batch_fn.py
+++ b/paddlespeech/t2s/datasets/vocoder_batch_fn.py
--- a/paddlespeech/t2s/exps/fastspeech2/gen_gta_mel.py
+++ b/paddlespeech/t2s/exps/fastspeech2/gen_gta_mel.py
--- a/paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e.py
+++ b/paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e.py
--- a/paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e_en.py
+++ b/paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e_en.py
--- a/paddlespeech/t2s/exps/fastspeech2/normalize.py
+++ b/paddlespeech/t2s/exps/fastspeech2/normalize.py
--- a/paddlespeech/t2s/exps/fastspeech2/preprocess.py
+++ b/paddlespeech/t2s/exps/fastspeech2/preprocess.py
--- a/paddlespeech/t2s/exps/fastspeech2/synthesize.py
+++ b/paddlespeech/t2s/exps/fastspeech2/synthesize.py
--- a/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e.py
+++ b/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e.py
--- a/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e_en.py
+++ b/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e_en.py
--- a/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e_melgan.py
+++ b/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e_melgan.py
--- a/paddlespeech/t2s/exps/fastspeech2/train.py
+++ b/paddlespeech/t2s/exps/fastspeech2/train.py
--- a/paddlespeech/t2s/exps/fastspeech2/voice_cloning.py
+++ b/paddlespeech/t2s/exps/fastspeech2/voice_cloning.py
--- a/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/synthesize.py
+++ b/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/synthesize.py
--- a/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/train.py
+++ b/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/train.py
--- a/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/synthesize.py
+++ b/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/synthesize.py
--- a/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/synthesize_from_wav.py
+++ b/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/synthesize_from_wav.py
--- a/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/train.py
+++ b/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/train.py
--- a/paddlespeech/t2s/exps/gan_vocoder/preprocess.py
+++ b/paddlespeech/t2s/exps/gan_vocoder/preprocess.py
--- a/paddlespeech/t2s/exps/speedyspeech/inference.py
+++ b/paddlespeech/t2s/exps/speedyspeech/inference.py
--- a/paddlespeech/t2s/exps/speedyspeech/synthesize.py
+++ b/paddlespeech/t2s/exps/speedyspeech/synthesize.py
--- a/paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py
+++ b/paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py
--- a/paddlespeech/t2s/exps/speedyspeech/train.py
+++ b/paddlespeech/t2s/exps/speedyspeech/train.py
--- a/paddlespeech/t2s/exps/tacotron2/synthesize.py
+++ b/paddlespeech/t2s/exps/tacotron2/synthesize.py
--- a/paddlespeech/t2s/exps/tacotron2/train.py
+++ b/paddlespeech/t2s/exps/tacotron2/train.py
--- a/paddlespeech/t2s/exps/transformer_tts/synthesize.py
+++ b/paddlespeech/t2s/exps/transformer_tts/synthesize.py
--- a/paddlespeech/t2s/exps/transformer_tts/synthesize_e2e.py
+++ b/paddlespeech/t2s/exps/transformer_tts/synthesize_e2e.py
--- a/paddlespeech/t2s/exps/transformer_tts/train.py
+++ b/paddlespeech/t2s/exps/transformer_tts/train.py
--- a/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/train.py
+++ b/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/train.py
--- a/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/voice_cloning.py
+++ b/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/voice_cloning.py
--- a/paddlespeech/t2s/exps/waveflow/synthesize.py
+++ b/paddlespeech/t2s/exps/waveflow/synthesize.py
--- a/paddlespeech/t2s/exps/waveflow/train.py
+++ b/paddlespeech/t2s/exps/waveflow/train.py
--- a/paddlespeech/t2s/frontend/pinyin.py
+++ b/paddlespeech/t2s/frontend/pinyin.py
--- a/paddlespeech/t2s/frontend/zh_frontend.py
+++ b/paddlespeech/t2s/frontend/zh_frontend.py
--- a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py
+++ b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py
--- a/paddlespeech/t2s/models/fastspeech2/fastspeech2_updater.py
+++ b/paddlespeech/t2s/models/fastspeech2/fastspeech2_updater.py
--- a/paddlespeech/t2s/models/transformer_tts/transformer_tts.py
+++ b/paddlespeech/t2s/models/transformer_tts/transformer_tts.py
--- a/paddlespeech/t2s/modules/fastspeech2_transformer/encoder.py
+++ b/paddlespeech/t2s/modules/fastspeech2_transformer/encoder.py
--- a/paddlespeech/t2s/training/cli.py
+++ b/paddlespeech/t2s/training/cli.py
--- a/paddlespeech/t2s/training/experiment.py
+++ b/paddlespeech/t2s/training/experiment.py
--- a/paddlespeech/vector/__init__.py
+++ b/paddlespeech/vector/__init__.py
--- a/paddlespeech/t2s/exps/ge2e/__init__.py
+++ b/paddlespeech/t2s/exps/ge2e/__init__.py
--- a/paddlespeech/vector/exps/ge2e/__init__.py
+++ b/paddlespeech/vector/exps/ge2e/__init__.py
--- a/paddlespeech/t2s/exps/ge2e/audio_processor.py
+++ b/paddlespeech/t2s/exps/ge2e/audio_processor.py
--- a/paddlespeech/t2s/exps/ge2e/config.py
+++ b/paddlespeech/t2s/exps/ge2e/config.py
--- a/paddlespeech/t2s/exps/ge2e/dataset_processors.py
+++ b/paddlespeech/t2s/exps/ge2e/dataset_processors.py
--- a/paddlespeech/t2s/exps/ge2e/inference.py
+++ b/paddlespeech/t2s/exps/ge2e/inference.py
--- a/paddlespeech/t2s/exps/ge2e/preprocess.py
+++ b/paddlespeech/t2s/exps/ge2e/preprocess.py
--- a/paddlespeech/t2s/exps/ge2e/random_cycle.py
+++ b/paddlespeech/t2s/exps/ge2e/random_cycle.py
--- a/paddlespeech/t2s/exps/ge2e/speaker_verification_dataset.py
+++ b/paddlespeech/t2s/exps/ge2e/speaker_verification_dataset.py
--- a/paddlespeech/t2s/exps/ge2e/train.py
+++ b/paddlespeech/t2s/exps/ge2e/train.py
--- a/paddlespeech/vector/models/__init__.py
+++ b/paddlespeech/vector/models/__init__.py
--- a/paddlespeech/t2s/models/lstm_speaker_encoder.py
+++ b/paddlespeech/t2s/models/lstm_speaker_encoder.py
--- a/requirements.txt
+++ b/requirements.txt
--- a/setup.py
+++ b/setup.py
--- a/setup.sh
+++ b/setup.sh
--- a/speechnn/.gitignore
+++ b/speechnn/.gitignore
--- a/speechnn/CMakeLists.txt
+++ b/speechnn/CMakeLists.txt
--- a/speechnn/cmake/third_party.cmake
+++ b/speechnn/cmake/third_party.cmake
--- a/speechnn/cmake/third_party/absl.cmake
+++ b/speechnn/cmake/third_party/absl.cmake
--- a/speechnn/cmake/third_party/boost.cmake
+++ b/speechnn/cmake/third_party/boost.cmake
--- a/speechnn/cmake/third_party/eigen.cmake
+++ b/speechnn/cmake/third_party/eigen.cmake
--- a/speechnn/cmake/third_party/libsndfile.cmake
+++ b/speechnn/cmake/third_party/libsndfile.cmake
--- a/speechnn/cmake/third_party/openfst.cmake
+++ b/speechnn/cmake/third_party/openfst.cmake
--- a/speechnn/cmake/third_party/openfst_lib_target.cmake
+++ b/speechnn/cmake/third_party/openfst_lib_target.cmake
--- a/speechnn/cmake/third_party/threadpool.cmake
+++ b/speechnn/cmake/third_party/threadpool.cmake
--- a/speechnn/cmake/third_party/version.cmake
+++ b/speechnn/cmake/third_party/version.cmake
--- a/speechnn/core/transformers/.gitkeep
+++ b/speechnn/core/transformers/.gitkeep
--- a/speechnn/core/transformers/README.md
+++ b/speechnn/core/transformers/README.md
--- a/speechnn/examples/.gitkeep
+++ b/speechnn/examples/.gitkeep
--- a/speechnn/examples/CMakeLists.txt
+++ b/speechnn/examples/CMakeLists.txt
--- a/speechnn/speechnn/CMakeLists.txt
+++ b/speechnn/speechnn/CMakeLists.txt
--- a/speechnn/speechnn/decoder/CMakeLists.txt
+++ b/speechnn/speechnn/decoder/CMakeLists.txt
--- a/speechnn/speechnn/frontend/CMakeLists.txt
+++ b/speechnn/speechnn/frontend/CMakeLists.txt
--- a/speechnn/speechnn/frontend/audio/CMakeLists.txt
+++ b/speechnn/speechnn/frontend/audio/CMakeLists.txt
--- a/speechnn/speechnn/frontend/text/CMakeLists.txt
+++ b/speechnn/speechnn/frontend/text/CMakeLists.txt
--- a/speechnn/speechnn/model/CMakeLists.txt
+++ b/speechnn/speechnn/model/CMakeLists.txt
--- a/speechnn/speechnn/nn/CMakeLists.txt
+++ b/speechnn/speechnn/nn/CMakeLists.txt
--- a/speechnn/speechnn/protocol/CMakeLists.txt
+++ b/speechnn/speechnn/protocol/CMakeLists.txt
--- a/speechnn/speechnn/utils/CMakeLists.txt
+++ b/speechnn/speechnn/utils/CMakeLists.txt
--- a/tests/benchmark/conformer/README.md
+++ b/tests/benchmark/conformer/README.md
--- a/tests/benchmark/conformer/analysis.py
+++ b/tests/benchmark/conformer/analysis.py
--- a/tests/benchmark/conformer/prepare.sh
+++ b/tests/benchmark/conformer/prepare.sh
--- a/tests/benchmark/conformer/run.sh
+++ b/tests/benchmark/conformer/run.sh
--- a/tests/benchmark/conformer/run_analysis_mp.sh
+++ b/tests/benchmark/conformer/run_analysis_mp.sh
--- a/tests/benchmark/conformer/run_analysis_sp.sh
+++ b/tests/benchmark/conformer/run_analysis_sp.sh
--- a/tests/benchmark/conformer/run_benchmark.sh
+++ b/tests/benchmark/conformer/run_benchmark.sh
--- a/tests/benchmark/pwgan/run_all.sh
+++ b/tests/benchmark/pwgan/run_all.sh
--- a/tests/benchmark/pwgan/run_benchmark.sh
+++ b/tests/benchmark/pwgan/run_benchmark.sh
--- a/tests/chains/speedyspeech/test.sh
+++ b/tests/chains/speedyspeech/test.sh
--- a/third_party/python_kaldi_features/setup.py
+++ b/third_party/python_kaldi_features/setup.py
--- a/tools/Makefile
+++ b/tools/Makefile
--- a/tools/extras/install_mfa_v1.sh
+++ b/tools/extras/install_mfa_v1.sh
--- a/tools/extras/install_mfa.sh
+++ b/tools/extras/install_mfa.sh
--- a/tools/extras/install_miniconda.sh
+++ b/tools/extras/install_miniconda.sh
--- a/tools/extras/install_sclite.sh
+++ b/tools/extras/install_sclite.sh
--- a/tools/extras/install_sox.sh
+++ b/tools/extras/install_sox.sh
--- a/tools/extras/install_venv.sh
+++ b/tools/extras/install_venv.sh