diff --git a/.bashrc b/.bashrc deleted file mode 100644 index 8abbb3c7d761a844c786122174e26d62f64901cd..0000000000000000000000000000000000000000 --- a/.bashrc +++ /dev/null @@ -1,15 +0,0 @@ -unset GREP_OPTIONS - -# https://zhuanlan.zhihu.com/p/33050965 -alias nvs='nvidia-smi' -alias his='history' -alias jobs='jobs -l' -alias ports='netstat -tulanp' -alias wget='wget -c' - -## Colorize the grep command output for ease of use (good for log files)## -alias grep='grep --color=auto' -alias egrep='egrep --color=auto' -alias fgrep='fgrep --color=auto' - - diff --git a/.vimrc b/.vimrc deleted file mode 100644 index fbb070d6ea518080c443bda38488dfdf6eadc923..0000000000000000000000000000000000000000 --- a/.vimrc +++ /dev/null @@ -1,468 +0,0 @@ -""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" -" Maintainer: -" Amir Salihefendic — @amix3k -" -" Awesome_version: -" Get this config, nice color schemes and lots of plugins! -" -" Install the awesome version from: -" -" https://github.com/amix/vimrc -" -" Sections: -" -> General -" -> VIM user interface -" -> Colors and Fonts -" -> Files and backups -" -> Text, tab and indent related -" -> Visual mode related -" -> Moving around, tabs and buffers -" -> Status line -" -> Editing mappings -" -> vimgrep searching and cope displaying -" -> Spell checking -" -> Misc -" -> Helper functions -" -""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" - - -""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" -" => General -""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" -" Sets how many lines of history VIM has to remember -set history=500 - -" Enable filetype plugins -filetype plugin on -filetype indent on - -" Set to auto read when a file is changed from the outside -set autoread -au FocusGained,BufEnter * checktime - -" With a map leader it's possible to do extra key combinations -" like w saves the current file -let mapleader = "," - -" Fast saving -nmap w :w! - -" :W sudo saves the file -" (useful for handling the permission-denied error) -command! W execute 'w !sudo tee % > /dev/null' edit! 
- - -""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" -" => VIM user interface -""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" -" Set 7 lines to the cursor - when moving vertically using j/k -set so=7 - -" Avoid garbled characters in Chinese language windows OS -let $LANG='en' -set langmenu=en -source $VIMRUNTIME/delmenu.vim -source $VIMRUNTIME/menu.vim - -" Turn on the Wild menu -set wildmenu - -" Ignore compiled files -set wildignore=*.o,*~,*.pyc -if has("win16") || has("win32") - set wildignore+=.git\*,.hg\*,.svn\* -else - set wildignore+=*/.git/*,*/.hg/*,*/.svn/*,*/.DS_Store -endif - -"Always show current position -set ruler - -" Height of the command bar -set cmdheight=1 - -" A buffer becomes hidden when it is abandoned -set hid - -" Configure backspace so it acts as it should act -set backspace=eol,start,indent -set whichwrap+=<,>,h,l - -" Ignore case when searching -set ignorecase - -" When searching try to be smart about cases -set smartcase - -" Highlight search results -set hlsearch - -" Makes search act like search in modern browsers -set incsearch - -" Don't redraw while executing macros (good performance config) -set lazyredraw - -" For regular expressions turn magic on -set magic - -" Show matching brackets when text indicator is over them -set showmatch -" How many tenths of a second to blink when matching brackets -set mat=2 - -" No annoying sound on errors -set noerrorbells -set novisualbell -set t_vb= -set tm=500 - -" Properly disable sound on errors on MacVim -if has("gui_macvim") - autocmd GUIEnter * set vb t_vb= -endif - - -" Add a bit extra margin to the left -set foldcolumn=1 - - -""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" -" => Colors and Fonts -""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" -" Enable syntax highlighting -syntax enable - -" Enable 256 colors palette in Gnome Terminal -if $COLORTERM == 'gnome-terminal' - set t_Co=256 -endif - -try - colorscheme desert -catch -endtry - -set background=dark - -" Set extra options when running in GUI mode -if has("gui_running") - set guioptions-=T - set guioptions-=e - set t_Co=256 - set guitablabel=%M\ %t -endif - -" Set utf8 as standard encoding and en_US as the standard language -set encoding=utf8 -set fileencodings=ucs-bom,utf-8,cp936 -set fileencoding=gb2312 -set termencoding=utf-8 - -" Use Unix as the standard file type -set ffs=unix,dos,mac - - -""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" -" => Files, backups and undo -""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" -" Turn backup off, since most stuff is in SVN, git etc. anyway... -set nobackup -set nowb -set noswapfile - - -""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" -" => Text, tab and indent related -""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" -" Use spaces instead of tabs -set expandtab - -" Be smart when using tabs ;) -set smarttab - -" 1 tab == 4 spaces -set shiftwidth=4 -set tabstop=4 - -" Linebreak on 500 characters -set lbr -set tw=500 - -set ai "Auto indent -set si "Smart indent -set wrap "Wrap lines - - -"""""""""""""""""""""""""""""" -" => Visual mode related -"""""""""""""""""""""""""""""" -" Visual mode pressing * or # searches for the current selection -" Super useful! 
From an idea by Michael Naumann -vnoremap * :call VisualSelection('', '')/=@/ -vnoremap # :call VisualSelection('', '')?=@/ - - -""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" -" => Moving around, tabs, windows and buffers -""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" -" Map to / (search) and Ctrl- to ? (backwards search) -map / -map ? - -" Disable highlight when is pressed -map :noh - -" Smart way to move between windows -map j -map k -map h -map l - -" Close the current buffer -map bd :Bclose:tabclosegT - -" Close all the buffers -map ba :bufdo bd - -map l :bnext -map h :bprevious - -" Useful mappings for managing tabs -map tn :tabnew -map to :tabonly -map tc :tabclose -map tm :tabmove -map t :tabnext - -" Let 'tl' toggle between this and the last accessed tab -let g:lasttab = 1 -nmap tl :exe "tabn ".g:lasttab -au TabLeave * let g:lasttab = tabpagenr() - - -" Opens a new tab with the current buffer's path -" Super useful when editing files in the same directory -map te :tabedit =expand("%:p:h")/ - -" Switch CWD to the directory of the open buffer -map cd :cd %:p:h:pwd - -" Specify the behavior when switching between buffers -try - set switchbuf=useopen,usetab,newtab - set stal=2 -catch -endtry - -" Return to last edit position when opening files (You want this!) -au BufReadPost * if line("'\"") > 1 && line("'\"") <= line("$") | exe "normal! g'\"" | endif - - -"""""""""""""""""""""""""""""" -" => Status line -"""""""""""""""""""""""""""""" -" Always show the status line -set laststatus=2 - -" Format the status line -set statusline=\ %{HasPaste()}%F%m%r%h\ %w\ \ CWD:\ %r%{getcwd()}%h\ \ \ Line:\ %l\ \ Column:\ %c - - -""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" -" => Editing mappings -""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" -" Remap VIM 0 to first non-blank character -map 0 ^ - -" Move a line of text using ALT+[jk] or Command+[jk] on mac -nmap mz:m+`z -nmap mz:m-2`z -vmap :m'>+`mzgv`yo`z -vmap :m'<-2`>my` - nmap - vmap - vmap -endif - -" Delete trailing white space on save, useful for some filetypes ;) -fun! CleanExtraSpaces() - let save_cursor = getpos(".") - let old_query = getreg('/') - silent! %s/\s\+$//e - call setpos('.', save_cursor) - call setreg('/', old_query) -endfun - -if has("autocmd") - autocmd BufWritePre *.txt,*.js,*.py,*.wiki,*.sh,*.coffee :call CleanExtraSpaces() -endif - - -""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" -" => Spell checking -""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" -" Pressing ,ss will toggle and untoggle spell checking -map ss :setlocal spell! - -" Shortcuts using -map sn ]s -map sp [s -map sa zg -map s? z= - - -""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" -" => Misc -""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" -" Remove the Windows ^M - when the encodings gets messed up -noremap m mmHmt:%s///ge'tzt'm - -" Quickly open a buffer for scribble -map q :e ~/buffer - -" Quickly open a markdown buffer for scribble -map x :e ~/buffer.md - -" Toggle paste mode on and off -map pp :setlocal paste! - - -""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" -" => Helper functions -""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" -" Returns true if paste mode is enabled -function! HasPaste() - if &paste - return 'PASTE MODE ' - endif - return '' -endfunction - -" Don't close window, when deleting a buffer -command! Bclose call BufcloseCloseIt() -function! 
BufcloseCloseIt() - let l:currentBufNum = bufnr("%") - let l:alternateBufNum = bufnr("#") - - if buflisted(l:alternateBufNum) - buffer # - else - bnext - endif - - if bufnr("%") == l:currentBufNum - new - endif - - if buflisted(l:currentBufNum) - execute("bdelete! ".l:currentBufNum) - endif -endfunction - -function! CmdLine(str) - call feedkeys(":" . a:str) -endfunction - -function! VisualSelection(direction, extra_filter) range - let l:saved_reg = @" - execute "normal! vgvy" - - let l:pattern = escape(@", "\\/.*'$^~[]") - let l:pattern = substitute(l:pattern, "\n$", "", "") - - if a:direction == 'gv' - call CmdLine("Ack '" . l:pattern . "' " ) - elseif a:direction == 'replace' - call CmdLine("%s" . '/'. l:pattern . '/') - endif - - let @/ = l:pattern - let @" = l:saved_reg -endfunction - - -"""""""""""""""""""""""""""""" -" => Python section -"""""""""""""""""""""""""""""" -let python_highlight_all = 1 -au FileType python syn keyword pythonDecorator True None False self - -au BufNewFile,BufRead *.jinja set syntax=htmljinja -au BufNewFile,BufRead *.mako set ft=mako - -au FileType python map F :set foldmethod=indent - -au FileType python inoremap $r return -au FileType python inoremap $i import -au FileType python inoremap $p print -au FileType python inoremap $f # --- a -au FileType python map 1 /class -au FileType python map 2 /def -au FileType python map C ?class -au FileType python map D ?def - - -"""""""""""""""""""""""""""""" -" => JavaScript section -""""""""""""""""""""""""""""""" -au FileType javascript call JavaScriptFold() -au FileType javascript setl fen -au FileType javascript setl nocindent - -au FileType javascript imap $log();hi -au FileType javascript imap alert();hi - -au FileType javascript inoremap $r return -au FileType javascript inoremap $f // --- PHFP2xi - -function! JavaScriptFold() - setl foldmethod=syntax - setl foldlevelstart=1 - syn region foldBraces start=/{/ end=/}/ transparent fold keepend extend - - function! FoldText() - return substitute(getline(v:foldstart), '{.*', '{...}', '') - endfunction - setl foldtext=FoldText() -endfunction - - -"""""""""""""""""""""""""""""" -" => CoffeeScript section -""""""""""""""""""""""""""""""" -function! CoffeeScriptFold() - setl foldmethod=indent - setl foldlevelstart=1 -endfunction -au FileType coffee call CoffeeScriptFold() - -au FileType gitcommit call setpos('.', [0, 1, 1, 0]) - - -"""""""""""""""""""""""""""""" -" => Shell section -"""""""""""""""""""""""""""""" -if exists('$TMUX') - if has('nvim') - set termguicolors - else - set term=screen-256color - endif -endif - - -"""""""""""""""""""""""""""""" -" => Twig section -"""""""""""""""""""""""""""""" -autocmd BufRead *.twig set syntax=html filetype=html - - -"""""""""""""""""""""""""""""" -" => Markdown -"""""""""""""""""""""""""""""" -let vim_markdown_folding_disabled = 1 diff --git a/docs/source/index.rst b/docs/source/index.rst index 53e5d15df5baaf307a3d0c24fce608af0d34a5e2..ea2599abe49e06c2b652488d073bda45a3a3b80e 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -23,7 +23,7 @@ Contents .. toctree:: :maxdepth: 1 - :caption: Speech-To-Text + :caption: Speech-to-Text asr/models_introduction asr/data_preparation @@ -33,7 +33,7 @@ Contents .. 
toctree:: :maxdepth: 1 - :caption: Text-To-Speech + :caption: Text-to-Speech tts/basic_usage tts/advanced_usage diff --git a/docs/source/introduction.md b/docs/source/introduction.md index e7dd2892afe37c1391c04ca2bc9a410ea7754756..e3fc8b9ea9e1c2d9b6d80e8ea6edb1c6dbbf1385 100644 --- a/docs/source/introduction.md +++ b/docs/source/introduction.md @@ -1,11 +1,11 @@ # PaddleSpeech ## What is PaddleSpeech? -PaddleSpeech is an open-source toolkit on PaddlePaddle platform for two critical tasks in Speech - Speech-To-Text (Automatic Speech Recognition, ASR) and Text-To-Speech Synthesis (TTS), with modules involving state-of-art and influential models. +PaddleSpeech is an open-source toolkit on the PaddlePaddle platform for two critical speech tasks - Speech-to-Text (Automatic Speech Recognition, ASR) and Text-to-Speech Synthesis (TTS) - with modules involving state-of-the-art and influential models. ## What can PaddleSpeech do? -### Speech-To-Text +### Speech-to-Text PaddleSpeech ASR mainly consists of components below: - Implementation of models and commonly used neural network layers. - Dataset abstraction and common data preprocessing pipelines. @@ -29,9 +29,9 @@ PaddleSpeech ASR provides you with a complete ASR pipeline, including: - attention decoding (used in Transformer and Conformer) - attention rescoring (used in Transformer and Conformer) -Speech-To-Text helps you training the ASR model very simply. +Speech-to-Text helps you train ASR models with simple commands. -### Text-To-Speech +### Text-to-Speech TTS mainly consists of components below: - Implementation of models and commonly used neural network layers. - Dataset abstraction and common data preprocessing pipelines. @@ -53,4 +53,4 @@ PaddleSpeech TTS provides you with a complete TTS pipeline, including: - Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis - GE2E -Text-To-Speech helps you to train TTS models with simple commands. +Text-to-Speech helps you to train TTS models with simple commands. 
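The ASR pipeline changed above lists its decoding strategies only by name. As a rough illustration of the simplest one, CTC greedy search, here is a minimal self-contained Python sketch; it is not PaddleSpeech's decoder code, and the toy vocabulary, blank id of 0, and array shapes are assumptions made for the example.

```python
import numpy as np

def ctc_greedy_search(log_probs: np.ndarray, blank_id: int = 0) -> list:
    """Decode a (num_frames, vocab_size) matrix of per-frame log-probabilities."""
    best_ids = log_probs.argmax(axis=-1)                 # frame-wise argmax
    collapsed = [int(t) for i, t in enumerate(best_ids)
                 if i == 0 or t != best_ids[i - 1]]      # merge consecutive repeats
    return [t for t in collapsed if t != blank_id]       # drop the CTC blank symbol

# Toy example: 5 frames over a hypothetical vocabulary {0: <blank>, 1: "a", 2: "b"}.
toy = np.log(np.array([[0.1, 0.8, 0.1],
                       [0.1, 0.8, 0.1],
                       [0.8, 0.1, 0.1],
                       [0.1, 0.1, 0.8],
                       [0.1, 0.1, 0.8]]))
print(ctc_greedy_search(toy))  # -> [1, 2], i.e. "ab"
```

CTC prefix beam search and attention rescoring build on the same per-frame scores but keep several candidate hypotheses instead of a single argmax path.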
diff --git a/docs/source/released_model.md b/docs/source/released_model.md index bb03689c7e3f1712af5f4d0d47c328206765d770..78f5c92f0ed0aae46c5ec3a82d59c10bf4360064 100644 --- a/docs/source/released_model.md +++ b/docs/source/released_model.md @@ -1,16 +1,17 @@ # Released Models -## Speech-To-Text Models +## Speech-to-Text Models ### Acoustic Model Released in paddle 2.X -Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER | Hours of speech -:-------------:| :------------:| :-----: | -----: | :----------------- |:--------- | :---------- | :--------- -[Ds2 Online Aishell Model](https://deepspeech.bj.bcebos.com/release2.2/aishell/s0/ds2_online_aishll_CER8.02_release.tar.gz) | Aishell Dataset | Char-based | 345 MB | 2 Conv + 5 LSTM layers with only forward direction | 0.080218 |-| 151 h -[Ds2 Offline Aishell Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s0/aishell.s0.ds2.offline.cer6p65.release.tar.gz)| Aishell Dataset | Char-based | 306 MB | 2 Conv + 3 bidirectional GRU layers| 0.065 |-| 151 h -[Conformer Online Aishell Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s1/aishell.chunk.release.tar.gz) | Aishell Dataset | Char-based | 283 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention + CTC | 0.0594 |-| 151 h -[Conformer Offline Aishell Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s1/aishell.release.tar.gz) | Aishell Dataset | Char-based | 284 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention | 0.0547 |-| 151 h -[Conformer Librispeech Model](https://deepspeech.bj.bcebos.com/release2.1/librispeech/s1/conformer.release.tar.gz) | Librispeech Dataset | Word-based | 287 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention |-| 0.0325 | 960 h -[Transformer Librispeech Model](https://deepspeech.bj.bcebos.com/release2.1/librispeech/s1/transformer.release.tar.gz) | Librispeech Dataset | Word-based | 195 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: Attention |-| 0.0544 | 960 h +Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER | Hours of speech | example link +:-------------:| :------------:| :-----: | -----: | :----------------- |:--------- | :---------- | :--------- | :----------- +[Ds2 Online Aishell S0 Model](https://deepspeech.bj.bcebos.com/release2.2/aishell/s0/ds2_online_aishll_CER8.02_release.tar.gz) | Aishell Dataset | Char-based | 345 MB | 2 Conv + 5 LSTM layers with only forward direction | 0.080218 |-| 151 h | [D2 Online Aishell S0 Example](../../examples/aishell/s0) +[Ds2 Offline Aishell S0 Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s0/aishell.s0.ds2.offline.cer6p65.release.tar.gz)| Aishell Dataset | Char-based | 306 MB | 2 Conv + 3 bidirectional GRU layers| 0.065 |-| 151 h | [Ds2 Offline Aishell S0 Example](../../examples/aishell/s0) +[Conformer Online Aishell S1 Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s1/aishell.chunk.release.tar.gz) | Aishell Dataset | Char-based | 283 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0594 |-| 151 h | [Conformer Online Aishell S1 Example](../../examples/aishell/s1) +[Conformer Offline Aishell S1 Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s1/aishell.release.tar.gz) | Aishell Dataset | Char-based | 284 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0547 |-| 151 h | [Conformer Offline Aishell S1 Example](../../examples/aishell/s1) +[Conformer Librispeech S1 
Model](https://deepspeech.bj.bcebos.com/release2.1/librispeech/s1/conformer.release.tar.gz) | Librispeech Dataset | subword-based | 287 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring |-| 0.0325 | 960 h | [Conformer Librispeech S1 example](../../example/librispeech/s1) +[Transformer Librispeech S1 Model](https://deepspeech.bj.bcebos.com/release2.2/librispeech/s1/librispeech.s1.transformer.all.wer5p62.release.tar.gz) | Librispeech Dataset | subword-based | 131 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: Attention rescoring |-| 0.0456 | 960 h | [Transformer Librispeech S1 example](../../example/librispeech/s1) +[Transformer Librispeech S2 Model](https://deepspeech.bj.bcebos.com/release2.2/librispeech/s2/libri_transformer_espnet_wer3p84.release.tar.gz) | Librispeech Dataset | subword-based | 131 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: Attention |-| 0.0384 | 960 h | [Transformer Librispeech S2 example](../../example/librispeech/s2) ### Acoustic Model Transformed from paddle 1.8 Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER | Hours of speech @@ -27,7 +28,7 @@ Language Model | Training Data | Token-based | Size | Descriptions [Mandarin LM Small](https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm) | Baidu Internal Corpus | Char-based | 2.8 GB | Pruned with 0 1 2 4 4;
About 0.13 billion n-grams;
'probing' binary with default settings [Mandarin LM Large](https://deepspeech.bj.bcebos.com/zh_lm/zhidao_giga.klm) | Baidu Internal Corpus | Char-based | 70.4 GB | No Pruning;
About 3.7 billion n-grams;
'probing' binary with default settings -## Text-To-Speech Models +## Text-to-Speech Models ### Acoustic Models Model Type | Dataset| Example Link | Pretrained Models|Static Models|Siize(static) :-------------:| :------------:| :-----: | :-----:| :-----:| :-----: diff --git a/examples/aishell/s1/run.sh b/examples/aishell/s1/run.sh index f2877945241e2e7e843e26adcd67dc7e8bb3c3dc..0b40e0649fd7d13c90c785909be56631f3630c77 100644 --- a/examples/aishell/s1/run.sh +++ b/examples/aishell/s1/run.sh @@ -41,10 +41,10 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi -if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then - # export ckpt avg_n - CUDA_VISIBLE_DEVICES=0 ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit -fi +# if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then +# # export ckpt avg_n +# CUDA_VISIBLE_DEVICES=0 ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit +# fi # Optionally, you can add LM and test it with runtime. if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then diff --git a/examples/aishell3/tts3/README.md b/examples/aishell3/tts3/README.md index 82b69ad8432b8bf9b4649502ffa785de7600edc4..fe4887b935e329abe58ac075a57d7f1e993ad49d 100644 --- a/examples/aishell3/tts3/README.md +++ b/examples/aishell3/tts3/README.md @@ -24,7 +24,7 @@ Assume the path to the dataset is `~/datasets/data_aishell3`. Assume the path to the MFA result of AISHELL-3 is `./aishell3_alignment_tone`. Run the command below to 1. **source path**. -2. preprocess the dataset, +2. preprocess the dataset. 3. train the model. 4. synthesize wavs. - synthesize waveform from `metadata.jsonl`. diff --git a/examples/aishell3/tts3/conf/default.yaml b/examples/aishell3/tts3/conf/default.yaml index 90816e7d7e1627c0ca138d4ae45309ecad9d0160..1dd782dbeed656429499cf3518e0becda0619202 100644 --- a/examples/aishell3/tts3/conf/default.yaml +++ b/examples/aishell3/tts3/conf/default.yaml @@ -24,7 +24,7 @@ f0max: 400 # Minimum f0 for pitch extraction. # DATA SETTING # ########################################################### batch_size: 64 -num_workers: 4 +num_workers: 2 ########################################################### diff --git a/examples/aishell3/tts3/run.sh b/examples/aishell3/tts3/run.sh index 656710763aa578716d16f77c69c84ea402b7b9e9..95e4d38fe212f6bdbc9e8ab143671f8a427ecfc8 100755 --- a/examples/aishell3/tts3/run.sh +++ b/examples/aishell3/tts3/run.sh @@ -7,7 +7,6 @@ gpus=0,1 stage=0 stop_stage=100 - conf_path=conf/default.yaml train_output_path=exp/default ckpt_name=snapshot_iter_482.pdz diff --git a/examples/aishell3/vc0/README.md b/examples/aishell3/vc0/README.md index 28fea629262e303f8752c457eb1438936bc27eb4..2f1b37ee2fcd0906265ffe19101365189dd155e5 100644 --- a/examples/aishell3/vc0/README.md +++ b/examples/aishell3/vc0/README.md @@ -1,7 +1,7 @@ # Tacotron2 + AISHELL-3 Voice Cloning This example contains code used to train a [Tacotron2 ](https://arxiv.org/abs/1712.05884) model with [AISHELL-3](http://www.aishelltech.com/aishell_3). The trained model can be used in Voice Cloning Task, We refer to the model structure of [Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis](https://arxiv.org/pdf/1806.04558.pdf) . The general steps are as follows: 1. Speaker Encoder: We use a Speaker Verification to train a speaker encoder. 
Datasets used in this task are different from those used in Tacotron2, because the transcriptions are not needed, we use more datasets, refer to [ge2e](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/ge2e). -2. Synthesizer: Then, we use the trained speaker encoder to generate utterance embedding for each sentence in AISHELL-3. This embedding is a extra input of Tacotron2 which will be concated with encoder outputs. +2. Synthesizer: We use the trained speaker encoder to generate speaker embedding for each sentence in AISHELL-3. This embedding is an extra input of Tacotron2 which will be concated with encoder outputs. 3. Vocoder: We use WaveFlow as the neural Vocoder, refer to [waveflow](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc0). ## Get Started @@ -10,7 +10,7 @@ Assume the path to the MFA result of AISHELL-3 is `./alignment`. Assume the path to the pretrained ge2e model is `ge2e_ckpt_path=./ge2e_ckpt_0.3/step-3000000` Run the command below to 1. **source path**. -2. preprocess the dataset, +2. preprocess the dataset. 3. train the model. 4. start a voice cloning inference. ```bash @@ -20,8 +20,8 @@ Run the command below to ```bash CUDA_VISIBLE_DEVICES=${gpus} ./local/preprocess.sh ${input} ${preprocess_path} ${alignment} ${ge2e_ckpt_path} ``` -#### generate utterance embedding - Use pretrained GE2E (speaker encoder) to generate utterance embedding for each sentence in AISHELL-3, which has the same file structure with wav files and the format is `.npy`. +#### generate speaker embedding + Use pretrained GE2E (speaker encoder) to generate speaker embedding for each sentence in AISHELL-3, which has the same file structure with wav files and the format is `.npy`. ```bash if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then @@ -33,8 +33,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then fi ``` -The computing time of utterance embedding can be x hours. -#### process wav +The computing time of utterance embedding can be x hours. +#### process wav There are silence in the edge of AISHELL-3's wavs, and the audio amplitude is very small, so, we need to remove the silence and normalize the audio. You can the silence remove method based on volume or energy, but the effect is not very good, We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get the alignment of text and speech, then utilize the alignment results to remove the silence. We use Montreal Force Aligner 1.0. The label in aishell3 include pinyin,so the lexicon we provided to MFA is pinyin rather than Chinese characters. And the prosody marks(`$` and `%`) need to be removed. You shoud preprocess the dataset into the format which MFA needs, the texts have the same name with wavs and have the suffix `.lab`. @@ -73,7 +73,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then fi ``` -### Train the model +### Train the model ```bash CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${preprocess_path} ${train_output_path} ``` @@ -81,7 +81,7 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${preprocess_path} ${train_output_ Our model remve stop token prediction in Tacotron2, because of the problem of extremely unbalanced proportion of positive and negative samples of stop token prediction, and it's very sensitive to the clip of audio silence. We use the last symbol from the highest point of attention to the encoder side as the termination condition. 
In addition, in order to accelerate the convergence of the model, we add `guided attention loss` to induce the alignment between encoder and decoder to show diagonal lines faster. -### Infernece +### Voice Cloning ```bash CUDA_VISIBLE_DEVICES=${gpus} ./local/voice_cloning.sh ${ge2e_params_path} ${tacotron2_params_path} ${waveflow_params_path} ${vc_input} ${vc_output} ``` diff --git a/examples/aishell3/vc0/local/preprocess.sh b/examples/aishell3/vc0/local/preprocess.sh index eeb1923f13f9e9b5547ec409943333703491f5f8..5bf880667eecdb284bb2a4c64b5f7fe6722eb5d7 100755 --- a/examples/aishell3/vc0/local/preprocess.sh +++ b/examples/aishell3/vc0/local/preprocess.sh @@ -9,7 +9,7 @@ alignment=$3 ge2e_ckpt_path=$4 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - python3 ${BIN_DIR}/../../ge2e/inference.py \ + python3 ${MAIN_ROOT}/paddlespeech/vector/exps/ge2e/inference.py \ --input=${input}/wav \ --output=${preprocess_path}/embed \ --checkpoint_path=${ge2e_ckpt_path} diff --git a/examples/aishell3/vc1/README.md b/examples/aishell3/vc1/README.md new file mode 100644 index 0000000000000000000000000000000000000000..834942fa05c0d8863ae6592db4fa9d099331c845 --- /dev/null +++ b/examples/aishell3/vc1/README.md @@ -0,0 +1,129 @@ +# FastSpeech2 + AISHELL-3 Voice Cloning +This example contains code used to train a [FastSpeech2](https://arxiv.org/abs/2006.04558) model with [AISHELL-3](http://www.aishelltech.com/aishell_3). The trained model can be used in the voice cloning task. We refer to the model structure of [Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis](https://arxiv.org/pdf/1806.04558.pdf). The general steps are as follows: +1. Speaker Encoder: We use a speaker verification task to train a speaker encoder. The datasets used in this task are different from those used in `FastSpeech2`; because transcriptions are not needed, we can use more datasets; refer to [ge2e](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/ge2e). +2. Synthesizer: We use the trained speaker encoder to generate a speaker embedding for each sentence in AISHELL-3. This embedding is an extra input of `FastSpeech2` which will be concatenated with the encoder outputs. +3. Vocoder: We use [Parallel Wave GAN](http://arxiv.org/abs/1910.11480) as the neural vocoder; refer to [voc1](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/voc1). + +## Dataset +### Download and Extract the dataset +Download AISHELL-3. +```bash +wget https://www.openslr.org/resources/93/data_aishell3.tgz +``` +Extract AISHELL-3. +```bash +mkdir data_aishell3 +tar zxvf data_aishell3.tgz -C data_aishell3 +``` +### Get MFA result of AISHELL-3 and Extract it +We use [MFA2.x](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for aishell3_fastspeech2. +You can download it from here: [aishell3_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/with_tone/aishell3_alignment_tone.tar.gz), or train your own MFA model by referring to the [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) (which uses MFA1.x for now) in our repo. + +## Pretrained GE2E model +We use a pretrained GE2E model to generate a speaker embedding for each sentence. + +Download the pretrained GE2E model from here: [ge2e_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/ge2e_ckpt_0.3.zip), and `unzip` it. + +## Get Started +Assume the path to the dataset is `~/datasets/data_aishell3`. 
+Assume the path to the MFA result of AISHELL-3 is `./aishell3_alignment_tone`. +Assume the path to the pretrained ge2e model is `./ge2e_ckpt_0.3`. + +Run the command below to +1. **source path**. +2. preprocess the dataset. +3. train the model. +4. synthesize waveform from `metadata.jsonl`. +5. start a voice cloning inference. +```bash +./run.sh +``` +### Preprocess the dataset +```bash +CUDA_VISIBLE_DEVICES=${gpus} ./local/preprocess.sh ${conf_path} ${ge2e_ckpt_path} +``` +When it is done. A `dump` folder is created in the current directory. The structure of the dump folder is listed below. +```text +dump +├── dev +│ ├── norm +│ └── raw +├── embed +│ ├── SSB0005 +│ ├── SSB0009 +│ ├── ... +│ └── ... +├── phone_id_map.txt +├── speaker_id_map.txt +├── test +│ ├── norm +│ └── raw +└── train + ├── energy_stats.npy + ├── norm + ├── pitch_stats.npy + ├── raw + └── speech_stats.npy +``` +The `embed` contains the generated speaker embedding for each sentence in AISHELL-3, which has the same file structure with wav files and the format is `.npy`. + +The computing time of utterance embedding can be x hours. + +The dataset is split into 3 parts, namely `train`, `dev` and` test`, each of which contains a `norm` and `raw` sub folder. The raw folder contains speech、pitch and energy features of each utterances, while the norm folder contains normalized ones. The statistics used to normalize features are computed from the training set, which is located in `dump/train/*_stats.npy`. + +Also there is a `metadata.jsonl` in each subfolder. It is a table-like file which contains phones, text_lengths, speech_lengths, durations, path of speech features, path of pitch features, path of energy features, speaker and id of each utterance. + +The preprocessing step is very similar to that one of [tts3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/tts3), but there is one more `ge2e/inference` step here. + +### Train the model +`./local/train.sh` calls `${BIN_DIR}/train.py`. +```bash +CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} +``` +The training step is very similar to that one of [tts3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/tts3), but we should set `--voice-cloning=True` when calling `${BIN_DIR}/train.py`. + +### Synthesize +We use [parallel wavegan](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/voc1) as the neural vocoder. +Download pretrained parallel wavegan model from [pwg_aishell3_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_aishell3_ckpt_0.5.zip) and unzip it. +```bash +unzip pwg_aishell3_ckpt_0.5.zip +``` +Parallel WaveGAN checkpoint contains files listed below. +```text +pwg_aishell3_ckpt_0.5 +├── default.yaml # default config used to train parallel wavegan +├── feats_stats.npy # statistics used to normalize spectrogram when training parallel wavegan +└── snapshot_iter_1000000.pdz # generator parameters of parallel wavegan +``` +`./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveform from `metadata.jsonl`. +```bash +CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} +``` +The synthesizing step is very similar to that one of [tts3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/tts3), but we should set `--voice-cloning=True` when calling `${BIN_DIR}/synthesize.py`. 
+ +### Voice Cloning +Assume there are some reference audios in `./ref_audio` +```text +ref_audio +├── 001238.wav +├── LJ015-0254.wav +└── audio_self_test.mp3 +``` +`./local/voice_cloning.sh` calls `${BIN_DIR}/voice_cloning.py` + +```bash +CUDA_VISIBLE_DEVICES=${gpus} ./local/voice_cloning.sh ${conf_path} ${train_output_path} ${ckpt_name} ${ge2e_params_path} ${ref_audio_dir} +``` +## Pretrained Model +[fastspeech2_nosil_aishell3_vc1_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_aishell3_vc1_ckpt_0.5.zip) + +FastSpeech2 checkpoint contains files listed below. +(There is no need for `speaker_id_map.txt` here ) + +```text +fastspeech2_nosil_aishell3_ckpt_vc1_0.5 +├── default.yaml # default config used to train fastspeech2 +├── phone_id_map.txt # phone vocabulary file when training fastspeech2 +├── snapshot_iter_96400.pdz # model parameters and optimizer states +└── speech_stats.npy # statistics used to normalize spectrogram when training fastspeech2 +``` diff --git a/examples/aishell3/vc1/conf/default.yaml b/examples/aishell3/vc1/conf/default.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bdd2a765e1dee40324a05226be7bc590448c7c49 --- /dev/null +++ b/examples/aishell3/vc1/conf/default.yaml @@ -0,0 +1,105 @@ +########################################################### +# FEATURE EXTRACTION SETTING # +########################################################### + +fs: 24000 # sr +n_fft: 2048 # FFT size. +n_shift: 300 # Hop size. +win_length: 1200 # Window length. + # If set to null, it will be the same as fft_size. +window: "hann" # Window function. + +# Only used for feats_type != raw + +fmin: 80 # Minimum frequency of Mel basis. +fmax: 7600 # Maximum frequency of Mel basis. +n_mels: 80 # The number of mel basis. + +# Only used for the model using pitch features (e.g. FastSpeech2) +f0min: 80 # Maximum f0 for pitch extraction. +f0max: 400 # Minimum f0 for pitch extraction. 
+ + +########################################################### +# DATA SETTING # +########################################################### +batch_size: 64 +num_workers: 2 + + +########################################################### +# MODEL SETTING # +########################################################### +model: + adim: 384 # attention dimension + aheads: 2 # number of attention heads + elayers: 4 # number of encoder layers + eunits: 1536 # number of encoder ff units + dlayers: 4 # number of decoder layers + dunits: 1536 # number of decoder ff units + positionwise_layer_type: conv1d # type of position-wise layer + positionwise_conv_kernel_size: 3 # kernel size of position wise conv layer + duration_predictor_layers: 2 # number of layers of duration predictor + duration_predictor_chans: 256 # number of channels of duration predictor + duration_predictor_kernel_size: 3 # filter size of duration predictor + postnet_layers: 5 # number of layers of postnset + postnet_filts: 5 # filter size of conv layers in postnet + postnet_chans: 256 # number of channels of conv layers in postnet + use_masking: True # whether to apply masking for padded part in loss calculation + use_scaled_pos_enc: True # whether to use scaled positional encoding + encoder_normalize_before: True # whether to perform layer normalization before the input + decoder_normalize_before: True # whether to perform layer normalization before the input + reduction_factor: 1 # reduction factor + init_type: xavier_uniform # initialization type + init_enc_alpha: 1.0 # initial value of alpha of encoder scaled position encoding + init_dec_alpha: 1.0 # initial value of alpha of decoder scaled position encoding + transformer_enc_dropout_rate: 0.2 # dropout rate for transformer encoder layer + transformer_enc_positional_dropout_rate: 0.2 # dropout rate for transformer encoder positional encoding + transformer_enc_attn_dropout_rate: 0.2 # dropout rate for transformer encoder attention layer + transformer_dec_dropout_rate: 0.2 # dropout rate for transformer decoder layer + transformer_dec_positional_dropout_rate: 0.2 # dropout rate for transformer decoder positional encoding + transformer_dec_attn_dropout_rate: 0.2 # dropout rate for transformer decoder attention layer + pitch_predictor_layers: 5 # number of conv layers in pitch predictor + pitch_predictor_chans: 256 # number of channels of conv layers in pitch predictor + pitch_predictor_kernel_size: 5 # kernel size of conv leyers in pitch predictor + pitch_predictor_dropout: 0.5 # dropout rate in pitch predictor + pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch + pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch + stop_gradient_from_pitch_predictor: true # whether to stop the gradient from pitch predictor to encoder + energy_predictor_layers: 2 # number of conv layers in energy predictor + energy_predictor_chans: 256 # number of channels of conv layers in energy predictor + energy_predictor_kernel_size: 3 # kernel size of conv leyers in energy predictor + energy_predictor_dropout: 0.5 # dropout rate in energy predictor + energy_embed_kernel_size: 1 # kernel size of conv embedding layer for energy + energy_embed_dropout: 0.0 # dropout rate after conv embedding layer for energy + stop_gradient_from_energy_predictor: false # whether to stop the gradient from energy predictor to encoder + spk_embed_dim: 256 # speaker embedding dimension + spk_embed_integration_type: concat # speaker embedding integration type + + + 
+########################################################### +# UPDATER SETTING # +########################################################### +updater: + use_masking: True # whether to apply masking for padded part in loss calculation + + +########################################################### +# OPTIMIZER SETTING # +########################################################### +optimizer: + optim: adam # optimizer type + learning_rate: 0.001 # learning rate + +########################################################### +# TRAINING SETTING # +########################################################### +max_epoch: 200 +num_snapshots: 5 + + +########################################################### +# OTHER SETTING # +########################################################### +seed: 10086 diff --git a/examples/aishell3/vc1/local/preprocess.sh b/examples/aishell3/vc1/local/preprocess.sh new file mode 100755 index 0000000000000000000000000000000000000000..5f939a1a8203c51945f4dfa748ba5d92aba34337 --- /dev/null +++ b/examples/aishell3/vc1/local/preprocess.sh @@ -0,0 +1,86 @@ +#!/bin/bash + +stage=0 +stop_stage=100 + +config_path=$1 +ge2e_ckpt_path=$2 + +# gen speaker embedding +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + python3 ${MAIN_ROOT}/paddlespeech/vector/exps/ge2e/inference.py \ + --input=~/datasets/data_aishell3/train/wav/ \ + --output=dump/embed \ + --checkpoint_path=${ge2e_ckpt_path} +fi + +# copy from tts3/preprocess +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # get durations from MFA's result + echo "Generate durations.txt from MFA results ..." + python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \ + --inputdir=./aishell3_alignment_tone \ + --output durations.txt \ + --config=${config_path} +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + # extract features + echo "Extract features ..." + python3 ${BIN_DIR}/preprocess.py \ + --dataset=aishell3 \ + --rootdir=~/datasets/data_aishell3/ \ + --dumpdir=dump \ + --dur-file=durations.txt \ + --config=${config_path} \ + --num-cpu=20 \ + --cut-sil=True \ + --spk_emb_dir=dump/embed +fi + +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + # get features' stats(mean and std) + echo "Get features' stats ..." + python3 ${MAIN_ROOT}/utils/compute_statistics.py \ + --metadata=dump/train/raw/metadata.jsonl \ + --field-name="speech" + + python3 ${MAIN_ROOT}/utils/compute_statistics.py \ + --metadata=dump/train/raw/metadata.jsonl \ + --field-name="pitch" + + python3 ${MAIN_ROOT}/utils/compute_statistics.py \ + --metadata=dump/train/raw/metadata.jsonl \ + --field-name="energy" +fi + +if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then + # normalize and covert phone/speaker to id, dev and test should use train's stats + echo "Normalize ..." 
+ python3 ${BIN_DIR}/normalize.py \ + --metadata=dump/train/raw/metadata.jsonl \ + --dumpdir=dump/train/norm \ + --speech-stats=dump/train/speech_stats.npy \ + --pitch-stats=dump/train/pitch_stats.npy \ + --energy-stats=dump/train/energy_stats.npy \ + --phones-dict=dump/phone_id_map.txt \ + --speaker-dict=dump/speaker_id_map.txt + + python3 ${BIN_DIR}/normalize.py \ + --metadata=dump/dev/raw/metadata.jsonl \ + --dumpdir=dump/dev/norm \ + --speech-stats=dump/train/speech_stats.npy \ + --pitch-stats=dump/train/pitch_stats.npy \ + --energy-stats=dump/train/energy_stats.npy \ + --phones-dict=dump/phone_id_map.txt \ + --speaker-dict=dump/speaker_id_map.txt + + python3 ${BIN_DIR}/normalize.py \ + --metadata=dump/test/raw/metadata.jsonl \ + --dumpdir=dump/test/norm \ + --speech-stats=dump/train/speech_stats.npy \ + --pitch-stats=dump/train/pitch_stats.npy \ + --energy-stats=dump/train/energy_stats.npy \ + --phones-dict=dump/phone_id_map.txt \ + --speaker-dict=dump/speaker_id_map.txt +fi diff --git a/examples/aishell3/vc1/local/synthesize.sh b/examples/aishell3/vc1/local/synthesize.sh new file mode 100755 index 0000000000000000000000000000000000000000..35478c784b76b76a5aeb8c57a9b3a7de6aa583c5 --- /dev/null +++ b/examples/aishell3/vc1/local/synthesize.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +config_path=$1 +train_output_path=$2 +ckpt_name=$3 + +FLAGS_allocator_strategy=naive_best_fit \ +FLAGS_fraction_of_gpu_memory_to_use=0.01 \ +python3 ${BIN_DIR}/synthesize.py \ + --fastspeech2-config=${config_path} \ + --fastspeech2-checkpoint=${train_output_path}/checkpoints/${ckpt_name} \ + --fastspeech2-stat=dump/train/speech_stats.npy \ + --pwg-config=pwg_aishell3_ckpt_0.5/default.yaml \ + --pwg-checkpoint=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \ + --pwg-stat=pwg_aishell3_ckpt_0.5/feats_stats.npy \ + --test-metadata=dump/test/norm/metadata.jsonl \ + --output-dir=${train_output_path}/test \ + --phones-dict=dump/phone_id_map.txt \ + --voice-cloning=True diff --git a/examples/aishell3/vc1/local/train.sh b/examples/aishell3/vc1/local/train.sh new file mode 100755 index 0000000000000000000000000000000000000000..c775fcadcceef12e05225c46aa53812e22aa2ee4 --- /dev/null +++ b/examples/aishell3/vc1/local/train.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +config_path=$1 +train_output_path=$2 + +python3 ${BIN_DIR}/train.py \ + --train-metadata=dump/train/norm/metadata.jsonl \ + --dev-metadata=dump/dev/norm/metadata.jsonl \ + --config=${config_path} \ + --output-dir=${train_output_path} \ + --ngpu=2 \ + --phones-dict=dump/phone_id_map.txt \ + --voice-cloning=True \ No newline at end of file diff --git a/examples/aishell3/vc1/local/voice_cloning.sh b/examples/aishell3/vc1/local/voice_cloning.sh new file mode 100755 index 0000000000000000000000000000000000000000..55bdd761ef845d7dc8084ce0cb80947f7b050656 --- /dev/null +++ b/examples/aishell3/vc1/local/voice_cloning.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +config_path=$1 +train_output_path=$2 +ckpt_name=$3 +ge2e_params_path=$4 +ref_audio_dir=$5 + +FLAGS_allocator_strategy=naive_best_fit \ +FLAGS_fraction_of_gpu_memory_to_use=0.01 \ +python3 ${BIN_DIR}/voice_cloning.py \ + --fastspeech2-config=${config_path} \ + --fastspeech2-checkpoint=${train_output_path}/checkpoints/${ckpt_name} \ + --fastspeech2-stat=dump/train/speech_stats.npy \ + --pwg-config=pwg_aishell3_ckpt_0.5/default.yaml \ + --pwg-checkpoint=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \ + --pwg-stat=pwg_aishell3_ckpt_0.5/feats_stats.npy \ + --ge2e_params_path=${ge2e_params_path} \ + --text="凯莫瑞安联合体的经济崩溃迫在眉睫。" \ + 
--input-dir=${ref_audio_dir} \ + --output-dir=${train_output_path}/vc_syn \ + --phones-dict=dump/phone_id_map.txt diff --git a/examples/aishell3/vc1/path.sh b/examples/aishell3/vc1/path.sh new file mode 100755 index 0000000000000000000000000000000000000000..fb7e8411c80cc8cbf1c65dffaaf771bda961e10e --- /dev/null +++ b/examples/aishell3/vc1/path.sh @@ -0,0 +1,13 @@ +#!/bin/bash +export MAIN_ROOT=`realpath ${PWD}/../../../` + +export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} +export LC_ALL=C + +export PYTHONDONTWRITEBYTECODE=1 +# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C +export PYTHONIOENCODING=UTF-8 +export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} + +MODEL=fastspeech2 +export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL} diff --git a/examples/aishell3/vc1/run.sh b/examples/aishell3/vc1/run.sh new file mode 100755 index 0000000000000000000000000000000000000000..4eae1bdd87fadfe1663202cbd9700620e64cea37 --- /dev/null +++ b/examples/aishell3/vc1/run.sh @@ -0,0 +1,44 @@ +#!/bin/bash + +set -e +source path.sh + +gpus=0,1 +stage=0 +stop_stage=100 + +conf_path=conf/default.yaml +train_output_path=exp/default +ckpt_name=snapshot_iter_482.pdz +ref_audio_dir=ref_audio + +# not include ".pdparams" here +ge2e_ckpt_path=./ge2e_ckpt_0.3/step-3000000 + +# include ".pdparams" here +ge2e_params_path=${ge2e_ckpt_path}.pdparams + +# with the following command, you can choice the stage range you want to run +# such as `./run.sh --stage 0 --stop-stage 0` +# this can not be mixed use with `$1`, `$2` ... +source ${MAIN_ROOT}/utils/parse_options.sh || exit 1 + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + # prepare data + CUDA_VISIBLE_DEVICES=${gpus} ./local/preprocess.sh ${conf_path} ${ge2e_ckpt_path} || exit -1 +fi + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # train model, all `ckpt` under `train_output_path/checkpoints/` dir + CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1 +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + # synthesize, vocoder is pwgan + CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 +fi + +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + # synthesize, vocoder is pwgan + CUDA_VISIBLE_DEVICES=${gpus} ./local/voice_cloning.sh ${conf_path} ${train_output_path} ${ckpt_name} ${ge2e_params_path} ${ref_audio_dir} || exit -1 +fi diff --git a/examples/aishell3/voc1/README.md b/examples/aishell3/voc1/README.md index d9e8ce591d4132a2f9c39a25986bfce67707d284..d67af726ebd001e29716ed0681327c2430bf43a9 100644 --- a/examples/aishell3/voc1/README.md +++ b/examples/aishell3/voc1/README.md @@ -22,7 +22,7 @@ Assume the path to the dataset is `~/datasets/data_aishell3`. Assume the path to the MFA result of AISHELL-3 is `./aishell3_alignment_tone`. Run the command below to 1. **source path**. -2. preprocess the dataset, +2. preprocess the dataset. 3. train the model. 4. synthesize wavs. - synthesize waveform from `metadata.jsonl`. diff --git a/examples/csmsc/tts2/README.md b/examples/csmsc/tts2/README.md index 2088ed15c2003a1df072836d655234010c99748b..61c4972bcd44a781175264ac779a8b3ad02ec8e0 100644 --- a/examples/csmsc/tts2/README.md +++ b/examples/csmsc/tts2/README.md @@ -14,7 +14,7 @@ Assume the path to the dataset is `~/datasets/BZNSYP`. Assume the path to the MFA result of CSMSC is `./baker_alignment_tone`. Run the command below to 1. **source path**. -2. preprocess the dataset, +2. preprocess the dataset. 3. train the model. 4. 
synthesize wavs. - synthesize waveform from `metadata.jsonl`. @@ -208,7 +208,8 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/inference.sh ${train_output_path} ``` ## Pretrained Model -Pretrained SpeedySpeech model with no silence in the edge of audios. [speedyspeech_nosil_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/speedyspeech_nosil_baker_ckpt_0.5.zip) +Pretrained SpeedySpeech model with no silence in the edge of audios[speedyspeech_nosil_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/speedyspeech_nosil_baker_ckpt_0.5.zip). + Static model can be downloaded here [speedyspeech_nosil_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/speedyspeech_nosil_baker_static_0.5.zip). SpeedySpeech checkpoint contains files listed below. diff --git a/examples/csmsc/tts3/README.md b/examples/csmsc/tts3/README.md index 6e4701dfe3138d9aae3e1de25f3481f780efb482..6570d33dc6862e9c7ebfd3694046578b1d28dabf 100644 --- a/examples/csmsc/tts3/README.md +++ b/examples/csmsc/tts3/README.md @@ -14,7 +14,7 @@ Assume the path to the dataset is `~/datasets/BZNSYP`. Assume the path to the MFA result of CSMSC is `./baker_alignment_tone`. Run the command below to 1. **source path**. -2. preprocess the dataset, +2. preprocess the dataset. 3. train the model. 4. synthesize wavs. - synthesize waveform from `metadata.jsonl`. @@ -199,8 +199,9 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/inference.sh ${train_output_path} ``` ## Pretrained Model -Pretrained FastSpeech2 model with no silence in the edge of audios. [fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_ckpt_0.4.zip) -Static model can be downloaded here [fastspeech2_nosil_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_static_0.4.zip) +Pretrained FastSpeech2 model with no silence in the edge of audios [fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_ckpt_0.4.zip). + +Static model can be downloaded here [fastspeech2_nosil_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_static_0.4.zip). FastSpeech2 checkpoint contains files listed below. ```text diff --git a/examples/csmsc/voc1/README.md b/examples/csmsc/voc1/README.md index f789cba0e9f10c9671066aca29f0e1069fd67390..b9c8a465fe2b79e55ebd032e1c43f1fd0e3397a6 100644 --- a/examples/csmsc/voc1/README.md +++ b/examples/csmsc/voc1/README.md @@ -13,7 +13,7 @@ Assume the path to the dataset is `~/datasets/BZNSYP`. Assume the path to the MFA result of CSMSC is `./baker_alignment_tone`. Run the command below to 1. **source path**. -2. preprocess the dataset, +2. preprocess the dataset. 3. train the model. 4. synthesize wavs. - synthesize waveform from `metadata.jsonl`. @@ -123,7 +123,8 @@ optional arguments: ## Pretrained Models Pretrained model can be downloaded here [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip). -Static models can be downloaded here [pwg_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_static_0.4.zip). + +Static model can be downloaded here [pwg_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_static_0.4.zip). Parallel WaveGAN checkpoint contains files listed below. 
diff --git a/examples/csmsc/voc1/conf/default.yaml b/examples/csmsc/voc1/conf/default.yaml index 5628b7f7c2e82ef19c4b2803f6d50fd177f372b5..1363b454f1a36b259059954ef8d0945b0f0e50ec 100644 --- a/examples/csmsc/voc1/conf/default.yaml +++ b/examples/csmsc/voc1/conf/default.yaml @@ -80,7 +80,7 @@ lambda_adv: 4.0 # Loss balancing coefficient. batch_size: 8 # Batch size. batch_max_steps: 25500 # Length of each audio in batch. Make sure dividable by hop_size. pin_memory: true # Whether to pin memory in Pytorch DataLoader. -num_workers: 4 # Number of workers in Pytorch DataLoader. +num_workers: 2 # Number of workers in Pytorch DataLoader. remove_short_samples: true # Whether to remove samples the length of which are less than batch_max_steps. allow_cache: true # Whether to allow cache in dataset. If true, it requires cpu memory. diff --git a/examples/csmsc/voc3/README.md b/examples/csmsc/voc3/README.md index 9cb9d34d4692ff7c0799de18e42af5277f95a576..a72f60f1633c4469441f2bbebdd2a94e172d098b 100644 --- a/examples/csmsc/voc3/README.md +++ b/examples/csmsc/voc3/README.md @@ -13,7 +13,7 @@ Assume the path to the dataset is `~/datasets/BZNSYP`. Assume the path to the MFA result of CSMSC is `./baker_alignment_tone`. Run the command below to 1. **source path**. -2. preprocess the dataset, +2. preprocess the dataset. 3. train the model. 4. synthesize wavs. - synthesize waveform from `metadata.jsonl`. @@ -106,8 +106,51 @@ optional arguments: 4. `--output-dir` is the directory to save the synthesized audio files. 5. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. +## Finetune +Since there are no `noise` in the input of Multi Band MelGAN, the audio quality is not so good (see [espnet issue](https://github.com/espnet/espnet/issues/3536#issuecomment-916035415)), we refer to the method proposed in [HiFiGAN](https://arxiv.org/abs/2010.05646), finetune Multi Band MelGAN with the predicted mel-spectrogram from `FastSpeech2`. + +The length of mel-spectrograms should align with the length of wavs, so we should generate mels using ground truth alignment. + +But since we are fine-tuning, we should use the statistics computed during training step. + +You should first download pretrained `FastSpeech2` model from [fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_ckpt_0.4.zip) and `unzip` it. + +Assume the path to the dump-dir of training step is `dump`. +Assume the path to the duration result of CSMSC is `durations.txt` (generated during training step's preprocessing). +Assume the path to the pretrained `FastSpeech2` model is `fastspeech2_nosil_baker_ckpt_0.4`. +\ +The `finetune.sh` can +1. **source path**. +2. generate ground truth alignment mels. +3. link `*_wave.npy` from `dump` to `dump_finetune` (because we only use new mels, the wavs are the ones used during train step) . +4. copy features' stats from `dump` to `dump_finetune`. +5. normalize the ground truth alignment mels. +6. finetune the model. 
+ +Before finetune, make sure that the pretrained model is in `finetune.sh` 's `${output-dir}/checkpoints`, and there is a `records.jsonl` in it to refer to this pretrained model +```text +exp/finetune/checkpoints +├── records.jsonl +└── snapshot_iter_1000000.pdz +``` +The content of `records.jsonl` should be as follows (change `"path"` to your own ckpt path): +``` +{"time": "2021-11-21 15:11:20.337311", "path": "~/PaddleSpeech/examples/csmsc/voc3/exp/finetune/checkpoints/snapshot_iter_1000000.pdz", "iteration": 1000000}↩ +``` +Run the command below +```bash +./finetune.sh +``` +By default, `finetune.sh` will use `conf/finetune.yaml` as config, the dump-dir is `dump_finetune`, the experiment dir is `exp/finetune`. + +TODO: +The hyperparameter of `finetune.yaml` is not good enough, a smaller `learning_rate` should be used (more `milestones` should be set). + ## Pretrained Models Pretrained model can be downloaded here [mb_melgan_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/mb_melgan_baker_ckpt_0.5.zip). + +Finetuned model can ben downloaded here [mb_melgan_baker_finetune_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/mb_melgan_baker_finetune_ckpt_0.5.zip). + Static model can be downloaded here [mb_melgan_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/mb_melgan_baker_static_0.5.zip) Multi Band MelGAN checkpoint contains files listed below. diff --git a/examples/csmsc/voc3/conf/finetune.yaml b/examples/csmsc/voc3/conf/finetune.yaml index e02f3e220147e4ca78fffc1e564efa4c968c9089..80ab6bed707b4cdf89a367634490a5bfba122fea 100644 --- a/examples/csmsc/voc3/conf/finetune.yaml +++ b/examples/csmsc/voc3/conf/finetune.yaml @@ -128,7 +128,7 @@ discriminator_scheduler_params: # INTERVAL SETTING # ########################################################### discriminator_train_start_steps: 200000 # Number of steps to start to train discriminator. -train_max_steps: 1200000 # Number of training steps. +train_max_steps: 2000000 # Number of training steps. save_interval_steps: 1000 # Interval steps to save checkpoint. eval_interval_steps: 1000 # Interval steps to evaluate the network. 
@@ -136,4 +136,4 @@ eval_interval_steps: 1000 # Interval steps to evaluate the network # OTHER SETTING # ########################################################### num_snapshots: 10 # max number of snapshots to keep while training -seed: 42 # random seed for paddle, random, and np.random \ No newline at end of file +seed: 42 # random seed for paddle, random, and np.random diff --git a/examples/csmsc/voc3/finetune.sh b/examples/csmsc/voc3/finetune.sh index 42e5a39796acdb46a8104876d8c4086b61866fdb..4ab10e5b3775af433ae9c9f6a2e82966ca3361ba 100755 --- a/examples/csmsc/voc3/finetune.sh +++ b/examples/csmsc/voc3/finetune.sh @@ -9,20 +9,19 @@ stop_stage=100 source ${MAIN_ROOT}/utils/parse_options.sh || exit 1 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - python3 ${MAIN_ROOT}/paddlespeech/t2s/exps/fastspeech2/gen_gta_mel.py \ - --fastspeech2-config=fastspeech2_nosil_baker_ckpt_0.4/default.yaml \ - --fastspeech2-checkpoint=fastspeech2_nosil_baker_ckpt_0.4/snapshot_iter_76000.pdz \ - --fastspeech2-stat=fastspeech2_nosil_baker_ckpt_0.4/speech_stats.npy \ - --dur-file=durations.txt \ - --output-dir=dump_finetune \ - --phones-dict=fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt + python3 ${MAIN_ROOT}/paddlespeech/t2s/exps/fastspeech2/gen_gta_mel.py \ + --fastspeech2-config=fastspeech2_nosil_baker_ckpt_0.4/default.yaml \ + --fastspeech2-checkpoint=fastspeech2_nosil_baker_ckpt_0.4/snapshot_iter_76000.pdz \ + --fastspeech2-stat=fastspeech2_nosil_baker_ckpt_0.4/speech_stats.npy \ + --dur-file=durations.txt \ + --output-dir=dump_finetune \ + --phones-dict=fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt fi if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then - python3 local/link_wav.py \ - --old-dump-dir=dump \ - --dump-dir=dump_finetune - + python3 local/link_wav.py \ + --old-dump-dir=dump \ + --dump-dir=dump_finetune fi if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then @@ -51,13 +50,13 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then fi if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then - CUDA_VISIBLE_DEVICES=${gpus} \ - FLAGS_cudnn_exhaustive_search=true \ - FLAGS_conv_workspace_size_limit=4000 \ - python ${BIN_DIR}/train.py \ - --train-metadata=dump_finetune/train/norm/metadata.jsonl \ - --dev-metadata=dump_finetune/dev/norm/metadata.jsonl \ - --config=conf/finetune.yaml \ - --output-dir=exp/finetune \ - --ngpu=1 + CUDA_VISIBLE_DEVICES=${gpus} \ + FLAGS_cudnn_exhaustive_search=true \ + FLAGS_conv_workspace_size_limit=4000 \ + python ${BIN_DIR}/train.py \ + --train-metadata=dump_finetune/train/norm/metadata.jsonl \ + --dev-metadata=dump_finetune/dev/norm/metadata.jsonl \ + --config=conf/finetune.yaml \ + --output-dir=exp/finetune \ + --ngpu=1 fi \ No newline at end of file diff --git a/examples/librispeech/s1/run.sh b/examples/librispeech/s1/run.sh index 4396597f6c4d6bf1b3d26c0d26615cafc7d776f6..74f7cbc1a4ecea0bfb4b6ae5064bc6773a4a4024 100755 --- a/examples/librispeech/s1/run.sh +++ b/examples/librispeech/s1/run.sh @@ -43,10 +43,10 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi -if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then - # export ckpt avg_n - CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit -fi +# if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then +# # export ckpt avg_n +# CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} 
exp/${ckpt}/checkpoints/${avg_ckpt}.jit +# fi if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then # test a single .wav file diff --git a/examples/librispeech/s2/conf/transformer.yaml b/examples/librispeech/s2/conf/transformer.yaml index d77329f50843e270b52750ef5dcc2e9429bd8617..de1ac347a8f2b57377c3493a804b8d07141f082f 100644 --- a/examples/librispeech/s2/conf/transformer.yaml +++ b/examples/librispeech/s2/conf/transformer.yaml @@ -1,6 +1,8 @@ # https://yaml.org/type/float.html # network architecture model: + cmvn_file: + cmvn_file_type: "json" # encoder related encoder: transformer encoder_conf: diff --git a/examples/librispeech/s2/run.sh b/examples/librispeech/s2/run.sh index 0c5b585b876cd07b30688a4761f0385875ba6806..facaafcb4afd816372914bb54cc318c4b0f97a58 100755 --- a/examples/librispeech/s2/run.sh +++ b/examples/librispeech/s2/run.sh @@ -48,10 +48,10 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} ${dict_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi -if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then - # export ckpt avg_n - ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit -fi +# if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then +# # export ckpt avg_n +# ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit +# fi if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then ./local/cacu_perplexity.sh || exit -1 diff --git a/examples/ljspeech/tts0/README.md b/examples/ljspeech/tts0/README.md index a3e947e81ac4cd01d1a63b5b3284c05d09810ea0..09fd0c133a8d43afe47947e2639bcd2041defce2 100644 --- a/examples/ljspeech/tts0/README.md +++ b/examples/ljspeech/tts0/README.md @@ -12,7 +12,7 @@ tar xjvf LJSpeech-1.1.tar.bz2 Assume the path to the dataset is `~/datasets/LJSpeech-1.1`. Run the command below to 1. **source path**. -2. preprocess the dataset, +2. preprocess the dataset. 3. train the model. 4. synthesize mels. ```bash diff --git a/examples/ljspeech/tts1/README.md b/examples/ljspeech/tts1/README.md index fbb2c9d0423db4e65493cffeb12ff7bad11375ae..12e43e2ea42825872a6623ae76e7bd6963ddb8ae 100644 --- a/examples/ljspeech/tts1/README.md +++ b/examples/ljspeech/tts1/README.md @@ -12,7 +12,7 @@ tar xjvf LJSpeech-1.1.tar.bz2 Assume the path to the dataset is `~/datasets/LJSpeech-1.1`. Run the command below to 1. **source path**. -2. preprocess the dataset, +2. preprocess the dataset. 3. train the model. 4. synthesize wavs. - synthesize waveform from `metadata.jsonl`. diff --git a/examples/ljspeech/tts3/README.md b/examples/ljspeech/tts3/README.md index bc38aac64f24230690b35a9c8016fc344ec9e172..cda5354122bdc333c333616868dd97a3008524a8 100644 --- a/examples/ljspeech/tts3/README.md +++ b/examples/ljspeech/tts3/README.md @@ -14,7 +14,7 @@ Assume the path to the dataset is `~/datasets/LJSpeech-1.1`. Assume the path to the MFA result of LJSpeech-1.1 is `./ljspeech_alignment`. Run the command below to 1. **source path**. -2. preprocess the dataset, +2. preprocess the dataset. 3. train the model. 4. synthesize wavs. - synthesize waveform from `metadata.jsonl`. diff --git a/examples/ljspeech/voc0/README.md b/examples/ljspeech/voc0/README.md index ad2337ef9daa2138a8f72f5afccb18902db3cced..09856c36766dd5dc8cbdf457ab37678282473017 100644 --- a/examples/ljspeech/voc0/README.md +++ b/examples/ljspeech/voc0/README.md @@ -13,7 +13,7 @@ Assume the path to the dataset is `~/datasets/LJSpeech-1.1`. 
Assume the path to the Tacotron2 generated mels is `../tts0/output/test`. Run the command below to 1. **source path**. -2. preprocess the dataset, +2. preprocess the dataset. 3. train the model. 4. synthesize wavs from mels. ```bash diff --git a/examples/ljspeech/voc1/README.md b/examples/ljspeech/voc1/README.md index fdeac6329550be85c4e8e58d5b0ea994636cf715..0506d5d8f75e35b646bd985b2e47e6f65e85ee55 100644 --- a/examples/ljspeech/voc1/README.md +++ b/examples/ljspeech/voc1/README.md @@ -12,7 +12,7 @@ Assume the path to the dataset is `~/datasets/LJSpeech-1.1`. Assume the path to the MFA result of LJSpeech-1.1 is `./ljspeech_alignment`. Run the command below to 1. **source path**. -2. preprocess the dataset, +2. preprocess the dataset. 3. train the model. 4. synthesize wavs. - synthesize waveform from `metadata.jsonl`. diff --git a/examples/other/ge2e/path.sh b/examples/other/ge2e/path.sh index b4f77985908fafacfa9fada5a7e120a52a621832..24305ef78b9faeeaa552014ed2a1b783c1f446e6 100755 --- a/examples/other/ge2e/path.sh +++ b/examples/other/ge2e/path.sh @@ -10,4 +10,4 @@ export PYTHONIOENCODING=UTF-8 export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} MODEL=ge2e -export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL} +export BIN_DIR=${MAIN_ROOT}/paddlespeech/vector/exps/${MODEL} diff --git a/examples/ted_en_zh/t0/run.sh b/examples/ted_en_zh/t0/run.sh index 2e2bc37d69ec2aba592619bdbe453a95fe2313c6..ed9ab5f87506aeec6b0edf9ad3f2b4299bb9647c 100755 --- a/examples/ted_en_zh/t0/run.sh +++ b/examples/ted_en_zh/t0/run.sh @@ -35,7 +35,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi -if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then - # export ckpt avg_n - CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit -fi +# if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then +# # export ckpt avg_n +# CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit +# fi diff --git a/examples/timit/s1/run.sh b/examples/timit/s1/run.sh index 74226c53f6bdfb2ab0c68923db59c47e870dbcfa..a95b5f3ad7ce8333c0e21494dbd7fbae169a4fc2 100755 --- a/examples/timit/s1/run.sh +++ b/examples/timit/s1/run.sh @@ -42,7 +42,7 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi -if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then - # export ckpt avg_n - CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit -fi +# if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then +# # export ckpt avg_n +# CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit +# fi diff --git a/examples/tiny/s1/run.sh b/examples/tiny/s1/run.sh index 23b2206cf31765451146c55b664e947eef6f0c7a..155eca171d1f23f7d3011245f21948ecec19e4ef 100755 --- a/examples/tiny/s1/run.sh +++ b/examples/tiny/s1/run.sh @@ -39,8 +39,8 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then CUDA_VISIBLE_DEVICES=${gpus} ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi -if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then - # export ckpt avg_n - CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} 
exp/${ckpt}/checkpoints/${avg_ckpt}.jit -fi +# if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then +# # export ckpt avg_n +# CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit +# fi diff --git a/examples/vctk/tts3/README.md b/examples/vctk/tts3/README.md index 78bfb9668bb7bf554a8b5f1a1cf9d729799c41a9..334372f9549807e0f7da09af853e15025fa8ead2 100644 --- a/examples/vctk/tts3/README.md +++ b/examples/vctk/tts3/README.md @@ -17,7 +17,7 @@ Assume the path to the dataset is `~/datasets/VCTK-Corpus-0.92`. Assume the path to the MFA result of VCTK is `./vctk_alignment`. Run the command below to 1. **source path**. -2. preprocess the dataset, +2. preprocess the dataset. 3. train the model. 4. synthesize wavs. - synthesize waveform from `metadata.jsonl`. diff --git a/examples/vctk/voc1/README.md b/examples/vctk/voc1/README.md index c9ecae0d2a1f3265e4d6caa91d0bfed412bcfa75..5063b869c872edbd015b85e5da2b66cb827ced4e 100644 --- a/examples/vctk/voc1/README.md +++ b/examples/vctk/voc1/README.md @@ -17,7 +17,7 @@ Assume the path to the dataset is `~/datasets/VCTK-Corpus-0.92`. Assume the path to the MFA result of VCTK is `./vctk_alignment`. Run the command below to 1. **source path**. -2. preprocess the dataset, +2. preprocess the dataset. 3. train the model. 4. synthesize wavs. - synthesize waveform from `metadata.jsonl`. diff --git a/paddlespeech/s2t/decoders/ctcdecoder/swig/setup.py b/paddlespeech/s2t/decoders/ctcdecoder/swig/setup.py index c089f96cd75f41ac336a20a4b67955b4928d13f4..d06125b7b85c8ba74433c96e6da32d39f7828ab3 100644 --- a/paddlespeech/s2t/decoders/ctcdecoder/swig/setup.py +++ b/paddlespeech/s2t/decoders/ctcdecoder/swig/setup.py @@ -126,8 +126,12 @@ decoders_module = [ ] setup( - name='swig_decoders', - version='1.1', - description="""CTC decoders""", + name='paddlespeech_ctcdecoders', + version='0.0.1a', + description="CTC decoders in paddlespeech", + author="PaddlePaddle Speech and Language Team", + author_email="paddlesl@baidu.com", + url="https://github.com/PaddlePaddle/PaddleSpeech", + license='Apache 2.0', ext_modules=decoders_module, - py_modules=['swig_decoders'], ) + py_modules=['swig_decoders']) diff --git a/paddlespeech/s2t/exps/deepspeech2/bin/test_export.py b/paddlespeech/s2t/exps/deepspeech2/bin/test_export.py index b6b34d0835808d9e8e9692ed238980956263bb2c..e073ebbf9823d75d87e950335bffd570ce7e8fb4 100644 --- a/paddlespeech/s2t/exps/deepspeech2/bin/test_export.py +++ b/paddlespeech/s2t/exps/deepspeech2/bin/test_export.py @@ -39,6 +39,8 @@ if __name__ == "__main__": "--export_path", type=str, help="path of the jit model to save") parser.add_argument( "--model_type", type=str, default='offline', help='offline/online') + parser.add_argument( + "--enable-auto-log", action="store_true", help="use auto log") args = parser.parse_args() print_arguments(args, globals()) print("model_type:{}".format(args.model_type)) diff --git a/paddlespeech/s2t/exps/deepspeech2/model.py b/paddlespeech/s2t/exps/deepspeech2/model.py index 683fab146748009b3011c0d30d0c77ec1a36b8c7..177d710b066d4b05bb5e60ee0b040aa51f823613 100644 --- a/paddlespeech/s2t/exps/deepspeech2/model.py +++ b/paddlespeech/s2t/exps/deepspeech2/model.py @@ -42,7 +42,6 @@ from paddlespeech.s2t.training.trainer import Trainer from paddlespeech.s2t.utils import error_rate from paddlespeech.s2t.utils import layer_tools from paddlespeech.s2t.utils import mp_tools -from paddlespeech.s2t.utils.log import Autolog from paddlespeech.s2t.utils.log import Log from 
paddlespeech.s2t.utils.utility import UpdateConfig @@ -339,8 +338,6 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): error_rate_type=cfg.error_rate_type) def compute_result_transcripts(self, audio, audio_len, vocab_list, cfg): - self.autolog.times.start() - self.autolog.times.stamp() result_transcripts = self.model.decode( audio, audio_len, @@ -354,19 +351,12 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): cutoff_top_n=cfg.cutoff_top_n, num_processes=cfg.num_proc_bsearch) - self.autolog.times.stamp() - self.autolog.times.stamp() - self.autolog.times.end() return result_transcripts @mp_tools.rank_zero_only @paddle.no_grad() def test(self): logger.info(f"Test Total Examples: {len(self.test_loader.dataset)}") - self.autolog = Autolog( - batch_size=self.config.decoding.batch_size, - model_name="deepspeech2", - model_precision="fp32").getlog() self.model.eval() cfg = self.config error_rate_type = None @@ -390,7 +380,6 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): msg += "Final error rate [%s] (%d/%d) = %f" % ( error_rate_type, num_ins, num_ins, errors_sum / len_refs) logger.info(msg) - self.autolog.report() @paddle.no_grad() def export(self): @@ -414,6 +403,43 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester): def __init__(self, config, args): super().__init__(config, args) self.apply_static = True + self.args = args + + @mp_tools.rank_zero_only + @paddle.no_grad() + def test(self): + logger.info(f"Test Total Examples: {len(self.test_loader.dataset)}") + if self.args.enable_auto_log == True: + from paddlespeech.s2t.utils.log import Autolog + self.autolog = Autolog( + batch_size=self.config.decoding.batch_size, + model_name="deepspeech2", + model_precision="fp32").getlog() + self.model.eval() + cfg = self.config + error_rate_type = None + errors_sum, len_refs, num_ins = 0.0, 0, 0 + with jsonlines.open(self.args.result_file, 'w') as fout: + for i, batch in enumerate(self.test_loader): + utts, audio, audio_len, texts, texts_len = batch + metrics = self.compute_metrics(utts, audio, audio_len, texts, + texts_len, fout) + errors_sum += metrics['errors_sum'] + len_refs += metrics['len_refs'] + num_ins += metrics['num_ins'] + error_rate_type = metrics['error_rate_type'] + logger.info("Error rate [%s] (%d/?) 
= %f" % + (error_rate_type, num_ins, errors_sum / len_refs)) + + # logging + msg = "Test: " + msg += "epoch: {}, ".format(self.epoch) + msg += "step: {}, ".format(self.iteration) + msg += "Final error rate [%s] (%d/%d) = %f" % ( + error_rate_type, num_ins, num_ins, errors_sum / len_refs) + logger.info(msg) + if self.args.enable_auto_log == True: + self.autolog.report() def compute_result_transcripts(self, audio, audio_len, vocab_list, cfg): if self.args.model_type == "online": @@ -486,8 +512,8 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester): x_len_list = np.split(x_len_batch, batch_size, axis=0) for x, x_len in zip(x_list, x_len_list): - self.autolog.times.start() - self.autolog.times.stamp() + if self.args.enable_auto_log == True: + self.autolog.times.start() x_len = x_len[0] assert (chunk_size <= x_len) @@ -521,6 +547,10 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester): probs_chunk_list = [] probs_chunk_lens_list = [] + if self.args.enable_auto_log == True: + # record the model preprocessing time + self.autolog.times.stamp() + for i in range(0, num_chunk): start = i * chunk_stride end = start + chunk_size @@ -576,9 +606,12 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester): [output_probs, output_probs_padding], axis=1) output_probs_list.append(output_probs) output_lens_list.append(output_lens) - self.autolog.times.stamp() - self.autolog.times.stamp() - self.autolog.times.end() + if self.args.enable_auto_log == True: + # record the model inference time + self.autolog.times.stamp() + # record the post processing time + self.autolog.times.stamp() + self.autolog.times.end() output_probs = np.concatenate(output_probs_list, axis=0) output_lens = np.concatenate(output_lens_list, axis=0) return output_probs, output_lens @@ -608,12 +641,17 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester): audio_len_handle.reshape(x_len.shape) audio_len_handle.copy_from_cpu(x_len) - self.autolog.times.start() - self.autolog.times.stamp() + if self.args.enable_auto_log == True: + self.autolog.times.start() + # record the prefix processing time + self.autolog.times.stamp() self.predictor.run() - self.autolog.times.stamp() - self.autolog.times.stamp() - self.autolog.times.end() + if self.args.enable_auto_log == True: + # record the model inference time + self.autolog.times.stamp() + # record the post processing time + self.autolog.times.stamp() + self.autolog.times.end() output_names = self.predictor.get_output_names() output_handle = self.predictor.get_output_handle(output_names[0]) @@ -624,11 +662,11 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester): def setup_model(self): super().setup_model() - speedyspeech_config = inference.Config( + deepspeech_config = inference.Config( self.args.export_path + ".pdmodel", self.args.export_path + ".pdiparams") if (os.environ['CUDA_VISIBLE_DEVICES'].strip() != ''): - speedyspeech_config.enable_use_gpu(100, 0) - speedyspeech_config.enable_memory_optim() - speedyspeech_predictor = inference.create_predictor(speedyspeech_config) - self.predictor = speedyspeech_predictor + deepspeech_config.enable_use_gpu(100, 0) + deepspeech_config.enable_memory_optim() + deepspeech_predictor = inference.create_predictor(deepspeech_config) + self.predictor = deepspeech_predictor diff --git a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py index fd9982716d26ff3f737173646c9fbaf3f7942255..9977cecc4bb2ecc337f7126cd7c1a38d95424358 100644 --- a/paddlespeech/s2t/models/u2/u2.py +++ b/paddlespeech/s2t/models/u2/u2.py @@ -860,7 +860,7 @@ class U2Model(U2DecodeModel): int, 
nn.Layer, nn.Layer, nn.Layer: vocab size, encoder, decoder, ctc """ # cmvn - if configs['cmvn_file'] is not None: + if 'cmvn_file' in configs and configs['cmvn_file'] is not None: mean, istd = load_cmvn(configs['cmvn_file'], configs['cmvn_file_type']) global_cmvn = GlobalCMVN( diff --git a/paddlespeech/t2s/datasets/am_batch_fn.py b/paddlespeech/t2s/datasets/am_batch_fn.py index 5ed9aa7af1d9ed9337a1a0cb3709b5ae11d2143f..9470f9234c2e5e8623d4c9b5eb86d8c0edb96392 100644 --- a/paddlespeech/t2s/datasets/am_batch_fn.py +++ b/paddlespeech/t2s/datasets/am_batch_fn.py @@ -100,7 +100,7 @@ def fastspeech2_single_spk_batch_fn(examples): def fastspeech2_multi_spk_batch_fn(examples): - # fields = ["text", "text_lengths", "speech", "speech_lengths", "durations", "pitch", "energy", "spk_id"] + # fields = ["text", "text_lengths", "speech", "speech_lengths", "durations", "pitch", "energy", "spk_id"/"spk_emb"] text = [np.array(item["text"], dtype=np.int64) for item in examples] speech = [np.array(item["speech"], dtype=np.float32) for item in examples] pitch = [np.array(item["pitch"], dtype=np.float32) for item in examples] @@ -114,7 +114,6 @@ def fastspeech2_multi_spk_batch_fn(examples): speech_lengths = [ np.array(item["speech_lengths"], dtype=np.int64) for item in examples ] - spk_id = [np.array(item["spk_id"], dtype=np.int64) for item in examples] text = batch_sequences(text) pitch = batch_sequences(pitch) @@ -130,7 +129,6 @@ def fastspeech2_multi_spk_batch_fn(examples): energy = paddle.to_tensor(energy) text_lengths = paddle.to_tensor(text_lengths) speech_lengths = paddle.to_tensor(speech_lengths) - spk_id = paddle.to_tensor(spk_id) batch = { "text": text, @@ -139,9 +137,20 @@ def fastspeech2_multi_spk_batch_fn(examples): "speech": speech, "speech_lengths": speech_lengths, "pitch": pitch, - "energy": energy, - "spk_id": spk_id + "energy": energy } + # spk_emb has a higher priority than spk_id + if "spk_emb" in examples[0]: + spk_emb = [ + np.array(item["spk_emb"], dtype=np.float32) for item in examples + ] + spk_emb = batch_sequences(spk_emb) + spk_emb = paddle.to_tensor(spk_emb) + batch["spk_emb"] = spk_emb + elif "spk_id" in examples[0]: + spk_id = [np.array(item["spk_id"], dtype=np.int64) for item in examples] + spk_id = paddle.to_tensor(spk_id) + batch["spk_id"] = spk_id return batch diff --git a/paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e.py b/paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e.py index ee9fe05792420758b3d0fccbbb0538b4a96c28cc..1839415e978d84b7fbf83f3516f01176e4aabe6c 100644 --- a/paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e.py +++ b/paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e.py @@ -46,14 +46,14 @@ def evaluate(args, fastspeech2_config, pwg_config): print("vocab_size:", vocab_size) with open(args.speaker_dict, 'rt') as f: spk_id = [line.strip().split() for line in f.readlines()] - num_speakers = len(spk_id) - print("num_speakers:", num_speakers) + spk_num = len(spk_id) + print("spk_num:", spk_num) odim = fastspeech2_config.n_mels model = FastSpeech2( idim=vocab_size, odim=odim, - num_speakers=num_speakers, + spk_num=spk_num, **fastspeech2_config["model"]) model.set_state_dict( diff --git a/paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e_en.py b/paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e_en.py index b5d0ce1716b40cc82dca81789b4809fe635f5366..095d20821132cfd0f78485a025d91494f0409b83 100644 --- a/paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e_en.py +++ 
b/paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e_en.py @@ -51,14 +51,14 @@ def evaluate(args, fastspeech2_config, pwg_config): print("vocab_size:", vocab_size) with open(args.speaker_dict, 'rt') as f: spk_id = [line.strip().split() for line in f.readlines()] - num_speakers = len(spk_id) - print("num_speakers:", num_speakers) + spk_num = len(spk_id) + print("spk_num:", spk_num) odim = fastspeech2_config.n_mels model = FastSpeech2( idim=vocab_size, odim=odim, - num_speakers=num_speakers, + spk_num=spk_num, **fastspeech2_config["model"]) model.set_state_dict( diff --git a/paddlespeech/t2s/exps/fastspeech2/normalize.py b/paddlespeech/t2s/exps/fastspeech2/normalize.py index 7283f6b430c92de51db9fe9398cf2ec0f8b3ea5a..8ec20ebf0f8f1865c45cdeed99d487e079e498b0 100644 --- a/paddlespeech/t2s/exps/fastspeech2/normalize.py +++ b/paddlespeech/t2s/exps/fastspeech2/normalize.py @@ -167,6 +167,10 @@ def main(): "pitch": str(pitch_path), "energy": str(energy_path) } + # add spk_emb for voice cloning + if "spk_emb" in item: + record["spk_emb"] = str(item["spk_emb"]) + output_metadata.append(record) output_metadata.sort(key=itemgetter('utt_id')) output_metadata_path = Path(args.dumpdir) / "metadata.jsonl" diff --git a/paddlespeech/t2s/exps/fastspeech2/preprocess.py b/paddlespeech/t2s/exps/fastspeech2/preprocess.py index 3702ecd314abe9318a465df2fb607885f0f04d2a..b874b3a70a320bdee2e8ec142fb06e4504cd1ac5 100644 --- a/paddlespeech/t2s/exps/fastspeech2/preprocess.py +++ b/paddlespeech/t2s/exps/fastspeech2/preprocess.py @@ -44,7 +44,8 @@ def process_sentence(config: Dict[str, Any], mel_extractor=None, pitch_extractor=None, energy_extractor=None, - cut_sil: bool=True): + cut_sil: bool=True, + spk_emb_dir: Path=None): utt_id = fp.stem # for vctk if utt_id.endswith("_mic2"): @@ -116,6 +117,14 @@ def process_sentence(config: Dict[str, Any], "energy": str(energy_path), "speaker": speaker } + if spk_emb_dir: + if speaker in os.listdir(spk_emb_dir): + embed_name = utt_id + ".npy" + embed_path = spk_emb_dir / speaker / embed_name + if embed_path.is_file(): + record["spk_emb"] = str(embed_path) + else: + return None return record @@ -127,13 +136,14 @@ def process_sentences(config, pitch_extractor=None, energy_extractor=None, nprocs: int=1, - cut_sil: bool=True): + cut_sil: bool=True, + spk_emb_dir: Path=None): if nprocs == 1: results = [] for fp in fps: record = process_sentence(config, fp, sentences, output_dir, mel_extractor, pitch_extractor, - energy_extractor, cut_sil) + energy_extractor, cut_sil, spk_emb_dir) if record: results.append(record) else: @@ -144,7 +154,7 @@ def process_sentences(config, future = pool.submit(process_sentence, config, fp, sentences, output_dir, mel_extractor, pitch_extractor, energy_extractor, - cut_sil) + cut_sil, spk_emb_dir) future.add_done_callback(lambda p: progress.update()) futures.append(future) @@ -202,6 +212,11 @@ def main(): default=True, help="whether cut sil in the edge of audio") + parser.add_argument( + "--spk_emb_dir", + default=None, + type=str, + help="directory to speaker embedding files.") args = parser.parse_args() rootdir = Path(args.rootdir).expanduser() @@ -211,6 +226,11 @@ def main(): dumpdir.mkdir(parents=True, exist_ok=True) dur_file = Path(args.dur_file).expanduser() + if args.spk_emb_dir: + spk_emb_dir = Path(args.spk_emb_dir).expanduser().resolve() + else: + spk_emb_dir = None + assert rootdir.is_dir() assert dur_file.is_file() @@ -251,6 +271,7 @@ def main(): test_wav_files += wav_files[-sub_num_dev:] else: train_wav_files += wav_files + elif 
args.dataset == "ljspeech": wav_files = sorted(list((rootdir / "wavs").rglob("*.wav"))) # split data into 3 sections @@ -317,7 +338,8 @@ def main(): pitch_extractor, energy_extractor, nprocs=args.num_cpu, - cut_sil=args.cut_sil) + cut_sil=args.cut_sil, + spk_emb_dir=spk_emb_dir) if dev_wav_files: process_sentences( config, @@ -327,7 +349,8 @@ def main(): mel_extractor, pitch_extractor, energy_extractor, - cut_sil=args.cut_sil) + cut_sil=args.cut_sil, + spk_emb_dir=spk_emb_dir) if test_wav_files: process_sentences( config, @@ -338,7 +361,8 @@ def main(): pitch_extractor, energy_extractor, nprocs=args.num_cpu, - cut_sil=args.cut_sil) + cut_sil=args.cut_sil, + spk_emb_dir=spk_emb_dir) if __name__ == "__main__": diff --git a/paddlespeech/t2s/exps/fastspeech2/synthesize.py b/paddlespeech/t2s/exps/fastspeech2/synthesize.py index 207275f90052c35bc4a53667d046d002b5441d3c..249845e4dba5b96747a881eeeb2a2299e020969f 100644 --- a/paddlespeech/t2s/exps/fastspeech2/synthesize.py +++ b/paddlespeech/t2s/exps/fastspeech2/synthesize.py @@ -40,16 +40,19 @@ def evaluate(args, fastspeech2_config, pwg_config): fields = ["utt_id", "text"] + spk_num = None if args.speaker_dict is not None: print("multiple speaker fastspeech2!") with open(args.speaker_dict, 'rt') as f: spk_id = [line.strip().split() for line in f.readlines()] - num_speakers = len(spk_id) + spk_num = len(spk_id) fields += ["spk_id"] + elif args.voice_cloning: + print("voice cloning!") + fields += ["spk_emb"] else: print("single speaker fastspeech2!") - num_speakers = None - print("num_speakers:", num_speakers) + print("spk_num:", spk_num) test_dataset = DataTable(data=test_metadata, fields=fields) @@ -62,7 +65,7 @@ def evaluate(args, fastspeech2_config, pwg_config): model = FastSpeech2( idim=vocab_size, odim=odim, - num_speakers=num_speakers, + spk_num=spk_num, **fastspeech2_config["model"]) model.set_state_dict( @@ -96,12 +99,15 @@ def evaluate(args, fastspeech2_config, pwg_config): for datum in test_dataset: utt_id = datum["utt_id"] text = paddle.to_tensor(datum["text"]) - if "spk_id" in datum: + spk_emb = None + spk_id = None + if args.voice_cloning and "spk_emb" in datum: + spk_emb = paddle.to_tensor(np.load(datum["spk_emb"])) + elif "spk_id" in datum: spk_id = paddle.to_tensor(datum["spk_id"]) - else: - spk_id = None with paddle.no_grad(): - wav = pwg_inference(fastspeech2_inference(text, spk_id=spk_id)) + wav = pwg_inference( + fastspeech2_inference(text, spk_id=spk_id, spk_emb=spk_emb)) sf.write( str(output_dir / (utt_id + ".wav")), wav.numpy(), @@ -142,6 +148,15 @@ def main(): type=str, default=None, help="speaker id map file for multiple speaker model.") + + def str2bool(str): + return True if str.lower() == 'true' else False + + parser.add_argument( + "--voice-cloning", + type=str2bool, + default=False, + help="whether training voice cloning model.") parser.add_argument("--test-metadata", type=str, help="test metadata.") parser.add_argument("--output-dir", type=str, help="output dir.") parser.add_argument( diff --git a/paddlespeech/t2s/exps/fastspeech2/train.py b/paddlespeech/t2s/exps/fastspeech2/train.py index 38ac2fe3f395fc818644d7422db62127169cc081..fafded6fca0cef090d9ba1c844cdc526fe5d49d6 100644 --- a/paddlespeech/t2s/exps/fastspeech2/train.py +++ b/paddlespeech/t2s/exps/fastspeech2/train.py @@ -61,18 +61,24 @@ def train_sp(args, config): "text", "text_lengths", "speech", "speech_lengths", "durations", "pitch", "energy" ] + converters = {"speech": np.load, "pitch": np.load, "energy": np.load} + spk_num = None if args.speaker_dict is 
not None: print("multiple speaker fastspeech2!") collate_fn = fastspeech2_multi_spk_batch_fn with open(args.speaker_dict, 'rt') as f: spk_id = [line.strip().split() for line in f.readlines()] - num_speakers = len(spk_id) + spk_num = len(spk_id) fields += ["spk_id"] + elif args.voice_cloning: + print("Training voice cloning!") + collate_fn = fastspeech2_multi_spk_batch_fn + fields += ["spk_emb"] + converters["spk_emb"] = np.load else: print("single speaker fastspeech2!") collate_fn = fastspeech2_single_spk_batch_fn - num_speakers = None - print("num_speakers:", num_speakers) + print("spk_num:", spk_num) # dataloader has been too verbose logging.getLogger("DataLoader").disabled = True @@ -83,17 +89,13 @@ def train_sp(args, config): train_dataset = DataTable( data=train_metadata, fields=fields, - converters={"speech": np.load, - "pitch": np.load, - "energy": np.load}, ) + converters=converters, ) with jsonlines.open(args.dev_metadata, 'r') as reader: dev_metadata = list(reader) dev_dataset = DataTable( data=dev_metadata, fields=fields, - converters={"speech": np.load, - "pitch": np.load, - "energy": np.load}, ) + converters=converters, ) # collate function and dataloader @@ -127,10 +129,7 @@ def train_sp(args, config): odim = config.n_mels model = FastSpeech2( - idim=vocab_size, - odim=odim, - num_speakers=num_speakers, - **config["model"]) + idim=vocab_size, odim=odim, spk_num=spk_num, **config["model"]) if world_size > 1: model = DataParallel(model) print("model done!") @@ -184,6 +183,15 @@ def main(): default=None, help="speaker id map file for multiple speaker model.") + def str2bool(str): + return True if str.lower() == 'true' else False + + parser.add_argument( + "--voice-cloning", + type=str2bool, + default=False, + help="whether training voice cloning model.") + args = parser.parse_args() with open(args.config) as f: diff --git a/paddlespeech/t2s/exps/fastspeech2/voice_cloning.py b/paddlespeech/t2s/exps/fastspeech2/voice_cloning.py new file mode 100644 index 0000000000000000000000000000000000000000..9fbd496418199d05b6319ab335f1a5437bb961d2 --- /dev/null +++ b/paddlespeech/t2s/exps/fastspeech2/voice_cloning.py @@ -0,0 +1,208 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import argparse +import os +from pathlib import Path + +import numpy as np +import paddle +import soundfile as sf +import yaml +from yacs.config import CfgNode + +from paddlespeech.t2s.frontend.zh_frontend import Frontend +from paddlespeech.t2s.models.fastspeech2 import FastSpeech2 +from paddlespeech.t2s.models.fastspeech2 import FastSpeech2Inference +from paddlespeech.t2s.models.parallel_wavegan import PWGGenerator +from paddlespeech.t2s.models.parallel_wavegan import PWGInference +from paddlespeech.t2s.modules.normalizer import ZScore +from paddlespeech.vector.exps.ge2e.audio_processor import SpeakerVerificationPreprocessor +from paddlespeech.vector.models.lstm_speaker_encoder import LSTMSpeakerEncoder + + +def voice_cloning(args, fastspeech2_config, pwg_config): + # speaker encoder + p = SpeakerVerificationPreprocessor( + sampling_rate=16000, + audio_norm_target_dBFS=-30, + vad_window_length=30, + vad_moving_average_width=8, + vad_max_silence_length=6, + mel_window_length=25, + mel_window_step=10, + n_mels=40, + partial_n_frames=160, + min_pad_coverage=0.75, + partial_overlap_ratio=0.5) + print("Audio Processor Done!") + + speaker_encoder = LSTMSpeakerEncoder( + n_mels=40, num_layers=3, hidden_size=256, output_size=256) + speaker_encoder.set_state_dict(paddle.load(args.ge2e_params_path)) + speaker_encoder.eval() + print("GE2E Done!") + + with open(args.phones_dict, "r") as f: + phn_id = [line.strip().split() for line in f.readlines()] + vocab_size = len(phn_id) + print("vocab_size:", vocab_size) + odim = fastspeech2_config.n_mels + model = FastSpeech2( + idim=vocab_size, odim=odim, **fastspeech2_config["model"]) + + model.set_state_dict( + paddle.load(args.fastspeech2_checkpoint)["main_params"]) + model.eval() + + vocoder = PWGGenerator(**pwg_config["generator_params"]) + vocoder.set_state_dict(paddle.load(args.pwg_checkpoint)["generator_params"]) + vocoder.remove_weight_norm() + vocoder.eval() + print("model done!") + + frontend = Frontend(phone_vocab_path=args.phones_dict) + print("frontend done!") + + stat = np.load(args.fastspeech2_stat) + mu, std = stat + mu = paddle.to_tensor(mu) + std = paddle.to_tensor(std) + fastspeech2_normalizer = ZScore(mu, std) + + stat = np.load(args.pwg_stat) + mu, std = stat + mu = paddle.to_tensor(mu) + std = paddle.to_tensor(std) + pwg_normalizer = ZScore(mu, std) + + fastspeech2_inference = FastSpeech2Inference(fastspeech2_normalizer, model) + fastspeech2_inference.eval() + pwg_inference = PWGInference(pwg_normalizer, vocoder) + pwg_inference.eval() + + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + input_dir = Path(args.input_dir) + + sentence = args.text + + input_ids = frontend.get_input_ids(sentence, merge_sentences=True) + phone_ids = input_ids["phone_ids"][0] + + for name in os.listdir(input_dir): + utt_id = name.split(".")[0] + ref_audio_path = input_dir / name + mel_sequences = p.extract_mel_partials(p.preprocess_wav(ref_audio_path)) + # print("mel_sequences: ", mel_sequences.shape) + with paddle.no_grad(): + spk_emb = speaker_encoder.embed_utterance( + paddle.to_tensor(mel_sequences)) + # print("spk_emb shape: ", spk_emb.shape) + + with paddle.no_grad(): + wav = pwg_inference( + fastspeech2_inference(phone_ids, spk_emb=spk_emb)) + + sf.write( + str(output_dir / (utt_id + ".wav")), + wav.numpy(), + samplerate=fastspeech2_config.fs) + print(f"{utt_id} done!") + # Randomly generate numbers of 0 ~ 0.2, 256 is the dim of spk_emb + random_spk_emb = np.random.rand(256) * 0.2 + random_spk_emb = 
paddle.to_tensor(random_spk_emb) + utt_id = "random_spk_emb" + with paddle.no_grad(): + wav = pwg_inference(fastspeech2_inference(phone_ids, spk_emb=spk_emb)) + sf.write( + str(output_dir / (utt_id + ".wav")), + wav.numpy(), + samplerate=fastspeech2_config.fs) + print(f"{utt_id} done!") + + +def main(): + # parse args and config and redirect to train_sp + parser = argparse.ArgumentParser(description="") + parser.add_argument( + "--fastspeech2-config", type=str, help="fastspeech2 config file.") + parser.add_argument( + "--fastspeech2-checkpoint", + type=str, + help="fastspeech2 checkpoint to load.") + parser.add_argument( + "--fastspeech2-stat", + type=str, + help="mean and standard deviation used to normalize spectrogram when training fastspeech2." + ) + parser.add_argument( + "--pwg-config", type=str, help="parallel wavegan config file.") + parser.add_argument( + "--pwg-checkpoint", + type=str, + help="parallel wavegan generator parameters to load.") + parser.add_argument( + "--pwg-stat", + type=str, + help="mean and standard deviation used to normalize spectrogram when training parallel wavegan." + ) + parser.add_argument( + "--phones-dict", + type=str, + default="phone_id_map.txt", + help="phone vocabulary file.") + parser.add_argument( + "--text", + type=str, + default="每当你觉得,想要批评什么人的时候,你切要记着,这个世界上的人,并非都具备你禀有的条件。", + help="text to synthesize, a line") + + parser.add_argument( + "--ge2e_params_path", type=str, help="ge2e params path.") + + parser.add_argument( + "--ngpu", type=int, default=1, help="if ngpu=0, use cpu.") + + parser.add_argument( + "--input-dir", + type=str, + help="input dir of *.wav, the sample rate will be resample to 16k.") + parser.add_argument("--output-dir", type=str, help="output dir.") + + args = parser.parse_args() + + if args.ngpu == 0: + paddle.set_device("cpu") + elif args.ngpu > 0: + paddle.set_device("gpu") + else: + print("ngpu should >= 0 !") + + with open(args.fastspeech2_config) as f: + fastspeech2_config = CfgNode(yaml.safe_load(f)) + with open(args.pwg_config) as f: + pwg_config = CfgNode(yaml.safe_load(f)) + + print("========Args========") + print(yaml.safe_dump(vars(args))) + print("========Config========") + print(fastspeech2_config) + print(pwg_config) + + voice_cloning(args, fastspeech2_config, pwg_config) + + +if __name__ == "__main__": + main() diff --git a/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/voice_cloning.py b/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/voice_cloning.py index 2f005e723d547b6eadfebbba68aeaf16ffe60586..4e6b8d36276c65e134221a0c2b5cf4b3ff61a469 100644 --- a/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/voice_cloning.py +++ b/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/voice_cloning.py @@ -20,14 +20,14 @@ import paddle import soundfile as sf from matplotlib import pyplot as plt -from paddlespeech.t2s.exps.ge2e.audio_processor import SpeakerVerificationPreprocessor from paddlespeech.t2s.exps.voice_cloning.tacotron2_ge2e.aishell3 import voc_phones from paddlespeech.t2s.exps.voice_cloning.tacotron2_ge2e.aishell3 import voc_tones from paddlespeech.t2s.exps.voice_cloning.tacotron2_ge2e.chinese_g2p import convert_sentence -from paddlespeech.t2s.models.lstm_speaker_encoder import LSTMSpeakerEncoder from paddlespeech.t2s.models.tacotron2 import Tacotron2 from paddlespeech.t2s.models.waveflow import ConditionalWaveFlow from paddlespeech.t2s.utils import display +from paddlespeech.vector.exps.ge2e.audio_processor import SpeakerVerificationPreprocessor +from paddlespeech.vector.models.lstm_speaker_encoder 
import LSTMSpeakerEncoder def voice_cloning(args): diff --git a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py index e650e2550b3c84bda0e210178bbb7ff8c6437885..cf957978eb20a6d3eab9a2f3b8cdb36c91213a53 100644 --- a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py +++ b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py @@ -98,7 +98,7 @@ class FastSpeech2(nn.Layer): pitch_embed_dropout: float=0.5, stop_gradient_from_pitch_predictor: bool=False, # spk emb - num_speakers: int=None, + spk_num: int=None, spk_embed_dim: int=None, spk_embed_integration_type: str="add", # tone emb @@ -148,9 +148,9 @@ class FastSpeech2(nn.Layer): # initialize parameters initialize(self, init_type) - if self.spk_embed_dim is not None: + if spk_num and self.spk_embed_dim: self.spk_embedding_table = nn.Embedding( - num_embeddings=num_speakers, + num_embeddings=spk_num, embedding_dim=self.spk_embed_dim, padding_idx=self.padding_idx) @@ -299,7 +299,7 @@ class FastSpeech2(nn.Layer): pitch: paddle.Tensor, energy: paddle.Tensor, tone_id: paddle.Tensor=None, - spembs: paddle.Tensor=None, + spk_emb: paddle.Tensor=None, spk_id: paddle.Tensor=None ) -> Tuple[paddle.Tensor, Dict[str, paddle.Tensor], paddle.Tensor]: """Calculate forward propagation. @@ -322,7 +322,7 @@ class FastSpeech2(nn.Layer): Batch of padded token-averaged energy (B, Tmax, 1). tone_id : Tensor, optional(int64) Batch of padded tone ids (B, Tmax). - spembs : Tensor, optional + spk_emb : Tensor, optional Batch of speaker embeddings (B, spk_embed_dim). spk_id : Tnesor, optional(int64) Batch of speaker ids (B,) @@ -366,7 +366,7 @@ class FastSpeech2(nn.Layer): ps, es, is_inference=False, - spembs=spembs, + spk_emb=spk_emb, spk_id=spk_id, tone_id=tone_id) # modify mod part of groundtruth @@ -387,7 +387,7 @@ class FastSpeech2(nn.Layer): es: paddle.Tensor=None, is_inference: bool=False, alpha: float=1.0, - spembs=None, + spk_emb=None, spk_id=None, tone_id=None) -> Sequence[paddle.Tensor]: # forward encoder @@ -397,11 +397,12 @@ class FastSpeech2(nn.Layer): # integrate speaker embedding if self.spk_embed_dim is not None: - if spembs is not None: - hs = self._integrate_with_spk_embed(hs, spembs) + # spk_emb has a higher priority than spk_id + if spk_emb is not None: + hs = self._integrate_with_spk_embed(hs, spk_emb) elif spk_id is not None: - spembs = self.spk_embedding_table(spk_id) - hs = self._integrate_with_spk_embed(hs, spembs) + spk_emb = self.spk_embedding_table(spk_id) + hs = self._integrate_with_spk_embed(hs, spk_emb) # integrate tone embedding if self.tone_embed_dim is not None: @@ -489,7 +490,7 @@ class FastSpeech2(nn.Layer): energy: paddle.Tensor=None, alpha: float=1.0, use_teacher_forcing: bool=False, - spembs=None, + spk_emb=None, spk_id=None, tone_id=None, ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]: @@ -512,7 +513,7 @@ class FastSpeech2(nn.Layer): use_teacher_forcing : bool, optional Whether to use teacher forcing. If true, groundtruth of duration, pitch and energy will be used. - spembs : Tensor, optional + spk_emb : Tensor, optional peaker embedding vector (spk_embed_dim,). spk_id : Tensor, optional(int64) Batch of padded spk ids (1,). 
@@ -527,7 +528,6 @@ class FastSpeech2(nn.Layer): # input of embedding must be int64 x = paddle.cast(text, 'int64') y = speech - spemb = spembs d, p, e = durations, pitch, energy # setup batch axis ilens = paddle.shape(x)[0] @@ -537,8 +537,8 @@ class FastSpeech2(nn.Layer): if y is not None: ys = y.unsqueeze(0) - if spemb is not None: - spembs = spemb.unsqueeze(0) + if spk_emb is not None: + spk_emb = spk_emb.unsqueeze(0) if tone_id is not None: tone_id = tone_id.unsqueeze(0) @@ -548,7 +548,7 @@ class FastSpeech2(nn.Layer): ds = d.unsqueeze(0) if d is not None else None ps = p.unsqueeze(0) if p is not None else None es = e.unsqueeze(0) if e is not None else None - # ds, ps, es = , p.unsqueeze(0), e.unsqueeze(0) + # (1, L, odim) _, outs, d_outs, p_outs, e_outs = self._forward( xs, @@ -557,7 +557,7 @@ class FastSpeech2(nn.Layer): ds=ds, ps=ps, es=es, - spembs=spembs, + spk_emb=spk_emb, spk_id=spk_id, tone_id=tone_id, is_inference=True) @@ -569,19 +569,19 @@ class FastSpeech2(nn.Layer): ys, is_inference=True, alpha=alpha, - spembs=spembs, + spk_emb=spk_emb, spk_id=spk_id, tone_id=tone_id) return outs[0], d_outs[0], p_outs[0], e_outs[0] - def _integrate_with_spk_embed(self, hs, spembs): + def _integrate_with_spk_embed(self, hs, spk_emb): """Integrate speaker embedding with hidden states. Parameters ---------- hs : Tensor Batch of hidden state sequences (B, Tmax, adim). - spembs : Tensor + spk_emb : Tensor Batch of speaker embeddings (B, spk_embed_dim). Returns @@ -591,13 +591,13 @@ class FastSpeech2(nn.Layer): """ if self.spk_embed_integration_type == "add": # apply projection and then add to hidden states - spembs = self.spk_projection(F.normalize(spembs)) - hs = hs + spembs.unsqueeze(1) + spk_emb = self.spk_projection(F.normalize(spk_emb)) + hs = hs + spk_emb.unsqueeze(1) elif self.spk_embed_integration_type == "concat": # concat hidden states with spk embeds and then apply projection - spembs = F.normalize(spembs).unsqueeze(1).expand( + spk_emb = F.normalize(spk_emb).unsqueeze(1).expand( shape=[-1, hs.shape[1], -1]) - hs = self.spk_projection(paddle.concat([hs, spembs], axis=-1)) + hs = self.spk_projection(paddle.concat([hs, spk_emb], axis=-1)) else: raise NotImplementedError("support only add or concat.") @@ -682,9 +682,9 @@ class FastSpeech2Inference(nn.Layer): self.normalizer = normalizer self.acoustic_model = model - def forward(self, text, spk_id=None): + def forward(self, text, spk_id=None, spk_emb=None): normalized_mel, d_outs, p_outs, e_outs = self.acoustic_model.inference( - text, spk_id=spk_id) + text, spk_id=spk_id, spk_emb=spk_emb) logmel = self.normalizer.inverse(normalized_mel) return logmel diff --git a/paddlespeech/t2s/models/fastspeech2/fastspeech2_updater.py b/paddlespeech/t2s/models/fastspeech2/fastspeech2_updater.py index 4297c8b6166ebddf3f754019826e8c2d74e6cb26..0dabf934ceb56b71f950b5179ba4c5e065faf499 100644 --- a/paddlespeech/t2s/models/fastspeech2/fastspeech2_updater.py +++ b/paddlespeech/t2s/models/fastspeech2/fastspeech2_updater.py @@ -54,6 +54,10 @@ class FastSpeech2Updater(StandardUpdater): losses_dict = {} # spk_id!=None in multiple spk fastspeech2 spk_id = batch["spk_id"] if "spk_id" in batch else None + spk_emb = batch["spk_emb"] if "spk_emb" in batch else None + # No explicit speaker identifier labels are used during voice cloning training. 
+ if spk_emb is not None: + spk_id = None before_outs, after_outs, d_outs, p_outs, e_outs, ys, olens = self.model( text=batch["text"], @@ -63,7 +67,8 @@ class FastSpeech2Updater(StandardUpdater): durations=batch["durations"], pitch=batch["pitch"], energy=batch["energy"], - spk_id=spk_id) + spk_id=spk_id, + spk_emb=spk_emb) l1_loss, duration_loss, pitch_loss, energy_loss = self.criterion( after_outs=after_outs, @@ -126,6 +131,9 @@ class FastSpeech2Evaluator(StandardEvaluator): losses_dict = {} # spk_id!=None in multiple spk fastspeech2 spk_id = batch["spk_id"] if "spk_id" in batch else None + spk_emb = batch["spk_emb"] if "spk_emb" in batch else None + if spk_emb is not None: + spk_id = None before_outs, after_outs, d_outs, p_outs, e_outs, ys, olens = self.model( text=batch["text"], @@ -135,7 +143,8 @@ class FastSpeech2Evaluator(StandardEvaluator): durations=batch["durations"], pitch=batch["pitch"], energy=batch["energy"], - spk_id=spk_id) + spk_id=spk_id, + spk_emb=spk_emb) l1_loss, duration_loss, pitch_loss, energy_loss = self.criterion( after_outs=after_outs, diff --git a/paddlespeech/t2s/models/transformer_tts/transformer_tts.py b/paddlespeech/t2s/models/transformer_tts/transformer_tts.py index 97233c766eb0d71ad51a17d7d33c87a8bc2f4da3..5958a16609a1541ec10e27582a3acbf0e319eb8c 100644 --- a/paddlespeech/t2s/models/transformer_tts/transformer_tts.py +++ b/paddlespeech/t2s/models/transformer_tts/transformer_tts.py @@ -391,7 +391,7 @@ class TransformerTTS(nn.Layer): text_lengths: paddle.Tensor, speech: paddle.Tensor, speech_lengths: paddle.Tensor, - spembs: paddle.Tensor=None, + spk_emb: paddle.Tensor=None, ) -> Tuple[paddle.Tensor, Dict[str, paddle.Tensor], paddle.Tensor]: """Calculate forward propagation. @@ -405,7 +405,7 @@ class TransformerTTS(nn.Layer): Batch of padded target features (B, Lmax, odim). speech_lengths : Tensor(int64) Batch of the lengths of each target (B,). - spembs : Tensor, optional + spk_emb : Tensor, optional Batch of speaker embeddings (B, spk_embed_dim). Returns @@ -439,7 +439,7 @@ class TransformerTTS(nn.Layer): # calculate transformer outputs after_outs, before_outs, logits = self._forward(xs, ilens, ys, olens, - spembs) + spk_emb) # modifiy mod part of groundtruth @@ -467,7 +467,7 @@ class TransformerTTS(nn.Layer): ilens: paddle.Tensor, ys: paddle.Tensor, olens: paddle.Tensor, - spembs: paddle.Tensor, + spk_emb: paddle.Tensor, ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]: # forward encoder x_masks = self._source_mask(ilens) @@ -480,7 +480,7 @@ class TransformerTTS(nn.Layer): # integrate speaker embedding if self.spk_embed_dim is not None: - hs = self._integrate_with_spk_embed(hs, spembs) + hs = self._integrate_with_spk_embed(hs, spk_emb) # thin out frames for reduction factor (B, Lmax, odim) -> (B, Lmax//r, odim) if self.reduction_factor > 1: @@ -514,7 +514,7 @@ class TransformerTTS(nn.Layer): self, text: paddle.Tensor, speech: paddle.Tensor=None, - spembs: paddle.Tensor=None, + spk_emb: paddle.Tensor=None, threshold: float=0.5, minlenratio: float=0.0, maxlenratio: float=10.0, @@ -528,7 +528,7 @@ class TransformerTTS(nn.Layer): Input sequence of characters (T,). speech : Tensor, optional Feature sequence to extract style (N, idim). - spembs : Tensor, optional + spk_emb : Tensor, optional Speaker embedding vector (spk_embed_dim,). threshold : float, optional Threshold in inference. 
@@ -551,7 +551,6 @@ class TransformerTTS(nn.Layer): """ # input of embedding must be int64 y = speech - spemb = spembs # add eos at the last of sequence text = numpy.pad( @@ -564,12 +563,12 @@ class TransformerTTS(nn.Layer): # get teacher forcing outputs xs, ys = x.unsqueeze(0), y.unsqueeze(0) - spembs = None if spemb is None else spemb.unsqueeze(0) + spk_emb = None if spk_emb is None else spk_emb.unsqueeze(0) ilens = paddle.to_tensor( [xs.shape[1]], dtype=paddle.int64, place=xs.place) olens = paddle.to_tensor( [ys.shape[1]], dtype=paddle.int64, place=ys.place) - outs, *_ = self._forward(xs, ilens, ys, olens, spembs) + outs, *_ = self._forward(xs, ilens, ys, olens, spk_emb) # get attention weights att_ws = [] @@ -590,9 +589,9 @@ class TransformerTTS(nn.Layer): hs = hs + style_embs.unsqueeze(1) # integrate speaker embedding - if self.spk_embed_dim is not None: - spembs = spemb.unsqueeze(0) - hs = self._integrate_with_spk_embed(hs, spembs) + if spk_emb is not None: + spk_emb = spk_emb.unsqueeze(0) + hs = self._integrate_with_spk_embed(hs, spk_emb) # set limits of length maxlen = int(hs.shape[1] * maxlenratio / self.reduction_factor) @@ -726,14 +725,14 @@ class TransformerTTS(nn.Layer): def _integrate_with_spk_embed(self, hs: paddle.Tensor, - spembs: paddle.Tensor) -> paddle.Tensor: + spk_emb: paddle.Tensor) -> paddle.Tensor: """Integrate speaker embedding with hidden states. Parameters ---------- hs : Tensor Batch of hidden state sequences (B, Tmax, adim). - spembs : Tensor + spk_emb : Tensor Batch of speaker embeddings (B, spk_embed_dim). Returns @@ -744,13 +743,13 @@ class TransformerTTS(nn.Layer): """ if self.spk_embed_integration_type == "add": # apply projection and then add to hidden states - spembs = self.projection(F.normalize(spembs)) - hs = hs + spembs.unsqueeze(1) + spk_emb = self.projection(F.normalize(spk_emb)) + hs = hs + spk_emb.unsqueeze(1) elif self.spk_embed_integration_type == "concat": # concat hidden states with spk embeds and then apply projection - spembs = F.normalize(spembs).unsqueeze(1).expand(-1, hs.shape[1], - -1) - hs = self.projection(paddle.concat([hs, spembs], axis=-1)) + spk_emb = F.normalize(spk_emb).unsqueeze(1).expand(-1, hs.shape[1], + -1) + hs = self.projection(paddle.concat([hs, spk_emb], axis=-1)) else: raise NotImplementedError("support only add or concat.") diff --git a/paddlespeech/xvector/__init__.py b/paddlespeech/vector/__init__.py similarity index 100% rename from paddlespeech/xvector/__init__.py rename to paddlespeech/vector/__init__.py diff --git a/paddlespeech/t2s/exps/ge2e/__init__.py b/paddlespeech/vector/exps/__init__.py similarity index 100% rename from paddlespeech/t2s/exps/ge2e/__init__.py rename to paddlespeech/vector/exps/__init__.py diff --git a/paddlespeech/vector/exps/ge2e/__init__.py b/paddlespeech/vector/exps/ge2e/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..abf198b97e6e818e1fbe59006f98492640bcee54 --- /dev/null +++ b/paddlespeech/vector/exps/ge2e/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/paddlespeech/t2s/exps/ge2e/audio_processor.py b/paddlespeech/vector/exps/ge2e/audio_processor.py similarity index 100% rename from paddlespeech/t2s/exps/ge2e/audio_processor.py rename to paddlespeech/vector/exps/ge2e/audio_processor.py diff --git a/paddlespeech/t2s/exps/ge2e/config.py b/paddlespeech/vector/exps/ge2e/config.py similarity index 100% rename from paddlespeech/t2s/exps/ge2e/config.py rename to paddlespeech/vector/exps/ge2e/config.py diff --git a/paddlespeech/t2s/exps/ge2e/dataset_processors.py b/paddlespeech/vector/exps/ge2e/dataset_processors.py similarity index 98% rename from paddlespeech/t2s/exps/ge2e/dataset_processors.py rename to paddlespeech/vector/exps/ge2e/dataset_processors.py index a9320d9859067154333c3464a495fc4557379dfc..908c852b2ec8121838f249ed04310f714776cffb 100644 --- a/paddlespeech/t2s/exps/ge2e/dataset_processors.py +++ b/paddlespeech/vector/exps/ge2e/dataset_processors.py @@ -19,7 +19,7 @@ from typing import List import numpy as np from tqdm import tqdm -from paddlespeech.t2s.exps.ge2e.audio_processor import SpeakerVerificationPreprocessor +from paddlespeech.vector.exps.ge2e.audio_processor import SpeakerVerificationPreprocessor def _process_utterance(path_pair, processor: SpeakerVerificationPreprocessor): diff --git a/paddlespeech/t2s/exps/ge2e/inference.py b/paddlespeech/vector/exps/ge2e/inference.py similarity index 95% rename from paddlespeech/t2s/exps/ge2e/inference.py rename to paddlespeech/vector/exps/ge2e/inference.py index eed3b7947d6bf9c4f561ecedbe458281cd61ab97..7660de5e876529448b0e8f0e2a3f6185d15e9322 100644 --- a/paddlespeech/t2s/exps/ge2e/inference.py +++ b/paddlespeech/vector/exps/ge2e/inference.py @@ -18,9 +18,9 @@ import numpy as np import paddle import tqdm -from paddlespeech.t2s.exps.ge2e.audio_processor import SpeakerVerificationPreprocessor -from paddlespeech.t2s.exps.ge2e.config import get_cfg_defaults -from paddlespeech.t2s.models.lstm_speaker_encoder import LSTMSpeakerEncoder +from paddlespeech.vector.exps.ge2e.audio_processor import SpeakerVerificationPreprocessor +from paddlespeech.vector.exps.ge2e.config import get_cfg_defaults +from paddlespeech.vector.models.lstm_speaker_encoder import LSTMSpeakerEncoder def embed_utterance(processor, model, fpath_or_wav): diff --git a/paddlespeech/t2s/exps/ge2e/preprocess.py b/paddlespeech/vector/exps/ge2e/preprocess.py similarity index 87% rename from paddlespeech/t2s/exps/ge2e/preprocess.py rename to paddlespeech/vector/exps/ge2e/preprocess.py index 604ff0c6735f378cfda7052147823b3dd63a1780..dabe0ce7694547ed197a4d570bcec0399e9ac54e 100644 --- a/paddlespeech/t2s/exps/ge2e/preprocess.py +++ b/paddlespeech/vector/exps/ge2e/preprocess.py @@ -14,14 +14,13 @@ import argparse from pathlib import Path -from audio_processor import SpeakerVerificationPreprocessor - -from paddlespeech.t2s.exps.ge2e.config import get_cfg_defaults -from paddlespeech.t2s.exps.ge2e.dataset_processors import process_aidatatang_200zh -from paddlespeech.t2s.exps.ge2e.dataset_processors import process_librispeech -from paddlespeech.t2s.exps.ge2e.dataset_processors import process_magicdata -from paddlespeech.t2s.exps.ge2e.dataset_processors import process_voxceleb1 -from paddlespeech.t2s.exps.ge2e.dataset_processors import process_voxceleb2 +from paddlespeech.vector.exps.ge2e.audio_processor import SpeakerVerificationPreprocessor +from paddlespeech.vector.exps.ge2e.config import get_cfg_defaults +from 
+from paddlespeech.vector.exps.ge2e.dataset_processors import process_librispeech
+from paddlespeech.vector.exps.ge2e.dataset_processors import process_magicdata
+from paddlespeech.vector.exps.ge2e.dataset_processors import process_voxceleb1
+from paddlespeech.vector.exps.ge2e.dataset_processors import process_voxceleb2


 if __name__ == "__main__":
     parser = argparse.ArgumentParser(
diff --git a/paddlespeech/t2s/exps/ge2e/random_cycle.py b/paddlespeech/vector/exps/ge2e/random_cycle.py
similarity index 100%
rename from paddlespeech/t2s/exps/ge2e/random_cycle.py
rename to paddlespeech/vector/exps/ge2e/random_cycle.py
diff --git a/paddlespeech/t2s/exps/ge2e/speaker_verification_dataset.py b/paddlespeech/vector/exps/ge2e/speaker_verification_dataset.py
similarity index 98%
rename from paddlespeech/t2s/exps/ge2e/speaker_verification_dataset.py
rename to paddlespeech/vector/exps/ge2e/speaker_verification_dataset.py
index a132199692e077c7aced0cce5ba0709528e88a80..194eb7f28fb485e8fc61ba25fb9c9fcb61bf1802 100644
--- a/paddlespeech/t2s/exps/ge2e/speaker_verification_dataset.py
+++ b/paddlespeech/vector/exps/ge2e/speaker_verification_dataset.py
@@ -18,7 +18,7 @@ import numpy as np
 from paddle.io import BatchSampler
 from paddle.io import Dataset

-from paddlespeech.t2s.exps.ge2e.random_cycle import random_cycle
+from paddlespeech.vector.exps.ge2e.random_cycle import random_cycle


 class MultiSpeakerMelDataset(Dataset):
diff --git a/paddlespeech/t2s/exps/ge2e/train.py b/paddlespeech/vector/exps/ge2e/train.py
similarity index 91%
rename from paddlespeech/t2s/exps/ge2e/train.py
rename to paddlespeech/vector/exps/ge2e/train.py
index 55c6daf73fea8b829c97e821b3d4063345d7bd1a..bf1cf1074b5dec41f2287f5113b6facef9909283 100644
--- a/paddlespeech/t2s/exps/ge2e/train.py
+++ b/paddlespeech/vector/exps/ge2e/train.py
@@ -19,13 +19,13 @@ from paddle.io import DataLoader
 from paddle.nn.clip import ClipGradByGlobalNorm
 from paddle.optimizer import Adam

-from paddlespeech.t2s.exps.ge2e.config import get_cfg_defaults
-from paddlespeech.t2s.exps.ge2e.speaker_verification_dataset import Collate
-from paddlespeech.t2s.exps.ge2e.speaker_verification_dataset import MultiSpeakerMelDataset
-from paddlespeech.t2s.exps.ge2e.speaker_verification_dataset import MultiSpeakerSampler
-from paddlespeech.t2s.models.lstm_speaker_encoder import LSTMSpeakerEncoder
 from paddlespeech.t2s.training import default_argument_parser
 from paddlespeech.t2s.training import ExperimentBase
+from paddlespeech.vector.exps.ge2e.config import get_cfg_defaults
+from paddlespeech.vector.exps.ge2e.speaker_verification_dataset import Collate
+from paddlespeech.vector.exps.ge2e.speaker_verification_dataset import MultiSpeakerMelDataset
+from paddlespeech.vector.exps.ge2e.speaker_verification_dataset import MultiSpeakerSampler
+from paddlespeech.vector.models.lstm_speaker_encoder import LSTMSpeakerEncoder


 class Ge2eExperiment(ExperimentBase):
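With the renames above, everything that used to live under `paddlespeech.t2s.exps.ge2e` (plus `lstm_speaker_encoder`, moved just below) becomes reachable under the new `paddlespeech.vector` namespace; only import paths change, not class or function names. A minimal sketch of the updated imports for downstream code — the zero-argument call to `get_cfg_defaults()` follows the usual yacs pattern and is an assumption here, not something introduced by this diff:

    # Before this change:
    #   from paddlespeech.t2s.exps.ge2e.config import get_cfg_defaults
    #   from paddlespeech.t2s.models.lstm_speaker_encoder import LSTMSpeakerEncoder
    # After this change:
    from paddlespeech.vector.exps.ge2e.audio_processor import SpeakerVerificationPreprocessor
    from paddlespeech.vector.exps.ge2e.config import get_cfg_defaults
    from paddlespeech.vector.models.lstm_speaker_encoder import LSTMSpeakerEncoder

    config = get_cfg_defaults()  # assumed: returns the default yacs CfgNode
    print(config)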
diff --git a/paddlespeech/vector/models/__init__.py b/paddlespeech/vector/models/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..185a92b8d94d3426d616c0624f0f2ee04339349e
--- /dev/null
+++ b/paddlespeech/vector/models/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/paddlespeech/t2s/models/lstm_speaker_encoder.py b/paddlespeech/vector/models/lstm_speaker_encoder.py
similarity index 100%
rename from paddlespeech/t2s/models/lstm_speaker_encoder.py
rename to paddlespeech/vector/models/lstm_speaker_encoder.py
diff --git a/requirements.txt b/requirements.txt
index 8e2552e7059e24ebbedb1ef2b67530e6780eb0cb..99e485f867ea3dfaeee2f7e440d230f8802881b6 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,9 +1,11 @@
 ConfigArgParse
 coverage
+distro
 editdistance
 g2p_en
 g2pM
 gpustat
+GPUtil
 h5py
 inflect
 jieba
@@ -16,20 +18,24 @@ matplotlib
 nara_wpe
 nltk
 numba
-numpy==1.20.0
+paddlespeech_ctcdecoders
+paddlespeech_feat
 pandas
 phkit
 Pillow
 praatio~=4.1
 pre-commit
+psutil
 pybind11
+pynvml
+pypi-kenlm
 pypinyin
 python-dateutil
 pyworld
 resampy==0.2.2
 sacrebleu
 scipy
-sentencepiece
+sentencepiece~=0.1.96
 snakeviz
 soundfile~=0.10
 sox
@@ -40,13 +46,7 @@ timer
 tqdm
 typeguard
 unidecode
-visualdl==2.2.0
+visualdl
 webrtcvad
 yacs
 yq
-pypi-kenlm
-GPUtil
-psutil
-pynvml
-distro
-
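Besides adding the new runtime dependencies, the requirements change pins `sentencepiece` with a compatible-release specifier instead of leaving it unversioned, matching the existing `praatio~=4.1` and `soundfile~=0.10` pins. If the `~=` operator is unfamiliar, this small snippet (using the third-party `packaging` library, which pip itself builds on) shows what the pin accepts:

    from packaging.specifiers import SpecifierSet

    # "~=0.1.96" is a PEP 440 compatible-release pin: >=0.1.96 and ==0.1.*
    spec = SpecifierSet("~=0.1.96")
    print("0.1.96" in spec)  # True
    print("0.1.99" in spec)  # True  (newer patch releases are allowed)
    print("0.2.0" in spec)   # False (a minor-version bump is rejected)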
diff --git a/setup.py b/setup.py
index 29583e8c91c56dc057353430467a80d14740941f..310eed1e75b1b6e489df467707dd60c959a1c4e1 100644
--- a/setup.py
+++ b/setup.py
@@ -78,7 +78,6 @@ def _post_install(install_lib_dir):
         print(os.getcwd())
         check_call("./install_autolog.sh")
         print("autolog install.")

-    # ctcdecoder
     ctcdecoder_dir = HERE / 'paddlespeech/s2t/decoders/ctcdecoder/swig'
     with pushd(ctcdecoder_dir):
@@ -102,11 +101,9 @@ class DevelopCommand(develop):
 class InstallCommand(install):
     def run(self):
         install.run(self)
-        # must after install.run, or pkg install by shell will not see
-        self.execute(_post_install, (self.install_lib, ), msg="Post Install...")


-# cmd: python setup.py upload
+    # cmd: python setup.py upload
 class UploadCommand(Command):
     description = "Build and publish the package."
     user_options = []
@@ -133,11 +130,11 @@ class UploadCommand(Command):
 setup_info = dict(
     # Metadata
     name='paddlespeech',
-    version='2.1.2',
-    author='PaddleSL Speech Team',
-    author_email='',
-    url='https://github.com/PaddlePaddle/DeepSpeech',
-    license='Apache 2',
+    version='0.0.1a',
+    author='PaddlePaddle Speech and Language Team',
+    author_email='paddlesl@baidu.com',
+    url='https://github.com/PaddlePaddle/PaddleSpeech',
+    license='Apache 2.0',
     description='Speech tools and models based on Paddlepaddle',
     long_description=read("README.md"),
     long_description_content_type="text/markdown",
@@ -175,11 +172,11 @@ setup_info = dict(
     },

     # Package info
-    packages=find_packages(exclude=('tests', 'tests.*', 'examples*',
+    packages=find_packages(exclude=('utils', 'tests', 'tests.*', 'examples*',
                                     'paddleaudio*', 'third_party*', 'tools*')),
     zip_safe=True,
     classifiers=[
-        'Development Status :: 4 - Beta',
+        'Development Status :: 3 - Alpha',
         'Intended Audience :: Developers',
         'Intended Audience :: Science/Research',
         'Topic :: Scientific/Engineering :: Artificial Intelligence',
@@ -189,6 +186,7 @@ setup_info = dict(
         'Programming Language :: Python :: 3.6',
         'Programming Language :: Python :: 3.7',
         'Programming Language :: Python :: 3.8',
+        'Programming Language :: Python :: 3.9',
     ],
 )
 setup(**setup_info)
diff --git a/third_party/python_kaldi_features/setup.py b/third_party/python_kaldi_features/setup.py
index 47c77718636963b1f2852118cba49957159d92d0..c76f23b5146b1b48ca683a4a5050a7b3c82613c3 100644
--- a/third_party/python_kaldi_features/setup.py
+++ b/third_party/python_kaldi_features/setup.py
@@ -3,12 +3,16 @@ try:
 except ImportError:
     from distutils.core import setup

-setup(name='python_speech_features',
-      version='0.6',
-      description='Python Speech Feature extraction',
-      author='James Lyons',
-      author_email='james.lyons0@gmail.com',
+with open("requirements.txt", encoding="utf-8-sig") as f:
+    requirements = f.readlines()
+
+setup(name='paddlespeech_feat',
+      version='0.0.1a',
+      description='python speech feature extraction in paddlespeech',
+      install_requires=requirements,
+      author="PaddlePaddle Speech and Language Team",
+      author_email="paddlesl@baidu.com",
       license='MIT',
-      url='https://github.com/jameslyons/python_speech_features',
+      url='https://github.com/PaddlePaddle/PaddleSpeech',
       packages=['python_speech_features'],
       )
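One packaging detail worth calling out from the `setup.py` hunks above: `'utils'` joins the `find_packages` exclude list, so the top-level `utils` directory stays out of the built wheel along with tests, examples, and vendored code, while the new `paddlespeech.vector` subpackages are picked up automatically. A quick, hypothetical check that can be run from a repository checkout:

    from setuptools import find_packages

    # Mirrors the exclude list from the updated setup.py; run from the repo root.
    packages = find_packages(exclude=('utils', 'tests', 'tests.*', 'examples*',
                                      'paddleaudio*', 'third_party*', 'tools*'))

    # Everything that survives should sit under the paddlespeech namespace,
    # including the paddlespeech.vector modules added in this change.
    print(sorted(p for p in packages if p.startswith('paddlespeech.vector')))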