diff --git a/README.md b/README.md index 26854ae6fa93d8e9559d8c7594ef04ad6f6a9632..3fdb2945f55eb44db902a2b97c1b7f4382892b59 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,4 @@ + ([简体中文](./README_cn.md)|English)

@@ -494,6 +495,14 @@ PaddleSpeech supports a series of most popular models. They are summarized in [r ge2e-fastspeech2-aishell3 + + End-to-End + VITS + CSMSC + + VITS-csmsc + + diff --git a/README_cn.md b/README_cn.md index 8c018a08e3d99a3e33d9750c3eacebb41cc17d80..91a01d71047c1aa38cd8c9d059b39e9ca5245d7a 100644 --- a/README_cn.md +++ b/README_cn.md @@ -1,3 +1,4 @@ + (简体中文|[English](./README.md))

@@ -481,6 +482,15 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声 ge2e-fastspeech2-aishell3 + + + 端到端 + VITS + CSMSC + + VITS-csmsc + + diff --git a/dataset/aidatatang_200zh/README.md b/dataset/aidatatang_200zh/README.md index e6f1eefbd1f9f885bb36b075f79e3855bfc4b834..addc323a6c5e1dff621a0acf6fc8a1c6d39feae9 100644 --- a/dataset/aidatatang_200zh/README.md +++ b/dataset/aidatatang_200zh/README.md @@ -1,4 +1,4 @@ -# [Aidatatang_200zh](http://www.openslr.org/62/) +# [Aidatatang_200zh](http://openslr.elda.org/62/) Aidatatang_200zh is a free Chinese Mandarin speech corpus provided by Beijing DataTang Technology Co., Ltd under Creative Commons Attribution-NonCommercial-NoDerivatives 4.0 International Public License. The contents and the corresponding descriptions of the corpus include: diff --git a/dataset/aishell/README.md b/dataset/aishell/README.md index 6770cd20777c441601e174f77d2801f7559ee767..a7dd0cf326ad51dc49fc83207a89fe9adc457dbf 100644 --- a/dataset/aishell/README.md +++ b/dataset/aishell/README.md @@ -1,3 +1,3 @@ -# [Aishell1](http://www.openslr.org/33/) +# [Aishell1](http://openslr.elda.org/33/) This Open Source Mandarin Speech Corpus, AISHELL-ASR0009-OS1, is 178 hours long. It is a part of AISHELL-ASR0009, of which utterance contains 11 domains, including smart home, autonomous driving, and industrial production. The whole recording was put in quiet indoor environment, using 3 different devices at the same time: high fidelity microphone (44.1kHz, 16-bit,); Android-system mobile phone (16kHz, 16-bit), iOS-system mobile phone (16kHz, 16-bit). Audios in high fidelity were re-sampled to 16kHz to build AISHELL- ASR0009-OS1. 400 speakers from different accent areas in China were invited to participate in the recording. The manual transcription accuracy rate is above 95%, through professional speech annotation and strict quality inspection. The corpus is divided into training, development and testing sets. 
( This database is free for academic research, not in the commerce, if without permission. ) diff --git a/dataset/aishell/aishell.py b/dataset/aishell/aishell.py index 7431fc08369546f372c93dc923f50300f1da10a3..ec43104dbc9dc4efc693c6b97b5fb004bc14ce1e 100644 --- a/dataset/aishell/aishell.py +++ b/dataset/aishell/aishell.py @@ -31,7 +31,7 @@ from utils.utility import unpack DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech') -URL_ROOT = 'http://www.openslr.org/resources/33' +URL_ROOT = 'http://openslr.elda.org/resources/33' # URL_ROOT = 'https://openslr.magicdatatech.com/resources/33' DATA_URL = URL_ROOT + '/data_aishell.tgz' MD5_DATA = '2f494334227864a8a8fec932999db9d8' diff --git a/dataset/librispeech/librispeech.py b/dataset/librispeech/librispeech.py index 65cab2490305762b84a06408b6d302517caea182..2d6f1763d9eccda4d8208538e0b76174c4c8445d 100644 --- a/dataset/librispeech/librispeech.py +++ b/dataset/librispeech/librispeech.py @@ -31,7 +31,7 @@ import soundfile from utils.utility import download from utils.utility import unpack -URL_ROOT = "http://www.openslr.org/resources/12" +URL_ROOT = "http://openslr.elda.org/resources/12" #URL_ROOT = "https://openslr.magicdatatech.com/resources/12" URL_TEST_CLEAN = URL_ROOT + "/test-clean.tar.gz" URL_TEST_OTHER = URL_ROOT + "/test-other.tar.gz" diff --git a/dataset/magicdata/README.md b/dataset/magicdata/README.md index 083aee97b9fec0bd916fdd1fc125319881894c0c..4641a21d6cdfb765605440a66a091d35c6daee38 100644 --- a/dataset/magicdata/README.md +++ b/dataset/magicdata/README.md @@ -1,4 +1,4 @@ -# [MagicData](http://www.openslr.org/68/) +# [MagicData](http://openslr.elda.org/68/) MAGICDATA Mandarin Chinese Read Speech Corpus was developed by MAGIC DATA Technology Co., Ltd. and freely published for non-commercial use. 
The contents and the corresponding descriptions of the corpus include: diff --git a/dataset/mini_librispeech/mini_librispeech.py b/dataset/mini_librispeech/mini_librispeech.py index 730c73a8b4dc44691351717de1bfe918f3b957ac..0eb80bf8f52a6ea23e114948ebaec30c5ad8d4cd 100644 --- a/dataset/mini_librispeech/mini_librispeech.py +++ b/dataset/mini_librispeech/mini_librispeech.py @@ -30,7 +30,7 @@ import soundfile from utils.utility import download from utils.utility import unpack -URL_ROOT = "http://www.openslr.org/resources/31" +URL_ROOT = "http://openslr.elda.org/resources/31" URL_TRAIN_CLEAN = URL_ROOT + "/train-clean-5.tar.gz" URL_DEV_CLEAN = URL_ROOT + "/dev-clean-2.tar.gz" diff --git a/dataset/musan/musan.py b/dataset/musan/musan.py index 2ac701bed0c9c24be1d1dffbd0482b6f4ce3f473..ae3430b2a3e5bc57631de668279b3ec3c9225b44 100644 --- a/dataset/musan/musan.py +++ b/dataset/musan/musan.py @@ -34,7 +34,7 @@ from utils.utility import unpack DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech') -URL_ROOT = 'https://www.openslr.org/resources/17' +URL_ROOT = 'https://openslr.elda.org/resources/17' DATA_URL = URL_ROOT + '/musan.tar.gz' MD5_DATA = '0c472d4fc0c5141eca47ad1ffeb2a7df' diff --git a/dataset/primewords/README.md b/dataset/primewords/README.md index a4f1ed65d01cc5db123f68beba3f69c7ef8be8ae..dba51cec7998fe3411613d58ce7571c2ddd47220 100644 --- a/dataset/primewords/README.md +++ b/dataset/primewords/README.md @@ -1,4 +1,4 @@ -# [Primewords](http://www.openslr.org/47/) +# [Primewords](http://openslr.elda.org/47/) This free Chinese Mandarin speech corpus set is released by Shanghai Primewords Information Technology Co., Ltd. The corpus is recorded by smart mobile phones from 296 native Chinese speakers. The transcription accuracy is larger than 98%, at the confidence level of 95%. It is free for academic use. 
diff --git a/dataset/rir_noise/rir_noise.py b/dataset/rir_noise/rir_noise.py index 009175e5bcce158b427cdf676540f2d1a7464032..b1d475584064237354fd9b309264a5dc8b184d11 100644 --- a/dataset/rir_noise/rir_noise.py +++ b/dataset/rir_noise/rir_noise.py @@ -34,7 +34,7 @@ from utils.utility import unzip DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech') -URL_ROOT = '--no-check-certificate http://www.openslr.org/resources/28' +URL_ROOT = '--no-check-certificate https://us.openslr.org/resources/28' DATA_URL = URL_ROOT + '/rirs_noises.zip' MD5_DATA = 'e6f48e257286e05de56413b4779d8ffb' diff --git a/dataset/st-cmds/README.md b/dataset/st-cmds/README.md index c7ae50e59d206e47cecbe19fe42d3f35004f603a..bbf85c3e7ef6c5f1194622686b9941a743d013bd 100644 --- a/dataset/st-cmds/README.md +++ b/dataset/st-cmds/README.md @@ -1 +1 @@ -# [FreeST](http://www.openslr.org/38/) +# [FreeST](http://openslr.elda.org/38/) diff --git a/dataset/thchs30/README.md b/dataset/thchs30/README.md index 6b59d663a2d94fef01f42e9c7d8191ec10b4b43a..b488a3551a81751b883f1b6311e8e3424094aba4 100644 --- a/dataset/thchs30/README.md +++ b/dataset/thchs30/README.md @@ -1,4 +1,4 @@ -# [THCHS30](http://www.openslr.org/18/) +# [THCHS30](http://openslr.elda.org/18/) This is the *data part* of the `THCHS30 2015` acoustic data & scripts dataset. 
diff --git a/dataset/thchs30/thchs30.py b/dataset/thchs30/thchs30.py index cdfc0a75c0aacfdf89492d2f83642cb7f5decea8..d41c0e175c7ccd2a8252592908b4cbaf89bade72 100644 --- a/dataset/thchs30/thchs30.py +++ b/dataset/thchs30/thchs30.py @@ -32,7 +32,7 @@ from utils.utility import unpack DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech') -URL_ROOT = 'http://www.openslr.org/resources/18' +URL_ROOT = 'http://openslr.elda.org/resources/18' # URL_ROOT = 'https://openslr.magicdatatech.com/resources/18' DATA_URL = URL_ROOT + '/data_thchs30.tgz' TEST_NOISE_URL = URL_ROOT + '/test-noise.tgz' diff --git a/demos/streaming_asr_server/web/app.py b/demos/streaming_asr_server/web/app.py deleted file mode 100644 index 22993c08efe9f81b5bddd316b624ee0d6f5ef821..0000000000000000000000000000000000000000 --- a/demos/streaming_asr_server/web/app.py +++ /dev/null @@ -1,23 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -# Copyright 2021 Mobvoi Inc. All Rights Reserved. -# Author: zhendong.peng@mobvoi.com (Zhendong Peng) -import argparse - -from flask import Flask -from flask import render_template - -parser = argparse.ArgumentParser(description='training your network') -parser.add_argument('--port', default=19999, type=int, help='port id') -args = parser.parse_args() - -app = Flask(__name__) - - -@app.route('/') -def index(): - return render_template('index.html') - - -if __name__ == '__main__': - app.run(host='0.0.0.0', port=args.port, debug=True) diff --git a/demos/streaming_asr_server/web/favicon.ico b/demos/streaming_asr_server/web/favicon.ico new file mode 100644 index 0000000000000000000000000000000000000000..342038720d7c5a8fbbef1110d098e50f7a0e6274 Binary files /dev/null and b/demos/streaming_asr_server/web/favicon.ico differ diff --git a/demos/streaming_asr_server/web/index.html b/demos/streaming_asr_server/web/index.html new file mode 100644 index 0000000000000000000000000000000000000000..33c676c55c3cb3618a1388570d45466f0fddc7e3 --- /dev/null +++ 
b/demos/streaming_asr_server/web/index.html @@ -0,0 +1,218 @@ + + + + + + + 飞桨PaddleSpeech + + + + +

+ + + diff --git a/demos/streaming_asr_server/web/paddle_web_demo.png b/demos/streaming_asr_server/web/paddle_web_demo.png index 214edffd076bd4f6df18b4faa3587239154b958a..db4b63ab9ed39cdc2c4ab75e291ae0dab02859c6 100644 Binary files a/demos/streaming_asr_server/web/paddle_web_demo.png and b/demos/streaming_asr_server/web/paddle_web_demo.png differ diff --git a/demos/streaming_asr_server/web/readme.md b/demos/streaming_asr_server/web/readme.md index 8310a25714d99e1015199c0899e2af15eb05c809..bef421711a22d161880077c6dce2d28248cd1612 100644 --- a/demos/streaming_asr_server/web/readme.md +++ b/demos/streaming_asr_server/web/readme.md @@ -1,18 +1,20 @@ # paddlespeech serving 网页Demo -- 感谢[wenet](https://github.com/wenet-e2e/wenet)团队的前端demo代码. +![图片](./paddle_web_demo.png) +step1: 开启流式语音识别服务器端 -## 使用方法 -### 1. 在本地电脑启动网页服务 - ``` - python app.py +``` +# 开启流式语音识别服务 +cd PaddleSpeech/demos/streaming_asr_server +paddlespeech_server start --config_file conf/ws_conformer_wenetspeech_application_faster.yaml +``` - ``` +step2: 谷歌浏览器打开 `web`目录下`index.html` -### 2. 本地电脑浏览器 +step3: 点击`连接`,验证WebSocket是否成功连接 + +step4:点击开始录音(弹窗询问,允许录音) -在浏览器中输入127.0.0.1:19999 即可看到相关网页Demo。 -![图片](./paddle_web_demo.png) diff --git a/demos/streaming_asr_server/web/static/css/font-awesome.min.css b/demos/streaming_asr_server/web/static/css/font-awesome.min.css deleted file mode 100644 index 540440ce89f2a408aa699b65100e18f15e0f09ca..0000000000000000000000000000000000000000 --- a/demos/streaming_asr_server/web/static/css/font-awesome.min.css +++ /dev/null @@ -1,4 +0,0 @@ -/*! 
- * Font Awesome 4.7.0 by @davegandy - http://fontawesome.io - @fontawesome - * License - http://fontawesome.io/license (Font: SIL OFL 1.1, CSS: MIT License) - */@font-face{font-family:'FontAwesome';src:url('../fonts/fontawesome-webfont.eot?v=4.7.0');src:url('../fonts/fontawesome-webfont.eot?#iefix&v=4.7.0') format('embedded-opentype'),url('../fonts/fontawesome-webfont.woff2?v=4.7.0') format('woff2'),url('../fonts/fontawesome-webfont.woff?v=4.7.0') format('woff'),url('../fonts/fontawesome-webfont.ttf?v=4.7.0') format('truetype'),url('../fonts/fontawesome-webfont.svg?v=4.7.0#fontawesomeregular') format('svg');font-weight:normal;font-style:normal}.fa{display:inline-block;font:normal normal normal 14px/1 FontAwesome;font-size:inherit;text-rendering:auto;-webkit-font-smoothing:antialiased;-moz-osx-font-smoothing:grayscale}.fa-lg{font-size:1.33333333em;line-height:.75em;vertical-align:-15%}.fa-2x{font-size:2em}.fa-3x{font-size:3em}.fa-4x{font-size:4em}.fa-5x{font-size:5em}.fa-fw{width:1.28571429em;text-align:center}.fa-ul{padding-left:0;margin-left:2.14285714em;list-style-type:none}.fa-ul>li{position:relative}.fa-li{position:absolute;left:-2.14285714em;width:2.14285714em;top:.14285714em;text-align:center}.fa-li.fa-lg{left:-1.85714286em}.fa-border{padding:.2em .25em .15em;border:solid .08em #eee;border-radius:.1em}.fa-pull-left{float:left}.fa-pull-right{float:right}.fa.fa-pull-left{margin-right:.3em}.fa.fa-pull-right{margin-left:.3em}.pull-right{float:right}.pull-left{float:left}.fa.pull-left{margin-right:.3em}.fa.pull-right{margin-left:.3em}.fa-spin{-webkit-animation:fa-spin 2s infinite linear;animation:fa-spin 2s infinite linear}.fa-pulse{-webkit-animation:fa-spin 1s infinite steps(8);animation:fa-spin 1s infinite steps(8)}@-webkit-keyframes fa-spin{0%{-webkit-transform:rotate(0deg);transform:rotate(0deg)}100%{-webkit-transform:rotate(359deg);transform:rotate(359deg)}}@keyframes 
fa-spin{0%{-webkit-transform:rotate(0deg);transform:rotate(0deg)}100%{-webkit-transform:rotate(359deg);transform:rotate(359deg)}}.fa-rotate-90{-ms-filter:"progid:DXImageTransform.Microsoft.BasicImage(rotation=1)";-webkit-transform:rotate(90deg);-ms-transform:rotate(90deg);transform:rotate(90deg)}.fa-rotate-180{-ms-filter:"progid:DXImageTransform.Microsoft.BasicImage(rotation=2)";-webkit-transform:rotate(180deg);-ms-transform:rotate(180deg);transform:rotate(180deg)}.fa-rotate-270{-ms-filter:"progid:DXImageTransform.Microsoft.BasicImage(rotation=3)";-webkit-transform:rotate(270deg);-ms-transform:rotate(270deg);transform:rotate(270deg)}.fa-flip-horizontal{-ms-filter:"progid:DXImageTransform.Microsoft.BasicImage(rotation=0, mirror=1)";-webkit-transform:scale(-1, 1);-ms-transform:scale(-1, 1);transform:scale(-1, 1)}.fa-flip-vertical{-ms-filter:"progid:DXImageTransform.Microsoft.BasicImage(rotation=2, mirror=1)";-webkit-transform:scale(1, -1);-ms-transform:scale(1, -1);transform:scale(1, -1)}:root .fa-rotate-90,:root .fa-rotate-180,:root .fa-rotate-270,:root .fa-flip-horizontal,:root 
.fa-flip-vertical{filter:none}.fa-stack{position:relative;display:inline-block;width:2em;height:2em;line-height:2em;vertical-align:middle}.fa-stack-1x,.fa-stack-2x{position:absolute;left:0;width:100%;text-align:center}.fa-stack-1x{line-height:inherit}.fa-stack-2x{font-size:2em}.fa-inverse{color:#fff}.fa-glass:before{content:"\f000"}.fa-music:before{content:"\f001"}.fa-search:before{content:"\f002"}.fa-envelope-o:before{content:"\f003"}.fa-heart:before{content:"\f004"}.fa-star:before{content:"\f005"}.fa-star-o:before{content:"\f006"}.fa-user:before{content:"\f007"}.fa-film:before{content:"\f008"}.fa-th-large:before{content:"\f009"}.fa-th:before{content:"\f00a"}.fa-th-list:before{content:"\f00b"}.fa-check:before{content:"\f00c"}.fa-remove:before,.fa-close:before,.fa-times:before{content:"\f00d"}.fa-search-plus:before{content:"\f00e"}.fa-search-minus:before{content:"\f010"}.fa-power-off:before{content:"\f011"}.fa-signal:before{content:"\f012"}.fa-gear:before,.fa-cog:before{content:"\f013"}.fa-trash-o:before{content:"\f014"}.fa-home:before{content:"\f015"}.fa-file-o:before{content:"\f016"}.fa-clock-o:before{content:"\f017"}.fa-road:before{content:"\f018"}.fa-download:before{content:"\f019"}.fa-arrow-circle-o-down:before{content:"\f01a"}.fa-arrow-circle-o-up:before{content:"\f01b"}.fa-inbox:before{content:"\f01c"}.fa-play-circle-o:before{content:"\f01d"}.fa-rotate-right:before,.fa-repeat:before{content:"\f01e"}.fa-refresh:before{content:"\f021"}.fa-list-alt:before{content:"\f022"}.fa-lock:before{content:"\f023"}.fa-flag:before{content:"\f024"}.fa-headphones:before{content:"\f025"}.fa-volume-off:before{content:"\f026"}.fa-volume-down:before{content:"\f027"}.fa-volume-up:before{content:"\f028"}.fa-qrcode:before{content:"\f029"}.fa-barcode:before{content:"\f02a"}.fa-tag:before{content:"\f02b"}.fa-tags:before{content:"\f02c"}.fa-book:before{content:"\f02d"}.fa-bookmark:before{content:"\f02e"}.fa-print:before{content:"\f02f"}.fa-camera:before{content:"\f030"}.fa-font:before{c
ontent:"\f031"}.fa-bold:before{content:"\f032"}.fa-italic:before{content:"\f033"}.fa-text-height:before{content:"\f034"}.fa-text-width:before{content:"\f035"}.fa-align-left:before{content:"\f036"}.fa-align-center:before{content:"\f037"}.fa-align-right:before{content:"\f038"}.fa-align-justify:before{content:"\f039"}.fa-list:before{content:"\f03a"}.fa-dedent:before,.fa-outdent:before{content:"\f03b"}.fa-indent:before{content:"\f03c"}.fa-video-camera:before{content:"\f03d"}.fa-photo:before,.fa-image:before,.fa-picture-o:before{content:"\f03e"}.fa-pencil:before{content:"\f040"}.fa-map-marker:before{content:"\f041"}.fa-adjust:before{content:"\f042"}.fa-tint:before{content:"\f043"}.fa-edit:before,.fa-pencil-square-o:before{content:"\f044"}.fa-share-square-o:before{content:"\f045"}.fa-check-square-o:before{content:"\f046"}.fa-arrows:before{content:"\f047"}.fa-step-backward:before{content:"\f048"}.fa-fast-backward:before{content:"\f049"}.fa-backward:before{content:"\f04a"}.fa-play:before{content:"\f04b"}.fa-pause:before{content:"\f04c"}.fa-stop:before{content:"\f04d"}.fa-forward:before{content:"\f04e"}.fa-fast-forward:before{content:"\f050"}.fa-step-forward:before{content:"\f051"}.fa-eject:before{content:"\f052"}.fa-chevron-left:before{content:"\f053"}.fa-chevron-right:before{content:"\f054"}.fa-plus-circle:before{content:"\f055"}.fa-minus-circle:before{content:"\f056"}.fa-times-circle:before{content:"\f057"}.fa-check-circle:before{content:"\f058"}.fa-question-circle:before{content:"\f059"}.fa-info-circle:before{content:"\f05a"}.fa-crosshairs:before{content:"\f05b"}.fa-times-circle-o:before{content:"\f05c"}.fa-check-circle-o:before{content:"\f05d"}.fa-ban:before{content:"\f05e"}.fa-arrow-left:before{content:"\f060"}.fa-arrow-right:before{content:"\f061"}.fa-arrow-up:before{content:"\f062"}.fa-arrow-down:before{content:"\f063"}.fa-mail-forward:before,.fa-share:before{content:"\f064"}.fa-expand:before{content:"\f065"}.fa-compress:before{content:"\f066"}.fa-plus:before{content
:"\f067"}.fa-minus:before{content:"\f068"}.fa-asterisk:before{content:"\f069"}.fa-exclamation-circle:before{content:"\f06a"}.fa-gift:before{content:"\f06b"}.fa-leaf:before{content:"\f06c"}.fa-fire:before{content:"\f06d"}.fa-eye:before{content:"\f06e"}.fa-eye-slash:before{content:"\f070"}.fa-warning:before,.fa-exclamation-triangle:before{content:"\f071"}.fa-plane:before{content:"\f072"}.fa-calendar:before{content:"\f073"}.fa-random:before{content:"\f074"}.fa-comment:before{content:"\f075"}.fa-magnet:before{content:"\f076"}.fa-chevron-up:before{content:"\f077"}.fa-chevron-down:before{content:"\f078"}.fa-retweet:before{content:"\f079"}.fa-shopping-cart:before{content:"\f07a"}.fa-folder:before{content:"\f07b"}.fa-folder-open:before{content:"\f07c"}.fa-arrows-v:before{content:"\f07d"}.fa-arrows-h:before{content:"\f07e"}.fa-bar-chart-o:before,.fa-bar-chart:before{content:"\f080"}.fa-twitter-square:before{content:"\f081"}.fa-facebook-square:before{content:"\f082"}.fa-camera-retro:before{content:"\f083"}.fa-key:before{content:"\f084"}.fa-gears:before,.fa-cogs:before{content:"\f085"}.fa-comments:before{content:"\f086"}.fa-thumbs-o-up:before{content:"\f087"}.fa-thumbs-o-down:before{content:"\f088"}.fa-star-half:before{content:"\f089"}.fa-heart-o:before{content:"\f08a"}.fa-sign-out:before{content:"\f08b"}.fa-linkedin-square:before{content:"\f08c"}.fa-thumb-tack:before{content:"\f08d"}.fa-external-link:before{content:"\f08e"}.fa-sign-in:before{content:"\f090"}.fa-trophy:before{content:"\f091"}.fa-github-square:before{content:"\f092"}.fa-upload:before{content:"\f093"}.fa-lemon-o:before{content:"\f094"}.fa-phone:before{content:"\f095"}.fa-square-o:before{content:"\f096"}.fa-bookmark-o:before{content:"\f097"}.fa-phone-square:before{content:"\f098"}.fa-twitter:before{content:"\f099"}.fa-facebook-f:before,.fa-facebook:before{content:"\f09a"}.fa-github:before{content:"\f09b"}.fa-unlock:before{content:"\f09c"}.fa-credit-card:before{content:"\f09d"}.fa-feed:before,.fa-rss:before{conten
t:"\f09e"}.fa-hdd-o:before{content:"\f0a0"}.fa-bullhorn:before{content:"\f0a1"}.fa-bell:before{content:"\f0f3"}.fa-certificate:before{content:"\f0a3"}.fa-hand-o-right:before{content:"\f0a4"}.fa-hand-o-left:before{content:"\f0a5"}.fa-hand-o-up:before{content:"\f0a6"}.fa-hand-o-down:before{content:"\f0a7"}.fa-arrow-circle-left:before{content:"\f0a8"}.fa-arrow-circle-right:before{content:"\f0a9"}.fa-arrow-circle-up:before{content:"\f0aa"}.fa-arrow-circle-down:before{content:"\f0ab"}.fa-globe:before{content:"\f0ac"}.fa-wrench:before{content:"\f0ad"}.fa-tasks:before{content:"\f0ae"}.fa-filter:before{content:"\f0b0"}.fa-briefcase:before{content:"\f0b1"}.fa-arrows-alt:before{content:"\f0b2"}.fa-group:before,.fa-users:before{content:"\f0c0"}.fa-chain:before,.fa-link:before{content:"\f0c1"}.fa-cloud:before{content:"\f0c2"}.fa-flask:before{content:"\f0c3"}.fa-cut:before,.fa-scissors:before{content:"\f0c4"}.fa-copy:before,.fa-files-o:before{content:"\f0c5"}.fa-paperclip:before{content:"\f0c6"}.fa-save:before,.fa-floppy-o:before{content:"\f0c7"}.fa-square:before{content:"\f0c8"}.fa-navicon:before,.fa-reorder:before,.fa-bars:before{content:"\f0c9"}.fa-list-ul:before{content:"\f0ca"}.fa-list-ol:before{content:"\f0cb"}.fa-strikethrough:before{content:"\f0cc"}.fa-underline:before{content:"\f0cd"}.fa-table:before{content:"\f0ce"}.fa-magic:before{content:"\f0d0"}.fa-truck:before{content:"\f0d1"}.fa-pinterest:before{content:"\f0d2"}.fa-pinterest-square:before{content:"\f0d3"}.fa-google-plus-square:before{content:"\f0d4"}.fa-google-plus:before{content:"\f0d5"}.fa-money:before{content:"\f0d6"}.fa-caret-down:before{content:"\f0d7"}.fa-caret-up:before{content:"\f0d8"}.fa-caret-left:before{content:"\f0d9"}.fa-caret-right:before{content:"\f0da"}.fa-columns:before{content:"\f0db"}.fa-unsorted:before,.fa-sort:before{content:"\f0dc"}.fa-sort-down:before,.fa-sort-desc:before{content:"\f0dd"}.fa-sort-up:before,.fa-sort-asc:before{content:"\f0de"}.fa-envelope:before{content:"\f0e0"}.fa-linkedin:b
efore{content:"\f0e1"}.fa-rotate-left:before,.fa-undo:before{content:"\f0e2"}.fa-legal:before,.fa-gavel:before{content:"\f0e3"}.fa-dashboard:before,.fa-tachometer:before{content:"\f0e4"}.fa-comment-o:before{content:"\f0e5"}.fa-comments-o:before{content:"\f0e6"}.fa-flash:before,.fa-bolt:before{content:"\f0e7"}.fa-sitemap:before{content:"\f0e8"}.fa-umbrella:before{content:"\f0e9"}.fa-paste:before,.fa-clipboard:before{content:"\f0ea"}.fa-lightbulb-o:before{content:"\f0eb"}.fa-exchange:before{content:"\f0ec"}.fa-cloud-download:before{content:"\f0ed"}.fa-cloud-upload:before{content:"\f0ee"}.fa-user-md:before{content:"\f0f0"}.fa-stethoscope:before{content:"\f0f1"}.fa-suitcase:before{content:"\f0f2"}.fa-bell-o:before{content:"\f0a2"}.fa-coffee:before{content:"\f0f4"}.fa-cutlery:before{content:"\f0f5"}.fa-file-text-o:before{content:"\f0f6"}.fa-building-o:before{content:"\f0f7"}.fa-hospital-o:before{content:"\f0f8"}.fa-ambulance:before{content:"\f0f9"}.fa-medkit:before{content:"\f0fa"}.fa-fighter-jet:before{content:"\f0fb"}.fa-beer:before{content:"\f0fc"}.fa-h-square:before{content:"\f0fd"}.fa-plus-square:before{content:"\f0fe"}.fa-angle-double-left:before{content:"\f100"}.fa-angle-double-right:before{content:"\f101"}.fa-angle-double-up:before{content:"\f102"}.fa-angle-double-down:before{content:"\f103"}.fa-angle-left:before{content:"\f104"}.fa-angle-right:before{content:"\f105"}.fa-angle-up:before{content:"\f106"}.fa-angle-down:before{content:"\f107"}.fa-desktop:before{content:"\f108"}.fa-laptop:before{content:"\f109"}.fa-tablet:before{content:"\f10a"}.fa-mobile-phone:before,.fa-mobile:before{content:"\f10b"}.fa-circle-o:before{content:"\f10c"}.fa-quote-left:before{content:"\f10d"}.fa-quote-right:before{content:"\f10e"}.fa-spinner:before{content:"\f110"}.fa-circle:before{content:"\f111"}.fa-mail-reply:before,.fa-reply:before{content:"\f112"}.fa-github-alt:before{content:"\f113"}.fa-folder-o:before{content:"\f114"}.fa-folder-open-o:before{content:"\f115"}.fa-smile-o:before{c
ontent:"\f118"}.fa-frown-o:before{content:"\f119"}.fa-meh-o:before{content:"\f11a"}.fa-gamepad:before{content:"\f11b"}.fa-keyboard-o:before{content:"\f11c"}.fa-flag-o:before{content:"\f11d"}.fa-flag-checkered:before{content:"\f11e"}.fa-terminal:before{content:"\f120"}.fa-code:before{content:"\f121"}.fa-mail-reply-all:before,.fa-reply-all:before{content:"\f122"}.fa-star-half-empty:before,.fa-star-half-full:before,.fa-star-half-o:before{content:"\f123"}.fa-location-arrow:before{content:"\f124"}.fa-crop:before{content:"\f125"}.fa-code-fork:before{content:"\f126"}.fa-unlink:before,.fa-chain-broken:before{content:"\f127"}.fa-question:before{content:"\f128"}.fa-info:before{content:"\f129"}.fa-exclamation:before{content:"\f12a"}.fa-superscript:before{content:"\f12b"}.fa-subscript:before{content:"\f12c"}.fa-eraser:before{content:"\f12d"}.fa-puzzle-piece:before{content:"\f12e"}.fa-microphone:before{content:"\f130"}.fa-microphone-slash:before{content:"\f131"}.fa-shield:before{content:"\f132"}.fa-calendar-o:before{content:"\f133"}.fa-fire-extinguisher:before{content:"\f134"}.fa-rocket:before{content:"\f135"}.fa-maxcdn:before{content:"\f136"}.fa-chevron-circle-left:before{content:"\f137"}.fa-chevron-circle-right:before{content:"\f138"}.fa-chevron-circle-up:before{content:"\f139"}.fa-chevron-circle-down:before{content:"\f13a"}.fa-html5:before{content:"\f13b"}.fa-css3:before{content:"\f13c"}.fa-anchor:before{content:"\f13d"}.fa-unlock-alt:before{content:"\f13e"}.fa-bullseye:before{content:"\f140"}.fa-ellipsis-h:before{content:"\f141"}.fa-ellipsis-v:before{content:"\f142"}.fa-rss-square:before{content:"\f143"}.fa-play-circle:before{content:"\f144"}.fa-ticket:before{content:"\f145"}.fa-minus-square:before{content:"\f146"}.fa-minus-square-o:before{content:"\f147"}.fa-level-up:before{content:"\f148"}.fa-level-down:before{content:"\f149"}.fa-check-square:before{content:"\f14a"}.fa-pencil-square:before{content:"\f14b"}.fa-external-link-square:before{content:"\f14c"}.fa-share-square:bef
ore{content:"\f14d"}.fa-compass:before{content:"\f14e"}.fa-toggle-down:before,.fa-caret-square-o-down:before{content:"\f150"}.fa-toggle-up:before,.fa-caret-square-o-up:before{content:"\f151"}.fa-toggle-right:before,.fa-caret-square-o-right:before{content:"\f152"}.fa-euro:before,.fa-eur:before{content:"\f153"}.fa-gbp:before{content:"\f154"}.fa-dollar:before,.fa-usd:before{content:"\f155"}.fa-rupee:before,.fa-inr:before{content:"\f156"}.fa-cny:before,.fa-rmb:before,.fa-yen:before,.fa-jpy:before{content:"\f157"}.fa-ruble:before,.fa-rouble:before,.fa-rub:before{content:"\f158"}.fa-won:before,.fa-krw:before{content:"\f159"}.fa-bitcoin:before,.fa-btc:before{content:"\f15a"}.fa-file:before{content:"\f15b"}.fa-file-text:before{content:"\f15c"}.fa-sort-alpha-asc:before{content:"\f15d"}.fa-sort-alpha-desc:before{content:"\f15e"}.fa-sort-amount-asc:before{content:"\f160"}.fa-sort-amount-desc:before{content:"\f161"}.fa-sort-numeric-asc:before{content:"\f162"}.fa-sort-numeric-desc:before{content:"\f163"}.fa-thumbs-up:before{content:"\f164"}.fa-thumbs-down:before{content:"\f165"}.fa-youtube-square:before{content:"\f166"}.fa-youtube:before{content:"\f167"}.fa-xing:before{content:"\f168"}.fa-xing-square:before{content:"\f169"}.fa-youtube-play:before{content:"\f16a"}.fa-dropbox:before{content:"\f16b"}.fa-stack-overflow:before{content:"\f16c"}.fa-instagram:before{content:"\f16d"}.fa-flickr:before{content:"\f16e"}.fa-adn:before{content:"\f170"}.fa-bitbucket:before{content:"\f171"}.fa-bitbucket-square:before{content:"\f172"}.fa-tumblr:before{content:"\f173"}.fa-tumblr-square:before{content:"\f174"}.fa-long-arrow-down:before{content:"\f175"}.fa-long-arrow-up:before{content:"\f176"}.fa-long-arrow-left:before{content:"\f177"}.fa-long-arrow-right:before{content:"\f178"}.fa-apple:before{content:"\f179"}.fa-windows:before{content:"\f17a"}.fa-android:before{content:"\f17b"}.fa-linux:before{content:"\f17c"}.fa-dribbble:before{content:"\f17d"}.fa-skype:before{content:"\f17e"}.fa-foursquare:befo
re{content:"\f180"}.fa-trello:before{content:"\f181"}.fa-female:before{content:"\f182"}.fa-male:before{content:"\f183"}.fa-gittip:before,.fa-gratipay:before{content:"\f184"}.fa-sun-o:before{content:"\f185"}.fa-moon-o:before{content:"\f186"}.fa-archive:before{content:"\f187"}.fa-bug:before{content:"\f188"}.fa-vk:before{content:"\f189"}.fa-weibo:before{content:"\f18a"}.fa-renren:before{content:"\f18b"}.fa-pagelines:before{content:"\f18c"}.fa-stack-exchange:before{content:"\f18d"}.fa-arrow-circle-o-right:before{content:"\f18e"}.fa-arrow-circle-o-left:before{content:"\f190"}.fa-toggle-left:before,.fa-caret-square-o-left:before{content:"\f191"}.fa-dot-circle-o:before{content:"\f192"}.fa-wheelchair:before{content:"\f193"}.fa-vimeo-square:before{content:"\f194"}.fa-turkish-lira:before,.fa-try:before{content:"\f195"}.fa-plus-square-o:before{content:"\f196"}.fa-space-shuttle:before{content:"\f197"}.fa-slack:before{content:"\f198"}.fa-envelope-square:before{content:"\f199"}.fa-wordpress:before{content:"\f19a"}.fa-openid:before{content:"\f19b"}.fa-institution:before,.fa-bank:before,.fa-university:before{content:"\f19c"}.fa-mortar-board:before,.fa-graduation-cap:before{content:"\f19d"}.fa-yahoo:before{content:"\f19e"}.fa-google:before{content:"\f1a0"}.fa-reddit:before{content:"\f1a1"}.fa-reddit-square:before{content:"\f1a2"}.fa-stumbleupon-circle:before{content:"\f1a3"}.fa-stumbleupon:before{content:"\f1a4"}.fa-delicious:before{content:"\f1a5"}.fa-digg:before{content:"\f1a6"}.fa-pied-piper-pp:before{content:"\f1a7"}.fa-pied-piper-alt:before{content:"\f1a8"}.fa-drupal:before{content:"\f1a9"}.fa-joomla:before{content:"\f1aa"}.fa-language:before{content:"\f1ab"}.fa-fax:before{content:"\f1ac"}.fa-building:before{content:"\f1ad"}.fa-child:before{content:"\f1ae"}.fa-paw:before{content:"\f1b0"}.fa-spoon:before{content:"\f1b1"}.fa-cube:before{content:"\f1b2"}.fa-cubes:before{content:"\f1b3"}.fa-behance:before{content:"\f1b4"}.fa-behance-square:before{content:"\f1b5"}.fa-steam:before{co
ntent:"\f1b6"}.fa-steam-square:before{content:"\f1b7"}.fa-recycle:before{content:"\f1b8"}.fa-automobile:before,.fa-car:before{content:"\f1b9"}.fa-cab:before,.fa-taxi:before{content:"\f1ba"}.fa-tree:before{content:"\f1bb"}.fa-spotify:before{content:"\f1bc"}.fa-deviantart:before{content:"\f1bd"}.fa-soundcloud:before{content:"\f1be"}.fa-database:before{content:"\f1c0"}.fa-file-pdf-o:before{content:"\f1c1"}.fa-file-word-o:before{content:"\f1c2"}.fa-file-excel-o:before{content:"\f1c3"}.fa-file-powerpoint-o:before{content:"\f1c4"}.fa-file-photo-o:before,.fa-file-picture-o:before,.fa-file-image-o:before{content:"\f1c5"}.fa-file-zip-o:before,.fa-file-archive-o:before{content:"\f1c6"}.fa-file-sound-o:before,.fa-file-audio-o:before{content:"\f1c7"}.fa-file-movie-o:before,.fa-file-video-o:before{content:"\f1c8"}.fa-file-code-o:before{content:"\f1c9"}.fa-vine:before{content:"\f1ca"}.fa-codepen:before{content:"\f1cb"}.fa-jsfiddle:before{content:"\f1cc"}.fa-life-bouy:before,.fa-life-buoy:before,.fa-life-saver:before,.fa-support:before,.fa-life-ring:before{content:"\f1cd"}.fa-circle-o-notch:before{content:"\f1ce"}.fa-ra:before,.fa-resistance:before,.fa-rebel:before{content:"\f1d0"}.fa-ge:before,.fa-empire:before{content:"\f1d1"}.fa-git-square:before{content:"\f1d2"}.fa-git:before{content:"\f1d3"}.fa-y-combinator-square:before,.fa-yc-square:before,.fa-hacker-news:before{content:"\f1d4"}.fa-tencent-weibo:before{content:"\f1d5"}.fa-qq:before{content:"\f1d6"}.fa-wechat:before,.fa-weixin:before{content:"\f1d7"}.fa-send:before,.fa-paper-plane:before{content:"\f1d8"}.fa-send-o:before,.fa-paper-plane-o:before{content:"\f1d9"}.fa-history:before{content:"\f1da"}.fa-circle-thin:before{content:"\f1db"}.fa-header:before{content:"\f1dc"}.fa-paragraph:before{content:"\f1dd"}.fa-sliders:before{content:"\f1de"}.fa-share-alt:before{content:"\f1e0"}.fa-share-alt-square:before{content:"\f1e1"}.fa-bomb:before{content:"\f1e2"}.fa-soccer-ball-o:before,.fa-futbol-o:before{content:"\f1e3"}.fa-tty:before{c
ontent:"\f1e4"}.fa-binoculars:before{content:"\f1e5"}.fa-plug:before{content:"\f1e6"}.fa-slideshare:before{content:"\f1e7"}.fa-twitch:before{content:"\f1e8"}.fa-yelp:before{content:"\f1e9"}.fa-newspaper-o:before{content:"\f1ea"}.fa-wifi:before{content:"\f1eb"}.fa-calculator:before{content:"\f1ec"}.fa-paypal:before{content:"\f1ed"}.fa-google-wallet:before{content:"\f1ee"}.fa-cc-visa:before{content:"\f1f0"}.fa-cc-mastercard:before{content:"\f1f1"}.fa-cc-discover:before{content:"\f1f2"}.fa-cc-amex:before{content:"\f1f3"}.fa-cc-paypal:before{content:"\f1f4"}.fa-cc-stripe:before{content:"\f1f5"}.fa-bell-slash:before{content:"\f1f6"}.fa-bell-slash-o:before{content:"\f1f7"}.fa-trash:before{content:"\f1f8"}.fa-copyright:before{content:"\f1f9"}.fa-at:before{content:"\f1fa"}.fa-eyedropper:before{content:"\f1fb"}.fa-paint-brush:before{content:"\f1fc"}.fa-birthday-cake:before{content:"\f1fd"}.fa-area-chart:before{content:"\f1fe"}.fa-pie-chart:before{content:"\f200"}.fa-line-chart:before{content:"\f201"}.fa-lastfm:before{content:"\f202"}.fa-lastfm-square:before{content:"\f203"}.fa-toggle-off:before{content:"\f204"}.fa-toggle-on:before{content:"\f205"}.fa-bicycle:before{content:"\f206"}.fa-bus:before{content:"\f207"}.fa-ioxhost:before{content:"\f208"}.fa-angellist:before{content:"\f209"}.fa-cc:before{content:"\f20a"}.fa-shekel:before,.fa-sheqel:before,.fa-ils:before{content:"\f20b"}.fa-meanpath:before{content:"\f20c"}.fa-buysellads:before{content:"\f20d"}.fa-connectdevelop:before{content:"\f20e"}.fa-dashcube:before{content:"\f210"}.fa-forumbee:before{content:"\f211"}.fa-leanpub:before{content:"\f212"}.fa-sellsy:before{content:"\f213"}.fa-shirtsinbulk:before{content:"\f214"}.fa-simplybuilt:before{content:"\f215"}.fa-skyatlas:before{content:"\f216"}.fa-cart-plus:before{content:"\f217"}.fa-cart-arrow-down:before{content:"\f218"}.fa-diamond:before{content:"\f219"}.fa-ship:before{content:"\f21a"}.fa-user-secret:before{content:"\f21b"}.fa-motorcycle:before{content:"\f21c"}.fa-street-vi
ew:before{content:"\f21d"}.fa-heartbeat:before{content:"\f21e"}.fa-venus:before{content:"\f221"}.fa-mars:before{content:"\f222"}.fa-mercury:before{content:"\f223"}.fa-intersex:before,.fa-transgender:before{content:"\f224"}.fa-transgender-alt:before{content:"\f225"}.fa-venus-double:before{content:"\f226"}.fa-mars-double:before{content:"\f227"}.fa-venus-mars:before{content:"\f228"}.fa-mars-stroke:before{content:"\f229"}.fa-mars-stroke-v:before{content:"\f22a"}.fa-mars-stroke-h:before{content:"\f22b"}.fa-neuter:before{content:"\f22c"}.fa-genderless:before{content:"\f22d"}.fa-facebook-official:before{content:"\f230"}.fa-pinterest-p:before{content:"\f231"}.fa-whatsapp:before{content:"\f232"}.fa-server:before{content:"\f233"}.fa-user-plus:before{content:"\f234"}.fa-user-times:before{content:"\f235"}.fa-hotel:before,.fa-bed:before{content:"\f236"}.fa-viacoin:before{content:"\f237"}.fa-train:before{content:"\f238"}.fa-subway:before{content:"\f239"}.fa-medium:before{content:"\f23a"}.fa-yc:before,.fa-y-combinator:before{content:"\f23b"}.fa-optin-monster:before{content:"\f23c"}.fa-opencart:before{content:"\f23d"}.fa-expeditedssl:before{content:"\f23e"}.fa-battery-4:before,.fa-battery:before,.fa-battery-full:before{content:"\f240"}.fa-battery-3:before,.fa-battery-three-quarters:before{content:"\f241"}.fa-battery-2:before,.fa-battery-half:before{content:"\f242"}.fa-battery-1:before,.fa-battery-quarter:before{content:"\f243"}.fa-battery-0:before,.fa-battery-empty:before{content:"\f244"}.fa-mouse-pointer:before{content:"\f245"}.fa-i-cursor:before{content:"\f246"}.fa-object-group:before{content:"\f247"}.fa-object-ungroup:before{content:"\f248"}.fa-sticky-note:before{content:"\f249"}.fa-sticky-note-o:before{content:"\f24a"}.fa-cc-jcb:before{content:"\f24b"}.fa-cc-diners-club:before{content:"\f24c"}.fa-clone:before{content:"\f24d"}.fa-balance-scale:before{content:"\f24e"}.fa-hourglass-o:before{content:"\f250"}.fa-hourglass-1:before,.fa-hourglass-start:before{content:"\f251"}.fa-hourg
lass-2:before,.fa-hourglass-half:before{content:"\f252"}.fa-hourglass-3:before,.fa-hourglass-end:before{content:"\f253"}.fa-hourglass:before{content:"\f254"}.fa-hand-grab-o:before,.fa-hand-rock-o:before{content:"\f255"}.fa-hand-stop-o:before,.fa-hand-paper-o:before{content:"\f256"}.fa-hand-scissors-o:before{content:"\f257"}.fa-hand-lizard-o:before{content:"\f258"}.fa-hand-spock-o:before{content:"\f259"}.fa-hand-pointer-o:before{content:"\f25a"}.fa-hand-peace-o:before{content:"\f25b"}.fa-trademark:before{content:"\f25c"}.fa-registered:before{content:"\f25d"}.fa-creative-commons:before{content:"\f25e"}.fa-gg:before{content:"\f260"}.fa-gg-circle:before{content:"\f261"}.fa-tripadvisor:before{content:"\f262"}.fa-odnoklassniki:before{content:"\f263"}.fa-odnoklassniki-square:before{content:"\f264"}.fa-get-pocket:before{content:"\f265"}.fa-wikipedia-w:before{content:"\f266"}.fa-safari:before{content:"\f267"}.fa-chrome:before{content:"\f268"}.fa-firefox:before{content:"\f269"}.fa-opera:before{content:"\f26a"}.fa-internet-explorer:before{content:"\f26b"}.fa-tv:before,.fa-television:before{content:"\f26c"}.fa-contao:before{content:"\f26d"}.fa-500px:before{content:"\f26e"}.fa-amazon:before{content:"\f270"}.fa-calendar-plus-o:before{content:"\f271"}.fa-calendar-minus-o:before{content:"\f272"}.fa-calendar-times-o:before{content:"\f273"}.fa-calendar-check-o:before{content:"\f274"}.fa-industry:before{content:"\f275"}.fa-map-pin:before{content:"\f276"}.fa-map-signs:before{content:"\f277"}.fa-map-o:before{content:"\f278"}.fa-map:before{content:"\f279"}.fa-commenting:before{content:"\f27a"}.fa-commenting-o:before{content:"\f27b"}.fa-houzz:before{content:"\f27c"}.fa-vimeo:before{content:"\f27d"}.fa-black-tie:before{content:"\f27e"}.fa-fonticons:before{content:"\f280"}.fa-reddit-alien:before{content:"\f281"}.fa-edge:before{content:"\f282"}.fa-credit-card-alt:before{content:"\f283"}.fa-codiepie:before{content:"\f284"}.fa-modx:before{content:"\f285"}.fa-fort-awesome:before{content:"\f286"
}.fa-usb:before{content:"\f287"}.fa-product-hunt:before{content:"\f288"}.fa-mixcloud:before{content:"\f289"}.fa-scribd:before{content:"\f28a"}.fa-pause-circle:before{content:"\f28b"}.fa-pause-circle-o:before{content:"\f28c"}.fa-stop-circle:before{content:"\f28d"}.fa-stop-circle-o:before{content:"\f28e"}.fa-shopping-bag:before{content:"\f290"}.fa-shopping-basket:before{content:"\f291"}.fa-hashtag:before{content:"\f292"}.fa-bluetooth:before{content:"\f293"}.fa-bluetooth-b:before{content:"\f294"}.fa-percent:before{content:"\f295"}.fa-gitlab:before{content:"\f296"}.fa-wpbeginner:before{content:"\f297"}.fa-wpforms:before{content:"\f298"}.fa-envira:before{content:"\f299"}.fa-universal-access:before{content:"\f29a"}.fa-wheelchair-alt:before{content:"\f29b"}.fa-question-circle-o:before{content:"\f29c"}.fa-blind:before{content:"\f29d"}.fa-audio-description:before{content:"\f29e"}.fa-volume-control-phone:before{content:"\f2a0"}.fa-braille:before{content:"\f2a1"}.fa-assistive-listening-systems:before{content:"\f2a2"}.fa-asl-interpreting:before,.fa-american-sign-language-interpreting:before{content:"\f2a3"}.fa-deafness:before,.fa-hard-of-hearing:before,.fa-deaf:before{content:"\f2a4"}.fa-glide:before{content:"\f2a5"}.fa-glide-g:before{content:"\f2a6"}.fa-signing:before,.fa-sign-language:before{content:"\f2a7"}.fa-low-vision:before{content:"\f2a8"}.fa-viadeo:before{content:"\f2a9"}.fa-viadeo-square:before{content:"\f2aa"}.fa-snapchat:before{content:"\f2ab"}.fa-snapchat-ghost:before{content:"\f2ac"}.fa-snapchat-square:before{content:"\f2ad"}.fa-pied-piper:before{content:"\f2ae"}.fa-first-order:before{content:"\f2b0"}.fa-yoast:before{content:"\f2b1"}.fa-themeisle:before{content:"\f2b2"}.fa-google-plus-circle:before,.fa-google-plus-official:before{content:"\f2b3"}.fa-fa:before,.fa-font-awesome:before{content:"\f2b4"}.fa-handshake-o:before{content:"\f2b5"}.fa-envelope-open:before{content:"\f2b6"}.fa-envelope-open-o:before{content:"\f2b7"}.fa-linode:before{content:"\f2b8"}.fa-address
-book:before{content:"\f2b9"}.fa-address-book-o:before{content:"\f2ba"}.fa-vcard:before,.fa-address-card:before{content:"\f2bb"}.fa-vcard-o:before,.fa-address-card-o:before{content:"\f2bc"}.fa-user-circle:before{content:"\f2bd"}.fa-user-circle-o:before{content:"\f2be"}.fa-user-o:before{content:"\f2c0"}.fa-id-badge:before{content:"\f2c1"}.fa-drivers-license:before,.fa-id-card:before{content:"\f2c2"}.fa-drivers-license-o:before,.fa-id-card-o:before{content:"\f2c3"}.fa-quora:before{content:"\f2c4"}.fa-free-code-camp:before{content:"\f2c5"}.fa-telegram:before{content:"\f2c6"}.fa-thermometer-4:before,.fa-thermometer:before,.fa-thermometer-full:before{content:"\f2c7"}.fa-thermometer-3:before,.fa-thermometer-three-quarters:before{content:"\f2c8"}.fa-thermometer-2:before,.fa-thermometer-half:before{content:"\f2c9"}.fa-thermometer-1:before,.fa-thermometer-quarter:before{content:"\f2ca"}.fa-thermometer-0:before,.fa-thermometer-empty:before{content:"\f2cb"}.fa-shower:before{content:"\f2cc"}.fa-bathtub:before,.fa-s15:before,.fa-bath:before{content:"\f2cd"}.fa-podcast:before{content:"\f2ce"}.fa-window-maximize:before{content:"\f2d0"}.fa-window-minimize:before{content:"\f2d1"}.fa-window-restore:before{content:"\f2d2"}.fa-times-rectangle:before,.fa-window-close:before{content:"\f2d3"}.fa-times-rectangle-o:before,.fa-window-close-o:before{content:"\f2d4"}.fa-bandcamp:before{content:"\f2d5"}.fa-grav:before{content:"\f2d6"}.fa-etsy:before{content:"\f2d7"}.fa-imdb:before{content:"\f2d8"}.fa-ravelry:before{content:"\f2d9"}.fa-eercast:before{content:"\f2da"}.fa-microchip:before{content:"\f2db"}.fa-snowflake-o:before{content:"\f2dc"}.fa-superpowers:before{content:"\f2dd"}.fa-wpexplorer:before{content:"\f2de"}.fa-meetup:before{content:"\f2e0"}.sr-only{position:absolute;width:1px;height:1px;padding:0;margin:-1px;overflow:hidden;clip:rect(0, 0, 0, 0);border:0}.sr-only-focusable:active,.sr-only-focusable:focus{position:static;width:auto;height:auto;margin:0;overflow:visible;clip:auto} diff 
--git a/demos/streaming_asr_server/web/static/css/style.css b/demos/streaming_asr_server/web/static/css/style.css deleted file mode 100644 index a3040718b8f1caa8fed98832b8c82778b0003a9f..0000000000000000000000000000000000000000 --- a/demos/streaming_asr_server/web/static/css/style.css +++ /dev/null @@ -1,453 +0,0 @@ -/* -* @Author: baipengxia -* @Date: 2021-03-12 11:44:28 -* @Last Modified by: baipengxia -* @Last Modified time: 2021-03-12 15:14:24 -*/ - -/** COMMON RESET **/ -* { - -webkit-tap-highlight-color: rgba(0, 0, 0, 0); -} - -body, -h1, -h2, -h3, -h4, -h5, -h6, -hr, -p, -dl, -dt, -dd, -ul, -ol, -li, -fieldset, -lengend, -button, -input, -textarea, -th, -td { - margin: 0; - padding: 0; - color: #000; -} - -body { - font-size: 14px; -} -html, body { - min-width: 1200px; -} - -button, -input, -select, -textarea { - font-size: 14px; -} - -h1 { - font-size: 18px; -} - -h2 { - font-size: 14px; -} - -h3 { - font-size: 14px; -} - -ul, -ol, -li { - list-style: none; -} - -a { - text-decoration: none; -} - -a:hover { - text-decoration: none; -} - -fieldset, -img { - border: none; -} - -table { - border-collapse: collapse; - border-spacing: 0; -} - -i { - font-style: normal; -} - -label { - position: inherit; -} - -.clearfix:after { - content: "."; - display: block; - height: 0; - clear: both; - visibility: hidden; -} - -.clearfix { - zoom: 1; - display: block; -} - -html, -body { - font-family: Tahoma, Arial, 'microsoft yahei', 'Roboto', 'Droid Sans', 'Helvetica Neue', 'Droid Sans Fallback', 'Heiti SC', 'Hiragino Sans GB', 'Simsun', 'sans-self'; -} - - - -.audio-banner { - width: 100%; - overflow: auto; - padding: 0; - background: url('../image/voice-dictation.svg'); - background-size: cover; -} -.weaper { - width: 1200px; - height: 155px; - margin: 72px auto; -} -.text-content { - width: 670px; - height: 100%; - float: left; -} -.text-content .title { - font-size: 34px; - font-family: 'PingFangSC-Medium'; - font-weight: 500; - color: rgba(255, 255, 255, 1); - 
line-height: 48px; -} -.text-content .con { - font-size: 16px; - font-family: PingFangSC-Light; - font-weight: 300; - color: rgba(255, 255, 255, 1); - line-height: 30px; -} -.img-con { - width: 416px; - height: 100%; - float: right; -} -.img-con img { - width: 100%; - height: 100%; -} -.con-container { - margin-top: 34px; -} - -.audio-advantage { - background: #f8f9fa; -} -.asr-advantage { - width: 1200px; - margin: 0 auto; -} -.asr-advantage h2 { - text-align: center; - font-size: 22px; - padding: 30px 0 0 0; -} -.asr-advantage > ul > li { - box-sizing: border-box; - padding: 0 16px; - width: 33%; - text-align: center; - margin-bottom: 35px; -} -.asr-advantage > ul > li .icons{ - margin-top: 10px; - margin-bottom: 20px; - width: 42px; - height: 42px; -} -.service-item-content { - margin-top: 35px; - display: flex; - justify-content: center; - flex-wrap: wrap; -} -.service-item-content img { - width: 160px; - vertical-align: bottom; -} -.service-item-content > li { - box-sizing: border-box; - padding: 0 16px; - width: 33%; - text-align: center; - margin-bottom: 35px; -} -.service-item-content > li .service-item-content-title { - line-height: 1.5; - font-weight: 700; - margin-top: 10px; -} -.service-item-content > li .service-item-content-desc { - margin-top: 5px; - line-height: 1.8; - color: #657384; -} - - -.audio-scene-con { - width: 100%; - padding-bottom: 84px; - background: #fff; -} -.audio-scene { - overflow: auto; - width: 1200px; - background: #fff; - text-align: center; - padding: 0; - margin: 0 auto; -} -.audio-scene h2 { - padding: 30px 0 0 0; - font-size: 22px; - text-align: center; -} - -.audio-experience { - width: 100%; - height: 538px; - background: #fff; - padding: 0; - margin: 0; - overflow: auto; -} -.asr-box { - width: 1200px; - height: 394px; - margin: 64px auto; -} -.asr-box h2 { - font-size: 22px; - text-align: center; - margin-bottom: 64px; -} -.voice-container { - position: relative; - width: 1200px; - height: 308px; - background: rgba(255, 
255, 255, 1); - border-radius: 8px; - border: 1px solid rgba(225, 225, 225, 1); -} -.voice-container .voice { - height: 236px; - width: 100%; - border-radius: 8px; -} -.voice-container .voice textarea { - height: 100%; - width: 100%; - border: none; - outline: none; - border-radius: 8px; - padding: 25px; - font-size: 14px; - box-sizing: border-box; - resize: none; -} -.voice-input { - width: 100%; - height: 72px; - box-sizing: border-box; - padding-left: 35px; - background: rgba(242, 244, 245, 1); - border-radius: 8px; - line-height: 72px; -} -.voice-input .el-select { - width: 492px; -} -.start-voice { - display: inline-block; - margin-left: 10px; -} -.start-voice .time { - margin-right: 25px; -} -.asr-advantage > ul > li { - margin-bottom: 77px; -} -#msg { - width: 100%; - line-height: 40px; - font-size: 14px; - margin-left: 330px; -} -#captcha { - margin-left: 350px !important; - display: inline-block; - position: relative; -} -.black { - position: fixed; - width: 100%; - height: 100%; - z-index: 5; - background: rgba(0, 0, 0, 0.5); - top: 0; - left: 0; -} -.container { - position: fixed; - z-index: 6; - top: 25%; - left: 10%; -} -.audio-scene-con { - width: 100%; - padding-bottom: 84px; - background: #fff; -} -#sound { - color: #fff; - cursor: pointer; - background: #147ede; - padding: 10px; - margin-top: 30px; - margin-left: 135px; - width: 176px; - height: 30px !important; - text-align: center; - line-height: 30px !important; - border-radius: 10px; -} -.con-ten { - position: absolute; - width: 100%; - height: 100%; - z-index: 5; - background: #fff; - opacity: 0.5; - top: 0; - left: 0; -} -.websocket-url { - width: 320px; - height: 20px; - border: 1px solid #dcdfe6; - line-height: 20px; - padding: 10px; - border-radius: 4px; -} -.voice-btn { - color: #fff; - background-color: #409eff; - font-weight: 500; - padding: 12px 20px; - font-size: 14px; - border-radius: 4px; - border: 0; - cursor: pointer; -} -.voice-btn.end { - display: none; -} -.result-text { - 
background: #fff; - padding: 20px; -} -.voice-footer { - border-top: 1px solid #dddede; - background: #f7f9fa; - text-align: center; - margin-bottom: 8px; - color: #333; - font-size: 12px; - padding: 20px 0; -} - -/** line animate **/ -.time-box { - display: none; - margin-left: 10px; - width: 300px; -} -.total-time { - font-size: 14px; - color: #545454; -} -.voice-btn.end.show, -.time-box.show { - display: inline; -} -.start-taste-line { - margin-right: 20px; - display: inline-block; -} -.start-taste-line hr { - background-color: #187cff; - width: 3px; - height: 8px; - margin: 0 3px; - display: inline-block; - border: none; -} -.hr { - animation: note 0.2s ease-in-out; - animation-iteration-count: infinite; - animation-direction: alternate; -} -.hr-one { - animation-delay: -0.9s; -} -.hr-two { - animation-delay: -0.8s; -} -.hr-three { - animation-delay: -0.7s; -} -.hr-four { - animation-delay: -0.6s; -} -.hr-five { - animation-delay: -0.5s; -} -.hr-six { - animation-delay: -0.4s; -} -.hr-seven { - animation-delay: -0.3s; -} -.hr-eight { - animation-delay: -0.2s; -} -.hr-nine { - animation-delay: -0.1s; -} -@keyframes note { - from { - transform: scaleY(1); - } - to { - transform: scaleY(4); - } -} \ No newline at end of file diff --git a/demos/streaming_asr_server/web/static/fonts/FontAwesome.otf b/demos/streaming_asr_server/web/static/fonts/FontAwesome.otf deleted file mode 100644 index 401ec0f36e4f73b8efa40bd6f604fe80d286db70..0000000000000000000000000000000000000000 Binary files a/demos/streaming_asr_server/web/static/fonts/FontAwesome.otf and /dev/null differ diff --git a/demos/streaming_asr_server/web/static/fonts/fontawesome-webfont.eot b/demos/streaming_asr_server/web/static/fonts/fontawesome-webfont.eot deleted file mode 100644 index e9f60ca953f93e35eab4108bd414bc02ddcf3928..0000000000000000000000000000000000000000 Binary files a/demos/streaming_asr_server/web/static/fonts/fontawesome-webfont.eot and /dev/null differ diff --git 
a/demos/streaming_asr_server/web/static/fonts/fontawesome-webfont.svg b/demos/streaming_asr_server/web/static/fonts/fontawesome-webfont.svg deleted file mode 100644 index 6cd0326be380a32c3193c42e1879b7a6c6cf527e..0000000000000000000000000000000000000000 --- a/demos/streaming_asr_server/web/static/fonts/fontawesome-webfont.svg +++ /dev/null @@ -1,1951 +0,0 @@ - - - - -Created by FontForge 20120731 at Mon Oct 24 17:37:40 2016 - By ,,, -Copyright Dave Gandy 2016. All rights reserveddiff --git a/demos/streaming_asr_server/web/static/fonts/fontawesome-webfont.ttf b/demos/streaming_asr_server/web/static/fonts/fontawesome-webfont.ttf deleted file mode 100644 index 35acda2fa1196aad98c2adf4378a7611dd713aa3..0000000000000000000000000000000000000000 Binary files a/demos/streaming_asr_server/web/static/fonts/fontawesome-webfont.ttf and /dev/null differ diff --git a/demos/streaming_asr_server/web/static/fonts/fontawesome-webfont.woff b/demos/streaming_asr_server/web/static/fonts/fontawesome-webfont.woff deleted file mode 100644 index 400014a4b06eee3d0c0d54402a47ab2601b2862b..0000000000000000000000000000000000000000 Binary files a/demos/streaming_asr_server/web/static/fonts/fontawesome-webfont.woff and /dev/null differ diff --git a/demos/streaming_asr_server/web/static/fonts/fontawesome-webfont.woff2 b/demos/streaming_asr_server/web/static/fonts/fontawesome-webfont.woff2 deleted file mode 100644 index 4d13fc60404b91e398a37200c4a77b645cfd9586..0000000000000000000000000000000000000000 Binary files a/demos/streaming_asr_server/web/static/fonts/fontawesome-webfont.woff2 and /dev/null differ diff --git a/demos/streaming_asr_server/web/static/image/PaddleSpeech_logo.png b/demos/streaming_asr_server/web/static/image/PaddleSpeech_logo.png deleted file mode 100644 index fb25277540c9023c8a7d010e22e7e033ad0d74d7..0000000000000000000000000000000000000000 Binary files a/demos/streaming_asr_server/web/static/image/PaddleSpeech_logo.png and /dev/null differ diff --git 
a/demos/streaming_asr_server/web/static/image/voice-dictation.svg b/demos/streaming_asr_server/web/static/image/voice-dictation.svg deleted file mode 100644 index d35971499ddfed4ab0016419fb87e8d6a0d695cc..0000000000000000000000000000000000000000 --- a/demos/streaming_asr_server/web/static/image/voice-dictation.svg +++ /dev/null @@ -1,94 +0,0 @@ - - - - 背景 - Created with Sketch. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/demos/streaming_asr_server/web/static/js/SoundRecognizer.js b/demos/streaming_asr_server/web/static/js/SoundRecognizer.js deleted file mode 100644 index 5ef3d2e89dc27945d9e356b3c9eb5519f9cea69a..0000000000000000000000000000000000000000 --- a/demos/streaming_asr_server/web/static/js/SoundRecognizer.js +++ /dev/null @@ -1,133 +0,0 @@ -SoundRecognizer = { - rec: null, - wave: null, - SampleRate: 16000, - testBitRate: 16, - isCloseRecorder: false, - SendInterval: 300, - realTimeSendTryType: 'pcm', - realTimeSendTryEncBusy: 0, - realTimeSendTryTime: 0, - realTimeSendTryNumber: 0, - transferUploadNumberMax: 0, - realTimeSendTryChunk: null, - soundType: "pcm", - init: function (config) { - this.soundType = config.soundType || 'pcm'; - this.SampleRate = config.sampleRate || 16000; - this.recwaveElm = config.recwaveElm || ''; - this.TransferUpload = config.translerCallBack || this.TransferProcess; - this.initRecorder(); - }, - RealTimeSendTryReset: function (type) { - this.realTimeSendTryType = type; - this.realTimeSendTryTime = 0; - }, - RealTimeSendTry: function (rec, isClose) { - var that = this; - var t1 = Date.now(), endT = 0, recImpl = Recorder.prototype; - if (this.realTimeSendTryTime == 0) { - this.realTimeSendTryTime = t1; - this.realTimeSendTryEncBusy = 0; - this.realTimeSendTryNumber = 0; - this.transferUploadNumberMax = 0; - this.realTimeSendTryChunk = null; - } - if 
(!isClose && t1 - this.realTimeSendTryTime < this.SendInterval) { - return;//控制缓冲达到指定间隔才进行传输 - } - this.realTimeSendTryTime = t1; - var number = ++this.realTimeSendTryNumber; - - //借用SampleData函数进行数据的连续处理,采样率转换是顺带的 - var chunk = Recorder.SampleData(rec.buffers, rec.srcSampleRate, this.SampleRate, this.realTimeSendTryChunk, { frameType: isClose ? "" : this.realTimeSendTryType }); - - //清理已处理完的缓冲数据,释放内存以支持长时间录音,最后完成录音时不能调用stop,因为数据已经被清掉了 - for (var i = this.realTimeSendTryChunk ? this.realTimeSendTryChunk.index : 0; i < chunk.index; i++) { - rec.buffers[i] = null; - } - this.realTimeSendTryChunk = chunk; - - //没有新数据,或结束时的数据量太小,不能进行mock转码 - if (chunk.data.length == 0 || isClose && chunk.data.length < 2000) { - this.TransferUpload(number, null, 0, null, isClose); - return; - } - //实时编码队列阻塞处理 - if (!isClose) { - if (this.realTimeSendTryEncBusy >= 2) { - console.log("编码队列阻塞,已丢弃一帧", 1); - return; - } - } - this.realTimeSendTryEncBusy++; - - //通过mock方法实时转码成mp3、wav - var encStartTime = Date.now(); - var recMock = Recorder({ - type: this.realTimeSendTryType - , sampleRate: this.SampleRate //采样率 - , bitRate: this.testBitRate //比特率 - }); - recMock.mock(chunk.data, chunk.sampleRate); - recMock.stop(function (blob, duration) { - that.realTimeSendTryEncBusy && (that.realTimeSendTryEncBusy--); - blob.encTime = Date.now() - encStartTime; - - //转码好就推入传输 - that.TransferUpload(number, blob, duration, recMock, isClose); - }, function (msg) { - that.realTimeSendTryEncBusy && (that.realTimeSendTryEncBusy--); - //转码错误?没想到什么时候会产生错误! 
- console.log("不应该出现的错误:" + msg, 1); - }); - }, - recordClose: function () { - try { - this.rec.close(function () { - this.isCloseRecorder = true; - }); - this.RealTimeSendTry(this.rec, true);//最后一次发送 - } catch (ex) { - // recordClose(); - } - }, - recordEnd: function () { - try { - this.rec.stop(function (blob, time) { - this.recordClose(); - }, function (s) { - this.recordClose(); - }); - } catch (ex) { - } - }, - initRecorder: function () { - var that = this; - var rec = Recorder({ - type: that.soundType - , bitRate: that.testBitRate - , sampleRate: that.SampleRate - , onProcess: function (buffers, level, time, sampleRate) { - that.wave.input(buffers[buffers.length - 1], level, sampleRate); - that.RealTimeSendTry(rec, false);//推入实时处理,因为是unknown格式,这里简化函数调用,没有用到buffers和bufferSampleRate,因为这些数据和rec.buffers是完全相同的。 - } - }); - - rec.open(function () { - that.wave = Recorder.FrequencyHistogramView({ - elem: that.recwaveElm, lineCount: 90 - , position: 0 - , minHeight: 1 - , stripeEnable: false - }); - rec.start(); - that.isCloseRecorder = false; - that.RealTimeSendTryReset(that.soundType);//重置 - }); - this.rec = rec; - }, - TransferProcess: function (number, blobOrNull, duration, blobRec, isClose) { - - } -} \ No newline at end of file diff --git a/demos/streaming_asr_server/web/static/js/jquery-3.2.1.min.js b/demos/streaming_asr_server/web/static/js/jquery-3.2.1.min.js deleted file mode 100644 index 644d35e274fd64ddaf6d12af813e820c424176a9..0000000000000000000000000000000000000000 --- a/demos/streaming_asr_server/web/static/js/jquery-3.2.1.min.js +++ /dev/null @@ -1,4 +0,0 @@ -/*! 
jQuery v3.2.1 | (c) JS Foundation and other contributors | jquery.org/license */ -!function(a,b){"use strict";"object"==typeof module&&"object"==typeof module.exports?module.exports=a.document?b(a,!0):function(a){if(!a.document)throw new Error("jQuery requires a window with a document");return b(a)}:b(a)}("undefined"!=typeof window?window:this,function(a,b){"use strict";var c=[],d=a.document,e=Object.getPrototypeOf,f=c.slice,g=c.concat,h=c.push,i=c.indexOf,j={},k=j.toString,l=j.hasOwnProperty,m=l.toString,n=m.call(Object),o={};function p(a,b){b=b||d;var c=b.createElement("script");c.text=a,b.head.appendChild(c).parentNode.removeChild(c)}var q="3.2.1",r=function(a,b){return new r.fn.init(a,b)},s=/^[\s\uFEFF\xA0]+|[\s\uFEFF\xA0]+$/g,t=/^-ms-/,u=/-([a-z])/g,v=function(a,b){return b.toUpperCase()};r.fn=r.prototype={jquery:q,constructor:r,length:0,toArray:function(){return f.call(this)},get:function(a){return null==a?f.call(this):a<0?this[a+this.length]:this[a]},pushStack:function(a){var b=r.merge(this.constructor(),a);return b.prevObject=this,b},each:function(a){return r.each(this,a)},map:function(a){return this.pushStack(r.map(this,function(b,c){return a.call(b,c,b)}))},slice:function(){return this.pushStack(f.apply(this,arguments))},first:function(){return this.eq(0)},last:function(){return this.eq(-1)},eq:function(a){var b=this.length,c=+a+(a<0?b:0);return this.pushStack(c>=0&&c0&&b-1 in a)}var x=function(a){var b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u="sizzle"+1*new Date,v=a.document,w=0,x=0,y=ha(),z=ha(),A=ha(),B=function(a,b){return a===b&&(l=!0),0},C={}.hasOwnProperty,D=[],E=D.pop,F=D.push,G=D.push,H=D.slice,I=function(a,b){for(var c=0,d=a.length;c+~]|"+K+")"+K+"*"),S=new RegExp("="+K+"*([^\\]'\"]*?)"+K+"*\\]","g"),T=new RegExp(N),U=new RegExp("^"+L+"$"),V={ID:new RegExp("^#("+L+")"),CLASS:new RegExp("^\\.("+L+")"),TAG:new RegExp("^("+L+"|[*])"),ATTR:new RegExp("^"+M),PSEUDO:new RegExp("^"+N),CHILD:new 
RegExp("^:(only|first|last|nth|nth-last)-(child|of-type)(?:\\("+K+"*(even|odd|(([+-]|)(\\d*)n|)"+K+"*(?:([+-]|)"+K+"*(\\d+)|))"+K+"*\\)|)","i"),bool:new RegExp("^(?:"+J+")$","i"),needsContext:new RegExp("^"+K+"*[>+~]|:(even|odd|eq|gt|lt|nth|first|last)(?:\\("+K+"*((?:-\\d)?\\d*)"+K+"*\\)|)(?=[^-]|$)","i")},W=/^(?:input|select|textarea|button)$/i,X=/^h\d$/i,Y=/^[^{]+\{\s*\[native \w/,Z=/^(?:#([\w-]+)|(\w+)|\.([\w-]+))$/,$=/[+~]/,_=new RegExp("\\\\([\\da-f]{1,6}"+K+"?|("+K+")|.)","ig"),aa=function(a,b,c){var d="0x"+b-65536;return d!==d||c?b:d<0?String.fromCharCode(d+65536):String.fromCharCode(d>>10|55296,1023&d|56320)},ba=/([\0-\x1f\x7f]|^-?\d)|^-$|[^\0-\x1f\x7f-\uFFFF\w-]/g,ca=function(a,b){return b?"\0"===a?"\ufffd":a.slice(0,-1)+"\\"+a.charCodeAt(a.length-1).toString(16)+" ":"\\"+a},da=function(){m()},ea=ta(function(a){return a.disabled===!0&&("form"in a||"label"in a)},{dir:"parentNode",next:"legend"});try{G.apply(D=H.call(v.childNodes),v.childNodes),D[v.childNodes.length].nodeType}catch(fa){G={apply:D.length?function(a,b){F.apply(a,H.call(b))}:function(a,b){var c=a.length,d=0;while(a[c++]=b[d++]);a.length=c-1}}}function ga(a,b,d,e){var f,h,j,k,l,o,r,s=b&&b.ownerDocument,w=b?b.nodeType:9;if(d=d||[],"string"!=typeof a||!a||1!==w&&9!==w&&11!==w)return d;if(!e&&((b?b.ownerDocument||b:v)!==n&&m(b),b=b||n,p)){if(11!==w&&(l=Z.exec(a)))if(f=l[1]){if(9===w){if(!(j=b.getElementById(f)))return d;if(j.id===f)return d.push(j),d}else if(s&&(j=s.getElementById(f))&&t(b,j)&&j.id===f)return d.push(j),d}else{if(l[2])return G.apply(d,b.getElementsByTagName(a)),d;if((f=l[3])&&c.getElementsByClassName&&b.getElementsByClassName)return G.apply(d,b.getElementsByClassName(f)),d}if(c.qsa&&!A[a+" "]&&(!q||!q.test(a))){if(1!==w)s=b,r=a;else if("object"!==b.nodeName.toLowerCase()){(k=b.getAttribute("id"))?k=k.replace(ba,ca):b.setAttribute("id",k=u),o=g(a),h=o.length;while(h--)o[h]="#"+k+" "+sa(o[h]);r=o.join(","),s=$.test(a)&&qa(b.parentNode)||b}if(r)try{return 
G.apply(d,s.querySelectorAll(r)),d}catch(x){}finally{k===u&&b.removeAttribute("id")}}}return i(a.replace(P,"$1"),b,d,e)}function ha(){var a=[];function b(c,e){return a.push(c+" ")>d.cacheLength&&delete b[a.shift()],b[c+" "]=e}return b}function ia(a){return a[u]=!0,a}function ja(a){var b=n.createElement("fieldset");try{return!!a(b)}catch(c){return!1}finally{b.parentNode&&b.parentNode.removeChild(b),b=null}}function ka(a,b){var c=a.split("|"),e=c.length;while(e--)d.attrHandle[c[e]]=b}function la(a,b){var c=b&&a,d=c&&1===a.nodeType&&1===b.nodeType&&a.sourceIndex-b.sourceIndex;if(d)return d;if(c)while(c=c.nextSibling)if(c===b)return-1;return a?1:-1}function ma(a){return function(b){var c=b.nodeName.toLowerCase();return"input"===c&&b.type===a}}function na(a){return function(b){var c=b.nodeName.toLowerCase();return("input"===c||"button"===c)&&b.type===a}}function oa(a){return function(b){return"form"in b?b.parentNode&&b.disabled===!1?"label"in b?"label"in b.parentNode?b.parentNode.disabled===a:b.disabled===a:b.isDisabled===a||b.isDisabled!==!a&&ea(b)===a:b.disabled===a:"label"in b&&b.disabled===a}}function pa(a){return ia(function(b){return b=+b,ia(function(c,d){var e,f=a([],c.length,b),g=f.length;while(g--)c[e=f[g]]&&(c[e]=!(d[e]=c[e]))})})}function qa(a){return a&&"undefined"!=typeof a.getElementsByTagName&&a}c=ga.support={},f=ga.isXML=function(a){var b=a&&(a.ownerDocument||a).documentElement;return!!b&&"HTML"!==b.nodeName},m=ga.setDocument=function(a){var b,e,g=a?a.ownerDocument||a:v;return g!==n&&9===g.nodeType&&g.documentElement?(n=g,o=n.documentElement,p=!f(n),v!==n&&(e=n.defaultView)&&e.top!==e&&(e.addEventListener?e.addEventListener("unload",da,!1):e.attachEvent&&e.attachEvent("onunload",da)),c.attributes=ja(function(a){return a.className="i",!a.getAttribute("className")}),c.getElementsByTagName=ja(function(a){return 
a.appendChild(n.createComment("")),!a.getElementsByTagName("*").length}),c.getElementsByClassName=Y.test(n.getElementsByClassName),c.getById=ja(function(a){return o.appendChild(a).id=u,!n.getElementsByName||!n.getElementsByName(u).length}),c.getById?(d.filter.ID=function(a){var b=a.replace(_,aa);return function(a){return a.getAttribute("id")===b}},d.find.ID=function(a,b){if("undefined"!=typeof b.getElementById&&p){var c=b.getElementById(a);return c?[c]:[]}}):(d.filter.ID=function(a){var b=a.replace(_,aa);return function(a){var c="undefined"!=typeof a.getAttributeNode&&a.getAttributeNode("id");return c&&c.value===b}},d.find.ID=function(a,b){if("undefined"!=typeof b.getElementById&&p){var c,d,e,f=b.getElementById(a);if(f){if(c=f.getAttributeNode("id"),c&&c.value===a)return[f];e=b.getElementsByName(a),d=0;while(f=e[d++])if(c=f.getAttributeNode("id"),c&&c.value===a)return[f]}return[]}}),d.find.TAG=c.getElementsByTagName?function(a,b){return"undefined"!=typeof b.getElementsByTagName?b.getElementsByTagName(a):c.qsa?b.querySelectorAll(a):void 0}:function(a,b){var c,d=[],e=0,f=b.getElementsByTagName(a);if("*"===a){while(c=f[e++])1===c.nodeType&&d.push(c);return d}return f},d.find.CLASS=c.getElementsByClassName&&function(a,b){if("undefined"!=typeof b.getElementsByClassName&&p)return b.getElementsByClassName(a)},r=[],q=[],(c.qsa=Y.test(n.querySelectorAll))&&(ja(function(a){o.appendChild(a).innerHTML="",a.querySelectorAll("[msallowcapture^='']").length&&q.push("[*^$]="+K+"*(?:''|\"\")"),a.querySelectorAll("[selected]").length||q.push("\\["+K+"*(?:value|"+J+")"),a.querySelectorAll("[id~="+u+"-]").length||q.push("~="),a.querySelectorAll(":checked").length||q.push(":checked"),a.querySelectorAll("a#"+u+"+*").length||q.push(".#.+[+~]")}),ja(function(a){a.innerHTML="";var 
b=n.createElement("input");b.setAttribute("type","hidden"),a.appendChild(b).setAttribute("name","D"),a.querySelectorAll("[name=d]").length&&q.push("name"+K+"*[*^$|!~]?="),2!==a.querySelectorAll(":enabled").length&&q.push(":enabled",":disabled"),o.appendChild(a).disabled=!0,2!==a.querySelectorAll(":disabled").length&&q.push(":enabled",":disabled"),a.querySelectorAll("*,:x"),q.push(",.*:")})),(c.matchesSelector=Y.test(s=o.matches||o.webkitMatchesSelector||o.mozMatchesSelector||o.oMatchesSelector||o.msMatchesSelector))&&ja(function(a){c.disconnectedMatch=s.call(a,"*"),s.call(a,"[s!='']:x"),r.push("!=",N)}),q=q.length&&new RegExp(q.join("|")),r=r.length&&new RegExp(r.join("|")),b=Y.test(o.compareDocumentPosition),t=b||Y.test(o.contains)?function(a,b){var c=9===a.nodeType?a.documentElement:a,d=b&&b.parentNode;return a===d||!(!d||1!==d.nodeType||!(c.contains?c.contains(d):a.compareDocumentPosition&&16&a.compareDocumentPosition(d)))}:function(a,b){if(b)while(b=b.parentNode)if(b===a)return!0;return!1},B=b?function(a,b){if(a===b)return l=!0,0;var d=!a.compareDocumentPosition-!b.compareDocumentPosition;return d?d:(d=(a.ownerDocument||a)===(b.ownerDocument||b)?a.compareDocumentPosition(b):1,1&d||!c.sortDetached&&b.compareDocumentPosition(a)===d?a===n||a.ownerDocument===v&&t(v,a)?-1:b===n||b.ownerDocument===v&&t(v,b)?1:k?I(k,a)-I(k,b):0:4&d?-1:1)}:function(a,b){if(a===b)return l=!0,0;var c,d=0,e=a.parentNode,f=b.parentNode,g=[a],h=[b];if(!e||!f)return a===n?-1:b===n?1:e?-1:f?1:k?I(k,a)-I(k,b):0;if(e===f)return la(a,b);c=a;while(c=c.parentNode)g.unshift(c);c=b;while(c=c.parentNode)h.unshift(c);while(g[d]===h[d])d++;return d?la(g[d],h[d]):g[d]===v?-1:h[d]===v?1:0},n):n},ga.matches=function(a,b){return ga(a,null,null,b)},ga.matchesSelector=function(a,b){if((a.ownerDocument||a)!==n&&m(a),b=b.replace(S,"='$1']"),c.matchesSelector&&p&&!A[b+" "]&&(!r||!r.test(b))&&(!q||!q.test(b)))try{var d=s.call(a,b);if(d||c.disconnectedMatch||a.document&&11!==a.document.nodeType)return 
d}catch(e){}return ga(b,n,null,[a]).length>0},ga.contains=function(a,b){return(a.ownerDocument||a)!==n&&m(a),t(a,b)},ga.attr=function(a,b){(a.ownerDocument||a)!==n&&m(a);var e=d.attrHandle[b.toLowerCase()],f=e&&C.call(d.attrHandle,b.toLowerCase())?e(a,b,!p):void 0;return void 0!==f?f:c.attributes||!p?a.getAttribute(b):(f=a.getAttributeNode(b))&&f.specified?f.value:null},ga.escape=function(a){return(a+"").replace(ba,ca)},ga.error=function(a){throw new Error("Syntax error, unrecognized expression: "+a)},ga.uniqueSort=function(a){var b,d=[],e=0,f=0;if(l=!c.detectDuplicates,k=!c.sortStable&&a.slice(0),a.sort(B),l){while(b=a[f++])b===a[f]&&(e=d.push(f));while(e--)a.splice(d[e],1)}return k=null,a},e=ga.getText=function(a){var b,c="",d=0,f=a.nodeType;if(f){if(1===f||9===f||11===f){if("string"==typeof a.textContent)return a.textContent;for(a=a.firstChild;a;a=a.nextSibling)c+=e(a)}else if(3===f||4===f)return a.nodeValue}else while(b=a[d++])c+=e(b);return c},d=ga.selectors={cacheLength:50,createPseudo:ia,match:V,attrHandle:{},find:{},relative:{">":{dir:"parentNode",first:!0}," ":{dir:"parentNode"},"+":{dir:"previousSibling",first:!0},"~":{dir:"previousSibling"}},preFilter:{ATTR:function(a){return a[1]=a[1].replace(_,aa),a[3]=(a[3]||a[4]||a[5]||"").replace(_,aa),"~="===a[2]&&(a[3]=" "+a[3]+" "),a.slice(0,4)},CHILD:function(a){return a[1]=a[1].toLowerCase(),"nth"===a[1].slice(0,3)?(a[3]||ga.error(a[0]),a[4]=+(a[4]?a[5]+(a[6]||1):2*("even"===a[3]||"odd"===a[3])),a[5]=+(a[7]+a[8]||"odd"===a[3])):a[3]&&ga.error(a[0]),a},PSEUDO:function(a){var b,c=!a[6]&&a[2];return V.CHILD.test(a[0])?null:(a[3]?a[2]=a[4]||a[5]||"":c&&T.test(c)&&(b=g(c,!0))&&(b=c.indexOf(")",c.length-b)-c.length)&&(a[0]=a[0].slice(0,b),a[2]=c.slice(0,b)),a.slice(0,3))}},filter:{TAG:function(a){var b=a.replace(_,aa).toLowerCase();return"*"===a?function(){return!0}:function(a){return a.nodeName&&a.nodeName.toLowerCase()===b}},CLASS:function(a){var b=y[a+" "];return b||(b=new 
RegExp("(^|"+K+")"+a+"("+K+"|$)"))&&y(a,function(a){return b.test("string"==typeof a.className&&a.className||"undefined"!=typeof a.getAttribute&&a.getAttribute("class")||"")})},ATTR:function(a,b,c){return function(d){var e=ga.attr(d,a);return null==e?"!="===b:!b||(e+="","="===b?e===c:"!="===b?e!==c:"^="===b?c&&0===e.indexOf(c):"*="===b?c&&e.indexOf(c)>-1:"$="===b?c&&e.slice(-c.length)===c:"~="===b?(" "+e.replace(O," ")+" ").indexOf(c)>-1:"|="===b&&(e===c||e.slice(0,c.length+1)===c+"-"))}},CHILD:function(a,b,c,d,e){var f="nth"!==a.slice(0,3),g="last"!==a.slice(-4),h="of-type"===b;return 1===d&&0===e?function(a){return!!a.parentNode}:function(b,c,i){var j,k,l,m,n,o,p=f!==g?"nextSibling":"previousSibling",q=b.parentNode,r=h&&b.nodeName.toLowerCase(),s=!i&&!h,t=!1;if(q){if(f){while(p){m=b;while(m=m[p])if(h?m.nodeName.toLowerCase()===r:1===m.nodeType)return!1;o=p="only"===a&&!o&&"nextSibling"}return!0}if(o=[g?q.firstChild:q.lastChild],g&&s){m=q,l=m[u]||(m[u]={}),k=l[m.uniqueID]||(l[m.uniqueID]={}),j=k[a]||[],n=j[0]===w&&j[1],t=n&&j[2],m=n&&q.childNodes[n];while(m=++n&&m&&m[p]||(t=n=0)||o.pop())if(1===m.nodeType&&++t&&m===b){k[a]=[w,n,t];break}}else if(s&&(m=b,l=m[u]||(m[u]={}),k=l[m.uniqueID]||(l[m.uniqueID]={}),j=k[a]||[],n=j[0]===w&&j[1],t=n),t===!1)while(m=++n&&m&&m[p]||(t=n=0)||o.pop())if((h?m.nodeName.toLowerCase()===r:1===m.nodeType)&&++t&&(s&&(l=m[u]||(m[u]={}),k=l[m.uniqueID]||(l[m.uniqueID]={}),k[a]=[w,t]),m===b))break;return t-=e,t===d||t%d===0&&t/d>=0}}},PSEUDO:function(a,b){var c,e=d.pseudos[a]||d.setFilters[a.toLowerCase()]||ga.error("unsupported pseudo: "+a);return e[u]?e(b):e.length>1?(c=[a,a,"",b],d.setFilters.hasOwnProperty(a.toLowerCase())?ia(function(a,c){var d,f=e(a,b),g=f.length;while(g--)d=I(a,f[g]),a[d]=!(c[d]=f[g])}):function(a){return e(a,0,c)}):e}},pseudos:{not:ia(function(a){var b=[],c=[],d=h(a.replace(P,"$1"));return d[u]?ia(function(a,b,c,e){var f,g=d(a,null,e,[]),h=a.length;while(h--)(f=g[h])&&(a[h]=!(b[h]=f))}):function(a,e,f){return 
b[0]=a,d(b,null,f,c),b[0]=null,!c.pop()}}),has:ia(function(a){return function(b){return ga(a,b).length>0}}),contains:ia(function(a){return a=a.replace(_,aa),function(b){return(b.textContent||b.innerText||e(b)).indexOf(a)>-1}}),lang:ia(function(a){return U.test(a||"")||ga.error("unsupported lang: "+a),a=a.replace(_,aa).toLowerCase(),function(b){var c;do if(c=p?b.lang:b.getAttribute("xml:lang")||b.getAttribute("lang"))return c=c.toLowerCase(),c===a||0===c.indexOf(a+"-");while((b=b.parentNode)&&1===b.nodeType);return!1}}),target:function(b){var c=a.location&&a.location.hash;return c&&c.slice(1)===b.id},root:function(a){return a===o},focus:function(a){return a===n.activeElement&&(!n.hasFocus||n.hasFocus())&&!!(a.type||a.href||~a.tabIndex)},enabled:oa(!1),disabled:oa(!0),checked:function(a){var b=a.nodeName.toLowerCase();return"input"===b&&!!a.checked||"option"===b&&!!a.selected},selected:function(a){return a.parentNode&&a.parentNode.selectedIndex,a.selected===!0},empty:function(a){for(a=a.firstChild;a;a=a.nextSibling)if(a.nodeType<6)return!1;return!0},parent:function(a){return!d.pseudos.empty(a)},header:function(a){return X.test(a.nodeName)},input:function(a){return W.test(a.nodeName)},button:function(a){var b=a.nodeName.toLowerCase();return"input"===b&&"button"===a.type||"button"===b},text:function(a){var b;return"input"===a.nodeName.toLowerCase()&&"text"===a.type&&(null==(b=a.getAttribute("type"))||"text"===b.toLowerCase())},first:pa(function(){return[0]}),last:pa(function(a,b){return[b-1]}),eq:pa(function(a,b,c){return[c<0?c+b:c]}),even:pa(function(a,b){for(var c=0;c=0;)a.push(d);return a}),gt:pa(function(a,b,c){for(var d=c<0?c+b:c;++d1?function(b,c,d){var e=a.length;while(e--)if(!a[e](b,c,d))return!1;return!0}:a[0]}function va(a,b,c){for(var d=0,e=b.length;d-1&&(f[j]=!(g[j]=l))}}else r=wa(r===g?r.splice(o,r.length):r),e?e(null,g,r,i):G.apply(g,r)})}function ya(a){for(var b,c,e,f=a.length,g=d.relative[a[0].type],h=g||d.relative[" "],i=g?1:0,k=ta(function(a){return 
a===b},h,!0),l=ta(function(a){return I(b,a)>-1},h,!0),m=[function(a,c,d){var e=!g&&(d||c!==j)||((b=c).nodeType?k(a,c,d):l(a,c,d));return b=null,e}];i1&&ua(m),i>1&&sa(a.slice(0,i-1).concat({value:" "===a[i-2].type?"*":""})).replace(P,"$1"),c,i0,e=a.length>0,f=function(f,g,h,i,k){var l,o,q,r=0,s="0",t=f&&[],u=[],v=j,x=f||e&&d.find.TAG("*",k),y=w+=null==v?1:Math.random()||.1,z=x.length;for(k&&(j=g===n||g||k);s!==z&&null!=(l=x[s]);s++){if(e&&l){o=0,g||l.ownerDocument===n||(m(l),h=!p);while(q=a[o++])if(q(l,g||n,h)){i.push(l);break}k&&(w=y)}c&&((l=!q&&l)&&r--,f&&t.push(l))}if(r+=s,c&&s!==r){o=0;while(q=b[o++])q(t,u,g,h);if(f){if(r>0)while(s--)t[s]||u[s]||(u[s]=E.call(i));u=wa(u)}G.apply(i,u),k&&!f&&u.length>0&&r+b.length>1&&ga.uniqueSort(i)}return k&&(w=y,j=v),t};return c?ia(f):f}return h=ga.compile=function(a,b){var c,d=[],e=[],f=A[a+" "];if(!f){b||(b=g(a)),c=b.length;while(c--)f=ya(b[c]),f[u]?d.push(f):e.push(f);f=A(a,za(e,d)),f.selector=a}return f},i=ga.select=function(a,b,c,e){var f,i,j,k,l,m="function"==typeof a&&a,n=!e&&g(a=m.selector||a);if(c=c||[],1===n.length){if(i=n[0]=n[0].slice(0),i.length>2&&"ID"===(j=i[0]).type&&9===b.nodeType&&p&&d.relative[i[1].type]){if(b=(d.find.ID(j.matches[0].replace(_,aa),b)||[])[0],!b)return c;m&&(b=b.parentNode),a=a.slice(i.shift().value.length)}f=V.needsContext.test(a)?0:i.length;while(f--){if(j=i[f],d.relative[k=j.type])break;if((l=d.find[k])&&(e=l(j.matches[0].replace(_,aa),$.test(i[0].type)&&qa(b.parentNode)||b))){if(i.splice(f,1),a=e.length&&sa(i),!a)return G.apply(c,e),c;break}}}return(m||h(a,n))(e,b,!p,c,!b||$.test(a)&&qa(b.parentNode)||b),c},c.sortStable=u.split("").sort(B).join("")===u,c.detectDuplicates=!!l,m(),c.sortDetached=ja(function(a){return 1&a.compareDocumentPosition(n.createElement("fieldset"))}),ja(function(a){return a.innerHTML="","#"===a.firstChild.getAttribute("href")})||ka("type|href|height|width",function(a,b,c){if(!c)return 
a.getAttribute(b,"type"===b.toLowerCase()?1:2)}),c.attributes&&ja(function(a){return a.innerHTML="",a.firstChild.setAttribute("value",""),""===a.firstChild.getAttribute("value")})||ka("value",function(a,b,c){if(!c&&"input"===a.nodeName.toLowerCase())return a.defaultValue}),ja(function(a){return null==a.getAttribute("disabled")})||ka(J,function(a,b,c){var d;if(!c)return a[b]===!0?b.toLowerCase():(d=a.getAttributeNode(b))&&d.specified?d.value:null}),ga}(a);r.find=x,r.expr=x.selectors,r.expr[":"]=r.expr.pseudos,r.uniqueSort=r.unique=x.uniqueSort,r.text=x.getText,r.isXMLDoc=x.isXML,r.contains=x.contains,r.escapeSelector=x.escape;var y=function(a,b,c){var d=[],e=void 0!==c;while((a=a[b])&&9!==a.nodeType)if(1===a.nodeType){if(e&&r(a).is(c))break;d.push(a)}return d},z=function(a,b){for(var c=[];a;a=a.nextSibling)1===a.nodeType&&a!==b&&c.push(a);return c},A=r.expr.match.needsContext;function B(a,b){return a.nodeName&&a.nodeName.toLowerCase()===b.toLowerCase()}var C=/^<([a-z][^\/\0>:\x20\t\r\n\f]*)[\x20\t\r\n\f]*\/?>(?:<\/\1>|)$/i,D=/^.[^:#\[\.,]*$/;function E(a,b,c){return r.isFunction(b)?r.grep(a,function(a,d){return!!b.call(a,d,a)!==c}):b.nodeType?r.grep(a,function(a){return a===b!==c}):"string"!=typeof b?r.grep(a,function(a){return i.call(b,a)>-1!==c}):D.test(b)?r.filter(b,a,c):(b=r.filter(b,a),r.grep(a,function(a){return i.call(b,a)>-1!==c&&1===a.nodeType}))}r.filter=function(a,b,c){var d=b[0];return c&&(a=":not("+a+")"),1===b.length&&1===d.nodeType?r.find.matchesSelector(d,a)?[d]:[]:r.find.matches(a,r.grep(b,function(a){return 1===a.nodeType}))},r.fn.extend({find:function(a){var b,c,d=this.length,e=this;if("string"!=typeof a)return this.pushStack(r(a).filter(function(){for(b=0;b1?r.uniqueSort(c):c},filter:function(a){return this.pushStack(E(this,a||[],!1))},not:function(a){return this.pushStack(E(this,a||[],!0))},is:function(a){return!!E(this,"string"==typeof a&&A.test(a)?r(a):a||[],!1).length}});var 
F,G=/^(?:\s*(<[\w\W]+>)[^>]*|#([\w-]+))$/,H=r.fn.init=function(a,b,c){var e,f;if(!a)return this;if(c=c||F,"string"==typeof a){if(e="<"===a[0]&&">"===a[a.length-1]&&a.length>=3?[null,a,null]:G.exec(a),!e||!e[1]&&b)return!b||b.jquery?(b||c).find(a):this.constructor(b).find(a);if(e[1]){if(b=b instanceof r?b[0]:b,r.merge(this,r.parseHTML(e[1],b&&b.nodeType?b.ownerDocument||b:d,!0)),C.test(e[1])&&r.isPlainObject(b))for(e in b)r.isFunction(this[e])?this[e](b[e]):this.attr(e,b[e]);return this}return f=d.getElementById(e[2]),f&&(this[0]=f,this.length=1),this}return a.nodeType?(this[0]=a,this.length=1,this):r.isFunction(a)?void 0!==c.ready?c.ready(a):a(r):r.makeArray(a,this)};H.prototype=r.fn,F=r(d);var I=/^(?:parents|prev(?:Until|All))/,J={children:!0,contents:!0,next:!0,prev:!0};r.fn.extend({has:function(a){var b=r(a,this),c=b.length;return this.filter(function(){for(var a=0;a-1:1===c.nodeType&&r.find.matchesSelector(c,a))){f.push(c);break}return this.pushStack(f.length>1?r.uniqueSort(f):f)},index:function(a){return a?"string"==typeof a?i.call(r(a),this[0]):i.call(this,a.jquery?a[0]:a):this[0]&&this[0].parentNode?this.first().prevAll().length:-1},add:function(a,b){return this.pushStack(r.uniqueSort(r.merge(this.get(),r(a,b))))},addBack:function(a){return this.add(null==a?this.prevObject:this.prevObject.filter(a))}});function K(a,b){while((a=a[b])&&1!==a.nodeType);return a}r.each({parent:function(a){var b=a.parentNode;return b&&11!==b.nodeType?b:null},parents:function(a){return y(a,"parentNode")},parentsUntil:function(a,b,c){return y(a,"parentNode",c)},next:function(a){return K(a,"nextSibling")},prev:function(a){return K(a,"previousSibling")},nextAll:function(a){return y(a,"nextSibling")},prevAll:function(a){return y(a,"previousSibling")},nextUntil:function(a,b,c){return y(a,"nextSibling",c)},prevUntil:function(a,b,c){return y(a,"previousSibling",c)},siblings:function(a){return z((a.parentNode||{}).firstChild,a)},children:function(a){return 
z(a.firstChild)},contents:function(a){return B(a,"iframe")?a.contentDocument:(B(a,"template")&&(a=a.content||a),r.merge([],a.childNodes))}},function(a,b){r.fn[a]=function(c,d){var e=r.map(this,b,c);return"Until"!==a.slice(-5)&&(d=c),d&&"string"==typeof d&&(e=r.filter(d,e)),this.length>1&&(J[a]||r.uniqueSort(e),I.test(a)&&e.reverse()),this.pushStack(e)}});var L=/[^\x20\t\r\n\f]+/g;function M(a){var b={};return r.each(a.match(L)||[],function(a,c){b[c]=!0}),b}r.Callbacks=function(a){a="string"==typeof a?M(a):r.extend({},a);var b,c,d,e,f=[],g=[],h=-1,i=function(){for(e=e||a.once,d=b=!0;g.length;h=-1){c=g.shift();while(++h-1)f.splice(c,1),c<=h&&h--}),this},has:function(a){return a?r.inArray(a,f)>-1:f.length>0},empty:function(){return f&&(f=[]),this},disable:function(){return e=g=[],f=c="",this},disabled:function(){return!f},lock:function(){return e=g=[],c||b||(f=c=""),this},locked:function(){return!!e},fireWith:function(a,c){return e||(c=c||[],c=[a,c.slice?c.slice():c],g.push(c),b||i()),this},fire:function(){return j.fireWith(this,arguments),this},fired:function(){return!!d}};return j};function N(a){return a}function O(a){throw a}function P(a,b,c,d){var e;try{a&&r.isFunction(e=a.promise)?e.call(a).done(b).fail(c):a&&r.isFunction(e=a.then)?e.call(a,b,c):b.apply(void 0,[a].slice(d))}catch(a){c.apply(void 0,[a])}}r.extend({Deferred:function(b){var c=[["notify","progress",r.Callbacks("memory"),r.Callbacks("memory"),2],["resolve","done",r.Callbacks("once memory"),r.Callbacks("once memory"),0,"resolved"],["reject","fail",r.Callbacks("once memory"),r.Callbacks("once memory"),1,"rejected"]],d="pending",e={state:function(){return d},always:function(){return f.done(arguments).fail(arguments),this},"catch":function(a){return e.then(null,a)},pipe:function(){var a=arguments;return r.Deferred(function(b){r.each(c,function(c,d){var e=r.isFunction(a[d[4]])&&a[d[4]];f[d[1]](function(){var 
a=e&&e.apply(this,arguments);a&&r.isFunction(a.promise)?a.promise().progress(b.notify).done(b.resolve).fail(b.reject):b[d[0]+"With"](this,e?[a]:arguments)})}),a=null}).promise()},then:function(b,d,e){var f=0;function g(b,c,d,e){return function(){var h=this,i=arguments,j=function(){var a,j;if(!(b=f&&(d!==O&&(h=void 0,i=[a]),c.rejectWith(h,i))}};b?k():(r.Deferred.getStackHook&&(k.stackTrace=r.Deferred.getStackHook()),a.setTimeout(k))}}return r.Deferred(function(a){c[0][3].add(g(0,a,r.isFunction(e)?e:N,a.notifyWith)),c[1][3].add(g(0,a,r.isFunction(b)?b:N)),c[2][3].add(g(0,a,r.isFunction(d)?d:O))}).promise()},promise:function(a){return null!=a?r.extend(a,e):e}},f={};return r.each(c,function(a,b){var g=b[2],h=b[5];e[b[1]]=g.add,h&&g.add(function(){d=h},c[3-a][2].disable,c[0][2].lock),g.add(b[3].fire),f[b[0]]=function(){return f[b[0]+"With"](this===f?void 0:this,arguments),this},f[b[0]+"With"]=g.fireWith}),e.promise(f),b&&b.call(f,f),f},when:function(a){var b=arguments.length,c=b,d=Array(c),e=f.call(arguments),g=r.Deferred(),h=function(a){return function(c){d[a]=this,e[a]=arguments.length>1?f.call(arguments):c,--b||g.resolveWith(d,e)}};if(b<=1&&(P(a,g.done(h(c)).resolve,g.reject,!b),"pending"===g.state()||r.isFunction(e[c]&&e[c].then)))return g.then();while(c--)P(e[c],h(c),g.reject);return g.promise()}});var Q=/^(Eval|Internal|Range|Reference|Syntax|Type|URI)Error$/;r.Deferred.exceptionHook=function(b,c){a.console&&a.console.warn&&b&&Q.test(b.name)&&a.console.warn("jQuery.Deferred exception: "+b.message,b.stack,c)},r.readyException=function(b){a.setTimeout(function(){throw b})};var R=r.Deferred();r.fn.ready=function(a){return R.then(a)["catch"](function(a){r.readyException(a)}),this},r.extend({isReady:!1,readyWait:1,ready:function(a){(a===!0?--r.readyWait:r.isReady)||(r.isReady=!0,a!==!0&&--r.readyWait>0||R.resolveWith(d,[r]))}}),r.ready.then=R.then;function S(){d.removeEventListener("DOMContentLoaded",S), 
-a.removeEventListener("load",S),r.ready()}"complete"===d.readyState||"loading"!==d.readyState&&!d.documentElement.doScroll?a.setTimeout(r.ready):(d.addEventListener("DOMContentLoaded",S),a.addEventListener("load",S));var T=function(a,b,c,d,e,f,g){var h=0,i=a.length,j=null==c;if("object"===r.type(c)){e=!0;for(h in c)T(a,b,h,c[h],!0,f,g)}else if(void 0!==d&&(e=!0,r.isFunction(d)||(g=!0),j&&(g?(b.call(a,d),b=null):(j=b,b=function(a,b,c){return j.call(r(a),c)})),b))for(;h1,null,!0)},removeData:function(a){return this.each(function(){X.remove(this,a)})}}),r.extend({queue:function(a,b,c){var d;if(a)return b=(b||"fx")+"queue",d=W.get(a,b),c&&(!d||Array.isArray(c)?d=W.access(a,b,r.makeArray(c)):d.push(c)),d||[]},dequeue:function(a,b){b=b||"fx";var c=r.queue(a,b),d=c.length,e=c.shift(),f=r._queueHooks(a,b),g=function(){r.dequeue(a,b)};"inprogress"===e&&(e=c.shift(),d--),e&&("fx"===b&&c.unshift("inprogress"),delete f.stop,e.call(a,g,f)),!d&&f&&f.empty.fire()},_queueHooks:function(a,b){var c=b+"queueHooks";return W.get(a,c)||W.access(a,c,{empty:r.Callbacks("once memory").add(function(){W.remove(a,[b+"queue",c])})})}}),r.fn.extend({queue:function(a,b){var c=2;return"string"!=typeof a&&(b=a,a="fx",c--),arguments.length\x20\t\r\n\f]+)/i,la=/^$|\/(?:java|ecma)script/i,ma={option:[1,""],thead:[1,"","
"],col:[2,"","
"],tr:[2,"","
"],td:[3,"","
"],_default:[0,"",""]};ma.optgroup=ma.option,ma.tbody=ma.tfoot=ma.colgroup=ma.caption=ma.thead,ma.th=ma.td;function na(a,b){var c;return c="undefined"!=typeof a.getElementsByTagName?a.getElementsByTagName(b||"*"):"undefined"!=typeof a.querySelectorAll?a.querySelectorAll(b||"*"):[],void 0===b||b&&B(a,b)?r.merge([a],c):c}function oa(a,b){for(var c=0,d=a.length;c-1)e&&e.push(f);else if(j=r.contains(f.ownerDocument,f),g=na(l.appendChild(f),"script"),j&&oa(g),c){k=0;while(f=g[k++])la.test(f.type||"")&&c.push(f)}return l}!function(){var a=d.createDocumentFragment(),b=a.appendChild(d.createElement("div")),c=d.createElement("input");c.setAttribute("type","radio"),c.setAttribute("checked","checked"),c.setAttribute("name","t"),b.appendChild(c),o.checkClone=b.cloneNode(!0).cloneNode(!0).lastChild.checked,b.innerHTML="",o.noCloneChecked=!!b.cloneNode(!0).lastChild.defaultValue}();var ra=d.documentElement,sa=/^key/,ta=/^(?:mouse|pointer|contextmenu|drag|drop)|click/,ua=/^([^.]*)(?:\.(.+)|)/;function va(){return!0}function wa(){return!1}function xa(){try{return d.activeElement}catch(a){}}function ya(a,b,c,d,e,f){var g,h;if("object"==typeof b){"string"!=typeof c&&(d=d||c,c=void 0);for(h in b)ya(a,h,c,d,b[h],f);return a}if(null==d&&null==e?(e=c,d=c=void 0):null==e&&("string"==typeof c?(e=d,d=void 0):(e=d,d=c,c=void 0)),e===!1)e=wa;else if(!e)return a;return 1===f&&(g=e,e=function(a){return r().off(a),g.apply(this,arguments)},e.guid=g.guid||(g.guid=r.guid++)),a.each(function(){r.event.add(this,b,e,d,c)})}r.event={global:{},add:function(a,b,c,d,e){var f,g,h,i,j,k,l,m,n,o,p,q=W.get(a);if(q){c.handler&&(f=c,c=f.handler,e=f.selector),e&&r.find.matchesSelector(ra,e),c.guid||(c.guid=r.guid++),(i=q.events)||(i=q.events={}),(g=q.handle)||(g=q.handle=function(b){return"undefined"!=typeof r&&r.event.triggered!==b.type?r.event.dispatch.apply(a,arguments):void 
0}),b=(b||"").match(L)||[""],j=b.length;while(j--)h=ua.exec(b[j])||[],n=p=h[1],o=(h[2]||"").split(".").sort(),n&&(l=r.event.special[n]||{},n=(e?l.delegateType:l.bindType)||n,l=r.event.special[n]||{},k=r.extend({type:n,origType:p,data:d,handler:c,guid:c.guid,selector:e,needsContext:e&&r.expr.match.needsContext.test(e),namespace:o.join(".")},f),(m=i[n])||(m=i[n]=[],m.delegateCount=0,l.setup&&l.setup.call(a,d,o,g)!==!1||a.addEventListener&&a.addEventListener(n,g)),l.add&&(l.add.call(a,k),k.handler.guid||(k.handler.guid=c.guid)),e?m.splice(m.delegateCount++,0,k):m.push(k),r.event.global[n]=!0)}},remove:function(a,b,c,d,e){var f,g,h,i,j,k,l,m,n,o,p,q=W.hasData(a)&&W.get(a);if(q&&(i=q.events)){b=(b||"").match(L)||[""],j=b.length;while(j--)if(h=ua.exec(b[j])||[],n=p=h[1],o=(h[2]||"").split(".").sort(),n){l=r.event.special[n]||{},n=(d?l.delegateType:l.bindType)||n,m=i[n]||[],h=h[2]&&new RegExp("(^|\\.)"+o.join("\\.(?:.*\\.|)")+"(\\.|$)"),g=f=m.length;while(f--)k=m[f],!e&&p!==k.origType||c&&c.guid!==k.guid||h&&!h.test(k.namespace)||d&&d!==k.selector&&("**"!==d||!k.selector)||(m.splice(f,1),k.selector&&m.delegateCount--,l.remove&&l.remove.call(a,k));g&&!m.length&&(l.teardown&&l.teardown.call(a,o,q.handle)!==!1||r.removeEvent(a,n,q.handle),delete i[n])}else for(n in i)r.event.remove(a,n+b[j],c,d,!0);r.isEmptyObject(i)&&W.remove(a,"handle events")}},dispatch:function(a){var b=r.event.fix(a),c,d,e,f,g,h,i=new Array(arguments.length),j=(W.get(this,"events")||{})[b.type]||[],k=r.event.special[b.type]||{};for(i[0]=b,c=1;c=1))for(;j!==this;j=j.parentNode||this)if(1===j.nodeType&&("click"!==a.type||j.disabled!==!0)){for(f=[],g={},c=0;c-1:r.find(e,this,null,[j]).length),g[e]&&f.push(d);f.length&&h.push({elem:j,handlers:f})}return j=this,i\x20\t\r\n\f]*)[^>]*)\/>/gi,Aa=/\s*$/g;function Ea(a,b){return B(a,"table")&&B(11!==b.nodeType?b:b.firstChild,"tr")?r(">tbody",a)[0]||a:a}function Fa(a){return a.type=(null!==a.getAttribute("type"))+"/"+a.type,a}function Ga(a){var 
b=Ca.exec(a.type);return b?a.type=b[1]:a.removeAttribute("type"),a}function Ha(a,b){var c,d,e,f,g,h,i,j;if(1===b.nodeType){if(W.hasData(a)&&(f=W.access(a),g=W.set(b,f),j=f.events)){delete g.handle,g.events={};for(e in j)for(c=0,d=j[e].length;c1&&"string"==typeof q&&!o.checkClone&&Ba.test(q))return a.each(function(e){var f=a.eq(e);s&&(b[0]=q.call(this,e,f.html())),Ja(f,b,c,d)});if(m&&(e=qa(b,a[0].ownerDocument,!1,a,d),f=e.firstChild,1===e.childNodes.length&&(e=f),f||d)){for(h=r.map(na(e,"script"),Fa),i=h.length;l")},clone:function(a,b,c){var d,e,f,g,h=a.cloneNode(!0),i=r.contains(a.ownerDocument,a);if(!(o.noCloneChecked||1!==a.nodeType&&11!==a.nodeType||r.isXMLDoc(a)))for(g=na(h),f=na(a),d=0,e=f.length;d0&&oa(g,!i&&na(a,"script")),h},cleanData:function(a){for(var b,c,d,e=r.event.special,f=0;void 0!==(c=a[f]);f++)if(U(c)){if(b=c[W.expando]){if(b.events)for(d in b.events)e[d]?r.event.remove(c,d):r.removeEvent(c,d,b.handle);c[W.expando]=void 0}c[X.expando]&&(c[X.expando]=void 0)}}}),r.fn.extend({detach:function(a){return Ka(this,a,!0)},remove:function(a){return Ka(this,a)},text:function(a){return T(this,function(a){return void 0===a?r.text(this):this.empty().each(function(){1!==this.nodeType&&11!==this.nodeType&&9!==this.nodeType||(this.textContent=a)})},null,a,arguments.length)},append:function(){return Ja(this,arguments,function(a){if(1===this.nodeType||11===this.nodeType||9===this.nodeType){var b=Ea(this,a);b.appendChild(a)}})},prepend:function(){return Ja(this,arguments,function(a){if(1===this.nodeType||11===this.nodeType||9===this.nodeType){var b=Ea(this,a);b.insertBefore(a,b.firstChild)}})},before:function(){return Ja(this,arguments,function(a){this.parentNode&&this.parentNode.insertBefore(a,this)})},after:function(){return Ja(this,arguments,function(a){this.parentNode&&this.parentNode.insertBefore(a,this.nextSibling)})},empty:function(){for(var a,b=0;null!=(a=this[b]);b++)1===a.nodeType&&(r.cleanData(na(a,!1)),a.textContent="");return 
this},clone:function(a,b){return a=null!=a&&a,b=null==b?a:b,this.map(function(){return r.clone(this,a,b)})},html:function(a){return T(this,function(a){var b=this[0]||{},c=0,d=this.length;if(void 0===a&&1===b.nodeType)return b.innerHTML;if("string"==typeof a&&!Aa.test(a)&&!ma[(ka.exec(a)||["",""])[1].toLowerCase()]){a=r.htmlPrefilter(a);try{for(;c1)}});function _a(a,b,c,d,e){return new _a.prototype.init(a,b,c,d,e)}r.Tween=_a,_a.prototype={constructor:_a,init:function(a,b,c,d,e,f){this.elem=a,this.prop=c,this.easing=e||r.easing._default,this.options=b,this.start=this.now=this.cur(),this.end=d,this.unit=f||(r.cssNumber[c]?"":"px")},cur:function(){var a=_a.propHooks[this.prop];return a&&a.get?a.get(this):_a.propHooks._default.get(this)},run:function(a){var b,c=_a.propHooks[this.prop];return this.options.duration?this.pos=b=r.easing[this.easing](a,this.options.duration*a,0,1,this.options.duration):this.pos=b=a,this.now=(this.end-this.start)*b+this.start,this.options.step&&this.options.step.call(this.elem,this.now,this),c&&c.set?c.set(this):_a.propHooks._default.set(this),this}},_a.prototype.init.prototype=_a.prototype,_a.propHooks={_default:{get:function(a){var b;return 1!==a.elem.nodeType||null!=a.elem[a.prop]&&null==a.elem.style[a.prop]?a.elem[a.prop]:(b=r.css(a.elem,a.prop,""),b&&"auto"!==b?b:0)},set:function(a){r.fx.step[a.prop]?r.fx.step[a.prop](a):1!==a.elem.nodeType||null==a.elem.style[r.cssProps[a.prop]]&&!r.cssHooks[a.prop]?a.elem[a.prop]=a.now:r.style(a.elem,a.prop,a.now+a.unit)}}},_a.propHooks.scrollTop=_a.propHooks.scrollLeft={set:function(a){a.elem.nodeType&&a.elem.parentNode&&(a.elem[a.prop]=a.now)}},r.easing={linear:function(a){return a},swing:function(a){return.5-Math.cos(a*Math.PI)/2},_default:"swing"},r.fx=_a.prototype.init,r.fx.step={};var ab,bb,cb=/^(?:toggle|show|hide)$/,db=/queueHooks$/;function eb(){bb&&(d.hidden===!1&&a.requestAnimationFrame?a.requestAnimationFrame(eb):a.setTimeout(eb,r.fx.interval),r.fx.tick())}function fb(){return 
a.setTimeout(function(){ab=void 0}),ab=r.now()}function gb(a,b){var c,d=0,e={height:a};for(b=b?1:0;d<4;d+=2-b)c=ca[d],e["margin"+c]=e["padding"+c]=a;return b&&(e.opacity=e.width=a),e}function hb(a,b,c){for(var d,e=(kb.tweeners[b]||[]).concat(kb.tweeners["*"]),f=0,g=e.length;f1)},removeAttr:function(a){return this.each(function(){r.removeAttr(this,a)})}}),r.extend({attr:function(a,b,c){var d,e,f=a.nodeType;if(3!==f&&8!==f&&2!==f)return"undefined"==typeof a.getAttribute?r.prop(a,b,c):(1===f&&r.isXMLDoc(a)||(e=r.attrHooks[b.toLowerCase()]||(r.expr.match.bool.test(b)?lb:void 0)),void 0!==c?null===c?void r.removeAttr(a,b):e&&"set"in e&&void 0!==(d=e.set(a,c,b))?d:(a.setAttribute(b,c+""),c):e&&"get"in e&&null!==(d=e.get(a,b))?d:(d=r.find.attr(a,b), -null==d?void 0:d))},attrHooks:{type:{set:function(a,b){if(!o.radioValue&&"radio"===b&&B(a,"input")){var c=a.value;return a.setAttribute("type",b),c&&(a.value=c),b}}}},removeAttr:function(a,b){var c,d=0,e=b&&b.match(L);if(e&&1===a.nodeType)while(c=e[d++])a.removeAttribute(c)}}),lb={set:function(a,b,c){return b===!1?r.removeAttr(a,c):a.setAttribute(c,c),c}},r.each(r.expr.match.bool.source.match(/\w+/g),function(a,b){var c=mb[b]||r.find.attr;mb[b]=function(a,b,d){var e,f,g=b.toLowerCase();return d||(f=mb[g],mb[g]=e,e=null!=c(a,b,d)?g:null,mb[g]=f),e}});var nb=/^(?:input|select|textarea|button)$/i,ob=/^(?:a|area)$/i;r.fn.extend({prop:function(a,b){return T(this,r.prop,a,b,arguments.length>1)},removeProp:function(a){return this.each(function(){delete this[r.propFix[a]||a]})}}),r.extend({prop:function(a,b,c){var d,e,f=a.nodeType;if(3!==f&&8!==f&&2!==f)return 1===f&&r.isXMLDoc(a)||(b=r.propFix[b]||b,e=r.propHooks[b]),void 0!==c?e&&"set"in e&&void 0!==(d=e.set(a,c,b))?d:a[b]=c:e&&"get"in e&&null!==(d=e.get(a,b))?d:a[b]},propHooks:{tabIndex:{get:function(a){var b=r.find.attr(a,"tabindex");return 
b?parseInt(b,10):nb.test(a.nodeName)||ob.test(a.nodeName)&&a.href?0:-1}}},propFix:{"for":"htmlFor","class":"className"}}),o.optSelected||(r.propHooks.selected={get:function(a){var b=a.parentNode;return b&&b.parentNode&&b.parentNode.selectedIndex,null},set:function(a){var b=a.parentNode;b&&(b.selectedIndex,b.parentNode&&b.parentNode.selectedIndex)}}),r.each(["tabIndex","readOnly","maxLength","cellSpacing","cellPadding","rowSpan","colSpan","useMap","frameBorder","contentEditable"],function(){r.propFix[this.toLowerCase()]=this});function pb(a){var b=a.match(L)||[];return b.join(" ")}function qb(a){return a.getAttribute&&a.getAttribute("class")||""}r.fn.extend({addClass:function(a){var b,c,d,e,f,g,h,i=0;if(r.isFunction(a))return this.each(function(b){r(this).addClass(a.call(this,b,qb(this)))});if("string"==typeof a&&a){b=a.match(L)||[];while(c=this[i++])if(e=qb(c),d=1===c.nodeType&&" "+pb(e)+" "){g=0;while(f=b[g++])d.indexOf(" "+f+" ")<0&&(d+=f+" ");h=pb(d),e!==h&&c.setAttribute("class",h)}}return this},removeClass:function(a){var b,c,d,e,f,g,h,i=0;if(r.isFunction(a))return this.each(function(b){r(this).removeClass(a.call(this,b,qb(this)))});if(!arguments.length)return this.attr("class","");if("string"==typeof a&&a){b=a.match(L)||[];while(c=this[i++])if(e=qb(c),d=1===c.nodeType&&" "+pb(e)+" "){g=0;while(f=b[g++])while(d.indexOf(" "+f+" ")>-1)d=d.replace(" "+f+" "," ");h=pb(d),e!==h&&c.setAttribute("class",h)}}return this},toggleClass:function(a,b){var c=typeof a;return"boolean"==typeof b&&"string"===c?b?this.addClass(a):this.removeClass(a):r.isFunction(a)?this.each(function(c){r(this).toggleClass(a.call(this,c,qb(this),b),b)}):this.each(function(){var b,d,e,f;if("string"===c){d=0,e=r(this),f=a.match(L)||[];while(b=f[d++])e.hasClass(b)?e.removeClass(b):e.addClass(b)}else void 0!==a&&"boolean"!==c||(b=qb(this),b&&W.set(this,"__className__",b),this.setAttribute&&this.setAttribute("class",b||a===!1?"":W.get(this,"__className__")||""))})},hasClass:function(a){var 
b,c,d=0;b=" "+a+" ";while(c=this[d++])if(1===c.nodeType&&(" "+pb(qb(c))+" ").indexOf(b)>-1)return!0;return!1}});var rb=/\r/g;r.fn.extend({val:function(a){var b,c,d,e=this[0];{if(arguments.length)return d=r.isFunction(a),this.each(function(c){var e;1===this.nodeType&&(e=d?a.call(this,c,r(this).val()):a,null==e?e="":"number"==typeof e?e+="":Array.isArray(e)&&(e=r.map(e,function(a){return null==a?"":a+""})),b=r.valHooks[this.type]||r.valHooks[this.nodeName.toLowerCase()],b&&"set"in b&&void 0!==b.set(this,e,"value")||(this.value=e))});if(e)return b=r.valHooks[e.type]||r.valHooks[e.nodeName.toLowerCase()],b&&"get"in b&&void 0!==(c=b.get(e,"value"))?c:(c=e.value,"string"==typeof c?c.replace(rb,""):null==c?"":c)}}}),r.extend({valHooks:{option:{get:function(a){var b=r.find.attr(a,"value");return null!=b?b:pb(r.text(a))}},select:{get:function(a){var b,c,d,e=a.options,f=a.selectedIndex,g="select-one"===a.type,h=g?null:[],i=g?f+1:e.length;for(d=f<0?i:g?f:0;d-1)&&(c=!0);return c||(a.selectedIndex=-1),f}}}}),r.each(["radio","checkbox"],function(){r.valHooks[this]={set:function(a,b){if(Array.isArray(b))return a.checked=r.inArray(r(a).val(),b)>-1}},o.checkOn||(r.valHooks[this].get=function(a){return null===a.getAttribute("value")?"on":a.value})});var sb=/^(?:focusinfocus|focusoutblur)$/;r.extend(r.event,{trigger:function(b,c,e,f){var g,h,i,j,k,m,n,o=[e||d],p=l.call(b,"type")?b.type:b,q=l.call(b,"namespace")?b.namespace.split("."):[];if(h=i=e=e||d,3!==e.nodeType&&8!==e.nodeType&&!sb.test(p+r.event.triggered)&&(p.indexOf(".")>-1&&(q=p.split("."),p=q.shift(),q.sort()),k=p.indexOf(":")<0&&"on"+p,b=b[r.expando]?b:new r.Event(p,"object"==typeof b&&b),b.isTrigger=f?2:3,b.namespace=q.join("."),b.rnamespace=b.namespace?new RegExp("(^|\\.)"+q.join("\\.(?:.*\\.|)")+"(\\.|$)"):null,b.result=void 
0,b.target||(b.target=e),c=null==c?[b]:r.makeArray(c,[b]),n=r.event.special[p]||{},f||!n.trigger||n.trigger.apply(e,c)!==!1)){if(!f&&!n.noBubble&&!r.isWindow(e)){for(j=n.delegateType||p,sb.test(j+p)||(h=h.parentNode);h;h=h.parentNode)o.push(h),i=h;i===(e.ownerDocument||d)&&o.push(i.defaultView||i.parentWindow||a)}g=0;while((h=o[g++])&&!b.isPropagationStopped())b.type=g>1?j:n.bindType||p,m=(W.get(h,"events")||{})[b.type]&&W.get(h,"handle"),m&&m.apply(h,c),m=k&&h[k],m&&m.apply&&U(h)&&(b.result=m.apply(h,c),b.result===!1&&b.preventDefault());return b.type=p,f||b.isDefaultPrevented()||n._default&&n._default.apply(o.pop(),c)!==!1||!U(e)||k&&r.isFunction(e[p])&&!r.isWindow(e)&&(i=e[k],i&&(e[k]=null),r.event.triggered=p,e[p](),r.event.triggered=void 0,i&&(e[k]=i)),b.result}},simulate:function(a,b,c){var d=r.extend(new r.Event,c,{type:a,isSimulated:!0});r.event.trigger(d,null,b)}}),r.fn.extend({trigger:function(a,b){return this.each(function(){r.event.trigger(a,b,this)})},triggerHandler:function(a,b){var c=this[0];if(c)return r.event.trigger(a,b,c,!0)}}),r.each("blur focus focusin focusout resize scroll click dblclick mousedown mouseup mousemove mouseover mouseout mouseenter mouseleave change select submit keydown keypress keyup contextmenu".split(" "),function(a,b){r.fn[b]=function(a,c){return arguments.length>0?this.on(b,null,a,c):this.trigger(b)}}),r.fn.extend({hover:function(a,b){return this.mouseenter(a).mouseleave(b||a)}}),o.focusin="onfocusin"in a,o.focusin||r.each({focus:"focusin",blur:"focusout"},function(a,b){var c=function(a){r.event.simulate(b,a.target,r.event.fix(a))};r.event.special[b]={setup:function(){var d=this.ownerDocument||this,e=W.access(d,b);e||d.addEventListener(a,c,!0),W.access(d,b,(e||0)+1)},teardown:function(){var d=this.ownerDocument||this,e=W.access(d,b)-1;e?W.access(d,b,e):(d.removeEventListener(a,c,!0),W.remove(d,b))}}});var tb=a.location,ub=r.now(),vb=/\?/;r.parseXML=function(b){var c;if(!b||"string"!=typeof b)return null;try{c=(new 
a.DOMParser).parseFromString(b,"text/xml")}catch(d){c=void 0}return c&&!c.getElementsByTagName("parsererror").length||r.error("Invalid XML: "+b),c};var wb=/\[\]$/,xb=/\r?\n/g,yb=/^(?:submit|button|image|reset|file)$/i,zb=/^(?:input|select|textarea|keygen)/i;function Ab(a,b,c,d){var e;if(Array.isArray(b))r.each(b,function(b,e){c||wb.test(a)?d(a,e):Ab(a+"["+("object"==typeof e&&null!=e?b:"")+"]",e,c,d)});else if(c||"object"!==r.type(b))d(a,b);else for(e in b)Ab(a+"["+e+"]",b[e],c,d)}r.param=function(a,b){var c,d=[],e=function(a,b){var c=r.isFunction(b)?b():b;d[d.length]=encodeURIComponent(a)+"="+encodeURIComponent(null==c?"":c)};if(Array.isArray(a)||a.jquery&&!r.isPlainObject(a))r.each(a,function(){e(this.name,this.value)});else for(c in a)Ab(c,a[c],b,e);return d.join("&")},r.fn.extend({serialize:function(){return r.param(this.serializeArray())},serializeArray:function(){return this.map(function(){var a=r.prop(this,"elements");return a?r.makeArray(a):this}).filter(function(){var a=this.type;return this.name&&!r(this).is(":disabled")&&zb.test(this.nodeName)&&!yb.test(a)&&(this.checked||!ja.test(a))}).map(function(a,b){var c=r(this).val();return null==c?null:Array.isArray(c)?r.map(c,function(a){return{name:b.name,value:a.replace(xb,"\r\n")}}):{name:b.name,value:c.replace(xb,"\r\n")}}).get()}});var Bb=/%20/g,Cb=/#.*$/,Db=/([?&])_=[^&]*/,Eb=/^(.*?):[ \t]*([^\r\n]*)$/gm,Fb=/^(?:about|app|app-storage|.+-extension|file|res|widget):$/,Gb=/^(?:GET|HEAD)$/,Hb=/^\/\//,Ib={},Jb={},Kb="*/".concat("*"),Lb=d.createElement("a");Lb.href=tb.href;function Mb(a){return function(b,c){"string"!=typeof b&&(c=b,b="*");var d,e=0,f=b.toLowerCase().match(L)||[];if(r.isFunction(c))while(d=f[e++])"+"===d[0]?(d=d.slice(1)||"*",(a[d]=a[d]||[]).unshift(c)):(a[d]=a[d]||[]).push(c)}}function Nb(a,b,c,d){var e={},f=a===Jb;function g(h){var i;return e[h]=!0,r.each(a[h]||[],function(a,h){var j=h(b,c,d);return"string"!=typeof j||f||e[j]?f?!(i=j):void 0:(b.dataTypes.unshift(j),g(j),!1)}),i}return 
g(b.dataTypes[0])||!e["*"]&&g("*")}function Ob(a,b){var c,d,e=r.ajaxSettings.flatOptions||{};for(c in b)void 0!==b[c]&&((e[c]?a:d||(d={}))[c]=b[c]);return d&&r.extend(!0,a,d),a}function Pb(a,b,c){var d,e,f,g,h=a.contents,i=a.dataTypes;while("*"===i[0])i.shift(),void 0===d&&(d=a.mimeType||b.getResponseHeader("Content-Type"));if(d)for(e in h)if(h[e]&&h[e].test(d)){i.unshift(e);break}if(i[0]in c)f=i[0];else{for(e in c){if(!i[0]||a.converters[e+" "+i[0]]){f=e;break}g||(g=e)}f=f||g}if(f)return f!==i[0]&&i.unshift(f),c[f]}function Qb(a,b,c,d){var e,f,g,h,i,j={},k=a.dataTypes.slice();if(k[1])for(g in a.converters)j[g.toLowerCase()]=a.converters[g];f=k.shift();while(f)if(a.responseFields[f]&&(c[a.responseFields[f]]=b),!i&&d&&a.dataFilter&&(b=a.dataFilter(b,a.dataType)),i=f,f=k.shift())if("*"===f)f=i;else if("*"!==i&&i!==f){if(g=j[i+" "+f]||j["* "+f],!g)for(e in j)if(h=e.split(" "),h[1]===f&&(g=j[i+" "+h[0]]||j["* "+h[0]])){g===!0?g=j[e]:j[e]!==!0&&(f=h[0],k.unshift(h[1]));break}if(g!==!0)if(g&&a["throws"])b=g(b);else try{b=g(b)}catch(l){return{state:"parsererror",error:g?l:"No conversion from "+i+" to "+f}}}return{state:"success",data:b}}r.extend({active:0,lastModified:{},etag:{},ajaxSettings:{url:tb.href,type:"GET",isLocal:Fb.test(tb.protocol),global:!0,processData:!0,async:!0,contentType:"application/x-www-form-urlencoded; charset=UTF-8",accepts:{"*":Kb,text:"text/plain",html:"text/html",xml:"application/xml, text/xml",json:"application/json, text/javascript"},contents:{xml:/\bxml\b/,html:/\bhtml/,json:/\bjson\b/},responseFields:{xml:"responseXML",text:"responseText",json:"responseJSON"},converters:{"* text":String,"text html":!0,"text json":JSON.parse,"text xml":r.parseXML},flatOptions:{url:!0,context:!0}},ajaxSetup:function(a,b){return b?Ob(Ob(a,r.ajaxSettings),b):Ob(r.ajaxSettings,a)},ajaxPrefilter:Mb(Ib),ajaxTransport:Mb(Jb),ajax:function(b,c){"object"==typeof b&&(c=b,b=void 0),c=c||{};var 
e,f,g,h,i,j,k,l,m,n,o=r.ajaxSetup({},c),p=o.context||o,q=o.context&&(p.nodeType||p.jquery)?r(p):r.event,s=r.Deferred(),t=r.Callbacks("once memory"),u=o.statusCode||{},v={},w={},x="canceled",y={readyState:0,getResponseHeader:function(a){var b;if(k){if(!h){h={};while(b=Eb.exec(g))h[b[1].toLowerCase()]=b[2]}b=h[a.toLowerCase()]}return null==b?null:b},getAllResponseHeaders:function(){return k?g:null},setRequestHeader:function(a,b){return null==k&&(a=w[a.toLowerCase()]=w[a.toLowerCase()]||a,v[a]=b),this},overrideMimeType:function(a){return null==k&&(o.mimeType=a),this},statusCode:function(a){var b;if(a)if(k)y.always(a[y.status]);else for(b in a)u[b]=[u[b],a[b]];return this},abort:function(a){var b=a||x;return e&&e.abort(b),A(0,b),this}};if(s.promise(y),o.url=((b||o.url||tb.href)+"").replace(Hb,tb.protocol+"//"),o.type=c.method||c.type||o.method||o.type,o.dataTypes=(o.dataType||"*").toLowerCase().match(L)||[""],null==o.crossDomain){j=d.createElement("a");try{j.href=o.url,j.href=j.href,o.crossDomain=Lb.protocol+"//"+Lb.host!=j.protocol+"//"+j.host}catch(z){o.crossDomain=!0}}if(o.data&&o.processData&&"string"!=typeof o.data&&(o.data=r.param(o.data,o.traditional)),Nb(Ib,o,c,y),k)return y;l=r.event&&o.global,l&&0===r.active++&&r.event.trigger("ajaxStart"),o.type=o.type.toUpperCase(),o.hasContent=!Gb.test(o.type),f=o.url.replace(Cb,""),o.hasContent?o.data&&o.processData&&0===(o.contentType||"").indexOf("application/x-www-form-urlencoded")&&(o.data=o.data.replace(Bb,"+")):(n=o.url.slice(f.length),o.data&&(f+=(vb.test(f)?"&":"?")+o.data,delete o.data),o.cache===!1&&(f=f.replace(Db,"$1"),n=(vb.test(f)?"&":"?")+"_="+ub++ 
+n),o.url=f+n),o.ifModified&&(r.lastModified[f]&&y.setRequestHeader("If-Modified-Since",r.lastModified[f]),r.etag[f]&&y.setRequestHeader("If-None-Match",r.etag[f])),(o.data&&o.hasContent&&o.contentType!==!1||c.contentType)&&y.setRequestHeader("Content-Type",o.contentType),y.setRequestHeader("Accept",o.dataTypes[0]&&o.accepts[o.dataTypes[0]]?o.accepts[o.dataTypes[0]]+("*"!==o.dataTypes[0]?", "+Kb+"; q=0.01":""):o.accepts["*"]);for(m in o.headers)y.setRequestHeader(m,o.headers[m]);if(o.beforeSend&&(o.beforeSend.call(p,y,o)===!1||k))return y.abort();if(x="abort",t.add(o.complete),y.done(o.success),y.fail(o.error),e=Nb(Jb,o,c,y)){if(y.readyState=1,l&&q.trigger("ajaxSend",[y,o]),k)return y;o.async&&o.timeout>0&&(i=a.setTimeout(function(){y.abort("timeout")},o.timeout));try{k=!1,e.send(v,A)}catch(z){if(k)throw z;A(-1,z)}}else A(-1,"No Transport");function A(b,c,d,h){var j,m,n,v,w,x=c;k||(k=!0,i&&a.clearTimeout(i),e=void 0,g=h||"",y.readyState=b>0?4:0,j=b>=200&&b<300||304===b,d&&(v=Pb(o,y,d)),v=Qb(o,v,y,j),j?(o.ifModified&&(w=y.getResponseHeader("Last-Modified"),w&&(r.lastModified[f]=w),w=y.getResponseHeader("etag"),w&&(r.etag[f]=w)),204===b||"HEAD"===o.type?x="nocontent":304===b?x="notmodified":(x=v.state,m=v.data,n=v.error,j=!n)):(n=x,!b&&x||(x="error",b<0&&(b=0))),y.status=b,y.statusText=(c||x)+"",j?s.resolveWith(p,[m,x,y]):s.rejectWith(p,[y,x,n]),y.statusCode(u),u=void 0,l&&q.trigger(j?"ajaxSuccess":"ajaxError",[y,o,j?m:n]),t.fireWith(p,[y,x]),l&&(q.trigger("ajaxComplete",[y,o]),--r.active||r.event.trigger("ajaxStop")))}return y},getJSON:function(a,b,c){return r.get(a,b,c,"json")},getScript:function(a,b){return r.get(a,void 0,b,"script")}}),r.each(["get","post"],function(a,b){r[b]=function(a,c,d,e){return r.isFunction(c)&&(e=e||d,d=c,c=void 0),r.ajax(r.extend({url:a,type:b,dataType:e,data:c,success:d},r.isPlainObject(a)&&a))}}),r._evalUrl=function(a){return 
r.ajax({url:a,type:"GET",dataType:"script",cache:!0,async:!1,global:!1,"throws":!0})},r.fn.extend({wrapAll:function(a){var b;return this[0]&&(r.isFunction(a)&&(a=a.call(this[0])),b=r(a,this[0].ownerDocument).eq(0).clone(!0),this[0].parentNode&&b.insertBefore(this[0]),b.map(function(){var a=this;while(a.firstElementChild)a=a.firstElementChild;return a}).append(this)),this},wrapInner:function(a){return r.isFunction(a)?this.each(function(b){r(this).wrapInner(a.call(this,b))}):this.each(function(){var b=r(this),c=b.contents();c.length?c.wrapAll(a):b.append(a)})},wrap:function(a){var b=r.isFunction(a);return this.each(function(c){r(this).wrapAll(b?a.call(this,c):a)})},unwrap:function(a){return this.parent(a).not("body").each(function(){r(this).replaceWith(this.childNodes)}),this}}),r.expr.pseudos.hidden=function(a){return!r.expr.pseudos.visible(a)},r.expr.pseudos.visible=function(a){return!!(a.offsetWidth||a.offsetHeight||a.getClientRects().length)},r.ajaxSettings.xhr=function(){try{return new a.XMLHttpRequest}catch(b){}};var Rb={0:200,1223:204},Sb=r.ajaxSettings.xhr();o.cors=!!Sb&&"withCredentials"in Sb,o.ajax=Sb=!!Sb,r.ajaxTransport(function(b){var c,d;if(o.cors||Sb&&!b.crossDomain)return{send:function(e,f){var g,h=b.xhr();if(h.open(b.type,b.url,b.async,b.username,b.password),b.xhrFields)for(g in b.xhrFields)h[g]=b.xhrFields[g];b.mimeType&&h.overrideMimeType&&h.overrideMimeType(b.mimeType),b.crossDomain||e["X-Requested-With"]||(e["X-Requested-With"]="XMLHttpRequest");for(g in e)h.setRequestHeader(g,e[g]);c=function(a){return function(){c&&(c=d=h.onload=h.onerror=h.onabort=h.onreadystatechange=null,"abort"===a?h.abort():"error"===a?"number"!=typeof h.status?f(0,"error"):f(h.status,h.statusText):f(Rb[h.status]||h.status,h.statusText,"text"!==(h.responseType||"text")||"string"!=typeof h.responseText?{binary:h.response}:{text:h.responseText},h.getAllResponseHeaders()))}},h.onload=c(),d=h.onerror=c("error"),void 
0!==h.onabort?h.onabort=d:h.onreadystatechange=function(){4===h.readyState&&a.setTimeout(function(){c&&d()})},c=c("abort");try{h.send(b.hasContent&&b.data||null)}catch(i){if(c)throw i}},abort:function(){c&&c()}}}),r.ajaxPrefilter(function(a){a.crossDomain&&(a.contents.script=!1)}),r.ajaxSetup({accepts:{script:"text/javascript, application/javascript, application/ecmascript, application/x-ecmascript"},contents:{script:/\b(?:java|ecma)script\b/},converters:{"text script":function(a){return r.globalEval(a),a}}}),r.ajaxPrefilter("script",function(a){void 0===a.cache&&(a.cache=!1),a.crossDomain&&(a.type="GET")}),r.ajaxTransport("script",function(a){if(a.crossDomain){var b,c;return{send:function(e,f){b=r(" - - - - - - - - - - -
-
-
-
-

PaddleSpeech Serving简介

-

- PaddleSpeech 是基于飞桨 PaddlePaddle 的语音方向的开源模型库,用于语音和音频中的各种关键任务的开发。PaddleSpeech Serving是基于python + fastapi 的语音算法模型的C/S类型后端服务,旨在统一paddle speech下的各语音算子来对外提供后端服务。 -

-
-
- -
-
-
-
-
-

产品体验

-
-
-
-
-
-
-
-
- WebSocket URL: - -
- - -
- 识别中, 秒后自动停止识别 -
-
-
-
-
此处显示识别结果
-
-
-
-
-
- - - - diff --git a/docs/requirements.txt b/docs/requirements.txt index a5409a5448231dbb6793a6feacc2b6cd5ee15809..08a049c1be0089cb236cfd1439985ae2665b44fd 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -22,6 +22,7 @@ onnxruntime pandas paddlenlp paddlespeech_feat +Pillow>=9.0.0 praatio==5.0.0 pypinyin pypinyin-dict diff --git a/docs/source/released_model.md b/docs/source/released_model.md index 551a86ef0bd013120597be512f6a78242314f59f..a1e3eb8795557e40ad2a1c3e521a52c114bab253 100644 --- a/docs/source/released_model.md +++ b/docs/source/released_model.md @@ -10,7 +10,7 @@ Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER | [Ds2 Offline Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_offline_aishell_ckpt_1.0.1.model.tar.gz)| Aishell Dataset | Char-based | 1.4 GB | 2 Conv + 5 bidirectional LSTM layers| 0.0554 |-| 151 h | [Ds2 Offline Aishell ASR0](../../examples/aishell/asr0) | inference/python | [Conformer Online Wenetspeech ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1_chunk_conformer_wenetspeech_ckpt_1.0.0a.model.tar.gz) | WenetSpeech Dataset | Char-based | 457 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring| 0.11 (test\_net) 0.1879 (test\_meeting) |-| 10000 h |- | python | [Conformer Online Aishell ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_chunk_conformer_aishell_ckpt_0.2.0.model.tar.gz) | Aishell Dataset | Char-based | 189 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring| 0.0544 |-| 151 h | [Conformer Online Aishell ASR1](../../examples/aishell/asr1) | python | -[Conformer Offline Aishell ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_conformer_aishell_ckpt_0.1.2.model.tar.gz) | Aishell Dataset | Char-based | 189 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0464 |-| 151 h | [Conformer Offline 
Aishell ASR1](../../examples/aishell/asr1) | python | +[Conformer Offline Aishell ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_conformer_aishell_ckpt_1.0.1.model.tar.gz) | Aishell Dataset | Char-based | 189 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0460 |-| 151 h | [Conformer Offline Aishell ASR1](../../examples/aishell/asr1) | python | [Transformer Aishell ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_transformer_aishell_ckpt_0.1.1.model.tar.gz) | Aishell Dataset | Char-based | 128 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0523 || 151 h | [Transformer Aishell ASR1](../../examples/aishell/asr1) | python | [Ds2 Offline Librispeech ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr0/asr0_deepspeech2_offline_librispeech_ckpt_1.0.1.model.tar.gz)| Librispeech Dataset | Char-based | 1.3 GB | 2 Conv + 5 bidirectional LSTM layers| - |0.0467| 960 h | [Ds2 Offline Librispeech ASR0](../../examples/librispeech/asr0) | inference/python | [Conformer Librispeech ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr1/asr1_conformer_librispeech_ckpt_0.1.1.model.tar.gz) | Librispeech Dataset | subword-based | 191 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring |-| 0.0338 | 960 h | [Conformer Librispeech ASR1](../../examples/librispeech/asr1) | python | diff --git a/examples/aishell/asr1/RESULTS.md b/examples/aishell/asr1/RESULTS.md index f16d423a2dc11f08aeac2a8061f4532d56e6ebbf..79c695b1b5df536b5e8086ece2f2bd5e46f412bb 100644 --- a/examples/aishell/asr1/RESULTS.md +++ b/examples/aishell/asr1/RESULTS.md @@ -2,13 +2,13 @@ ## Conformer paddle version: 2.2.2 -paddlespeech version: 0.2.0 +paddlespeech version: 1.0.1 | Model | Params | Config | Augmentation| Test set | Decode method | Loss | CER | | --- | --- | --- | --- | --- | --- | --- | --- | -| conformer | 47.07M | conf/conformer.yaml | 
spec_aug | test | attention | - | 0.0530 | -| conformer | 47.07M | conf/conformer.yaml | spec_aug | test | ctc_greedy_search | - | 0.0495 | -| conformer | 47.07M | conf/conformer.yaml | spec_aug| test | ctc_prefix_beam_search | - | 0.0494 | -| conformer | 47.07M | conf/conformer.yaml | spec_aug | test | attention_rescoring | - | 0.0464 | +| conformer | 47.07M | conf/conformer.yaml | spec_aug | test | attention | - | 0.0522 | +| conformer | 47.07M | conf/conformer.yaml | spec_aug | test | ctc_greedy_search | - | 0.0481 | +| conformer | 47.07M | conf/conformer.yaml | spec_aug| test | ctc_prefix_beam_search | - | 0.0480 | +| conformer | 47.07M | conf/conformer.yaml | spec_aug | test | attention_rescoring | - | 0.0460 | ## Conformer Streaming diff --git a/examples/aishell/asr1/conf/conformer.yaml b/examples/aishell/asr1/conf/conformer.yaml index 2419d07a4066d635fdf93f2e4258f56fcf4ea76d..0d12a9ef8f5fbcea21d07a1cd53647030b33532e 100644 --- a/examples/aishell/asr1/conf/conformer.yaml +++ b/examples/aishell/asr1/conf/conformer.yaml @@ -57,7 +57,7 @@ feat_dim: 80 stride_ms: 10.0 window_ms: 25.0 sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs -batch_size: 64 +batch_size: 32 maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced minibatches: 0 # for debug @@ -73,10 +73,10 @@ num_encs: 1 ########################################### # Training # ########################################### -n_epoch: 240 -accum_grad: 2 +n_epoch: 150 +accum_grad: 8 global_grad_clip: 5.0 -dist_sampler: True +dist_sampler: False optim: adam optim_conf: lr: 0.002 diff --git a/examples/csmsc/vits/README.md b/examples/csmsc/vits/README.md index 0c16840a04e32be8fefb3bae6c23fb4bd853be9f..5ca57e3a3603eb53fe4bf7c16fc1ba51bbc14147 100644 --- a/examples/csmsc/vits/README.md +++ b/examples/csmsc/vits/README.md @@ -144,3 
+144,34 @@ optional arguments: 6. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ## Pretrained Model + +The pretrained model can be downloaded here: + +- [vits_csmsc_ckpt_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/vits/vits_csmsc_ckpt_1.1.0.zip) (add_blank=true) + +VITS checkpoint contains files listed below. +```text +vits_csmsc_ckpt_1.1.0 +├── default.yaml # default config used to train vitx +├── phone_id_map.txt # phone vocabulary file when training vits +└── snapshot_iter_350000.pdz # model parameters and optimizer states +``` + +ps: This ckpt is not good enough, a better result is training + +You can use the following scripts to synthesize for `${BIN_DIR}/../sentences.txt` using pretrained VITS. + +```bash +source path.sh +add_blank=true + +FLAGS_allocator_strategy=naive_best_fit \ +FLAGS_fraction_of_gpu_memory_to_use=0.01 \ +python3 ${BIN_DIR}/synthesize_e2e.py \ + --config=vits_csmsc_ckpt_1.1.0/default.yaml \ + --ckpt=vits_csmsc_ckpt_1.1.0/snapshot_iter_350000.pdz \ + --phones_dict=vits_csmsc_ckpt_1.1.0/phone_id_map.txt \ + --output_dir=exp/default/test_e2e \ + --text=${BIN_DIR}/../sentences.txt \ + --add-blank=${add_blank} +``` diff --git a/examples/csmsc/vits/local/synthesize.sh b/examples/csmsc/vits/local/synthesize.sh index c15d5f99ff2f6a51c02630b72230999809cefcde..a4b35ec0aaa9f2ba830d2a35375283c0cea9389a 100755 --- a/examples/csmsc/vits/local/synthesize.sh +++ b/examples/csmsc/vits/local/synthesize.sh @@ -15,4 +15,4 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then --phones_dict=dump/phone_id_map.txt \ --test_metadata=dump/test/norm/metadata.jsonl \ --output_dir=${train_output_path}/test -fi \ No newline at end of file +fi diff --git a/examples/csmsc/vits/local/train.sh b/examples/csmsc/vits/local/train.sh index 42fff26cadd03ee0eddcecec634438fd7482fef7..289837a5d10af58b3c6fcb679e0ea895040d5d64 100755 --- a/examples/csmsc/vits/local/train.sh +++ b/examples/csmsc/vits/local/train.sh @@ -3,6 +3,11 @@ 
config_path=$1 train_output_path=$2 +# install monotonic_align +cd ${MAIN_ROOT}/paddlespeech/t2s/models/vits/monotonic_align +python3 setup.py build_ext --inplace +cd - + python3 ${BIN_DIR}/train.py \ --train-metadata=dump/train/norm/metadata.jsonl \ --dev-metadata=dump/dev/norm/metadata.jsonl \ diff --git a/examples/voxceleb/sv0/local/data.sh b/examples/voxceleb/sv0/local/data.sh index d6010ec66274db2707506b098ec087265b46a934..366397484bc9c0a2ffce45ff4254c045126ac233 100755 --- a/examples/voxceleb/sv0/local/data.sh +++ b/examples/voxceleb/sv0/local/data.sh @@ -74,7 +74,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # convert the m4a to wav # and we will not delete the original m4a file echo "start to convert the m4a to wav" - bash local/convert.sh ${TARGET_DIR}/voxceleb/vox2/test/ || exit 1; + bash local/convert.sh ${TARGET_DIR}/voxceleb/vox2/ || exit 1; if [ $? -ne 0 ]; then echo "Convert voxceleb2 dataset from m4a to wav failed. Terminated." diff --git a/paddlespeech/audio/transform/spec_augment.py b/paddlespeech/audio/transform/spec_augment.py index c8f0a855f4b52f8ef4f04aef36b9dc70fd2f528d..029e7b8f5a2f316e081df3b8e5b2f780f533e258 100644 --- a/paddlespeech/audio/transform/spec_augment.py +++ b/paddlespeech/audio/transform/spec_augment.py @@ -14,10 +14,8 @@ # Modified from espnet(https://github.com/espnet/espnet) """Spec Augment module for preprocessing i.e., data augmentation""" import random - import numpy from PIL import Image -from PIL.Image import BICUBIC from .functional import FuncTrans @@ -46,9 +44,10 @@ def time_warp(x, max_time_warp=80, inplace=False, mode="PIL"): warped = random.randrange(center - window, center + window) + 1 # 1 ... 
t - 1 - left = Image.fromarray(x[:center]).resize((x.shape[1], warped), BICUBIC) + left = Image.fromarray(x[:center]).resize((x.shape[1], warped), + Image.BICUBIC) right = Image.fromarray(x[center:]).resize((x.shape[1], t - warped), - BICUBIC) + Image.BICUBIC) if inplace: x[:warped] = left x[warped:] = right diff --git a/paddlespeech/cli/asr/infer.py b/paddlespeech/cli/asr/infer.py index 162c307278afb8cf5914b97f0cd190bd284a698a..76dfafb926135abc9295ca9a640648b837ebfe84 100644 --- a/paddlespeech/cli/asr/infer.py +++ b/paddlespeech/cli/asr/infer.py @@ -133,11 +133,11 @@ class ASRExecutor(BaseExecutor): """ Init model and other resources from a specific path. """ - logger.info("start to init the model") + logger.debug("start to init the model") # default max_len: unit:second self.max_len = 50 if hasattr(self, 'model'): - logger.info('Model had been initialized.') + logger.debug('Model had been initialized.') return if cfg_path is None or ckpt_path is None: @@ -151,15 +151,15 @@ class ASRExecutor(BaseExecutor): self.ckpt_path = os.path.join( self.res_path, self.task_resource.res_dict['ckpt_path'] + ".pdparams") - logger.info(self.res_path) + logger.debug(self.res_path) else: self.cfg_path = os.path.abspath(cfg_path) self.ckpt_path = os.path.abspath(ckpt_path + ".pdparams") self.res_path = os.path.dirname( os.path.dirname(os.path.abspath(self.cfg_path))) - logger.info(self.cfg_path) - logger.info(self.ckpt_path) + logger.debug(self.cfg_path) + logger.debug(self.ckpt_path) #Init body. 
self.config = CfgNode(new_allowed=True) @@ -216,7 +216,7 @@ class ASRExecutor(BaseExecutor): max_len = self.config.encoder_conf.max_len self.max_len = frame_shift_ms * max_len * subsample_rate - logger.info( + logger.debug( f"The asr server limit max duration len: {self.max_len}") def preprocess(self, model_type: str, input: Union[str, os.PathLike]): @@ -227,15 +227,15 @@ class ASRExecutor(BaseExecutor): audio_file = input if isinstance(audio_file, (str, os.PathLike)): - logger.info("Preprocess audio_file:" + audio_file) + logger.debug("Preprocess audio_file:" + audio_file) # Get the object for feature extraction if "deepspeech2" in model_type or "conformer" in model_type or "transformer" in model_type: - logger.info("get the preprocess conf") + logger.debug("get the preprocess conf") preprocess_conf = self.config.preprocess_config preprocess_args = {"train": False} preprocessing = Transformation(preprocess_conf) - logger.info("read the audio file") + logger.debug("read the audio file") audio, audio_sample_rate = soundfile.read( audio_file, dtype="int16", always_2d=True) if self.change_format: @@ -255,7 +255,7 @@ class ASRExecutor(BaseExecutor): else: audio = audio[:, 0] - logger.info(f"audio shape: {audio.shape}") + logger.debug(f"audio shape: {audio.shape}") # fbank audio = preprocessing(audio, **preprocess_args) @@ -264,19 +264,19 @@ class ASRExecutor(BaseExecutor): self._inputs["audio"] = audio self._inputs["audio_len"] = audio_len - logger.info(f"audio feat shape: {audio.shape}") + logger.debug(f"audio feat shape: {audio.shape}") else: raise Exception("wrong type") - logger.info("audio feat process success") + logger.debug("audio feat process success") @paddle.no_grad() def infer(self, model_type: str): """ Model inference and result stored in self.output. 
""" - logger.info("start to infer the model to get the output") + logger.debug("start to infer the model to get the output") cfg = self.config.decode audio = self._inputs["audio"] audio_len = self._inputs["audio_len"] @@ -293,7 +293,7 @@ class ASRExecutor(BaseExecutor): self._outputs["result"] = result_transcripts[0] elif "conformer" in model_type or "transformer" in model_type: - logger.info( + logger.debug( f"we will use the transformer like model : {model_type}") try: result_transcripts = self.model.decode( @@ -352,7 +352,7 @@ class ASRExecutor(BaseExecutor): logger.error("Please input the right audio file path") return False - logger.info("checking the audio file format......") + logger.debug("checking the audio file format......") try: audio, audio_sample_rate = soundfile.read( audio_file, dtype="int16", always_2d=True) @@ -374,7 +374,7 @@ class ASRExecutor(BaseExecutor): sox input_audio.xx --rate 8k --bits 16 --channels 1 output_audio.wav \n \ ") return False - logger.info("The sample rate is %d" % audio_sample_rate) + logger.debug("The sample rate is %d" % audio_sample_rate) if audio_sample_rate != self.sample_rate: logger.warning("The sample rate of the input file is not {}.\n \ The program will resample the wav file to {}.\n \ @@ -383,28 +383,28 @@ class ASRExecutor(BaseExecutor): ".format(self.sample_rate, self.sample_rate)) if force_yes is False: while (True): - logger.info( + logger.debug( "Whether to change the sample rate and the channel. Y: change the sample. N: exit the prgream." 
) content = input("Input(Y/N):") if content.strip() == "Y" or content.strip( ) == "y" or content.strip() == "yes" or content.strip( ) == "Yes": - logger.info( + logger.debug( "change the sampele rate, channel to 16k and 1 channel" ) break elif content.strip() == "N" or content.strip( ) == "n" or content.strip() == "no" or content.strip( ) == "No": - logger.info("Exit the program") + logger.debug("Exit the program") return False else: logger.warning("Not regular input, please input again") self.change_format = True else: - logger.info("The audio file format is right") + logger.debug("The audio file format is right") self.change_format = False return True diff --git a/paddlespeech/cli/cls/infer.py b/paddlespeech/cli/cls/infer.py index 942dc3b9230b796057eb6fc9065d611867de6d4f..c869e28bfa30c61091df20d3cda2e4a3d56040cd 100644 --- a/paddlespeech/cli/cls/infer.py +++ b/paddlespeech/cli/cls/infer.py @@ -92,7 +92,7 @@ class CLSExecutor(BaseExecutor): Init model and other resources from a specific path. """ if hasattr(self, 'model'): - logger.info('Model had been initialized.') + logger.debug('Model had been initialized.') return if label_file is None or ckpt_path is None: @@ -135,14 +135,14 @@ class CLSExecutor(BaseExecutor): Input content can be a text(tts), a file(asr, cls) or a streaming(not supported yet). 
""" feat_conf = self._conf['feature'] - logger.info(feat_conf) + logger.debug(feat_conf) waveform, _ = load( file=audio_file, sr=feat_conf['sample_rate'], mono=True, dtype='float32') if isinstance(audio_file, (str, os.PathLike)): - logger.info("Preprocessing audio_file:" + audio_file) + logger.debug("Preprocessing audio_file:" + audio_file) # Feature extraction feature_extractor = LogMelSpectrogram( diff --git a/paddlespeech/cli/download.py b/paddlespeech/cli/download.py index ec72587470e8f0e211e453e3b2b2ea3d1f54f25b..5661f18f938eeffdb829f9f091bafc75baddb388 100644 --- a/paddlespeech/cli/download.py +++ b/paddlespeech/cli/download.py @@ -61,7 +61,7 @@ def _get_unique_endpoints(trainer_endpoints): continue ips.add(ip) unique_endpoints.add(endpoint) - logger.info("unique_endpoints {}".format(unique_endpoints)) + logger.debug("unique_endpoints {}".format(unique_endpoints)) return unique_endpoints @@ -96,7 +96,7 @@ def get_path_from_url(url, # data, and the same ip will only download data once. 
unique_endpoints = _get_unique_endpoints(ParallelEnv().trainer_endpoints[:]) if osp.exists(fullpath) and check_exist and _md5check(fullpath, md5sum): - logger.info("Found {}".format(fullpath)) + logger.debug("Found {}".format(fullpath)) else: if ParallelEnv().current_endpoint in unique_endpoints: fullpath = _download(url, root_dir, md5sum, method=method) @@ -118,7 +118,7 @@ def _get_download(url, fullname): try: req = requests.get(url, stream=True) except Exception as e: # requests.exceptions.ConnectionError - logger.info("Downloading {} from {} failed with exception {}".format( + logger.debug("Downloading {} from {} failed with exception {}".format( fname, url, str(e))) return False @@ -190,7 +190,7 @@ def _download(url, path, md5sum=None, method='get'): fullname = osp.join(path, fname) retry_cnt = 0 - logger.info("Downloading {} from {}".format(fname, url)) + logger.debug("Downloading {} from {}".format(fname, url)) while not (osp.exists(fullname) and _md5check(fullname, md5sum)): if retry_cnt < DOWNLOAD_RETRY_LIMIT: retry_cnt += 1 @@ -209,7 +209,7 @@ def _md5check(fullname, md5sum=None): if md5sum is None: return True - logger.info("File {} md5 checking...".format(fullname)) + logger.debug("File {} md5 checking...".format(fullname)) md5 = hashlib.md5() with open(fullname, 'rb') as f: for chunk in iter(lambda: f.read(4096), b""): @@ -217,8 +217,8 @@ def _md5check(fullname, md5sum=None): calc_md5sum = md5.hexdigest() if calc_md5sum != md5sum: - logger.info("File {} md5 check failed, {}(calc) != " - "{}(base)".format(fullname, calc_md5sum, md5sum)) + logger.debug("File {} md5 check failed, {}(calc) != " + "{}(base)".format(fullname, calc_md5sum, md5sum)) return False return True @@ -227,7 +227,7 @@ def _decompress(fname): """ Decompress for zip and tar file """ - logger.info("Decompressing {}...".format(fname)) + logger.debug("Decompressing {}...".format(fname)) # For protecting decompressing interupted, # decompress to fpath_tmp directory firstly, if decompress 
diff --git a/paddlespeech/cli/executor.py b/paddlespeech/cli/executor.py index d390f947d17cccc99a12eee75f634242e4bac9bb..d4187a51459498bcf3b5130cbfb03c97a4506077 100644 --- a/paddlespeech/cli/executor.py +++ b/paddlespeech/cli/executor.py @@ -217,7 +217,7 @@ class BaseExecutor(ABC): logging.getLogger(name) for name in logging.root.manager.loggerDict ] for l in loggers: - l.disabled = True + l.setLevel(logging.ERROR) def show_rtf(self, info: Dict[str, List[float]]): """ diff --git a/paddlespeech/cli/kws/infer.py b/paddlespeech/cli/kws/infer.py index e3f426f5776e3a280d21ff1cb7bf48a98c13107c..111cfd7542bc46a301862fb8df94934e1639dbfb 100644 --- a/paddlespeech/cli/kws/infer.py +++ b/paddlespeech/cli/kws/infer.py @@ -88,7 +88,7 @@ class KWSExecutor(BaseExecutor): Init model and other resources from a specific path. """ if hasattr(self, 'model'): - logger.info('Model had been initialized.') + logger.debug('Model had been initialized.') return if ckpt_path is None: @@ -141,7 +141,7 @@ class KWSExecutor(BaseExecutor): assert os.path.isfile(audio_file) waveform, _ = load(audio_file) if isinstance(audio_file, (str, os.PathLike)): - logger.info("Preprocessing audio_file:" + audio_file) + logger.debug("Preprocessing audio_file:" + audio_file) # Feature extraction waveform = paddle.to_tensor(waveform).unsqueeze(0) diff --git a/paddlespeech/cli/log.py b/paddlespeech/cli/log.py index 8644064c73ef407476e7870e65d1149019762723..8b33e71e100a1d56f17cc9839a004f49d8f2431d 100644 --- a/paddlespeech/cli/log.py +++ b/paddlespeech/cli/log.py @@ -49,7 +49,7 @@ class Logger(object): self.handler.setFormatter(self.format) self.logger.addHandler(self.handler) - self.logger.setLevel(logging.DEBUG) + self.logger.setLevel(logging.INFO) self.logger.propagate = False def __call__(self, log_level: str, msg: str): diff --git a/paddlespeech/cli/st/infer.py b/paddlespeech/cli/st/infer.py index 4e099c4021eca94bfea64eaefcd66267136eada1..bc2bdd1ac202a0a86ca6b50056047fba8146db25 100644 --- 
a/paddlespeech/cli/st/infer.py +++ b/paddlespeech/cli/st/infer.py @@ -110,7 +110,7 @@ class STExecutor(BaseExecutor): """ decompressed_path = download_and_decompress(self.kaldi_bins, MODEL_HOME) decompressed_path = os.path.abspath(decompressed_path) - logger.info("Kaldi_bins stored in: {}".format(decompressed_path)) + logger.debug("Kaldi_bins stored in: {}".format(decompressed_path)) if "LD_LIBRARY_PATH" in os.environ: os.environ["LD_LIBRARY_PATH"] += f":{decompressed_path}" else: @@ -128,7 +128,7 @@ class STExecutor(BaseExecutor): Init model and other resources from a specific path. """ if hasattr(self, 'model'): - logger.info('Model had been initialized.') + logger.debug('Model had been initialized.') return if cfg_path is None or ckpt_path is None: @@ -140,8 +140,8 @@ class STExecutor(BaseExecutor): self.ckpt_path = os.path.join( self.task_resource.res_dir, self.task_resource.res_dict['ckpt_path']) - logger.info(self.cfg_path) - logger.info(self.ckpt_path) + logger.debug(self.cfg_path) + logger.debug(self.ckpt_path) res_path = self.task_resource.res_dir else: self.cfg_path = os.path.abspath(cfg_path) @@ -192,7 +192,7 @@ class STExecutor(BaseExecutor): Input content can be a file(wav). """ audio_file = os.path.abspath(wav_file) - logger.info("Preprocess audio_file:" + audio_file) + logger.debug("Preprocess audio_file:" + audio_file) if "fat_st" in model_type: cmvn = self.config.cmvn_path diff --git a/paddlespeech/cli/text/infer.py b/paddlespeech/cli/text/infer.py index 7b8faf99c84691971744fbef291a714900dc60bc..24b8c9c2593ccdfe9a07cdb5d607a0416844e0a0 100644 --- a/paddlespeech/cli/text/infer.py +++ b/paddlespeech/cli/text/infer.py @@ -98,7 +98,7 @@ class TextExecutor(BaseExecutor): Init model and other resources from a specific path. 
""" if hasattr(self, 'model'): - logger.info('Model had been initialized.') + logger.debug('Model had been initialized.') return self.task = task diff --git a/paddlespeech/cli/tts/infer.py b/paddlespeech/cli/tts/infer.py index 4e0337bccea500c382f0782860cec36ad4897c46..ade8cdd6dc5f4f255a582b40fe6a7aa336b04fa0 100644 --- a/paddlespeech/cli/tts/infer.py +++ b/paddlespeech/cli/tts/infer.py @@ -173,16 +173,23 @@ class TTSExecutor(BaseExecutor): Init model and other resources from a specific path. """ if hasattr(self, 'am_inference') and hasattr(self, 'voc_inference'): - logger.info('Models had been initialized.') + logger.debug('Models had been initialized.') return + # am + if am_ckpt is None or am_config is None or am_stat is None or phones_dict is None: + use_pretrained_am = True + else: + use_pretrained_am = False + am_tag = am + '-' + lang self.task_resource.set_task_model( model_tag=am_tag, model_type=0, # am + skip_download=not use_pretrained_am, version=None, # default version ) - if am_ckpt is None or am_config is None or am_stat is None or phones_dict is None: + if use_pretrained_am: self.am_res_path = self.task_resource.res_dir self.am_config = os.path.join(self.am_res_path, self.task_resource.res_dict['config']) @@ -193,9 +200,9 @@ class TTSExecutor(BaseExecutor): # must have phones_dict in acoustic self.phones_dict = os.path.join( self.am_res_path, self.task_resource.res_dict['phones_dict']) - logger.info(self.am_res_path) - logger.info(self.am_config) - logger.info(self.am_ckpt) + logger.debug(self.am_res_path) + logger.debug(self.am_config) + logger.debug(self.am_ckpt) else: self.am_config = os.path.abspath(am_config) self.am_ckpt = os.path.abspath(am_ckpt) @@ -220,13 +227,19 @@ class TTSExecutor(BaseExecutor): self.speaker_dict = speaker_dict # voc + if voc_ckpt is None or voc_config is None or voc_stat is None: + use_pretrained_voc = True + else: + use_pretrained_voc = False + voc_tag = voc + '-' + lang self.task_resource.set_task_model( 
model_tag=voc_tag, model_type=1, # vocoder + skip_download=not use_pretrained_voc, version=None, # default version ) - if voc_ckpt is None or voc_config is None or voc_stat is None: + if use_pretrained_voc: self.voc_res_path = self.task_resource.voc_res_dir self.voc_config = os.path.join( self.voc_res_path, self.task_resource.voc_res_dict['config']) @@ -235,9 +248,9 @@ class TTSExecutor(BaseExecutor): self.voc_stat = os.path.join( self.voc_res_path, self.task_resource.voc_res_dict['speech_stats']) - logger.info(self.voc_res_path) - logger.info(self.voc_config) - logger.info(self.voc_ckpt) + logger.debug(self.voc_res_path) + logger.debug(self.voc_config) + logger.debug(self.voc_ckpt) else: self.voc_config = os.path.abspath(voc_config) self.voc_ckpt = os.path.abspath(voc_ckpt) @@ -254,21 +267,18 @@ class TTSExecutor(BaseExecutor): with open(self.phones_dict, "r") as f: phn_id = [line.strip().split() for line in f.readlines()] vocab_size = len(phn_id) - print("vocab_size:", vocab_size) tone_size = None if self.tones_dict: with open(self.tones_dict, "r") as f: tone_id = [line.strip().split() for line in f.readlines()] tone_size = len(tone_id) - print("tone_size:", tone_size) spk_num = None if self.speaker_dict: with open(self.speaker_dict, 'rt') as f: spk_id = [line.strip().split() for line in f.readlines()] spk_num = len(spk_id) - print("spk_num:", spk_num) # frontend if lang == 'zh': @@ -278,7 +288,6 @@ class TTSExecutor(BaseExecutor): elif lang == 'en': self.frontend = English(phone_vocab_path=self.phones_dict) - print("frontend done!") # acoustic model odim = self.am_config.n_mels @@ -311,7 +320,6 @@ class TTSExecutor(BaseExecutor): am_normalizer = ZScore(am_mu, am_std) self.am_inference = am_inference_class(am_normalizer, am) self.am_inference.eval() - print("acoustic model done!") # vocoder # model: {model_name}_{dataset} @@ -334,7 +342,6 @@ class TTSExecutor(BaseExecutor): voc_normalizer = ZScore(voc_mu, voc_std) self.voc_inference = 
voc_inference_class(voc_normalizer, voc) self.voc_inference.eval() - print("voc done!") def preprocess(self, input: Any, *args, **kwargs): """ @@ -375,7 +382,7 @@ class TTSExecutor(BaseExecutor): text, merge_sentences=merge_sentences) phone_ids = input_ids["phone_ids"] else: - print("lang should in {'zh', 'en'}!") + logger.error("lang should in {'zh', 'en'}!") self.frontend_time = time.time() - frontend_st self.am_time = 0 diff --git a/paddlespeech/cli/vector/infer.py b/paddlespeech/cli/vector/infer.py index 4bc8e135ad1226f41455d62f392099d283b53d08..48ca1f98dedb1ba1caa454720a5211bacddb7ad9 100644 --- a/paddlespeech/cli/vector/infer.py +++ b/paddlespeech/cli/vector/infer.py @@ -117,7 +117,7 @@ class VectorExecutor(BaseExecutor): # stage 2: read the input data and store them as a list task_source = self.get_input_source(parser_args.input) - logger.info(f"task source: {task_source}") + logger.debug(f"task source: {task_source}") # stage 3: process the audio one by one # we do action according the task type @@ -127,13 +127,13 @@ class VectorExecutor(BaseExecutor): try: # extract the speaker audio embedding if parser_args.task == "spk": - logger.info("do vector spk task") + logger.debug("do vector spk task") res = self(input_, model, sample_rate, config, ckpt_path, device) task_result[id_] = res elif parser_args.task == "score": - logger.info("do vector score task") - logger.info(f"input content {input_}") + logger.debug("do vector score task") + logger.debug(f"input content {input_}") if len(input_.split()) != 2: logger.error( f"vector score task input {input_} wav num is not two," @@ -142,7 +142,7 @@ class VectorExecutor(BaseExecutor): # get the enroll and test embedding enroll_audio, test_audio = input_.split() - logger.info( + logger.debug( f"score task, enroll audio: {enroll_audio}, test audio: {test_audio}" ) enroll_embedding = self(enroll_audio, model, sample_rate, @@ -158,8 +158,8 @@ class VectorExecutor(BaseExecutor): has_exceptions = True task_result[id_] = 
f'{e.__class__.__name__}: {e}' - logger.info("task result as follows: ") - logger.info(f"{task_result}") + logger.debug("task result as follows: ") + logger.debug(f"{task_result}") # stage 4: process the all the task results self.process_task_results(parser_args.input, task_result, @@ -207,7 +207,7 @@ class VectorExecutor(BaseExecutor): """ if not hasattr(self, "score_func"): self.score_func = paddle.nn.CosineSimilarity(axis=0) - logger.info("create the cosine score function ") + logger.debug("create the cosine score function ") score = self.score_func( paddle.to_tensor(enroll_embedding), @@ -244,7 +244,7 @@ class VectorExecutor(BaseExecutor): sys.exit(-1) # stage 1: set the paddle runtime host device - logger.info(f"device type: {device}") + logger.debug(f"device type: {device}") paddle.device.set_device(device) # stage 2: read the specific pretrained model @@ -283,7 +283,7 @@ class VectorExecutor(BaseExecutor): # stage 0: avoid to init the mode again self.task = task if hasattr(self, "model"): - logger.info("Model has been initialized") + logger.debug("Model has been initialized") return # stage 1: get the model and config path @@ -294,7 +294,7 @@ class VectorExecutor(BaseExecutor): sample_rate_str = "16k" if sample_rate == 16000 else "8k" tag = model_type + "-" + sample_rate_str self.task_resource.set_task_model(tag, version=None) - logger.info(f"load the pretrained model: {tag}") + logger.debug(f"load the pretrained model: {tag}") # get the model from the pretrained list # we download the pretrained model and store it in the res_path self.res_path = self.task_resource.res_dir @@ -312,19 +312,19 @@ class VectorExecutor(BaseExecutor): self.res_path = os.path.dirname( os.path.dirname(os.path.abspath(self.cfg_path))) - logger.info(f"start to read the ckpt from {self.ckpt_path}") - logger.info(f"read the config from {self.cfg_path}") - logger.info(f"get the res path {self.res_path}") + logger.debug(f"start to read the ckpt from {self.ckpt_path}") + 
logger.debug(f"read the config from {self.cfg_path}") + logger.debug(f"get the res path {self.res_path}") # stage 2: read and config and init the model body self.config = CfgNode(new_allowed=True) self.config.merge_from_file(self.cfg_path) # stage 3: get the model name to instance the model network with dynamic_import - logger.info("start to dynamic import the model class") + logger.debug("start to dynamic import the model class") model_name = model_type[:model_type.rindex('_')] model_class = self.task_resource.get_model_class(model_name) - logger.info(f"model name {model_name}") + logger.debug(f"model name {model_name}") model_conf = self.config.model backbone = model_class(**model_conf) model = SpeakerIdetification( @@ -333,11 +333,11 @@ class VectorExecutor(BaseExecutor): self.model.eval() # stage 4: load the model parameters - logger.info("start to set the model parameters to model") + logger.debug("start to set the model parameters to model") model_dict = paddle.load(self.ckpt_path) self.model.set_state_dict(model_dict) - logger.info("create the model instance success") + logger.debug("create the model instance success") @paddle.no_grad() def infer(self, model_type: str): @@ -349,14 +349,14 @@ class VectorExecutor(BaseExecutor): # stage 0: get the feat and length from _inputs feats = self._inputs["feats"] lengths = self._inputs["lengths"] - logger.info("start to do backbone network model forward") - logger.info( + logger.debug("start to do backbone network model forward") + logger.debug( f"feats shape:{feats.shape}, lengths shape: {lengths.shape}") # stage 1: get the audio embedding # embedding from (1, emb_size, 1) -> (emb_size) embedding = self.model.backbone(feats, lengths).squeeze().numpy() - logger.info(f"embedding size: {embedding.shape}") + logger.debug(f"embedding size: {embedding.shape}") # stage 2: put the embedding and dim info to _outputs property # the embedding type is numpy.array @@ -380,12 +380,13 @@ class VectorExecutor(BaseExecutor): """ 
audio_file = input_file if isinstance(audio_file, (str, os.PathLike)): - logger.info(f"Preprocess audio file: {audio_file}") + logger.debug(f"Preprocess audio file: {audio_file}") # stage 1: load the audio sample points # Note: this process must match the training process waveform, sr = load_audio(audio_file) - logger.info(f"load the audio sample points, shape is: {waveform.shape}") + logger.debug( + f"load the audio sample points, shape is: {waveform.shape}") # stage 2: get the audio feat # Note: Now we only support fbank feature @@ -396,9 +397,9 @@ class VectorExecutor(BaseExecutor): n_mels=self.config.n_mels, window_size=self.config.window_size, hop_length=self.config.hop_size) - logger.info(f"extract the audio feat, shape is: {feat.shape}") + logger.debug(f"extract the audio feat, shape is: {feat.shape}") except Exception as e: - logger.info(f"feat occurs exception {e}") + logger.debug(f"feat occurs exception {e}") sys.exit(-1) feat = paddle.to_tensor(feat).unsqueeze(0) @@ -411,11 +412,11 @@ class VectorExecutor(BaseExecutor): # stage 4: store the feat and length in the _inputs, # which will be used in other function - logger.info(f"feats shape: {feat.shape}") + logger.debug(f"feats shape: {feat.shape}") self._inputs["feats"] = feat self._inputs["lengths"] = lengths - logger.info("audio extract the feat success") + logger.debug("audio extract the feat success") def _check(self, audio_file: str, sample_rate: int): """Check if the model sample match the audio sample rate @@ -441,7 +442,7 @@ class VectorExecutor(BaseExecutor): logger.error("Please input the right audio file path") return False - logger.info("checking the aduio file format......") + logger.debug("checking the aduio file format......") try: audio, audio_sample_rate = soundfile.read( audio_file, dtype="float32", always_2d=True) @@ -458,7 +459,7 @@ class VectorExecutor(BaseExecutor): ") return False - logger.info(f"The sample rate is {audio_sample_rate}") + logger.debug(f"The sample rate is 
{audio_sample_rate}") if audio_sample_rate != self.sample_rate: logger.error("The sample rate of the input file is not {}.\n \ @@ -468,6 +469,6 @@ class VectorExecutor(BaseExecutor): ".format(self.sample_rate, self.sample_rate)) sys.exit(-1) else: - logger.info("The audio file format is right") + logger.debug("The audio file format is right") return True diff --git a/paddlespeech/resource/resource.py b/paddlespeech/resource/resource.py index 70f12b64c2dc5bbf6ef508b41872e0504855d6fb..8e9914b2e13912d34413f92ff042cd1f3cbd95d0 100644 --- a/paddlespeech/resource/resource.py +++ b/paddlespeech/resource/resource.py @@ -60,6 +60,7 @@ class CommonTaskResource: def set_task_model(self, model_tag: str, model_type: int=0, + skip_download: bool=False, version: Optional[str]=None): """Set model tag and version of current task. @@ -83,16 +84,18 @@ class CommonTaskResource: self.version = version self.res_dict = self.pretrained_models[model_tag][version] self._format_path(self.res_dict) - self.res_dir = self._fetch(self.res_dict, - self._get_model_dir(model_type)) + if not skip_download: + self.res_dir = self._fetch(self.res_dict, + self._get_model_dir(model_type)) else: assert self.task == 'tts', 'Vocoder will only be used in tts task.' 
self.voc_model_tag = model_tag self.voc_version = version self.voc_res_dict = self.pretrained_models[model_tag][version] self._format_path(self.voc_res_dict) - self.voc_res_dir = self._fetch(self.voc_res_dict, - self._get_model_dir(model_type)) + if not skip_download: + self.voc_res_dir = self._fetch(self.voc_res_dict, + self._get_model_dir(model_type)) @staticmethod def get_model_class(model_name) -> List[object]: diff --git a/paddlespeech/s2t/exps/deepspeech2/bin/export.py b/paddlespeech/s2t/exps/deepspeech2/bin/export.py index 049e7b688bc595e0e1eace1d0f1179b7c4e5f8ca..8acd46dfce48270f4fe6b0c402ecd91b693f9344 100644 --- a/paddlespeech/s2t/exps/deepspeech2/bin/export.py +++ b/paddlespeech/s2t/exps/deepspeech2/bin/export.py @@ -35,12 +35,6 @@ if __name__ == "__main__": # save jit model to parser.add_argument( "--export_path", type=str, help="path of the jit model to save") - parser.add_argument( - '--nxpu', - type=int, - default=0, - choices=[0, 1], - help="if nxpu == 0 and ngpu == 0, use cpu.") args = parser.parse_args() print_arguments(args) diff --git a/paddlespeech/s2t/exps/deepspeech2/bin/test.py b/paddlespeech/s2t/exps/deepspeech2/bin/test.py index a9828f6e71c8b2303f6ce70948d1175d274c4a77..030168a9ad510f9e90d947b7fa2fed52e697a871 100644 --- a/paddlespeech/s2t/exps/deepspeech2/bin/test.py +++ b/paddlespeech/s2t/exps/deepspeech2/bin/test.py @@ -35,12 +35,6 @@ if __name__ == "__main__": # save asr result to parser.add_argument( "--result_file", type=str, help="path of save the asr result") - parser.add_argument( - '--nxpu', - type=int, - default=0, - choices=[0, 1], - help="if nxpu == 0 and ngpu == 0, use cpu.") args = parser.parse_args() print_arguments(args, globals()) diff --git a/paddlespeech/s2t/exps/deepspeech2/bin/test_export.py b/paddlespeech/s2t/exps/deepspeech2/bin/test_export.py index 8db081e7bbb79c18666432743b53c9c249d86063..d7a9402b9182764ffaa80f22861b08756f61d275 100644 --- a/paddlespeech/s2t/exps/deepspeech2/bin/test_export.py +++ 
b/paddlespeech/s2t/exps/deepspeech2/bin/test_export.py @@ -38,12 +38,6 @@ if __name__ == "__main__": #load jit model from parser.add_argument( "--export_path", type=str, help="path of the jit model to save") - parser.add_argument( - '--nxpu', - type=int, - default=0, - choices=[0, 1], - help="if nxpu == 0 and ngpu == 0, use cpu.") parser.add_argument( "--enable-auto-log", action="store_true", help="use auto log") args = parser.parse_args() diff --git a/paddlespeech/s2t/exps/deepspeech2/bin/train.py b/paddlespeech/s2t/exps/deepspeech2/bin/train.py index fee7079d9abb4d514e756a1ca4bcd8cd4449bc66..2c9942f9b35af051471004c6b09312c619fa87c2 100644 --- a/paddlespeech/s2t/exps/deepspeech2/bin/train.py +++ b/paddlespeech/s2t/exps/deepspeech2/bin/train.py @@ -31,12 +31,6 @@ def main(config, args): if __name__ == "__main__": parser = default_argument_parser() - parser.add_argument( - '--nxpu', - type=int, - default=0, - choices=[0, 1], - help="if nxpu == 0 and ngpu == 0, use cpu.") args = parser.parse_args() print_arguments(args, globals()) diff --git a/paddlespeech/s2t/frontend/augmentor/spec_augment.py b/paddlespeech/s2t/frontend/augmentor/spec_augment.py index e91cfdce42b621934fa25b69cc629ad03c7fec34..380712851e9b0d5fcb031366da91b7233e1c9ec5 100644 --- a/paddlespeech/s2t/frontend/augmentor/spec_augment.py +++ b/paddlespeech/s2t/frontend/augmentor/spec_augment.py @@ -16,7 +16,6 @@ import random import numpy as np from PIL import Image -from PIL.Image import BICUBIC from paddlespeech.s2t.frontend.augmentor.base import AugmentorBase from paddlespeech.s2t.utils.log import Log @@ -164,9 +163,9 @@ class SpecAugmentor(AugmentorBase): window) + 1 # 1 ... 
t - 1 left = Image.fromarray(x[:center]).resize((x.shape[1], warped), - BICUBIC) + Image.BICUBIC) right = Image.fromarray(x[center:]).resize((x.shape[1], t - warped), - BICUBIC) + Image.BICUBIC) if self.inplace: x[:warped] = left x[warped:] = right diff --git a/paddlespeech/s2t/frontend/featurizer/text_featurizer.py b/paddlespeech/s2t/frontend/featurizer/text_featurizer.py index 0c0fa5e2f63b05387cd6ce9af6fb0331c400cfb8..982c6b8fe47e51f3f94de8f47b9a4b6110544052 100644 --- a/paddlespeech/s2t/frontend/featurizer/text_featurizer.py +++ b/paddlespeech/s2t/frontend/featurizer/text_featurizer.py @@ -226,10 +226,10 @@ class TextFeaturizer(): sos_id = vocab_list.index(SOS) if SOS in vocab_list else -1 space_id = vocab_list.index(SPACE) if SPACE in vocab_list else -1 - logger.info(f"BLANK id: {blank_id}") - logger.info(f"UNK id: {unk_id}") - logger.info(f"EOS id: {eos_id}") - logger.info(f"SOS id: {sos_id}") - logger.info(f"SPACE id: {space_id}") - logger.info(f"MASKCTC id: {maskctc_id}") + logger.debug(f"BLANK id: {blank_id}") + logger.debug(f"UNK id: {unk_id}") + logger.debug(f"EOS id: {eos_id}") + logger.debug(f"SOS id: {sos_id}") + logger.debug(f"SPACE id: {space_id}") + logger.debug(f"MASKCTC id: {maskctc_id}") return token2id, id2token, vocab_list, unk_id, eos_id, blank_id diff --git a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py index e3d0edb7f0ce0884d9c66ccd992eb21a52a7ab74..100aca18b7dca8bafb2d2a03ffc7391d15b434f5 100644 --- a/paddlespeech/s2t/models/u2/u2.py +++ b/paddlespeech/s2t/models/u2/u2.py @@ -827,7 +827,7 @@ class U2Model(U2DecodeModel): # encoder encoder_type = configs.get('encoder', 'transformer') - logger.info(f"U2 Encoder type: {encoder_type}") + logger.debug(f"U2 Encoder type: {encoder_type}") if encoder_type == 'transformer': encoder = TransformerEncoder( input_dim, global_cmvn=global_cmvn, **configs['encoder_conf']) @@ -894,7 +894,7 @@ class U2Model(U2DecodeModel): if checkpoint_path: infos = 
checkpoint.Checkpoint().load_parameters( model, checkpoint_path=checkpoint_path) - logger.info(f"checkpoint info: {infos}") + logger.debug(f"checkpoint info: {infos}") layer_tools.summary(model) return model diff --git a/paddlespeech/s2t/modules/loss.py b/paddlespeech/s2t/modules/loss.py index c7d9bd45dd2bf005a575098456c435a173678d26..884fb70c10ffa024833c61cf35b413e446cbf9d0 100644 --- a/paddlespeech/s2t/modules/loss.py +++ b/paddlespeech/s2t/modules/loss.py @@ -37,9 +37,9 @@ class CTCLoss(nn.Layer): self.loss = nn.CTCLoss(blank=blank, reduction=reduction) self.batch_average = batch_average - logger.info( + logger.debug( f"CTCLoss Loss reduction: {reduction}, div-bs: {batch_average}") - logger.info(f"CTCLoss Grad Norm Type: {grad_norm_type}") + logger.debug(f"CTCLoss Grad Norm Type: {grad_norm_type}") assert grad_norm_type in ('instance', 'batch', 'frame', None) self.norm_by_times = False @@ -70,7 +70,8 @@ class CTCLoss(nn.Layer): param = {} self._kwargs = {k: v for k, v in kwargs.items() if k in param} _notin = {k: v for k, v in kwargs.items() if k not in param} - logger.info(f"{self.loss} kwargs:{self._kwargs}, not support: {_notin}") + logger.debug( + f"{self.loss} kwargs:{self._kwargs}, not support: {_notin}") def forward(self, logits, ys_pad, hlens, ys_lens): """Compute CTC loss. diff --git a/paddlespeech/s2t/training/cli.py b/paddlespeech/s2t/training/cli.py index bb85732a6f7f33ee9c5f2f7febabcd7912b78374..1b6bec8a801ecf739fc0efd9ce827df97b1c1591 100644 --- a/paddlespeech/s2t/training/cli.py +++ b/paddlespeech/s2t/training/cli.py @@ -82,6 +82,12 @@ def default_argument_parser(parser=None): type=int, default=1, help="number of parallel processes. 
0 for cpu.") + train_group.add_argument( + '--nxpu', + type=int, + default=0, + choices=[0, 1], + help="if nxpu == 0 and ngpu == 0, use cpu.") train_group.add_argument( "--config", metavar="CONFIG_FILE", help="config file.") train_group.add_argument( diff --git a/paddlespeech/s2t/utils/tensor_utils.py b/paddlespeech/s2t/utils/tensor_utils.py index f9a843ea15c58bf6351e141d16397429674f5eef..422d4f82a9a6a7421d4fab2a773b43f5ca31410b 100644 --- a/paddlespeech/s2t/utils/tensor_utils.py +++ b/paddlespeech/s2t/utils/tensor_utils.py @@ -94,7 +94,7 @@ def pad_sequence(sequences: List[paddle.Tensor], for i, tensor in enumerate(sequences): length = tensor.shape[0] # use index notation to prevent duplicate references to the tensor - logger.info( + logger.debug( f"length {length}, out_tensor {out_tensor.shape}, tensor {tensor.shape}" ) if batch_first: diff --git a/paddlespeech/server/bin/paddlespeech_client.py b/paddlespeech/server/bin/paddlespeech_client.py index fb521b309776e886101d67fc514da366a0d8950b..bd1186dfb1762f2a872783a42f3b428c1362ad43 100644 --- a/paddlespeech/server/bin/paddlespeech_client.py +++ b/paddlespeech/server/bin/paddlespeech_client.py @@ -123,7 +123,6 @@ class TTSClientExecutor(BaseExecutor): time_end = time.time() time_consume = time_end - time_start response_dict = res.json() - logger.info(response_dict["message"]) logger.info("Save synthesized audio successfully on %s." % (output)) logger.info("Audio duration: %f s." % (response_dict['result']['duration'])) @@ -702,7 +701,6 @@ class VectorClientExecutor(BaseExecutor): test_audio=args.test, task=task) time_end = time.time() - logger.info(f"The vector: {res}") logger.info("Response time %f s." 
% (time_end - time_start)) return True except Exception as e: diff --git a/paddlespeech/server/engine/acs/python/acs_engine.py b/paddlespeech/server/engine/acs/python/acs_engine.py index 930101ac91a8d236947de2f2e409507bbf90a40c..63964a82550c754f58209a5c966da78110210b44 100644 --- a/paddlespeech/server/engine/acs/python/acs_engine.py +++ b/paddlespeech/server/engine/acs/python/acs_engine.py @@ -30,7 +30,7 @@ class ACSEngine(BaseEngine): """The ACSEngine Engine """ super(ACSEngine, self).__init__() - logger.info("Create the ACSEngine Instance") + logger.debug("Create the ACSEngine Instance") self.word_list = [] def init(self, config: dict): @@ -42,7 +42,7 @@ class ACSEngine(BaseEngine): Returns: bool: The engine instance flag """ - logger.info("Init the acs engine") + logger.debug("Init the acs engine") try: self.config = config self.device = self.config.get("device", paddle.get_device()) @@ -50,7 +50,7 @@ class ACSEngine(BaseEngine): # websocket default ping timeout is 20 seconds self.ping_timeout = self.config.get("ping_timeout", 20) paddle.set_device(self.device) - logger.info(f"ACS Engine set the device: {self.device}") + logger.debug(f"ACS Engine set the device: {self.device}") except BaseException as e: logger.error( @@ -66,7 +66,9 @@ class ACSEngine(BaseEngine): self.url = "ws://" + self.config.asr_server_ip + ":" + str( self.config.asr_server_port) + "/paddlespeech/asr/streaming" - logger.info("Init the acs engine successfully") + logger.info("Initialize acs server engine successfully on device: %s." 
% + (self.device)) + return True def read_search_words(self): @@ -95,12 +97,12 @@ class ACSEngine(BaseEngine): Returns: _type_: _description_ """ - logger.info("send a message to the server") + logger.debug("send a message to the server") if self.url is None: logger.error("No asr server, please input valid ip and port") return "" ws = websocket.WebSocket() - logger.info(f"set the ping timeout: {self.ping_timeout} seconds") + logger.debug(f"set the ping timeout: {self.ping_timeout} seconds") ws.connect(self.url, ping_timeout=self.ping_timeout) audio_info = json.dumps( { @@ -123,7 +125,7 @@ class ACSEngine(BaseEngine): logger.info(f"audio result: {msg}") # 3. send chunk audio data to engine - logger.info("send the end signal") + logger.debug("send the end signal") audio_info = json.dumps( { "name": "test.wav", @@ -197,7 +199,7 @@ class ACSEngine(BaseEngine): start = max(time_stamp[m.start(0)]['bg'] - offset, 0) end = min(time_stamp[m.end(0) - 1]['ed'] + offset, max_ed) - logger.info(f'start: {start}, end: {end}') + logger.debug(f'start: {start}, end: {end}') acs_result.append({'w': w, 'bg': start, 'ed': end}) return acs_result, asr_result @@ -212,7 +214,7 @@ class ACSEngine(BaseEngine): Returns: acs_result, asr_result: the acs result and the asr result """ - logger.info("start to process the audio content search") + logger.debug("start to process the audio content search") msg = self.get_asr_content(io.BytesIO(audio_data)) acs_result, asr_result = self.get_macthed_word(msg) diff --git a/paddlespeech/server/engine/asr/online/onnx/asr_engine.py b/paddlespeech/server/engine/asr/online/onnx/asr_engine.py index cb743ea20a6053a227a0e6f8705634a3613d9049..ab4f113056ba7f635012f5e486c8d88fd083f9ca 100644 --- a/paddlespeech/server/engine/asr/online/onnx/asr_engine.py +++ b/paddlespeech/server/engine/asr/online/onnx/asr_engine.py @@ -44,7 +44,7 @@ class PaddleASRConnectionHanddler: asr_engine (ASREngine): the global asr engine """ super().__init__() - logger.info( + 
logger.debug( "create an paddle asr connection handler to process the websocket connection" ) self.config = asr_engine.config # server config @@ -152,12 +152,12 @@ class PaddleASRConnectionHanddler: self.output_reset() def extract_feat(self, samples: ByteString): - logger.info("Online ASR extract the feat") + logger.debug("Online ASR extract the feat") samples = np.frombuffer(samples, dtype=np.int16) assert samples.ndim == 1 self.num_samples += samples.shape[0] - logger.info( + logger.debug( f"This package receive {samples.shape[0]} pcm data. Global samples:{self.num_samples}" ) @@ -168,7 +168,7 @@ class PaddleASRConnectionHanddler: else: assert self.remained_wav.ndim == 1 # (T,) self.remained_wav = np.concatenate([self.remained_wav, samples]) - logger.info( + logger.debug( f"The concatenation of remain and now audio samples length is: {self.remained_wav.shape}" ) @@ -202,14 +202,14 @@ class PaddleASRConnectionHanddler: # update remained wav self.remained_wav = self.remained_wav[self.n_shift * num_frames:] - logger.info( + logger.debug( f"process the audio feature success, the cached feat shape: {self.cached_feat.shape}" ) - logger.info( + logger.debug( f"After extract feat, the cached remain the audio samples: {self.remained_wav.shape}" ) - logger.info(f"global samples: {self.num_samples}") - logger.info(f"global frames: {self.num_frames}") + logger.debug(f"global samples: {self.num_samples}") + logger.debug(f"global frames: {self.num_frames}") def decode(self, is_finished=False): """advance decoding @@ -237,7 +237,7 @@ class PaddleASRConnectionHanddler: return num_frames = self.cached_feat.shape[1] - logger.info( + logger.debug( f"Required decoding window {decoding_window} frames, and the connection has {num_frames} frames" ) @@ -355,7 +355,7 @@ class ASRServerExecutor(ASRExecutor): lm_url = self.task_resource.res_dict['lm_url'] lm_md5 = self.task_resource.res_dict['lm_md5'] - logger.info(f"Start to load language model {lm_url}") + logger.debug(f"Start to load 
language model {lm_url}") self.download_lm( lm_url, os.path.dirname(self.config.decode.lang_model_path), lm_md5) @@ -367,7 +367,7 @@ class ASRServerExecutor(ASRExecutor): if "deepspeech2" in self.model_type: # AM predictor - logger.info("ASR engine start to init the am predictor") + logger.debug("ASR engine start to init the am predictor") self.am_predictor = onnx_infer.get_sess( model_path=self.am_model, sess_conf=self.am_predictor_conf) else: @@ -400,7 +400,7 @@ class ASRServerExecutor(ASRExecutor): self.num_decoding_left_chunks = num_decoding_left_chunks # conf for paddleinference predictor or onnx self.am_predictor_conf = am_predictor_conf - logger.info(f"model_type: {self.model_type}") + logger.debug(f"model_type: {self.model_type}") sample_rate_str = '16k' if sample_rate == 16000 else '8k' tag = model_type + '-' + lang + '-' + sample_rate_str @@ -422,12 +422,11 @@ class ASRServerExecutor(ASRExecutor): # self.res_path, self.task_resource.res_dict[ # 'params']) if am_params is None else os.path.abspath(am_params) - logger.info("Load the pretrained model:") - logger.info(f" tag = {tag}") - logger.info(f" res_path: {self.res_path}") - logger.info(f" cfg path: {self.cfg_path}") - logger.info(f" am_model path: {self.am_model}") - # logger.info(f" am_params path: {self.am_params}") + logger.debug("Load the pretrained model:") + logger.debug(f" tag = {tag}") + logger.debug(f" res_path: {self.res_path}") + logger.debug(f" cfg path: {self.cfg_path}") + logger.debug(f" am_model path: {self.am_model}") #Init body. 
self.config = CfgNode(new_allowed=True) @@ -436,7 +435,7 @@ class ASRServerExecutor(ASRExecutor): if self.config.spm_model_prefix: self.config.spm_model_prefix = os.path.join( self.res_path, self.config.spm_model_prefix) - logger.info(f"spm model path: {self.config.spm_model_prefix}") + logger.debug(f"spm model path: {self.config.spm_model_prefix}") self.vocab = self.config.vocab_filepath @@ -450,7 +449,7 @@ class ASRServerExecutor(ASRExecutor): # AM predictor self.init_model() - logger.info(f"create the {model_type} model success") + logger.debug(f"create the {model_type} model success") return True @@ -501,7 +500,7 @@ class ASREngine(BaseEngine): "If all GPU or XPU is used, you can set the server to 'cpu'") sys.exit(-1) - logger.info(f"paddlespeech_server set the device: {self.device}") + logger.debug(f"paddlespeech_server set the device: {self.device}") if not self.init_model(): logger.error( @@ -509,7 +508,8 @@ class ASREngine(BaseEngine): ) return False - logger.info("Initialize ASR server engine successfully.") + logger.info("Initialize ASR server engine successfully on device: %s." 
% + (self.device)) return True def new_handler(self): diff --git a/paddlespeech/server/engine/asr/online/paddleinference/asr_engine.py b/paddlespeech/server/engine/asr/online/paddleinference/asr_engine.py index bcd0fa7fa0d3e996d2718ce2956415a9e458bb3b..182e64180709760dbd1639cbcb4a56537dd45aa5 100644 --- a/paddlespeech/server/engine/asr/online/paddleinference/asr_engine.py +++ b/paddlespeech/server/engine/asr/online/paddleinference/asr_engine.py @@ -44,7 +44,7 @@ class PaddleASRConnectionHanddler: asr_engine (ASREngine): the global asr engine """ super().__init__() - logger.info( + logger.debug( "create an paddle asr connection handler to process the websocket connection" ) self.config = asr_engine.config # server config @@ -157,7 +157,7 @@ class PaddleASRConnectionHanddler: assert samples.ndim == 1 self.num_samples += samples.shape[0] - logger.info( + logger.debug( f"This package receive {samples.shape[0]} pcm data. Global samples:{self.num_samples}" ) @@ -168,7 +168,7 @@ class PaddleASRConnectionHanddler: else: assert self.remained_wav.ndim == 1 # (T,) self.remained_wav = np.concatenate([self.remained_wav, samples]) - logger.info( + logger.debug( f"The concatenation of remain and now audio samples length is: {self.remained_wav.shape}" ) @@ -202,14 +202,14 @@ class PaddleASRConnectionHanddler: # update remained wav self.remained_wav = self.remained_wav[self.n_shift * num_frames:] - logger.info( + logger.debug( f"process the audio feature success, the cached feat shape: {self.cached_feat.shape}" ) - logger.info( + logger.debug( f"After extract feat, the cached remain the audio samples: {self.remained_wav.shape}" ) - logger.info(f"global samples: {self.num_samples}") - logger.info(f"global frames: {self.num_frames}") + logger.debug(f"global samples: {self.num_samples}") + logger.debug(f"global frames: {self.num_frames}") def decode(self, is_finished=False): """advance decoding @@ -237,13 +237,13 @@ class PaddleASRConnectionHanddler: return num_frames = 
self.cached_feat.shape[1] - logger.info( + logger.debug( f"Required decoding window {decoding_window} frames, and the connection has {num_frames} frames" ) # the cached feat must be larger decoding_window if num_frames < decoding_window and not is_finished: - logger.info( + logger.debug( f"frame feat num is less than {decoding_window}, please input more pcm data" ) return None, None @@ -294,7 +294,7 @@ class PaddleASRConnectionHanddler: Returns: logprob: poster probability. """ - logger.info("start to decoce one chunk for deepspeech2") + logger.debug("start to decoce one chunk for deepspeech2") input_names = self.am_predictor.get_input_names() audio_handle = self.am_predictor.get_input_handle(input_names[0]) audio_len_handle = self.am_predictor.get_input_handle(input_names[1]) @@ -369,7 +369,7 @@ class ASRServerExecutor(ASRExecutor): lm_url = self.task_resource.res_dict['lm_url'] lm_md5 = self.task_resource.res_dict['lm_md5'] - logger.info(f"Start to load language model {lm_url}") + logger.debug(f"Start to load language model {lm_url}") self.download_lm( lm_url, os.path.dirname(self.config.decode.lang_model_path), lm_md5) @@ -381,7 +381,7 @@ class ASRServerExecutor(ASRExecutor): if "deepspeech2" in self.model_type: # AM predictor - logger.info("ASR engine start to init the am predictor") + logger.debug("ASR engine start to init the am predictor") self.am_predictor = init_predictor( model_file=self.am_model, params_file=self.am_params, @@ -415,7 +415,7 @@ class ASRServerExecutor(ASRExecutor): self.num_decoding_left_chunks = num_decoding_left_chunks # conf for paddleinference predictor or onnx self.am_predictor_conf = am_predictor_conf - logger.info(f"model_type: {self.model_type}") + logger.debug(f"model_type: {self.model_type}") sample_rate_str = '16k' if sample_rate == 16000 else '8k' tag = model_type + '-' + lang + '-' + sample_rate_str @@ -437,12 +437,12 @@ class ASRServerExecutor(ASRExecutor): self.res_path = os.path.dirname( 
os.path.dirname(os.path.abspath(self.cfg_path))) - logger.info("Load the pretrained model:") - logger.info(f" tag = {tag}") - logger.info(f" res_path: {self.res_path}") - logger.info(f" cfg path: {self.cfg_path}") - logger.info(f" am_model path: {self.am_model}") - logger.info(f" am_params path: {self.am_params}") + logger.debug("Load the pretrained model:") + logger.debug(f" tag = {tag}") + logger.debug(f" res_path: {self.res_path}") + logger.debug(f" cfg path: {self.cfg_path}") + logger.debug(f" am_model path: {self.am_model}") + logger.debug(f" am_params path: {self.am_params}") #Init body. self.config = CfgNode(new_allowed=True) @@ -451,7 +451,7 @@ class ASRServerExecutor(ASRExecutor): if self.config.spm_model_prefix: self.config.spm_model_prefix = os.path.join( self.res_path, self.config.spm_model_prefix) - logger.info(f"spm model path: {self.config.spm_model_prefix}") + logger.debug(f"spm model path: {self.config.spm_model_prefix}") self.vocab = self.config.vocab_filepath @@ -465,7 +465,7 @@ class ASRServerExecutor(ASRExecutor): # AM predictor self.init_model() - logger.info(f"create the {model_type} model success") + logger.debug(f"create the {model_type} model success") return True @@ -516,7 +516,7 @@ class ASREngine(BaseEngine): "If all GPU or XPU is used, you can set the server to 'cpu'") sys.exit(-1) - logger.info(f"paddlespeech_server set the device: {self.device}") + logger.debug(f"paddlespeech_server set the device: {self.device}") if not self.init_model(): logger.error( @@ -524,7 +524,9 @@ class ASREngine(BaseEngine): ) return False - logger.info("Initialize ASR server engine successfully.") + logger.info("Initialize ASR server engine successfully on device: %s." 
% + (self.device)) + return True def new_handler(self): diff --git a/paddlespeech/server/engine/asr/online/python/asr_engine.py b/paddlespeech/server/engine/asr/online/python/asr_engine.py index 2ffbba990b601fcdaa2807e5d487b9176818258d..2bacfecd6e7f4529b8b1a5ac9dde6841b9b776f6 100644 --- a/paddlespeech/server/engine/asr/online/python/asr_engine.py +++ b/paddlespeech/server/engine/asr/online/python/asr_engine.py @@ -49,7 +49,7 @@ class PaddleASRConnectionHanddler: asr_engine (ASREngine): the global asr engine """ super().__init__() - logger.info( + logger.debug( "create an paddle asr connection handler to process the websocket connection" ) self.config = asr_engine.config # server config @@ -107,7 +107,7 @@ class PaddleASRConnectionHanddler: # acoustic model self.model = self.asr_engine.executor.model self.continuous_decoding = self.config.continuous_decoding - logger.info(f"continue decoding: {self.continuous_decoding}") + logger.debug(f"continue decoding: {self.continuous_decoding}") # ctc decoding config self.ctc_decode_config = self.asr_engine.executor.config.decode @@ -207,7 +207,7 @@ class PaddleASRConnectionHanddler: assert samples.ndim == 1 self.num_samples += samples.shape[0] - logger.info( + logger.debug( f"This package receive {samples.shape[0]} pcm data. 
Global samples:{self.num_samples}" ) @@ -218,7 +218,7 @@ class PaddleASRConnectionHanddler: else: assert self.remained_wav.ndim == 1 # (T,) self.remained_wav = np.concatenate([self.remained_wav, samples]) - logger.info( + logger.debug( f"The concatenation of remain and now audio samples length is: {self.remained_wav.shape}" ) @@ -252,14 +252,14 @@ class PaddleASRConnectionHanddler: # update remained wav self.remained_wav = self.remained_wav[self.n_shift * num_frames:] - logger.info( + logger.debug( f"process the audio feature success, the cached feat shape: {self.cached_feat.shape}" ) - logger.info( + logger.debug( f"After extract feat, the cached remain the audio samples: {self.remained_wav.shape}" ) - logger.info(f"global samples: {self.num_samples}") - logger.info(f"global frames: {self.num_frames}") + logger.debug(f"global samples: {self.num_samples}") + logger.debug(f"global frames: {self.num_frames}") def decode(self, is_finished=False): """advance decoding @@ -283,24 +283,24 @@ class PaddleASRConnectionHanddler: stride = subsampling * decoding_chunk_size if self.cached_feat is None: - logger.info("no audio feat, please input more pcm data") + logger.debug("no audio feat, please input more pcm data") return num_frames = self.cached_feat.shape[1] - logger.info( + logger.debug( f"Required decoding window {decoding_window} frames, and the connection has {num_frames} frames" ) # the cached feat must be larger decoding_window if num_frames < decoding_window and not is_finished: - logger.info( + logger.debug( f"frame feat num is less than {decoding_window}, please input more pcm data" ) return None, None # if is_finished=True, we need at least context frames if num_frames < context: - logger.info( + logger.debug( "flast {num_frames} is less than context {context} frames, and we cannot do model forward" ) return None, None @@ -354,7 +354,7 @@ class PaddleASRConnectionHanddler: Returns: logprob: poster probability. 
""" - logger.info("start to decoce one chunk for deepspeech2") + logger.debug("start to decoce one chunk for deepspeech2") input_names = self.am_predictor.get_input_names() audio_handle = self.am_predictor.get_input_handle(input_names[0]) audio_len_handle = self.am_predictor.get_input_handle(input_names[1]) @@ -391,7 +391,7 @@ class PaddleASRConnectionHanddler: self.decoder.next(output_chunk_probs, output_chunk_lens) trans_best, trans_beam = self.decoder.decode() - logger.info(f"decode one best result for deepspeech2: {trans_best[0]}") + logger.debug(f"decode one best result for deepspeech2: {trans_best[0]}") return trans_best[0] @paddle.no_grad() @@ -402,7 +402,7 @@ class PaddleASRConnectionHanddler: # reset endpiont state self.endpoint_state = False - logger.info( + logger.debug( "Conformer/Transformer: start to decode with advanced_decoding method" ) cfg = self.ctc_decode_config @@ -427,25 +427,25 @@ class PaddleASRConnectionHanddler: stride = subsampling * decoding_chunk_size if self.cached_feat is None: - logger.info("no audio feat, please input more pcm data") + logger.debug("no audio feat, please input more pcm data") return # (B=1,T,D) num_frames = self.cached_feat.shape[1] - logger.info( + logger.debug( f"Required decoding window {decoding_window} frames, and the connection has {num_frames} frames" ) # the cached feat must be larger decoding_window if num_frames < decoding_window and not is_finished: - logger.info( + logger.debug( f"frame feat num is less than {decoding_window}, please input more pcm data" ) return None, None # if is_finished=True, we need at least context frames if num_frames < context: - logger.info( + logger.debug( "flast {num_frames} is less than context {context} frames, and we cannot do model forward" ) return None, None @@ -489,7 +489,7 @@ class PaddleASRConnectionHanddler: self.encoder_out = ys else: self.encoder_out = paddle.concat([self.encoder_out, ys], axis=1) - logger.info( + logger.debug( f"This connection handler encoder out 
shape: {self.encoder_out.shape}" ) @@ -513,7 +513,8 @@ class PaddleASRConnectionHanddler: if self.endpointer.endpoint_detected(ctc_probs.numpy(), decoding_something): self.endpoint_state = True - logger.info(f"Endpoint is detected at {self.num_frames} frame.") + logger.debug( + f"Endpoint is detected at {self.num_frames} frame.") # advance cache of feat assert self.cached_feat.shape[0] == 1 #(B=1,T,D) @@ -526,7 +527,7 @@ class PaddleASRConnectionHanddler: def update_result(self): """Conformer/Transformer hyps to result. """ - logger.info("update the final result") + logger.debug("update the final result") hyps = self.hyps # output results and tokenids @@ -560,16 +561,16 @@ class PaddleASRConnectionHanddler: only for conformer and transformer model. """ if "deepspeech2" in self.model_type: - logger.info("deepspeech2 not support rescoring decoding.") + logger.debug("deepspeech2 not support rescoring decoding.") return if "attention_rescoring" != self.ctc_decode_config.decoding_method: - logger.info( + logger.debug( f"decoding method not match: {self.ctc_decode_config.decoding_method}, need attention_rescoring" ) return - logger.info("rescoring the final result") + logger.debug("rescoring the final result") # last decoding for last audio self.searcher.finalize_search() @@ -685,7 +686,6 @@ class PaddleASRConnectionHanddler: "bg": global_offset_in_sec + start, "ed": global_offset_in_sec + end }) - # logger.info(f"{word_time_stamp[-1]}") self.word_time_stamp = word_time_stamp logger.info(f"word time stamp: {self.word_time_stamp}") @@ -707,13 +707,13 @@ class ASRServerExecutor(ASRExecutor): lm_url = self.task_resource.res_dict['lm_url'] lm_md5 = self.task_resource.res_dict['lm_md5'] - logger.info(f"Start to load language model {lm_url}") + logger.debug(f"Start to load language model {lm_url}") self.download_lm( lm_url, os.path.dirname(self.config.decode.lang_model_path), lm_md5) elif "conformer" in self.model_type or "transformer" in self.model_type: with 
UpdateConfig(self.config): - logger.info("start to create the stream conformer asr engine") + logger.debug("start to create the stream conformer asr engine") # update the decoding method if self.decode_method: self.config.decode.decoding_method = self.decode_method @@ -726,7 +726,7 @@ class ASRServerExecutor(ASRExecutor): if self.config.decode.decoding_method not in [ "ctc_prefix_beam_search", "attention_rescoring" ]: - logger.info( + logger.debug( "we set the decoding_method to attention_rescoring") self.config.decode.decoding_method = "attention_rescoring" @@ -739,7 +739,7 @@ class ASRServerExecutor(ASRExecutor): def init_model(self) -> None: if "deepspeech2" in self.model_type: # AM predictor - logger.info("ASR engine start to init the am predictor") + logger.debug("ASR engine start to init the am predictor") self.am_predictor = init_predictor( model_file=self.am_model, params_file=self.am_params, @@ -748,7 +748,7 @@ class ASRServerExecutor(ASRExecutor): # load model # model_type: {model_name}_{dataset} model_name = self.model_type[:self.model_type.rindex('_')] - logger.info(f"model name: {model_name}") + logger.debug(f"model name: {model_name}") model_class = self.task_resource.get_model_class(model_name) model = model_class.from_config(self.config) self.model = model @@ -782,7 +782,7 @@ class ASRServerExecutor(ASRExecutor): self.num_decoding_left_chunks = num_decoding_left_chunks # conf for paddleinference predictor or onnx self.am_predictor_conf = am_predictor_conf - logger.info(f"model_type: {self.model_type}") + logger.debug(f"model_type: {self.model_type}") sample_rate_str = '16k' if sample_rate == 16000 else '8k' tag = model_type + '-' + lang + '-' + sample_rate_str @@ -804,12 +804,12 @@ class ASRServerExecutor(ASRExecutor): self.res_path = os.path.dirname( os.path.dirname(os.path.abspath(self.cfg_path))) - logger.info("Load the pretrained model:") - logger.info(f" tag = {tag}") - logger.info(f" res_path: {self.res_path}") - logger.info(f" cfg path: 
{self.cfg_path}") - logger.info(f" am_model path: {self.am_model}") - logger.info(f" am_params path: {self.am_params}") + logger.debug("Load the pretrained model:") + logger.debug(f" tag = {tag}") + logger.debug(f" res_path: {self.res_path}") + logger.debug(f" cfg path: {self.cfg_path}") + logger.debug(f" am_model path: {self.am_model}") + logger.debug(f" am_params path: {self.am_params}") #Init body. self.config = CfgNode(new_allowed=True) @@ -818,7 +818,7 @@ class ASRServerExecutor(ASRExecutor): if self.config.spm_model_prefix: self.config.spm_model_prefix = os.path.join( self.res_path, self.config.spm_model_prefix) - logger.info(f"spm model path: {self.config.spm_model_prefix}") + logger.debug(f"spm model path: {self.config.spm_model_prefix}") self.vocab = self.config.vocab_filepath @@ -832,7 +832,7 @@ class ASRServerExecutor(ASRExecutor): # AM predictor self.init_model() - logger.info(f"create the {model_type} model success") + logger.debug(f"create the {model_type} model success") return True @@ -883,7 +883,7 @@ class ASREngine(BaseEngine): "If all GPU or XPU is used, you can set the server to 'cpu'") sys.exit(-1) - logger.info(f"paddlespeech_server set the device: {self.device}") + logger.debug(f"paddlespeech_server set the device: {self.device}") if not self.init_model(): logger.error( @@ -891,7 +891,9 @@ class ASREngine(BaseEngine): ) return False - logger.info("Initialize ASR server engine successfully.") + logger.info("Initialize ASR server engine successfully on device: %s." 
% + (self.device)) + return True def new_handler(self): diff --git a/paddlespeech/server/engine/asr/paddleinference/asr_engine.py b/paddlespeech/server/engine/asr/paddleinference/asr_engine.py index 572004eb8a7b707563ebceaefe58b98e68cfd12f..6df666ce8090703e0727827e07e0193ffc14cffe 100644 --- a/paddlespeech/server/engine/asr/paddleinference/asr_engine.py +++ b/paddlespeech/server/engine/asr/paddleinference/asr_engine.py @@ -65,10 +65,10 @@ class ASRServerExecutor(ASRExecutor): self.task_resource.res_dict['model']) self.am_params = os.path.join(self.res_path, self.task_resource.res_dict['params']) - logger.info(self.res_path) - logger.info(self.cfg_path) - logger.info(self.am_model) - logger.info(self.am_params) + logger.debug(self.res_path) + logger.debug(self.cfg_path) + logger.debug(self.am_model) + logger.debug(self.am_params) else: self.cfg_path = os.path.abspath(cfg_path) self.am_model = os.path.abspath(am_model) @@ -236,16 +236,16 @@ class PaddleASRConnectionHandler(ASRServerExecutor): if self._check( io.BytesIO(audio_data), self.asr_engine.config.sample_rate, self.asr_engine.config.force_yes): - logger.info("start running asr engine") + logger.debug("start running asr engine") self.preprocess(self.asr_engine.config.model_type, io.BytesIO(audio_data)) st = time.time() self.infer(self.asr_engine.config.model_type) infer_time = time.time() - st self.output = self.postprocess() # Retrieve result of asr. 
- logger.info("end inferring asr engine") + logger.debug("end inferring asr engine") else: - logger.info("file check failed!") + logger.error("file check failed!") self.output = None logger.info("inference time: {}".format(infer_time)) diff --git a/paddlespeech/server/engine/asr/python/asr_engine.py b/paddlespeech/server/engine/asr/python/asr_engine.py index f9cc3a6650cdaff91fdf5c52ffa285aa4d7f2d16..02c40fd128236b45c75db56378654a1dd4d1ae26 100644 --- a/paddlespeech/server/engine/asr/python/asr_engine.py +++ b/paddlespeech/server/engine/asr/python/asr_engine.py @@ -104,7 +104,7 @@ class PaddleASRConnectionHandler(ASRServerExecutor): if self._check( io.BytesIO(audio_data), self.asr_engine.config.sample_rate, self.asr_engine.config.force_yes): - logger.info("start run asr engine") + logger.debug("start run asr engine") self.preprocess(self.asr_engine.config.model, io.BytesIO(audio_data)) st = time.time() @@ -112,7 +112,7 @@ class PaddleASRConnectionHandler(ASRServerExecutor): infer_time = time.time() - st self.output = self.postprocess() # Retrieve result of asr. 
else: - logger.info("file check failed!") + logger.error("file check failed!") self.output = None logger.info("inference time: {}".format(infer_time)) diff --git a/paddlespeech/server/engine/cls/paddleinference/cls_engine.py b/paddlespeech/server/engine/cls/paddleinference/cls_engine.py index 389d56055ba617d1628b87e52aaf7301e9928c29..fa62ba67c7c4d07735ba87a7652d8b8e8387bffb 100644 --- a/paddlespeech/server/engine/cls/paddleinference/cls_engine.py +++ b/paddlespeech/server/engine/cls/paddleinference/cls_engine.py @@ -67,22 +67,22 @@ class CLSServerExecutor(CLSExecutor): self.params_path = os.path.abspath(params_path) self.label_file = os.path.abspath(label_file) - logger.info(self.cfg_path) - logger.info(self.model_path) - logger.info(self.params_path) - logger.info(self.label_file) + logger.debug(self.cfg_path) + logger.debug(self.model_path) + logger.debug(self.params_path) + logger.debug(self.label_file) # config with open(self.cfg_path, 'r') as f: self._conf = yaml.safe_load(f) - logger.info("Read cfg file successfully.") + logger.debug("Read cfg file successfully.") # labels self._label_list = [] with open(self.label_file, 'r') as f: for line in f: self._label_list.append(line.strip()) - logger.info("Read label file successfully.") + logger.debug("Read label file successfully.") # Create predictor self.predictor_conf = predictor_conf @@ -90,7 +90,7 @@ class CLSServerExecutor(CLSExecutor): model_file=self.model_path, params_file=self.params_path, predictor_conf=self.predictor_conf) - logger.info("Create predictor successfully.") + logger.debug("Create predictor successfully.") @paddle.no_grad() def infer(self): @@ -148,7 +148,8 @@ class CLSEngine(BaseEngine): logger.error(e) return False - logger.info("Initialize CLS server engine successfully.") + logger.info("Initialize CLS server engine successfully on device: %s." 
% + (self.device)) return True @@ -160,7 +161,7 @@ class PaddleCLSConnectionHandler(CLSServerExecutor): cls_engine (CLSEngine): The CLS engine """ super().__init__() - logger.info( + logger.debug( "Create PaddleCLSConnectionHandler to process the cls request") self._inputs = OrderedDict() @@ -183,7 +184,7 @@ class PaddleCLSConnectionHandler(CLSServerExecutor): self.infer() infer_time = time.time() - st - logger.info("inference time: {}".format(infer_time)) + logger.debug("inference time: {}".format(infer_time)) logger.info("cls engine type: inference") def postprocess(self, topk: int): diff --git a/paddlespeech/server/engine/cls/python/cls_engine.py b/paddlespeech/server/engine/cls/python/cls_engine.py index f8d8f20ef215da47c823e2bfef056b2c4ec4bb6d..210f4cbbb81b98b6a5a73a7b9bac155a188b6688 100644 --- a/paddlespeech/server/engine/cls/python/cls_engine.py +++ b/paddlespeech/server/engine/cls/python/cls_engine.py @@ -88,7 +88,7 @@ class PaddleCLSConnectionHandler(CLSServerExecutor): cls_engine (CLSEngine): The CLS engine """ super().__init__() - logger.info( + logger.debug( "Create PaddleCLSConnectionHandler to process the cls request") self._inputs = OrderedDict() @@ -110,7 +110,7 @@ class PaddleCLSConnectionHandler(CLSServerExecutor): self.infer() infer_time = time.time() - st - logger.info("inference time: {}".format(infer_time)) + logger.debug("inference time: {}".format(infer_time)) logger.info("cls engine type: python") def postprocess(self, topk: int): diff --git a/paddlespeech/server/engine/engine_factory.py b/paddlespeech/server/engine/engine_factory.py index 6a66a002e4a4986e9000f9d841225e1be0cbfe82..c4f3f980337847df951e0e27d7d68690e75397d7 100644 --- a/paddlespeech/server/engine/engine_factory.py +++ b/paddlespeech/server/engine/engine_factory.py @@ -13,7 +13,7 @@ # limitations under the License. 
from typing import Text -from ..utils.log import logger +from paddlespeech.cli.log import logger __all__ = ['EngineFactory'] diff --git a/paddlespeech/server/engine/engine_warmup.py b/paddlespeech/server/engine/engine_warmup.py index 5f548f71dbe6c673564350a197b314a55710989f..12c760c6f61ccb8c67a06c79b42b75a6d108cbeb 100644 --- a/paddlespeech/server/engine/engine_warmup.py +++ b/paddlespeech/server/engine/engine_warmup.py @@ -45,7 +45,7 @@ def warm_up(engine_and_type: str, warm_up_time: int=3) -> bool: logger.error("Please check tte engine type.") try: - logger.info("Start to warm up tts engine.") + logger.debug("Start to warm up tts engine.") for i in range(warm_up_time): connection_handler = PaddleTTSConnectionHandler(tts_engine) if flag_online: @@ -53,7 +53,7 @@ def warm_up(engine_and_type: str, warm_up_time: int=3) -> bool: text=sentence, lang=tts_engine.lang, am=tts_engine.config.am): - logger.info( + logger.debug( f"The first response time of the {i} warm up: {connection_handler.first_response_time} s" ) break @@ -62,7 +62,7 @@ def warm_up(engine_and_type: str, warm_up_time: int=3) -> bool: st = time.time() connection_handler.infer(text=sentence) et = time.time() - logger.info( + logger.debug( f"The response time of the {i} warm up: {et - st} s") except Exception as e: logger.error("Failed to warm up on tts engine.") diff --git a/paddlespeech/server/engine/text/python/text_engine.py b/paddlespeech/server/engine/text/python/text_engine.py index 73cf8737beeecbffa5e3ce97eb4b010a1345d719..6167e7784993bbafac08d8fbbc89aca66960403a 100644 --- a/paddlespeech/server/engine/text/python/text_engine.py +++ b/paddlespeech/server/engine/text/python/text_engine.py @@ -28,7 +28,7 @@ class PaddleTextConnectionHandler: text_engine (TextEngine): The Text engine """ super().__init__() - logger.info( + logger.debug( "Create PaddleTextConnectionHandler to process the text request") self.text_engine = text_engine self.task = self.text_engine.executor.task @@ -130,7 +130,7 @@ class 
TextEngine(BaseEngine): """The Text Engine """ super(TextEngine, self).__init__() - logger.info("Create the TextEngine Instance") + logger.debug("Create the TextEngine Instance") def init(self, config: dict): """Init the Text Engine @@ -141,7 +141,7 @@ class TextEngine(BaseEngine): Returns: bool: The engine instance flag """ - logger.info("Init the text engine") + logger.debug("Init the text engine") try: self.config = config if self.config.device: @@ -150,7 +150,7 @@ class TextEngine(BaseEngine): self.device = paddle.get_device() paddle.set_device(self.device) - logger.info(f"Text Engine set the device: {self.device}") + logger.debug(f"Text Engine set the device: {self.device}") except BaseException as e: logger.error( "Set device failed, please check if device is already used and the parameter 'device' in the yaml file" @@ -168,5 +168,6 @@ class TextEngine(BaseEngine): ckpt_path=config.ckpt_path, vocab_file=config.vocab_file) - logger.info("Init the text engine successfully") + logger.info("Initialize Text server engine successfully on device: %s." 
+ % (self.device)) return True diff --git a/paddlespeech/server/engine/tts/online/onnx/tts_engine.py b/paddlespeech/server/engine/tts/online/onnx/tts_engine.py index f64287af9e1b06be4769a02e0900be2d005599ff..7b8e04e8b73f51a2578f0e730264364b0b132181 100644 --- a/paddlespeech/server/engine/tts/online/onnx/tts_engine.py +++ b/paddlespeech/server/engine/tts/online/onnx/tts_engine.py @@ -62,7 +62,7 @@ class TTSServerExecutor(TTSExecutor): (hasattr(self, 'am_encoder_infer_sess') and hasattr(self, 'am_decoder_sess') and hasattr( self, 'am_postnet_sess'))) and hasattr(self, 'voc_inference'): - logger.info('Models had been initialized.') + logger.debug('Models had been initialized.') return # am am_tag = am + '-' + lang @@ -85,8 +85,7 @@ class TTSServerExecutor(TTSExecutor): else: self.am_ckpt = os.path.abspath(am_ckpt[0]) self.phones_dict = os.path.abspath(phones_dict) - self.am_res_path = os.path.dirname( - os.path.abspath(am_ckpt)) + self.am_res_path = os.path.dirname(os.path.abspath(am_ckpt)) # create am sess self.am_sess = get_sess(self.am_ckpt, am_sess_conf) @@ -119,8 +118,7 @@ class TTSServerExecutor(TTSExecutor): self.am_postnet = os.path.abspath(am_ckpt[2]) self.phones_dict = os.path.abspath(phones_dict) self.am_stat = os.path.abspath(am_stat) - self.am_res_path = os.path.dirname( - os.path.abspath(am_ckpt[0])) + self.am_res_path = os.path.dirname(os.path.abspath(am_ckpt[0])) # create am sess self.am_encoder_infer_sess = get_sess(self.am_encoder_infer, @@ -130,13 +128,13 @@ class TTSServerExecutor(TTSExecutor): self.am_mu, self.am_std = np.load(self.am_stat) - logger.info(f"self.phones_dict: {self.phones_dict}") - logger.info(f"am model dir: {self.am_res_path}") - logger.info("Create am sess successfully.") + logger.debug(f"self.phones_dict: {self.phones_dict}") + logger.debug(f"am model dir: {self.am_res_path}") + logger.debug("Create am sess successfully.") # voc model info voc_tag = voc + '-' + lang - + if voc_ckpt is None: self.task_resource.set_task_model( 
model_tag=voc_tag, @@ -149,16 +147,16 @@ class TTSServerExecutor(TTSExecutor): else: self.voc_ckpt = os.path.abspath(voc_ckpt) self.voc_res_path = os.path.dirname(os.path.abspath(self.voc_ckpt)) - logger.info(self.voc_res_path) + logger.debug(self.voc_res_path) # create voc sess self.voc_sess = get_sess(self.voc_ckpt, voc_sess_conf) - logger.info("Create voc sess successfully.") + logger.debug("Create voc sess successfully.") with open(self.phones_dict, "r") as f: phn_id = [line.strip().split() for line in f.readlines()] self.vocab_size = len(phn_id) - logger.info(f"vocab_size: {self.vocab_size}") + logger.debug(f"vocab_size: {self.vocab_size}") # frontend self.tones_dict = None @@ -169,7 +167,7 @@ class TTSServerExecutor(TTSExecutor): elif lang == 'en': self.frontend = English(phone_vocab_path=self.phones_dict) - logger.info("frontend done!") + logger.debug("frontend done!") class TTSEngine(BaseEngine): @@ -267,7 +265,7 @@ class PaddleTTSConnectionHandler: tts_engine (TTSEngine): The TTS engine """ super().__init__() - logger.info( + logger.debug( "Create PaddleTTSConnectionHandler to process the tts request") self.tts_engine = tts_engine diff --git a/paddlespeech/server/engine/tts/online/python/tts_engine.py b/paddlespeech/server/engine/tts/online/python/tts_engine.py index 2e8997e0feddd4e308f7e501f31a2233d13da1b1..9bd95849f58952cdd0e1f292b07db137fd475963 100644 --- a/paddlespeech/server/engine/tts/online/python/tts_engine.py +++ b/paddlespeech/server/engine/tts/online/python/tts_engine.py @@ -102,16 +102,22 @@ class TTSServerExecutor(TTSExecutor): Init model and other resources from a specific path. 
""" if hasattr(self, 'am_inference') and hasattr(self, 'voc_inference'): - logger.info('Models had been initialized.') + logger.debug('Models had been initialized.') return # am model info + if am_ckpt is None or am_config is None or am_stat is None or phones_dict is None: + use_pretrained_am = True + else: + use_pretrained_am = False + am_tag = am + '-' + lang self.task_resource.set_task_model( model_tag=am_tag, model_type=0, # am + skip_download=not use_pretrained_am, version=None, # default version ) - if am_ckpt is None or am_config is None or am_stat is None or phones_dict is None: + if use_pretrained_am: self.am_res_path = self.task_resource.res_dir self.am_config = os.path.join(self.am_res_path, self.task_resource.res_dict['config']) @@ -122,29 +128,33 @@ class TTSServerExecutor(TTSExecutor): # must have phones_dict in acoustic self.phones_dict = os.path.join( self.am_res_path, self.task_resource.res_dict['phones_dict']) - print("self.phones_dict:", self.phones_dict) - logger.info(self.am_res_path) - logger.info(self.am_config) - logger.info(self.am_ckpt) + logger.debug(self.am_res_path) + logger.debug(self.am_config) + logger.debug(self.am_ckpt) else: self.am_config = os.path.abspath(am_config) self.am_ckpt = os.path.abspath(am_ckpt) self.am_stat = os.path.abspath(am_stat) self.phones_dict = os.path.abspath(phones_dict) self.am_res_path = os.path.dirname(os.path.abspath(self.am_config)) - print("self.phones_dict:", self.phones_dict) self.tones_dict = None self.speaker_dict = None # voc model info + if voc_ckpt is None or voc_config is None or voc_stat is None: + use_pretrained_voc = True + else: + use_pretrained_voc = False + voc_tag = voc + '-' + lang self.task_resource.set_task_model( model_tag=voc_tag, model_type=1, # vocoder + skip_download=not use_pretrained_voc, version=None, # default version ) - if voc_ckpt is None or voc_config is None or voc_stat is None: + if use_pretrained_voc: self.voc_res_path = self.task_resource.voc_res_dir self.voc_config = 
os.path.join( self.voc_res_path, self.task_resource.voc_res_dict['config']) @@ -153,9 +163,9 @@ class TTSServerExecutor(TTSExecutor): self.voc_stat = os.path.join( self.voc_res_path, self.task_resource.voc_res_dict['speech_stats']) - logger.info(self.voc_res_path) - logger.info(self.voc_config) - logger.info(self.voc_ckpt) + logger.debug(self.voc_res_path) + logger.debug(self.voc_config) + logger.debug(self.voc_ckpt) else: self.voc_config = os.path.abspath(voc_config) self.voc_ckpt = os.path.abspath(voc_ckpt) @@ -172,7 +182,6 @@ class TTSServerExecutor(TTSExecutor): with open(self.phones_dict, "r") as f: phn_id = [line.strip().split() for line in f.readlines()] self.vocab_size = len(phn_id) - print("vocab_size:", self.vocab_size) # frontend if lang == 'zh': @@ -182,7 +191,6 @@ class TTSServerExecutor(TTSExecutor): elif lang == 'en': self.frontend = English(phone_vocab_path=self.phones_dict) - print("frontend done!") # am infer info self.am_name = am[:am.rindex('_')] @@ -197,7 +205,6 @@ class TTSServerExecutor(TTSExecutor): self.am_name + '_inference') self.am_inference = am_inference_class(am_normalizer, am) self.am_inference.eval() - print("acoustic model done!") # voc infer info self.voc_name = voc[:voc.rindex('_')] @@ -208,7 +215,6 @@ class TTSServerExecutor(TTSExecutor): '_inference') self.voc_inference = voc_inference_class(voc_normalizer, voc) self.voc_inference.eval() - print("voc done!") class TTSEngine(BaseEngine): @@ -297,7 +303,7 @@ class PaddleTTSConnectionHandler: tts_engine (TTSEngine): The TTS engine """ super().__init__() - logger.info( + logger.debug( "Create PaddleTTSConnectionHandler to process the tts request") self.tts_engine = tts_engine @@ -357,7 +363,7 @@ class PaddleTTSConnectionHandler: text, merge_sentences=merge_sentences) phone_ids = input_ids["phone_ids"] else: - print("lang should in {'zh', 'en'}!") + logger.error("lang should in {'zh', 'en'}!") frontend_et = time.time() self.frontend_time = frontend_et - frontend_st diff --git 
a/paddlespeech/server/engine/tts/paddleinference/tts_engine.py b/paddlespeech/server/engine/tts/paddleinference/tts_engine.py index ab5b721ff0041c803c1b07fc4256a85040330909..43b0df407b74a50eefa9075ff6f4ef3c36f3f59a 100644 --- a/paddlespeech/server/engine/tts/paddleinference/tts_engine.py +++ b/paddlespeech/server/engine/tts/paddleinference/tts_engine.py @@ -65,16 +65,22 @@ class TTSServerExecutor(TTSExecutor): Init model and other resources from a specific path. """ if hasattr(self, 'am_predictor') and hasattr(self, 'voc_predictor'): - logger.info('Models had been initialized.') + logger.debug('Models had been initialized.') return # am + if am_model is None or am_params is None or phones_dict is None: + use_pretrained_am = True + else: + use_pretrained_am = False + am_tag = am + '-' + lang self.task_resource.set_task_model( model_tag=am_tag, model_type=0, # am + skip_download=not use_pretrained_am, version=None, # default version ) - if am_model is None or am_params is None or phones_dict is None: + if use_pretrained_am: self.am_res_path = self.task_resource.res_dir self.am_model = os.path.join(self.am_res_path, self.task_resource.res_dict['model']) @@ -85,16 +91,16 @@ class TTSServerExecutor(TTSExecutor): self.am_res_path, self.task_resource.res_dict['phones_dict']) self.am_sample_rate = self.task_resource.res_dict['sample_rate'] - logger.info(self.am_res_path) - logger.info(self.am_model) - logger.info(self.am_params) + logger.debug(self.am_res_path) + logger.debug(self.am_model) + logger.debug(self.am_params) else: self.am_model = os.path.abspath(am_model) self.am_params = os.path.abspath(am_params) self.phones_dict = os.path.abspath(phones_dict) self.am_sample_rate = am_sample_rate self.am_res_path = os.path.dirname(os.path.abspath(self.am_model)) - logger.info("self.phones_dict: {}".format(self.phones_dict)) + logger.debug("self.phones_dict: {}".format(self.phones_dict)) # for speedyspeech self.tones_dict = None @@ -113,13 +119,19 @@ class 
TTSServerExecutor(TTSExecutor): self.speaker_dict = speaker_dict # voc + if voc_model is None or voc_params is None: + use_pretrained_voc = True + else: + use_pretrained_voc = False + voc_tag = voc + '-' + lang self.task_resource.set_task_model( model_tag=voc_tag, model_type=1, # vocoder + skip_download=not use_pretrained_voc, version=None, # default version ) - if voc_model is None or voc_params is None: + if use_pretrained_voc: self.voc_res_path = self.task_resource.voc_res_dir self.voc_model = os.path.join( self.voc_res_path, self.task_resource.voc_res_dict['model']) @@ -127,9 +139,9 @@ class TTSServerExecutor(TTSExecutor): self.voc_res_path, self.task_resource.voc_res_dict['params']) self.voc_sample_rate = self.task_resource.voc_res_dict[ 'sample_rate'] - logger.info(self.voc_res_path) - logger.info(self.voc_model) - logger.info(self.voc_params) + logger.debug(self.voc_res_path) + logger.debug(self.voc_model) + logger.debug(self.voc_params) else: self.voc_model = os.path.abspath(voc_model) self.voc_params = os.path.abspath(voc_params) @@ -144,21 +156,21 @@ class TTSServerExecutor(TTSExecutor): with open(self.phones_dict, "r") as f: phn_id = [line.strip().split() for line in f.readlines()] vocab_size = len(phn_id) - logger.info("vocab_size: {}".format(vocab_size)) + logger.debug("vocab_size: {}".format(vocab_size)) tone_size = None if self.tones_dict: with open(self.tones_dict, "r") as f: tone_id = [line.strip().split() for line in f.readlines()] tone_size = len(tone_id) - logger.info("tone_size: {}".format(tone_size)) + logger.debug("tone_size: {}".format(tone_size)) spk_num = None if self.speaker_dict: with open(self.speaker_dict, 'rt') as f: spk_id = [line.strip().split() for line in f.readlines()] spk_num = len(spk_id) - logger.info("spk_num: {}".format(spk_num)) + logger.debug("spk_num: {}".format(spk_num)) # frontend if lang == 'zh': @@ -168,7 +180,7 @@ class TTSServerExecutor(TTSExecutor): elif lang == 'en': self.frontend = 
English(phone_vocab_path=self.phones_dict) - logger.info("frontend done!") + logger.debug("frontend done!") # Create am predictor self.am_predictor_conf = am_predictor_conf @@ -176,7 +188,7 @@ class TTSServerExecutor(TTSExecutor): model_file=self.am_model, params_file=self.am_params, predictor_conf=self.am_predictor_conf) - logger.info("Create AM predictor successfully.") + logger.debug("Create AM predictor successfully.") # Create voc predictor self.voc_predictor_conf = voc_predictor_conf @@ -184,7 +196,7 @@ class TTSServerExecutor(TTSExecutor): model_file=self.voc_model, params_file=self.voc_params, predictor_conf=self.voc_predictor_conf) - logger.info("Create Vocoder predictor successfully.") + logger.debug("Create Vocoder predictor successfully.") @paddle.no_grad() def infer(self, @@ -316,7 +328,8 @@ class TTSEngine(BaseEngine): logger.error(e) return False - logger.info("Initialize TTS server engine successfully.") + logger.info("Initialize TTS server engine successfully on device: %s." % + (self.device)) return True @@ -328,7 +341,7 @@ class PaddleTTSConnectionHandler(TTSServerExecutor): tts_engine (TTSEngine): The TTS engine """ super().__init__() - logger.info( + logger.debug( "Create PaddleTTSConnectionHandler to process the tts request") self.tts_engine = tts_engine @@ -366,23 +379,23 @@ class PaddleTTSConnectionHandler(TTSServerExecutor): if target_fs == 0 or target_fs > original_fs: target_fs = original_fs wav_tar_fs = wav - logger.info( + logger.debug( "The sample rate of synthesized audio is the same as model, which is {}Hz". format(original_fs)) else: wav_tar_fs = librosa.resample( np.squeeze(wav), original_fs, target_fs) - logger.info( + logger.debug( "The sample rate of model is {}Hz and the target sample rate is {}Hz. Converting the sample rate of the synthesized audio successfully.". 
format(original_fs, target_fs)) # transform volume wav_vol = wav_tar_fs * volume - logger.info("Transform the volume of the audio successfully.") + logger.debug("Transform the volume of the audio successfully.") # transform speed try: # windows not support soxbindings wav_speed = change_speed(wav_vol, speed, target_fs) - logger.info("Transform the speed of the audio successfully.") + logger.debug("Transform the speed of the audio successfully.") except ServerBaseException: raise ServerBaseException( ErrorCode.SERVER_INTERNAL_ERR, @@ -399,7 +412,7 @@ class PaddleTTSConnectionHandler(TTSServerExecutor): wavfile.write(buf, target_fs, wav_speed) base64_bytes = base64.b64encode(buf.read()) wav_base64 = base64_bytes.decode('utf-8') - logger.info("Audio to string successfully.") + logger.debug("Audio to string successfully.") # save audio if audio_path is not None: @@ -487,15 +500,15 @@ class PaddleTTSConnectionHandler(TTSServerExecutor): logger.error(e) sys.exit(-1) - logger.info("AM model: {}".format(self.config.am)) - logger.info("Vocoder model: {}".format(self.config.voc)) - logger.info("Language: {}".format(lang)) + logger.debug("AM model: {}".format(self.config.am)) + logger.debug("Vocoder model: {}".format(self.config.voc)) + logger.debug("Language: {}".format(lang)) logger.info("tts engine type: python") logger.info("audio duration: {}".format(duration)) - logger.info("frontend inference time: {}".format(self.frontend_time)) - logger.info("AM inference time: {}".format(self.am_time)) - logger.info("Vocoder inference time: {}".format(self.voc_time)) + logger.debug("frontend inference time: {}".format(self.frontend_time)) + logger.debug("AM inference time: {}".format(self.am_time)) + logger.debug("Vocoder inference time: {}".format(self.voc_time)) logger.info("total inference time: {}".format(infer_time)) logger.info( "postprocess (change speed, volume, target sample rate) time: {}". 
@@ -503,6 +516,6 @@ class PaddleTTSConnectionHandler(TTSServerExecutor): logger.info("total generate audio time: {}".format(infer_time + postprocess_time)) logger.info("RTF: {}".format(rtf)) - logger.info("device: {}".format(self.tts_engine.device)) + logger.debug("device: {}".format(self.tts_engine.device)) return lang, target_sample_rate, duration, wav_base64 diff --git a/paddlespeech/server/engine/tts/python/tts_engine.py b/paddlespeech/server/engine/tts/python/tts_engine.py index b048b01a49f1cf34a1edd4b10d5b85da74e579f4..4d1801006b87699cbbe19660b103ffcb4068c446 100644 --- a/paddlespeech/server/engine/tts/python/tts_engine.py +++ b/paddlespeech/server/engine/tts/python/tts_engine.py @@ -105,7 +105,7 @@ class PaddleTTSConnectionHandler(TTSServerExecutor): tts_engine (TTSEngine): The TTS engine """ super().__init__() - logger.info( + logger.debug( "Create PaddleTTSConnectionHandler to process the tts request") self.tts_engine = tts_engine @@ -143,23 +143,23 @@ class PaddleTTSConnectionHandler(TTSServerExecutor): if target_fs == 0 or target_fs > original_fs: target_fs = original_fs wav_tar_fs = wav - logger.info( + logger.debug( "The sample rate of synthesized audio is the same as model, which is {}Hz". format(original_fs)) else: wav_tar_fs = librosa.resample( np.squeeze(wav), original_fs, target_fs) - logger.info( + logger.debug( "The sample rate of model is {}Hz and the target sample rate is {}Hz. Converting the sample rate of the synthesized audio successfully.". 
format(original_fs, target_fs)) # transform volume wav_vol = wav_tar_fs * volume - logger.info("Transform the volume of the audio successfully.") + logger.debug("Transform the volume of the audio successfully.") # transform speed try: # windows not support soxbindings wav_speed = change_speed(wav_vol, speed, target_fs) - logger.info("Transform the speed of the audio successfully.") + logger.debug("Transform the speed of the audio successfully.") except ServerBaseException: raise ServerBaseException( ErrorCode.SERVER_INTERNAL_ERR, @@ -176,7 +176,7 @@ class PaddleTTSConnectionHandler(TTSServerExecutor): wavfile.write(buf, target_fs, wav_speed) base64_bytes = base64.b64encode(buf.read()) wav_base64 = base64_bytes.decode('utf-8') - logger.info("Audio to string successfully.") + logger.debug("Audio to string successfully.") # save audio if audio_path is not None: @@ -264,15 +264,15 @@ class PaddleTTSConnectionHandler(TTSServerExecutor): logger.error(e) sys.exit(-1) - logger.info("AM model: {}".format(self.config.am)) - logger.info("Vocoder model: {}".format(self.config.voc)) - logger.info("Language: {}".format(lang)) + logger.debug("AM model: {}".format(self.config.am)) + logger.debug("Vocoder model: {}".format(self.config.voc)) + logger.debug("Language: {}".format(lang)) logger.info("tts engine type: python") logger.info("audio duration: {}".format(duration)) - logger.info("frontend inference time: {}".format(self.frontend_time)) - logger.info("AM inference time: {}".format(self.am_time)) - logger.info("Vocoder inference time: {}".format(self.voc_time)) + logger.debug("frontend inference time: {}".format(self.frontend_time)) + logger.debug("AM inference time: {}".format(self.am_time)) + logger.debug("Vocoder inference time: {}".format(self.voc_time)) logger.info("total inference time: {}".format(infer_time)) logger.info( "postprocess (change speed, volume, target sample rate) time: {}". 
@@ -280,6 +280,6 @@ class PaddleTTSConnectionHandler(TTSServerExecutor): logger.info("total generate audio time: {}".format(infer_time + postprocess_time)) logger.info("RTF: {}".format(rtf)) - logger.info("device: {}".format(self.tts_engine.device)) + logger.debug("device: {}".format(self.tts_engine.device)) return lang, target_sample_rate, duration, wav_base64 diff --git a/paddlespeech/server/engine/vector/python/vector_engine.py b/paddlespeech/server/engine/vector/python/vector_engine.py index 3c72f55d4b61328db8ca91b976d4f34071974195..f7d60648d040e0bd3a60883e0a5a3900689b8754 100644 --- a/paddlespeech/server/engine/vector/python/vector_engine.py +++ b/paddlespeech/server/engine/vector/python/vector_engine.py @@ -33,7 +33,7 @@ class PaddleVectorConnectionHandler: vector_engine (VectorEngine): The Vector engine """ super().__init__() - logger.info( + logger.debug( "Create PaddleVectorConnectionHandler to process the vector request") self.vector_engine = vector_engine self.executor = self.vector_engine.executor @@ -54,7 +54,7 @@ class PaddleVectorConnectionHandler: Returns: str: the punctuation text """ - logger.info( + logger.debug( f"start to extract the do vector {self.task} from the http request") if self.task == "spk" and task == "spk": embedding = self.extract_audio_embedding(audio_data) @@ -81,17 +81,17 @@ class PaddleVectorConnectionHandler: Returns: float: the score between enroll and test audio """ - logger.info("start to extract the enroll audio embedding") + logger.debug("start to extract the enroll audio embedding") enroll_emb = self.extract_audio_embedding(enroll_audio) - logger.info("start to extract the test audio embedding") + logger.debug("start to extract the test audio embedding") test_emb = self.extract_audio_embedding(test_audio) - logger.info( + logger.debug( "start to get the score between the enroll and test embedding") score = self.executor.get_embeddings_score(enroll_emb, test_emb) - logger.info(f"get the enroll vs test score: {score}") + 
logger.debug(f"get the enroll vs test score: {score}") return score @paddle.no_grad() @@ -106,11 +106,12 @@ class PaddleVectorConnectionHandler: # because the soundfile will change the io.BytesIO(audio) to the end # thus we should convert the base64 string to io.BytesIO when we need the audio data if not self.executor._check(io.BytesIO(audio), sample_rate): - logger.info("check the audio sample rate occurs error") + logger.debug("check the audio sample rate occurs error") return np.array([0.0]) waveform, sr = load_audio(io.BytesIO(audio)) - logger.info(f"load the audio sample points, shape is: {waveform.shape}") + logger.debug( + f"load the audio sample points, shape is: {waveform.shape}") # stage 2: get the audio feat # Note: Now we only support fbank feature @@ -121,9 +122,9 @@ class PaddleVectorConnectionHandler: n_mels=self.config.n_mels, window_size=self.config.window_size, hop_length=self.config.hop_size) - logger.info(f"extract the audio feats, shape is: {feats.shape}") + logger.debug(f"extract the audio feats, shape is: {feats.shape}") except Exception as e: - logger.info(f"feats occurs exception {e}") + logger.error(f"feats occurs exception {e}") sys.exit(-1) feats = paddle.to_tensor(feats).unsqueeze(0) @@ -159,7 +160,7 @@ class VectorEngine(BaseEngine): """The Vector Engine """ super(VectorEngine, self).__init__() - logger.info("Create the VectorEngine Instance") + logger.debug("Create the VectorEngine Instance") def init(self, config: dict): """Init the Vector Engine @@ -170,7 +171,7 @@ class VectorEngine(BaseEngine): Returns: bool: The engine instance flag """ - logger.info("Init the vector engine") + logger.debug("Init the vector engine") try: self.config = config if self.config.device: @@ -179,7 +180,7 @@ class VectorEngine(BaseEngine): self.device = paddle.get_device() paddle.set_device(self.device) - logger.info(f"Vector Engine set the device: {self.device}") + logger.debug(f"Vector Engine set the device: {self.device}") except BaseException as e: 
logger.error( "Set device failed, please check if device is already used and the parameter 'device' in the yaml file" @@ -196,5 +197,7 @@ class VectorEngine(BaseEngine): ckpt_path=config.ckpt_path, task=config.task) - logger.info("Init the Vector engine successfully") + logger.info( + "Initialize Vector server engine successfully on device: %s." % + (self.device)) return True diff --git a/paddlespeech/server/utils/audio_handler.py b/paddlespeech/server/utils/audio_handler.py index e3d90d4694997e04930fe6d083271ed9852dda9d..d4540781d6e195b3395d47a713b135b88f8d0a47 100644 --- a/paddlespeech/server/utils/audio_handler.py +++ b/paddlespeech/server/utils/audio_handler.py @@ -138,7 +138,7 @@ class ASRWsAudioHandler: Returns: str: the final asr result """ - logging.info("send a message to the server") + logging.debug("send a message to the server") if self.url is None: logger.error("No asr server, please input valid ip and port") @@ -160,7 +160,7 @@ class ASRWsAudioHandler: separators=(',', ': ')) await ws.send(audio_info) msg = await ws.recv() - logger.info("client receive msg={}".format(msg)) + logger.debug("client receive msg={}".format(msg)) # 3. send chunk audio data to engine for chunk_data in self.read_wave(wavfile_path): @@ -170,7 +170,7 @@ class ASRWsAudioHandler: if self.punc_server and len(msg["result"]) > 0: msg["result"] = self.punc_server.run(msg["result"]) - logger.info("client receive msg={}".format(msg)) + logger.debug("client receive msg={}".format(msg)) # 4. 
we must send finished signal to the server audio_info = json.dumps( @@ -310,7 +310,7 @@ class TTSWsHandler: start_request = json.dumps({"task": "tts", "signal": "start"}) await ws.send(start_request) msg = await ws.recv() - logger.info(f"client receive msg={msg}") + logger.debug(f"client receive msg={msg}") msg = json.loads(msg) session = msg["session"] @@ -319,7 +319,7 @@ class TTSWsHandler: request = json.dumps({"text": text_base64}) st = time.time() await ws.send(request) - logging.info("send a message to the server") + logging.debug("send a message to the server") # 4. Process the received response message = await ws.recv() @@ -543,7 +543,6 @@ class VectorHttpHandler: "sample_rate": sample_rate, } - logger.info(self.url) res = requests.post(url=self.url, data=json.dumps(data)) return res.json() diff --git a/paddlespeech/server/utils/audio_process.py b/paddlespeech/server/utils/audio_process.py index 416d77ac41d02794ce8bd5ec3de4f1fd8f5add9a..ae53839794877497c80175bb23bb4ad560dac61f 100644 --- a/paddlespeech/server/utils/audio_process.py +++ b/paddlespeech/server/utils/audio_process.py @@ -169,7 +169,7 @@ def save_audio(bytes_data, audio_path, sample_rate: int=24000) -> bool: sample_rate=sample_rate) os.remove("./tmp.pcm") else: - print("Only supports saved audio format is pcm or wav") + logger.error("Only supports saved audio format is pcm or wav") return False return True diff --git a/paddlespeech/server/utils/log.py b/paddlespeech/server/utils/log.py deleted file mode 100644 index 8644064c73ef407476e7870e65d1149019762723..0000000000000000000000000000000000000000 --- a/paddlespeech/server/utils/log.py +++ /dev/null @@ -1,59 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import functools -import logging - -__all__ = [ - 'logger', -] - - -class Logger(object): - def __init__(self, name: str=None): - name = 'PaddleSpeech' if not name else name - self.logger = logging.getLogger(name) - - log_config = { - 'DEBUG': 10, - 'INFO': 20, - 'TRAIN': 21, - 'EVAL': 22, - 'WARNING': 30, - 'ERROR': 40, - 'CRITICAL': 50, - 'EXCEPTION': 100, - } - for key, level in log_config.items(): - logging.addLevelName(level, key) - if key == 'EXCEPTION': - self.__dict__[key.lower()] = self.logger.exception - else: - self.__dict__[key.lower()] = functools.partial(self.__call__, - level) - - self.format = logging.Formatter( - fmt='[%(asctime)-15s] [%(levelname)8s] - %(message)s') - - self.handler = logging.StreamHandler() - self.handler.setFormatter(self.format) - - self.logger.addHandler(self.handler) - self.logger.setLevel(logging.DEBUG) - self.logger.propagate = False - - def __call__(self, log_level: str, msg: str): - self.logger.log(log_level, msg) - - -logger = Logger() diff --git a/paddlespeech/server/utils/onnx_infer.py b/paddlespeech/server/utils/onnx_infer.py index 1c9d878f83f96d1d7ad44796eabe49ef9160078e..23d83c735a7b1aa0853c37db8e231656712110f2 100644 --- a/paddlespeech/server/utils/onnx_infer.py +++ b/paddlespeech/server/utils/onnx_infer.py @@ -16,11 +16,11 @@ from typing import Optional import onnxruntime as ort -from .log import logger +from paddlespeech.cli.log import logger def get_sess(model_path: Optional[os.PathLike]=None, sess_conf: dict=None): - logger.info(f"ort sessconf: {sess_conf}") + logger.debug(f"ort sessconf: {sess_conf}") 
sess_options = ort.SessionOptions() sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL if sess_conf.get('graph_optimization_level', 99) == 0: @@ -34,7 +34,7 @@ def get_sess(model_path: Optional[os.PathLike]=None, sess_conf: dict=None): # fastspeech2/mb_melgan can't use trt now! if sess_conf.get("use_trt", 0): providers = ['TensorrtExecutionProvider'] - logger.info(f"ort providers: {providers}") + logger.debug(f"ort providers: {providers}") if 'cpu_threads' in sess_conf: sess_options.intra_op_num_threads = sess_conf.get("cpu_threads", 0) diff --git a/paddlespeech/server/utils/util.py b/paddlespeech/server/utils/util.py index 061b213c78360d523d1cc3cc180f93cfaac387ab..826d923ed0f255e5411e3192b6c4d680ef14c933 100644 --- a/paddlespeech/server/utils/util.py +++ b/paddlespeech/server/utils/util.py @@ -13,6 +13,8 @@ import base64 import math +from paddlespeech.cli.log import logger + def wav2base64(wav_file: str): """ @@ -61,7 +63,7 @@ def get_chunks(data, block_size, pad_size, step): elif step == "voc": data_len = data.shape[0] else: - print("Please set correct type to get chunks, am or voc") + logger.error("Please set correct type to get chunks, am or voc") chunks = [] n = math.ceil(data_len / block_size) @@ -73,7 +75,7 @@ def get_chunks(data, block_size, pad_size, step): elif step == "voc": chunks.append(data[start:end, :]) else: - print("Please set correct type to get chunks, am or voc") + logger.error("Please set correct type to get chunks, am or voc") return chunks diff --git a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py index 48595bb25ca2241b74ebe22be6325708564b5699..347a10e90a2b80d09ddee07d14a19db12377c4be 100644 --- a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py +++ b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py @@ -141,71 +141,133 @@ class FastSpeech2(nn.Layer): init_dec_alpha: float=1.0, ): """Initialize FastSpeech2 module. 
Args: - idim (int): Dimension of the inputs. - odim (int): Dimension of the outputs. - adim (int): Attention dimension. - aheads (int): Number of attention heads. - elayers (int): Number of encoder layers. - eunits (int): Number of encoder hidden units. - dlayers (int): Number of decoder layers. - dunits (int): Number of decoder hidden units. - postnet_layers (int): Number of postnet layers. - postnet_chans (int): Number of postnet channels. - postnet_filts (int): Kernel size of postnet. - postnet_dropout_rate (float): Dropout rate in postnet. - use_scaled_pos_enc (bool): Whether to use trainable scaled pos encoding. - use_batch_norm (bool): Whether to use batch normalization in encoder prenet. - encoder_normalize_before (bool): Whether to apply layernorm layer before encoder block. - decoder_normalize_before (bool): Whether to apply layernorm layer before decoder block. - encoder_concat_after (bool): Whether to concatenate attention layer's input and output in encoder. - decoder_concat_after (bool): Whether to concatenate attention layer's input and output in decoder. - reduction_factor (int): Reduction factor. - encoder_type (str): Encoder type ("transformer" or "conformer"). - decoder_type (str): Decoder type ("transformer" or "conformer"). - transformer_enc_dropout_rate (float): Dropout rate in encoder except attention and positional encoding. - transformer_enc_positional_dropout_rate (float): Dropout rate after encoder positional encoding. - transformer_enc_attn_dropout_rate (float): Dropout rate in encoder self-attention module. - transformer_dec_dropout_rate (float): Dropout rate in decoder except attention & positional encoding. - transformer_dec_positional_dropout_rate (float): Dropout rate after decoder positional encoding. - transformer_dec_attn_dropout_rate (float): Dropout rate in decoder self-attention module. - conformer_pos_enc_layer_type (str): Pos encoding layer type in conformer. 
- conformer_self_attn_layer_type (str): Self-attention layer type in conformer - conformer_activation_type (str): Activation function type in conformer. - use_macaron_style_in_conformer (bool): Whether to use macaron style FFN. - use_cnn_in_conformer (bool): Whether to use CNN in conformer. - zero_triu (bool): Whether to use zero triu in relative self-attention module. - conformer_enc_kernel_size (int): Kernel size of encoder conformer. - conformer_dec_kernel_size (int): Kernel size of decoder conformer. - duration_predictor_layers (int): Number of duration predictor layers. - duration_predictor_chans (int): Number of duration predictor channels. - duration_predictor_kernel_size (int): Kernel size of duration predictor. - duration_predictor_dropout_rate (float): Dropout rate in duration predictor. - pitch_predictor_layers (int): Number of pitch predictor layers. - pitch_predictor_chans (int): Number of pitch predictor channels. - pitch_predictor_kernel_size (int): Kernel size of pitch predictor. - pitch_predictor_dropout_rate (float): Dropout rate in pitch predictor. - pitch_embed_kernel_size (float): Kernel size of pitch embedding. - pitch_embed_dropout_rate (float): Dropout rate for pitch embedding. - stop_gradient_from_pitch_predictor (bool): Whether to stop gradient from pitch predictor to encoder. - energy_predictor_layers (int): Number of energy predictor layers. - energy_predictor_chans (int): Number of energy predictor channels. - energy_predictor_kernel_size (int): Kernel size of energy predictor. - energy_predictor_dropout_rate (float): Dropout rate in energy predictor. - energy_embed_kernel_size (float): Kernel size of energy embedding. - energy_embed_dropout_rate (float): Dropout rate for energy embedding. - stop_gradient_from_energy_predictor(bool): Whether to stop gradient from energy predictor to encoder. - spk_num (Optional[int]): Number of speakers. If not None, assume that the spk_embed_dim is not None, + idim (int): + Dimension of the inputs. 
+ odim (int): + Dimension of the outputs. + adim (int): + Attention dimension. + aheads (int): + Number of attention heads. + elayers (int): + Number of encoder layers. + eunits (int): + Number of encoder hidden units. + dlayers (int): + Number of decoder layers. + dunits (int): + Number of decoder hidden units. + postnet_layers (int): + Number of postnet layers. + postnet_chans (int): + Number of postnet channels. + postnet_filts (int): + Kernel size of postnet. + postnet_dropout_rate (float): + Dropout rate in postnet. + use_scaled_pos_enc (bool): + Whether to use trainable scaled pos encoding. + use_batch_norm (bool): + Whether to use batch normalization in encoder prenet. + encoder_normalize_before (bool): + Whether to apply layernorm layer before encoder block. + decoder_normalize_before (bool): + Whether to apply layernorm layer before decoder block. + encoder_concat_after (bool): + Whether to concatenate attention layer's input and output in encoder. + decoder_concat_after (bool): + Whether to concatenate attention layer's input and output in decoder. + reduction_factor (int): + Reduction factor. + encoder_type (str): + Encoder type ("transformer" or "conformer"). + decoder_type (str): + Decoder type ("transformer" or "conformer"). + transformer_enc_dropout_rate (float): + Dropout rate in encoder except attention and positional encoding. + transformer_enc_positional_dropout_rate (float): + Dropout rate after encoder positional encoding. + transformer_enc_attn_dropout_rate (float): + Dropout rate in encoder self-attention module. + transformer_dec_dropout_rate (float): + Dropout rate in decoder except attention & positional encoding. + transformer_dec_positional_dropout_rate (float): + Dropout rate after decoder positional encoding. + transformer_dec_attn_dropout_rate (float): + Dropout rate in decoder self-attention module. + conformer_pos_enc_layer_type (str): + Pos encoding layer type in conformer. 
+ conformer_self_attn_layer_type (str): + Self-attention layer type in conformer + conformer_activation_type (str): + Activation function type in conformer. + use_macaron_style_in_conformer (bool): + Whether to use macaron style FFN. + use_cnn_in_conformer (bool): + Whether to use CNN in conformer. + zero_triu (bool): + Whether to use zero triu in relative self-attention module. + conformer_enc_kernel_size (int): + Kernel size of encoder conformer. + conformer_dec_kernel_size (int): + Kernel size of decoder conformer. + duration_predictor_layers (int): + Number of duration predictor layers. + duration_predictor_chans (int): + Number of duration predictor channels. + duration_predictor_kernel_size (int): + Kernel size of duration predictor. + duration_predictor_dropout_rate (float): + Dropout rate in duration predictor. + pitch_predictor_layers (int): + Number of pitch predictor layers. + pitch_predictor_chans (int): + Number of pitch predictor channels. + pitch_predictor_kernel_size (int): + Kernel size of pitch predictor. + pitch_predictor_dropout_rate (float): + Dropout rate in pitch predictor. + pitch_embed_kernel_size (float): + Kernel size of pitch embedding. + pitch_embed_dropout_rate (float): + Dropout rate for pitch embedding. + stop_gradient_from_pitch_predictor (bool): + Whether to stop gradient from pitch predictor to encoder. + energy_predictor_layers (int): + Number of energy predictor layers. + energy_predictor_chans (int): + Number of energy predictor channels. + energy_predictor_kernel_size (int): + Kernel size of energy predictor. + energy_predictor_dropout_rate (float): + Dropout rate in energy predictor. + energy_embed_kernel_size (float): + Kernel size of energy embedding. + energy_embed_dropout_rate (float): + Dropout rate for energy embedding. + stop_gradient_from_energy_predictor(bool): + Whether to stop gradient from energy predictor to encoder. + spk_num (Optional[int]): + Number of speakers. 
If not None, assume that the spk_embed_dim is not None, spk_ids will be provided as the input and use spk_embedding_table. - spk_embed_dim (Optional[int]): Speaker embedding dimension. If not None, + spk_embed_dim (Optional[int]): + Speaker embedding dimension. If not None, assume that spk_emb will be provided as the input or spk_num is not None. - spk_embed_integration_type (str): How to integrate speaker embedding. - tone_num (Optional[int]): Number of tones. If not None, assume that the + spk_embed_integration_type (str): + How to integrate speaker embedding. + tone_num (Optional[int]): + Number of tones. If not None, assume that the tone_ids will be provided as the input and use tone_embedding_table. - tone_embed_dim (Optional[int]): Tone embedding dimension. If not None, assume that tone_num is not None. - tone_embed_integration_type (str): How to integrate tone embedding. - init_type (str): How to initialize transformer parameters. - init_enc_alpha (float): Initial value of alpha in scaled pos encoding of the encoder. - init_dec_alpha (float): Initial value of alpha in scaled pos encoding of the decoder. + tone_embed_dim (Optional[int]): + Tone embedding dimension. If not None, assume that tone_num is not None. + tone_embed_integration_type (str): + How to integrate tone embedding. + init_type (str): + How to initialize transformer parameters. + init_enc_alpha (float): + Initial value of alpha in scaled pos encoding of the encoder. + init_dec_alpha (float): + Initial value of alpha in scaled pos encoding of the decoder. 
""" assert check_argument_types() @@ -258,7 +320,6 @@ class FastSpeech2(nn.Layer): padding_idx=self.padding_idx) if encoder_type == "transformer": - print("encoder_type is transformer") self.encoder = TransformerEncoder( idim=idim, attention_dim=adim, @@ -275,7 +336,6 @@ class FastSpeech2(nn.Layer): positionwise_layer_type=positionwise_layer_type, positionwise_conv_kernel_size=positionwise_conv_kernel_size, ) elif encoder_type == "conformer": - print("encoder_type is conformer") self.encoder = ConformerEncoder( idim=idim, attention_dim=adim, @@ -362,7 +422,6 @@ class FastSpeech2(nn.Layer): # NOTE: we use encoder as decoder # because fastspeech's decoder is the same as encoder if decoder_type == "transformer": - print("decoder_type is transformer") self.decoder = TransformerEncoder( idim=0, attention_dim=adim, @@ -380,7 +439,6 @@ class FastSpeech2(nn.Layer): positionwise_layer_type=positionwise_layer_type, positionwise_conv_kernel_size=positionwise_conv_kernel_size, ) elif decoder_type == "conformer": - print("decoder_type is conformer") self.decoder = ConformerEncoder( idim=0, attention_dim=adim, @@ -453,20 +511,29 @@ class FastSpeech2(nn.Layer): """Calculate forward propagation. Args: - text(Tensor(int64)): Batch of padded token ids (B, Tmax). - text_lengths(Tensor(int64)): Batch of lengths of each input (B,). - speech(Tensor): Batch of padded target features (B, Lmax, odim). - speech_lengths(Tensor(int64)): Batch of the lengths of each target (B,). - durations(Tensor(int64)): Batch of padded durations (B, Tmax). - pitch(Tensor): Batch of padded token-averaged pitch (B, Tmax, 1). - energy(Tensor): Batch of padded token-averaged energy (B, Tmax, 1). - tone_id(Tensor, optional(int64)): Batch of padded tone ids (B, Tmax). - spk_emb(Tensor, optional): Batch of speaker embeddings (B, spk_embed_dim). - spk_id(Tnesor, optional(int64)): Batch of speaker ids (B,) + text(Tensor(int64)): + Batch of padded token ids (B, Tmax). 
+ text_lengths(Tensor(int64)): + Batch of lengths of each input (B,). + speech(Tensor): + Batch of padded target features (B, Lmax, odim). + speech_lengths(Tensor(int64)): + Batch of the lengths of each target (B,). + durations(Tensor(int64)): + Batch of padded durations (B, Tmax). + pitch(Tensor): + Batch of padded token-averaged pitch (B, Tmax, 1). + energy(Tensor): + Batch of padded token-averaged energy (B, Tmax, 1). + tone_id(Tensor, optional(int64)): + Batch of padded tone ids (B, Tmax). + spk_emb(Tensor, optional): + Batch of speaker embeddings (B, spk_embed_dim). + spk_id(Tensor, optional(int64)): + Batch of speaker ids (B,) Returns: - """ # input of embedding must be int64 @@ -662,20 +729,28 @@ class FastSpeech2(nn.Layer): """Generate the sequence of features given the sequences of characters. Args: - text(Tensor(int64)): Input sequence of characters (T,). - durations(Tensor, optional (int64)): Groundtruth of duration (T,). - pitch(Tensor, optional): Groundtruth of token-averaged pitch (T, 1). - energy(Tensor, optional): Groundtruth of token-averaged energy (T, 1). - alpha(float, optional): Alpha to control the speed. - use_teacher_forcing(bool, optional): Whether to use teacher forcing. + text(Tensor(int64)): + Input sequence of characters (T,). + durations(Tensor, optional (int64)): + Groundtruth of duration (T,). + pitch(Tensor, optional): + Groundtruth of token-averaged pitch (T, 1). + energy(Tensor, optional): + Groundtruth of token-averaged energy (T, 1). + alpha(float, optional): + Alpha to control the speed. + use_teacher_forcing(bool, optional): + Whether to use teacher forcing. If true, groundtruth of duration, pitch and energy will be used. - spk_emb(Tensor, optional, optional): peaker embedding vector (spk_embed_dim,). (Default value = None) - spk_id(Tensor, optional(int64), optional): spk ids (1,). (Default value = None) - tone_id(Tensor, optional(int64), optional): tone ids (T,). 
(Default value = None) + spk_emb(Tensor, optional, optional): + Speaker embedding vector (spk_embed_dim,). (Default value = None) + spk_id(Tensor, optional(int64), optional): + spk ids (1,). (Default value = None) + tone_id(Tensor, optional(int64), optional): + tone ids (T,). (Default value = None) Returns: - """ # input of embedding must be int64 x = paddle.cast(text, 'int64') @@ -724,8 +799,10 @@ class FastSpeech2(nn.Layer): """Integrate speaker embedding with hidden states. Args: - hs(Tensor): Batch of hidden state sequences (B, Tmax, adim). - spk_emb(Tensor): Batch of speaker embeddings (B, spk_embed_dim). + hs(Tensor): + Batch of hidden state sequences (B, Tmax, adim). + spk_emb(Tensor): + Batch of speaker embeddings (B, spk_embed_dim). Returns: @@ -749,8 +826,10 @@ class FastSpeech2(nn.Layer): """Integrate speaker embedding with hidden states. Args: - hs(Tensor): Batch of hidden state sequences (B, Tmax, adim). - tone_embs(Tensor): Batch of speaker embeddings (B, Tmax, tone_embed_dim). + hs(Tensor): + Batch of hidden state sequences (B, Tmax, adim). + tone_embs(Tensor): + Batch of tone embeddings (B, Tmax, tone_embed_dim). Returns: @@ -773,10 +852,12 @@ class FastSpeech2(nn.Layer): """Make masks for self-attention. Args: - ilens(Tensor): Batch of lengths (B,). + ilens(Tensor): + Batch of lengths (B,). Returns: - Tensor: Mask tensor for self-attention. dtype=paddle.bool + Tensor: + Mask tensor for self-attention. dtype=paddle.bool Examples: >>> ilens = [5, 3] @@ -858,19 +939,32 @@ class StyleFastSpeech2Inference(FastSpeech2Inference): """ Args: - text(Tensor(int64)): Input sequence of characters (T,). - durations(paddle.Tensor/np.ndarray, optional (int64)): Groundtruth of duration (T,), this will overwrite the set of durations_scale and durations_bias + text(Tensor(int64)): + Input sequence of characters (T,). 
+ durations(paddle.Tensor/np.ndarray, optional (int64)): + Groundtruth of duration (T,), this will overwrite the set of durations_scale and durations_bias durations_scale(int/float, optional): + durations_bias(int/float, optional): - pitch(paddle.Tensor/np.ndarray, optional): Groundtruth of token-averaged pitch (T, 1), this will overwrite the set of pitch_scale and pitch_bias - pitch_scale(int/float, optional): In denormed HZ domain. - pitch_bias(int/float, optional): In denormed HZ domain. - energy(paddle.Tensor/np.ndarray, optional): Groundtruth of token-averaged energy (T, 1), this will overwrite the set of energy_scale and energy_bias - energy_scale(int/float, optional): In denormed domain. - energy_bias(int/float, optional): In denormed domain. - robot: bool: (Default value = False) - spk_emb: (Default value = None) - spk_id: (Default value = None) + + pitch(paddle.Tensor/np.ndarray, optional): + Groundtruth of token-averaged pitch (T, 1), this will overwrite the set of pitch_scale and pitch_bias + pitch_scale(int/float, optional): + In denormed HZ domain. + pitch_bias(int/float, optional): + In denormed HZ domain. + energy(paddle.Tensor/np.ndarray, optional): + Groundtruth of token-averaged energy (T, 1), this will overwrite the set of energy_scale and energy_bias + energy_scale(int/float, optional): + In denormed domain. + energy_bias(int/float, optional): + In denormed domain. + robot(bool) (Default value = False): + + spk_emb(Default value = None): + + spk_id(Default value = None): + Returns: Tensor: logmel @@ -949,8 +1043,10 @@ class FastSpeech2Loss(nn.Layer): use_weighted_masking: bool=False): """Initialize feed-forward Transformer loss module. Args: - use_masking (bool): Whether to apply masking for padded part in loss calculation. - use_weighted_masking (bool): Whether to weighted masking in loss calculation. + use_masking (bool): + Whether to apply masking for padded part in loss calculation. 
+ use_weighted_masking (bool): + Whether to weighted masking in loss calculation. """ assert check_argument_types() super().__init__() @@ -982,17 +1078,28 @@ class FastSpeech2Loss(nn.Layer): """Calculate forward propagation. Args: - after_outs(Tensor): Batch of outputs after postnets (B, Lmax, odim). - before_outs(Tensor): Batch of outputs before postnets (B, Lmax, odim). - d_outs(Tensor): Batch of outputs of duration predictor (B, Tmax). - p_outs(Tensor): Batch of outputs of pitch predictor (B, Tmax, 1). - e_outs(Tensor): Batch of outputs of energy predictor (B, Tmax, 1). - ys(Tensor): Batch of target features (B, Lmax, odim). - ds(Tensor): Batch of durations (B, Tmax). - ps(Tensor): Batch of target token-averaged pitch (B, Tmax, 1). - es(Tensor): Batch of target token-averaged energy (B, Tmax, 1). - ilens(Tensor): Batch of the lengths of each input (B,). - olens(Tensor): Batch of the lengths of each target (B,). + after_outs(Tensor): + Batch of outputs after postnets (B, Lmax, odim). + before_outs(Tensor): + Batch of outputs before postnets (B, Lmax, odim). + d_outs(Tensor): + Batch of outputs of duration predictor (B, Tmax). + p_outs(Tensor): + Batch of outputs of pitch predictor (B, Tmax, 1). + e_outs(Tensor): + Batch of outputs of energy predictor (B, Tmax, 1). + ys(Tensor): + Batch of target features (B, Lmax, odim). + ds(Tensor): + Batch of durations (B, Tmax). + ps(Tensor): + Batch of target token-averaged pitch (B, Tmax, 1). + es(Tensor): + Batch of target token-averaged energy (B, Tmax, 1). + ilens(Tensor): + Batch of the lengths of each input (B,). + olens(Tensor): + Batch of the lengths of each target (B,). 
Returns: diff --git a/paddlespeech/t2s/models/hifigan/hifigan.py b/paddlespeech/t2s/models/hifigan/hifigan.py index bea9dd9a3e6232fc014e7edec20cd07d2e299db3..7a01840e278ee93370716351fe7055c56d67ad3f 100644 --- a/paddlespeech/t2s/models/hifigan/hifigan.py +++ b/paddlespeech/t2s/models/hifigan/hifigan.py @@ -50,20 +50,34 @@ class HiFiGANGenerator(nn.Layer): init_type: str="xavier_uniform", ): """Initialize HiFiGANGenerator module. Args: - in_channels (int): Number of input channels. - out_channels (int): Number of output channels. - channels (int): Number of hidden representation channels. - global_channels (int): Number of global conditioning channels. - kernel_size (int): Kernel size of initial and final conv layer. - upsample_scales (list): List of upsampling scales. - upsample_kernel_sizes (list): List of kernel sizes for upsampling layers. - resblock_kernel_sizes (list): List of kernel sizes for residual blocks. - resblock_dilations (list): List of dilation list for residual blocks. - use_additional_convs (bool): Whether to use additional conv layers in residual blocks. - bias (bool): Whether to add bias parameter in convolution layers. - nonlinear_activation (str): Activation function module name. - nonlinear_activation_params (dict): Hyperparameters for activation function. - use_weight_norm (bool): Whether to use weight norm. + in_channels (int): + Number of input channels. + out_channels (int): + Number of output channels. + channels (int): + Number of hidden representation channels. + global_channels (int): + Number of global conditioning channels. + kernel_size (int): + Kernel size of initial and final conv layer. + upsample_scales (list): + List of upsampling scales. + upsample_kernel_sizes (list): + List of kernel sizes for upsampling layers. + resblock_kernel_sizes (list): + List of kernel sizes for residual blocks. + resblock_dilations (list): + List of dilation list for residual blocks. 
+ use_additional_convs (bool): + Whether to use additional conv layers in residual blocks. + bias (bool): + Whether to add bias parameter in convolution layers. + nonlinear_activation (str): + Activation function module name. + nonlinear_activation_params (dict): + Hyperparameters for activation function. + use_weight_norm (bool): + Whether to use weight norm. If set to true, it will be applied to all of the conv layers. """ super().__init__() @@ -199,9 +213,10 @@ class HiFiGANGenerator(nn.Layer): def inference(self, c, g: Optional[paddle.Tensor]=None): """Perform inference. Args: - c (Tensor): Input tensor (T, in_channels). - normalize_before (bool): Whether to perform normalization. - g (Optional[Tensor]): Global conditioning tensor (global_channels, 1). + c (Tensor): + Input tensor (T, in_channels). + g (Optional[Tensor]): + Global conditioning tensor (global_channels, 1). Returns: Tensor: Output tensor (T ** prod(upsample_scales), out_channels). @@ -233,20 +248,33 @@ class HiFiGANPeriodDiscriminator(nn.Layer): """Initialize HiFiGANPeriodDiscriminator module. Args: - in_channels (int): Number of input channels. - out_channels (int): Number of output channels. - period (int): Period. - kernel_sizes (list): Kernel sizes of initial conv layers and the final conv layer. - channels (int): Number of initial channels. - downsample_scales (list): List of downsampling scales. - max_downsample_channels (int): Number of maximum downsampling channels. - use_additional_convs (bool): Whether to use additional conv layers in residual blocks. - bias (bool): Whether to add bias parameter in convolution layers. - nonlinear_activation (str): Activation function module name. - nonlinear_activation_params (dict): Hyperparameters for activation function. - use_weight_norm (bool): Whether to use weight norm. + in_channels (int): + Number of input channels. + out_channels (int): + Number of output channels. + period (int): + Period. 
+ kernel_sizes (list): + Kernel sizes of initial conv layers and the final conv layer. + channels (int): + Number of initial channels. + downsample_scales (list): + List of downsampling scales. + max_downsample_channels (int): + Number of maximum downsampling channels. + use_additional_convs (bool): + Whether to use additional conv layers in residual blocks. + bias (bool): + Whether to add bias parameter in convolution layers. + nonlinear_activation (str): + Activation function module name. + nonlinear_activation_params (dict): + Hyperparameters for activation function. + use_weight_norm (bool): + Whether to use weight norm. If set to true, it will be applied to all of the conv layers. - use_spectral_norm (bool): Whether to use spectral norm. + use_spectral_norm (bool): + Whether to use spectral norm. If set to true, it will be applied to all of the conv layers. """ super().__init__() @@ -298,7 +326,8 @@ class HiFiGANPeriodDiscriminator(nn.Layer): """Calculate forward propagation. Args: - c (Tensor): Input tensor (B, in_channels, T). + c (Tensor): + Input tensor (B, in_channels, T). Returns: list: List of each layer's tensors. """ @@ -367,8 +396,10 @@ class HiFiGANMultiPeriodDiscriminator(nn.Layer): """Initialize HiFiGANMultiPeriodDiscriminator module. Args: - periods (list): List of periods. - discriminator_params (dict): Parameters for hifi-gan period discriminator module. + periods (list): + List of periods. + discriminator_params (dict): + Parameters for hifi-gan period discriminator module. The period parameter will be overwritten. """ super().__init__() @@ -385,7 +416,8 @@ class HiFiGANMultiPeriodDiscriminator(nn.Layer): """Calculate forward propagation. Args: - x (Tensor): Input noise signal (B, 1, T). + x (Tensor): + Input noise signal (B, 1, T). Returns: List: List of list of each discriminator outputs, which consists of each layer output tensors. 
""" @@ -417,16 +449,25 @@ class HiFiGANScaleDiscriminator(nn.Layer): """Initilize HiFiGAN scale discriminator module. Args: - in_channels (int): Number of input channels. - out_channels (int): Number of output channels. - kernel_sizes (list): List of four kernel sizes. The first will be used for the first conv layer, + in_channels (int): + Number of input channels. + out_channels (int): + Number of output channels. + kernel_sizes (list): + List of four kernel sizes. The first will be used for the first conv layer, and the second is for downsampling part, and the remaining two are for output layers. - channels (int): Initial number of channels for conv layer. - max_downsample_channels (int): Maximum number of channels for downsampling layers. - bias (bool): Whether to add bias parameter in convolution layers. - downsample_scales (list): List of downsampling scales. - nonlinear_activation (str): Activation function module name. - nonlinear_activation_params (dict): Hyperparameters for activation function. + channels (int): + Initial number of channels for conv layer. + max_downsample_channels (int): + Maximum number of channels for downsampling layers. + bias (bool): + Whether to add bias parameter in convolution layers. + downsample_scales (list): + List of downsampling scales. + nonlinear_activation (str): + Activation function module name. + nonlinear_activation_params (dict): + Hyperparameters for activation function. use_weight_norm (bool): Whether to use weight norm. If set to true, it will be applied to all of the conv layers. use_spectral_norm (bool): Whether to use spectral norm. @@ -614,7 +655,8 @@ class HiFiGANMultiScaleDiscriminator(nn.Layer): """Calculate forward propagation. Args: - x (Tensor): Input noise signal (B, 1, T). + x (Tensor): + Input noise signal (B, 1, T). Returns: List: List of list of each discriminator outputs, which consists of each layer output tensors. 
""" @@ -675,14 +717,21 @@ class HiFiGANMultiScaleMultiPeriodDiscriminator(nn.Layer): """Initilize HiFiGAN multi-scale + multi-period discriminator module. Args: - scales (int): Number of multi-scales. - scale_downsample_pooling (str): Pooling module name for downsampling of the inputs. - scale_downsample_pooling_params (dict): Parameters for the above pooling module. - scale_discriminator_params (dict): Parameters for hifi-gan scale discriminator module. - follow_official_norm (bool): Whether to follow the norm setting of the official implementaion. + scales (int): + Number of multi-scales. + scale_downsample_pooling (str): + Pooling module name for downsampling of the inputs. + scale_downsample_pooling_params (dict): + Parameters for the above pooling module. + scale_discriminator_params (dict): + Parameters for hifi-gan scale discriminator module. + follow_official_norm (bool): + Whether to follow the norm setting of the official implementaion. The first discriminator uses spectral norm and the other discriminators use weight norm. - periods (list): List of periods. - period_discriminator_params (dict): Parameters for hifi-gan period discriminator module. + periods (list): + List of periods. + period_discriminator_params (dict): + Parameters for hifi-gan period discriminator module. The period parameter will be overwritten. """ super().__init__() @@ -704,7 +753,8 @@ class HiFiGANMultiScaleMultiPeriodDiscriminator(nn.Layer): """Calculate forward propagation. Args: - x (Tensor): Input noise signal (B, 1, T). + x (Tensor): + Input noise signal (B, 1, T). 
Returns: List: List of list of each discriminator outputs, diff --git a/paddlespeech/t2s/models/melgan/melgan.py b/paddlespeech/t2s/models/melgan/melgan.py index 22d8fd9e764c5c7f3c71ca1e2d17acc641a029cd..058cf40d9c25199bc7da9bdbdca8ca9c2c386673 100644 --- a/paddlespeech/t2s/models/melgan/melgan.py +++ b/paddlespeech/t2s/models/melgan/melgan.py @@ -53,24 +53,38 @@ class MelGANGenerator(nn.Layer): """Initialize MelGANGenerator module. Args: - in_channels (int): Number of input channels. - out_channels (int): Number of output channels, + in_channels (int): + Number of input channels. + out_channels (int): + Number of output channels, the number of sub-band is out_channels in multi-band melgan. - kernel_size (int): Kernel size of initial and final conv layer. - channels (int): Initial number of channels for conv layer. - bias (bool): Whether to add bias parameter in convolution layers. - upsample_scales (List[int]): List of upsampling scales. - stack_kernel_size (int): Kernel size of dilated conv layers in residual stack. - stacks (int): Number of stacks in a single residual stack. - nonlinear_activation (Optional[str], optional): Non linear activation in upsample network, by default None - nonlinear_activation_params (Dict[str, Any], optional): Parameters passed to the linear activation in the upsample network, - by default {} - pad (str): Padding function module name before dilated convolution layer. - pad_params (dict): Hyperparameters for padding function. - use_final_nonlinear_activation (nn.Layer): Activation function for the final layer. - use_weight_norm (bool): Whether to use weight norm. + kernel_size (int): + Kernel size of initial and final conv layer. + channels (int): + Initial number of channels for conv layer. + bias (bool): + Whether to add bias parameter in convolution layers. + upsample_scales (List[int]): + List of upsampling scales. + stack_kernel_size (int): + Kernel size of dilated conv layers in residual stack. 
+ stacks (int): + Number of stacks in a single residual stack. + nonlinear_activation (Optional[str], optional): + Non linear activation in upsample network, by default None + nonlinear_activation_params (Dict[str, Any], optional): + Parameters passed to the linear activation in the upsample network, by default {} + pad (str): + Padding function module name before dilated convolution layer. + pad_params (dict): + Hyperparameters for padding function. + use_final_nonlinear_activation (nn.Layer): + Activation function for the final layer. + use_weight_norm (bool): + Whether to use weight norm. If set to true, it will be applied to all of the conv layers. - use_causal_conv (bool): Whether to use causal convolution. + use_causal_conv (bool): + Whether to use causal convolution. """ super().__init__() @@ -194,7 +208,8 @@ class MelGANGenerator(nn.Layer): """Calculate forward propagation. Args: - c (Tensor): Input tensor (B, in_channels, T). + c (Tensor): + Input tensor (B, in_channels, T). Returns: Tensor: Output tensor (B, out_channels, T ** prod(upsample_scales)). """ @@ -244,7 +259,8 @@ class MelGANGenerator(nn.Layer): """Perform inference. Args: - c (Union[Tensor, ndarray]): Input tensor (T, in_channels). + c (Union[Tensor, ndarray]): + Input tensor (T, in_channels). Returns: Tensor: Output tensor (out_channels*T ** prod(upsample_scales), 1). """ @@ -279,20 +295,30 @@ class MelGANDiscriminator(nn.Layer): """Initilize MelGAN discriminator module. Args: - in_channels (int): Number of input channels. - out_channels (int): Number of output channels. + in_channels (int): + Number of input channels. + out_channels (int): + Number of output channels. kernel_sizes (List[int]): List of two kernel sizes. The prod will be used for the first conv layer, and the first and the second kernel sizes will be used for the last two layers. For example if kernel_sizes = [5, 3], the first layer kernel size will be 5 * 3 = 15, the last two layers' kernel size will be 5 and 3, respectively. 
- channels (int): Initial number of channels for conv layer. - max_downsample_channels (int): Maximum number of channels for downsampling layers. - bias (bool): Whether to add bias parameter in convolution layers. - downsample_scales (List[int]): List of downsampling scales. - nonlinear_activation (str): Activation function module name. - nonlinear_activation_params (dict): Hyperparameters for activation function. - pad (str): Padding function module name before dilated convolution layer. - pad_params (dict): Hyperparameters for padding function. + channels (int): + Initial number of channels for conv layer. + max_downsample_channels (int): + Maximum number of channels for downsampling layers. + bias (bool): + Whether to add bias parameter in convolution layers. + downsample_scales (List[int]): + List of downsampling scales. + nonlinear_activation (str): + Activation function module name. + nonlinear_activation_params (dict): + Hyperparameters for activation function. + pad (str): + Padding function module name before dilated convolution layer. + pad_params (dict): + Hyperparameters for padding function. """ super().__init__() @@ -364,7 +390,8 @@ class MelGANDiscriminator(nn.Layer): def forward(self, x): """Calculate forward propagation. Args: - x (Tensor): Input noise signal (B, 1, T). + x (Tensor): + Input noise signal (B, 1, T). Returns: List: List of output tensors of each layer (for feat_match_loss). """ @@ -406,22 +433,37 @@ class MelGANMultiScaleDiscriminator(nn.Layer): """Initilize MelGAN multi-scale discriminator module. Args: - in_channels (int): Number of input channels. - out_channels (int): Number of output channels. - scales (int): Number of multi-scales. - downsample_pooling (str): Pooling module name for downsampling of the inputs. - downsample_pooling_params (dict): Parameters for the above pooling module. - kernel_sizes (List[int]): List of two kernel sizes. 
The sum will be used for the first conv layer, + in_channels (int): + Number of input channels. + out_channels (int): + Number of output channels. + scales (int): + Number of multi-scales. + downsample_pooling (str): + Pooling module name for downsampling of the inputs. + downsample_pooling_params (dict): + Parameters for the above pooling module. + kernel_sizes (List[int]): + List of two kernel sizes. The sum will be used for the first conv layer, and the first and the second kernel sizes will be used for the last two layers. - channels (int): Initial number of channels for conv layer. - max_downsample_channels (int): Maximum number of channels for downsampling layers. - bias (bool): Whether to add bias parameter in convolution layers. - downsample_scales (List[int]): List of downsampling scales. - nonlinear_activation (str): Activation function module name. - nonlinear_activation_params (dict): Hyperparameters for activation function. - pad (str): Padding function module name before dilated convolution layer. - pad_params (dict): Hyperparameters for padding function. - use_causal_conv (bool): Whether to use causal convolution. + channels (int): + Initial number of channels for conv layer. + max_downsample_channels (int): + Maximum number of channels for downsampling layers. + bias (bool): + Whether to add bias parameter in convolution layers. + downsample_scales (List[int]): + List of downsampling scales. + nonlinear_activation (str): + Activation function module name. + nonlinear_activation_params (dict): + Hyperparameters for activation function. + pad (str): + Padding function module name before dilated convolution layer. + pad_params (dict): + Hyperparameters for padding function. + use_causal_conv (bool): + Whether to use causal convolution. """ super().__init__() @@ -464,7 +506,8 @@ class MelGANMultiScaleDiscriminator(nn.Layer): def forward(self, x): """Calculate forward propagation. Args: - x (Tensor): Input noise signal (B, 1, T). 
+ x (Tensor): + Input noise signal (B, 1, T). Returns: List: List of list of each discriminator outputs, which consists of each layer output tensors. """ diff --git a/paddlespeech/t2s/models/melgan/style_melgan.py b/paddlespeech/t2s/models/melgan/style_melgan.py index 40a2f10096680b0dc0420c54ad0373d7f80f1912..d902a4b014ed372f81100930afc2514abe070744 100644 --- a/paddlespeech/t2s/models/melgan/style_melgan.py +++ b/paddlespeech/t2s/models/melgan/style_melgan.py @@ -54,20 +54,34 @@ class StyleMelGANGenerator(nn.Layer): """Initilize Style MelGAN generator. Args: - in_channels (int): Number of input noise channels. - aux_channels (int): Number of auxiliary input channels. - channels (int): Number of channels for conv layer. - out_channels (int): Number of output channels. - kernel_size (int): Kernel size of conv layers. - dilation (int): Dilation factor for conv layers. - bias (bool): Whether to add bias parameter in convolution layers. - noise_upsample_scales (list): List of noise upsampling scales. - noise_upsample_activation (str): Activation function module name for noise upsampling. - noise_upsample_activation_params (dict): Hyperparameters for the above activation function. - upsample_scales (list): List of upsampling scales. - upsample_mode (str): Upsampling mode in TADE layer. - gated_function (str): Gated function in TADEResBlock ("softmax" or "sigmoid"). - use_weight_norm (bool): Whether to use weight norm. + in_channels (int): + Number of input noise channels. + aux_channels (int): + Number of auxiliary input channels. + channels (int): + Number of channels for conv layer. + out_channels (int): + Number of output channels. + kernel_size (int): + Kernel size of conv layers. + dilation (int): + Dilation factor for conv layers. + bias (bool): + Whether to add bias parameter in convolution layers. + noise_upsample_scales (list): + List of noise upsampling scales. + noise_upsample_activation (str): + Activation function module name for noise upsampling. 
+ noise_upsample_activation_params (dict): + Hyperparameters for the above activation function. + upsample_scales (list): + List of upsampling scales. + upsample_mode (str): + Upsampling mode in TADE layer. + gated_function (str): + Gated function in TADEResBlock ("softmax" or "sigmoid"). + use_weight_norm (bool): + Whether to use weight norm. If set to true, it will be applied to all of the conv layers. """ super().__init__() @@ -194,7 +208,8 @@ class StyleMelGANGenerator(nn.Layer): def inference(self, c): """Perform inference. Args: - c (Tensor): Input tensor (T, in_channels). + c (Tensor): + Input tensor (T, in_channels). Returns: Tensor: Output tensor (T ** prod(upsample_scales), out_channels). """ @@ -258,11 +273,16 @@ class StyleMelGANDiscriminator(nn.Layer): """Initilize Style MelGAN discriminator. Args: - repeats (int): Number of repititons to apply RWD. - window_sizes (list): List of random window sizes. - pqmf_params (list): List of list of Parameters for PQMF modules - discriminator_params (dict): Parameters for base discriminator module. - use_weight_nom (bool): Whether to apply weight normalization. + repeats (int): + Number of repetitions to apply RWD. + window_sizes (list): + List of random window sizes. + pqmf_params (list): + List of list of Parameters for PQMF modules + discriminator_params (dict): + Parameters for base discriminator module. + use_weight_nom (bool): + Whether to apply weight normalization. """ super().__init__() @@ -299,7 +319,8 @@ class StyleMelGANDiscriminator(nn.Layer): def forward(self, x): """Calculate forward propagation. Args: - x (Tensor): Input tensor (B, 1, T). + x (Tensor): + Input tensor (B, 1, T). Returns: List: List of discriminator outputs, #items in the list will be equal to repeats * #discriminators. 
diff --git a/paddlespeech/t2s/models/parallel_wavegan/parallel_wavegan.py b/paddlespeech/t2s/models/parallel_wavegan/parallel_wavegan.py index cc8460e4d7131331e66d55e5119942c531923409..be306d9ccf036b5ade08aeb17f1511258d5e758f 100644 --- a/paddlespeech/t2s/models/parallel_wavegan/parallel_wavegan.py +++ b/paddlespeech/t2s/models/parallel_wavegan/parallel_wavegan.py @@ -32,29 +32,45 @@ class PWGGenerator(nn.Layer): """Wave Generator for Parallel WaveGAN Args: - in_channels (int, optional): Number of channels of the input waveform, by default 1 - out_channels (int, optional): Number of channels of the output waveform, by default 1 - kernel_size (int, optional): Kernel size of the residual blocks inside, by default 3 - layers (int, optional): Number of residual blocks inside, by default 30 - stacks (int, optional): The number of groups to split the residual blocks into, by default 3 + in_channels (int, optional): + Number of channels of the input waveform, by default 1 + out_channels (int, optional): + Number of channels of the output waveform, by default 1 + kernel_size (int, optional): + Kernel size of the residual blocks inside, by default 3 + layers (int, optional): + Number of residual blocks inside, by default 30 + stacks (int, optional): + The number of groups to split the residual blocks into, by default 3 Within each group, the dilation of the residual block grows exponentially. - residual_channels (int, optional): Residual channel of the residual blocks, by default 64 - gate_channels (int, optional): Gate channel of the residual blocks, by default 128 - skip_channels (int, optional): Skip channel of the residual blocks, by default 64 - aux_channels (int, optional): Auxiliary channel of the residual blocks, by default 80 - aux_context_window (int, optional): The context window size of the first convolution applied to the - auxiliary input, by default 2 - dropout (float, optional): Dropout of the residual blocks, by default 0. 
- bias (bool, optional): Whether to use bias in residual blocks, by default True - use_weight_norm (bool, optional): Whether to use weight norm in all convolutions, by default True - use_causal_conv (bool, optional): Whether to use causal padding in the upsample network and residual - blocks, by default False - upsample_scales (List[int], optional): Upsample scales of the upsample network, by default [4, 4, 4, 4] - nonlinear_activation (Optional[str], optional): Non linear activation in upsample network, by default None - nonlinear_activation_params (Dict[str, Any], optional): Parameters passed to the linear activation in the upsample network, - by default {} - interpolate_mode (str, optional): Interpolation mode of the upsample network, by default "nearest" - freq_axis_kernel_size (int, optional): Kernel size along the frequency axis of the upsample network, by default 1 + residual_channels (int, optional): + Residual channel of the residual blocks, by default 64 + gate_channels (int, optional): + Gate channel of the residual blocks, by default 128 + skip_channels (int, optional): + Skip channel of the residual blocks, by default 64 + aux_channels (int, optional): + Auxiliary channel of the residual blocks, by default 80 + aux_context_window (int, optional): + The context window size of the first convolution applied to the auxiliary input, by default 2 + dropout (float, optional): + Dropout of the residual blocks, by default 0. 
+ bias (bool, optional): + Whether to use bias in residual blocks, by default True + use_weight_norm (bool, optional): + Whether to use weight norm in all convolutions, by default True + use_causal_conv (bool, optional): + Whether to use causal padding in the upsample network and residual blocks, by default False + upsample_scales (List[int], optional): + Upsample scales of the upsample network, by default [4, 4, 4, 4] + nonlinear_activation (Optional[str], optional): + Non linear activation in upsample network, by default None + nonlinear_activation_params (Dict[str, Any], optional): + Parameters passed to the linear activation in the upsample network, by default {} + interpolate_mode (str, optional): + Interpolation mode of the upsample network, by default "nearest" + freq_axis_kernel_size (int, optional): + Kernel size along the frequency axis of the upsample network, by default 1 """ def __init__( @@ -147,9 +163,11 @@ class PWGGenerator(nn.Layer): """Generate waveform. Args: - x(Tensor): Shape (N, C_in, T), The input waveform. - c(Tensor): Shape (N, C_aux, T'). The auxiliary input (e.g. spectrogram). It - is upsampled to match the time resolution of the input. + x(Tensor): + Shape (N, C_in, T), The input waveform. + c(Tensor): + Shape (N, C_aux, T'). The auxiliary input (e.g. spectrogram). + It is upsampled to match the time resolution of the input. Returns: Tensor: Shape (N, C_out, T), the generated waveform. @@ -195,8 +213,10 @@ class PWGGenerator(nn.Layer): """Waveform generation. This function is used for single instance inference. 
Args: - c(Tensor, optional, optional): Shape (T', C_aux), the auxiliary input, by default None - x(Tensor, optional): Shape (T, C_in), the noise waveform, by default None + c(Tensor, optional): + Shape (T', C_aux), the auxiliary input, by default None + x(Tensor, optional): + Shape (T, C_in), the noise waveform, by default None Returns: Tensor: Shape (T, C_out), the generated waveform @@ -214,20 +234,28 @@ class PWGDiscriminator(nn.Layer): """A convolutional discriminator for audio. Args: - in_channels (int, optional): Number of channels of the input audio, by default 1 - out_channels (int, optional): Output feature size, by default 1 - kernel_size (int, optional): Kernel size of convolutional sublayers, by default 3 - layers (int, optional): Number of layers, by default 10 - conv_channels (int, optional): Feature size of the convolutional sublayers, by default 64 - dilation_factor (int, optional): The factor with which dilation of each convolutional sublayers grows + in_channels (int, optional): + Number of channels of the input audio, by default 1 + out_channels (int, optional): + Output feature size, by default 1 + kernel_size (int, optional): + Kernel size of convolutional sublayers, by default 3 + layers (int, optional): + Number of layers, by default 10 + conv_channels (int, optional): + Feature size of the convolutional sublayers, by default 64 + dilation_factor (int, optional): + The factor with which dilation of each convolutional sublayers grows exponentially if it is greater than 1, else the dilation of each convolutional sublayers grows linearly, by default 1 - nonlinear_activation (str, optional): The activation after each convolutional sublayer, by default "leakyrelu" - nonlinear_activation_params (Dict[str, Any], optional): The parameters passed to the activation's initializer, by default - {"negative_slope": 0.2} - bias (bool, optional): Whether to use bias in convolutional sublayers, by default True - use_weight_norm (bool, optional): 
Whether to use weight normalization at all convolutional sublayers, - by default True + nonlinear_activation (str, optional): + The activation after each convolutional sublayer, by default "leakyrelu" + nonlinear_activation_params (Dict[str, Any], optional): + The parameters passed to the activation's initializer, by default {"negative_slope": 0.2} + bias (bool, optional): + Whether to use bias in convolutional sublayers, by default True + use_weight_norm (bool, optional): + Whether to use weight normalization at all convolutional sublayers, by default True """ def __init__( @@ -290,7 +318,8 @@ class PWGDiscriminator(nn.Layer): """ Args: - x (Tensor): Shape (N, in_channels, num_samples), the input audio. + x (Tensor): + Shape (N, in_channels, num_samples), the input audio. Returns: Tensor: Shape (N, out_channels, num_samples), the predicted logits. @@ -318,24 +347,35 @@ class ResidualPWGDiscriminator(nn.Layer): """A wavenet-style discriminator for audio. Args: - in_channels (int, optional): Number of channels of the input audio, by default 1 - out_channels (int, optional): Output feature size, by default 1 - kernel_size (int, optional): Kernel size of residual blocks, by default 3 - layers (int, optional): Number of residual blocks, by default 30 - stacks (int, optional): Number of groups of residual blocks, within which the dilation + in_channels (int, optional): + Number of channels of the input audio, by default 1 + out_channels (int, optional): + Output feature size, by default 1 + kernel_size (int, optional): + Kernel size of residual blocks, by default 3 + layers (int, optional): + Number of residual blocks, by default 30 + stacks (int, optional): + Number of groups of residual blocks, within which the dilation of each residual blocks grows exponentially, by default 3 - residual_channels (int, optional): Residual channels of residual blocks, by default 64 - gate_channels (int, optional): Gate channels of residual blocks, by default 128 - skip_channels (int, 
optional): Skip channels of residual blocks, by default 64 - dropout (float, optional): Dropout probability of residual blocks, by default 0. - bias (bool, optional): Whether to use bias in residual blocks, by default True - use_weight_norm (bool, optional): Whether to use weight normalization in all convolutional layers, - by default True - use_causal_conv (bool, optional): Whether to use causal convolution in residual blocks, by default False - nonlinear_activation (str, optional): Activation after convolutions other than those in residual blocks, - by default "leakyrelu" - nonlinear_activation_params (Dict[str, Any], optional): Parameters to pass to the activation, - by default {"negative_slope": 0.2} + residual_channels (int, optional): + Residual channels of residual blocks, by default 64 + gate_channels (int, optional): + Gate channels of residual blocks, by default 128 + skip_channels (int, optional): + Skip channels of residual blocks, by default 64 + dropout (float, optional): + Dropout probability of residual blocks, by default 0. + bias (bool, optional): + Whether to use bias in residual blocks, by default True + use_weight_norm (bool, optional): + Whether to use weight normalization in all convolutional layers, by default True + use_causal_conv (bool, optional): + Whether to use causal convolution in residual blocks, by default False + nonlinear_activation (str, optional): + Activation after convolutions other than those in residual blocks, by default "leakyrelu" + nonlinear_activation_params (Dict[str, Any], optional): + Parameters to pass to the activation, by default {"negative_slope": 0.2} """ def __init__( @@ -405,7 +445,8 @@ class ResidualPWGDiscriminator(nn.Layer): def forward(self, x): """ Args: - x(Tensor): Shape (N, in_channels, num_samples), the input audio.↩ + x(Tensor): + Shape (N, in_channels, num_samples), the input audio.↩ Returns: Tensor: Shape (N, out_channels, num_samples), the predicted logits. 
diff --git a/paddlespeech/t2s/models/speedyspeech/speedyspeech.py b/paddlespeech/t2s/models/speedyspeech/speedyspeech.py index ed7c0b7e46733ef851d9f001aa463e5ea9c224ad..395ad69174d16886de9ea8a1c93a58f1edde577f 100644 --- a/paddlespeech/t2s/models/speedyspeech/speedyspeech.py +++ b/paddlespeech/t2s/models/speedyspeech/speedyspeech.py @@ -29,10 +29,14 @@ class ResidualBlock(nn.Layer): n: int=2): """SpeedySpeech encoder module. Args: - channels (int, optional): Feature size of the residual output(and also the input). - kernel_size (int, optional): Kernel size of the 1D convolution. - dilation (int, optional): Dilation of the 1D convolution. - n (int): Number of blocks. + channels (int, optional): + Feature size of the residual output(and also the input). + kernel_size (int, optional): + Kernel size of the 1D convolution. + dilation (int, optional): + Dilation of the 1D convolution. + n (int): + Number of blocks. """ super().__init__() @@ -57,7 +61,8 @@ class ResidualBlock(nn.Layer): def forward(self, x: paddle.Tensor): """Calculate forward propagation. Args: - x(Tensor): Batch of input sequences (B, hidden_size, Tmax). + x(Tensor): + Batch of input sequences (B, hidden_size, Tmax). Returns: Tensor: The residual output (B, hidden_size, Tmax). """ @@ -89,8 +94,10 @@ class TextEmbedding(nn.Layer): def forward(self, text: paddle.Tensor, tone: paddle.Tensor=None): """Calculate forward propagation. Args: - text(Tensor(int64)): Batch of padded token ids (B, Tmax). - tones(Tensor, optional(int64)): Batch of padded tone ids (B, Tmax). + text(Tensor(int64)): + Batch of padded token ids (B, Tmax). + tones(Tensor, optional(int64)): + Batch of padded tone ids (B, Tmax). Returns: Tensor: The residual output (B, Tmax, embedding_size). """ @@ -109,12 +116,18 @@ class TextEmbedding(nn.Layer): class SpeedySpeechEncoder(nn.Layer): """SpeedySpeech encoder module. Args: - vocab_size (int): Dimension of the inputs. - tone_size (Optional[int]): Number of tones. 
- hidden_size (int): Number of encoder hidden units. - kernel_size (int): Kernel size of encoder. - dilations (List[int]): Dilations of encoder. - spk_num (Optional[int]): Number of speakers. + vocab_size (int): + Dimension of the inputs. + tone_size (Optional[int]): + Number of tones. + hidden_size (int): + Number of encoder hidden units. + kernel_size (int): + Kernel size of encoder. + dilations (List[int]): + Dilations of encoder. + spk_num (Optional[int]): + Number of speakers. """ def __init__(self, @@ -161,9 +174,12 @@ class SpeedySpeechEncoder(nn.Layer): spk_id: paddle.Tensor=None): """Encoder input sequence. Args: - text(Tensor(int64)): Batch of padded token ids (B, Tmax). - tones(Tensor, optional(int64)): Batch of padded tone ids (B, Tmax). - spk_id(Tnesor, optional(int64)): Batch of speaker ids (B,) + text(Tensor(int64)): + Batch of padded token ids (B, Tmax). + tones(Tensor, optional(int64)): + Batch of padded tone ids (B, Tmax). + spk_id(Tensor, optional(int64)): + Batch of speaker ids (B,) Returns: Tensor: Output tensor (B, Tmax, hidden_size). @@ -192,7 +208,8 @@ class DurationPredictor(nn.Layer): def forward(self, x: paddle.Tensor): """Calculate forward propagation. Args: - x(Tensor): Batch of input sequences (B, Tmax, hidden_size). + x(Tensor): + Batch of input sequences (B, Tmax, hidden_size). Returns: Tensor: Batch of predicted durations in log domain (B, Tmax). @@ -212,10 +229,14 @@ class SpeedySpeechDecoder(nn.Layer): ]): """SpeedySpeech decoder module. Args: - hidden_size (int): Number of decoder hidden units. - kernel_size (int): Kernel size of decoder. - output_size (int): Dimension of the outputs. - dilations (List[int]): Dilations of decoder. + hidden_size (int): + Number of decoder hidden units. + kernel_size (int): + Kernel size of decoder. + output_size (int): + Dimension of the outputs. + dilations (List[int]): + Dilations of decoder. 
""" super().__init__() res_blocks = [ @@ -230,7 +251,8 @@ class SpeedySpeechDecoder(nn.Layer): def forward(self, x): """Decoder input sequence. Args: - x(Tensor): Input tensor (B, time, hidden_size). + x(Tensor): + Input tensor (B, time, hidden_size). Returns: Tensor: Output tensor (B, time, output_size). @@ -261,18 +283,30 @@ class SpeedySpeech(nn.Layer): positional_dropout_rate: int=0.1): """Initialize SpeedySpeech module. Args: - vocab_size (int): Dimension of the inputs. - encoder_hidden_size (int): Number of encoder hidden units. - encoder_kernel_size (int): Kernel size of encoder. - encoder_dilations (List[int]): Dilations of encoder. - duration_predictor_hidden_size (int): Number of duration predictor hidden units. - decoder_hidden_size (int): Number of decoder hidden units. - decoder_kernel_size (int): Kernel size of decoder. - decoder_dilations (List[int]): Dilations of decoder. - decoder_output_size (int): Dimension of the outputs. - tone_size (Optional[int]): Number of tones. - spk_num (Optional[int]): Number of speakers. - init_type (str): How to initialize transformer parameters. + vocab_size (int): + Dimension of the inputs. + encoder_hidden_size (int): + Number of encoder hidden units. + encoder_kernel_size (int): + Kernel size of encoder. + encoder_dilations (List[int]): + Dilations of encoder. + duration_predictor_hidden_size (int): + Number of duration predictor hidden units. + decoder_hidden_size (int): + Number of decoder hidden units. + decoder_kernel_size (int): + Kernel size of decoder. + decoder_dilations (List[int]): + Dilations of decoder. + decoder_output_size (int): + Dimension of the outputs. + tone_size (Optional[int]): + Number of tones. + spk_num (Optional[int]): + Number of speakers. + init_type (str): + How to initialize transformer parameters. """ super().__init__() @@ -304,14 +338,20 @@ class SpeedySpeech(nn.Layer): spk_id: paddle.Tensor=None): """Calculate forward propagation. 
Args: - text(Tensor(int64)): Batch of padded token ids (B, Tmax). - durations(Tensor(int64)): Batch of padded durations (B, Tmax). - tones(Tensor, optional(int64)): Batch of padded tone ids (B, Tmax). - spk_id(Tnesor, optional(int64)): Batch of speaker ids (B,) + text(Tensor(int64)): + Batch of padded token ids (B, Tmax). + durations(Tensor(int64)): + Batch of padded durations (B, Tmax). + tones(Tensor, optional(int64)): + Batch of padded tone ids (B, Tmax). + spk_id(Tensor, optional(int64)): + Batch of speaker ids (B,) Returns: - Tensor: Output tensor (B, T_frames, decoder_output_size). - Tensor: Predicted durations (B, Tmax). + Tensor: + Output tensor (B, T_frames, decoder_output_size). + Tensor: + Predicted durations (B, Tmax). """ # input of embedding must be int64 text = paddle.cast(text, 'int64') @@ -336,10 +376,14 @@ class SpeedySpeech(nn.Layer): spk_id: paddle.Tensor=None): """Generate the sequence of features given the sequences of characters. Args: - text(Tensor(int64)): Input sequence of characters (T,). - tones(Tensor, optional(int64)): Batch of padded tone ids (T, ). - durations(Tensor, optional (int64)): Groundtruth of duration (T,). - spk_id(Tensor, optional(int64), optional): spk ids (1,). (Default value = None) + text(Tensor(int64)): + Input sequence of characters (T,). + tones(Tensor, optional(int64)): + Batch of padded tone ids (T, ). + durations(Tensor, optional (int64)): + Groundtruth of duration (T,). + spk_id(Tensor, optional(int64)): + spk ids (1,). (Default value = None) Returns: Tensor: logmel (T, decoder_output_size). 
diff --git a/paddlespeech/t2s/models/tacotron2/tacotron2.py b/paddlespeech/t2s/models/tacotron2/tacotron2.py index 7b306e4820de10db9ae8551fffe62ab50d055905..25b5c932ae7d406ee01b38f6d88990dd586828ca 100644 --- a/paddlespeech/t2s/models/tacotron2/tacotron2.py +++ b/paddlespeech/t2s/models/tacotron2/tacotron2.py @@ -83,38 +83,67 @@ class Tacotron2(nn.Layer): init_type: str="xavier_uniform", ): """Initialize Tacotron2 module. Args: - idim (int): Dimension of the inputs. - odim (int): Dimension of the outputs. - embed_dim (int): Dimension of the token embedding. - elayers (int): Number of encoder blstm layers. - eunits (int): Number of encoder blstm units. - econv_layers (int): Number of encoder conv layers. - econv_filts (int): Number of encoder conv filter size. - econv_chans (int): Number of encoder conv filter channels. - dlayers (int): Number of decoder lstm layers. - dunits (int): Number of decoder lstm units. - prenet_layers (int): Number of prenet layers. - prenet_units (int): Number of prenet units. - postnet_layers (int): Number of postnet layers. - postnet_filts (int): Number of postnet filter size. - postnet_chans (int): Number of postnet filter channels. - output_activation (str): Name of activation function for outputs. - adim (int): Number of dimension of mlp in attention. - aconv_chans (int): Number of attention conv filter channels. - aconv_filts (int): Number of attention conv filter size. - cumulate_att_w (bool): Whether to cumulate previous attention weight. - use_batch_norm (bool): Whether to use batch normalization. - use_concate (bool): Whether to concat enc outputs w/ dec lstm outputs. - reduction_factor (int): Reduction factor. - spk_num (Optional[int]): Number of speakers. If set to > 1, assume that the + idim (int): + Dimension of the inputs. + odim (int): + Dimension of the outputs. + embed_dim (int): + Dimension of the token embedding. + elayers (int): + Number of encoder blstm layers. + eunits (int): + Number of encoder blstm units. 
+ econv_layers (int): + Number of encoder conv layers. + econv_filts (int): + Number of encoder conv filter size. + econv_chans (int): + Number of encoder conv filter channels. + dlayers (int): + Number of decoder lstm layers. + dunits (int): + Number of decoder lstm units. + prenet_layers (int): + Number of prenet layers. + prenet_units (int): + Number of prenet units. + postnet_layers (int): + Number of postnet layers. + postnet_filts (int): + Number of postnet filter size. + postnet_chans (int): + Number of postnet filter channels. + output_activation (str): + Name of activation function for outputs. + adim (int): + Number of dimension of mlp in attention. + aconv_chans (int): + Number of attention conv filter channels. + aconv_filts (int): + Number of attention conv filter size. + cumulate_att_w (bool): + Whether to cumulate previous attention weight. + use_batch_norm (bool): + Whether to use batch normalization. + use_concate (bool): + Whether to concat enc outputs w/ dec lstm outputs. + reduction_factor (int): + Reduction factor. + spk_num (Optional[int]): + Number of speakers. If set to > 1, assume that the sids will be provided as the input and use sid embedding layer. - lang_num (Optional[int]): Number of languages. If set to > 1, assume that the + lang_num (Optional[int]): + Number of languages. If set to > 1, assume that the lids will be provided as the input and use sid embedding layer. - spk_embed_dim (Optional[int]): Speaker embedding dimension. If set to > 0, + spk_embed_dim (Optional[int]): + Speaker embedding dimension. If set to > 0, assume that spk_emb will be provided as the input. - spk_embed_integration_type (str): How to integrate speaker embedding. - dropout_rate (float): Dropout rate. - zoneout_rate (float): Zoneout rate. + spk_embed_integration_type (str): + How to integrate speaker embedding. + dropout_rate (float): + Dropout rate. + zoneout_rate (float): + Zoneout rate. 
""" assert check_argument_types() super().__init__() @@ -230,18 +259,28 @@ class Tacotron2(nn.Layer): """Calculate forward propagation. Args: - text (Tensor(int64)): Batch of padded character ids (B, T_text). - text_lengths (Tensor(int64)): Batch of lengths of each input batch (B,). - speech (Tensor): Batch of padded target features (B, T_feats, odim). - speech_lengths (Tensor(int64)): Batch of the lengths of each target (B,). - spk_emb (Optional[Tensor]): Batch of speaker embeddings (B, spk_embed_dim). - spk_id (Optional[Tensor]): Batch of speaker IDs (B, 1). - lang_id (Optional[Tensor]): Batch of language IDs (B, 1). + text (Tensor(int64)): + Batch of padded character ids (B, T_text). + text_lengths (Tensor(int64)): + Batch of lengths of each input batch (B,). + speech (Tensor): + Batch of padded target features (B, T_feats, odim). + speech_lengths (Tensor(int64)): + Batch of the lengths of each target (B,). + spk_emb (Optional[Tensor]): + Batch of speaker embeddings (B, spk_embed_dim). + spk_id (Optional[Tensor]): + Batch of speaker IDs (B, 1). + lang_id (Optional[Tensor]): + Batch of language IDs (B, 1). Returns: - Tensor: Loss scalar value. - Dict: Statistics to be monitored. - Tensor: Weight value if not joint training else model outputs. + Tensor: + Loss scalar value. + Dict: + Statistics to be monitored. + Tensor: + Weight value if not joint training else model outputs. """ text = text[:, :text_lengths.max()] @@ -329,18 +368,30 @@ class Tacotron2(nn.Layer): """Generate the sequence of features given the sequences of characters. Args: - text (Tensor(int64)): Input sequence of characters (T_text,). - speech (Optional[Tensor]): Feature sequence to extract style (N, idim). - spk_emb (ptional[Tensor]): Speaker embedding (spk_embed_dim,). - spk_id (Optional[Tensor]): Speaker ID (1,). - lang_id (Optional[Tensor]): Language ID (1,). - threshold (float): Threshold in inference. - minlenratio (float): Minimum length ratio in inference. 
- maxlenratio (float): Maximum length ratio in inference. - use_att_constraint (bool): Whether to apply attention constraint. - backward_window (int): Backward window in attention constraint. - forward_window (int): Forward window in attention constraint. - use_teacher_forcing (bool): Whether to use teacher forcing. + text (Tensor(int64)): + Input sequence of characters (T_text,). + speech (Optional[Tensor]): + Feature sequence to extract style (N, idim). + spk_emb (Optional[Tensor]): + Speaker embedding (spk_embed_dim,). + spk_id (Optional[Tensor]): + Speaker ID (1,). + lang_id (Optional[Tensor]): + Language ID (1,). + threshold (float): + Threshold in inference. + minlenratio (float): + Minimum length ratio in inference. + maxlenratio (float): + Maximum length ratio in inference. + use_att_constraint (bool): + Whether to apply attention constraint. + backward_window (int): + Backward window in attention constraint. + forward_window (int): + Forward window in attention constraint. + use_teacher_forcing (bool): + Whether to use teacher forcing. Returns: Dict[str, Tensor] diff --git a/paddlespeech/t2s/models/transformer_tts/transformer_tts.py b/paddlespeech/t2s/models/transformer_tts/transformer_tts.py index 92754c30a47e9619643b5f780205a5b47d971841..355fceb16108dcbff689b34d0777cb089b7ba1c8 100644 --- a/paddlespeech/t2s/models/transformer_tts/transformer_tts.py +++ b/paddlespeech/t2s/models/transformer_tts/transformer_tts.py @@ -49,66 +49,124 @@ class TransformerTTS(nn.Layer): https://arxiv.org/pdf/1809.08895.pdf Args: - idim (int): Dimension of the inputs. - odim (int): Dimension of the outputs. - embed_dim (int, optional): Dimension of character embedding. - eprenet_conv_layers (int, optional): Number of encoder prenet convolution layers. - eprenet_conv_chans (int, optional): Number of encoder prenet convolution channels. - eprenet_conv_filts (int, optional): Filter size of encoder prenet convolution. 
- dprenet_layers (int, optional): Number of decoder prenet layers. - dprenet_units (int, optional): Number of decoder prenet hidden units. - elayers (int, optional): Number of encoder layers. - eunits (int, optional): Number of encoder hidden units. - adim (int, optional): Number of attention transformation dimensions. - aheads (int, optional): Number of heads for multi head attention. - dlayers (int, optional): Number of decoder layers. - dunits (int, optional): Number of decoder hidden units. - postnet_layers (int, optional): Number of postnet layers. - postnet_chans (int, optional): Number of postnet channels. - postnet_filts (int, optional): Filter size of postnet. - use_scaled_pos_enc (pool, optional): Whether to use trainable scaled positional encoding. - use_batch_norm (bool, optional): Whether to use batch normalization in encoder prenet. - encoder_normalize_before (bool, optional): Whether to perform layer normalization before encoder block. - decoder_normalize_before (bool, optional): Whether to perform layer normalization before decoder block. - encoder_concat_after (bool, optional): Whether to concatenate attention layer's input and output in encoder. - decoder_concat_after (bool, optional): Whether to concatenate attention layer's input and output in decoder. - positionwise_layer_type (str, optional): Position-wise operation type. - positionwise_conv_kernel_size (int, optional): Kernel size in position wise conv 1d. - reduction_factor (int, optional): Reduction factor. - spk_embed_dim (int, optional): Number of speaker embedding dimenstions. - spk_embed_integration_type (str, optional): How to integrate speaker embedding. - use_gst (str, optional): Whether to use global style token. - gst_tokens (int, optional): The number of GST embeddings. - gst_heads (int, optional): The number of heads in GST multihead attention. - gst_conv_layers (int, optional): The number of conv layers in GST. 
- gst_conv_chans_list (Sequence[int], optional): List of the number of channels of conv layers in GST. - gst_conv_kernel_size (int, optional): Kernal size of conv layers in GST. - gst_conv_stride (int, optional): Stride size of conv layers in GST. - gst_gru_layers (int, optional): The number of GRU layers in GST. - gst_gru_units (int, optional): The number of GRU units in GST. - transformer_lr (float, optional): Initial value of learning rate. - transformer_warmup_steps (int, optional): Optimizer warmup steps. - transformer_enc_dropout_rate (float, optional): Dropout rate in encoder except attention and positional encoding. - transformer_enc_positional_dropout_rate (float, optional): Dropout rate after encoder positional encoding. - transformer_enc_attn_dropout_rate (float, optional): Dropout rate in encoder self-attention module. - transformer_dec_dropout_rate (float, optional): Dropout rate in decoder except attention & positional encoding. - transformer_dec_positional_dropout_rate (float, optional): Dropout rate after decoder positional encoding. - transformer_dec_attn_dropout_rate (float, optional): Dropout rate in deocoder self-attention module. - transformer_enc_dec_attn_dropout_rate (float, optional): Dropout rate in encoder-deocoder attention module. - init_type (str, optional): How to initialize transformer parameters. - init_enc_alpha (float, optional): Initial value of alpha in scaled pos encoding of the encoder. - init_dec_alpha (float, optional): Initial value of alpha in scaled pos encoding of the decoder. - eprenet_dropout_rate (float, optional): Dropout rate in encoder prenet. - dprenet_dropout_rate (float, optional): Dropout rate in decoder prenet. - postnet_dropout_rate (float, optional): Dropout rate in postnet. - use_masking (bool, optional): Whether to apply masking for padded part in loss calculation. - use_weighted_masking (bool, optional): Whether to apply weighted masking in loss calculation. 
- bce_pos_weight (float, optional): Positive sample weight in bce calculation (only for use_masking=true). - loss_type (str, optional): How to calculate loss. - use_guided_attn_loss (bool, optional): Whether to use guided attention loss. - num_heads_applied_guided_attn (int, optional): Number of heads in each layer to apply guided attention loss. - num_layers_applied_guided_attn (int, optional): Number of layers to apply guided attention loss. - List of module names to apply guided attention loss. + idim (int): + Dimension of the inputs. + odim (int): + Dimension of the outputs. + embed_dim (int, optional): + Dimension of character embedding. + eprenet_conv_layers (int, optional): + Number of encoder prenet convolution layers. + eprenet_conv_chans (int, optional): + Number of encoder prenet convolution channels. + eprenet_conv_filts (int, optional): + Filter size of encoder prenet convolution. + dprenet_layers (int, optional): + Number of decoder prenet layers. + dprenet_units (int, optional): + Number of decoder prenet hidden units. + elayers (int, optional): + Number of encoder layers. + eunits (int, optional): + Number of encoder hidden units. + adim (int, optional): + Number of attention transformation dimensions. + aheads (int, optional): + Number of heads for multi head attention. + dlayers (int, optional): + Number of decoder layers. + dunits (int, optional): + Number of decoder hidden units. + postnet_layers (int, optional): + Number of postnet layers. + postnet_chans (int, optional): + Number of postnet channels. + postnet_filts (int, optional): + Filter size of postnet. + use_scaled_pos_enc (bool, optional): + Whether to use trainable scaled positional encoding. + use_batch_norm (bool, optional): + Whether to use batch normalization in encoder prenet. + encoder_normalize_before (bool, optional): + Whether to perform layer normalization before encoder block. 
+ decoder_normalize_before (bool, optional): + Whether to perform layer normalization before decoder block. + encoder_concat_after (bool, optional): + Whether to concatenate attention layer's input and output in encoder. + decoder_concat_after (bool, optional): + Whether to concatenate attention layer's input and output in decoder. + positionwise_layer_type (str, optional): + Position-wise operation type. + positionwise_conv_kernel_size (int, optional): + Kernel size in position wise conv 1d. + reduction_factor (int, optional): + Reduction factor. + spk_embed_dim (int, optional): + Number of speaker embedding dimensions. + spk_embed_integration_type (str, optional): + How to integrate speaker embedding. + use_gst (str, optional): + Whether to use global style token. + gst_tokens (int, optional): + The number of GST embeddings. + gst_heads (int, optional): + The number of heads in GST multihead attention. + gst_conv_layers (int, optional): + The number of conv layers in GST. + gst_conv_chans_list (Sequence[int], optional): + List of the number of channels of conv layers in GST. + gst_conv_kernel_size (int, optional): + Kernel size of conv layers in GST. + gst_conv_stride (int, optional): + Stride size of conv layers in GST. + gst_gru_layers (int, optional): + The number of GRU layers in GST. + gst_gru_units (int, optional): + The number of GRU units in GST. + transformer_lr (float, optional): + Initial value of learning rate. + transformer_warmup_steps (int, optional): + Optimizer warmup steps. + transformer_enc_dropout_rate (float, optional): + Dropout rate in encoder except attention and positional encoding. + transformer_enc_positional_dropout_rate (float, optional): + Dropout rate after encoder positional encoding. + transformer_enc_attn_dropout_rate (float, optional): + Dropout rate in encoder self-attention module. + transformer_dec_dropout_rate (float, optional): + Dropout rate in decoder except attention & positional encoding. 
+ transformer_dec_positional_dropout_rate (float, optional): + Dropout rate after decoder positional encoding. + transformer_dec_attn_dropout_rate (float, optional): + Dropout rate in decoder self-attention module. + transformer_enc_dec_attn_dropout_rate (float, optional): + Dropout rate in encoder-decoder attention module. + init_type (str, optional): + How to initialize transformer parameters. + init_enc_alpha (float, optional): + Initial value of alpha in scaled pos encoding of the encoder. + init_dec_alpha (float, optional): + Initial value of alpha in scaled pos encoding of the decoder. + eprenet_dropout_rate (float, optional): + Dropout rate in encoder prenet. + dprenet_dropout_rate (float, optional): + Dropout rate in decoder prenet. + postnet_dropout_rate (float, optional): + Dropout rate in postnet. + use_masking (bool, optional): + Whether to apply masking for padded part in loss calculation. + use_weighted_masking (bool, optional): + Whether to apply weighted masking in loss calculation. + bce_pos_weight (float, optional): + Positive sample weight in bce calculation (only for use_masking=true). + loss_type (str, optional): + How to calculate loss. + use_guided_attn_loss (bool, optional): + Whether to use guided attention loss. + num_heads_applied_guided_attn (int, optional): + Number of heads in each layer to apply guided attention loss. + num_layers_applied_guided_attn (int, optional): + Number of layers to apply guided attention loss. """ def __init__( diff --git a/paddlespeech/t2s/models/waveflow.py b/paddlespeech/t2s/models/waveflow.py index 52e6005be3969e1ad89c0d634efbbb62dfc1a68e..8e2ce822fd294c4f7a3eff3716a7c7a827bb60fa 100644 --- a/paddlespeech/t2s/models/waveflow.py +++ b/paddlespeech/t2s/models/waveflow.py @@ -33,8 +33,10 @@ def fold(x, n_group): """Fold audio or spectrogram's temporal dimension in to groups. Args: - x(Tensor): The input tensor. shape=(*, time_steps) - n_group(int): The size of a group. + x(Tensor): + The input tensor. 
shape=(*, time_steps) + n_group(int): + The size of a group. Returns: Tensor: Folded tensor. shape=(*, time_steps // n_group, group) @@ -53,7 +55,8 @@ class UpsampleNet(nn.LayerList): on mel and time dimension. Args: - upscale_factors(List[int], optional): Time upsampling factors for each Conv2DTranspose Layer. + upscale_factors(List[int], optional): + Time upsampling factors for each Conv2DTranspose Layer. The ``UpsampleNet`` contains ``len(upscale_factor)`` Conv2DTranspose Layers. Each upscale_factor is used as the ``stride`` for the corresponding Conv2DTranspose. Defaults to [16, 16], this the default @@ -94,8 +97,10 @@ class UpsampleNet(nn.LayerList): """Forward pass of the ``UpsampleNet`` Args: - x(Tensor): The input spectrogram. shape=(batch_size, input_channels, time_steps) - trim_conv_artifact(bool, optional, optional): Trim deconvolution artifact at each layer. Defaults to False. + x(Tensor): + The input spectrogram. shape=(batch_size, input_channels, time_steps) + trim_conv_artifact(bool, optional): + Trim deconvolution artifact at each layer. Defaults to False. Returns: Tensor: The upsampled spectrogram. shape=(batch_size, input_channels, time_steps * upsample_factor) @@ -123,10 +128,14 @@ class ResidualBlock(nn.Layer): and output. Args: - channels (int): Feature size of the input. - cond_channels (int): Featuer size of the condition. - kernel_size (Tuple[int]): Kernel size of the Convolution2d applied to the input. - dilations (int): Dilations of the Convolution2d applied to the input. + channels (int): + Feature size of the input. + cond_channels (int): + Feature size of the condition. + kernel_size (Tuple[int]): + Kernel size of the Convolution2d applied to the input. + dilations (int): + Dilations of the Convolution2d applied to the input. """ def __init__(self, channels, cond_channels, kernel_size, dilations): @@ -173,12 +182,16 @@ class ResidualBlock(nn.Layer): """Compute output for a whole folded sequence. Args: - x (Tensor): The input. 
[shape=(batch_size, channel, height, width)] - condition (Tensor [shape=(batch_size, condition_channel, height, width)]): The local condition. + x (Tensor): + The input. [shape=(batch_size, channel, height, width)] + condition (Tensor): + The local condition. [shape=(batch_size, condition_channel, height, width)] Returns: - res (Tensor): The residual output. [shape=(batch_size, channel, height, width)] - skip (Tensor): The skip output. [shape=(batch_size, channel, height, width)] + res (Tensor): + The residual output. [shape=(batch_size, channel, height, width)] + skip (Tensor): + The skip output. [shape=(batch_size, channel, height, width)] """ x_in = x x = self.conv(x) @@ -216,12 +229,16 @@ class ResidualBlock(nn.Layer): """Compute the output for a row and update the buffer. Args: - x_row (Tensor): A row of the input. shape=(batch_size, channel, 1, width) - condition_row (Tensor): A row of the condition. shape=(batch_size, condition_channel, 1, width) + x_row (Tensor): + A row of the input. shape=(batch_size, channel, 1, width) + condition_row (Tensor): + A row of the condition. shape=(batch_size, condition_channel, 1, width) Returns: - res (Tensor): A row of the the residual output. shape=(batch_size, channel, 1, width) - skip (Tensor): A row of the skip output. shape=(batch_size, channel, 1, width) + res (Tensor): + A row of the residual output. shape=(batch_size, channel, 1, width) + skip (Tensor): + A row of the skip output. shape=(batch_size, channel, 1, width) """ x_row_in = x_row @@ -258,11 +275,16 @@ class ResidualNet(nn.LayerList): """A stack of several ResidualBlocks. It merges condition at each layer. Args: - n_layer (int): Number of ResidualBlocks in the ResidualNet. - residual_channels (int): Feature size of each ResidualBlocks. - condition_channels (int): Feature size of the condition. - kernel_size (Tuple[int]): Kernel size of each ResidualBlock. - dilations_h (List[int]): Dilation in height dimension of every ResidualBlock. 
+ n_layer (int): + Number of ResidualBlocks in the ResidualNet. + residual_channels (int): + Feature size of each ResidualBlock. + condition_channels (int): + Feature size of the condition. + kernel_size (Tuple[int]): + Kernel size of each ResidualBlock. + dilations_h (List[int]): + Dilation in height dimension of every ResidualBlock. Raises: ValueError: If the length of dilations_h does not equals n_layers. @@ -288,11 +310,13 @@ class ResidualNet(nn.LayerList): """Comput the output of given the input and the condition. Args: - x (Tensor): The input. shape=(batch_size, channel, height, width) - condition (Tensor): The local condition. shape=(batch_size, condition_channel, height, width) + x (Tensor): + The input. shape=(batch_size, channel, height, width) + condition (Tensor): + The local condition. shape=(batch_size, condition_channel, height, width) Returns: - Tensor : The output, which is an aggregation of all the skip outputs. shape=(batch_size, channel, height, width) + Tensor: The output, which is an aggregation of all the skip outputs. shape=(batch_size, channel, height, width) """ skip_connections = [] @@ -312,12 +336,16 @@ class ResidualNet(nn.LayerList): """Compute the output for a row and update the buffers. Args: - x_row (Tensor): A row of the input. shape=(batch_size, channel, 1, width) - condition_row (Tensor): A row of the condition. shape=(batch_size, condition_channel, 1, width) + x_row (Tensor): + A row of the input. shape=(batch_size, channel, 1, width) + condition_row (Tensor): + A row of the condition. shape=(batch_size, condition_channel, 1, width) Returns: - res (Tensor): A row of the the residual output. shape=(batch_size, channel, 1, width) - skip (Tensor): A row of the skip output. shape=(batch_size, channel, 1, width) + res (Tensor): + A row of the residual output. shape=(batch_size, channel, 1, width) + skip (Tensor): + A row of the skip output. 
shape=(batch_size, channel, 1, width) """ skip_connections = [] @@ -337,11 +365,16 @@ class Flow(nn.Layer): sampling. Args: - n_layers (int): Number of ResidualBlocks in the Flow. - channels (int): Feature size of the ResidualBlocks. - mel_bands (int): Feature size of the mel spectrogram (mel bands). - kernel_size (Tuple[int]): Kernel size of each ResisualBlocks in the Flow. - n_group (int): Number of timesteps to the folded into a group. + n_layers (int): + Number of ResidualBlocks in the Flow. + channels (int): + Feature size of the ResidualBlocks. + mel_bands (int): + Feature size of the mel spectrogram (mel bands). + kernel_size (Tuple[int]): + Kernel size of each ResidualBlock in the Flow. + n_group (int): + Number of timesteps to be folded into a group. """ dilations_dict = { 8: [1, 1, 1, 1, 1, 1, 1, 1], @@ -393,11 +426,14 @@ class Flow(nn.Layer): a sample from p(X) into a sample from p(Z). Args: - x (Tensor): A input sample of the distribution p(X). shape=(batch, 1, height, width) - condition (Tensor): The local condition. shape=(batch, condition_channel, height, width) + x (Tensor): + An input sample of the distribution p(X). shape=(batch, 1, height, width) + condition (Tensor): + The local condition. shape=(batch, condition_channel, height, width) Returns: - z (Tensor): shape(batch, 1, height, width), the transformed sample. + z (Tensor): + shape=(batch, 1, height, width), the transformed sample. Tuple[Tensor, Tensor]: The parameter of the transformation. logs (Tensor): shape(batch, 1, height - 1, width), the log scale of the transformation from x to z. @@ -433,8 +469,10 @@ class Flow(nn.Layer): p(Z) and transform the sample. It is a auto regressive transformation. Args: - z(Tensor): A sample of the distribution p(Z). shape=(batch, 1, time_steps - condition(Tensor): The local condition. shape=(batch, condition_channel, time_steps) + z(Tensor): + A sample of the distribution p(Z). shape=(batch, 1, time_steps) + condition(Tensor): + The local condition. 
shape=(batch, condition_channel, time_steps) Returns: Tensor: The transformed sample. shape=(batch, 1, height, width) @@ -462,12 +500,18 @@ class WaveFlow(nn.LayerList): flows. Args: - n_flows (int): Number of flows in the WaveFlow model. - n_layers (int): Number of ResidualBlocks in each Flow. - n_group (int): Number of timesteps to fold as a group. - channels (int): Feature size of each ResidualBlock. - mel_bands (int): Feature size of mel spectrogram (mel bands). - kernel_size (Union[int, List[int]]): Kernel size of the convolution layer in each ResidualBlock. + n_flows (int): + Number of flows in the WaveFlow model. + n_layers (int): + Number of ResidualBlocks in each Flow. + n_group (int): + Number of timesteps to fold as a group. + channels (int): + Feature size of each ResidualBlock. + mel_bands (int): + Feature size of mel spectrogram (mel bands). + kernel_size (Union[int, List[int]]): + Kernel size of the convolution layer in each ResidualBlock. """ def __init__(self, n_flows, n_layers, n_group, channels, mel_bands, @@ -518,12 +562,16 @@ class WaveFlow(nn.LayerList): condition. Args: - x (Tensor): The audio. shape=(batch_size, time_steps) - condition (Tensor): The local condition (mel spectrogram here). shape=(batch_size, condition channel, time_steps) + x (Tensor): + The audio. shape=(batch_size, time_steps) + condition (Tensor): + The local condition (mel spectrogram here). shape=(batch_size, condition channel, time_steps) Returns: - Tensor: The transformed random variable. shape=(batch_size, time_steps) - Tensor: The log determinant of the jacobian of the transformation from x to z. shape=(1,) + Tensor: + The transformed random variable. shape=(batch_size, time_steps) + Tensor: + The log determinant of the jacobian of the transformation from x to z. shape=(1,) """ # x: (B, T) # condition: (B, C, T) upsampled condition @@ -559,12 +607,13 @@ class WaveFlow(nn.LayerList): autoregressive manner. Args: - z (Tensor): A sample of the distribution p(Z). 
shape=(batch, 1, time_steps - condition (Tensor): The local condition. shape=(batch, condition_channel, time_steps) + z (Tensor): + A sample of the distribution p(Z). shape=(batch, 1, time_steps + condition (Tensor): + The local condition. shape=(batch, condition_channel, time_steps) Returns: Tensor: The transformed sample (audio here). shape=(batch_size, time_steps) - """ z, condition = self._trim(z, condition) @@ -590,13 +639,20 @@ class ConditionalWaveFlow(nn.LayerList): """ConditionalWaveFlow, a UpsampleNet with a WaveFlow model. Args: - upsample_factors (List[int]): Upsample factors for the upsample net. - n_flows (int): Number of flows in the WaveFlow model. - n_layers (int): Number of ResidualBlocks in each Flow. - n_group (int): Number of timesteps to fold as a group. - channels (int): Feature size of each ResidualBlock. - n_mels (int): Feature size of mel spectrogram (mel bands). - kernel_size (Union[int, List[int]]): Kernel size of the convolution layer in each ResidualBlock. + upsample_factors (List[int]): + Upsample factors for the upsample net. + n_flows (int): + Number of flows in the WaveFlow model. + n_layers (int): + Number of ResidualBlocks in each Flow. + n_group (int): + Number of timesteps to fold as a group. + channels (int): + Feature size of each ResidualBlock. + n_mels (int): + Feature size of mel spectrogram (mel bands). + kernel_size (Union[int, List[int]]): + Kernel size of the convolution layer in each ResidualBlock. """ def __init__(self, @@ -622,12 +678,16 @@ class ConditionalWaveFlow(nn.LayerList): the determinant of the jacobian of the transformation from x to z. Args: - audio(Tensor): The audio. shape=(B, T) - mel(Tensor): The mel spectrogram. shape=(B, C_mel, T_mel) + audio(Tensor): + The audio. shape=(B, T) + mel(Tensor): + The mel spectrogram. shape=(B, C_mel, T_mel) Returns: - Tensor: The inversely transformed random variable z (x to z). 
shape=(B, T) - Tensor: the log of the determinant of the jacobian of the transformation from x to z. shape=(1,) + Tensor: + The inversely transformed random variable z (x to z). shape=(B, T) + Tensor: + the log of the determinant of the jacobian of the transformation from x to z. shape=(1,) """ condition = self.encoder(mel) z, log_det_jacobian = self.decoder(audio, condition) @@ -638,10 +698,12 @@ class ConditionalWaveFlow(nn.LayerList): """Generate raw audio given mel spectrogram. Args: - mel(np.ndarray): Mel spectrogram of an utterance(in log-magnitude). shape=(C_mel, T_mel) + mel(np.ndarray): + Mel spectrogram of an utterance(in log-magnitude). shape=(C_mel, T_mel) Returns: - Tensor: The synthesized audio, where``T <= T_mel * upsample_factors``. shape=(B, T) + Tensor: + The synthesized audio, where``T <= T_mel * upsample_factors``. shape=(B, T) """ start = time.time() condition = self.encoder(mel, trim_conv_artifact=True) # (B, C, T) @@ -657,7 +719,8 @@ class ConditionalWaveFlow(nn.LayerList): """Generate raw audio given mel spectrogram. Args: - mel(np.ndarray): Mel spectrogram of an utterance(in log-magnitude). shape=(C_mel, T_mel) + mel(np.ndarray): + Mel spectrogram of an utterance(in log-magnitude). shape=(C_mel, T_mel) Returns: np.ndarray: The synthesized audio. shape=(T,) @@ -673,8 +736,10 @@ class ConditionalWaveFlow(nn.LayerList): """Build a ConditionalWaveFlow model from a pretrained model. Args: - config(yacs.config.CfgNode): model configs - checkpoint_path(Path or str): the path of pretrained model checkpoint, without extension name + config(yacs.config.CfgNode): + model configs + checkpoint_path(Path or str): + the path of pretrained model checkpoint, without extension name Returns: ConditionalWaveFlow The model built from pretrained result. @@ -694,8 +759,8 @@ class WaveFlowLoss(nn.Layer): """Criterion of a WaveFlow model. Args: - sigma (float): The standard deviation of the gaussian noise used in WaveFlow, - by default 1.0. 
+ sigma (float): + The standard deviation of the gaussian noise used in WaveFlow, by default 1.0. """ def __init__(self, sigma=1.0): @@ -708,8 +773,10 @@ class WaveFlowLoss(nn.Layer): log_det_jacobian of transformation from x to z. Args: - z(Tensor): The transformed random variable (x to z). shape=(B, T) - log_det_jacobian(Tensor): The log of the determinant of the jacobian matrix of the + z(Tensor): + The transformed random variable (x to z). shape=(B, T) + log_det_jacobian(Tensor): + The log of the determinant of the jacobian matrix of the transformation from x to z. shape=(1,) Returns: @@ -726,7 +793,8 @@ class ConditionalWaveFlow2Infer(ConditionalWaveFlow): """Generate raw audio given mel spectrogram. Args: - mel (np.ndarray): Mel spectrogram of an utterance(in log-magnitude). shape=(C_mel, T_mel) + mel (np.ndarray): + Mel spectrogram of an utterance(in log-magnitude). shape=(C_mel, T_mel) Returns: np.ndarray: The synthesized audio. shape=(T,) diff --git a/paddlespeech/t2s/models/wavernn/wavernn.py b/paddlespeech/t2s/models/wavernn/wavernn.py index eb892eda56e5f412b3bf20fca864dfec0ff150cc..254edbb2df0faec9d896ad1a1cc426e438fa27d1 100644 --- a/paddlespeech/t2s/models/wavernn/wavernn.py +++ b/paddlespeech/t2s/models/wavernn/wavernn.py @@ -165,19 +165,29 @@ class WaveRNN(nn.Layer): init_type: str="xavier_uniform", ): ''' Args: - rnn_dims (int, optional): Hidden dims of RNN Layers. - fc_dims (int, optional): Dims of FC Layers. - bits (int, optional): bit depth of signal. - aux_context_window (int, optional): The context window size of the first convolution applied to the - auxiliary input, by default 2 - upsample_scales (List[int], optional): Upsample scales of the upsample network. - aux_channels (int, optional): Auxiliary channel of the residual blocks. - compute_dims (int, optional): Dims of Conv1D in MelResNet. - res_out_dims (int, optional): Dims of output in MelResNet. - res_blocks (int, optional): Number of residual blocks. 
- mode (str, optional): Output mode of the WaveRNN vocoder. + rnn_dims (int, optional): + Hidden dims of RNN Layers. + fc_dims (int, optional): + Dims of FC Layers. + bits (int, optional): + bit depth of signal. + aux_context_window (int, optional): + The context window size of the first convolution applied to the auxiliary input, by default 2 + upsample_scales (List[int], optional): + Upsample scales of the upsample network. + aux_channels (int, optional): + Auxiliary channel of the residual blocks. + compute_dims (int, optional): + Dims of Conv1D in MelResNet. + res_out_dims (int, optional): + Dims of output in MelResNet. + res_blocks (int, optional): + Number of residual blocks. + mode (str, optional): + Output mode of the WaveRNN vocoder. `MOL` for Mixture of Logistic Distribution, and `RAW` for quantized bits as the model's output. - init_type (str): How to initialize parameters. + init_type (str): + How to initialize parameters. ''' super().__init__() self.mode = mode @@ -226,8 +236,10 @@ class WaveRNN(nn.Layer): def forward(self, x, c): ''' Args: - x (Tensor): wav sequence, [B, T] - c (Tensor): mel spectrogram [B, C_aux, T'] + x (Tensor): + wav sequence, [B, T] + c (Tensor): + mel spectrogram [B, C_aux, T'] T = (T' - 2 * aux_context_window ) * hop_length Returns: @@ -280,10 +292,14 @@ class WaveRNN(nn.Layer): gen_display: bool=False): """ Args: - c(Tensor): input mels, (T', C_aux) - batched(bool): generate in batch or not - target(int): target number of samples to be generated in each batch entry - overlap(int): number of samples for crossfading between batches + c(Tensor): + input mels, (T', C_aux) + batched(bool): + generate in batch or not + target(int): + target number of samples to be generated in each batch entry + overlap(int): + number of samples for crossfading between batches mu_law(bool) Returns: wav sequence: Output (T' * prod(upsample_scales), out_channels, C_out). 
@@ -404,7 +420,8 @@ class WaveRNN(nn.Layer): def pad_tensor(self, x, pad, side='both'): ''' Args: - x(Tensor): mel, [1, n_frames, 80] + x(Tensor): + mel, [1, n_frames, 80] pad(int): side(str, optional): (Default value = 'both') @@ -428,12 +445,15 @@ class WaveRNN(nn.Layer): Overlap will be used for crossfading in xfade_and_unfold() Args: - x(Tensor): Upsampled conditioning features. mels or aux + x(Tensor): + Upsampled conditioning features. mels or aux shape=(1, T, features) mels: [1, T, 80] aux: [1, T, 128] - target(int): Target timesteps for each index of batch - overlap(int): Timesteps for both xfade and rnn warmup + target(int): + Target timesteps for each index of batch + overlap(int): + Timesteps for both xfade and rnn warmup Returns: Tensor: diff --git a/paddlespeech/t2s/modules/causal_conv.py b/paddlespeech/t2s/modules/causal_conv.py index 3abccc15f45e0911f18535efe8575177f735c66b..337ee2383a69c4e773f1a345c2582c8b929a0a24 100644 --- a/paddlespeech/t2s/modules/causal_conv.py +++ b/paddlespeech/t2s/modules/causal_conv.py @@ -42,7 +42,8 @@ class CausalConv1D(nn.Layer): def forward(self, x): """Calculate forward propagation. Args: - x (Tensor): Input tensor (B, in_channels, T). + x (Tensor): + Input tensor (B, in_channels, T). Returns: Tensor: Output tensor (B, out_channels, T). """ @@ -67,7 +68,8 @@ class CausalConv1DTranspose(nn.Layer): def forward(self, x): """Calculate forward propagation. Args: - x (Tensor): Input tensor (B, in_channels, T_in). + x (Tensor): + Input tensor (B, in_channels, T_in). Returns: Tensor: Output tensor (B, out_channels, T_out). 
""" diff --git a/paddlespeech/t2s/modules/conformer/convolution.py b/paddlespeech/t2s/modules/conformer/convolution.py index 185c62fb3c804f9ce495323f590878072d8bafa6..dadda064075d10a79e946258a2bd72d5904b7862 100644 --- a/paddlespeech/t2s/modules/conformer/convolution.py +++ b/paddlespeech/t2s/modules/conformer/convolution.py @@ -20,8 +20,10 @@ class ConvolutionModule(nn.Layer): """ConvolutionModule in Conformer model. Args: - channels (int): The number of channels of conv layers. - kernel_size (int): Kernerl size of conv layers. + channels (int): + The number of channels of conv layers. + kernel_size (int): + Kernel size of conv layers. """ def __init__(self, channels, kernel_size, activation=nn.ReLU(), bias=True): @@ -59,7 +61,8 @@ class ConvolutionModule(nn.Layer): """Compute convolution module. Args: - x (Tensor): Input tensor (#batch, time, channels). + x (Tensor): + Input tensor (#batch, time, channels). Returns: Tensor: Output tensor (#batch, time, channels). """ diff --git a/paddlespeech/t2s/modules/conformer/encoder_layer.py b/paddlespeech/t2s/modules/conformer/encoder_layer.py index 61c32612527630ec66941b882335a208a50d1b11..26a354565a8b62e3f8941319b69961b794d18fbb 100644 --- a/paddlespeech/t2s/modules/conformer/encoder_layer.py +++ b/paddlespeech/t2s/modules/conformer/encoder_layer.py @@ -23,25 +23,34 @@ class EncoderLayer(nn.Layer): """Encoder layer module. Args: - size (int): Input dimension. - self_attn (nn.Layer): Self-attention module instance. + size (int): + Input dimension. + self_attn (nn.Layer): + Self-attention module instance. `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` instance can be used as the argument. - feed_forward (nn.Layer): Feed-forward module instance. + feed_forward (nn.Layer): + Feed-forward module instance. `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance can be used as the argument. - feed_forward_macaron (nn.Layer): Additional feed-forward module instance. 
+ feed_forward_macaron (nn.Layer): + Additional feed-forward module instance. `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance can be used as the argument. - conv_module (nn.Layer): Convolution module instance. + conv_module (nn.Layer): + Convolution module instance. `ConvlutionModule` instance can be used as the argument. - dropout_rate (float): Dropout rate. - normalize_before (bool): Whether to use layer_norm before the first block. - concat_after (bool): Whether to concat attention layer's input and output. + dropout_rate (float): + Dropout rate. + normalize_before (bool): + Whether to use layer_norm before the first block. + concat_after (bool): + Whether to concat attention layer's input and output. if True, additional linear will be applied. i.e. x -> x + linear(concat(x, att(x))) if False, no additional linear will be applied. i.e. x -> x + att(x) - stochastic_depth_rate (float): Proability to skip this layer. + stochastic_depth_rate (float): + Probability to skip this layer. During training, the layer may skip residual computation and return input as-is with given probability. """ @@ -86,15 +95,19 @@ class EncoderLayer(nn.Layer): """Compute encoded features. Args: - x_input(Union[Tuple, Tensor]): Input tensor w/ or w/o pos emb. + x_input(Union[Tuple, Tensor]): + Input tensor w/ or w/o pos emb. - w/ pos emb: Tuple of tensors [(#batch, time, size), (1, time, size)]. - w/o pos emb: Tensor (#batch, time, size). - mask(Tensor): Mask tensor for the input (#batch, time). + mask(Tensor): + Mask tensor for the input (#batch, time). cache (Tensor): Returns: - Tensor: Output tensor (#batch, time, size). - Tensor: Mask tensor (#batch, time). + Tensor: + Output tensor (#batch, time, size). + Tensor: + Mask tensor (#batch, time). 
""" if isinstance(x_input, tuple): x, pos_emb = x_input[0], x_input[1] diff --git a/paddlespeech/t2s/modules/conv.py b/paddlespeech/t2s/modules/conv.py index aa875bd500124e5bd3d3807b10f63ed8442d3800..922af03f2d1a87094f31d86e5d645fa94be163ff 100644 --- a/paddlespeech/t2s/modules/conv.py +++ b/paddlespeech/t2s/modules/conv.py @@ -42,13 +42,19 @@ class Conv1dCell(nn.Conv1D): class. Args: - in_channels (int): The feature size of the input. - out_channels (int): The feature size of the output. - kernel_size (int or Tuple[int]): The size of the kernel. - dilation (int or Tuple[int]): The dilation of the convolution, by default 1 - weight_attr (ParamAttr, Initializer, str or bool, optional) : The parameter attribute of the convolution kernel, + in_channels (int): + The feature size of the input. + out_channels (int): + The feature size of the output. + kernel_size (int or Tuple[int]): + The size of the kernel. + dilation (int or Tuple[int]): + The dilation of the convolution, by default 1 + weight_attr (ParamAttr, Initializer, str or bool, optional): + The parameter attribute of the convolution kernel, by default None. - bias_attr (ParamAttr, Initializer, str or bool, optional):The parameter attribute of the bias. + bias_attr (ParamAttr, Initializer, str or bool, optional): + The parameter attribute of the bias. If ``False``, this layer does not have a bias, by default None. Examples: @@ -122,7 +128,8 @@ class Conv1dCell(nn.Conv1D): """Initialize the buffer for the step input. Args: - x_t (Tensor): The step input. shape=(batch_size, in_channels) + x_t (Tensor): + The step input. shape=(batch_size, in_channels) """ batch_size, _ = x_t.shape @@ -134,7 +141,8 @@ class Conv1dCell(nn.Conv1D): """Shift the buffer by one step. Args: - x_t (Tensor): The step input. shape=(batch_size, in_channels) + x_t (Tensor): + The step input. 
shape=(batch_size, in_channels) """ self._buffer = paddle.concat( @@ -144,10 +152,12 @@ class Conv1dCell(nn.Conv1D): """Add step input and compute step output. Args: - x_t (Tensor): The step input. shape=(batch_size, in_channels) + x_t (Tensor): + The step input. shape=(batch_size, in_channels) Returns: - y_t (Tensor): The step output. shape=(batch_size, out_channels) + y_t (Tensor): + The step output. shape=(batch_size, out_channels) """ batch_size = x_t.shape[0] @@ -173,10 +183,14 @@ class Conv1dBatchNorm(nn.Layer): """A Conv1D Layer followed by a BatchNorm1D. Args: - in_channels (int): The feature size of the input. - out_channels (int): The feature size of the output. - kernel_size (int): The size of the convolution kernel. - stride (int, optional): The stride of the convolution, by default 1. + in_channels (int): + The feature size of the input. + out_channels (int): + The feature size of the output. + kernel_size (int): + The size of the convolution kernel. + stride (int, optional): + The stride of the convolution, by default 1. padding (int, str or Tuple[int], optional): The padding of the convolution. If int, a symmetrical padding is applied before convolution; @@ -189,9 +203,12 @@ class Conv1dBatchNorm(nn.Layer): bias_attr (ParamAttr, Initializer, str or bool, optional): The parameter attribute of the bias of the convolution, by defaultNone. 
- data_format (str ["NCL" or "NLC"], optional): The data layout of the input, by default "NCL" - momentum (float, optional): The momentum of the BatchNorm1D layer, by default 0.9 - epsilon (float, optional): The epsilon of the BatchNorm1D layer, by default 1e-05 + data_format (str ["NCL" or "NLC"], optional): + The data layout of the input, by default "NCL" + momentum (float, optional): + The momentum of the BatchNorm1D layer, by default 0.9 + epsilon (float, optional): + The epsilon of the BatchNorm1D layer, by default 1e-05 """ def __init__(self, @@ -225,12 +242,13 @@ class Conv1dBatchNorm(nn.Layer): """Forward pass of the Conv1dBatchNorm layer. Args: - x (Tensor): The input tensor. Its data layout depends on ``data_format``. - shape=(B, C_in, T_in) or (B, T_in, C_in) + x (Tensor): + The input tensor. Its data layout depends on ``data_format``. + shape=(B, C_in, T_in) or (B, T_in, C_in) Returns: - Tensor: The output tensor. - shape=(B, C_out, T_out) or (B, T_out, C_out) + Tensor: + The output tensor. shape=(B, C_out, T_out) or (B, T_out, C_out) """ x = self.conv(x) diff --git a/paddlespeech/t2s/modules/geometry.py b/paddlespeech/t2s/modules/geometry.py index 01eb5ad0ab2479cff21d210c3b2f1aa5742fbd4c..80c872a817d6e16c4117dc941580c038e72fe83c 100644 --- a/paddlespeech/t2s/modules/geometry.py +++ b/paddlespeech/t2s/modules/geometry.py @@ -19,8 +19,10 @@ def shuffle_dim(x, axis, perm=None): """Permute input tensor along aixs given the permutation or randomly. Args: - x (Tensor): The input tensor. - axis (int): The axis to shuffle. + x (Tensor): + The input tensor. + axis (int): + The axis to shuffle. perm (List[int], ndarray, optional): The order to reorder the tensor along the ``axis``-th dimension. 
It is a permutation of ``[0, d)``, where d is the size of the diff --git a/paddlespeech/t2s/modules/layer_norm.py b/paddlespeech/t2s/modules/layer_norm.py index 088b98e02cf3fc987da54b881cf8060dfe15ecf2..9e2add29334a646fb4191eeb2d8bd6a8041531d8 100644 --- a/paddlespeech/t2s/modules/layer_norm.py +++ b/paddlespeech/t2s/modules/layer_norm.py @@ -19,8 +19,10 @@ from paddle import nn class LayerNorm(nn.LayerNorm): """Layer normalization module. Args: - nout (int): Output dim size. - dim (int): Dimension to be normalized. + nout (int): + Output dim size. + dim (int): + Dimension to be normalized. """ def __init__(self, nout, dim=-1): @@ -32,7 +34,8 @@ class LayerNorm(nn.LayerNorm): """Apply layer normalization. Args: - x (Tensor):Input tensor. + x (Tensor): + Input tensor. Returns: Tensor: Normalized tensor. diff --git a/paddlespeech/t2s/modules/losses.py b/paddlespeech/t2s/modules/losses.py index 4726f40ecf1ee3c8208bead0919f348cb679de4a..b2a31a32145afb981bff432579ddc513b937bd6f 100644 --- a/paddlespeech/t2s/modules/losses.py +++ b/paddlespeech/t2s/modules/losses.py @@ -269,8 +269,10 @@ class GuidedAttentionLoss(nn.Layer): """Make masks indicating non-padded part. Args: - ilens(Tensor(int64) or List): Batch of lengths (B,). - olens(Tensor(int64) or List): Batch of lengths (B,). + ilens(Tensor(int64) or List): + Batch of lengths (B,). + olens(Tensor(int64) or List): + Batch of lengths (B,). Returns: Tensor: Mask tensor indicating non-padded part. @@ -322,9 +324,12 @@ class GuidedMultiHeadAttentionLoss(GuidedAttentionLoss): """Calculate forward propagation. Args: - att_ws(Tensor): Batch of multi head attention weights (B, H, T_max_out, T_max_in). - ilens(Tensor): Batch of input lenghts (B,). - olens(Tensor): Batch of output lenghts (B,). + att_ws(Tensor): + Batch of multi head attention weights (B, H, T_max_out, T_max_in). + ilens(Tensor): + Batch of input lengths (B,). + olens(Tensor): + Batch of output lengths (B,). Returns: Tensor: Guided attention loss value. 
@@ -354,9 +359,12 @@ class Tacotron2Loss(nn.Layer): """Initialize Tactoron2 loss module. Args: - use_masking (bool): Whether to apply masking for padded part in loss calculation. - use_weighted_masking (bool): Whether to apply weighted masking in loss calculation. - bce_pos_weight (float): Weight of positive sample of stop token. + use_masking (bool): + Whether to apply masking for padded part in loss calculation. + use_weighted_masking (bool): + Whether to apply weighted masking in loss calculation. + bce_pos_weight (float): + Weight of positive sample of stop token. """ super().__init__() assert (use_masking != use_weighted_masking) or not use_masking @@ -374,17 +382,25 @@ class Tacotron2Loss(nn.Layer): """Calculate forward propagation. Args: - after_outs(Tensor): Batch of outputs after postnets (B, Lmax, odim). - before_outs(Tensor): Batch of outputs before postnets (B, Lmax, odim). - logits(Tensor): Batch of stop logits (B, Lmax). - ys(Tensor): Batch of padded target features (B, Lmax, odim). - stop_labels(Tensor(int64)): Batch of the sequences of stop token labels (B, Lmax). + after_outs(Tensor): + Batch of outputs after postnets (B, Lmax, odim). + before_outs(Tensor): + Batch of outputs before postnets (B, Lmax, odim). + logits(Tensor): + Batch of stop logits (B, Lmax). + ys(Tensor): + Batch of padded target features (B, Lmax, odim). + stop_labels(Tensor(int64)): + Batch of the sequences of stop token labels (B, Lmax). olens(Tensor(int64)): Returns: - Tensor: L1 loss value. - Tensor: Mean square error loss value. - Tensor: Binary cross entropy loss value. + Tensor: + L1 loss value. + Tensor: + Mean square error loss value. + Tensor: + Binary cross entropy loss value. """ # make mask and apply it if self.use_masking: @@ -437,16 +453,24 @@ def stft(x, pad_mode='reflect'): """Perform STFT and convert to magnitude spectrogram. Args: - x(Tensor): Input signal tensor (B, T). - fft_size(int): FFT size. - hop_size(int): Hop size. 
- win_length(int, optional): window : str, optional (Default value = None) - window(str, optional): Name of window function, see `scipy.signal.get_window` for more - details. Defaults to "hann". - center(bool, optional, optional): center (bool, optional): Whether to pad `x` to make that the + x(Tensor): + Input signal tensor (B, T). + fft_size(int): + FFT size. + hop_size(int): + Hop size. + win_length(int, optional): + Window length. + (Default value = None) + window(str, optional): + Name of window function, see `scipy.signal.get_window` for more details. Defaults to "hann". + center(bool, optional): + Whether to pad `x` to make that the :math:`t \times hop\\_length` at the center of :math:`t`-th frame. Default: `True`. - pad_mode(str, optional, optional): (Default value = 'reflect') - hop_length: (Default value = None) + pad_mode(str, optional): + (Default value = 'reflect') + hop_length: + (Default value = None) Returns: Tensor: Magnitude spectrogram (B, #frames, fft_size // 2 + 1). @@ -480,8 +504,10 @@ class SpectralConvergenceLoss(nn.Layer): def forward(self, x_mag, y_mag): """Calculate forward propagation. Args: - x_mag (Tensor): Magnitude spectrogram of predicted signal (B, #frames, #freq_bins). - y_mag (Tensor): Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins). + x_mag (Tensor): + Magnitude spectrogram of predicted signal (B, #frames, #freq_bins). + y_mag (Tensor): + Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins). Returns: Tensor: Spectral convergence loss value. """ @@ -501,8 +527,10 @@ class LogSTFTMagnitudeLoss(nn.Layer): def forward(self, x_mag, y_mag): """Calculate forward propagation. Args: - x_mag (Tensor): Magnitude spectrogram of predicted signal (B, #frames, #freq_bins). - y_mag (Tensor): Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins). + x_mag (Tensor): + Magnitude spectrogram of predicted signal (B, #frames, #freq_bins). 
+ y_mag (Tensor): + Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins). Returns: Tensor: Log STFT magnitude loss value. """ @@ -531,11 +559,15 @@ class STFTLoss(nn.Layer): def forward(self, x, y): """Calculate forward propagation. Args: - x (Tensor): Predicted signal (B, T). - y (Tensor): Groundtruth signal (B, T). + x (Tensor): + Predicted signal (B, T). + y (Tensor): + Groundtruth signal (B, T). Returns: - Tensor: Spectral convergence loss value. - Tensor: Log STFT magnitude loss value. + Tensor: + Spectral convergence loss value. + Tensor: + Log STFT magnitude loss value. """ x_mag = stft(x, self.fft_size, self.shift_size, self.win_length, self.window) @@ -558,10 +590,14 @@ class MultiResolutionSTFTLoss(nn.Layer): window="hann", ): """Initialize Multi resolution STFT loss module. Args: - fft_sizes (list): List of FFT sizes. - hop_sizes (list): List of hop sizes. - win_lengths (list): List of window lengths. - window (str): Window function type. + fft_sizes (list): + List of FFT sizes. + hop_sizes (list): + List of hop sizes. + win_lengths (list): + List of window lengths. + window (str): + Window function type. """ super().__init__() assert len(fft_sizes) == len(hop_sizes) == len(win_lengths) @@ -573,11 +609,15 @@ class MultiResolutionSTFTLoss(nn.Layer): """Calculate forward propagation. Args: - x (Tensor): Predicted signal (B, T) or (B, #subband, T). - y (Tensor): Groundtruth signal (B, T) or (B, #subband, T). + x (Tensor): + Predicted signal (B, T) or (B, #subband, T). + y (Tensor): + Groundtruth signal (B, T) or (B, #subband, T). Returns: - Tensor: Multi resolution spectral convergence loss value. - Tensor: Multi resolution log STFT magnitude loss value. + Tensor: + Multi resolution spectral convergence loss value. + Tensor: + Multi resolution log STFT magnitude loss value. 
""" if len(x.shape) == 3: # (B, C, T) -> (B x C, T) @@ -615,9 +655,11 @@ class GeneratorAdversarialLoss(nn.Layer): def forward(self, outputs): """Calcualate generator adversarial loss. Args: - outputs (Tensor or List): Discriminator outputs or list of discriminator outputs. + outputs (Tensor or List): + Discriminator outputs or list of discriminator outputs. Returns: - Tensor: Generator adversarial loss value. + Tensor: + Generator adversarial loss value. """ if isinstance(outputs, (tuple, list)): adv_loss = 0.0 @@ -659,13 +701,15 @@ class DiscriminatorAdversarialLoss(nn.Layer): """Calcualate discriminator adversarial loss. Args: - outputs_hat (Tensor or list): Discriminator outputs or list of - discriminator outputs calculated from generator outputs. - outputs (Tensor or list): Discriminator outputs or list of - discriminator outputs calculated from groundtruth. + outputs_hat (Tensor or list): + Discriminator outputs or list of discriminator outputs calculated from generator outputs. + outputs (Tensor or list): + Discriminator outputs or list of discriminator outputs calculated from groundtruth. Returns: - Tensor: Discriminator real loss value. - Tensor: Discriminator fake loss value. + Tensor: + Discriminator real loss value. + Tensor: + Discriminator fake loss value. """ if isinstance(outputs, (tuple, list)): real_loss = 0.0 @@ -766,9 +810,12 @@ def masked_l1_loss(prediction, target, mask): """Compute maksed L1 loss. Args: - prediction(Tensor): The prediction. - target(Tensor): The target. The shape should be broadcastable to ``prediction``. - mask(Tensor): The mask. The shape should be broadcatable to the broadcasted shape of + prediction(Tensor): + The prediction. + target(Tensor): + The target. The shape should be broadcastable to ``prediction``. + mask(Tensor): + The mask. The shape should be broadcatable to the broadcasted shape of ``prediction`` and ``target``. 
Returns: @@ -916,8 +963,10 @@ class MelSpectrogramLoss(nn.Layer): def forward(self, y_hat, y): """Calculate Mel-spectrogram loss. Args: - y_hat(Tensor): Generated single tensor (B, 1, T). - y(Tensor): Groundtruth single tensor (B, 1, T). + y_hat(Tensor): + Generated single tensor (B, 1, T). + y(Tensor): + Groundtruth single tensor (B, 1, T). Returns: Tensor: Mel-spectrogram loss value. @@ -947,9 +996,11 @@ class FeatureMatchLoss(nn.Layer): """Calcualate feature matching loss. Args: - feats_hat(list): List of list of discriminator outputs + feats_hat(list): + List of list of discriminator outputs calcuated from generater outputs. - feats(list): List of list of discriminator outputs + feats(list): + List of list of discriminator outputs Returns: Tensor: Feature matching loss value. @@ -986,11 +1037,16 @@ class KLDivergenceLoss(nn.Layer): """Calculate KL divergence loss. Args: - z_p (Tensor): Flow hidden representation (B, H, T_feats). - logs_q (Tensor): Posterior encoder projected scale (B, H, T_feats). - m_p (Tensor): Expanded text encoder projected mean (B, H, T_feats). - logs_p (Tensor): Expanded text encoder projected scale (B, H, T_feats). - z_mask (Tensor): Mask tensor (B, 1, T_feats). + z_p (Tensor): + Flow hidden representation (B, H, T_feats). + logs_q (Tensor): + Posterior encoder projected scale (B, H, T_feats). + m_p (Tensor): + Expanded text encoder projected mean (B, H, T_feats). + logs_p (Tensor): + Expanded text encoder projected scale (B, H, T_feats). + z_mask (Tensor): + Mask tensor (B, 1, T_feats). Returns: Tensor: KL divergence loss. diff --git a/paddlespeech/t2s/modules/nets_utils.py b/paddlespeech/t2s/modules/nets_utils.py index 0238f4dba2a79b41fbb2cce1f7f4eaba5bd2294c..a3d5d1354f114952b784ba16b71c9344ef28c9d8 100644 --- a/paddlespeech/t2s/modules/nets_utils.py +++ b/paddlespeech/t2s/modules/nets_utils.py @@ -25,8 +25,10 @@ def pad_list(xs, pad_value): """Perform padding for the list of tensors. 
Args: - xs (List[Tensor]): List of Tensors [(T_1, `*`), (T_2, `*`), ..., (T_B, `*`)]. - pad_value (float): Value for padding. + xs (List[Tensor]): + List of Tensors [(T_1, `*`), (T_2, `*`), ..., (T_B, `*`)]. + pad_value (float): + Value for padding. Returns: Tensor: Padded tensor (B, Tmax, `*`). @@ -55,10 +57,13 @@ def make_pad_mask(lengths, xs=None, length_dim=-1): """Make mask tensor containing indices of padded part. Args: - lengths (Tensor(int64)): Batch of lengths (B,). - xs (Tensor, optional): The reference tensor. + lengths (Tensor(int64)): + Batch of lengths (B,). + xs (Tensor, optional): + The reference tensor. If set, masks will be the same shape as this tensor. - length_dim (int, optional): Dimension indicator of the above tensor. + length_dim (int, optional): + Dimension indicator of the above tensor. See the example. Returns: @@ -147,7 +152,7 @@ def make_pad_mask(lengths, xs=None, length_dim=-1): seq_range = paddle.arange(0, maxlen, dtype=paddle.int64) seq_range_expand = seq_range.unsqueeze(0).expand([bs, maxlen]) seq_length_expand = lengths.unsqueeze(-1) - mask = seq_range_expand >= seq_length_expand + mask = seq_range_expand >= seq_length_expand.cast(seq_range_expand.dtype) if xs is not None: assert paddle.shape(xs)[0] == bs, (paddle.shape(xs)[0], bs) @@ -166,14 +171,18 @@ def make_non_pad_mask(lengths, xs=None, length_dim=-1): """Make mask tensor containing indices of non-padded part. Args: - lengths (Tensor(int64) or List): Batch of lengths (B,). - xs (Tensor, optional): The reference tensor. + lengths (Tensor(int64) or List): + Batch of lengths (B,). + xs (Tensor, optional): + The reference tensor. If set, masks will be the same shape as this tensor. - length_dim (int, optional): Dimension indicator of the above tensor. + length_dim (int, optional): + Dimension indicator of the above tensor. See the example. Returns: - Tensor(bool): mask tensor containing indices of padded part bool. 
+ Tensor(bool): + mask tensor containing indices of non-padded part. Examples: With only lengths. @@ -257,8 +266,10 @@ def initialize(model: nn.Layer, init: str): Custom initialization routines can be implemented into submodules Args: - model (nn.Layer): Target. - init (str): Method of initialization. + model (nn.Layer): + Target. + init (str): + Method of initialization. """ assert check_argument_types() @@ -285,12 +296,17 @@ def get_random_segments( segment_size: int, ) -> Tuple[paddle.Tensor, paddle.Tensor]: """Get random segments. Args: - x (Tensor): Input tensor (B, C, T). - x_lengths (Tensor): Length tensor (B,). - segment_size (int): Segment size. + x (Tensor): + Input tensor (B, C, T). + x_lengths (Tensor): + Length tensor (B,). + segment_size (int): + Segment size. Returns: - Tensor: Segmented tensor (B, C, segment_size). - Tensor: Start index tensor (B,). + Tensor: + Segmented tensor (B, C, segment_size). + Tensor: + Start index tensor (B,). """ b, c, t = paddle.shape(x) max_start_idx = x_lengths - segment_size @@ -306,9 +322,12 @@ def get_segments( segment_size: int, ) -> paddle.Tensor: """Get segments. Args: - x (Tensor): Input tensor (B, C, T). - start_idxs (Tensor): Start index tensor (B,). - segment_size (int): Segment size. + x (Tensor): + Input tensor (B, C, T). + start_idxs (Tensor): + Start index tensor (B,). + segment_size (int): + Segment size. Returns: Tensor: Segmented tensor (B, C, segment_size). """ @@ -353,14 +372,20 @@ def phones_masking(xs_pad: paddle.Tensor, span_bdy: paddle.Tensor=None): ''' Args: - xs_pad (paddle.Tensor): input speech (B, Tmax, D). - src_mask (paddle.Tensor): mask of speech (B, 1, Tmax). - align_start (paddle.Tensor): frame level phone alignment start (B, Tmax2). - align_end (paddle.Tensor): frame level phone alignment end (B, Tmax2). - align_start_lens (paddle.Tensor): length of align_start (B, ). + xs_pad (paddle.Tensor): + input speech (B, Tmax, D). + src_mask (paddle.Tensor): + mask of speech (B, 1, Tmax). 
+ align_start (paddle.Tensor): + frame level phone alignment start (B, Tmax2). + align_end (paddle.Tensor): + frame level phone alignment end (B, Tmax2). + align_start_lens (paddle.Tensor): + length of align_start (B, ). mlm_prob (float): mean_phn_span (int): - span_bdy (paddle.Tensor): masked mel boundary of input speech (B, 2). + span_bdy (paddle.Tensor): + masked mel boundary of input speech (B, 2). Returns: paddle.Tensor[bool]: masked position of input speech (B, Tmax). ''' @@ -416,19 +441,29 @@ def phones_text_masking(xs_pad: paddle.Tensor, span_bdy: paddle.Tensor=None): ''' Args: - xs_pad (paddle.Tensor): input speech (B, Tmax, D). - src_mask (paddle.Tensor): mask of speech (B, 1, Tmax). - text_pad (paddle.Tensor): input text (B, Tmax2). - text_mask (paddle.Tensor): mask of text (B, 1, Tmax2). - align_start (paddle.Tensor): frame level phone alignment start (B, Tmax2). - align_end (paddle.Tensor): frame level phone alignment end (B, Tmax2). - align_start_lens (paddle.Tensor): length of align_start (B, ). + xs_pad (paddle.Tensor): + input speech (B, Tmax, D). + src_mask (paddle.Tensor): + mask of speech (B, 1, Tmax). + text_pad (paddle.Tensor): + input text (B, Tmax2). + text_mask (paddle.Tensor): + mask of text (B, 1, Tmax2). + align_start (paddle.Tensor): + frame level phone alignment start (B, Tmax2). + align_end (paddle.Tensor): + frame level phone alignment end (B, Tmax2). + align_start_lens (paddle.Tensor): + length of align_start (B, ). mlm_prob (float): mean_phn_span (int): - span_bdy (paddle.Tensor): masked mel boundary of input speech (B, 2). + span_bdy (paddle.Tensor): + masked mel boundary of input speech (B, 2). Returns: - paddle.Tensor[bool]: masked position of input speech (B, Tmax). - paddle.Tensor[bool]: masked position of input text (B, Tmax2). + paddle.Tensor[bool]: + masked position of input speech (B, Tmax). + paddle.Tensor[bool]: + masked position of input text (B, Tmax2). 
''' bz, sent_len, _ = paddle.shape(xs_pad) masked_pos = paddle.zeros((bz, sent_len)) @@ -488,12 +523,18 @@ def get_seg_pos(speech_pad: paddle.Tensor, seg_emb: bool=False): ''' Args: - speech_pad (paddle.Tensor): input speech (B, Tmax, D). - text_pad (paddle.Tensor): input text (B, Tmax2). - align_start (paddle.Tensor): frame level phone alignment start (B, Tmax2). - align_end (paddle.Tensor): frame level phone alignment end (B, Tmax2). - align_start_lens (paddle.Tensor): length of align_start (B, ). - seg_emb (bool): whether to use segment embedding. + speech_pad (paddle.Tensor): + input speech (B, Tmax, D). + text_pad (paddle.Tensor): + input text (B, Tmax2). + align_start (paddle.Tensor): + frame level phone alignment start (B, Tmax2). + align_end (paddle.Tensor): + frame level phone alignment end (B, Tmax2). + align_start_lens (paddle.Tensor): + length of align_start (B, ). + seg_emb (bool): + whether to use segment embedding. Returns: paddle.Tensor[int]: n-th phone of each mel, 0<=n<=Tmax2 (B, Tmax). eg: @@ -579,8 +620,10 @@ def random_spans_noise_mask(length: int, def _random_seg(num_items, num_segs): """Partition a sequence of items randomly into non-empty segments. Args: - num_items: an integer scalar > 0 - num_segs: an integer scalar in [1, num_items] + num_items: + an integer scalar > 0 + num_segs: + an integer scalar in [1, num_items] Returns: a Tensor with shape [num_segs] containing positive integers that add up to num_items diff --git a/paddlespeech/t2s/modules/pqmf.py b/paddlespeech/t2s/modules/pqmf.py index 9860da906094ad930a7791ca527b44cc2a3e51d1..7b42409d8250ed3a2c7f6815f03193f081986243 100644 --- a/paddlespeech/t2s/modules/pqmf.py +++ b/paddlespeech/t2s/modules/pqmf.py @@ -26,9 +26,12 @@ def design_prototype_filter(taps=62, cutoff_ratio=0.142, beta=9.0): filters of cosine modulated filterbanks`_. Args: - taps (int): The number of filter taps. - cutoff_ratio (float): Cut-off frequency ratio. - beta (float): Beta coefficient for kaiser window. 
+ taps (int): + The number of filter taps. + cutoff_ratio (float): + Cut-off frequency ratio. + beta (float): + Beta coefficient for kaiser window. Returns: ndarray: Impluse response of prototype filter (taps + 1,). @@ -66,10 +69,14 @@ class PQMF(nn.Layer): See dicussion in https://github.com/kan-bayashi/ParallelWaveGAN/issues/195. Args: - subbands (int): The number of subbands. - taps (int): The number of filter taps. - cutoff_ratio (float): Cut-off frequency ratio. - beta (float): Beta coefficient for kaiser window. + subbands (int): + The number of subbands. + taps (int): + The number of filter taps. + cutoff_ratio (float): + Cut-off frequency ratio. + beta (float): + Beta coefficient for kaiser window. """ super().__init__() @@ -103,7 +110,8 @@ class PQMF(nn.Layer): def analysis(self, x): """Analysis with PQMF. Args: - x (Tensor): Input tensor (B, 1, T). + x (Tensor): + Input tensor (B, 1, T). Returns: Tensor: Output tensor (B, subbands, T // subbands). """ @@ -113,7 +121,8 @@ class PQMF(nn.Layer): def synthesis(self, x): """Synthesis with PQMF. Args: - x (Tensor): Input tensor (B, subbands, T // subbands). + x (Tensor): + Input tensor (B, subbands, T // subbands). Returns: Tensor: Output tensor (B, 1, T). """ diff --git a/paddlespeech/t2s/modules/predictor/duration_predictor.py b/paddlespeech/t2s/modules/predictor/duration_predictor.py index 33ed575b4245506438e439fff5d5b8a6ff1b238a..cb38fd5b4e37b7fa139fef335ce62130a280bf6c 100644 --- a/paddlespeech/t2s/modules/predictor/duration_predictor.py +++ b/paddlespeech/t2s/modules/predictor/duration_predictor.py @@ -50,12 +50,18 @@ class DurationPredictor(nn.Layer): """Initilize duration predictor module. Args: - idim (int):Input dimension. - n_layers (int, optional): Number of convolutional layers. - n_chans (int, optional): Number of channels of convolutional layers. - kernel_size (int, optional): Kernel size of convolutional layers. - dropout_rate (float, optional): Dropout rate. 
- offset (float, optional): Offset value to avoid nan in log domain. + idim (int): + Input dimension. + n_layers (int, optional): + Number of convolutional layers. + n_chans (int, optional): + Number of channels of convolutional layers. + kernel_size (int, optional): + Kernel size of convolutional layers. + dropout_rate (float, optional): + Dropout rate. + offset (float, optional): + Offset value to avoid nan in log domain. """ super().__init__() @@ -99,8 +105,10 @@ class DurationPredictor(nn.Layer): def forward(self, xs, x_masks=None): """Calculate forward propagation. Args: - xs(Tensor): Batch of input sequences (B, Tmax, idim). - x_masks(ByteTensor, optional, optional): Batch of masks indicating padded part (B, Tmax). (Default value = None) + xs(Tensor): + Batch of input sequences (B, Tmax, idim). + x_masks(ByteTensor, optional, optional): + Batch of masks indicating padded part (B, Tmax). (Default value = None) Returns: Tensor: Batch of predicted durations in log domain (B, Tmax). @@ -110,8 +118,10 @@ class DurationPredictor(nn.Layer): def inference(self, xs, x_masks=None): """Inference duration. Args: - xs(Tensor): Batch of input sequences (B, Tmax, idim). - x_masks(Tensor(bool), optional, optional): Batch of masks indicating padded part (B, Tmax). (Default value = None) + xs(Tensor): + Batch of input sequences (B, Tmax, idim). + x_masks(Tensor(bool), optional, optional): + Batch of masks indicating padded part (B, Tmax). (Default value = None) Returns: Tensor: Batch of predicted durations in linear domain int64 (B, Tmax). @@ -140,8 +150,10 @@ class DurationPredictorLoss(nn.Layer): """Calculate forward propagation. Args: - outputs(Tensor): Batch of prediction durations in log domain (B, T) - targets(Tensor): Batch of groundtruth durations in linear domain (B, T) + outputs(Tensor): + Batch of prediction durations in log domain (B, T) + targets(Tensor): + Batch of groundtruth durations in linear domain (B, T) Returns: Tensor: Mean squared error loss value. 
diff --git a/paddlespeech/t2s/modules/predictor/length_regulator.py b/paddlespeech/t2s/modules/predictor/length_regulator.py index e4fbf54916ed98948fffe8bf8325a312928efa57..bdfa18391c6bf6a9ed83ac9bb79c6567b3947dae 100644 --- a/paddlespeech/t2s/modules/predictor/length_regulator.py +++ b/paddlespeech/t2s/modules/predictor/length_regulator.py @@ -36,7 +36,8 @@ class LengthRegulator(nn.Layer): """Initilize length regulator module. Args: - pad_value (float, optional): Value used for padding. + pad_value (float, optional): + Value used for padding. """ super().__init__() @@ -97,9 +98,12 @@ class LengthRegulator(nn.Layer): """Calculate forward propagation. Args: - xs (Tensor): Batch of sequences of char or phoneme embeddings (B, Tmax, D). - ds (Tensor(int64)): Batch of durations of each frame (B, T). - alpha (float, optional): Alpha value to control speed of speech. + xs (Tensor): + Batch of sequences of char or phoneme embeddings (B, Tmax, D). + ds (Tensor(int64)): + Batch of durations of each frame (B, T). + alpha (float, optional): + Alpha value to control speed of speech. Returns: Tensor: replicated input tensor based on durations (B, T*, D). diff --git a/paddlespeech/t2s/modules/predictor/variance_predictor.py b/paddlespeech/t2s/modules/predictor/variance_predictor.py index 8afbf2576d158c9df7a56800f7fdea386bb0ae2b..4c2a67cc4ecd60e77210677e87042bf6d3a554c8 100644 --- a/paddlespeech/t2s/modules/predictor/variance_predictor.py +++ b/paddlespeech/t2s/modules/predictor/variance_predictor.py @@ -43,11 +43,16 @@ class VariancePredictor(nn.Layer): """Initilize duration predictor module. Args: - idim (int): Input dimension. - n_layers (int, optional): Number of convolutional layers. - n_chans (int, optional): Number of channels of convolutional layers. - kernel_size (int, optional): Kernel size of convolutional layers. - dropout_rate (float, optional): Dropout rate. + idim (int): + Input dimension. + n_layers (int, optional): + Number of convolutional layers. 
+ n_chans (int, optional): + Number of channels of convolutional layers. + kernel_size (int, optional): + Kernel size of convolutional layers. + dropout_rate (float, optional): + Dropout rate. """ assert check_argument_types() super().__init__() @@ -74,11 +79,14 @@ class VariancePredictor(nn.Layer): """Calculate forward propagation. Args: - xs (Tensor): Batch of input sequences (B, Tmax, idim). - x_masks (Tensor(bool), optional): Batch of masks indicating padded part (B, Tmax, 1). + xs (Tensor): + Batch of input sequences (B, Tmax, idim). + x_masks (Tensor(bool), optional): + Batch of masks indicating padded part (B, Tmax, 1). Returns: - Tensor: Batch of predicted sequences (B, Tmax, 1). + Tensor: + Batch of predicted sequences (B, Tmax, 1). """ # (B, idim, Tmax) xs = xs.transpose([0, 2, 1]) diff --git a/paddlespeech/t2s/modules/residual_block.py b/paddlespeech/t2s/modules/residual_block.py index 5965a72032720b69ca494fe9ee42a8c7bae17c63..f21eedecb5f3a546ea50942c02394d6fc9e21a0d 100644 --- a/paddlespeech/t2s/modules/residual_block.py +++ b/paddlespeech/t2s/modules/residual_block.py @@ -29,15 +29,24 @@ class WaveNetResidualBlock(nn.Layer): refer to `WaveNet: A Generative Model for Raw Audio `_. Args: - kernel_size (int, optional): Kernel size of the 1D convolution, by default 3 - residual_channels (int, optional): Feature size of the residual output(and also the input), by default 64 - gate_channels (int, optional): Output feature size of the 1D convolution, by default 128 - skip_channels (int, optional): Feature size of the skip output, by default 64 - aux_channels (int, optional): Feature size of the auxiliary input (e.g. spectrogram), by default 80 - dropout (float, optional): Probability of the dropout before the 1D convolution, by default 0. 
- dilation (int, optional): Dilation of the 1D convolution, by default 1 - bias (bool, optional): Whether to use bias in the 1D convolution, by default True - use_causal_conv (bool, optional): Whether to use causal padding for the 1D convolution, by default False + kernel_size (int, optional): + Kernel size of the 1D convolution, by default 3 + residual_channels (int, optional): + Feature size of the residual output(and also the input), by default 64 + gate_channels (int, optional): + Output feature size of the 1D convolution, by default 128 + skip_channels (int, optional): + Feature size of the skip output, by default 64 + aux_channels (int, optional): + Feature size of the auxiliary input (e.g. spectrogram), by default 80 + dropout (float, optional): + Probability of the dropout before the 1D convolution, by default 0. + dilation (int, optional): + Dilation of the 1D convolution, by default 1 + bias (bool, optional): + Whether to use bias in the 1D convolution, by default True + use_causal_conv (bool, optional): + Whether to use causal padding for the 1D convolution, by default False """ def __init__(self, @@ -81,13 +90,17 @@ class WaveNetResidualBlock(nn.Layer): def forward(self, x, c): """ Args: - x (Tensor): the input features. Shape (N, C_res, T) - c (Tensor): the auxiliary input. Shape (N, C_aux, T) + x (Tensor): + the input features. Shape (N, C_res, T) + c (Tensor): + the auxiliary input. Shape (N, C_aux, T) Returns: - res (Tensor): Shape (N, C_res, T), the residual output, which is used as the + res (Tensor): + Shape (N, C_res, T), the residual output, which is used as the input of the next ResidualBlock in a stack of ResidualBlocks. - skip (Tensor): Shape (N, C_skip, T), the skip output, which is collected among + skip (Tensor): + Shape (N, C_skip, T), the skip output, which is collected among each layer in a stack of ResidualBlocks. """ x_input = x @@ -121,13 +134,20 @@ class HiFiGANResidualBlock(nn.Layer): ): """Initialize HiFiGANResidualBlock module. 
Args: - kernel_size (int): Kernel size of dilation convolution layer. - channels (int): Number of channels for convolution layer. - dilations (List[int]): List of dilation factors. - use_additional_convs (bool): Whether to use additional convolution layers. - bias (bool): Whether to add bias parameter in convolution layers. - nonlinear_activation (str): Activation function module name. - nonlinear_activation_params (dict): Hyperparameters for activation function. + kernel_size (int): + Kernel size of dilation convolution layer. + channels (int): + Number of channels for convolution layer. + dilations (List[int]): + List of dilation factors. + use_additional_convs (bool): + Whether to use additional convolution layers. + bias (bool): + Whether to add bias parameter in convolution layers. + nonlinear_activation (str): + Activation function module name. + nonlinear_activation_params (dict): + Hyperparameters for activation function. """ super().__init__() @@ -167,7 +187,8 @@ class HiFiGANResidualBlock(nn.Layer): def forward(self, x): """Calculate forward propagation. Args: - x (Tensor): Input tensor (B, channels, T). + x (Tensor): + Input tensor (B, channels, T). Returns: Tensor: Output tensor (B, channels, T). """ diff --git a/paddlespeech/t2s/modules/residual_stack.py b/paddlespeech/t2s/modules/residual_stack.py index 0d949b5635329819a613a748e34015964d2fed5c..98f5db3cf6820f3571d97a35b7b6eb8da6b1bc5f 100644 --- a/paddlespeech/t2s/modules/residual_stack.py +++ b/paddlespeech/t2s/modules/residual_stack.py @@ -39,15 +39,24 @@ class ResidualStack(nn.Layer): """Initialize ResidualStack module. Args: - kernel_size (int): Kernel size of dilation convolution layer. - channels (int): Number of channels of convolution layers. - dilation (int): Dilation factor. - bias (bool): Whether to add bias parameter in convolution layers. - nonlinear_activation (str): Activation function module name. - nonlinear_activation_params (Dict[str,Any]): Hyperparameters for activation function. 
- pad (str): Padding function module name before dilated convolution layer. - pad_params (Dict[str, Any]): Hyperparameters for padding function. - use_causal_conv (bool): Whether to use causal convolution. + kernel_size (int): + Kernel size of dilation convolution layer. + channels (int): + Number of channels of convolution layers. + dilation (int): + Dilation factor. + bias (bool): + Whether to add bias parameter in convolution layers. + nonlinear_activation (str): + Activation function module name. + nonlinear_activation_params (Dict[str,Any]): + Hyperparameters for activation function. + pad (str): + Padding function module name before dilated convolution layer. + pad_params (Dict[str, Any]): + Hyperparameters for padding function. + use_causal_conv (bool): + Whether to use causal convolution. """ super().__init__() # for compatibility @@ -95,7 +104,8 @@ class ResidualStack(nn.Layer): """Calculate forward propagation. Args: - c (Tensor): Input tensor (B, channels, T). + c (Tensor): + Input tensor (B, channels, T). Returns: Tensor: Output tensor (B, chennels, T). """ diff --git a/paddlespeech/t2s/modules/style_encoder.py b/paddlespeech/t2s/modules/style_encoder.py index 49091eac8215898d1428b937a353adb037f774c6..b558e7693aeee59c6351f78c1a47fd9ad4934249 100644 --- a/paddlespeech/t2s/modules/style_encoder.py +++ b/paddlespeech/t2s/modules/style_encoder.py @@ -32,16 +32,26 @@ class StyleEncoder(nn.Layer): Speech Synthesis`: https://arxiv.org/abs/1803.09017 Args: - idim (int, optional): Dimension of the input mel-spectrogram. - gst_tokens (int, optional): The number of GST embeddings. - gst_token_dim (int, optional): Dimension of each GST embedding. - gst_heads (int, optional): The number of heads in GST multihead attention. - conv_layers (int, optional): The number of conv layers in the reference encoder. - conv_chans_list (Sequence[int], optional): List of the number of channels of conv layers in the referece encoder. 
- conv_kernel_size (int, optional): Kernal size of conv layers in the reference encoder. - conv_stride (int, optional): Stride size of conv layers in the reference encoder. - gru_layers (int, optional): The number of GRU layers in the reference encoder. - gru_units (int, optional):The number of GRU units in the reference encoder. + idim (int, optional): + Dimension of the input mel-spectrogram. + gst_tokens (int, optional): + The number of GST embeddings. + gst_token_dim (int, optional): + Dimension of each GST embedding. + gst_heads (int, optional): + The number of heads in GST multihead attention. + conv_layers (int, optional): + The number of conv layers in the reference encoder. + conv_chans_list (Sequence[int], optional): + List of the number of channels of conv layers in the reference encoder. + conv_kernel_size (int, optional): + Kernel size of conv layers in the reference encoder. + conv_stride (int, optional): + Stride size of conv layers in the reference encoder. + gru_layers (int, optional): + The number of GRU layers in the reference encoder. + gru_units (int, optional): + The number of GRU units in the reference encoder. Todo: * Support manual weight specification in inference. @@ -82,7 +92,8 @@ class StyleEncoder(nn.Layer): """Calculate forward propagation. Args: - speech (Tensor): Batch of padded target features (B, Lmax, odim). + speech (Tensor): + Batch of padded target features (B, Lmax, odim). Returns: Tensor: Style token embeddings (B, token_dim). @@ -104,13 +115,20 @@ class ReferenceEncoder(nn.Layer): Speech Synthesis`: https://arxiv.org/abs/1803.09017 Args: - idim (int, optional): Dimension of the input mel-spectrogram. - conv_layers (int, optional): The number of conv layers in the reference encoder. - conv_chans_list: (Sequence[int], optional): List of the number of channels of conv layers in the referece encoder. - conv_kernel_size (int, optional): Kernal size of conv layers in the reference encoder. 
- conv_stride (int, optional): Stride size of conv layers in the reference encoder. - gru_layers (int, optional): The number of GRU layers in the reference encoder. - gru_units (int, optional): The number of GRU units in the reference encoder. + idim (int, optional): + Dimension of the input mel-spectrogram. + conv_layers (int, optional): + The number of conv layers in the reference encoder. + conv_chans_list: (Sequence[int], optional): + List of the number of channels of conv layers in the referece encoder. + conv_kernel_size (int, optional): + Kernal size of conv layers in the reference encoder. + conv_stride (int, optional): + Stride size of conv layers in the reference encoder. + gru_layers (int, optional): + The number of GRU layers in the reference encoder. + gru_units (int, optional): + The number of GRU units in the reference encoder. """ @@ -168,7 +186,8 @@ class ReferenceEncoder(nn.Layer): def forward(self, speech: paddle.Tensor) -> paddle.Tensor: """Calculate forward propagation. Args: - speech (Tensor): Batch of padded target features (B, Lmax, idim). + speech (Tensor): + Batch of padded target features (B, Lmax, idim). Returns: Tensor: Reference embedding (B, gru_units) @@ -200,11 +219,16 @@ class StyleTokenLayer(nn.Layer): .. _`Style Tokens: Unsupervised Style Modeling, Control and Transfer in End-to-End Speech Synthesis`: https://arxiv.org/abs/1803.09017 Args: - ref_embed_dim (int, optional): Dimension of the input reference embedding. - gst_tokens (int, optional): The number of GST embeddings. - gst_token_dim (int, optional): Dimension of each GST embedding. - gst_heads (int, optional): The number of heads in GST multihead attention. - dropout_rate (float, optional): Dropout rate in multi-head attention. + ref_embed_dim (int, optional): + Dimension of the input reference embedding. + gst_tokens (int, optional): + The number of GST embeddings. + gst_token_dim (int, optional): + Dimension of each GST embedding. 
+ gst_heads (int, optional): + The number of heads in GST multihead attention. + dropout_rate (float, optional): + Dropout rate in multi-head attention. """ @@ -236,7 +260,8 @@ class StyleTokenLayer(nn.Layer): """Calculate forward propagation. Args: - ref_embs (Tensor): Reference embeddings (B, ref_embed_dim). + ref_embs (Tensor): + Reference embeddings (B, ref_embed_dim). Returns: Tensor: Style token embeddings (B, gst_token_dim). diff --git a/paddlespeech/t2s/modules/tacotron2/attentions.py b/paddlespeech/t2s/modules/tacotron2/attentions.py index a6fde742d98f90d4db06f734e5f7f4508848d989..cdaef4608a09a283054d8222d44f69ffe2d7c048 100644 --- a/paddlespeech/t2s/modules/tacotron2/attentions.py +++ b/paddlespeech/t2s/modules/tacotron2/attentions.py @@ -31,10 +31,14 @@ def _apply_attention_constraint(e, Text-to-Speech with Convolutional Sequence Learning`_. Args: - e(Tensor): Attention energy before applying softmax (1, T). - last_attended_idx(int): The index of the inputs of the last attended [0, T]. - backward_window(int, optional, optional): Backward window size in attention constraint. (Default value = 1) - forward_window(int, optional, optional): Forward window size in attetion constraint. (Default value = 3) + e(Tensor): + Attention energy before applying softmax (1, T). + last_attended_idx(int): + The index of the inputs of the last attended [0, T]. + backward_window(int, optional, optional): + Backward window size in attention constraint. (Default value = 1) + forward_window(int, optional, optional): + Forward window size in attetion constraint. (Default value = 3) Returns: Tensor: Monotonic constrained attention energy (1, T). 
@@ -62,12 +66,18 @@ class AttLoc(nn.Layer): (https://arxiv.org/pdf/1506.07503.pdf) Args: - eprojs (int): projection-units of encoder - dunits (int): units of decoder - att_dim (int): attention dimension - aconv_chans (int): channels of attention convolution - aconv_filts (int): filter size of attention convolution - han_mode (bool): flag to swith on mode of hierarchical attention and not store pre_compute_enc_h + eprojs (int): + projection-units of encoder + dunits (int): + units of decoder + att_dim (int): + attention dimension + aconv_chans (int): + channels of attention convolution + aconv_filts (int): + filter size of attention convolution + han_mode (bool): + flag to swith on mode of hierarchical attention and not store pre_compute_enc_h """ def __init__(self, @@ -117,18 +127,29 @@ class AttLoc(nn.Layer): forward_window=3, ): """Calculate AttLoc forward propagation. Args: - enc_hs_pad(Tensor): padded encoder hidden state (B, T_max, D_enc) - enc_hs_len(Tensor): padded encoder hidden state length (B) - dec_z(Tensor dec_z): decoder hidden state (B, D_dec) - att_prev(Tensor): previous attention weight (B, T_max) - scaling(float, optional): scaling parameter before applying softmax (Default value = 2.0) - forward_window(Tensor, optional): forward window size when constraining attention (Default value = 3) - last_attended_idx(int, optional): index of the inputs of the last attended (Default value = None) - backward_window(int, optional): backward window size in attention constraint (Default value = 1) - forward_window(int, optional): forward window size in attetion constraint (Default value = 3) + enc_hs_pad(Tensor): + padded encoder hidden state (B, T_max, D_enc) + enc_hs_len(Tensor): + padded encoder hidden state length (B) + dec_z(Tensor dec_z): + decoder hidden state (B, D_dec) + att_prev(Tensor): + previous attention weight (B, T_max) + scaling(float, optional): + scaling parameter before applying softmax (Default value = 2.0) + forward_window(Tensor, 
optional): + forward window size when constraining attention (Default value = 3) + last_attended_idx(int, optional): + index of the inputs of the last attended (Default value = None) + backward_window(int, optional): + backward window size in attention constraint (Default value = 1) + forward_window(int, optional): + forward window size in attetion constraint (Default value = 3) Returns: - Tensor: attention weighted encoder state (B, D_enc) - Tensor: previous attention weights (B, T_max) + Tensor: + attention weighted encoder state (B, D_enc) + Tensor: + previous attention weights (B, T_max) """ batch = paddle.shape(enc_hs_pad)[0] # pre-compute all h outside the decoder loop @@ -192,11 +213,16 @@ class AttForward(nn.Layer): (https://arxiv.org/pdf/1807.06736.pdf) Args: - eprojs (int): projection-units of encoder - dunits (int): units of decoder - att_dim (int): attention dimension - aconv_chans (int): channels of attention convolution - aconv_filts (int): filter size of attention convolution + eprojs (int): + projection-units of encoder + dunits (int): + units of decoder + att_dim (int): + attention dimension + aconv_chans (int): + channels of attention convolution + aconv_filts (int): + filter size of attention convolution """ def __init__(self, eprojs, dunits, att_dim, aconv_chans, aconv_filts): @@ -239,18 +265,28 @@ class AttForward(nn.Layer): """Calculate AttForward forward propagation. 
Args: - enc_hs_pad(Tensor): padded encoder hidden state (B, T_max, D_enc) - enc_hs_len(list): padded encoder hidden state length (B,) - dec_z(Tensor): decoder hidden state (B, D_dec) - att_prev(Tensor): attention weights of previous step (B, T_max) - scaling(float, optional): scaling parameter before applying softmax (Default value = 1.0) - last_attended_idx(int, optional): index of the inputs of the last attended (Default value = None) - backward_window(int, optional): backward window size in attention constraint (Default value = 1) - forward_window(int, optional): (Default value = 3) + enc_hs_pad(Tensor): + padded encoder hidden state (B, T_max, D_enc) + enc_hs_len(list): + padded encoder hidden state length (B,) + dec_z(Tensor): + decoder hidden state (B, D_dec) + att_prev(Tensor): + attention weights of previous step (B, T_max) + scaling(float, optional): + scaling parameter before applying softmax (Default value = 1.0) + last_attended_idx(int, optional): + index of the inputs of the last attended (Default value = None) + backward_window(int, optional): + backward window size in attention constraint (Default value = 1) + forward_window(int, optional): + (Default value = 3) Returns: - Tensor: attention weighted encoder state (B, D_enc) - Tensor: previous attention weights (B, T_max) + Tensor: + attention weighted encoder state (B, D_enc) + Tensor: + previous attention weights (B, T_max) """ batch = len(enc_hs_pad) # pre-compute all h outside the decoder loop @@ -321,12 +357,18 @@ class AttForwardTA(nn.Layer): (https://arxiv.org/pdf/1807.06736.pdf) Args: - eunits (int): units of encoder - dunits (int): units of decoder - att_dim (int): attention dimension - aconv_chans (int): channels of attention convolution - aconv_filts (int): filter size of attention convolution - odim (int): output dimension + eunits (int): + units of encoder + dunits (int): + units of decoder + att_dim (int): + attention dimension + aconv_chans (int): + channels of attention convolution + 
aconv_filts (int): + filter size of attention convolution + odim (int): + output dimension """ def __init__(self, eunits, dunits, att_dim, aconv_chans, aconv_filts, odim): @@ -372,19 +414,30 @@ class AttForwardTA(nn.Layer): """Calculate AttForwardTA forward propagation. Args: - enc_hs_pad(Tensor): padded encoder hidden state (B, Tmax, eunits) - enc_hs_len(list Tensor): padded encoder hidden state length (B,) - dec_z(Tensor): decoder hidden state (B, dunits) - att_prev(Tensor): attention weights of previous step (B, T_max) - out_prev(Tensor): decoder outputs of previous step (B, odim) - scaling(float, optional): scaling parameter before applying softmax (Default value = 1.0) - last_attended_idx(int, optional): index of the inputs of the last attended (Default value = None) - backward_window(int, optional): backward window size in attention constraint (Default value = 1) - forward_window(int, optional): (Default value = 3) + enc_hs_pad(Tensor): + padded encoder hidden state (B, Tmax, eunits) + enc_hs_len(list Tensor): + padded encoder hidden state length (B,) + dec_z(Tensor): + decoder hidden state (B, dunits) + att_prev(Tensor): + attention weights of previous step (B, T_max) + out_prev(Tensor): + decoder outputs of previous step (B, odim) + scaling(float, optional): + scaling parameter before applying softmax (Default value = 1.0) + last_attended_idx(int, optional): + index of the inputs of the last attended (Default value = None) + backward_window(int, optional): + backward window size in attention constraint (Default value = 1) + forward_window(int, optional): + (Default value = 3) Returns: - Tensor: attention weighted encoder state (B, dunits) - Tensor: previous attention weights (B, Tmax) + Tensor: + attention weighted encoder state (B, dunits) + Tensor: + previous attention weights (B, Tmax) """ batch = len(enc_hs_pad) # pre-compute all h outside the decoder loop diff --git a/paddlespeech/t2s/modules/tacotron2/decoder.py 
b/paddlespeech/t2s/modules/tacotron2/decoder.py index ebdfa387989828eb4c92df8a1d6bbf215a50b775..41c94b63f3fadbcff0397144e0a395058738f69d 100644 --- a/paddlespeech/t2s/modules/tacotron2/decoder.py +++ b/paddlespeech/t2s/modules/tacotron2/decoder.py @@ -45,10 +45,14 @@ class Prenet(nn.Layer): """Initialize prenet module. Args: - idim (int): Dimension of the inputs. - odim (int): Dimension of the outputs. - n_layers (int, optional): The number of prenet layers. - n_units (int, optional): The number of prenet units. + idim (int): + Dimension of the inputs. + odim (int): + Dimension of the outputs. + n_layers (int, optional): + The number of prenet layers. + n_units (int, optional): + The number of prenet units. """ super().__init__() self.dropout_rate = dropout_rate @@ -62,7 +66,8 @@ class Prenet(nn.Layer): """Calculate forward propagation. Args: - x (Tensor): Batch of input tensors (B, ..., idim). + x (Tensor): + Batch of input tensors (B, ..., idim). Returns: Tensor: Batch of output tensors (B, ..., odim). @@ -212,7 +217,8 @@ class ZoneOutCell(nn.Layer): """Calculate forward propagation. Args: - inputs (Tensor): Batch of input tensor (B, input_size). + inputs (Tensor): + Batch of input tensor (B, input_size). hidden (tuple): - Tensor: Batch of initial hidden states (B, hidden_size). - Tensor: Batch of initial cell states (B, hidden_size). @@ -277,26 +283,39 @@ class Decoder(nn.Layer): """Initialize Tacotron2 decoder module. Args: - idim (int): Dimension of the inputs. - odim (int): Dimension of the outputs. - att (nn.Layer): Instance of attention class. - dlayers (int, optional): The number of decoder lstm layers. - dunits (int, optional): The number of decoder lstm units. - prenet_layers (int, optional): The number of prenet layers. - prenet_units (int, optional): The number of prenet units. - postnet_layers (int, optional): The number of postnet layers. - postnet_filts (int, optional): The number of postnet filter size. 
- postnet_chans (int, optional): The number of postnet filter channels. - output_activation_fn (nn.Layer, optional): Activation function for outputs. - cumulate_att_w (bool, optional): Whether to cumulate previous attention weight. - use_batch_norm (bool, optional): Whether to use batch normalization. - use_concate : bool, optional + idim (int): + Dimension of the inputs. + odim (int): + Dimension of the outputs. + att (nn.Layer): + Instance of attention class. + dlayers (int, optional): + The number of decoder lstm layers. + dunits (int, optional): + The number of decoder lstm units. + prenet_layers (int, optional): + The number of prenet layers. + prenet_units (int, optional): + The number of prenet units. + postnet_layers (int, optional): + The number of postnet layers. + postnet_filts (int, optional): + The number of postnet filter size. + postnet_chans (int, optional): + The number of postnet filter channels. + output_activation_fn (nn.Layer, optional): + Activation function for outputs. + cumulate_att_w (bool, optional): + Whether to cumulate previous attention weight. + use_batch_norm (bool, optional): + Whether to use batch normalization. + use_concate (bool, optional): Whether to concatenate encoder embedding with decoder lstm outputs. - dropout_rate : float, optional + dropout_rate (float, optional): Dropout rate. - zoneout_rate : float, optional + zoneout_rate (float, optional): Zoneout rate. - reduction_factor : int, optional + reduction_factor (int, optional): Reduction factor. """ super().__init__() @@ -363,15 +382,22 @@ class Decoder(nn.Layer): """Calculate forward propagation. Args: - hs (Tensor): Batch of the sequences of padded hidden states (B, Tmax, idim). - hlens (Tensor(int64) padded): Batch of lengths of each input batch (B,). - ys (Tensor): Batch of the sequences of padded target features (B, Lmax, odim). + hs (Tensor): + Batch of the sequences of padded hidden states (B, Tmax, idim). 
+ hlens (Tensor(int64) padded): + Batch of lengths of each input batch (B,). + ys (Tensor): + Batch of the sequences of padded target features (B, Lmax, odim). Returns: - Tensor: Batch of output tensors after postnet (B, Lmax, odim). - Tensor: Batch of output tensors before postnet (B, Lmax, odim). - Tensor: Batch of logits of stop prediction (B, Lmax). - Tensor: Batch of attention weights (B, Lmax, Tmax). + Tensor: + Batch of output tensors after postnet (B, Lmax, odim). + Tensor: + Batch of output tensors before postnet (B, Lmax, odim). + Tensor: + Batch of logits of stop prediction (B, Lmax). + Tensor: + Batch of attention weights (B, Lmax, Tmax). Note: This computation is performed in teacher-forcing manner. @@ -471,20 +497,30 @@ class Decoder(nn.Layer): forward_window=None, ): """Generate the sequence of features given the sequences of characters. Args: - h(Tensor): Input sequence of encoder hidden states (T, C). - threshold(float, optional, optional): Threshold to stop generation. (Default value = 0.5) - minlenratio(float, optional, optional): Minimum length ratio. If set to 1.0 and the length of input is 10, + h(Tensor): + Input sequence of encoder hidden states (T, C). + threshold(float, optional): + Threshold to stop generation. (Default value = 0.5) + minlenratio(float, optional): + Minimum length ratio. If set to 1.0 and the length of input is 10, the minimum length of outputs will be 10 * 1 = 10. (Default value = 0.0) - maxlenratio(float, optional, optional): Minimum length ratio. If set to 10 and the length of input is 10, + maxlenratio(float, optional): + Maximum length ratio. If set to 10 and the length of input is 10, the maximum length of outputs will be 10 * 10 = 100. (Default value = 0.0) - use_att_constraint(bool, optional): Whether to apply attention constraint introduced in `Deep Voice 3`_. (Default value = False) - backward_window(int, optional): Backward window size in attention constraint. 
(Default value = None) - forward_window(int, optional): (Default value = None) + use_att_constraint(bool, optional): + Whether to apply attention constraint introduced in `Deep Voice 3`_. (Default value = False) + backward_window(int, optional): + Backward window size in attention constraint. (Default value = None) + forward_window(int, optional): + (Default value = None) Returns: - Tensor: Output sequence of features (L, odim). - Tensor: Output sequence of stop probabilities (L,). - Tensor: Attention weights (L, T). + Tensor: + Output sequence of features (L, odim). + Tensor: + Output sequence of stop probabilities (L,). + Tensor: + Attention weights (L, T). Note: This computation is performed in auto-regressive manner. @@ -625,9 +661,12 @@ class Decoder(nn.Layer): """Calculate all of the attention weights. Args: - hs (Tensor): Batch of the sequences of padded hidden states (B, Tmax, idim). - hlens (Tensor(int64)): Batch of lengths of each input batch (B,). - ys (Tensor): Batch of the sequences of padded target features (B, Lmax, odim). + hs (Tensor): + Batch of the sequences of padded hidden states (B, Tmax, idim). + hlens (Tensor(int64)): + Batch of lengths of each input batch (B,). + ys (Tensor): + Batch of the sequences of padded target features (B, Lmax, odim). Returns: numpy.ndarray: diff --git a/paddlespeech/t2s/modules/tacotron2/encoder.py b/paddlespeech/t2s/modules/tacotron2/encoder.py index db102a115a067a0c9872cf0bebceb355711da482..224c82400d2b2815ee5669d225c720633651b5ab 100644 --- a/paddlespeech/t2s/modules/tacotron2/encoder.py +++ b/paddlespeech/t2s/modules/tacotron2/encoder.py @@ -46,17 +46,28 @@ class Encoder(nn.Layer): padding_idx=0, ): """Initialize Tacotron2 encoder module. Args: - idim (int): Dimension of the inputs. - input_layer (str): Input layer type. - embed_dim (int, optional): Dimension of character embedding. - elayers (int, optional): The number of encoder blstm layers. - eunits (int, optional): The number of encoder blstm units. 
- econv_layers (int, optional): The number of encoder conv layers. - econv_filts (int, optional): The number of encoder conv filter size. - econv_chans (int, optional): The number of encoder conv filter channels. - use_batch_norm (bool, optional): Whether to use batch normalization. - use_residual (bool, optional): Whether to use residual connection. - dropout_rate (float, optional): Dropout rate. + idim (int): + Dimension of the inputs. + input_layer (str): + Input layer type. + embed_dim (int, optional): + Dimension of character embedding. + elayers (int, optional): + The number of encoder blstm layers. + eunits (int, optional): + The number of encoder blstm units. + econv_layers (int, optional): + The number of encoder conv layers. + econv_filts (int, optional): + The number of encoder conv filter size. + econv_chans (int, optional): + The number of encoder conv filter channels. + use_batch_norm (bool, optional): + Whether to use batch normalization. + use_residual (bool, optional): + Whether to use residual connection. + dropout_rate (float, optional): + Dropout rate. """ super().__init__() @@ -127,14 +138,18 @@ class Encoder(nn.Layer): """Calculate forward propagation. Args: - xs (Tensor): Batch of the padded sequence. Either character ids (B, Tmax) + xs (Tensor): + Batch of the padded sequence. Either character ids (B, Tmax) or acoustic feature (B, Tmax, idim * encoder_reduction_factor). Padded value should be 0. - ilens (Tensor(int64)): Batch of lengths of each input batch (B,). + ilens (Tensor(int64)): + Batch of lengths of each input batch (B,). Returns: - Tensor: Batch of the sequences of encoder states(B, Tmax, eunits). - Tensor(int64): Batch of lengths of each sequence (B,) + Tensor: + Batch of the sequences of encoder states(B, Tmax, eunits). + Tensor(int64): + Batch of lengths of each sequence (B,) """ xs = self.embed(xs).transpose([0, 2, 1]) if self.convs is not None: @@ -161,8 +176,8 @@ class Encoder(nn.Layer): """Inference. 
Args: - x (Tensor): The sequeunce of character ids (T,) - or acoustic feature (T, idim * encoder_reduction_factor). + x (Tensor): + The sequence of character ids (T,) or acoustic feature (T, idim * encoder_reduction_factor). Returns: Tensor: The sequences of encoder states(T, eunits). diff --git a/paddlespeech/t2s/modules/tade_res_block.py b/paddlespeech/t2s/modules/tade_res_block.py index b2275e2361405c81542042d92ea161c5dc6bb4bf..799cbe9fd4041363571ede7a743e6c74f97c1aa3 100644 --- a/paddlespeech/t2s/modules/tade_res_block.py +++ b/paddlespeech/t2s/modules/tade_res_block.py @@ -60,11 +60,15 @@ class TADELayer(nn.Layer): def forward(self, x, c): """Calculate forward propagation. Args: - x (Tensor): Input tensor (B, in_channels, T). - c (Tensor): Auxiliary input tensor (B, aux_channels, T). + x (Tensor): + Input tensor (B, in_channels, T). + c (Tensor): + Auxiliary input tensor (B, aux_channels, T). Returns: - Tensor: Output tensor (B, in_channels, T * upsample_factor). - Tensor: Upsampled aux tensor (B, in_channels, T * upsample_factor). + Tensor: + Output tensor (B, in_channels, T * upsample_factor). + Tensor: + Upsampled aux tensor (B, in_channels, T * upsample_factor). """ x = self.norm(x) @@ -138,11 +142,15 @@ class TADEResBlock(nn.Layer): """Calculate forward propagation. Args: - x (Tensor): Input tensor (B, in_channels, T). - c (Tensor): Auxiliary input tensor (B, aux_channels, T). + x (Tensor): + Input tensor (B, in_channels, T). + c (Tensor): + Auxiliary input tensor (B, aux_channels, T). Returns: - Tensor: Output tensor (B, in_channels, T * upsample_factor). - Tensor: Upsampled auxirialy tensor (B, in_channels, T * upsample_factor). + Tensor: + Output tensor (B, in_channels, T * upsample_factor). + Tensor: + Upsampled auxiliary tensor (B, in_channels, T * upsample_factor). 
""" residual = x x, c = self.tade1(x, c) diff --git a/paddlespeech/t2s/modules/transformer/attention.py b/paddlespeech/t2s/modules/transformer/attention.py index 538a36b6bfa3c8a1ec82798ef1b7923f4d4bdfb5..d7a032445304fe5a6d6b9294912665f29ff53858 100644 --- a/paddlespeech/t2s/modules/transformer/attention.py +++ b/paddlespeech/t2s/modules/transformer/attention.py @@ -25,9 +25,12 @@ from paddlespeech.t2s.modules.masked_fill import masked_fill class MultiHeadedAttention(nn.Layer): """Multi-Head Attention layer. Args: - n_head (int): The number of heads. - n_feat (int): The number of features. - dropout_rate (float): Dropout rate. + n_head (int): + The number of heads. + n_feat (int): + The number of features. + dropout_rate (float): + Dropout rate. """ def __init__(self, n_head, n_feat, dropout_rate): @@ -48,14 +51,20 @@ class MultiHeadedAttention(nn.Layer): """Transform query, key and value. Args: - query(Tensor): query tensor (#batch, time1, size). - key(Tensor): Key tensor (#batch, time2, size). - value(Tensor): Value tensor (#batch, time2, size). + query(Tensor): + query tensor (#batch, time1, size). + key(Tensor): + Key tensor (#batch, time2, size). + value(Tensor): + Value tensor (#batch, time2, size). Returns: - Tensor: Transformed query tensor (#batch, n_head, time1, d_k). - Tensor: Transformed key tensor (#batch, n_head, time2, d_k). - Tensor: Transformed value tensor (#batch, n_head, time2, d_k). + Tensor: + Transformed query tensor (#batch, n_head, time1, d_k). + Tensor: + Transformed key tensor (#batch, n_head, time2, d_k). + Tensor: + Transformed value tensor (#batch, n_head, time2, d_k). """ n_batch = paddle.shape(query)[0] @@ -77,9 +86,12 @@ class MultiHeadedAttention(nn.Layer): """Compute attention context vector. Args: - value(Tensor): Transformed value (#batch, n_head, time2, d_k). - scores(Tensor): Attention score (#batch, n_head, time1, time2). - mask(Tensor, optional): Mask (#batch, 1, time2) or (#batch, time1, time2). 
(Default value = None) + value(Tensor): + Transformed value (#batch, n_head, time2, d_k). + scores(Tensor): + Attention score (#batch, n_head, time1, time2). + mask(Tensor, optional): + Mask (#batch, 1, time2) or (#batch, time1, time2). (Default value = None) Returns: Tensor: Transformed value (#batch, time1, d_model) weighted by the attention score (#batch, time1, time2). @@ -113,10 +125,14 @@ class MultiHeadedAttention(nn.Layer): """Compute scaled dot product attention. Args: - query(Tensor): Query tensor (#batch, time1, size). - key(Tensor): Key tensor (#batch, time2, size). - value(Tensor): Value tensor (#batch, time2, size). - mask(Tensor, optional): Mask tensor (#batch, 1, time2) or (#batch, time1, time2). (Default value = None) + query(Tensor): + Query tensor (#batch, time1, size). + key(Tensor): + Key tensor (#batch, time2, size). + value(Tensor): + Value tensor (#batch, time2, size). + mask(Tensor, optional): + Mask tensor (#batch, 1, time2) or (#batch, time1, time2). (Default value = None) Returns: Tensor: Output tensor (#batch, time1, d_model). @@ -134,10 +150,14 @@ class RelPositionMultiHeadedAttention(MultiHeadedAttention): Paper: https://arxiv.org/abs/1901.02860 Args: - n_head (int): The number of heads. - n_feat (int): The number of features. - dropout_rate (float): Dropout rate. - zero_triu (bool): Whether to zero the upper triangular part of attention matrix. + n_head (int): + The number of heads. + n_feat (int): + The number of features. + dropout_rate (float): + Dropout rate. + zero_triu (bool): + Whether to zero the upper triangular part of attention matrix. """ def __init__(self, n_head, n_feat, dropout_rate, zero_triu=False): @@ -161,10 +181,11 @@ class RelPositionMultiHeadedAttention(MultiHeadedAttention): def rel_shift(self, x): """Compute relative positional encoding. Args: - x(Tensor): Input tensor (batch, head, time1, 2*time1-1). + x(Tensor): + Input tensor (batch, head, time1, 2*time1-1). Returns: - Tensor:Output tensor. 
+ Tensor: Output tensor. """ b, h, t1, t2 = paddle.shape(x) zero_pad = paddle.zeros((b, h, t1, 1)) @@ -183,11 +204,16 @@ class RelPositionMultiHeadedAttention(MultiHeadedAttention): """Compute 'Scaled Dot Product Attention' with rel. positional encoding. Args: - query(Tensor): Query tensor (#batch, time1, size). - key(Tensor): Key tensor (#batch, time2, size). - value(Tensor): Value tensor (#batch, time2, size). - pos_emb(Tensor): Positional embedding tensor (#batch, 2*time1-1, size). - mask(Tensor): Mask tensor (#batch, 1, time2) or (#batch, time1, time2). + query(Tensor): + Query tensor (#batch, time1, size). + key(Tensor): + Key tensor (#batch, time2, size). + value(Tensor): + Value tensor (#batch, time2, size). + pos_emb(Tensor): + Positional embedding tensor (#batch, 2*time1-1, size). + mask(Tensor): + Mask tensor (#batch, 1, time2) or (#batch, time1, time2). Returns: Tensor: Output tensor (#batch, time1, d_model). @@ -228,10 +254,14 @@ class LegacyRelPositionMultiHeadedAttention(MultiHeadedAttention): Paper: https://arxiv.org/abs/1901.02860 Args: - n_head (int): The number of heads. - n_feat (int): The number of features. - dropout_rate (float): Dropout rate. - zero_triu (bool): Whether to zero the upper triangular part of attention matrix. + n_head (int): + The number of heads. + n_feat (int): + The number of features. + dropout_rate (float): + Dropout rate. + zero_triu (bool): + Whether to zero the upper triangular part of attention matrix. """ def __init__(self, n_head, n_feat, dropout_rate, zero_triu=False): @@ -255,8 +285,8 @@ class LegacyRelPositionMultiHeadedAttention(MultiHeadedAttention): def rel_shift(self, x): """Compute relative positional encoding. Args: - x(Tensor): Input tensor (batch, head, time1, time2). - + x(Tensor): + Input tensor (batch, head, time1, time2). Returns: Tensor:Output tensor. 
""" diff --git a/paddlespeech/t2s/modules/transformer/decoder.py b/paddlespeech/t2s/modules/transformer/decoder.py index a8db7345ad07b336debee14ff692cfe4a363a1dd..e68487678560702e484844b804a3d916ee40c838 100644 --- a/paddlespeech/t2s/modules/transformer/decoder.py +++ b/paddlespeech/t2s/modules/transformer/decoder.py @@ -37,28 +37,46 @@ class Decoder(nn.Layer): """Transfomer decoder module. Args: - odim (int): Output diminsion. - self_attention_layer_type (str): Self-attention layer type. - attention_dim (int): Dimention of attention. - attention_heads (int): The number of heads of multi head attention. - conv_wshare (int): The number of kernel of convolution. Only used in + odim (int): + Output diminsion. + self_attention_layer_type (str): + Self-attention layer type. + attention_dim (int): + Dimention of attention. + attention_heads (int): + The number of heads of multi head attention. + conv_wshare (int): + The number of kernel of convolution. Only used in self_attention_layer_type == "lightconv*" or "dynamiconv*". - conv_kernel_length (Union[int, str]):Kernel size str of convolution + conv_kernel_length (Union[int, str]): + Kernel size str of convolution (e.g. 71_71_71_71_71_71). Only used in self_attention_layer_type == "lightconv*" or "dynamiconv*". - conv_usebias (bool): Whether to use bias in convolution. Only used in + conv_usebias (bool): + Whether to use bias in convolution. Only used in self_attention_layer_type == "lightconv*" or "dynamiconv*". - linear_units(int): The number of units of position-wise feed forward. - num_blocks (int): The number of decoder blocks. - dropout_rate (float): Dropout rate. - positional_dropout_rate (float): Dropout rate after adding positional encoding. - self_attention_dropout_rate (float): Dropout rate in self-attention. - src_attention_dropout_rate (float): Dropout rate in source-attention. - input_layer (Union[str, nn.Layer]): Input layer type. - use_output_layer (bool): Whether to use output layer. 
- pos_enc_class (nn.Layer): Positional encoding module class. + linear_units(int): + The number of units of position-wise feed forward. + num_blocks (int): + The number of decoder blocks. + dropout_rate (float): + Dropout rate. + positional_dropout_rate (float): + Dropout rate after adding positional encoding. + self_attention_dropout_rate (float): + Dropout rate in self-attention. + src_attention_dropout_rate (float): + Dropout rate in source-attention. + input_layer (Union[str, nn.Layer]): + Input layer type. + use_output_layer (bool): + Whether to use output layer. + pos_enc_class (nn.Layer): + Positional encoding module class. `PositionalEncoding `or `ScaledPositionalEncoding` - normalize_before (bool): Whether to use layer_norm before the first block. - concat_after (bool): Whether to concat attention layer's input and output. + normalize_before (bool): + Whether to use layer_norm before the first block. + concat_after (bool): + Whether to concat attention layer's input and output. if True, additional linear will be applied. i.e. x -> x + linear(concat(x, att(x))) if False, no additional linear will be applied. i.e. x -> x + att(x) @@ -143,17 +161,22 @@ class Decoder(nn.Layer): def forward(self, tgt, tgt_mask, memory, memory_mask): """Forward decoder. Args: - tgt(Tensor): Input token ids, int64 (#batch, maxlen_out) if input_layer == "embed". + tgt(Tensor): + Input token ids, int64 (#batch, maxlen_out) if input_layer == "embed". In the other case, input tensor (#batch, maxlen_out, odim). - tgt_mask(Tensor): Input token mask (#batch, maxlen_out). - memory(Tensor): Encoded memory, float32 (#batch, maxlen_in, feat). - memory_mask(Tensor): Encoded memory mask (#batch, maxlen_in). + tgt_mask(Tensor): + Input token mask (#batch, maxlen_out). + memory(Tensor): + Encoded memory, float32 (#batch, maxlen_in, feat). + memory_mask(Tensor): + Encoded memory mask (#batch, maxlen_in). 
Returns: Tensor: Decoded token score before softmax (#batch, maxlen_out, odim) if use_output_layer is True. In the other case,final block outputs (#batch, maxlen_out, attention_dim). - Tensor: Score mask before softmax (#batch, maxlen_out). + Tensor: + Score mask before softmax (#batch, maxlen_out). """ x = self.embed(tgt) @@ -169,14 +192,20 @@ class Decoder(nn.Layer): """Forward one step. Args: - tgt(Tensor): Input token ids, int64 (#batch, maxlen_out). - tgt_mask(Tensor): Input token mask (#batch, maxlen_out). - memory(Tensor): Encoded memory, float32 (#batch, maxlen_in, feat). - cache((List[Tensor]), optional): List of cached tensors. (Default value = None) + tgt(Tensor): + Input token ids, int64 (#batch, maxlen_out). + tgt_mask(Tensor): + Input token mask (#batch, maxlen_out). + memory(Tensor): + Encoded memory, float32 (#batch, maxlen_in, feat). + cache((List[Tensor]), optional): + List of cached tensors. (Default value = None) Returns: - Tensor: Output tensor (batch, maxlen_out, odim). - List[Tensor]: List of cache tensors of each decoder layer. + Tensor: + Output tensor (batch, maxlen_out, odim). + List[Tensor]: + List of cache tensors of each decoder layer. """ x = self.embed(tgt) @@ -219,9 +248,12 @@ class Decoder(nn.Layer): """Score new token batch (required). Args: - ys(Tensor): paddle.int64 prefix tokens (n_batch, ylen). - states(List[Any]): Scorer states for prefix tokens. - xs(Tensor): The encoder feature that generates ys (n_batch, xlen, n_feat). + ys(Tensor): + paddle.int64 prefix tokens (n_batch, ylen). + states(List[Any]): + Scorer states for prefix tokens. + xs(Tensor): + The encoder feature that generates ys (n_batch, xlen, n_feat). 
Returns: tuple[Tensor, List[Any]]: diff --git a/paddlespeech/t2s/modules/transformer/decoder_layer.py b/paddlespeech/t2s/modules/transformer/decoder_layer.py index 9a13cd794c52cdfab8e7e5ae4cc3aa7842a71688..0a79e95480871288a8cc2c3d7c5b7af4b8f1ff90 100644 --- a/paddlespeech/t2s/modules/transformer/decoder_layer.py +++ b/paddlespeech/t2s/modules/transformer/decoder_layer.py @@ -24,16 +24,23 @@ class DecoderLayer(nn.Layer): Args: - size (int): Input dimension. - self_attn (nn.Layer): Self-attention module instance. + size (int): + Input dimension. + self_attn (nn.Layer): + Self-attention module instance. `MultiHeadedAttention` instance can be used as the argument. - src_attn (nn.Layer): Self-attention module instance. + src_attn (nn.Layer): + Self-attention module instance. `MultiHeadedAttention` instance can be used as the argument. - feed_forward (nn.Layer): Feed-forward module instance. + feed_forward (nn.Layer): + Feed-forward module instance. `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance can be used as the argument. - dropout_rate (float): Dropout rate. - normalize_before (bool): Whether to use layer_norm before the first block. - concat_after (bool): Whether to concat attention layer's input and output. + dropout_rate (float): + Dropout rate. + normalize_before (bool): + Whether to use layer_norm before the first block. + concat_after (bool): + Whether to concat attention layer's input and output. if True, additional linear will be applied. i.e. x -> x + linear(concat(x, att(x))) if False, no additional linear will be applied. i.e. x -> x + att(x) @@ -69,11 +76,16 @@ class DecoderLayer(nn.Layer): """Compute decoded features. Args: - tgt(Tensor): Input tensor (#batch, maxlen_out, size). - tgt_mask(Tensor): Mask for input tensor (#batch, maxlen_out). - memory(Tensor): Encoded memory, float32 (#batch, maxlen_in, size). - memory_mask(Tensor): Encoded memory mask (#batch, maxlen_in). 
- cache(List[Tensor], optional): List of cached tensors. + tgt(Tensor): + Input tensor (#batch, maxlen_out, size). + tgt_mask(Tensor): + Mask for input tensor (#batch, maxlen_out). + memory(Tensor): + Encoded memory, float32 (#batch, maxlen_in, size). + memory_mask(Tensor): + Encoded memory mask (#batch, maxlen_in). + cache(List[Tensor], optional): + List of cached tensors. Each tensor shape should be (#batch, maxlen_out - 1, size). (Default value = None) Returns: Tensor diff --git a/paddlespeech/t2s/modules/transformer/embedding.py b/paddlespeech/t2s/modules/transformer/embedding.py index 9524f07ee6db6cc6e9ac64998b3f1adb3d7b9ee4..7ba301cbd6af16ccda30dbfba7b432a15ec74c2c 100644 --- a/paddlespeech/t2s/modules/transformer/embedding.py +++ b/paddlespeech/t2s/modules/transformer/embedding.py @@ -23,11 +23,16 @@ class PositionalEncoding(nn.Layer): """Positional encoding. Args: - d_model (int): Embedding dimension. - dropout_rate (float): Dropout rate. - max_len (int): Maximum input length. - reverse (bool): Whether to reverse the input position. - type (str): dtype of param + d_model (int): + Embedding dimension. + dropout_rate (float): + Dropout rate. + max_len (int): + Maximum input length. + reverse (bool): + Whether to reverse the input position. + type (str): + dtype of param """ def __init__(self, @@ -68,7 +73,8 @@ class PositionalEncoding(nn.Layer): """Add positional encoding. Args: - x (Tensor): Input tensor (batch, time, `*`). + x (Tensor): + Input tensor (batch, time, `*`). Returns: Tensor: Encoded tensor (batch, time, `*`). @@ -84,10 +90,14 @@ class ScaledPositionalEncoding(PositionalEncoding): See Sec. 3.2 https://arxiv.org/abs/1809.08895 Args: - d_model (int): Embedding dimension. - dropout_rate (float): Dropout rate. - max_len (int): Maximum input length. - dtype (str): dtype of param + d_model (int): + Embedding dimension. + dropout_rate (float): + Dropout rate. + max_len (int): + Maximum input length. 
+ dtype (str): + dtype of param """ def __init__(self, d_model, dropout_rate, max_len=5000, dtype="float32"): @@ -111,7 +121,8 @@ class ScaledPositionalEncoding(PositionalEncoding): """Add positional encoding. Args: - x (Tensor): Input tensor (batch, time, `*`). + x (Tensor): + Input tensor (batch, time, `*`). Returns: Tensor: Encoded tensor (batch, time, `*`). """ @@ -127,9 +138,12 @@ class RelPositionalEncoding(nn.Layer): See : Appendix B in https://arxiv.org/abs/1901.02860 Args: - d_model (int): Embedding dimension. - dropout_rate (float): Dropout rate. - max_len (int): Maximum input length. + d_model (int): + Embedding dimension. + dropout_rate (float): + Dropout rate. + max_len (int): + Maximum input length. """ def __init__(self, d_model, dropout_rate, max_len=5000, dtype="float32"): @@ -175,7 +189,8 @@ class RelPositionalEncoding(nn.Layer): def forward(self, x: paddle.Tensor): """Add positional encoding. Args: - x (Tensor):Input tensor (batch, time, `*`). + x (Tensor): + Input tensor (batch, time, `*`). Returns: Tensor: Encoded tensor (batch, time, `*`). """ @@ -195,18 +210,24 @@ class LegacyRelPositionalEncoding(PositionalEncoding): See : Appendix B in https://arxiv.org/abs/1901.02860 Args: - d_model (int): Embedding dimension. - dropout_rate (float): Dropout rate. - max_len (int): Maximum input length. + d_model (int): + Embedding dimension. + dropout_rate (float): + Dropout rate. + max_len (int): + Maximum input length. """ def __init__(self, d_model: int, dropout_rate: float, max_len: int=5000): """ Args: - d_model (int): Embedding dimension. - dropout_rate (float): Dropout rate. - max_len (int, optional): [Maximum input length.]. Defaults to 5000. + d_model (int): + Embedding dimension. + dropout_rate (float): + Dropout rate. + max_len (int, optional): + [Maximum input length.]. Defaults to 5000. 
""" super().__init__(d_model, dropout_rate, max_len, reverse=True) @@ -234,10 +255,13 @@ class LegacyRelPositionalEncoding(PositionalEncoding): def forward(self, x: paddle.Tensor): """Compute positional encoding. Args: - x (paddle.Tensor): Input tensor (batch, time, `*`). + x (Tensor): + Input tensor (batch, time, `*`). Returns: - paddle.Tensor: Encoded tensor (batch, time, `*`). - paddle.Tensor: Positional embedding tensor (1, time, `*`). + Tensor: + Encoded tensor (batch, time, `*`). + Tensor: + Positional embedding tensor (1, time, `*`). """ self.extend_pe(x) x = x * self.xscale diff --git a/paddlespeech/t2s/modules/transformer/encoder.py b/paddlespeech/t2s/modules/transformer/encoder.py index 11986360a30ed7dfd7dfcaed4a50bdf259e26983..f2aed58926d7c44392ffa4a69038d32c1506b7bf 100644 --- a/paddlespeech/t2s/modules/transformer/encoder.py +++ b/paddlespeech/t2s/modules/transformer/encoder.py @@ -38,32 +38,55 @@ class BaseEncoder(nn.Layer): """Base Encoder module. Args: - idim (int): Input dimension. - attention_dim (int): Dimention of attention. - attention_heads (int): The number of heads of multi head attention. - linear_units (int): The number of units of position-wise feed forward. - num_blocks (int): The number of decoder blocks. - dropout_rate (float): Dropout rate. - positional_dropout_rate (float): Dropout rate after adding positional encoding. - attention_dropout_rate (float): Dropout rate in attention. - input_layer (Union[str, nn.Layer]): Input layer type. - normalize_before (bool): Whether to use layer_norm before the first block. - concat_after (bool): Whether to concat attention layer's input and output. + idim (int): + Input dimension. + attention_dim (int): + Dimention of attention. + attention_heads (int): + The number of heads of multi head attention. + linear_units (int): + The number of units of position-wise feed forward. + num_blocks (int): + The number of decoder blocks. + dropout_rate (float): + Dropout rate. 
+ positional_dropout_rate (float): + Dropout rate after adding positional encoding. + attention_dropout_rate (float): + Dropout rate in attention. + input_layer (Union[str, nn.Layer]): + Input layer type. + normalize_before (bool): + Whether to use layer_norm before the first block. + concat_after (bool): + Whether to concat attention layer's input and output. if True, additional linear will be applied. i.e. x -> x + linear(concat(x, att(x))) if False, no additional linear will be applied. i.e. x -> x + att(x) - positionwise_layer_type (str): "linear", "conv1d", or "conv1d-linear". - positionwise_conv_kernel_size (int): Kernel size of positionwise conv1d layer. - macaron_style (bool): Whether to use macaron style for positionwise layer. - pos_enc_layer_type (str): Encoder positional encoding layer type. - selfattention_layer_type (str): Encoder attention layer type. - activation_type (str): Encoder activation function type. - use_cnn_module (bool): Whether to use convolution module. - zero_triu (bool): Whether to zero the upper triangular part of attention matrix. - cnn_module_kernel (int): Kernerl size of convolution module. - padding_idx (int): Padding idx for input_layer=embed. - stochastic_depth_rate (float): Maximum probability to skip the encoder layer. - intermediate_layers (Union[List[int], None]): indices of intermediate CTC layer. + positionwise_layer_type (str): + "linear", "conv1d", or "conv1d-linear". + positionwise_conv_kernel_size (int): + Kernel size of positionwise conv1d layer. + macaron_style (bool): + Whether to use macaron style for positionwise layer. + pos_enc_layer_type (str): + Encoder positional encoding layer type. + selfattention_layer_type (str): + Encoder attention layer type. + activation_type (str): + Encoder activation function type. + use_cnn_module (bool): + Whether to use convolution module. + zero_triu (bool): + Whether to zero the upper triangular part of attention matrix. 
+ cnn_module_kernel (int): + Kernerl size of convolution module. + padding_idx (int): + Padding idx for input_layer=embed. + stochastic_depth_rate (float): + Maximum probability to skip the encoder layer. + intermediate_layers (Union[List[int], None]): + indices of intermediate CTC layer. indices start from 1. if not None, intermediate outputs are returned (which changes return type signature.) @@ -266,12 +289,16 @@ class BaseEncoder(nn.Layer): """Encode input sequence. Args: - xs (Tensor): Input tensor (#batch, time, idim). - masks (Tensor): Mask tensor (#batch, 1, time). + xs (Tensor): + Input tensor (#batch, time, idim). + masks (Tensor): + Mask tensor (#batch, 1, time). Returns: - Tensor: Output tensor (#batch, time, attention_dim). - Tensor: Mask tensor (#batch, 1, time). + Tensor: + Output tensor (#batch, time, attention_dim). + Tensor: + Mask tensor (#batch, 1, time). """ xs = self.embed(xs) xs, masks = self.encoders(xs, masks) @@ -284,26 +311,43 @@ class TransformerEncoder(BaseEncoder): """Transformer encoder module. Args: - idim (int): Input dimension. - attention_dim (int): Dimention of attention. - attention_heads (int): The number of heads of multi head attention. - linear_units (int): The number of units of position-wise feed forward. - num_blocks (int): The number of decoder blocks. - dropout_rate (float): Dropout rate. - positional_dropout_rate (float): Dropout rate after adding positional encoding. - attention_dropout_rate (float): Dropout rate in attention. - input_layer (Union[str, paddle.nn.Layer]): Input layer type. - pos_enc_layer_type (str): Encoder positional encoding layer type. - normalize_before (bool): Whether to use layer_norm before the first block. - concat_after (bool): Whether to concat attention layer's input and output. + idim (int): + Input dimension. + attention_dim (int): + Dimention of attention. + attention_heads (int): + The number of heads of multi head attention. 
+ linear_units (int): + The number of units of position-wise feed forward. + num_blocks (int): + The number of decoder blocks. + dropout_rate (float): + Dropout rate. + positional_dropout_rate (float): + Dropout rate after adding positional encoding. + attention_dropout_rate (float): + Dropout rate in attention. + input_layer (Union[str, paddle.nn.Layer]): + Input layer type. + pos_enc_layer_type (str): + Encoder positional encoding layer type. + normalize_before (bool): + Whether to use layer_norm before the first block. + concat_after (bool): + Whether to concat attention layer's input and output. if True, additional linear will be applied. i.e. x -> x + linear(concat(x, att(x))) if False, no additional linear will be applied. i.e. x -> x + att(x) - positionwise_layer_type (str): "linear", "conv1d", or "conv1d-linear". - positionwise_conv_kernel_size (int): Kernel size of positionwise conv1d layer. - selfattention_layer_type (str): Encoder attention layer type. - activation_type (str): Encoder activation function type. - padding_idx (int): Padding idx for input_layer=embed. + positionwise_layer_type (str): + "linear", "conv1d", or "conv1d-linear". + positionwise_conv_kernel_size (int): + Kernel size of positionwise conv1d layer. + selfattention_layer_type (str): + Encoder attention layer type. + activation_type (str): + Encoder activation function type. + padding_idx (int): + Padding idx for input_layer=embed. """ def __init__( @@ -350,12 +394,16 @@ class TransformerEncoder(BaseEncoder): """Encoder input sequence. Args: - xs(Tensor): Input tensor (#batch, time, idim). - masks(Tensor): Mask tensor (#batch, 1, time). + xs(Tensor): + Input tensor (#batch, time, idim). + masks(Tensor): + Mask tensor (#batch, 1, time). Returns: - Tensor: Output tensor (#batch, time, attention_dim). - Tensor: Mask tensor (#batch, 1, time). + Tensor: + Output tensor (#batch, time, attention_dim). + Tensor: + Mask tensor (#batch, 1, time). 
""" xs = self.embed(xs) xs, masks = self.encoders(xs, masks) @@ -367,14 +415,20 @@ class TransformerEncoder(BaseEncoder): """Encode input frame. Args: - xs (Tensor): Input tensor. - masks (Tensor): Mask tensor. - cache (List[Tensor]): List of cache tensors. + xs (Tensor): + Input tensor. + masks (Tensor): + Mask tensor. + cache (List[Tensor]): + List of cache tensors. Returns: - Tensor: Output tensor. - Tensor: Mask tensor. - List[Tensor]: List of new cache tensors. + Tensor: + Output tensor. + Tensor: + Mask tensor. + List[Tensor]: + List of new cache tensors. """ xs = self.embed(xs) @@ -393,32 +447,55 @@ class ConformerEncoder(BaseEncoder): """Conformer encoder module. Args: - idim (int): Input dimension. - attention_dim (int): Dimention of attention. - attention_heads (int): The number of heads of multi head attention. - linear_units (int): The number of units of position-wise feed forward. - num_blocks (int): The number of decoder blocks. - dropout_rate (float): Dropout rate. - positional_dropout_rate (float): Dropout rate after adding positional encoding. - attention_dropout_rate (float): Dropout rate in attention. - input_layer (Union[str, nn.Layer]): Input layer type. - normalize_before (bool): Whether to use layer_norm before the first block. - concat_after (bool):Whether to concat attention layer's input and output. + idim (int): + Input dimension. + attention_dim (int): + Dimention of attention. + attention_heads (int): + The number of heads of multi head attention. + linear_units (int): + The number of units of position-wise feed forward. + num_blocks (int): + The number of decoder blocks. + dropout_rate (float): + Dropout rate. + positional_dropout_rate (float): + Dropout rate after adding positional encoding. + attention_dropout_rate (float): + Dropout rate in attention. + input_layer (Union[str, nn.Layer]): + Input layer type. + normalize_before (bool): + Whether to use layer_norm before the first block. 
+ concat_after (bool): + Whether to concat attention layer's input and output. if True, additional linear will be applied. i.e. x -> x + linear(concat(x, att(x))) if False, no additional linear will be applied. i.e. x -> x + att(x) - positionwise_layer_type (str): "linear", "conv1d", or "conv1d-linear". - positionwise_conv_kernel_size (int): Kernel size of positionwise conv1d layer. - macaron_style (bool): Whether to use macaron style for positionwise layer. - pos_enc_layer_type (str): Encoder positional encoding layer type. - selfattention_layer_type (str): Encoder attention layer type. - activation_type (str): Encoder activation function type. - use_cnn_module (bool): Whether to use convolution module. - zero_triu (bool): Whether to zero the upper triangular part of attention matrix. - cnn_module_kernel (int): Kernerl size of convolution module. - padding_idx (int): Padding idx for input_layer=embed. - stochastic_depth_rate (float): Maximum probability to skip the encoder layer. - intermediate_layers (Union[List[int], None]):indices of intermediate CTC layer. indices start from 1. + positionwise_layer_type (str): + "linear", "conv1d", or "conv1d-linear". + positionwise_conv_kernel_size (int): + Kernel size of positionwise conv1d layer. + macaron_style (bool): + Whether to use macaron style for positionwise layer. + pos_enc_layer_type (str): + Encoder positional encoding layer type. + selfattention_layer_type (str): + Encoder attention layer type. + activation_type (str): + Encoder activation function type. + use_cnn_module (bool): + Whether to use convolution module. + zero_triu (bool): + Whether to zero the upper triangular part of attention matrix. + cnn_module_kernel (int): + Kernerl size of convolution module. + padding_idx (int): + Padding idx for input_layer=embed. + stochastic_depth_rate (float): + Maximum probability to skip the encoder layer. + intermediate_layers (Union[List[int], None]): + indices of intermediate CTC layer. indices start from 1. 
if not None, intermediate outputs are returned (which changes return type signature.) """ @@ -478,11 +555,15 @@ class ConformerEncoder(BaseEncoder): """Encode input sequence. Args: - xs (Tensor): Input tensor (#batch, time, idim). - masks (Tensor): Mask tensor (#batch, 1, time). + xs (Tensor): + Input tensor (#batch, time, idim). + masks (Tensor): + Mask tensor (#batch, 1, time). Returns: - Tensor: Output tensor (#batch, time, attention_dim). - Tensor: Mask tensor (#batch, 1, time). + Tensor: + Output tensor (#batch, time, attention_dim). + Tensor: + Mask tensor (#batch, 1, time). """ if isinstance(self.embed, (Conv2dSubsampling)): xs, masks = self.embed(xs, masks) @@ -539,7 +620,8 @@ class Conv1dResidualBlock(nn.Layer): def forward(self, xs): """Encode input sequence. Args: - xs (Tensor): Input tensor (#batch, idim, T). + xs (Tensor): + Input tensor (#batch, idim, T). Returns: Tensor: Output tensor (#batch, odim, T). """ @@ -582,8 +664,10 @@ class CNNDecoder(nn.Layer): def forward(self, xs, masks=None): """Encode input sequence. Args: - xs (Tensor): Input tensor (#batch, time, idim). - masks (Tensor): Mask tensor (#batch, 1, time). + xs (Tensor): + Input tensor (#batch, time, idim). + masks (Tensor): + Mask tensor (#batch, 1, time). Returns: Tensor: Output tensor (#batch, time, odim). """ @@ -629,8 +713,10 @@ class CNNPostnet(nn.Layer): def forward(self, xs, masks=None): """Encode input sequence. Args: - xs (Tensor): Input tensor (#batch, odim, time). - masks (Tensor): Mask tensor (#batch, 1, time). + xs (Tensor): + Input tensor (#batch, odim, time). + masks (Tensor): + Mask tensor (#batch, 1, time). Returns: Tensor: Output tensor (#batch, odim, time). 
""" diff --git a/paddlespeech/t2s/modules/transformer/encoder_layer.py b/paddlespeech/t2s/modules/transformer/encoder_layer.py index 72372b69b92bcae4dab8485498f56d0ad639f91f..63494b0de8d3cbf16edc4dc7d8dd09b7cdec65d9 100644 --- a/paddlespeech/t2s/modules/transformer/encoder_layer.py +++ b/paddlespeech/t2s/modules/transformer/encoder_layer.py @@ -21,14 +21,20 @@ class EncoderLayer(nn.Layer): """Encoder layer module. Args: - size (int): Input dimension. - self_attn (nn.Layer): Self-attention module instance. + size (int): + Input dimension. + self_attn (nn.Layer): + Self-attention module instance. `MultiHeadedAttention` instance can be used as the argument. - feed_forward (nn.Layer): Feed-forward module instance. + feed_forward (nn.Layer): + Feed-forward module instance. `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance can be used as the argument. - dropout_rate (float): Dropout rate. - normalize_before (bool): Whether to use layer_norm before the first block. - concat_after (bool): Whether to concat attention layer's input and output. + dropout_rate (float): + Dropout rate. + normalize_before (bool): + Whether to use layer_norm before the first block. + concat_after (bool): + Whether to concat attention layer's input and output. if True, additional linear will be applied. i.e. x -> x + linear(concat(x, att(x))) if False, no additional linear will be applied. i.e. x -> x + att(x) @@ -59,13 +65,18 @@ class EncoderLayer(nn.Layer): """Compute encoded features. Args: - x(Tensor): Input tensor (#batch, time, size). - mask(Tensor): Mask tensor for the input (#batch, time). - cache(Tensor, optional): Cache tensor of the input (#batch, time - 1, size). + x(Tensor): + Input tensor (#batch, time, size). + mask(Tensor): + Mask tensor for the input (#batch, time). + cache(Tensor, optional): + Cache tensor of the input (#batch, time - 1, size). Returns: - Tensor: Output tensor (#batch, time, size). - Tensor: Mask tensor (#batch, time). 
+ Tensor: + Output tensor (#batch, time, size). + Tensor: + Mask tensor (#batch, time). """ residual = x if self.normalize_before: diff --git a/paddlespeech/t2s/modules/transformer/lightconv.py b/paddlespeech/t2s/modules/transformer/lightconv.py index 9bcc1acfba021d91bec798f4eea41461cfffb81e..22217d50f512699a88cb649af11628a2e5d111bf 100644 --- a/paddlespeech/t2s/modules/transformer/lightconv.py +++ b/paddlespeech/t2s/modules/transformer/lightconv.py @@ -31,12 +31,18 @@ class LightweightConvolution(nn.Layer): https://github.com/pytorch/fairseq/tree/master/fairseq Args: - wshare (int): the number of kernel of convolution - n_feat (int): the number of features - dropout_rate (float): dropout_rate - kernel_size (int): kernel size (length) - use_kernel_mask (bool): Use causal mask or not for convolution kernel - use_bias (bool): Use bias term or not. + wshare (int): + the number of kernel of convolution + n_feat (int): + the number of features + dropout_rate (float): + dropout_rate + kernel_size (int): + kernel size (length) + use_kernel_mask (bool): + Use causal mask or not for convolution kernel + use_bias (bool): + Use bias term or not. """ @@ -94,10 +100,14 @@ class LightweightConvolution(nn.Layer): This is just for compatibility with self-attention layer (attention.py) Args: - query (Tensor): input tensor. (batch, time1, d_model) - key (Tensor): NOT USED. (batch, time2, d_model) - value (Tensor): NOT USED. (batch, time2, d_model) - mask : (Tensor): (batch, time1, time2) mask + query (Tensor): + input tensor. (batch, time1, d_model) + key (Tensor): + NOT USED. (batch, time2, d_model) + value (Tensor): + NOT USED. (batch, time2, d_model) + mask : (Tensor): + (batch, time1, time2) mask Return: Tensor: ouput. 
(batch, time1, d_model) diff --git a/paddlespeech/t2s/modules/transformer/mask.py b/paddlespeech/t2s/modules/transformer/mask.py index c10e6add2a0c37e052a9df9d3ae6a8535d03e942..71dd379756df892a729994022e84f1467118d814 100644 --- a/paddlespeech/t2s/modules/transformer/mask.py +++ b/paddlespeech/t2s/modules/transformer/mask.py @@ -19,8 +19,10 @@ def subsequent_mask(size, dtype=paddle.bool): """Create mask for subsequent steps (size, size). Args: - size (int): size of mask - dtype (paddle.dtype): result dtype + size (int): + size of mask + dtype (paddle.dtype): + result dtype Return: Tensor: >>> subsequent_mask(3) @@ -36,9 +38,12 @@ def target_mask(ys_in_pad, ignore_id, dtype=paddle.bool): """Create mask for decoder self-attention. Args: - ys_pad (Tensor): batch of padded target sequences (B, Lmax) - ignore_id (int): index of padding - dtype (paddle.dtype): result dtype + ys_pad (Tensor): + batch of padded target sequences (B, Lmax) + ignore_id (int): + index of padding + dtype (paddle.dtype): + result dtype Return: Tensor: (B, Lmax, Lmax) """ diff --git a/paddlespeech/t2s/modules/transformer/multi_layer_conv.py b/paddlespeech/t2s/modules/transformer/multi_layer_conv.py index d3285b65f3113c4aaa844d5ccb35d0399e3f6331..91d67ca58376967f4eeaf6d069691446c2714b79 100644 --- a/paddlespeech/t2s/modules/transformer/multi_layer_conv.py +++ b/paddlespeech/t2s/modules/transformer/multi_layer_conv.py @@ -32,10 +32,14 @@ class MultiLayeredConv1d(nn.Layer): """Initialize MultiLayeredConv1d module. Args: - in_chans (int): Number of input channels. - hidden_chans (int): Number of hidden channels. - kernel_size (int): Kernel size of conv1d. - dropout_rate (float): Dropout rate. + in_chans (int): + Number of input channels. + hidden_chans (int): + Number of hidden channels. + kernel_size (int): + Kernel size of conv1d. + dropout_rate (float): + Dropout rate. """ super().__init__() @@ -58,7 +62,8 @@ class MultiLayeredConv1d(nn.Layer): """Calculate forward propagation. 
Args: - x (Tensor): Batch of input tensors (B, T, in_chans). + x (Tensor): + Batch of input tensors (B, T, in_chans). Returns: Tensor: Batch of output tensors (B, T, in_chans). @@ -79,10 +84,14 @@ class Conv1dLinear(nn.Layer): """Initialize Conv1dLinear module. Args: - in_chans (int): Number of input channels. - hidden_chans (int): Number of hidden channels. - kernel_size (int): Kernel size of conv1d. - dropout_rate (float): Dropout rate. + in_chans (int): + Number of input channels. + hidden_chans (int): + Number of hidden channels. + kernel_size (int): + Kernel size of conv1d. + dropout_rate (float): + Dropout rate. """ super().__init__() self.w_1 = nn.Conv1D( @@ -99,7 +108,8 @@ class Conv1dLinear(nn.Layer): """Calculate forward propagation. Args: - x (Tensor): Batch of input tensors (B, T, in_chans). + x (Tensor): + Batch of input tensors (B, T, in_chans). Returns: Tensor: Batch of output tensors (B, T, in_chans). diff --git a/paddlespeech/t2s/modules/transformer/positionwise_feed_forward.py b/paddlespeech/t2s/modules/transformer/positionwise_feed_forward.py index 92af6851c402b969a5e590be287ba5e7f9c5a262..45ea279bfdde9707a96e219270f9a24886c9c094 100644 --- a/paddlespeech/t2s/modules/transformer/positionwise_feed_forward.py +++ b/paddlespeech/t2s/modules/transformer/positionwise_feed_forward.py @@ -21,9 +21,12 @@ class PositionwiseFeedForward(nn.Layer): """Positionwise feed forward layer. Args: - idim (int): Input dimenstion. - hidden_units (int): The number of hidden units. - dropout_rate (float): Dropout rate. + idim (int): + Input dimenstion. + hidden_units (int): + The number of hidden units. + dropout_rate (float): + Dropout rate. 
""" def __init__(self, diff --git a/paddlespeech/t2s/modules/transformer/repeat.py b/paddlespeech/t2s/modules/transformer/repeat.py index 1e946adf7e469fd6c05c2a8c8d9e6f16f638524e..43d11e9f96ee203d003fabac046a8620edd0bdec 100644 --- a/paddlespeech/t2s/modules/transformer/repeat.py +++ b/paddlespeech/t2s/modules/transformer/repeat.py @@ -30,8 +30,10 @@ def repeat(N, fn): """Repeat module N times. Args: - N (int): Number of repeat time. - fn (Callable): Function to generate module. + N (int): + Number of repeat time. + fn (Callable): + Function to generate module. Returns: MultiSequential: Repeated model instance. diff --git a/paddlespeech/t2s/modules/transformer/subsampling.py b/paddlespeech/t2s/modules/transformer/subsampling.py index 07439705a66cb6bc683bfa5a977aef0db379516c..a17278c0b52ee39d0157ad36e07a61d123bc1b9a 100644 --- a/paddlespeech/t2s/modules/transformer/subsampling.py +++ b/paddlespeech/t2s/modules/transformer/subsampling.py @@ -23,10 +23,14 @@ class Conv2dSubsampling(nn.Layer): """Convolutional 2D subsampling (to 1/4 length). Args: - idim (int): Input dimension. - odim (int): Output dimension. - dropout_rate (float): Dropout rate. - pos_enc (nn.Layer): Custom position encoding layer. + idim (int): + Input dimension. + odim (int): + Output dimension. + dropout_rate (float): + Dropout rate. + pos_enc (nn.Layer): + Custom position encoding layer. """ def __init__(self, idim, odim, dropout_rate, pos_enc=None): @@ -45,11 +49,15 @@ class Conv2dSubsampling(nn.Layer): def forward(self, x, x_mask): """Subsample x. Args: - x (Tensor): Input tensor (#batch, time, idim). - x_mask (Tensor): Input mask (#batch, 1, time). + x (Tensor): + Input tensor (#batch, time, idim). + x_mask (Tensor): + Input mask (#batch, 1, time). Returns: - Tensor: Subsampled tensor (#batch, time', odim), where time' = time // 4. - Tensor: Subsampled mask (#batch, 1, time'), where time' = time // 4. + Tensor: + Subsampled tensor (#batch, time', odim), where time' = time // 4. 
+ Tensor: + Subsampled mask (#batch, 1, time'), where time' = time // 4. """ # (b, c, t, f) x = x.unsqueeze(1) diff --git a/paddlespeech/t2s/modules/upsample.py b/paddlespeech/t2s/modules/upsample.py index 65e78a8928adcab69379c883a00bd1ab90bccbc0..164db65ddbf996cf13ffd0a726fb63ae214da649 100644 --- a/paddlespeech/t2s/modules/upsample.py +++ b/paddlespeech/t2s/modules/upsample.py @@ -28,9 +28,12 @@ class Stretch2D(nn.Layer): """Strech an image (or image-like object) with some interpolation. Args: - w_scale (int): Scalar of width. - h_scale (int): Scalar of the height. - mode (str, optional): Interpolation mode, modes suppored are "nearest", "bilinear", + w_scale (int): + Scalar of width. + h_scale (int): + Scalar of the height. + mode (str, optional): + Interpolation mode, modes suppored are "nearest", "bilinear", "trilinear", "bicubic", "linear" and "area",by default "nearest" For more details about interpolation, see `paddle.nn.functional.interpolate `_. @@ -44,11 +47,12 @@ class Stretch2D(nn.Layer): """ Args: - x (Tensor): Shape (N, C, H, W) + x (Tensor): + Shape (N, C, H, W) Returns: - Tensor: The stretched image. - Shape (N, C, H', W'), where ``H'=h_scale * H``, ``W'=w_scale * W``. + Tensor: + The stretched image. Shape (N, C, H', W'), where ``H'=h_scale * H``, ``W'=w_scale * W``. """ out = F.interpolate( @@ -61,12 +65,18 @@ class UpsampleNet(nn.Layer): convolutions. Args: - upsample_scales (List[int]): Upsampling factors for each strech. 
- nonlinear_activation (Optional[str], optional): Activation after each convolution, by default None - nonlinear_activation_params (Dict[str, Any], optional): Parameters passed to construct the activation, by default {} - interpolate_mode (str, optional): Interpolation mode of the strech, by default "nearest" - freq_axis_kernel_size (int, optional): Convolution kernel size along the frequency axis, by default 1 - use_causal_conv (bool, optional): Whether to use causal padding before convolution, by default False + upsample_scales (List[int]): + Upsampling factors for each strech. + nonlinear_activation (Optional[str], optional): + Activation after each convolution, by default None + nonlinear_activation_params (Dict[str, Any], optional): + Parameters passed to construct the activation, by default {} + interpolate_mode (str, optional): + Interpolation mode of the strech, by default "nearest" + freq_axis_kernel_size (int, optional): + Convolution kernel size along the frequency axis, by default 1 + use_causal_conv (bool, optional): + Whether to use causal padding before convolution, by default False If True, Causal padding is used along the time axis, i.e. padding amount is ``receptive field - 1`` and 0 for before and after, respectively. If False, "same" padding is used along the time axis. @@ -106,7 +116,8 @@ class UpsampleNet(nn.Layer): def forward(self, c): """ Args: - c (Tensor): spectrogram. Shape (N, F, T) + c (Tensor): + spectrogram. Shape (N, F, T) Returns: Tensor: upsampled spectrogram. @@ -126,17 +137,25 @@ class ConvInUpsampleNet(nn.Layer): UpsampleNet. Args: - upsample_scales (List[int]): Upsampling factors for each strech. 
- nonlinear_activation (Optional[str], optional): Activation after each convolution, by default None - nonlinear_activation_params (Dict[str, Any], optional): Parameters passed to construct the activation, by default {} - interpolate_mode (str, optional): Interpolation mode of the strech, by default "nearest" - freq_axis_kernel_size (int, optional): Convolution kernel size along the frequency axis, by default 1 - aux_channels (int, optional): Feature size of the input, by default 80 - aux_context_window (int, optional): Context window of the first 1D convolution applied to the input. It + upsample_scales (List[int]): + Upsampling factors for each stretch. + nonlinear_activation (Optional[str], optional): + Activation after each convolution, by default None + nonlinear_activation_params (Dict[str, Any], optional): + Parameters passed to construct the activation, by default {} + interpolate_mode (str, optional): + Interpolation mode of the stretch, by default "nearest" + freq_axis_kernel_size (int, optional): + Convolution kernel size along the frequency axis, by default 1 + aux_channels (int, optional): + Feature size of the input, by default 80 + aux_context_window (int, optional): + Context window of the first 1D convolution applied to the input. It related to the kernel size of the convolution, by default 0 If use causal convolution, the kernel size is ``window + 1``, else the kernel size is ``2 * window + 1``. - use_causal_conv (bool, optional): Whether to use causal padding before convolution, by default False + use_causal_conv (bool, optional): + Whether to use causal padding before convolution, by default False If True, Causal padding is used along the time axis, i.e. padding amount is ``receptive field - 1`` and 0 for before and after, respectively. If False, "same" padding is used along the time axis. @@ -171,7 +190,8 @@ class ConvInUpsampleNet(nn.Layer): def forward(self, c): """ Args: - c (Tensor): spectrogram. Shape (N, F, T) + c (Tensor): + spectrogram. 
Shape (N, F, T) Returns: Tensors: upsampled spectrogram. Shape (N, F, T'), where ``T' = upsample_factor * T``, diff --git a/paddlespeech/t2s/training/experiment.py b/paddlespeech/t2s/training/experiment.py index 05a363ff204511ae5c18390277612ad69496732f..1eba826df4785739c9fddf749fff0f212b89c48c 100644 --- a/paddlespeech/t2s/training/experiment.py +++ b/paddlespeech/t2s/training/experiment.py @@ -58,8 +58,10 @@ class ExperimentBase(object): need. Args: - config (yacs.config.CfgNode): The configuration used for the experiment. - args (argparse.Namespace): The parsed command line arguments. + config (yacs.config.CfgNode): + The configuration used for the experiment. + args (argparse.Namespace): + The parsed command line arguments. Examples: >>> def main_sp(config, args): diff --git a/paddlespeech/t2s/utils/checkpoint.py b/paddlespeech/t2s/utils/checkpoint.py index 1e222c50c12790f3ef5b63d24a6ebd1483122b1b..a3a19c0a022ae287bd5d9489e6c18683d06f72b9 100644 --- a/paddlespeech/t2s/utils/checkpoint.py +++ b/paddlespeech/t2s/utils/checkpoint.py @@ -25,7 +25,8 @@ def _load_latest_checkpoint(checkpoint_dir: str) -> int: """Get the iteration number corresponding to the latest saved checkpoint. Args: - checkpoint_dir (str): the directory where checkpoint is saved. + checkpoint_dir (str): + the directory where checkpoint is saved. Returns: int: the latest iteration number. @@ -46,8 +47,10 @@ def _save_checkpoint(checkpoint_dir: str, iteration: int): """Save the iteration number of the latest model to be checkpointed. Args: - checkpoint_dir (str): the directory where checkpoint is saved. - iteration (int): the latest iteration number. + checkpoint_dir (str): + the directory where checkpoint is saved. + iteration (int): + the latest iteration number. Returns: None @@ -65,11 +68,14 @@ def load_parameters(model, """Load a specific model checkpoint from disk. Args: - model (Layer): model to load parameters. - optimizer (Optimizer, optional): optimizer to load states if needed. 
- Defaults to None. - checkpoint_dir (str, optional): the directory where checkpoint is saved. - checkpoint_path (str, optional): if specified, load the checkpoint + model (Layer): + model to load parameters. + optimizer (Optimizer, optional): + optimizer to load states if needed. Defaults to None. + checkpoint_dir (str, optional): + the directory where checkpoint is saved. + checkpoint_path (str, optional): + if specified, load the checkpoint stored in the checkpoint_path and the argument 'checkpoint_dir' will be ignored. Defaults to None. @@ -113,11 +119,14 @@ def save_parameters(checkpoint_dir, iteration, model, optimizer=None): """Checkpoint the latest trained model parameters. Args: - checkpoint_dir (str): the directory where checkpoint is saved. - iteration (int): the latest iteration number. - model (Layer): model to be checkpointed. - optimizer (Optimizer, optional): optimizer to be checkpointed. - Defaults to None. + checkpoint_dir (str): + the directory where checkpoint is saved. + iteration (int): + the latest iteration number. + model (Layer): + model to be checkpointed. + optimizer (Optimizer, optional): + optimizer to be checkpointed. Defaults to None. Returns: None diff --git a/paddlespeech/t2s/utils/error_rate.py b/paddlespeech/t2s/utils/error_rate.py index 41b13b75f06eceefa1c35492fece64864037adc7..76a4f45bee0618668725e6ab88dfa223918b219e 100644 --- a/paddlespeech/t2s/utils/error_rate.py +++ b/paddlespeech/t2s/utils/error_rate.py @@ -71,10 +71,14 @@ def word_errors(reference, hypothesis, ignore_case=False, delimiter=' '): hypothesis sequence in word-level. Args: - reference (str): The reference sentence. - hypothesis (str): The hypothesis sentence. - ignore_case (bool): Whether case-sensitive or not. - delimiter (char(str)): Delimiter of input sentences. + reference (str): + The reference sentence. + hypothesis (str): + The hypothesis sentence. + ignore_case (bool): + Whether case-sensitive or not. 
+ delimiter (char(str)): + Delimiter of input sentences. Returns: list: Levenshtein distance and word number of reference sentence. diff --git a/paddlespeech/t2s/utils/h5_utils.py b/paddlespeech/t2s/utils/h5_utils.py index 75c2e448820da8a6dc183e69e5b1e7683f258b28..7558e046a8a7d234cde64eaaa301fc1ec82144f2 100644 --- a/paddlespeech/t2s/utils/h5_utils.py +++ b/paddlespeech/t2s/utils/h5_utils.py @@ -24,8 +24,10 @@ import numpy as np def read_hdf5(filename: Union[Path, str], dataset_name: str) -> Any: """Read a dataset from a HDF5 file. Args: - filename (Union[Path, str]): Path of the HDF5 file. - dataset_name (str): Name of the dataset to read. + filename (Union[Path, str]): + Path of the HDF5 file. + dataset_name (str): + Name of the dataset to read. Returns: Any: The retrieved dataset. diff --git a/paddlespeech/t2s/utils/internals.py b/paddlespeech/t2s/utils/internals.py index 6c10bd2d53ebb944e065ab8fac4fc1ffdfadd994..830e8a80fcb3f7186ba15c416bb310f4a4f17ed1 100644 --- a/paddlespeech/t2s/utils/internals.py +++ b/paddlespeech/t2s/utils/internals.py @@ -22,7 +22,8 @@ def convert_dtype_to_np_dtype_(dtype): Convert paddle's data type to corrsponding numpy data type. Args: - dtype(np.dtype): the data type in paddle. + dtype(np.dtype): + the data type in paddle. Returns: type: the data type in numpy. diff --git a/setup.py b/setup.py index 1df759b8d952b841ae41102df5aa11667aa6a22c..a3ef753a026de9638491e4f42463d5988a38b04c 100644 --- a/setup.py +++ b/setup.py @@ -48,6 +48,7 @@ base = [ "pandas", "paddlenlp", "paddlespeech_feat", + "Pillow>=9.0.0", "praatio==5.0.0", "pypinyin", "pypinyin-dict", @@ -77,7 +78,7 @@ server = [ "fastapi", "uvicorn", "pattern_singleton", - "websockets", + "websockets" ] requirements = { @@ -89,7 +90,6 @@ requirements = { "gpustat", "paddlespeech_ctcdecoders", "phkit", - "Pillow", "pybind11", "pypi-kenlm", "snakeviz",