diff --git a/demos/streaming_asr_server/web/app.py b/demos/streaming_asr_server/web/app.py
deleted file mode 100644
index 22993c08efe9f81b5bddd316b624ee0d6f5ef821..0000000000000000000000000000000000000000
--- a/demos/streaming_asr_server/web/app.py
+++ /dev/null
@@ -1,23 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-# Copyright 2021 Mobvoi Inc. All Rights Reserved.
-# Author: zhendong.peng@mobvoi.com (Zhendong Peng)
-import argparse
-
-from flask import Flask
-from flask import render_template
-
-parser = argparse.ArgumentParser(description='training your network')
-parser.add_argument('--port', default=19999, type=int, help='port id')
-args = parser.parse_args()
-
-app = Flask(__name__)
-
-
-@app.route('/')
-def index():
-    return render_template('index.html')
-
-
-if __name__ == '__main__':
-    app.run(host='0.0.0.0', port=args.port, debug=True)
diff --git a/demos/streaming_asr_server/web/favicon.ico b/demos/streaming_asr_server/web/favicon.ico
new file mode 100644
index 0000000000000000000000000000000000000000..342038720d7c5a8fbbef1110d098e50f7a0e6274
Binary files /dev/null and b/demos/streaming_asr_server/web/favicon.ico differ
diff --git a/demos/streaming_asr_server/web/index.html b/demos/streaming_asr_server/web/index.html
new file mode 100644
index 0000000000000000000000000000000000000000..33c676c55c3cb3618a1388570d45466f0fddc7e3
--- /dev/null
+++ b/demos/streaming_asr_server/web/index.html
@@ -0,0 +1,218 @@
+
+
+
+
+
+    飞桨PaddleSpeech
+
+
+
+
+
+
+
diff --git a/demos/streaming_asr_server/web/paddle_web_demo.png b/demos/streaming_asr_server/web/paddle_web_demo.png
index 214edffd076bd4f6df18b4faa3587239154b958a..db4b63ab9ed39cdc2c4ab75e291ae0dab02859c6 100644
Binary files a/demos/streaming_asr_server/web/paddle_web_demo.png and b/demos/streaming_asr_server/web/paddle_web_demo.png differ
diff --git a/demos/streaming_asr_server/web/readme.md b/demos/streaming_asr_server/web/readme.md
index 8310a25714d99e1015199c0899e2af15eb05c809..bef421711a22d161880077c6dce2d28248cd1612 100644
--- a/demos/streaming_asr_server/web/readme.md
+++ b/demos/streaming_asr_server/web/readme.md
@@ -1,18 +1,20 @@
 # paddlespeech serving web demo
-- Thanks to the [wenet](https://github.com/wenet-e2e/wenet) team for the front-end demo code.
+![image](./paddle_web_demo.png)
+step1: Start the streaming ASR server
-## Usage
-### 1. Start the web service on your local machine
-    ```
-    python app.py
+```
+# Start the streaming ASR service
+cd PaddleSpeech/demos/streaming_asr_server
+paddlespeech_server start --config_file conf/ws_conformer_wenetspeech_application_faster.yaml
+```
-    ```
+step2: Open `index.html` under the `web` directory in Google Chrome
-### 2. Open a browser on your local machine
+step3: Click `连接` (Connect) to verify that the WebSocket connection succeeds
+
+step4: Click `开始录音` (Start Recording) and allow recording when the browser pop-up asks
-Enter 127.0.0.1:19999 in the browser to see the web demo.
-![image](./paddle_web_demo.png)
diff --git a/demos/streaming_asr_server/web/static/css/font-awesome.min.css b/demos/streaming_asr_server/web/static/css/font-awesome.min.css
deleted file mode 100644
index 540440ce89f2a408aa699b65100e18f15e0f09ca..0000000000000000000000000000000000000000
--- a/demos/streaming_asr_server/web/static/css/font-awesome.min.css
+++ /dev/null
@@ -1,4 +0,0 @@
-/*!
- * Font Awesome 4.7.0 by @davegandy - http://fontawesome.io - @fontawesome
- * License - http://fontawesome.io/license (Font: SIL OFL 1.1, CSS: MIT License)
- */@font-face{font-family:'FontAwesome';src:url('../fonts/fontawesome-webfont.eot?v=4.7.0');src:url('../fonts/fontawesome-webfont.eot?#iefix&v=4.7.0') format('embedded-opentype'),url('../fonts/fontawesome-webfont.woff2?v=4.7.0') format('woff2'),url('../fonts/fontawesome-webfont.woff?v=4.7.0') format('woff'),url('../fonts/fontawesome-webfont.ttf?v=4.7.0') format('truetype'),url('../fonts/fontawesome-webfont.svg?v=4.7.0#fontawesomeregular') format('svg');font-weight:normal;font-style:normal}.fa{display:inline-block;font:normal normal normal 14px/1 FontAwesome;font-size:inherit;text-rendering:auto;-webkit-font-smoothing:antialiased;-moz-osx-font-smoothing:grayscale}.fa-lg{font-size:1.33333333em;line-height:.75em;vertical-align:-15%}.fa-2x{font-size:2em}.fa-3x{font-size:3em}.fa-4x{font-size:4em}.fa-5x{font-size:5em}.fa-fw{width:1.28571429em;text-align:center}.fa-ul{padding-left:0;margin-left:2.14285714em;list-style-type:none}.fa-ul>li{position:relative}.fa-li{position:absolute;left:-2.14285714em;width:2.14285714em;top:.14285714em;text-align:center}.fa-li.fa-lg{left:-1.85714286em}.fa-border{padding:.2em .25em .15em;border:solid .08em #eee;border-radius:.1em}.fa-pull-left{float:left}.fa-pull-right{float:right}.fa.fa-pull-left{margin-right:.3em}.fa.fa-pull-right{margin-left:.3em}.pull-right{float:right}.pull-left{float:left}.fa.pull-left{margin-right:.3em}.fa.pull-right{margin-left:.3em}.fa-spin{-webkit-animation:fa-spin 2s infinite linear;animation:fa-spin 2s infinite linear}.fa-pulse{-webkit-animation:fa-spin 1s infinite steps(8);animation:fa-spin 1s infinite steps(8)}@-webkit-keyframes fa-spin{0%{-webkit-transform:rotate(0deg);transform:rotate(0deg)}100%{-webkit-transform:rotate(359deg);transform:rotate(359deg)}}@keyframes
fa-spin{0%{-webkit-transform:rotate(0deg);transform:rotate(0deg)}100%{-webkit-transform:rotate(359deg);transform:rotate(359deg)}}.fa-rotate-90{-ms-filter:"progid:DXImageTransform.Microsoft.BasicImage(rotation=1)";-webkit-transform:rotate(90deg);-ms-transform:rotate(90deg);transform:rotate(90deg)}.fa-rotate-180{-ms-filter:"progid:DXImageTransform.Microsoft.BasicImage(rotation=2)";-webkit-transform:rotate(180deg);-ms-transform:rotate(180deg);transform:rotate(180deg)}.fa-rotate-270{-ms-filter:"progid:DXImageTransform.Microsoft.BasicImage(rotation=3)";-webkit-transform:rotate(270deg);-ms-transform:rotate(270deg);transform:rotate(270deg)}.fa-flip-horizontal{-ms-filter:"progid:DXImageTransform.Microsoft.BasicImage(rotation=0, mirror=1)";-webkit-transform:scale(-1, 1);-ms-transform:scale(-1, 1);transform:scale(-1, 1)}.fa-flip-vertical{-ms-filter:"progid:DXImageTransform.Microsoft.BasicImage(rotation=2, mirror=1)";-webkit-transform:scale(1, -1);-ms-transform:scale(1, -1);transform:scale(1, -1)}:root .fa-rotate-90,:root .fa-rotate-180,:root .fa-rotate-270,:root .fa-flip-horizontal,:root .fa-flip-vertical{filter:none}.fa-stack{position:relative;display:inline-block;width:2em;height:2em;line-height:2em;vertical-align:middle}.fa-stack-1x,.fa-stack-2x{position:absolute;left:0;width:100%;text-align:center}.fa-stack-1x{line-height:inherit}.fa-stack-2x{font-size:2em}.fa-inverse{color:#fff}.fa-glass:before{content:"\f000"}.fa-music:before{content:"\f001"}.fa-search:before{content:"\f002"}.fa-envelope-o:before{content:"\f003"}.fa-heart:before{content:"\f004"}.fa-star:before{content:"\f005"}.fa-star-o:before{content:"\f006"}.fa-user:before{content:"\f007"}.fa-film:before{content:"\f008"}.fa-th-large:before{content:"\f009"}.fa-th:before{content:"\f00a"}.fa-th-list:before{content:"\f00b"}.fa-check:before{content:"\f00c"}.fa-remove:before,.fa-close:before,.fa-times:before{content:"\f00d"}.fa-search-plus:before{content:"\f00e"}.fa-search-minus:before{content:"\f010"}.fa-power-off:before{content:"\f011"}.fa-signal:before{content:"\f012"}.fa-gear:before,.fa-cog:before{content:"\f013"}.fa-trash-o:before{content:"\f014"}.fa-home:before{content:"\f015"}.fa-file-o:before{content:"\f016"}.fa-clock-o:before{content:"\f017"}.fa-road:before{content:"\f018"}.fa-download:before{content:"\f019"}.fa-arrow-circle-o-down:before{content:"\f01a"}.fa-arrow-circle-o-up:before{content:"\f01b"}.fa-inbox:before{content:"\f01c"}.fa-play-circle-o:before{content:"\f01d"}.fa-rotate-right:before,.fa-repeat:before{content:"\f01e"}.fa-refresh:before{content:"\f021"}.fa-list-alt:before{content:"\f022"}.fa-lock:before{content:"\f023"}.fa-flag:before{content:"\f024"}.fa-headphones:before{content:"\f025"}.fa-volume-off:before{content:"\f026"}.fa-volume-down:before{content:"\f027"}.fa-volume-up:before{content:"\f028"}.fa-qrcode:before{content:"\f029"}.fa-barcode:before{content:"\f02a"}.fa-tag:before{content:"\f02b"}.fa-tags:before{content:"\f02c"}.fa-book:before{content:"\f02d"}.fa-bookmark:before{content:"\f02e"}.fa-print:before{content:"\f02f"}.fa-camera:before{content:"\f030"}.fa-font:before{content:"\f031"}.fa-bold:before{content:"\f032"}.fa-italic:before{content:"\f033"}.fa-text-height:before{content:"\f034"}.fa-text-width:before{content:"\f035"}.fa-align-left:before{content:"\f036"}.fa-align-center:before{content:"\f037"}.fa-align-right:before{content:"\f038"}.fa-align-justify:before{content:"\f039"}.fa-list:before{content:"\f03a"}.fa-dedent:before,.fa-outdent:before{content:"\f03b"}.fa-indent:before{content:"\f03c"}.fa-video-camera:before{c
ontent:"\f03d"}.fa-photo:before,.fa-image:before,.fa-picture-o:before{content:"\f03e"}.fa-pencil:before{content:"\f040"}.fa-map-marker:before{content:"\f041"}.fa-adjust:before{content:"\f042"}.fa-tint:before{content:"\f043"}.fa-edit:before,.fa-pencil-square-o:before{content:"\f044"}.fa-share-square-o:before{content:"\f045"}.fa-check-square-o:before{content:"\f046"}.fa-arrows:before{content:"\f047"}.fa-step-backward:before{content:"\f048"}.fa-fast-backward:before{content:"\f049"}.fa-backward:before{content:"\f04a"}.fa-play:before{content:"\f04b"}.fa-pause:before{content:"\f04c"}.fa-stop:before{content:"\f04d"}.fa-forward:before{content:"\f04e"}.fa-fast-forward:before{content:"\f050"}.fa-step-forward:before{content:"\f051"}.fa-eject:before{content:"\f052"}.fa-chevron-left:before{content:"\f053"}.fa-chevron-right:before{content:"\f054"}.fa-plus-circle:before{content:"\f055"}.fa-minus-circle:before{content:"\f056"}.fa-times-circle:before{content:"\f057"}.fa-check-circle:before{content:"\f058"}.fa-question-circle:before{content:"\f059"}.fa-info-circle:before{content:"\f05a"}.fa-crosshairs:before{content:"\f05b"}.fa-times-circle-o:before{content:"\f05c"}.fa-check-circle-o:before{content:"\f05d"}.fa-ban:before{content:"\f05e"}.fa-arrow-left:before{content:"\f060"}.fa-arrow-right:before{content:"\f061"}.fa-arrow-up:before{content:"\f062"}.fa-arrow-down:before{content:"\f063"}.fa-mail-forward:before,.fa-share:before{content:"\f064"}.fa-expand:before{content:"\f065"}.fa-compress:before{content:"\f066"}.fa-plus:before{content:"\f067"}.fa-minus:before{content:"\f068"}.fa-asterisk:before{content:"\f069"}.fa-exclamation-circle:before{content:"\f06a"}.fa-gift:before{content:"\f06b"}.fa-leaf:before{content:"\f06c"}.fa-fire:before{content:"\f06d"}.fa-eye:before{content:"\f06e"}.fa-eye-slash:before{content:"\f070"}.fa-warning:before,.fa-exclamation-triangle:before{content:"\f071"}.fa-plane:before{content:"\f072"}.fa-calendar:before{content:"\f073"}.fa-random:before{content:"\f074"}.fa-comment:before{content:"\f075"}.fa-magnet:before{content:"\f076"}.fa-chevron-up:before{content:"\f077"}.fa-chevron-down:before{content:"\f078"}.fa-retweet:before{content:"\f079"}.fa-shopping-cart:before{content:"\f07a"}.fa-folder:before{content:"\f07b"}.fa-folder-open:before{content:"\f07c"}.fa-arrows-v:before{content:"\f07d"}.fa-arrows-h:before{content:"\f07e"}.fa-bar-chart-o:before,.fa-bar-chart:before{content:"\f080"}.fa-twitter-square:before{content:"\f081"}.fa-facebook-square:before{content:"\f082"}.fa-camera-retro:before{content:"\f083"}.fa-key:before{content:"\f084"}.fa-gears:before,.fa-cogs:before{content:"\f085"}.fa-comments:before{content:"\f086"}.fa-thumbs-o-up:before{content:"\f087"}.fa-thumbs-o-down:before{content:"\f088"}.fa-star-half:before{content:"\f089"}.fa-heart-o:before{content:"\f08a"}.fa-sign-out:before{content:"\f08b"}.fa-linkedin-square:before{content:"\f08c"}.fa-thumb-tack:before{content:"\f08d"}.fa-external-link:before{content:"\f08e"}.fa-sign-in:before{content:"\f090"}.fa-trophy:before{content:"\f091"}.fa-github-square:before{content:"\f092"}.fa-upload:before{content:"\f093"}.fa-lemon-o:before{content:"\f094"}.fa-phone:before{content:"\f095"}.fa-square-o:before{content:"\f096"}.fa-bookmark-o:before{content:"\f097"}.fa-phone-square:before{content:"\f098"}.fa-twitter:before{content:"\f099"}.fa-facebook-f:before,.fa-facebook:before{content:"\f09a"}.fa-github:before{content:"\f09b"}.fa-unlock:before{content:"\f09c"}.fa-credit-card:before{content:"\f09d"}.fa-feed:before,.fa-rss:before{content:"\f09e"}.fa-h
dd-o:before{content:"\f0a0"}.fa-bullhorn:before{content:"\f0a1"}.fa-bell:before{content:"\f0f3"}.fa-certificate:before{content:"\f0a3"}.fa-hand-o-right:before{content:"\f0a4"}.fa-hand-o-left:before{content:"\f0a5"}.fa-hand-o-up:before{content:"\f0a6"}.fa-hand-o-down:before{content:"\f0a7"}.fa-arrow-circle-left:before{content:"\f0a8"}.fa-arrow-circle-right:before{content:"\f0a9"}.fa-arrow-circle-up:before{content:"\f0aa"}.fa-arrow-circle-down:before{content:"\f0ab"}.fa-globe:before{content:"\f0ac"}.fa-wrench:before{content:"\f0ad"}.fa-tasks:before{content:"\f0ae"}.fa-filter:before{content:"\f0b0"}.fa-briefcase:before{content:"\f0b1"}.fa-arrows-alt:before{content:"\f0b2"}.fa-group:before,.fa-users:before{content:"\f0c0"}.fa-chain:before,.fa-link:before{content:"\f0c1"}.fa-cloud:before{content:"\f0c2"}.fa-flask:before{content:"\f0c3"}.fa-cut:before,.fa-scissors:before{content:"\f0c4"}.fa-copy:before,.fa-files-o:before{content:"\f0c5"}.fa-paperclip:before{content:"\f0c6"}.fa-save:before,.fa-floppy-o:before{content:"\f0c7"}.fa-square:before{content:"\f0c8"}.fa-navicon:before,.fa-reorder:before,.fa-bars:before{content:"\f0c9"}.fa-list-ul:before{content:"\f0ca"}.fa-list-ol:before{content:"\f0cb"}.fa-strikethrough:before{content:"\f0cc"}.fa-underline:before{content:"\f0cd"}.fa-table:before{content:"\f0ce"}.fa-magic:before{content:"\f0d0"}.fa-truck:before{content:"\f0d1"}.fa-pinterest:before{content:"\f0d2"}.fa-pinterest-square:before{content:"\f0d3"}.fa-google-plus-square:before{content:"\f0d4"}.fa-google-plus:before{content:"\f0d5"}.fa-money:before{content:"\f0d6"}.fa-caret-down:before{content:"\f0d7"}.fa-caret-up:before{content:"\f0d8"}.fa-caret-left:before{content:"\f0d9"}.fa-caret-right:before{content:"\f0da"}.fa-columns:before{content:"\f0db"}.fa-unsorted:before,.fa-sort:before{content:"\f0dc"}.fa-sort-down:before,.fa-sort-desc:before{content:"\f0dd"}.fa-sort-up:before,.fa-sort-asc:before{content:"\f0de"}.fa-envelope:before{content:"\f0e0"}.fa-linkedin:before{content:"\f0e1"}.fa-rotate-left:before,.fa-undo:before{content:"\f0e2"}.fa-legal:before,.fa-gavel:before{content:"\f0e3"}.fa-dashboard:before,.fa-tachometer:before{content:"\f0e4"}.fa-comment-o:before{content:"\f0e5"}.fa-comments-o:before{content:"\f0e6"}.fa-flash:before,.fa-bolt:before{content:"\f0e7"}.fa-sitemap:before{content:"\f0e8"}.fa-umbrella:before{content:"\f0e9"}.fa-paste:before,.fa-clipboard:before{content:"\f0ea"}.fa-lightbulb-o:before{content:"\f0eb"}.fa-exchange:before{content:"\f0ec"}.fa-cloud-download:before{content:"\f0ed"}.fa-cloud-upload:before{content:"\f0ee"}.fa-user-md:before{content:"\f0f0"}.fa-stethoscope:before{content:"\f0f1"}.fa-suitcase:before{content:"\f0f2"}.fa-bell-o:before{content:"\f0a2"}.fa-coffee:before{content:"\f0f4"}.fa-cutlery:before{content:"\f0f5"}.fa-file-text-o:before{content:"\f0f6"}.fa-building-o:before{content:"\f0f7"}.fa-hospital-o:before{content:"\f0f8"}.fa-ambulance:before{content:"\f0f9"}.fa-medkit:before{content:"\f0fa"}.fa-fighter-jet:before{content:"\f0fb"}.fa-beer:before{content:"\f0fc"}.fa-h-square:before{content:"\f0fd"}.fa-plus-square:before{content:"\f0fe"}.fa-angle-double-left:before{content:"\f100"}.fa-angle-double-right:before{content:"\f101"}.fa-angle-double-up:before{content:"\f102"}.fa-angle-double-down:before{content:"\f103"}.fa-angle-left:before{content:"\f104"}.fa-angle-right:before{content:"\f105"}.fa-angle-up:before{content:"\f106"}.fa-angle-down:before{content:"\f107"}.fa-desktop:before{content:"\f108"}.fa-laptop:before{content:"\f109"}.fa-tablet:before{content:"\f10a"}
.fa-mobile-phone:before,.fa-mobile:before{content:"\f10b"}.fa-circle-o:before{content:"\f10c"}.fa-quote-left:before{content:"\f10d"}.fa-quote-right:before{content:"\f10e"}.fa-spinner:before{content:"\f110"}.fa-circle:before{content:"\f111"}.fa-mail-reply:before,.fa-reply:before{content:"\f112"}.fa-github-alt:before{content:"\f113"}.fa-folder-o:before{content:"\f114"}.fa-folder-open-o:before{content:"\f115"}.fa-smile-o:before{content:"\f118"}.fa-frown-o:before{content:"\f119"}.fa-meh-o:before{content:"\f11a"}.fa-gamepad:before{content:"\f11b"}.fa-keyboard-o:before{content:"\f11c"}.fa-flag-o:before{content:"\f11d"}.fa-flag-checkered:before{content:"\f11e"}.fa-terminal:before{content:"\f120"}.fa-code:before{content:"\f121"}.fa-mail-reply-all:before,.fa-reply-all:before{content:"\f122"}.fa-star-half-empty:before,.fa-star-half-full:before,.fa-star-half-o:before{content:"\f123"}.fa-location-arrow:before{content:"\f124"}.fa-crop:before{content:"\f125"}.fa-code-fork:before{content:"\f126"}.fa-unlink:before,.fa-chain-broken:before{content:"\f127"}.fa-question:before{content:"\f128"}.fa-info:before{content:"\f129"}.fa-exclamation:before{content:"\f12a"}.fa-superscript:before{content:"\f12b"}.fa-subscript:before{content:"\f12c"}.fa-eraser:before{content:"\f12d"}.fa-puzzle-piece:before{content:"\f12e"}.fa-microphone:before{content:"\f130"}.fa-microphone-slash:before{content:"\f131"}.fa-shield:before{content:"\f132"}.fa-calendar-o:before{content:"\f133"}.fa-fire-extinguisher:before{content:"\f134"}.fa-rocket:before{content:"\f135"}.fa-maxcdn:before{content:"\f136"}.fa-chevron-circle-left:before{content:"\f137"}.fa-chevron-circle-right:before{content:"\f138"}.fa-chevron-circle-up:before{content:"\f139"}.fa-chevron-circle-down:before{content:"\f13a"}.fa-html5:before{content:"\f13b"}.fa-css3:before{content:"\f13c"}.fa-anchor:before{content:"\f13d"}.fa-unlock-alt:before{content:"\f13e"}.fa-bullseye:before{content:"\f140"}.fa-ellipsis-h:before{content:"\f141"}.fa-ellipsis-v:before{content:"\f142"}.fa-rss-square:before{content:"\f143"}.fa-play-circle:before{content:"\f144"}.fa-ticket:before{content:"\f145"}.fa-minus-square:before{content:"\f146"}.fa-minus-square-o:before{content:"\f147"}.fa-level-up:before{content:"\f148"}.fa-level-down:before{content:"\f149"}.fa-check-square:before{content:"\f14a"}.fa-pencil-square:before{content:"\f14b"}.fa-external-link-square:before{content:"\f14c"}.fa-share-square:before{content:"\f14d"}.fa-compass:before{content:"\f14e"}.fa-toggle-down:before,.fa-caret-square-o-down:before{content:"\f150"}.fa-toggle-up:before,.fa-caret-square-o-up:before{content:"\f151"}.fa-toggle-right:before,.fa-caret-square-o-right:before{content:"\f152"}.fa-euro:before,.fa-eur:before{content:"\f153"}.fa-gbp:before{content:"\f154"}.fa-dollar:before,.fa-usd:before{content:"\f155"}.fa-rupee:before,.fa-inr:before{content:"\f156"}.fa-cny:before,.fa-rmb:before,.fa-yen:before,.fa-jpy:before{content:"\f157"}.fa-ruble:before,.fa-rouble:before,.fa-rub:before{content:"\f158"}.fa-won:before,.fa-krw:before{content:"\f159"}.fa-bitcoin:before,.fa-btc:before{content:"\f15a"}.fa-file:before{content:"\f15b"}.fa-file-text:before{content:"\f15c"}.fa-sort-alpha-asc:before{content:"\f15d"}.fa-sort-alpha-desc:before{content:"\f15e"}.fa-sort-amount-asc:before{content:"\f160"}.fa-sort-amount-desc:before{content:"\f161"}.fa-sort-numeric-asc:before{content:"\f162"}.fa-sort-numeric-desc:before{content:"\f163"}.fa-thumbs-up:before{content:"\f164"}.fa-thumbs-down:before{content:"\f165"}.fa-youtube-square:before{content:"\f166"}.
fa-youtube:before{content:"\f167"}.fa-xing:before{content:"\f168"}.fa-xing-square:before{content:"\f169"}.fa-youtube-play:before{content:"\f16a"}.fa-dropbox:before{content:"\f16b"}.fa-stack-overflow:before{content:"\f16c"}.fa-instagram:before{content:"\f16d"}.fa-flickr:before{content:"\f16e"}.fa-adn:before{content:"\f170"}.fa-bitbucket:before{content:"\f171"}.fa-bitbucket-square:before{content:"\f172"}.fa-tumblr:before{content:"\f173"}.fa-tumblr-square:before{content:"\f174"}.fa-long-arrow-down:before{content:"\f175"}.fa-long-arrow-up:before{content:"\f176"}.fa-long-arrow-left:before{content:"\f177"}.fa-long-arrow-right:before{content:"\f178"}.fa-apple:before{content:"\f179"}.fa-windows:before{content:"\f17a"}.fa-android:before{content:"\f17b"}.fa-linux:before{content:"\f17c"}.fa-dribbble:before{content:"\f17d"}.fa-skype:before{content:"\f17e"}.fa-foursquare:before{content:"\f180"}.fa-trello:before{content:"\f181"}.fa-female:before{content:"\f182"}.fa-male:before{content:"\f183"}.fa-gittip:before,.fa-gratipay:before{content:"\f184"}.fa-sun-o:before{content:"\f185"}.fa-moon-o:before{content:"\f186"}.fa-archive:before{content:"\f187"}.fa-bug:before{content:"\f188"}.fa-vk:before{content:"\f189"}.fa-weibo:before{content:"\f18a"}.fa-renren:before{content:"\f18b"}.fa-pagelines:before{content:"\f18c"}.fa-stack-exchange:before{content:"\f18d"}.fa-arrow-circle-o-right:before{content:"\f18e"}.fa-arrow-circle-o-left:before{content:"\f190"}.fa-toggle-left:before,.fa-caret-square-o-left:before{content:"\f191"}.fa-dot-circle-o:before{content:"\f192"}.fa-wheelchair:before{content:"\f193"}.fa-vimeo-square:before{content:"\f194"}.fa-turkish-lira:before,.fa-try:before{content:"\f195"}.fa-plus-square-o:before{content:"\f196"}.fa-space-shuttle:before{content:"\f197"}.fa-slack:before{content:"\f198"}.fa-envelope-square:before{content:"\f199"}.fa-wordpress:before{content:"\f19a"}.fa-openid:before{content:"\f19b"}.fa-institution:before,.fa-bank:before,.fa-university:before{content:"\f19c"}.fa-mortar-board:before,.fa-graduation-cap:before{content:"\f19d"}.fa-yahoo:before{content:"\f19e"}.fa-google:before{content:"\f1a0"}.fa-reddit:before{content:"\f1a1"}.fa-reddit-square:before{content:"\f1a2"}.fa-stumbleupon-circle:before{content:"\f1a3"}.fa-stumbleupon:before{content:"\f1a4"}.fa-delicious:before{content:"\f1a5"}.fa-digg:before{content:"\f1a6"}.fa-pied-piper-pp:before{content:"\f1a7"}.fa-pied-piper-alt:before{content:"\f1a8"}.fa-drupal:before{content:"\f1a9"}.fa-joomla:before{content:"\f1aa"}.fa-language:before{content:"\f1ab"}.fa-fax:before{content:"\f1ac"}.fa-building:before{content:"\f1ad"}.fa-child:before{content:"\f1ae"}.fa-paw:before{content:"\f1b0"}.fa-spoon:before{content:"\f1b1"}.fa-cube:before{content:"\f1b2"}.fa-cubes:before{content:"\f1b3"}.fa-behance:before{content:"\f1b4"}.fa-behance-square:before{content:"\f1b5"}.fa-steam:before{content:"\f1b6"}.fa-steam-square:before{content:"\f1b7"}.fa-recycle:before{content:"\f1b8"}.fa-automobile:before,.fa-car:before{content:"\f1b9"}.fa-cab:before,.fa-taxi:before{content:"\f1ba"}.fa-tree:before{content:"\f1bb"}.fa-spotify:before{content:"\f1bc"}.fa-deviantart:before{content:"\f1bd"}.fa-soundcloud:before{content:"\f1be"}.fa-database:before{content:"\f1c0"}.fa-file-pdf-o:before{content:"\f1c1"}.fa-file-word-o:before{content:"\f1c2"}.fa-file-excel-o:before{content:"\f1c3"}.fa-file-powerpoint-o:before{content:"\f1c4"}.fa-file-photo-o:before,.fa-file-picture-o:before,.fa-file-image-o:before{content:"\f1c5"}.fa-file-zip-o:before,.fa-file-archive-o:before{content:"\f1
c6"}.fa-file-sound-o:before,.fa-file-audio-o:before{content:"\f1c7"}.fa-file-movie-o:before,.fa-file-video-o:before{content:"\f1c8"}.fa-file-code-o:before{content:"\f1c9"}.fa-vine:before{content:"\f1ca"}.fa-codepen:before{content:"\f1cb"}.fa-jsfiddle:before{content:"\f1cc"}.fa-life-bouy:before,.fa-life-buoy:before,.fa-life-saver:before,.fa-support:before,.fa-life-ring:before{content:"\f1cd"}.fa-circle-o-notch:before{content:"\f1ce"}.fa-ra:before,.fa-resistance:before,.fa-rebel:before{content:"\f1d0"}.fa-ge:before,.fa-empire:before{content:"\f1d1"}.fa-git-square:before{content:"\f1d2"}.fa-git:before{content:"\f1d3"}.fa-y-combinator-square:before,.fa-yc-square:before,.fa-hacker-news:before{content:"\f1d4"}.fa-tencent-weibo:before{content:"\f1d5"}.fa-qq:before{content:"\f1d6"}.fa-wechat:before,.fa-weixin:before{content:"\f1d7"}.fa-send:before,.fa-paper-plane:before{content:"\f1d8"}.fa-send-o:before,.fa-paper-plane-o:before{content:"\f1d9"}.fa-history:before{content:"\f1da"}.fa-circle-thin:before{content:"\f1db"}.fa-header:before{content:"\f1dc"}.fa-paragraph:before{content:"\f1dd"}.fa-sliders:before{content:"\f1de"}.fa-share-alt:before{content:"\f1e0"}.fa-share-alt-square:before{content:"\f1e1"}.fa-bomb:before{content:"\f1e2"}.fa-soccer-ball-o:before,.fa-futbol-o:before{content:"\f1e3"}.fa-tty:before{content:"\f1e4"}.fa-binoculars:before{content:"\f1e5"}.fa-plug:before{content:"\f1e6"}.fa-slideshare:before{content:"\f1e7"}.fa-twitch:before{content:"\f1e8"}.fa-yelp:before{content:"\f1e9"}.fa-newspaper-o:before{content:"\f1ea"}.fa-wifi:before{content:"\f1eb"}.fa-calculator:before{content:"\f1ec"}.fa-paypal:before{content:"\f1ed"}.fa-google-wallet:before{content:"\f1ee"}.fa-cc-visa:before{content:"\f1f0"}.fa-cc-mastercard:before{content:"\f1f1"}.fa-cc-discover:before{content:"\f1f2"}.fa-cc-amex:before{content:"\f1f3"}.fa-cc-paypal:before{content:"\f1f4"}.fa-cc-stripe:before{content:"\f1f5"}.fa-bell-slash:before{content:"\f1f6"}.fa-bell-slash-o:before{content:"\f1f7"}.fa-trash:before{content:"\f1f8"}.fa-copyright:before{content:"\f1f9"}.fa-at:before{content:"\f1fa"}.fa-eyedropper:before{content:"\f1fb"}.fa-paint-brush:before{content:"\f1fc"}.fa-birthday-cake:before{content:"\f1fd"}.fa-area-chart:before{content:"\f1fe"}.fa-pie-chart:before{content:"\f200"}.fa-line-chart:before{content:"\f201"}.fa-lastfm:before{content:"\f202"}.fa-lastfm-square:before{content:"\f203"}.fa-toggle-off:before{content:"\f204"}.fa-toggle-on:before{content:"\f205"}.fa-bicycle:before{content:"\f206"}.fa-bus:before{content:"\f207"}.fa-ioxhost:before{content:"\f208"}.fa-angellist:before{content:"\f209"}.fa-cc:before{content:"\f20a"}.fa-shekel:before,.fa-sheqel:before,.fa-ils:before{content:"\f20b"}.fa-meanpath:before{content:"\f20c"}.fa-buysellads:before{content:"\f20d"}.fa-connectdevelop:before{content:"\f20e"}.fa-dashcube:before{content:"\f210"}.fa-forumbee:before{content:"\f211"}.fa-leanpub:before{content:"\f212"}.fa-sellsy:before{content:"\f213"}.fa-shirtsinbulk:before{content:"\f214"}.fa-simplybuilt:before{content:"\f215"}.fa-skyatlas:before{content:"\f216"}.fa-cart-plus:before{content:"\f217"}.fa-cart-arrow-down:before{content:"\f218"}.fa-diamond:before{content:"\f219"}.fa-ship:before{content:"\f21a"}.fa-user-secret:before{content:"\f21b"}.fa-motorcycle:before{content:"\f21c"}.fa-street-view:before{content:"\f21d"}.fa-heartbeat:before{content:"\f21e"}.fa-venus:before{content:"\f221"}.fa-mars:before{content:"\f222"}.fa-mercury:before{content:"\f223"}.fa-intersex:before,.fa-transgender:before{content:"\f224"}.fa-transgend
er-alt:before{content:"\f225"}.fa-venus-double:before{content:"\f226"}.fa-mars-double:before{content:"\f227"}.fa-venus-mars:before{content:"\f228"}.fa-mars-stroke:before{content:"\f229"}.fa-mars-stroke-v:before{content:"\f22a"}.fa-mars-stroke-h:before{content:"\f22b"}.fa-neuter:before{content:"\f22c"}.fa-genderless:before{content:"\f22d"}.fa-facebook-official:before{content:"\f230"}.fa-pinterest-p:before{content:"\f231"}.fa-whatsapp:before{content:"\f232"}.fa-server:before{content:"\f233"}.fa-user-plus:before{content:"\f234"}.fa-user-times:before{content:"\f235"}.fa-hotel:before,.fa-bed:before{content:"\f236"}.fa-viacoin:before{content:"\f237"}.fa-train:before{content:"\f238"}.fa-subway:before{content:"\f239"}.fa-medium:before{content:"\f23a"}.fa-yc:before,.fa-y-combinator:before{content:"\f23b"}.fa-optin-monster:before{content:"\f23c"}.fa-opencart:before{content:"\f23d"}.fa-expeditedssl:before{content:"\f23e"}.fa-battery-4:before,.fa-battery:before,.fa-battery-full:before{content:"\f240"}.fa-battery-3:before,.fa-battery-three-quarters:before{content:"\f241"}.fa-battery-2:before,.fa-battery-half:before{content:"\f242"}.fa-battery-1:before,.fa-battery-quarter:before{content:"\f243"}.fa-battery-0:before,.fa-battery-empty:before{content:"\f244"}.fa-mouse-pointer:before{content:"\f245"}.fa-i-cursor:before{content:"\f246"}.fa-object-group:before{content:"\f247"}.fa-object-ungroup:before{content:"\f248"}.fa-sticky-note:before{content:"\f249"}.fa-sticky-note-o:before{content:"\f24a"}.fa-cc-jcb:before{content:"\f24b"}.fa-cc-diners-club:before{content:"\f24c"}.fa-clone:before{content:"\f24d"}.fa-balance-scale:before{content:"\f24e"}.fa-hourglass-o:before{content:"\f250"}.fa-hourglass-1:before,.fa-hourglass-start:before{content:"\f251"}.fa-hourglass-2:before,.fa-hourglass-half:before{content:"\f252"}.fa-hourglass-3:before,.fa-hourglass-end:before{content:"\f253"}.fa-hourglass:before{content:"\f254"}.fa-hand-grab-o:before,.fa-hand-rock-o:before{content:"\f255"}.fa-hand-stop-o:before,.fa-hand-paper-o:before{content:"\f256"}.fa-hand-scissors-o:before{content:"\f257"}.fa-hand-lizard-o:before{content:"\f258"}.fa-hand-spock-o:before{content:"\f259"}.fa-hand-pointer-o:before{content:"\f25a"}.fa-hand-peace-o:before{content:"\f25b"}.fa-trademark:before{content:"\f25c"}.fa-registered:before{content:"\f25d"}.fa-creative-commons:before{content:"\f25e"}.fa-gg:before{content:"\f260"}.fa-gg-circle:before{content:"\f261"}.fa-tripadvisor:before{content:"\f262"}.fa-odnoklassniki:before{content:"\f263"}.fa-odnoklassniki-square:before{content:"\f264"}.fa-get-pocket:before{content:"\f265"}.fa-wikipedia-w:before{content:"\f266"}.fa-safari:before{content:"\f267"}.fa-chrome:before{content:"\f268"}.fa-firefox:before{content:"\f269"}.fa-opera:before{content:"\f26a"}.fa-internet-explorer:before{content:"\f26b"}.fa-tv:before,.fa-television:before{content:"\f26c"}.fa-contao:before{content:"\f26d"}.fa-500px:before{content:"\f26e"}.fa-amazon:before{content:"\f270"}.fa-calendar-plus-o:before{content:"\f271"}.fa-calendar-minus-o:before{content:"\f272"}.fa-calendar-times-o:before{content:"\f273"}.fa-calendar-check-o:before{content:"\f274"}.fa-industry:before{content:"\f275"}.fa-map-pin:before{content:"\f276"}.fa-map-signs:before{content:"\f277"}.fa-map-o:before{content:"\f278"}.fa-map:before{content:"\f279"}.fa-commenting:before{content:"\f27a"}.fa-commenting-o:before{content:"\f27b"}.fa-houzz:before{content:"\f27c"}.fa-vimeo:before{content:"\f27d"}.fa-black-tie:before{content:"\f27e"}.fa-fonticons:before{content:"\f280"}.fa-reddit-a
lien:before{content:"\f281"}.fa-edge:before{content:"\f282"}.fa-credit-card-alt:before{content:"\f283"}.fa-codiepie:before{content:"\f284"}.fa-modx:before{content:"\f285"}.fa-fort-awesome:before{content:"\f286"}.fa-usb:before{content:"\f287"}.fa-product-hunt:before{content:"\f288"}.fa-mixcloud:before{content:"\f289"}.fa-scribd:before{content:"\f28a"}.fa-pause-circle:before{content:"\f28b"}.fa-pause-circle-o:before{content:"\f28c"}.fa-stop-circle:before{content:"\f28d"}.fa-stop-circle-o:before{content:"\f28e"}.fa-shopping-bag:before{content:"\f290"}.fa-shopping-basket:before{content:"\f291"}.fa-hashtag:before{content:"\f292"}.fa-bluetooth:before{content:"\f293"}.fa-bluetooth-b:before{content:"\f294"}.fa-percent:before{content:"\f295"}.fa-gitlab:before{content:"\f296"}.fa-wpbeginner:before{content:"\f297"}.fa-wpforms:before{content:"\f298"}.fa-envira:before{content:"\f299"}.fa-universal-access:before{content:"\f29a"}.fa-wheelchair-alt:before{content:"\f29b"}.fa-question-circle-o:before{content:"\f29c"}.fa-blind:before{content:"\f29d"}.fa-audio-description:before{content:"\f29e"}.fa-volume-control-phone:before{content:"\f2a0"}.fa-braille:before{content:"\f2a1"}.fa-assistive-listening-systems:before{content:"\f2a2"}.fa-asl-interpreting:before,.fa-american-sign-language-interpreting:before{content:"\f2a3"}.fa-deafness:before,.fa-hard-of-hearing:before,.fa-deaf:before{content:"\f2a4"}.fa-glide:before{content:"\f2a5"}.fa-glide-g:before{content:"\f2a6"}.fa-signing:before,.fa-sign-language:before{content:"\f2a7"}.fa-low-vision:before{content:"\f2a8"}.fa-viadeo:before{content:"\f2a9"}.fa-viadeo-square:before{content:"\f2aa"}.fa-snapchat:before{content:"\f2ab"}.fa-snapchat-ghost:before{content:"\f2ac"}.fa-snapchat-square:before{content:"\f2ad"}.fa-pied-piper:before{content:"\f2ae"}.fa-first-order:before{content:"\f2b0"}.fa-yoast:before{content:"\f2b1"}.fa-themeisle:before{content:"\f2b2"}.fa-google-plus-circle:before,.fa-google-plus-official:before{content:"\f2b3"}.fa-fa:before,.fa-font-awesome:before{content:"\f2b4"}.fa-handshake-o:before{content:"\f2b5"}.fa-envelope-open:before{content:"\f2b6"}.fa-envelope-open-o:before{content:"\f2b7"}.fa-linode:before{content:"\f2b8"}.fa-address-book:before{content:"\f2b9"}.fa-address-book-o:before{content:"\f2ba"}.fa-vcard:before,.fa-address-card:before{content:"\f2bb"}.fa-vcard-o:before,.fa-address-card-o:before{content:"\f2bc"}.fa-user-circle:before{content:"\f2bd"}.fa-user-circle-o:before{content:"\f2be"}.fa-user-o:before{content:"\f2c0"}.fa-id-badge:before{content:"\f2c1"}.fa-drivers-license:before,.fa-id-card:before{content:"\f2c2"}.fa-drivers-license-o:before,.fa-id-card-o:before{content:"\f2c3"}.fa-quora:before{content:"\f2c4"}.fa-free-code-camp:before{content:"\f2c5"}.fa-telegram:before{content:"\f2c6"}.fa-thermometer-4:before,.fa-thermometer:before,.fa-thermometer-full:before{content:"\f2c7"}.fa-thermometer-3:before,.fa-thermometer-three-quarters:before{content:"\f2c8"}.fa-thermometer-2:before,.fa-thermometer-half:before{content:"\f2c9"}.fa-thermometer-1:before,.fa-thermometer-quarter:before{content:"\f2ca"}.fa-thermometer-0:before,.fa-thermometer-empty:before{content:"\f2cb"}.fa-shower:before{content:"\f2cc"}.fa-bathtub:before,.fa-s15:before,.fa-bath:before{content:"\f2cd"}.fa-podcast:before{content:"\f2ce"}.fa-window-maximize:before{content:"\f2d0"}.fa-window-minimize:before{content:"\f2d1"}.fa-window-restore:before{content:"\f2d2"}.fa-times-rectangle:before,.fa-window-close:before{content:"\f2d3"}.fa-times-rectangle-o:before,.fa-window-close-o:before{
content:"\f2d4"}.fa-bandcamp:before{content:"\f2d5"}.fa-grav:before{content:"\f2d6"}.fa-etsy:before{content:"\f2d7"}.fa-imdb:before{content:"\f2d8"}.fa-ravelry:before{content:"\f2d9"}.fa-eercast:before{content:"\f2da"}.fa-microchip:before{content:"\f2db"}.fa-snowflake-o:before{content:"\f2dc"}.fa-superpowers:before{content:"\f2dd"}.fa-wpexplorer:before{content:"\f2de"}.fa-meetup:before{content:"\f2e0"}.sr-only{position:absolute;width:1px;height:1px;padding:0;margin:-1px;overflow:hidden;clip:rect(0, 0, 0, 0);border:0}.sr-only-focusable:active,.sr-only-focusable:focus{position:static;width:auto;height:auto;margin:0;overflow:visible;clip:auto} diff --git a/demos/streaming_asr_server/web/static/css/style.css b/demos/streaming_asr_server/web/static/css/style.css deleted file mode 100644 index a3040718b8f1caa8fed98832b8c82778b0003a9f..0000000000000000000000000000000000000000 --- a/demos/streaming_asr_server/web/static/css/style.css +++ /dev/null @@ -1,453 +0,0 @@ -/* -* @Author: baipengxia -* @Date: 2021-03-12 11:44:28 -* @Last Modified by: baipengxia -* @Last Modified time: 2021-03-12 15:14:24 -*/ - -/** COMMON RESET **/ -* { - -webkit-tap-highlight-color: rgba(0, 0, 0, 0); -} - -body, -h1, -h2, -h3, -h4, -h5, -h6, -hr, -p, -dl, -dt, -dd, -ul, -ol, -li, -fieldset, -lengend, -button, -input, -textarea, -th, -td { - margin: 0; - padding: 0; - color: #000; -} - -body { - font-size: 14px; -} -html, body { - min-width: 1200px; -} - -button, -input, -select, -textarea { - font-size: 14px; -} - -h1 { - font-size: 18px; -} - -h2 { - font-size: 14px; -} - -h3 { - font-size: 14px; -} - -ul, -ol, -li { - list-style: none; -} - -a { - text-decoration: none; -} - -a:hover { - text-decoration: none; -} - -fieldset, -img { - border: none; -} - -table { - border-collapse: collapse; - border-spacing: 0; -} - -i { - font-style: normal; -} - -label { - position: inherit; -} - -.clearfix:after { - content: "."; - display: block; - height: 0; - clear: both; - visibility: hidden; -} - -.clearfix { - zoom: 1; - display: block; -} - -html, -body { - font-family: Tahoma, Arial, 'microsoft yahei', 'Roboto', 'Droid Sans', 'Helvetica Neue', 'Droid Sans Fallback', 'Heiti SC', 'Hiragino Sans GB', 'Simsun', 'sans-self'; -} - - - -.audio-banner { - width: 100%; - overflow: auto; - padding: 0; - background: url('../image/voice-dictation.svg'); - background-size: cover; -} -.weaper { - width: 1200px; - height: 155px; - margin: 72px auto; -} -.text-content { - width: 670px; - height: 100%; - float: left; -} -.text-content .title { - font-size: 34px; - font-family: 'PingFangSC-Medium'; - font-weight: 500; - color: rgba(255, 255, 255, 1); - line-height: 48px; -} -.text-content .con { - font-size: 16px; - font-family: PingFangSC-Light; - font-weight: 300; - color: rgba(255, 255, 255, 1); - line-height: 30px; -} -.img-con { - width: 416px; - height: 100%; - float: right; -} -.img-con img { - width: 100%; - height: 100%; -} -.con-container { - margin-top: 34px; -} - -.audio-advantage { - background: #f8f9fa; -} -.asr-advantage { - width: 1200px; - margin: 0 auto; -} -.asr-advantage h2 { - text-align: center; - font-size: 22px; - padding: 30px 0 0 0; -} -.asr-advantage > ul > li { - box-sizing: border-box; - padding: 0 16px; - width: 33%; - text-align: center; - margin-bottom: 35px; -} -.asr-advantage > ul > li .icons{ - margin-top: 10px; - margin-bottom: 20px; - width: 42px; - height: 42px; -} -.service-item-content { - margin-top: 35px; - display: flex; - justify-content: center; - flex-wrap: wrap; -} -.service-item-content img { - 
width: 160px; - vertical-align: bottom; -} -.service-item-content > li { - box-sizing: border-box; - padding: 0 16px; - width: 33%; - text-align: center; - margin-bottom: 35px; -} -.service-item-content > li .service-item-content-title { - line-height: 1.5; - font-weight: 700; - margin-top: 10px; -} -.service-item-content > li .service-item-content-desc { - margin-top: 5px; - line-height: 1.8; - color: #657384; -} - - -.audio-scene-con { - width: 100%; - padding-bottom: 84px; - background: #fff; -} -.audio-scene { - overflow: auto; - width: 1200px; - background: #fff; - text-align: center; - padding: 0; - margin: 0 auto; -} -.audio-scene h2 { - padding: 30px 0 0 0; - font-size: 22px; - text-align: center; -} - -.audio-experience { - width: 100%; - height: 538px; - background: #fff; - padding: 0; - margin: 0; - overflow: auto; -} -.asr-box { - width: 1200px; - height: 394px; - margin: 64px auto; -} -.asr-box h2 { - font-size: 22px; - text-align: center; - margin-bottom: 64px; -} -.voice-container { - position: relative; - width: 1200px; - height: 308px; - background: rgba(255, 255, 255, 1); - border-radius: 8px; - border: 1px solid rgba(225, 225, 225, 1); -} -.voice-container .voice { - height: 236px; - width: 100%; - border-radius: 8px; -} -.voice-container .voice textarea { - height: 100%; - width: 100%; - border: none; - outline: none; - border-radius: 8px; - padding: 25px; - font-size: 14px; - box-sizing: border-box; - resize: none; -} -.voice-input { - width: 100%; - height: 72px; - box-sizing: border-box; - padding-left: 35px; - background: rgba(242, 244, 245, 1); - border-radius: 8px; - line-height: 72px; -} -.voice-input .el-select { - width: 492px; -} -.start-voice { - display: inline-block; - margin-left: 10px; -} -.start-voice .time { - margin-right: 25px; -} -.asr-advantage > ul > li { - margin-bottom: 77px; -} -#msg { - width: 100%; - line-height: 40px; - font-size: 14px; - margin-left: 330px; -} -#captcha { - margin-left: 350px !important; - display: inline-block; - position: relative; -} -.black { - position: fixed; - width: 100%; - height: 100%; - z-index: 5; - background: rgba(0, 0, 0, 0.5); - top: 0; - left: 0; -} -.container { - position: fixed; - z-index: 6; - top: 25%; - left: 10%; -} -.audio-scene-con { - width: 100%; - padding-bottom: 84px; - background: #fff; -} -#sound { - color: #fff; - cursor: pointer; - background: #147ede; - padding: 10px; - margin-top: 30px; - margin-left: 135px; - width: 176px; - height: 30px !important; - text-align: center; - line-height: 30px !important; - border-radius: 10px; -} -.con-ten { - position: absolute; - width: 100%; - height: 100%; - z-index: 5; - background: #fff; - opacity: 0.5; - top: 0; - left: 0; -} -.websocket-url { - width: 320px; - height: 20px; - border: 1px solid #dcdfe6; - line-height: 20px; - padding: 10px; - border-radius: 4px; -} -.voice-btn { - color: #fff; - background-color: #409eff; - font-weight: 500; - padding: 12px 20px; - font-size: 14px; - border-radius: 4px; - border: 0; - cursor: pointer; -} -.voice-btn.end { - display: none; -} -.result-text { - background: #fff; - padding: 20px; -} -.voice-footer { - border-top: 1px solid #dddede; - background: #f7f9fa; - text-align: center; - margin-bottom: 8px; - color: #333; - font-size: 12px; - padding: 20px 0; -} - -/** line animate **/ -.time-box { - display: none; - margin-left: 10px; - width: 300px; -} -.total-time { - font-size: 14px; - color: #545454; -} -.voice-btn.end.show, -.time-box.show { - display: inline; -} -.start-taste-line { - margin-right: 20px; - 
display: inline-block; -} -.start-taste-line hr { - background-color: #187cff; - width: 3px; - height: 8px; - margin: 0 3px; - display: inline-block; - border: none; -} -.hr { - animation: note 0.2s ease-in-out; - animation-iteration-count: infinite; - animation-direction: alternate; -} -.hr-one { - animation-delay: -0.9s; -} -.hr-two { - animation-delay: -0.8s; -} -.hr-three { - animation-delay: -0.7s; -} -.hr-four { - animation-delay: -0.6s; -} -.hr-five { - animation-delay: -0.5s; -} -.hr-six { - animation-delay: -0.4s; -} -.hr-seven { - animation-delay: -0.3s; -} -.hr-eight { - animation-delay: -0.2s; -} -.hr-nine { - animation-delay: -0.1s; -} -@keyframes note { - from { - transform: scaleY(1); - } - to { - transform: scaleY(4); - } -} \ No newline at end of file diff --git a/demos/streaming_asr_server/web/static/fonts/FontAwesome.otf b/demos/streaming_asr_server/web/static/fonts/FontAwesome.otf deleted file mode 100644 index 401ec0f36e4f73b8efa40bd6f604fe80d286db70..0000000000000000000000000000000000000000 Binary files a/demos/streaming_asr_server/web/static/fonts/FontAwesome.otf and /dev/null differ diff --git a/demos/streaming_asr_server/web/static/fonts/fontawesome-webfont.eot b/demos/streaming_asr_server/web/static/fonts/fontawesome-webfont.eot deleted file mode 100644 index e9f60ca953f93e35eab4108bd414bc02ddcf3928..0000000000000000000000000000000000000000 Binary files a/demos/streaming_asr_server/web/static/fonts/fontawesome-webfont.eot and /dev/null differ diff --git a/demos/streaming_asr_server/web/static/fonts/fontawesome-webfont.svg b/demos/streaming_asr_server/web/static/fonts/fontawesome-webfont.svg deleted file mode 100644 index 6cd0326be380a32c3193c42e1879b7a6c6cf527e..0000000000000000000000000000000000000000 --- a/demos/streaming_asr_server/web/static/fonts/fontawesome-webfont.svg +++ /dev/null @@ -1,1951 +0,0 @@ - - - - -Created by FontForge 20120731 at Mon Oct 24 17:37:40 2016 - By ,,, -Copyright Dave Gandy 2016. All rights reserved. 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/demos/streaming_asr_server/web/static/fonts/fontawesome-webfont.ttf b/demos/streaming_asr_server/web/static/fonts/fontawesome-webfont.ttf deleted file mode 100644 index 35acda2fa1196aad98c2adf4378a7611dd713aa3..0000000000000000000000000000000000000000 Binary files a/demos/streaming_asr_server/web/static/fonts/fontawesome-webfont.ttf and /dev/null differ diff --git a/demos/streaming_asr_server/web/static/fonts/fontawesome-webfont.woff b/demos/streaming_asr_server/web/static/fonts/fontawesome-webfont.woff deleted file mode 100644 index 400014a4b06eee3d0c0d54402a47ab2601b2862b..0000000000000000000000000000000000000000 Binary files a/demos/streaming_asr_server/web/static/fonts/fontawesome-webfont.woff and /dev/null differ diff --git a/demos/streaming_asr_server/web/static/fonts/fontawesome-webfont.woff2 b/demos/streaming_asr_server/web/static/fonts/fontawesome-webfont.woff2 deleted file mode 100644 index 4d13fc60404b91e398a37200c4a77b645cfd9586..0000000000000000000000000000000000000000 Binary files a/demos/streaming_asr_server/web/static/fonts/fontawesome-webfont.woff2 and /dev/null differ diff --git a/demos/streaming_asr_server/web/static/image/PaddleSpeech_logo.png b/demos/streaming_asr_server/web/static/image/PaddleSpeech_logo.png deleted file mode 100644 index fb25277540c9023c8a7d010e22e7e033ad0d74d7..0000000000000000000000000000000000000000 Binary files a/demos/streaming_asr_server/web/static/image/PaddleSpeech_logo.png and /dev/null differ diff --git a/demos/streaming_asr_server/web/static/image/voice-dictation.svg b/demos/streaming_asr_server/web/static/image/voice-dictation.svg deleted file mode 100644 index d35971499ddfed4ab0016419fb87e8d6a0d695cc..0000000000000000000000000000000000000000 --- a/demos/streaming_asr_server/web/static/image/voice-dictation.svg +++ /dev/null @@ -1,94 +0,0 @@ - - - - 背景 - Created with Sketch. 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/demos/streaming_asr_server/web/static/js/SoundRecognizer.js b/demos/streaming_asr_server/web/static/js/SoundRecognizer.js deleted file mode 100644 index 5ef3d2e89dc27945d9e356b3c9eb5519f9cea69a..0000000000000000000000000000000000000000 --- a/demos/streaming_asr_server/web/static/js/SoundRecognizer.js +++ /dev/null @@ -1,133 +0,0 @@ -SoundRecognizer = { - rec: null, - wave: null, - SampleRate: 16000, - testBitRate: 16, - isCloseRecorder: false, - SendInterval: 300, - realTimeSendTryType: 'pcm', - realTimeSendTryEncBusy: 0, - realTimeSendTryTime: 0, - realTimeSendTryNumber: 0, - transferUploadNumberMax: 0, - realTimeSendTryChunk: null, - soundType: "pcm", - init: function (config) { - this.soundType = config.soundType || 'pcm'; - this.SampleRate = config.sampleRate || 16000; - this.recwaveElm = config.recwaveElm || ''; - this.TransferUpload = config.translerCallBack || this.TransferProcess; - this.initRecorder(); - }, - RealTimeSendTryReset: function (type) { - this.realTimeSendTryType = type; - this.realTimeSendTryTime = 0; - }, - RealTimeSendTry: function (rec, isClose) { - var that = this; - var t1 = Date.now(), endT = 0, recImpl = Recorder.prototype; - if (this.realTimeSendTryTime == 0) { - this.realTimeSendTryTime = t1; - this.realTimeSendTryEncBusy = 0; - this.realTimeSendTryNumber = 0; - this.transferUploadNumberMax = 0; - this.realTimeSendTryChunk = null; - } - if (!isClose && t1 - this.realTimeSendTryTime < this.SendInterval) { - return;//控制缓冲达到指定间隔才进行传输 - } - this.realTimeSendTryTime = t1; - var number = ++this.realTimeSendTryNumber; - - //借用SampleData函数进行数据的连续处理,采样率转换是顺带的 - var chunk = Recorder.SampleData(rec.buffers, rec.srcSampleRate, this.SampleRate, this.realTimeSendTryChunk, { frameType: isClose ? "" : this.realTimeSendTryType }); - - //清理已处理完的缓冲数据,释放内存以支持长时间录音,最后完成录音时不能调用stop,因为数据已经被清掉了 - for (var i = this.realTimeSendTryChunk ? this.realTimeSendTryChunk.index : 0; i < chunk.index; i++) { - rec.buffers[i] = null; - } - this.realTimeSendTryChunk = chunk; - - //没有新数据,或结束时的数据量太小,不能进行mock转码 - if (chunk.data.length == 0 || isClose && chunk.data.length < 2000) { - this.TransferUpload(number, null, 0, null, isClose); - return; - } - //实时编码队列阻塞处理 - if (!isClose) { - if (this.realTimeSendTryEncBusy >= 2) { - console.log("编码队列阻塞,已丢弃一帧", 1); - return; - } - } - this.realTimeSendTryEncBusy++; - - //通过mock方法实时转码成mp3、wav - var encStartTime = Date.now(); - var recMock = Recorder({ - type: this.realTimeSendTryType - , sampleRate: this.SampleRate //采样率 - , bitRate: this.testBitRate //比特率 - }); - recMock.mock(chunk.data, chunk.sampleRate); - recMock.stop(function (blob, duration) { - that.realTimeSendTryEncBusy && (that.realTimeSendTryEncBusy--); - blob.encTime = Date.now() - encStartTime; - - //转码好就推入传输 - that.TransferUpload(number, blob, duration, recMock, isClose); - }, function (msg) { - that.realTimeSendTryEncBusy && (that.realTimeSendTryEncBusy--); - //转码错误?没想到什么时候会产生错误! 
- console.log("不应该出现的错误:" + msg, 1); - }); - }, - recordClose: function () { - try { - this.rec.close(function () { - this.isCloseRecorder = true; - }); - this.RealTimeSendTry(this.rec, true);//最后一次发送 - } catch (ex) { - // recordClose(); - } - }, - recordEnd: function () { - try { - this.rec.stop(function (blob, time) { - this.recordClose(); - }, function (s) { - this.recordClose(); - }); - } catch (ex) { - } - }, - initRecorder: function () { - var that = this; - var rec = Recorder({ - type: that.soundType - , bitRate: that.testBitRate - , sampleRate: that.SampleRate - , onProcess: function (buffers, level, time, sampleRate) { - that.wave.input(buffers[buffers.length - 1], level, sampleRate); - that.RealTimeSendTry(rec, false);//推入实时处理,因为是unknown格式,这里简化函数调用,没有用到buffers和bufferSampleRate,因为这些数据和rec.buffers是完全相同的。 - } - }); - - rec.open(function () { - that.wave = Recorder.FrequencyHistogramView({ - elem: that.recwaveElm, lineCount: 90 - , position: 0 - , minHeight: 1 - , stripeEnable: false - }); - rec.start(); - that.isCloseRecorder = false; - that.RealTimeSendTryReset(that.soundType);//重置 - }); - this.rec = rec; - }, - TransferProcess: function (number, blobOrNull, duration, blobRec, isClose) { - - } -} \ No newline at end of file diff --git a/demos/streaming_asr_server/web/static/js/jquery-3.2.1.min.js b/demos/streaming_asr_server/web/static/js/jquery-3.2.1.min.js deleted file mode 100644 index 644d35e274fd64ddaf6d12af813e820c424176a9..0000000000000000000000000000000000000000 --- a/demos/streaming_asr_server/web/static/js/jquery-3.2.1.min.js +++ /dev/null @@ -1,4 +0,0 @@ -/*! jQuery v3.2.1 | (c) JS Foundation and other contributors | jquery.org/license */ -!function(a,b){"use strict";"object"==typeof module&&"object"==typeof module.exports?module.exports=a.document?b(a,!0):function(a){if(!a.document)throw new Error("jQuery requires a window with a document");return b(a)}:b(a)}("undefined"!=typeof window?window:this,function(a,b){"use strict";var c=[],d=a.document,e=Object.getPrototypeOf,f=c.slice,g=c.concat,h=c.push,i=c.indexOf,j={},k=j.toString,l=j.hasOwnProperty,m=l.toString,n=m.call(Object),o={};function p(a,b){b=b||d;var c=b.createElement("script");c.text=a,b.head.appendChild(c).parentNode.removeChild(c)}var q="3.2.1",r=function(a,b){return new r.fn.init(a,b)},s=/^[\s\uFEFF\xA0]+|[\s\uFEFF\xA0]+$/g,t=/^-ms-/,u=/-([a-z])/g,v=function(a,b){return b.toUpperCase()};r.fn=r.prototype={jquery:q,constructor:r,length:0,toArray:function(){return f.call(this)},get:function(a){return null==a?f.call(this):a<0?this[a+this.length]:this[a]},pushStack:function(a){var b=r.merge(this.constructor(),a);return b.prevObject=this,b},each:function(a){return r.each(this,a)},map:function(a){return this.pushStack(r.map(this,function(b,c){return a.call(b,c,b)}))},slice:function(){return this.pushStack(f.apply(this,arguments))},first:function(){return this.eq(0)},last:function(){return this.eq(-1)},eq:function(a){var b=this.length,c=+a+(a<0?b:0);return this.pushStack(c>=0&&c0&&b-1 in a)}var x=function(a){var b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u="sizzle"+1*new Date,v=a.document,w=0,x=0,y=ha(),z=ha(),A=ha(),B=function(a,b){return a===b&&(l=!0),0},C={}.hasOwnProperty,D=[],E=D.pop,F=D.push,G=D.push,H=D.slice,I=function(a,b){for(var c=0,d=a.length;c+~]|"+K+")"+K+"*"),S=new RegExp("="+K+"*([^\\]'\"]*?)"+K+"*\\]","g"),T=new RegExp(N),U=new RegExp("^"+L+"$"),V={ID:new RegExp("^#("+L+")"),CLASS:new RegExp("^\\.("+L+")"),TAG:new RegExp("^("+L+"|[*])"),ATTR:new RegExp("^"+M),PSEUDO:new RegExp("^"+N),CHILD:new 
RegExp("^:(only|first|last|nth|nth-last)-(child|of-type)(?:\\("+K+"*(even|odd|(([+-]|)(\\d*)n|)"+K+"*(?:([+-]|)"+K+"*(\\d+)|))"+K+"*\\)|)","i"),bool:new RegExp("^(?:"+J+")$","i"),needsContext:new RegExp("^"+K+"*[>+~]|:(even|odd|eq|gt|lt|nth|first|last)(?:\\("+K+"*((?:-\\d)?\\d*)"+K+"*\\)|)(?=[^-]|$)","i")},W=/^(?:input|select|textarea|button)$/i,X=/^h\d$/i,Y=/^[^{]+\{\s*\[native \w/,Z=/^(?:#([\w-]+)|(\w+)|\.([\w-]+))$/,$=/[+~]/,_=new RegExp("\\\\([\\da-f]{1,6}"+K+"?|("+K+")|.)","ig"),aa=function(a,b,c){var d="0x"+b-65536;return d!==d||c?b:d<0?String.fromCharCode(d+65536):String.fromCharCode(d>>10|55296,1023&d|56320)},ba=/([\0-\x1f\x7f]|^-?\d)|^-$|[^\0-\x1f\x7f-\uFFFF\w-]/g,ca=function(a,b){return b?"\0"===a?"\ufffd":a.slice(0,-1)+"\\"+a.charCodeAt(a.length-1).toString(16)+" ":"\\"+a},da=function(){m()},ea=ta(function(a){return a.disabled===!0&&("form"in a||"label"in a)},{dir:"parentNode",next:"legend"});try{G.apply(D=H.call(v.childNodes),v.childNodes),D[v.childNodes.length].nodeType}catch(fa){G={apply:D.length?function(a,b){F.apply(a,H.call(b))}:function(a,b){var c=a.length,d=0;while(a[c++]=b[d++]);a.length=c-1}}}function ga(a,b,d,e){var f,h,j,k,l,o,r,s=b&&b.ownerDocument,w=b?b.nodeType:9;if(d=d||[],"string"!=typeof a||!a||1!==w&&9!==w&&11!==w)return d;if(!e&&((b?b.ownerDocument||b:v)!==n&&m(b),b=b||n,p)){if(11!==w&&(l=Z.exec(a)))if(f=l[1]){if(9===w){if(!(j=b.getElementById(f)))return d;if(j.id===f)return d.push(j),d}else if(s&&(j=s.getElementById(f))&&t(b,j)&&j.id===f)return d.push(j),d}else{if(l[2])return G.apply(d,b.getElementsByTagName(a)),d;if((f=l[3])&&c.getElementsByClassName&&b.getElementsByClassName)return G.apply(d,b.getElementsByClassName(f)),d}if(c.qsa&&!A[a+" "]&&(!q||!q.test(a))){if(1!==w)s=b,r=a;else if("object"!==b.nodeName.toLowerCase()){(k=b.getAttribute("id"))?k=k.replace(ba,ca):b.setAttribute("id",k=u),o=g(a),h=o.length;while(h--)o[h]="#"+k+" "+sa(o[h]);r=o.join(","),s=$.test(a)&&qa(b.parentNode)||b}if(r)try{return G.apply(d,s.querySelectorAll(r)),d}catch(x){}finally{k===u&&b.removeAttribute("id")}}}return i(a.replace(P,"$1"),b,d,e)}function ha(){var a=[];function b(c,e){return a.push(c+" ")>d.cacheLength&&delete b[a.shift()],b[c+" "]=e}return b}function ia(a){return a[u]=!0,a}function ja(a){var b=n.createElement("fieldset");try{return!!a(b)}catch(c){return!1}finally{b.parentNode&&b.parentNode.removeChild(b),b=null}}function ka(a,b){var c=a.split("|"),e=c.length;while(e--)d.attrHandle[c[e]]=b}function la(a,b){var c=b&&a,d=c&&1===a.nodeType&&1===b.nodeType&&a.sourceIndex-b.sourceIndex;if(d)return d;if(c)while(c=c.nextSibling)if(c===b)return-1;return a?1:-1}function ma(a){return function(b){var c=b.nodeName.toLowerCase();return"input"===c&&b.type===a}}function na(a){return function(b){var c=b.nodeName.toLowerCase();return("input"===c||"button"===c)&&b.type===a}}function oa(a){return function(b){return"form"in b?b.parentNode&&b.disabled===!1?"label"in b?"label"in b.parentNode?b.parentNode.disabled===a:b.disabled===a:b.isDisabled===a||b.isDisabled!==!a&&ea(b)===a:b.disabled===a:"label"in b&&b.disabled===a}}function pa(a){return ia(function(b){return b=+b,ia(function(c,d){var e,f=a([],c.length,b),g=f.length;while(g--)c[e=f[g]]&&(c[e]=!(d[e]=c[e]))})})}function qa(a){return a&&"undefined"!=typeof a.getElementsByTagName&&a}c=ga.support={},f=ga.isXML=function(a){var b=a&&(a.ownerDocument||a).documentElement;return!!b&&"HTML"!==b.nodeName},m=ga.setDocument=function(a){var b,e,g=a?a.ownerDocument||a:v;return 
g!==n&&9===g.nodeType&&g.documentElement?(n=g,o=n.documentElement,p=!f(n),v!==n&&(e=n.defaultView)&&e.top!==e&&(e.addEventListener?e.addEventListener("unload",da,!1):e.attachEvent&&e.attachEvent("onunload",da)),c.attributes=ja(function(a){return a.className="i",!a.getAttribute("className")}),c.getElementsByTagName=ja(function(a){return a.appendChild(n.createComment("")),!a.getElementsByTagName("*").length}),c.getElementsByClassName=Y.test(n.getElementsByClassName),c.getById=ja(function(a){return o.appendChild(a).id=u,!n.getElementsByName||!n.getElementsByName(u).length}),c.getById?(d.filter.ID=function(a){var b=a.replace(_,aa);return function(a){return a.getAttribute("id")===b}},d.find.ID=function(a,b){if("undefined"!=typeof b.getElementById&&p){var c=b.getElementById(a);return c?[c]:[]}}):(d.filter.ID=function(a){var b=a.replace(_,aa);return function(a){var c="undefined"!=typeof a.getAttributeNode&&a.getAttributeNode("id");return c&&c.value===b}},d.find.ID=function(a,b){if("undefined"!=typeof b.getElementById&&p){var c,d,e,f=b.getElementById(a);if(f){if(c=f.getAttributeNode("id"),c&&c.value===a)return[f];e=b.getElementsByName(a),d=0;while(f=e[d++])if(c=f.getAttributeNode("id"),c&&c.value===a)return[f]}return[]}}),d.find.TAG=c.getElementsByTagName?function(a,b){return"undefined"!=typeof b.getElementsByTagName?b.getElementsByTagName(a):c.qsa?b.querySelectorAll(a):void 0}:function(a,b){var c,d=[],e=0,f=b.getElementsByTagName(a);if("*"===a){while(c=f[e++])1===c.nodeType&&d.push(c);return d}return f},d.find.CLASS=c.getElementsByClassName&&function(a,b){if("undefined"!=typeof b.getElementsByClassName&&p)return b.getElementsByClassName(a)},r=[],q=[],(c.qsa=Y.test(n.querySelectorAll))&&(ja(function(a){o.appendChild(a).innerHTML="",a.querySelectorAll("[msallowcapture^='']").length&&q.push("[*^$]="+K+"*(?:''|\"\")"),a.querySelectorAll("[selected]").length||q.push("\\["+K+"*(?:value|"+J+")"),a.querySelectorAll("[id~="+u+"-]").length||q.push("~="),a.querySelectorAll(":checked").length||q.push(":checked"),a.querySelectorAll("a#"+u+"+*").length||q.push(".#.+[+~]")}),ja(function(a){a.innerHTML="";var b=n.createElement("input");b.setAttribute("type","hidden"),a.appendChild(b).setAttribute("name","D"),a.querySelectorAll("[name=d]").length&&q.push("name"+K+"*[*^$|!~]?="),2!==a.querySelectorAll(":enabled").length&&q.push(":enabled",":disabled"),o.appendChild(a).disabled=!0,2!==a.querySelectorAll(":disabled").length&&q.push(":enabled",":disabled"),a.querySelectorAll("*,:x"),q.push(",.*:")})),(c.matchesSelector=Y.test(s=o.matches||o.webkitMatchesSelector||o.mozMatchesSelector||o.oMatchesSelector||o.msMatchesSelector))&&ja(function(a){c.disconnectedMatch=s.call(a,"*"),s.call(a,"[s!='']:x"),r.push("!=",N)}),q=q.length&&new RegExp(q.join("|")),r=r.length&&new RegExp(r.join("|")),b=Y.test(o.compareDocumentPosition),t=b||Y.test(o.contains)?function(a,b){var c=9===a.nodeType?a.documentElement:a,d=b&&b.parentNode;return a===d||!(!d||1!==d.nodeType||!(c.contains?c.contains(d):a.compareDocumentPosition&&16&a.compareDocumentPosition(d)))}:function(a,b){if(b)while(b=b.parentNode)if(b===a)return!0;return!1},B=b?function(a,b){if(a===b)return l=!0,0;var d=!a.compareDocumentPosition-!b.compareDocumentPosition;return d?d:(d=(a.ownerDocument||a)===(b.ownerDocument||b)?a.compareDocumentPosition(b):1,1&d||!c.sortDetached&&b.compareDocumentPosition(a)===d?a===n||a.ownerDocument===v&&t(v,a)?-1:b===n||b.ownerDocument===v&&t(v,b)?1:k?I(k,a)-I(k,b):0:4&d?-1:1)}:function(a,b){if(a===b)return l=!0,0;var 
c,d=0,e=a.parentNode,f=b.parentNode,g=[a],h=[b];if(!e||!f)return a===n?-1:b===n?1:e?-1:f?1:k?I(k,a)-I(k,b):0;if(e===f)return la(a,b);c=a;while(c=c.parentNode)g.unshift(c);c=b;while(c=c.parentNode)h.unshift(c);while(g[d]===h[d])d++;return d?la(g[d],h[d]):g[d]===v?-1:h[d]===v?1:0},n):n},ga.matches=function(a,b){return ga(a,null,null,b)},ga.matchesSelector=function(a,b){if((a.ownerDocument||a)!==n&&m(a),b=b.replace(S,"='$1']"),c.matchesSelector&&p&&!A[b+" "]&&(!r||!r.test(b))&&(!q||!q.test(b)))try{var d=s.call(a,b);if(d||c.disconnectedMatch||a.document&&11!==a.document.nodeType)return d}catch(e){}return ga(b,n,null,[a]).length>0},ga.contains=function(a,b){return(a.ownerDocument||a)!==n&&m(a),t(a,b)},ga.attr=function(a,b){(a.ownerDocument||a)!==n&&m(a);var e=d.attrHandle[b.toLowerCase()],f=e&&C.call(d.attrHandle,b.toLowerCase())?e(a,b,!p):void 0;return void 0!==f?f:c.attributes||!p?a.getAttribute(b):(f=a.getAttributeNode(b))&&f.specified?f.value:null},ga.escape=function(a){return(a+"").replace(ba,ca)},ga.error=function(a){throw new Error("Syntax error, unrecognized expression: "+a)},ga.uniqueSort=function(a){var b,d=[],e=0,f=0;if(l=!c.detectDuplicates,k=!c.sortStable&&a.slice(0),a.sort(B),l){while(b=a[f++])b===a[f]&&(e=d.push(f));while(e--)a.splice(d[e],1)}return k=null,a},e=ga.getText=function(a){var b,c="",d=0,f=a.nodeType;if(f){if(1===f||9===f||11===f){if("string"==typeof a.textContent)return a.textContent;for(a=a.firstChild;a;a=a.nextSibling)c+=e(a)}else if(3===f||4===f)return a.nodeValue}else while(b=a[d++])c+=e(b);return c},d=ga.selectors={cacheLength:50,createPseudo:ia,match:V,attrHandle:{},find:{},relative:{">":{dir:"parentNode",first:!0}," ":{dir:"parentNode"},"+":{dir:"previousSibling",first:!0},"~":{dir:"previousSibling"}},preFilter:{ATTR:function(a){return a[1]=a[1].replace(_,aa),a[3]=(a[3]||a[4]||a[5]||"").replace(_,aa),"~="===a[2]&&(a[3]=" "+a[3]+" "),a.slice(0,4)},CHILD:function(a){return a[1]=a[1].toLowerCase(),"nth"===a[1].slice(0,3)?(a[3]||ga.error(a[0]),a[4]=+(a[4]?a[5]+(a[6]||1):2*("even"===a[3]||"odd"===a[3])),a[5]=+(a[7]+a[8]||"odd"===a[3])):a[3]&&ga.error(a[0]),a},PSEUDO:function(a){var b,c=!a[6]&&a[2];return V.CHILD.test(a[0])?null:(a[3]?a[2]=a[4]||a[5]||"":c&&T.test(c)&&(b=g(c,!0))&&(b=c.indexOf(")",c.length-b)-c.length)&&(a[0]=a[0].slice(0,b),a[2]=c.slice(0,b)),a.slice(0,3))}},filter:{TAG:function(a){var b=a.replace(_,aa).toLowerCase();return"*"===a?function(){return!0}:function(a){return a.nodeName&&a.nodeName.toLowerCase()===b}},CLASS:function(a){var b=y[a+" "];return b||(b=new RegExp("(^|"+K+")"+a+"("+K+"|$)"))&&y(a,function(a){return b.test("string"==typeof a.className&&a.className||"undefined"!=typeof a.getAttribute&&a.getAttribute("class")||"")})},ATTR:function(a,b,c){return function(d){var e=ga.attr(d,a);return null==e?"!="===b:!b||(e+="","="===b?e===c:"!="===b?e!==c:"^="===b?c&&0===e.indexOf(c):"*="===b?c&&e.indexOf(c)>-1:"$="===b?c&&e.slice(-c.length)===c:"~="===b?(" "+e.replace(O," ")+" ").indexOf(c)>-1:"|="===b&&(e===c||e.slice(0,c.length+1)===c+"-"))}},CHILD:function(a,b,c,d,e){var f="nth"!==a.slice(0,3),g="last"!==a.slice(-4),h="of-type"===b;return 1===d&&0===e?function(a){return!!a.parentNode}:function(b,c,i){var 
j,k,l,m,n,o,p=f!==g?"nextSibling":"previousSibling",q=b.parentNode,r=h&&b.nodeName.toLowerCase(),s=!i&&!h,t=!1;if(q){if(f){while(p){m=b;while(m=m[p])if(h?m.nodeName.toLowerCase()===r:1===m.nodeType)return!1;o=p="only"===a&&!o&&"nextSibling"}return!0}if(o=[g?q.firstChild:q.lastChild],g&&s){m=q,l=m[u]||(m[u]={}),k=l[m.uniqueID]||(l[m.uniqueID]={}),j=k[a]||[],n=j[0]===w&&j[1],t=n&&j[2],m=n&&q.childNodes[n];while(m=++n&&m&&m[p]||(t=n=0)||o.pop())if(1===m.nodeType&&++t&&m===b){k[a]=[w,n,t];break}}else if(s&&(m=b,l=m[u]||(m[u]={}),k=l[m.uniqueID]||(l[m.uniqueID]={}),j=k[a]||[],n=j[0]===w&&j[1],t=n),t===!1)while(m=++n&&m&&m[p]||(t=n=0)||o.pop())if((h?m.nodeName.toLowerCase()===r:1===m.nodeType)&&++t&&(s&&(l=m[u]||(m[u]={}),k=l[m.uniqueID]||(l[m.uniqueID]={}),k[a]=[w,t]),m===b))break;return t-=e,t===d||t%d===0&&t/d>=0}}},PSEUDO:function(a,b){var c,e=d.pseudos[a]||d.setFilters[a.toLowerCase()]||ga.error("unsupported pseudo: "+a);return e[u]?e(b):e.length>1?(c=[a,a,"",b],d.setFilters.hasOwnProperty(a.toLowerCase())?ia(function(a,c){var d,f=e(a,b),g=f.length;while(g--)d=I(a,f[g]),a[d]=!(c[d]=f[g])}):function(a){return e(a,0,c)}):e}},pseudos:{not:ia(function(a){var b=[],c=[],d=h(a.replace(P,"$1"));return d[u]?ia(function(a,b,c,e){var f,g=d(a,null,e,[]),h=a.length;while(h--)(f=g[h])&&(a[h]=!(b[h]=f))}):function(a,e,f){return b[0]=a,d(b,null,f,c),b[0]=null,!c.pop()}}),has:ia(function(a){return function(b){return ga(a,b).length>0}}),contains:ia(function(a){return a=a.replace(_,aa),function(b){return(b.textContent||b.innerText||e(b)).indexOf(a)>-1}}),lang:ia(function(a){return U.test(a||"")||ga.error("unsupported lang: "+a),a=a.replace(_,aa).toLowerCase(),function(b){var c;do if(c=p?b.lang:b.getAttribute("xml:lang")||b.getAttribute("lang"))return c=c.toLowerCase(),c===a||0===c.indexOf(a+"-");while((b=b.parentNode)&&1===b.nodeType);return!1}}),target:function(b){var c=a.location&&a.location.hash;return c&&c.slice(1)===b.id},root:function(a){return a===o},focus:function(a){return a===n.activeElement&&(!n.hasFocus||n.hasFocus())&&!!(a.type||a.href||~a.tabIndex)},enabled:oa(!1),disabled:oa(!0),checked:function(a){var b=a.nodeName.toLowerCase();return"input"===b&&!!a.checked||"option"===b&&!!a.selected},selected:function(a){return a.parentNode&&a.parentNode.selectedIndex,a.selected===!0},empty:function(a){for(a=a.firstChild;a;a=a.nextSibling)if(a.nodeType<6)return!1;return!0},parent:function(a){return!d.pseudos.empty(a)},header:function(a){return X.test(a.nodeName)},input:function(a){return W.test(a.nodeName)},button:function(a){var b=a.nodeName.toLowerCase();return"input"===b&&"button"===a.type||"button"===b},text:function(a){var b;return"input"===a.nodeName.toLowerCase()&&"text"===a.type&&(null==(b=a.getAttribute("type"))||"text"===b.toLowerCase())},first:pa(function(){return[0]}),last:pa(function(a,b){return[b-1]}),eq:pa(function(a,b,c){return[c<0?c+b:c]}),even:pa(function(a,b){for(var c=0;c=0;)a.push(d);return a}),gt:pa(function(a,b,c){for(var d=c<0?c+b:c;++d1?function(b,c,d){var e=a.length;while(e--)if(!a[e](b,c,d))return!1;return!0}:a[0]}function va(a,b,c){for(var d=0,e=b.length;d-1&&(f[j]=!(g[j]=l))}}else r=wa(r===g?r.splice(o,r.length):r),e?e(null,g,r,i):G.apply(g,r)})}function ya(a){for(var b,c,e,f=a.length,g=d.relative[a[0].type],h=g||d.relative[" "],i=g?1:0,k=ta(function(a){return a===b},h,!0),l=ta(function(a){return I(b,a)>-1},h,!0),m=[function(a,c,d){var e=!g&&(d||c!==j)||((b=c).nodeType?k(a,c,d):l(a,c,d));return b=null,e}];i1&&ua(m),i>1&&sa(a.slice(0,i-1).concat({value:" 
"===a[i-2].type?"*":""})).replace(P,"$1"),c,i0,e=a.length>0,f=function(f,g,h,i,k){var l,o,q,r=0,s="0",t=f&&[],u=[],v=j,x=f||e&&d.find.TAG("*",k),y=w+=null==v?1:Math.random()||.1,z=x.length;for(k&&(j=g===n||g||k);s!==z&&null!=(l=x[s]);s++){if(e&&l){o=0,g||l.ownerDocument===n||(m(l),h=!p);while(q=a[o++])if(q(l,g||n,h)){i.push(l);break}k&&(w=y)}c&&((l=!q&&l)&&r--,f&&t.push(l))}if(r+=s,c&&s!==r){o=0;while(q=b[o++])q(t,u,g,h);if(f){if(r>0)while(s--)t[s]||u[s]||(u[s]=E.call(i));u=wa(u)}G.apply(i,u),k&&!f&&u.length>0&&r+b.length>1&&ga.uniqueSort(i)}return k&&(w=y,j=v),t};return c?ia(f):f}return h=ga.compile=function(a,b){var c,d=[],e=[],f=A[a+" "];if(!f){b||(b=g(a)),c=b.length;while(c--)f=ya(b[c]),f[u]?d.push(f):e.push(f);f=A(a,za(e,d)),f.selector=a}return f},i=ga.select=function(a,b,c,e){var f,i,j,k,l,m="function"==typeof a&&a,n=!e&&g(a=m.selector||a);if(c=c||[],1===n.length){if(i=n[0]=n[0].slice(0),i.length>2&&"ID"===(j=i[0]).type&&9===b.nodeType&&p&&d.relative[i[1].type]){if(b=(d.find.ID(j.matches[0].replace(_,aa),b)||[])[0],!b)return c;m&&(b=b.parentNode),a=a.slice(i.shift().value.length)}f=V.needsContext.test(a)?0:i.length;while(f--){if(j=i[f],d.relative[k=j.type])break;if((l=d.find[k])&&(e=l(j.matches[0].replace(_,aa),$.test(i[0].type)&&qa(b.parentNode)||b))){if(i.splice(f,1),a=e.length&&sa(i),!a)return G.apply(c,e),c;break}}}return(m||h(a,n))(e,b,!p,c,!b||$.test(a)&&qa(b.parentNode)||b),c},c.sortStable=u.split("").sort(B).join("")===u,c.detectDuplicates=!!l,m(),c.sortDetached=ja(function(a){return 1&a.compareDocumentPosition(n.createElement("fieldset"))}),ja(function(a){return a.innerHTML="","#"===a.firstChild.getAttribute("href")})||ka("type|href|height|width",function(a,b,c){if(!c)return a.getAttribute(b,"type"===b.toLowerCase()?1:2)}),c.attributes&&ja(function(a){return a.innerHTML="",a.firstChild.setAttribute("value",""),""===a.firstChild.getAttribute("value")})||ka("value",function(a,b,c){if(!c&&"input"===a.nodeName.toLowerCase())return a.defaultValue}),ja(function(a){return null==a.getAttribute("disabled")})||ka(J,function(a,b,c){var d;if(!c)return a[b]===!0?b.toLowerCase():(d=a.getAttributeNode(b))&&d.specified?d.value:null}),ga}(a);r.find=x,r.expr=x.selectors,r.expr[":"]=r.expr.pseudos,r.uniqueSort=r.unique=x.uniqueSort,r.text=x.getText,r.isXMLDoc=x.isXML,r.contains=x.contains,r.escapeSelector=x.escape;var y=function(a,b,c){var d=[],e=void 0!==c;while((a=a[b])&&9!==a.nodeType)if(1===a.nodeType){if(e&&r(a).is(c))break;d.push(a)}return d},z=function(a,b){for(var c=[];a;a=a.nextSibling)1===a.nodeType&&a!==b&&c.push(a);return c},A=r.expr.match.needsContext;function B(a,b){return a.nodeName&&a.nodeName.toLowerCase()===b.toLowerCase()}var C=/^<([a-z][^\/\0>:\x20\t\r\n\f]*)[\x20\t\r\n\f]*\/?>(?:<\/\1>|)$/i,D=/^.[^:#\[\.,]*$/;function E(a,b,c){return r.isFunction(b)?r.grep(a,function(a,d){return!!b.call(a,d,a)!==c}):b.nodeType?r.grep(a,function(a){return a===b!==c}):"string"!=typeof b?r.grep(a,function(a){return i.call(b,a)>-1!==c}):D.test(b)?r.filter(b,a,c):(b=r.filter(b,a),r.grep(a,function(a){return i.call(b,a)>-1!==c&&1===a.nodeType}))}r.filter=function(a,b,c){var d=b[0];return c&&(a=":not("+a+")"),1===b.length&&1===d.nodeType?r.find.matchesSelector(d,a)?[d]:[]:r.find.matches(a,r.grep(b,function(a){return 1===a.nodeType}))},r.fn.extend({find:function(a){var b,c,d=this.length,e=this;if("string"!=typeof a)return this.pushStack(r(a).filter(function(){for(b=0;b1?r.uniqueSort(c):c},filter:function(a){return this.pushStack(E(this,a||[],!1))},not:function(a){return 
this.pushStack(E(this,a||[],!0))},is:function(a){return!!E(this,"string"==typeof a&&A.test(a)?r(a):a||[],!1).length}});var F,G=/^(?:\s*(<[\w\W]+>)[^>]*|#([\w-]+))$/,H=r.fn.init=function(a,b,c){var e,f;if(!a)return this;if(c=c||F,"string"==typeof a){if(e="<"===a[0]&&">"===a[a.length-1]&&a.length>=3?[null,a,null]:G.exec(a),!e||!e[1]&&b)return!b||b.jquery?(b||c).find(a):this.constructor(b).find(a);if(e[1]){if(b=b instanceof r?b[0]:b,r.merge(this,r.parseHTML(e[1],b&&b.nodeType?b.ownerDocument||b:d,!0)),C.test(e[1])&&r.isPlainObject(b))for(e in b)r.isFunction(this[e])?this[e](b[e]):this.attr(e,b[e]);return this}return f=d.getElementById(e[2]),f&&(this[0]=f,this.length=1),this}return a.nodeType?(this[0]=a,this.length=1,this):r.isFunction(a)?void 0!==c.ready?c.ready(a):a(r):r.makeArray(a,this)};H.prototype=r.fn,F=r(d);var I=/^(?:parents|prev(?:Until|All))/,J={children:!0,contents:!0,next:!0,prev:!0};r.fn.extend({has:function(a){var b=r(a,this),c=b.length;return this.filter(function(){for(var a=0;a-1:1===c.nodeType&&r.find.matchesSelector(c,a))){f.push(c);break}return this.pushStack(f.length>1?r.uniqueSort(f):f)},index:function(a){return a?"string"==typeof a?i.call(r(a),this[0]):i.call(this,a.jquery?a[0]:a):this[0]&&this[0].parentNode?this.first().prevAll().length:-1},add:function(a,b){return this.pushStack(r.uniqueSort(r.merge(this.get(),r(a,b))))},addBack:function(a){return this.add(null==a?this.prevObject:this.prevObject.filter(a))}});function K(a,b){while((a=a[b])&&1!==a.nodeType);return a}r.each({parent:function(a){var b=a.parentNode;return b&&11!==b.nodeType?b:null},parents:function(a){return y(a,"parentNode")},parentsUntil:function(a,b,c){return y(a,"parentNode",c)},next:function(a){return K(a,"nextSibling")},prev:function(a){return K(a,"previousSibling")},nextAll:function(a){return y(a,"nextSibling")},prevAll:function(a){return y(a,"previousSibling")},nextUntil:function(a,b,c){return y(a,"nextSibling",c)},prevUntil:function(a,b,c){return y(a,"previousSibling",c)},siblings:function(a){return z((a.parentNode||{}).firstChild,a)},children:function(a){return z(a.firstChild)},contents:function(a){return B(a,"iframe")?a.contentDocument:(B(a,"template")&&(a=a.content||a),r.merge([],a.childNodes))}},function(a,b){r.fn[a]=function(c,d){var e=r.map(this,b,c);return"Until"!==a.slice(-5)&&(d=c),d&&"string"==typeof d&&(e=r.filter(d,e)),this.length>1&&(J[a]||r.uniqueSort(e),I.test(a)&&e.reverse()),this.pushStack(e)}});var L=/[^\x20\t\r\n\f]+/g;function M(a){var b={};return r.each(a.match(L)||[],function(a,c){b[c]=!0}),b}r.Callbacks=function(a){a="string"==typeof a?M(a):r.extend({},a);var b,c,d,e,f=[],g=[],h=-1,i=function(){for(e=e||a.once,d=b=!0;g.length;h=-1){c=g.shift();while(++h-1)f.splice(c,1),c<=h&&h--}),this},has:function(a){return a?r.inArray(a,f)>-1:f.length>0},empty:function(){return f&&(f=[]),this},disable:function(){return e=g=[],f=c="",this},disabled:function(){return!f},lock:function(){return e=g=[],c||b||(f=c=""),this},locked:function(){return!!e},fireWith:function(a,c){return e||(c=c||[],c=[a,c.slice?c.slice():c],g.push(c),b||i()),this},fire:function(){return j.fireWith(this,arguments),this},fired:function(){return!!d}};return j};function N(a){return a}function O(a){throw a}function P(a,b,c,d){var e;try{a&&r.isFunction(e=a.promise)?e.call(a).done(b).fail(c):a&&r.isFunction(e=a.then)?e.call(a,b,c):b.apply(void 0,[a].slice(d))}catch(a){c.apply(void 0,[a])}}r.extend({Deferred:function(b){var 
c=[["notify","progress",r.Callbacks("memory"),r.Callbacks("memory"),2],["resolve","done",r.Callbacks("once memory"),r.Callbacks("once memory"),0,"resolved"],["reject","fail",r.Callbacks("once memory"),r.Callbacks("once memory"),1,"rejected"]],d="pending",e={state:function(){return d},always:function(){return f.done(arguments).fail(arguments),this},"catch":function(a){return e.then(null,a)},pipe:function(){var a=arguments;return r.Deferred(function(b){r.each(c,function(c,d){var e=r.isFunction(a[d[4]])&&a[d[4]];f[d[1]](function(){var a=e&&e.apply(this,arguments);a&&r.isFunction(a.promise)?a.promise().progress(b.notify).done(b.resolve).fail(b.reject):b[d[0]+"With"](this,e?[a]:arguments)})}),a=null}).promise()},then:function(b,d,e){var f=0;function g(b,c,d,e){return function(){var h=this,i=arguments,j=function(){var a,j;if(!(b=f&&(d!==O&&(h=void 0,i=[a]),c.rejectWith(h,i))}};b?k():(r.Deferred.getStackHook&&(k.stackTrace=r.Deferred.getStackHook()),a.setTimeout(k))}}return r.Deferred(function(a){c[0][3].add(g(0,a,r.isFunction(e)?e:N,a.notifyWith)),c[1][3].add(g(0,a,r.isFunction(b)?b:N)),c[2][3].add(g(0,a,r.isFunction(d)?d:O))}).promise()},promise:function(a){return null!=a?r.extend(a,e):e}},f={};return r.each(c,function(a,b){var g=b[2],h=b[5];e[b[1]]=g.add,h&&g.add(function(){d=h},c[3-a][2].disable,c[0][2].lock),g.add(b[3].fire),f[b[0]]=function(){return f[b[0]+"With"](this===f?void 0:this,arguments),this},f[b[0]+"With"]=g.fireWith}),e.promise(f),b&&b.call(f,f),f},when:function(a){var b=arguments.length,c=b,d=Array(c),e=f.call(arguments),g=r.Deferred(),h=function(a){return function(c){d[a]=this,e[a]=arguments.length>1?f.call(arguments):c,--b||g.resolveWith(d,e)}};if(b<=1&&(P(a,g.done(h(c)).resolve,g.reject,!b),"pending"===g.state()||r.isFunction(e[c]&&e[c].then)))return g.then();while(c--)P(e[c],h(c),g.reject);return g.promise()}});var Q=/^(Eval|Internal|Range|Reference|Syntax|Type|URI)Error$/;r.Deferred.exceptionHook=function(b,c){a.console&&a.console.warn&&b&&Q.test(b.name)&&a.console.warn("jQuery.Deferred exception: "+b.message,b.stack,c)},r.readyException=function(b){a.setTimeout(function(){throw b})};var R=r.Deferred();r.fn.ready=function(a){return R.then(a)["catch"](function(a){r.readyException(a)}),this},r.extend({isReady:!1,readyWait:1,ready:function(a){(a===!0?--r.readyWait:r.isReady)||(r.isReady=!0,a!==!0&&--r.readyWait>0||R.resolveWith(d,[r]))}}),r.ready.then=R.then;function S(){d.removeEventListener("DOMContentLoaded",S), -a.removeEventListener("load",S),r.ready()}"complete"===d.readyState||"loading"!==d.readyState&&!d.documentElement.doScroll?a.setTimeout(r.ready):(d.addEventListener("DOMContentLoaded",S),a.addEventListener("load",S));var T=function(a,b,c,d,e,f,g){var h=0,i=a.length,j=null==c;if("object"===r.type(c)){e=!0;for(h in c)T(a,b,h,c[h],!0,f,g)}else if(void 0!==d&&(e=!0,r.isFunction(d)||(g=!0),j&&(g?(b.call(a,d),b=null):(j=b,b=function(a,b,c){return j.call(r(a),c)})),b))for(;h1,null,!0)},removeData:function(a){return this.each(function(){X.remove(this,a)})}}),r.extend({queue:function(a,b,c){var d;if(a)return b=(b||"fx")+"queue",d=W.get(a,b),c&&(!d||Array.isArray(c)?d=W.access(a,b,r.makeArray(c)):d.push(c)),d||[]},dequeue:function(a,b){b=b||"fx";var c=r.queue(a,b),d=c.length,e=c.shift(),f=r._queueHooks(a,b),g=function(){r.dequeue(a,b)};"inprogress"===e&&(e=c.shift(),d--),e&&("fx"===b&&c.unshift("inprogress"),delete f.stop,e.call(a,g,f)),!d&&f&&f.empty.fire()},_queueHooks:function(a,b){var c=b+"queueHooks";return W.get(a,c)||W.access(a,c,{empty:r.Callbacks("once 
memory").add(function(){W.remove(a,[b+"queue",c])})})}}),r.fn.extend({queue:function(a,b){var c=2;return"string"!=typeof a&&(b=a,a="fx",c--),arguments.length\x20\t\r\n\f]+)/i,la=/^$|\/(?:java|ecma)script/i,ma={option:[1,""],thead:[1,"","
"],col:[2,"","
"],tr:[2,"","
"],td:[3,"","
"],_default:[0,"",""]};ma.optgroup=ma.option,ma.tbody=ma.tfoot=ma.colgroup=ma.caption=ma.thead,ma.th=ma.td;function na(a,b){var c;return c="undefined"!=typeof a.getElementsByTagName?a.getElementsByTagName(b||"*"):"undefined"!=typeof a.querySelectorAll?a.querySelectorAll(b||"*"):[],void 0===b||b&&B(a,b)?r.merge([a],c):c}function oa(a,b){for(var c=0,d=a.length;c-1)e&&e.push(f);else if(j=r.contains(f.ownerDocument,f),g=na(l.appendChild(f),"script"),j&&oa(g),c){k=0;while(f=g[k++])la.test(f.type||"")&&c.push(f)}return l}!function(){var a=d.createDocumentFragment(),b=a.appendChild(d.createElement("div")),c=d.createElement("input");c.setAttribute("type","radio"),c.setAttribute("checked","checked"),c.setAttribute("name","t"),b.appendChild(c),o.checkClone=b.cloneNode(!0).cloneNode(!0).lastChild.checked,b.innerHTML="",o.noCloneChecked=!!b.cloneNode(!0).lastChild.defaultValue}();var ra=d.documentElement,sa=/^key/,ta=/^(?:mouse|pointer|contextmenu|drag|drop)|click/,ua=/^([^.]*)(?:\.(.+)|)/;function va(){return!0}function wa(){return!1}function xa(){try{return d.activeElement}catch(a){}}function ya(a,b,c,d,e,f){var g,h;if("object"==typeof b){"string"!=typeof c&&(d=d||c,c=void 0);for(h in b)ya(a,h,c,d,b[h],f);return a}if(null==d&&null==e?(e=c,d=c=void 0):null==e&&("string"==typeof c?(e=d,d=void 0):(e=d,d=c,c=void 0)),e===!1)e=wa;else if(!e)return a;return 1===f&&(g=e,e=function(a){return r().off(a),g.apply(this,arguments)},e.guid=g.guid||(g.guid=r.guid++)),a.each(function(){r.event.add(this,b,e,d,c)})}r.event={global:{},add:function(a,b,c,d,e){var f,g,h,i,j,k,l,m,n,o,p,q=W.get(a);if(q){c.handler&&(f=c,c=f.handler,e=f.selector),e&&r.find.matchesSelector(ra,e),c.guid||(c.guid=r.guid++),(i=q.events)||(i=q.events={}),(g=q.handle)||(g=q.handle=function(b){return"undefined"!=typeof r&&r.event.triggered!==b.type?r.event.dispatch.apply(a,arguments):void 0}),b=(b||"").match(L)||[""],j=b.length;while(j--)h=ua.exec(b[j])||[],n=p=h[1],o=(h[2]||"").split(".").sort(),n&&(l=r.event.special[n]||{},n=(e?l.delegateType:l.bindType)||n,l=r.event.special[n]||{},k=r.extend({type:n,origType:p,data:d,handler:c,guid:c.guid,selector:e,needsContext:e&&r.expr.match.needsContext.test(e),namespace:o.join(".")},f),(m=i[n])||(m=i[n]=[],m.delegateCount=0,l.setup&&l.setup.call(a,d,o,g)!==!1||a.addEventListener&&a.addEventListener(n,g)),l.add&&(l.add.call(a,k),k.handler.guid||(k.handler.guid=c.guid)),e?m.splice(m.delegateCount++,0,k):m.push(k),r.event.global[n]=!0)}},remove:function(a,b,c,d,e){var f,g,h,i,j,k,l,m,n,o,p,q=W.hasData(a)&&W.get(a);if(q&&(i=q.events)){b=(b||"").match(L)||[""],j=b.length;while(j--)if(h=ua.exec(b[j])||[],n=p=h[1],o=(h[2]||"").split(".").sort(),n){l=r.event.special[n]||{},n=(d?l.delegateType:l.bindType)||n,m=i[n]||[],h=h[2]&&new RegExp("(^|\\.)"+o.join("\\.(?:.*\\.|)")+"(\\.|$)"),g=f=m.length;while(f--)k=m[f],!e&&p!==k.origType||c&&c.guid!==k.guid||h&&!h.test(k.namespace)||d&&d!==k.selector&&("**"!==d||!k.selector)||(m.splice(f,1),k.selector&&m.delegateCount--,l.remove&&l.remove.call(a,k));g&&!m.length&&(l.teardown&&l.teardown.call(a,o,q.handle)!==!1||r.removeEvent(a,n,q.handle),delete i[n])}else for(n in i)r.event.remove(a,n+b[j],c,d,!0);r.isEmptyObject(i)&&W.remove(a,"handle events")}},dispatch:function(a){var b=r.event.fix(a),c,d,e,f,g,h,i=new 
Array(arguments.length),j=(W.get(this,"events")||{})[b.type]||[],k=r.event.special[b.type]||{};for(i[0]=b,c=1;c=1))for(;j!==this;j=j.parentNode||this)if(1===j.nodeType&&("click"!==a.type||j.disabled!==!0)){for(f=[],g={},c=0;c-1:r.find(e,this,null,[j]).length),g[e]&&f.push(d);f.length&&h.push({elem:j,handlers:f})}return j=this,i\x20\t\r\n\f]*)[^>]*)\/>/gi,Aa=/\s*$/g;function Ea(a,b){return B(a,"table")&&B(11!==b.nodeType?b:b.firstChild,"tr")?r(">tbody",a)[0]||a:a}function Fa(a){return a.type=(null!==a.getAttribute("type"))+"/"+a.type,a}function Ga(a){var b=Ca.exec(a.type);return b?a.type=b[1]:a.removeAttribute("type"),a}function Ha(a,b){var c,d,e,f,g,h,i,j;if(1===b.nodeType){if(W.hasData(a)&&(f=W.access(a),g=W.set(b,f),j=f.events)){delete g.handle,g.events={};for(e in j)for(c=0,d=j[e].length;c1&&"string"==typeof q&&!o.checkClone&&Ba.test(q))return a.each(function(e){var f=a.eq(e);s&&(b[0]=q.call(this,e,f.html())),Ja(f,b,c,d)});if(m&&(e=qa(b,a[0].ownerDocument,!1,a,d),f=e.firstChild,1===e.childNodes.length&&(e=f),f||d)){for(h=r.map(na(e,"script"),Fa),i=h.length;l")},clone:function(a,b,c){var d,e,f,g,h=a.cloneNode(!0),i=r.contains(a.ownerDocument,a);if(!(o.noCloneChecked||1!==a.nodeType&&11!==a.nodeType||r.isXMLDoc(a)))for(g=na(h),f=na(a),d=0,e=f.length;d0&&oa(g,!i&&na(a,"script")),h},cleanData:function(a){for(var b,c,d,e=r.event.special,f=0;void 0!==(c=a[f]);f++)if(U(c)){if(b=c[W.expando]){if(b.events)for(d in b.events)e[d]?r.event.remove(c,d):r.removeEvent(c,d,b.handle);c[W.expando]=void 0}c[X.expando]&&(c[X.expando]=void 0)}}}),r.fn.extend({detach:function(a){return Ka(this,a,!0)},remove:function(a){return Ka(this,a)},text:function(a){return T(this,function(a){return void 0===a?r.text(this):this.empty().each(function(){1!==this.nodeType&&11!==this.nodeType&&9!==this.nodeType||(this.textContent=a)})},null,a,arguments.length)},append:function(){return Ja(this,arguments,function(a){if(1===this.nodeType||11===this.nodeType||9===this.nodeType){var b=Ea(this,a);b.appendChild(a)}})},prepend:function(){return Ja(this,arguments,function(a){if(1===this.nodeType||11===this.nodeType||9===this.nodeType){var b=Ea(this,a);b.insertBefore(a,b.firstChild)}})},before:function(){return Ja(this,arguments,function(a){this.parentNode&&this.parentNode.insertBefore(a,this)})},after:function(){return Ja(this,arguments,function(a){this.parentNode&&this.parentNode.insertBefore(a,this.nextSibling)})},empty:function(){for(var a,b=0;null!=(a=this[b]);b++)1===a.nodeType&&(r.cleanData(na(a,!1)),a.textContent="");return this},clone:function(a,b){return a=null!=a&&a,b=null==b?a:b,this.map(function(){return r.clone(this,a,b)})},html:function(a){return T(this,function(a){var b=this[0]||{},c=0,d=this.length;if(void 0===a&&1===b.nodeType)return b.innerHTML;if("string"==typeof a&&!Aa.test(a)&&!ma[(ka.exec(a)||["",""])[1].toLowerCase()]){a=r.htmlPrefilter(a);try{for(;c1)}});function _a(a,b,c,d,e){return new _a.prototype.init(a,b,c,d,e)}r.Tween=_a,_a.prototype={constructor:_a,init:function(a,b,c,d,e,f){this.elem=a,this.prop=c,this.easing=e||r.easing._default,this.options=b,this.start=this.now=this.cur(),this.end=d,this.unit=f||(r.cssNumber[c]?"":"px")},cur:function(){var a=_a.propHooks[this.prop];return a&&a.get?a.get(this):_a.propHooks._default.get(this)},run:function(a){var b,c=_a.propHooks[this.prop];return 
this.options.duration?this.pos=b=r.easing[this.easing](a,this.options.duration*a,0,1,this.options.duration):this.pos=b=a,this.now=(this.end-this.start)*b+this.start,this.options.step&&this.options.step.call(this.elem,this.now,this),c&&c.set?c.set(this):_a.propHooks._default.set(this),this}},_a.prototype.init.prototype=_a.prototype,_a.propHooks={_default:{get:function(a){var b;return 1!==a.elem.nodeType||null!=a.elem[a.prop]&&null==a.elem.style[a.prop]?a.elem[a.prop]:(b=r.css(a.elem,a.prop,""),b&&"auto"!==b?b:0)},set:function(a){r.fx.step[a.prop]?r.fx.step[a.prop](a):1!==a.elem.nodeType||null==a.elem.style[r.cssProps[a.prop]]&&!r.cssHooks[a.prop]?a.elem[a.prop]=a.now:r.style(a.elem,a.prop,a.now+a.unit)}}},_a.propHooks.scrollTop=_a.propHooks.scrollLeft={set:function(a){a.elem.nodeType&&a.elem.parentNode&&(a.elem[a.prop]=a.now)}},r.easing={linear:function(a){return a},swing:function(a){return.5-Math.cos(a*Math.PI)/2},_default:"swing"},r.fx=_a.prototype.init,r.fx.step={};var ab,bb,cb=/^(?:toggle|show|hide)$/,db=/queueHooks$/;function eb(){bb&&(d.hidden===!1&&a.requestAnimationFrame?a.requestAnimationFrame(eb):a.setTimeout(eb,r.fx.interval),r.fx.tick())}function fb(){return a.setTimeout(function(){ab=void 0}),ab=r.now()}function gb(a,b){var c,d=0,e={height:a};for(b=b?1:0;d<4;d+=2-b)c=ca[d],e["margin"+c]=e["padding"+c]=a;return b&&(e.opacity=e.width=a),e}function hb(a,b,c){for(var d,e=(kb.tweeners[b]||[]).concat(kb.tweeners["*"]),f=0,g=e.length;f1)},removeAttr:function(a){return this.each(function(){r.removeAttr(this,a)})}}),r.extend({attr:function(a,b,c){var d,e,f=a.nodeType;if(3!==f&&8!==f&&2!==f)return"undefined"==typeof a.getAttribute?r.prop(a,b,c):(1===f&&r.isXMLDoc(a)||(e=r.attrHooks[b.toLowerCase()]||(r.expr.match.bool.test(b)?lb:void 0)),void 0!==c?null===c?void r.removeAttr(a,b):e&&"set"in e&&void 0!==(d=e.set(a,c,b))?d:(a.setAttribute(b,c+""),c):e&&"get"in e&&null!==(d=e.get(a,b))?d:(d=r.find.attr(a,b), -null==d?void 0:d))},attrHooks:{type:{set:function(a,b){if(!o.radioValue&&"radio"===b&&B(a,"input")){var c=a.value;return a.setAttribute("type",b),c&&(a.value=c),b}}}},removeAttr:function(a,b){var c,d=0,e=b&&b.match(L);if(e&&1===a.nodeType)while(c=e[d++])a.removeAttribute(c)}}),lb={set:function(a,b,c){return b===!1?r.removeAttr(a,c):a.setAttribute(c,c),c}},r.each(r.expr.match.bool.source.match(/\w+/g),function(a,b){var c=mb[b]||r.find.attr;mb[b]=function(a,b,d){var e,f,g=b.toLowerCase();return d||(f=mb[g],mb[g]=e,e=null!=c(a,b,d)?g:null,mb[g]=f),e}});var nb=/^(?:input|select|textarea|button)$/i,ob=/^(?:a|area)$/i;r.fn.extend({prop:function(a,b){return T(this,r.prop,a,b,arguments.length>1)},removeProp:function(a){return this.each(function(){delete this[r.propFix[a]||a]})}}),r.extend({prop:function(a,b,c){var d,e,f=a.nodeType;if(3!==f&&8!==f&&2!==f)return 1===f&&r.isXMLDoc(a)||(b=r.propFix[b]||b,e=r.propHooks[b]),void 0!==c?e&&"set"in e&&void 0!==(d=e.set(a,c,b))?d:a[b]=c:e&&"get"in e&&null!==(d=e.get(a,b))?d:a[b]},propHooks:{tabIndex:{get:function(a){var b=r.find.attr(a,"tabindex");return b?parseInt(b,10):nb.test(a.nodeName)||ob.test(a.nodeName)&&a.href?0:-1}}},propFix:{"for":"htmlFor","class":"className"}}),o.optSelected||(r.propHooks.selected={get:function(a){var b=a.parentNode;return b&&b.parentNode&&b.parentNode.selectedIndex,null},set:function(a){var 
b=a.parentNode;b&&(b.selectedIndex,b.parentNode&&b.parentNode.selectedIndex)}}),r.each(["tabIndex","readOnly","maxLength","cellSpacing","cellPadding","rowSpan","colSpan","useMap","frameBorder","contentEditable"],function(){r.propFix[this.toLowerCase()]=this});function pb(a){var b=a.match(L)||[];return b.join(" ")}function qb(a){return a.getAttribute&&a.getAttribute("class")||""}r.fn.extend({addClass:function(a){var b,c,d,e,f,g,h,i=0;if(r.isFunction(a))return this.each(function(b){r(this).addClass(a.call(this,b,qb(this)))});if("string"==typeof a&&a){b=a.match(L)||[];while(c=this[i++])if(e=qb(c),d=1===c.nodeType&&" "+pb(e)+" "){g=0;while(f=b[g++])d.indexOf(" "+f+" ")<0&&(d+=f+" ");h=pb(d),e!==h&&c.setAttribute("class",h)}}return this},removeClass:function(a){var b,c,d,e,f,g,h,i=0;if(r.isFunction(a))return this.each(function(b){r(this).removeClass(a.call(this,b,qb(this)))});if(!arguments.length)return this.attr("class","");if("string"==typeof a&&a){b=a.match(L)||[];while(c=this[i++])if(e=qb(c),d=1===c.nodeType&&" "+pb(e)+" "){g=0;while(f=b[g++])while(d.indexOf(" "+f+" ")>-1)d=d.replace(" "+f+" "," ");h=pb(d),e!==h&&c.setAttribute("class",h)}}return this},toggleClass:function(a,b){var c=typeof a;return"boolean"==typeof b&&"string"===c?b?this.addClass(a):this.removeClass(a):r.isFunction(a)?this.each(function(c){r(this).toggleClass(a.call(this,c,qb(this),b),b)}):this.each(function(){var b,d,e,f;if("string"===c){d=0,e=r(this),f=a.match(L)||[];while(b=f[d++])e.hasClass(b)?e.removeClass(b):e.addClass(b)}else void 0!==a&&"boolean"!==c||(b=qb(this),b&&W.set(this,"__className__",b),this.setAttribute&&this.setAttribute("class",b||a===!1?"":W.get(this,"__className__")||""))})},hasClass:function(a){var b,c,d=0;b=" "+a+" ";while(c=this[d++])if(1===c.nodeType&&(" "+pb(qb(c))+" ").indexOf(b)>-1)return!0;return!1}});var rb=/\r/g;r.fn.extend({val:function(a){var b,c,d,e=this[0];{if(arguments.length)return d=r.isFunction(a),this.each(function(c){var e;1===this.nodeType&&(e=d?a.call(this,c,r(this).val()):a,null==e?e="":"number"==typeof e?e+="":Array.isArray(e)&&(e=r.map(e,function(a){return null==a?"":a+""})),b=r.valHooks[this.type]||r.valHooks[this.nodeName.toLowerCase()],b&&"set"in b&&void 0!==b.set(this,e,"value")||(this.value=e))});if(e)return b=r.valHooks[e.type]||r.valHooks[e.nodeName.toLowerCase()],b&&"get"in b&&void 0!==(c=b.get(e,"value"))?c:(c=e.value,"string"==typeof c?c.replace(rb,""):null==c?"":c)}}}),r.extend({valHooks:{option:{get:function(a){var b=r.find.attr(a,"value");return null!=b?b:pb(r.text(a))}},select:{get:function(a){var b,c,d,e=a.options,f=a.selectedIndex,g="select-one"===a.type,h=g?null:[],i=g?f+1:e.length;for(d=f<0?i:g?f:0;d-1)&&(c=!0);return c||(a.selectedIndex=-1),f}}}}),r.each(["radio","checkbox"],function(){r.valHooks[this]={set:function(a,b){if(Array.isArray(b))return a.checked=r.inArray(r(a).val(),b)>-1}},o.checkOn||(r.valHooks[this].get=function(a){return null===a.getAttribute("value")?"on":a.value})});var sb=/^(?:focusinfocus|focusoutblur)$/;r.extend(r.event,{trigger:function(b,c,e,f){var g,h,i,j,k,m,n,o=[e||d],p=l.call(b,"type")?b.type:b,q=l.call(b,"namespace")?b.namespace.split("."):[];if(h=i=e=e||d,3!==e.nodeType&&8!==e.nodeType&&!sb.test(p+r.event.triggered)&&(p.indexOf(".")>-1&&(q=p.split("."),p=q.shift(),q.sort()),k=p.indexOf(":")<0&&"on"+p,b=b[r.expando]?b:new r.Event(p,"object"==typeof b&&b),b.isTrigger=f?2:3,b.namespace=q.join("."),b.rnamespace=b.namespace?new RegExp("(^|\\.)"+q.join("\\.(?:.*\\.|)")+"(\\.|$)"):null,b.result=void 
0,b.target||(b.target=e),c=null==c?[b]:r.makeArray(c,[b]),n=r.event.special[p]||{},f||!n.trigger||n.trigger.apply(e,c)!==!1)){if(!f&&!n.noBubble&&!r.isWindow(e)){for(j=n.delegateType||p,sb.test(j+p)||(h=h.parentNode);h;h=h.parentNode)o.push(h),i=h;i===(e.ownerDocument||d)&&o.push(i.defaultView||i.parentWindow||a)}g=0;while((h=o[g++])&&!b.isPropagationStopped())b.type=g>1?j:n.bindType||p,m=(W.get(h,"events")||{})[b.type]&&W.get(h,"handle"),m&&m.apply(h,c),m=k&&h[k],m&&m.apply&&U(h)&&(b.result=m.apply(h,c),b.result===!1&&b.preventDefault());return b.type=p,f||b.isDefaultPrevented()||n._default&&n._default.apply(o.pop(),c)!==!1||!U(e)||k&&r.isFunction(e[p])&&!r.isWindow(e)&&(i=e[k],i&&(e[k]=null),r.event.triggered=p,e[p](),r.event.triggered=void 0,i&&(e[k]=i)),b.result}},simulate:function(a,b,c){var d=r.extend(new r.Event,c,{type:a,isSimulated:!0});r.event.trigger(d,null,b)}}),r.fn.extend({trigger:function(a,b){return this.each(function(){r.event.trigger(a,b,this)})},triggerHandler:function(a,b){var c=this[0];if(c)return r.event.trigger(a,b,c,!0)}}),r.each("blur focus focusin focusout resize scroll click dblclick mousedown mouseup mousemove mouseover mouseout mouseenter mouseleave change select submit keydown keypress keyup contextmenu".split(" "),function(a,b){r.fn[b]=function(a,c){return arguments.length>0?this.on(b,null,a,c):this.trigger(b)}}),r.fn.extend({hover:function(a,b){return this.mouseenter(a).mouseleave(b||a)}}),o.focusin="onfocusin"in a,o.focusin||r.each({focus:"focusin",blur:"focusout"},function(a,b){var c=function(a){r.event.simulate(b,a.target,r.event.fix(a))};r.event.special[b]={setup:function(){var d=this.ownerDocument||this,e=W.access(d,b);e||d.addEventListener(a,c,!0),W.access(d,b,(e||0)+1)},teardown:function(){var d=this.ownerDocument||this,e=W.access(d,b)-1;e?W.access(d,b,e):(d.removeEventListener(a,c,!0),W.remove(d,b))}}});var tb=a.location,ub=r.now(),vb=/\?/;r.parseXML=function(b){var c;if(!b||"string"!=typeof b)return null;try{c=(new a.DOMParser).parseFromString(b,"text/xml")}catch(d){c=void 0}return c&&!c.getElementsByTagName("parsererror").length||r.error("Invalid XML: "+b),c};var wb=/\[\]$/,xb=/\r?\n/g,yb=/^(?:submit|button|image|reset|file)$/i,zb=/^(?:input|select|textarea|keygen)/i;function Ab(a,b,c,d){var e;if(Array.isArray(b))r.each(b,function(b,e){c||wb.test(a)?d(a,e):Ab(a+"["+("object"==typeof e&&null!=e?b:"")+"]",e,c,d)});else if(c||"object"!==r.type(b))d(a,b);else for(e in b)Ab(a+"["+e+"]",b[e],c,d)}r.param=function(a,b){var c,d=[],e=function(a,b){var c=r.isFunction(b)?b():b;d[d.length]=encodeURIComponent(a)+"="+encodeURIComponent(null==c?"":c)};if(Array.isArray(a)||a.jquery&&!r.isPlainObject(a))r.each(a,function(){e(this.name,this.value)});else for(c in a)Ab(c,a[c],b,e);return d.join("&")},r.fn.extend({serialize:function(){return r.param(this.serializeArray())},serializeArray:function(){return this.map(function(){var a=r.prop(this,"elements");return a?r.makeArray(a):this}).filter(function(){var a=this.type;return this.name&&!r(this).is(":disabled")&&zb.test(this.nodeName)&&!yb.test(a)&&(this.checked||!ja.test(a))}).map(function(a,b){var c=r(this).val();return null==c?null:Array.isArray(c)?r.map(c,function(a){return{name:b.name,value:a.replace(xb,"\r\n")}}):{name:b.name,value:c.replace(xb,"\r\n")}}).get()}});var Bb=/%20/g,Cb=/#.*$/,Db=/([?&])_=[^&]*/,Eb=/^(.*?):[ \t]*([^\r\n]*)$/gm,Fb=/^(?:about|app|app-storage|.+-extension|file|res|widget):$/,Gb=/^(?:GET|HEAD)$/,Hb=/^\/\//,Ib={},Jb={},Kb="*/".concat("*"),Lb=d.createElement("a");Lb.href=tb.href;function 
Mb(a){return function(b,c){"string"!=typeof b&&(c=b,b="*");var d,e=0,f=b.toLowerCase().match(L)||[];if(r.isFunction(c))while(d=f[e++])"+"===d[0]?(d=d.slice(1)||"*",(a[d]=a[d]||[]).unshift(c)):(a[d]=a[d]||[]).push(c)}}function Nb(a,b,c,d){var e={},f=a===Jb;function g(h){var i;return e[h]=!0,r.each(a[h]||[],function(a,h){var j=h(b,c,d);return"string"!=typeof j||f||e[j]?f?!(i=j):void 0:(b.dataTypes.unshift(j),g(j),!1)}),i}return g(b.dataTypes[0])||!e["*"]&&g("*")}function Ob(a,b){var c,d,e=r.ajaxSettings.flatOptions||{};for(c in b)void 0!==b[c]&&((e[c]?a:d||(d={}))[c]=b[c]);return d&&r.extend(!0,a,d),a}function Pb(a,b,c){var d,e,f,g,h=a.contents,i=a.dataTypes;while("*"===i[0])i.shift(),void 0===d&&(d=a.mimeType||b.getResponseHeader("Content-Type"));if(d)for(e in h)if(h[e]&&h[e].test(d)){i.unshift(e);break}if(i[0]in c)f=i[0];else{for(e in c){if(!i[0]||a.converters[e+" "+i[0]]){f=e;break}g||(g=e)}f=f||g}if(f)return f!==i[0]&&i.unshift(f),c[f]}function Qb(a,b,c,d){var e,f,g,h,i,j={},k=a.dataTypes.slice();if(k[1])for(g in a.converters)j[g.toLowerCase()]=a.converters[g];f=k.shift();while(f)if(a.responseFields[f]&&(c[a.responseFields[f]]=b),!i&&d&&a.dataFilter&&(b=a.dataFilter(b,a.dataType)),i=f,f=k.shift())if("*"===f)f=i;else if("*"!==i&&i!==f){if(g=j[i+" "+f]||j["* "+f],!g)for(e in j)if(h=e.split(" "),h[1]===f&&(g=j[i+" "+h[0]]||j["* "+h[0]])){g===!0?g=j[e]:j[e]!==!0&&(f=h[0],k.unshift(h[1]));break}if(g!==!0)if(g&&a["throws"])b=g(b);else try{b=g(b)}catch(l){return{state:"parsererror",error:g?l:"No conversion from "+i+" to "+f}}}return{state:"success",data:b}}r.extend({active:0,lastModified:{},etag:{},ajaxSettings:{url:tb.href,type:"GET",isLocal:Fb.test(tb.protocol),global:!0,processData:!0,async:!0,contentType:"application/x-www-form-urlencoded; charset=UTF-8",accepts:{"*":Kb,text:"text/plain",html:"text/html",xml:"application/xml, text/xml",json:"application/json, text/javascript"},contents:{xml:/\bxml\b/,html:/\bhtml/,json:/\bjson\b/},responseFields:{xml:"responseXML",text:"responseText",json:"responseJSON"},converters:{"* text":String,"text html":!0,"text json":JSON.parse,"text xml":r.parseXML},flatOptions:{url:!0,context:!0}},ajaxSetup:function(a,b){return b?Ob(Ob(a,r.ajaxSettings),b):Ob(r.ajaxSettings,a)},ajaxPrefilter:Mb(Ib),ajaxTransport:Mb(Jb),ajax:function(b,c){"object"==typeof b&&(c=b,b=void 0),c=c||{};var e,f,g,h,i,j,k,l,m,n,o=r.ajaxSetup({},c),p=o.context||o,q=o.context&&(p.nodeType||p.jquery)?r(p):r.event,s=r.Deferred(),t=r.Callbacks("once memory"),u=o.statusCode||{},v={},w={},x="canceled",y={readyState:0,getResponseHeader:function(a){var b;if(k){if(!h){h={};while(b=Eb.exec(g))h[b[1].toLowerCase()]=b[2]}b=h[a.toLowerCase()]}return null==b?null:b},getAllResponseHeaders:function(){return k?g:null},setRequestHeader:function(a,b){return null==k&&(a=w[a.toLowerCase()]=w[a.toLowerCase()]||a,v[a]=b),this},overrideMimeType:function(a){return null==k&&(o.mimeType=a),this},statusCode:function(a){var b;if(a)if(k)y.always(a[y.status]);else for(b in a)u[b]=[u[b],a[b]];return this},abort:function(a){var b=a||x;return e&&e.abort(b),A(0,b),this}};if(s.promise(y),o.url=((b||o.url||tb.href)+"").replace(Hb,tb.protocol+"//"),o.type=c.method||c.type||o.method||o.type,o.dataTypes=(o.dataType||"*").toLowerCase().match(L)||[""],null==o.crossDomain){j=d.createElement("a");try{j.href=o.url,j.href=j.href,o.crossDomain=Lb.protocol+"//"+Lb.host!=j.protocol+"//"+j.host}catch(z){o.crossDomain=!0}}if(o.data&&o.processData&&"string"!=typeof o.data&&(o.data=r.param(o.data,o.traditional)),Nb(Ib,o,c,y),k)return 
y;l=r.event&&o.global,l&&0===r.active++&&r.event.trigger("ajaxStart"),o.type=o.type.toUpperCase(),o.hasContent=!Gb.test(o.type),f=o.url.replace(Cb,""),o.hasContent?o.data&&o.processData&&0===(o.contentType||"").indexOf("application/x-www-form-urlencoded")&&(o.data=o.data.replace(Bb,"+")):(n=o.url.slice(f.length),o.data&&(f+=(vb.test(f)?"&":"?")+o.data,delete o.data),o.cache===!1&&(f=f.replace(Db,"$1"),n=(vb.test(f)?"&":"?")+"_="+ub++ +n),o.url=f+n),o.ifModified&&(r.lastModified[f]&&y.setRequestHeader("If-Modified-Since",r.lastModified[f]),r.etag[f]&&y.setRequestHeader("If-None-Match",r.etag[f])),(o.data&&o.hasContent&&o.contentType!==!1||c.contentType)&&y.setRequestHeader("Content-Type",o.contentType),y.setRequestHeader("Accept",o.dataTypes[0]&&o.accepts[o.dataTypes[0]]?o.accepts[o.dataTypes[0]]+("*"!==o.dataTypes[0]?", "+Kb+"; q=0.01":""):o.accepts["*"]);for(m in o.headers)y.setRequestHeader(m,o.headers[m]);if(o.beforeSend&&(o.beforeSend.call(p,y,o)===!1||k))return y.abort();if(x="abort",t.add(o.complete),y.done(o.success),y.fail(o.error),e=Nb(Jb,o,c,y)){if(y.readyState=1,l&&q.trigger("ajaxSend",[y,o]),k)return y;o.async&&o.timeout>0&&(i=a.setTimeout(function(){y.abort("timeout")},o.timeout));try{k=!1,e.send(v,A)}catch(z){if(k)throw z;A(-1,z)}}else A(-1,"No Transport");function A(b,c,d,h){var j,m,n,v,w,x=c;k||(k=!0,i&&a.clearTimeout(i),e=void 0,g=h||"",y.readyState=b>0?4:0,j=b>=200&&b<300||304===b,d&&(v=Pb(o,y,d)),v=Qb(o,v,y,j),j?(o.ifModified&&(w=y.getResponseHeader("Last-Modified"),w&&(r.lastModified[f]=w),w=y.getResponseHeader("etag"),w&&(r.etag[f]=w)),204===b||"HEAD"===o.type?x="nocontent":304===b?x="notmodified":(x=v.state,m=v.data,n=v.error,j=!n)):(n=x,!b&&x||(x="error",b<0&&(b=0))),y.status=b,y.statusText=(c||x)+"",j?s.resolveWith(p,[m,x,y]):s.rejectWith(p,[y,x,n]),y.statusCode(u),u=void 0,l&&q.trigger(j?"ajaxSuccess":"ajaxError",[y,o,j?m:n]),t.fireWith(p,[y,x]),l&&(q.trigger("ajaxComplete",[y,o]),--r.active||r.event.trigger("ajaxStop")))}return y},getJSON:function(a,b,c){return r.get(a,b,c,"json")},getScript:function(a,b){return r.get(a,void 0,b,"script")}}),r.each(["get","post"],function(a,b){r[b]=function(a,c,d,e){return r.isFunction(c)&&(e=e||d,d=c,c=void 0),r.ajax(r.extend({url:a,type:b,dataType:e,data:c,success:d},r.isPlainObject(a)&&a))}}),r._evalUrl=function(a){return r.ajax({url:a,type:"GET",dataType:"script",cache:!0,async:!1,global:!1,"throws":!0})},r.fn.extend({wrapAll:function(a){var b;return this[0]&&(r.isFunction(a)&&(a=a.call(this[0])),b=r(a,this[0].ownerDocument).eq(0).clone(!0),this[0].parentNode&&b.insertBefore(this[0]),b.map(function(){var a=this;while(a.firstElementChild)a=a.firstElementChild;return a}).append(this)),this},wrapInner:function(a){return r.isFunction(a)?this.each(function(b){r(this).wrapInner(a.call(this,b))}):this.each(function(){var b=r(this),c=b.contents();c.length?c.wrapAll(a):b.append(a)})},wrap:function(a){var b=r.isFunction(a);return this.each(function(c){r(this).wrapAll(b?a.call(this,c):a)})},unwrap:function(a){return this.parent(a).not("body").each(function(){r(this).replaceWith(this.childNodes)}),this}}),r.expr.pseudos.hidden=function(a){return!r.expr.pseudos.visible(a)},r.expr.pseudos.visible=function(a){return!!(a.offsetWidth||a.offsetHeight||a.getClientRects().length)},r.ajaxSettings.xhr=function(){try{return new a.XMLHttpRequest}catch(b){}};var Rb={0:200,1223:204},Sb=r.ajaxSettings.xhr();o.cors=!!Sb&&"withCredentials"in Sb,o.ajax=Sb=!!Sb,r.ajaxTransport(function(b){var c,d;if(o.cors||Sb&&!b.crossDomain)return{send:function(e,f){var 
g,h=b.xhr();if(h.open(b.type,b.url,b.async,b.username,b.password),b.xhrFields)for(g in b.xhrFields)h[g]=b.xhrFields[g];b.mimeType&&h.overrideMimeType&&h.overrideMimeType(b.mimeType),b.crossDomain||e["X-Requested-With"]||(e["X-Requested-With"]="XMLHttpRequest");for(g in e)h.setRequestHeader(g,e[g]);c=function(a){return function(){c&&(c=d=h.onload=h.onerror=h.onabort=h.onreadystatechange=null,"abort"===a?h.abort():"error"===a?"number"!=typeof h.status?f(0,"error"):f(h.status,h.statusText):f(Rb[h.status]||h.status,h.statusText,"text"!==(h.responseType||"text")||"string"!=typeof h.responseText?{binary:h.response}:{text:h.responseText},h.getAllResponseHeaders()))}},h.onload=c(),d=h.onerror=c("error"),void 0!==h.onabort?h.onabort=d:h.onreadystatechange=function(){4===h.readyState&&a.setTimeout(function(){c&&d()})},c=c("abort");try{h.send(b.hasContent&&b.data||null)}catch(i){if(c)throw i}},abort:function(){c&&c()}}}),r.ajaxPrefilter(function(a){a.crossDomain&&(a.contents.script=!1)}),r.ajaxSetup({accepts:{script:"text/javascript, application/javascript, application/ecmascript, application/x-ecmascript"},contents:{script:/\b(?:java|ecma)script\b/},converters:{"text script":function(a){return r.globalEval(a),a}}}),r.ajaxPrefilter("script",function(a){void 0===a.cache&&(a.cache=!1),a.crossDomain&&(a.type="GET")}),r.ajaxTransport("script",function(a){if(a.crossDomain){var b,c;return{send:function(e,f){b=r(" - - - - - - - - - - -
-<!-- deleted demo page: its HTML markup was lost in extraction; only the recoverable visible text is kept below -->
-PaddleSpeech Serving简介
-PaddleSpeech 是基于飞桨 PaddlePaddle 的语音方向的开源模型库,用于语音和音频中的各种关键任务的开发。PaddleSpeech Serving是基于python + fastapi 的语音算法模型的C/S类型后端服务,旨在统一paddle speech下的各语音算子来对外提供后端服务。
-产品体验
-WebSocket URL:
-识别中, 秒后自动停止识别
-此处显示识别结果
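The page text preserved above belongs to the browser front end of the streaming recognition demo: it asks for a WebSocket URL, streams audio to the server, and shows the recognized text in a result area. For the same round trip without a browser, here is a minimal Python sketch. It is only an illustration: the endpoint path, the 16 kHz/16-bit mono PCM framing, and the idea that the server replies with transcript text frames are assumptions rather than the documented protocol, and a real server may also require start/stop handshake messages that this sketch omits.

```python
# Minimal streaming-ASR WebSocket client sketch (NOT the documented protocol).
# Assumptions: the endpoint URL below, 16 kHz 16-bit mono PCM audio, and that
# the server answers with text frames carrying the partial transcript. A real
# server may additionally require start/stop handshake messages.
import asyncio
import wave

import websockets  # third-party: pip install websockets


async def stream_wav(url: str, wav_path: str, chunk_ms: int = 100) -> None:
    with wave.open(wav_path, "rb") as wav:
        frames_per_chunk = int(wav.getframerate() * chunk_ms / 1000)
        async with websockets.connect(url) as ws:
            while True:
                chunk = wav.readframes(frames_per_chunk)
                if not chunk:
                    break
                await ws.send(chunk)                  # raw PCM as a binary frame
                await asyncio.sleep(chunk_ms / 1000)  # pace roughly like a live mic
            # Print whatever transcript frames the server sends back.
            try:
                while True:
                    print(await asyncio.wait_for(ws.recv(), timeout=2.0))
            except asyncio.TimeoutError:
                pass


if __name__ == "__main__":
    # Hypothetical URL; use the WebSocket URL the demo page expects.
    asyncio.run(stream_wav("ws://127.0.0.1:8090/paddlespeech/asr/streaming", "input_16k.wav"))
```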
- - - - diff --git a/demos/streaming_tts_server/README.md b/demos/streaming_tts_server/README.md index 860d9a9783fa46fe992cdfd46291ba8b04003442..cbea6bf774140f16fa30b09b0c30cd798782c5fa 100644 --- a/demos/streaming_tts_server/README.md +++ b/demos/streaming_tts_server/README.md @@ -119,12 +119,9 @@ The configuration file can be found in `conf/tts_online_application.yaml`. - `protocol`: Service protocol, choices: [http, websocket], default: http. - `input`: (required): Input text to generate. - `spk_id`: Speaker id for multi-speaker text to speech. Default: 0 - - `speed`: Audio speed, the value should be set between 0 and 3. Default: 1.0 - - `volume`: Audio volume, the value should be set between 0 and 3. Default: 1.0 - - `sample_rate`: Sampling rate, choices: [0, 8000, 16000], the default is the same as the model. Default: 0 - - `output`: Output wave filepath. Default: None, which means not to save the audio to the local. + - `output`: Client output wave filepath. Default: None, which means not to save the audio to the local. - `play`: Whether to play audio, play while synthesizing, default value: False, which means not playing. **Playing audio needs to rely on the pyaudio library**. - - `spk_id, speed, volume, sample_rate` do not take effect in streaming speech synthesis service temporarily. + - Currently, only the single-speaker model is supported in the code, so `spk_id` does not take effect. Streaming TTS does not support changing sample rate, variable speed and volume. Output: ```bash @@ -150,9 +147,6 @@ The configuration file can be found in `conf/tts_online_application.yaml`. port=8092, protocol="http", spk_id=0, - speed=1.0, - volume=1.0, - sample_rate=0, output="./output.wav", play=False) @@ -256,12 +250,10 @@ The configuration file can be found in `conf/tts_online_application.yaml`. - `protocol`: Service protocol, choices: [http, websocket], default: http. - `input`: (required): Input text to generate. - `spk_id`: Speaker id for multi-speaker text to speech. Default: 0 - - `speed`: Audio speed, the value should be set between 0 and 3. Default: 1.0 - - `volume`: Audio volume, the value should be set between 0 and 3. Default: 1.0 - - `sample_rate`: Sampling rate, choices: [0, 8000, 16000], the default is the same as the model. Default: 0 - - `output`: Output wave filepath. Default: None, which means not to save the audio to the local. + - `output`: Client output wave filepath. Default: None, which means not to save the audio to the local. - `play`: Whether to play audio, play while synthesizing, default value: False, which means not playing. **Playing audio needs to rely on the pyaudio library**. - - `spk_id, speed, volume, sample_rate` do not take effect in streaming speech synthesis service temporarily. + - Currently, only the single-speaker model is supported in the code, so `spk_id` does not take effect. Streaming TTS does not support changing sample rate, variable speed and volume. + Output: @@ -288,9 +280,6 @@ The configuration file can be found in `conf/tts_online_application.yaml`. 
port=8092, protocol="websocket", spk_id=0, - speed=1.0, - volume=1.0, - sample_rate=0, output="./output.wav", play=False) diff --git a/demos/streaming_tts_server/README_cn.md b/demos/streaming_tts_server/README_cn.md index 254ec26a2b6967611414fee7c2492804acbb8063..3cd2817096e41d0a5d8b808df34ceb50767206ae 100644 --- a/demos/streaming_tts_server/README_cn.md +++ b/demos/streaming_tts_server/README_cn.md @@ -118,12 +118,9 @@ - `protocol`: 服务协议,可选 [http, websocket], 默认: http。 - `input`: (必须输入): 待合成的文本。 - `spk_id`: 说话人 id,用于多说话人语音合成,默认值: 0。 - - `speed`: 音频速度,该值应设置在 0 到 3 之间。 默认值:1.0 - - `volume`: 音频音量,该值应设置在 0 到 3 之间。 默认值: 1.0 - - `sample_rate`: 采样率,可选 [0, 8000, 16000],默认值:0,表示与模型采样率相同 - - `output`: 输出音频的路径, 默认值:None,表示不保存音频到本地。 + - `output`: 客户端输出音频的路径, 默认值:None,表示不保存音频。 - `play`: 是否播放音频,边合成边播放, 默认值:False,表示不播放。**播放音频需要依赖pyaudio库**。 - - `spk_id, speed, volume, sample_rate` 在流式语音合成服务中暂时不生效。 + - 目前代码中只支持单说话人的模型,因此 spk_id 的选择并不生效。流式 TTS 不支持更换采样率,变速和变音量等功能。 输出: @@ -150,9 +147,6 @@ port=8092, protocol="http", spk_id=0, - speed=1.0, - volume=1.0, - sample_rate=0, output="./output.wav", play=False) @@ -256,12 +250,10 @@ - `protocol`: 服务协议,可选 [http, websocket], 默认: http。 - `input`: (必须输入): 待合成的文本。 - `spk_id`: 说话人 id,用于多说话人语音合成,默认值: 0。 - - `speed`: 音频速度,该值应设置在 0 到 3 之间。 默认值:1.0 - - `volume`: 音频音量,该值应设置在 0 到 3 之间。 默认值: 1.0 - - `sample_rate`: 采样率,可选 [0, 8000, 16000],默认值:0,表示与模型采样率相同 - - `output`: 输出音频的路径, 默认值:None,表示不保存音频到本地。 + - `output`: 客户端输出音频的路径, 默认值:None,表示不保存音频。 - `play`: 是否播放音频,边合成边播放, 默认值:False,表示不播放。**播放音频需要依赖pyaudio库**。 - - `spk_id, speed, volume, sample_rate` 在流式语音合成服务中暂时不生效。 + - 目前代码中只支持单说话人的模型,因此 spk_id 的选择并不生效。流式 TTS 不支持更换采样率,变速和变音量等功能。 + 输出: @@ -288,9 +280,6 @@ port=8092, protocol="websocket", spk_id=0, - speed=1.0, - volume=1.0, - sample_rate=0, output="./output.wav", play=False) diff --git a/docs/requirements.txt b/docs/requirements.txt index a5409a5448231dbb6793a6feacc2b6cd5ee15809..08a049c1be0089cb236cfd1439985ae2665b44fd 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -22,6 +22,7 @@ onnxruntime pandas paddlenlp paddlespeech_feat +Pillow>=9.0.0 praatio==5.0.0 pypinyin pypinyin-dict diff --git a/docs/source/install.md b/docs/source/install.md index 4291b87afa231164f9df30e7811be95ed129d30e..e8bd5adf91df09d68385ad50cbaa8aa90f4ced63 100644 --- a/docs/source/install.md +++ b/docs/source/install.md @@ -63,7 +63,7 @@ pip install paddlespeech -i https://pypi.tuna.tsinghua.edu.cn/simple ``` > If you encounter problem with downloading **nltk_data** while using paddlespeech, it maybe due to your poor network, we suggest you download the [nltk_data](https://paddlespeech.bj.bcebos.com/Parakeet/tools/nltk_data.tar.gz) provided by us, and extract it to your `${HOME}`. -> If you fail to install paddlespeech-ctcdecoders, it doesn't matter. +> If you fail to install paddlespeech-ctcdecoders, you only can not use deepspeech2 model inference. For other models, it doesn't matter. 
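The install notes above suggest manually fetching the prebuilt nltk_data archive and unpacking it under `${HOME}` when the automatic download is too slow. A standard-library-only sketch of that step follows; the URL comes from the note above, while the assumption that the tarball unpacks to a top-level `nltk_data/` directory is mine.

```python
# Fetch the nltk_data archive mentioned in the install notes and unpack it into
# $HOME, as suggested when the automatic download fails. Assumes the tarball
# contains a top-level nltk_data/ directory (so you end up with ~/nltk_data).
import tarfile
import urllib.request
from pathlib import Path

URL = "https://paddlespeech.bj.bcebos.com/Parakeet/tools/nltk_data.tar.gz"


def fetch_nltk_data(url: str = URL, dest: Path = Path.home()) -> None:
    archive = dest / "nltk_data.tar.gz"
    urllib.request.urlretrieve(url, archive)  # download the tarball
    with tarfile.open(archive, "r:gz") as tar:
        tar.extractall(path=dest)             # unpack into $HOME
    archive.unlink()                          # remove the downloaded archive


if __name__ == "__main__":
    fetch_nltk_data()
```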
## Medium: Get the Major Functions (Support Linux, mac and windows not support training) If you want to get the major function of `paddlespeech`, you need to do following steps: diff --git a/docs/source/install_cn.md b/docs/source/install_cn.md index 9294a2bcdfa15ee29462989fa18fbe93df66b870..75f4174e06ee285f5e6ef85037402ccc5d76512a 100644 --- a/docs/source/install_cn.md +++ b/docs/source/install_cn.md @@ -60,7 +60,7 @@ pip install paddlespeech -i https://pypi.tuna.tsinghua.edu.cn/simple ``` > 如果您在使用 paddlespeech 的过程中遇到关于下载 **nltk_data** 的问题,可能是您的网络不佳,我们建议您下载我们提供的 [nltk_data](https://paddlespeech.bj.bcebos.com/Parakeet/tools/nltk_data.tar.gz) 并解压缩到您的 `${HOME}` 目录下。 -> 如果出现 paddlespeech-ctcdecoders 无法安装的问题,无须担心,这不影响使用。 +> 如果出现 paddlespeech-ctcdecoders 无法安装的问题,无须担心,这个只影响 deepspeech2 模型的推理,不影响其他模型的使用。 ## 中等: 获取主要功能(支持 Linux, Mac 和 Windows 不支持训练) 如果你想要使用 `paddlespeech` 的主要功能。你需要完成以下几个步骤 diff --git a/docs/source/released_model.md b/docs/source/released_model.md index 551a86ef0bd013120597be512f6a78242314f59f..a1e3eb8795557e40ad2a1c3e521a52c114bab253 100644 --- a/docs/source/released_model.md +++ b/docs/source/released_model.md @@ -10,7 +10,7 @@ Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER | [Ds2 Offline Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_offline_aishell_ckpt_1.0.1.model.tar.gz)| Aishell Dataset | Char-based | 1.4 GB | 2 Conv + 5 bidirectional LSTM layers| 0.0554 |-| 151 h | [Ds2 Offline Aishell ASR0](../../examples/aishell/asr0) | inference/python | [Conformer Online Wenetspeech ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1_chunk_conformer_wenetspeech_ckpt_1.0.0a.model.tar.gz) | WenetSpeech Dataset | Char-based | 457 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring| 0.11 (test\_net) 0.1879 (test\_meeting) |-| 10000 h |- | python | [Conformer Online Aishell ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_chunk_conformer_aishell_ckpt_0.2.0.model.tar.gz) | Aishell Dataset | Char-based | 189 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring| 0.0544 |-| 151 h | [Conformer Online Aishell ASR1](../../examples/aishell/asr1) | python | -[Conformer Offline Aishell ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_conformer_aishell_ckpt_0.1.2.model.tar.gz) | Aishell Dataset | Char-based | 189 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0464 |-| 151 h | [Conformer Offline Aishell ASR1](../../examples/aishell/asr1) | python | +[Conformer Offline Aishell ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_conformer_aishell_ckpt_1.0.1.model.tar.gz) | Aishell Dataset | Char-based | 189 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0460 |-| 151 h | [Conformer Offline Aishell ASR1](../../examples/aishell/asr1) | python | [Transformer Aishell ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_transformer_aishell_ckpt_0.1.1.model.tar.gz) | Aishell Dataset | Char-based | 128 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0523 || 151 h | [Transformer Aishell ASR1](../../examples/aishell/asr1) | python | [Ds2 Offline Librispeech ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr0/asr0_deepspeech2_offline_librispeech_ckpt_1.0.1.model.tar.gz)| Librispeech Dataset | Char-based | 1.3 GB | 2 Conv + 5 bidirectional LSTM layers| - |0.0467| 
960 h | [Ds2 Offline Librispeech ASR0](../../examples/librispeech/asr0) | inference/python | [Conformer Librispeech ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr1/asr1_conformer_librispeech_ckpt_0.1.1.model.tar.gz) | Librispeech Dataset | subword-based | 191 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring |-| 0.0338 | 960 h | [Conformer Librispeech ASR1](../../examples/librispeech/asr1) | python | diff --git a/examples/aishell/asr1/RESULTS.md b/examples/aishell/asr1/RESULTS.md index f16d423a2dc11f08aeac2a8061f4532d56e6ebbf..79c695b1b5df536b5e8086ece2f2bd5e46f412bb 100644 --- a/examples/aishell/asr1/RESULTS.md +++ b/examples/aishell/asr1/RESULTS.md @@ -2,13 +2,13 @@ ## Conformer paddle version: 2.2.2 -paddlespeech version: 0.2.0 +paddlespeech version: 1.0.1 | Model | Params | Config | Augmentation| Test set | Decode method | Loss | CER | | --- | --- | --- | --- | --- | --- | --- | --- | -| conformer | 47.07M | conf/conformer.yaml | spec_aug | test | attention | - | 0.0530 | -| conformer | 47.07M | conf/conformer.yaml | spec_aug | test | ctc_greedy_search | - | 0.0495 | -| conformer | 47.07M | conf/conformer.yaml | spec_aug| test | ctc_prefix_beam_search | - | 0.0494 | -| conformer | 47.07M | conf/conformer.yaml | spec_aug | test | attention_rescoring | - | 0.0464 | +| conformer | 47.07M | conf/conformer.yaml | spec_aug | test | attention | - | 0.0522 | +| conformer | 47.07M | conf/conformer.yaml | spec_aug | test | ctc_greedy_search | - | 0.0481 | +| conformer | 47.07M | conf/conformer.yaml | spec_aug| test | ctc_prefix_beam_search | - | 0.0480 | +| conformer | 47.07M | conf/conformer.yaml | spec_aug | test | attention_rescoring | - | 0.0460 | ## Conformer Streaming diff --git a/examples/aishell/asr1/conf/conformer.yaml b/examples/aishell/asr1/conf/conformer.yaml index 2419d07a4066d635fdf93f2e4258f56fcf4ea76d..0d12a9ef8f5fbcea21d07a1cd53647030b33532e 100644 --- a/examples/aishell/asr1/conf/conformer.yaml +++ b/examples/aishell/asr1/conf/conformer.yaml @@ -57,7 +57,7 @@ feat_dim: 80 stride_ms: 10.0 window_ms: 25.0 sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs -batch_size: 64 +batch_size: 32 maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced minibatches: 0 # for debug @@ -73,10 +73,10 @@ num_encs: 1 ########################################### # Training # ########################################### -n_epoch: 240 -accum_grad: 2 +n_epoch: 150 +accum_grad: 8 global_grad_clip: 5.0 -dist_sampler: True +dist_sampler: False optim: adam optim_conf: lr: 0.002 diff --git a/examples/wenetspeech/asr1/conf/conformer.yaml b/examples/wenetspeech/asr1/conf/conformer.yaml index 6c2bbca4166e06f0b64e1bb00a197fb0295252d1..d1ac20b9be02e174c26068f154066fdff18db520 100644 --- a/examples/wenetspeech/asr1/conf/conformer.yaml +++ b/examples/wenetspeech/asr1/conf/conformer.yaml @@ -1,7 +1,6 @@ ############################################ # Network Architecture # ############################################ -cmvn_file: cmvn_file_type: "json" # encoder related encoder: conformer @@ -43,40 +42,42 @@ model_conf: ########################################### # Data # ########################################### -train_manifest: data/manifest.train -dev_manifest: data/manifest.dev -test_manifest: data/manifest.test +train_manifest: data/train_l/data.list +dev_manifest: 
data/dev/data.list +test_manifest: data/test_meeting/data.list ########################################### # Dataloader # ########################################### -vocab_filepath: data/lang_char/vocab.txt +use_stream_data: True unit_type: 'char' +vocab_filepath: data/lang_char/vocab.txt preprocess_config: conf/preprocess.yaml +cmvn_file: data/mean_std.json spm_model_prefix: '' feat_dim: 80 stride_ms: 10.0 window_ms: 25.0 +dither: 0.1 sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs -batch_size: 64 -maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced -maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced -minibatches: 0 # for debug -batch_count: auto -batch_bins: 0 -batch_frames_in: 0 -batch_frames_out: 0 -batch_frames_inout: 0 -num_workers: 0 -subsampling_factor: 1 +batch_size: 32 +minlen_in: 10 +maxlen_in: 1200 # if input length(number of frames) > maxlen-in, data is automatically removed +minlen_out: 0 +maxlen_out: 150 # if output length(number of tokens) > maxlen-out, data is automatically removed +resample_rate: 16000 +shuffle_size: 1500 # read number of 'shuffle_size' data as a chunk, shuffle the data in the chunk +sort_size: 1000 # read number of 'sort_size' data as a chunk, sort the data in the chunk +num_workers: 8 +prefetch_factor: 10 +dist_sampler: True num_encs: 1 - ########################################### # Training # ########################################### -n_epoch: 240 -accum_grad: 16 +n_epoch: 32 +accum_grad: 32 global_grad_clip: 5.0 log_interval: 100 checkpoint: diff --git a/examples/wenetspeech/asr1/local/data.sh b/examples/wenetspeech/asr1/local/data.sh index d216dd84abbf1fb0604ba4095d1833315e70a016..62579ba3230e2640eb83695b05f8a32b2718fd3c 100755 --- a/examples/wenetspeech/asr1/local/data.sh +++ b/examples/wenetspeech/asr1/local/data.sh @@ -2,6 +2,8 @@ # Copyright 2021 Mobvoi Inc(Author: Di Wu, Binbin Zhang) # NPU, ASLP Group (Author: Qijie Shao) +# +# Modified from wenet(https://github.com/wenet-e2e/wenet) stage=-1 stop_stage=100 @@ -30,7 +32,7 @@ mkdir -p data TARGET_DIR=${MAIN_ROOT}/dataset mkdir -p ${TARGET_DIR} -if [ ${stage} -le -2 ] && [ ${stop_stage} -ge -2 ]; then +if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then # download data echo "Please follow https://github.com/wenet-e2e/WenetSpeech to download the data." exit 0; @@ -44,86 +46,57 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then data || exit 1; fi -if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then - # generate manifests - python3 ${TARGET_DIR}/aishell/aishell.py \ - --manifest_prefix="data/manifest" \ - --target_dir="${TARGET_DIR}/aishell" - - if [ $? -ne 0 ]; then - echo "Prepare Aishell failed. Terminated." 
- exit 1 - fi - - for dataset in train dev test; do - mv data/manifest.${dataset} data/manifest.${dataset}.raw - done -fi - -if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - # compute mean and stddev for normalizer - if $cmvn; then - full_size=`cat data/${train_set}/wav.scp | wc -l` - sampling_size=$((full_size / cmvn_sampling_divisor)) - shuf -n $sampling_size data/$train_set/wav.scp \ - > data/$train_set/wav.scp.sampled - num_workers=$(nproc) - - python3 ${MAIN_ROOT}/utils/compute_mean_std.py \ - --manifest_path="data/manifest.train.raw" \ - --spectrum_type="fbank" \ - --feat_dim=80 \ - --delta_delta=false \ - --stride_ms=10 \ - --window_ms=25 \ - --sample_rate=16000 \ - --use_dB_normalization=False \ - --num_samples=-1 \ - --num_workers=${num_workers} \ - --output_path="data/mean_std.json" - - if [ $? -ne 0 ]; then - echo "Compute mean and stddev failed. Terminated." - exit 1 - fi - fi -fi - -dict=data/dict/lang_char.txt +dict=data/lang_char/vocab.txt if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then - # download data, generate manifests - # build vocabulary - python3 ${MAIN_ROOT}/utils/build_vocab.py \ - --unit_type="char" \ - --count_threshold=0 \ - --vocab_path="data/lang_char/vocab.txt" \ - --manifest_paths "data/manifest.train.raw" - - if [ $? -ne 0 ]; then - echo "Build vocabulary failed. Terminated." - exit 1 - fi + echo "Make a dictionary" + echo "dictionary: ${dict}" + mkdir -p $(dirname $dict) + echo "" > ${dict} # 0 will be used for "blank" in CTC + echo "" >> ${dict} # must be 1 + echo "▁" >> ${dict} # ▁ is for space + utils/text2token.py -s 1 -n 1 --space "▁" data/${train_set}/text \ + | cut -f 2- -d" " | tr " " "\n" \ + | sort | uniq | grep -a -v -e '^\s*$' \ + | grep -v "▁" \ + | awk '{print $0}' >> ${dict} \ + || exit 1; + echo "" >> $dict fi if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then - # format manifest with tokenids, vocab size - for dataset in train dev test; do - { - python3 ${MAIN_ROOT}/utils/format_data.py \ - --cmvn_path "data/mean_std.json" \ - --unit_type "char" \ - --vocab_path="data/vocab.txt" \ - --manifest_path="data/manifest.${dataset}.raw" \ - --output_path="data/manifest.${dataset}" + echo "Compute cmvn" + # Here we use all the training data, you can sample some some data to save time + # BUG!!! We should use the segmented data for CMVN + if $cmvn; then + full_size=`cat data/${train_set}/wav.scp | wc -l` + sampling_size=$((full_size / cmvn_sampling_divisor)) + shuf -n $sampling_size data/$train_set/wav.scp \ + > data/$train_set/wav.scp.sampled + python3 utils/compute_cmvn_stats.py \ + --num_workers 16 \ + --train_config $train_config \ + --in_scp data/$train_set/wav.scp.sampled \ + --out_cmvn data/$train_set/mean_std.json \ + || exit 1; + fi +fi - if [ $? -ne 0 ]; then - echo "Formt mnaifest failed. Terminated." - exit 1 - fi - } & - done - wait +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + echo "Making shards, please wait..." + RED='\033[0;31m' + NOCOLOR='\033[0m' + echo -e "It requires ${RED}1.2T ${NOCOLOR}space for $shards_dir, please make sure you have enough space" + echo -e "It takes about ${RED}12 ${NOCOLOR}hours with 32 threads" + for x in $dev_set $test_sets ${train_set}; do + dst=$shards_dir/$x + mkdir -p $dst + utils/make_filted_shard_list.py --num_node 1 --num_gpus_per_node 8 --num_utts_per_shard 1000 \ + --do_filter --resample 16000 \ + --num_threads 32 --segments data/$x/segments \ + data/$x/wav.scp data/$x/text \ + $(realpath $dst) data/$x/data.list + done fi -echo "Aishell data preparation done." 
+echo "Wenetspeech data preparation done." exit 0 diff --git a/examples/wenetspeech/asr1/local/train.sh b/examples/wenetspeech/asr1/local/train.sh new file mode 100755 index 0000000000000000000000000000000000000000..01af00b61efb8a55d1b8da99bfd2a754e732cb1a --- /dev/null +++ b/examples/wenetspeech/asr1/local/train.sh @@ -0,0 +1,68 @@ +#!/bin/bash + +profiler_options= +benchmark_batch_size=0 +benchmark_max_step=0 + +# seed may break model convergence +seed=0 + +source ${MAIN_ROOT}/utils/parse_options.sh || exit 1; + +ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') +echo "using $ngpu gpus..." + +if [ ${seed} != 0 ]; then + export FLAGS_cudnn_deterministic=True + echo "using seed $seed & FLAGS_cudnn_deterministic=True ..." +fi + +if [ $# -lt 2 ] && [ $# -gt 3 ];then + echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name ips(optional)" + exit -1 +fi + +config_path=$1 +ckpt_name=$2 +ips=$3 + +if [ ! $ips ];then + ips_config= +else + ips_config="--ips="${ips} +fi +echo ${ips_config} + +mkdir -p exp + +if [ ${ngpu} == 0 ]; then +python3 -u ${BIN_DIR}/train.py \ +--ngpu ${ngpu} \ +--seed ${seed} \ +--config ${config_path} \ +--output exp/${ckpt_name} \ +--profiler-options "${profiler_options}" \ +--benchmark-batch-size ${benchmark_batch_size} \ +--benchmark-max-step ${benchmark_max_step} +else +NCCL_SOCKET_IFNAME=eth0 python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${ips_config} ${BIN_DIR}/train.py \ +--ngpu ${ngpu} \ +--seed ${seed} \ +--config ${config_path} \ +--output exp/${ckpt_name} \ +--profiler-options "${profiler_options}" \ +--benchmark-batch-size ${benchmark_batch_size} \ +--benchmark-max-step ${benchmark_max_step} +fi + + +if [ ${seed} != 0 ]; then + unset FLAGS_cudnn_deterministic +fi + +if [ $? -ne 0 ]; then + echo "Failed in training!" + exit 1 +fi + +exit 0 diff --git a/examples/wenetspeech/asr1/local/wenetspeech_data_prep.sh b/examples/wenetspeech/asr1/local/wenetspeech_data_prep.sh index 858530534efdaf28818ea6a6f1cc742667d6e71b..baa2b32df78773fe84065f2168a007f72dfee364 100755 --- a/examples/wenetspeech/asr1/local/wenetspeech_data_prep.sh +++ b/examples/wenetspeech/asr1/local/wenetspeech_data_prep.sh @@ -24,7 +24,7 @@ stage=1 prefix= train_subset=L -. ./tools/parse_options.sh || exit 1; +. 
./utils/parse_options.sh || exit 1; filter_by_id () { idlist=$1 @@ -132,4 +132,4 @@ if [ $stage -le 2 ]; then done fi -echo "$0: Done" \ No newline at end of file +echo "$0: Done" diff --git a/examples/wenetspeech/asr1/run.sh b/examples/wenetspeech/asr1/run.sh index 9995bc63eb8f9c55252abdc2ac400ea01bf389d9..ddce0a9c8bcb280d5bcf1726060ede62623cdbdf 100644 --- a/examples/wenetspeech/asr1/run.sh +++ b/examples/wenetspeech/asr1/run.sh @@ -7,6 +7,7 @@ gpus=0,1,2,3,4,5,6,7 stage=0 stop_stage=100 conf_path=conf/conformer.yaml +ips= #xxx.xxx.xxx.xxx,xxx.xxx.xxx.xxx decode_conf_path=conf/tuning/decode.yaml average_checkpoint=true avg_num=10 @@ -26,7 +27,7 @@ fi if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then # train model, all `ckpt` under `exp` dir - CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} + CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} ${ips} fi if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then diff --git a/paddlespeech/audio/streamdata/__init__.py b/paddlespeech/audio/streamdata/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..753fcc11bc76ad7071fa22fdb815cd3db3d4e954 --- /dev/null +++ b/paddlespeech/audio/streamdata/__init__.py @@ -0,0 +1,70 @@ +# Copyright (c) 2017-2019 NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# See the LICENSE file for licensing terms (BSD-style). +# Modified from https://github.com/webdataset/webdataset +# +# flake8: noqa + +from .cache import ( + cached_tarfile_samples, + cached_tarfile_to_samples, + lru_cleanup, + pipe_cleaner, +) +from .compat import WebDataset, WebLoader, FluidWrapper +from .extradatasets import MockDataset, with_epoch, with_length +from .filters import ( + associate, + batched, + decode, + detshuffle, + extract_keys, + getfirst, + info, + map, + map_dict, + map_tuple, + pipelinefilter, + rename, + rename_keys, + audio_resample, + select, + shuffle, + slice, + to_tuple, + transform_with, + unbatched, + xdecode, + audio_data_filter, + audio_tokenize, + audio_resample, + audio_compute_fbank, + audio_spec_aug, + sort, + audio_padding, + audio_cmvn, + placeholder, +) +from .handlers import ( + ignore_and_continue, + ignore_and_stop, + reraise_exception, + warn_and_continue, + warn_and_stop, +) +from .pipeline import DataPipeline +from .shardlists import ( + MultiShardSample, + ResampledShards, + SimpleShardList, + non_empty, + resampled, + shardspec, + single_node_only, + split_by_node, + split_by_worker, +) +from .tariterators import tarfile_samples, tarfile_to_samples +from .utils import PipelineStage, repeatedly +from .writer import ShardWriter, TarWriter, numpy_dumps +from .mix import RandomMix, RoundRobin diff --git a/paddlespeech/audio/streamdata/autodecode.py b/paddlespeech/audio/streamdata/autodecode.py new file mode 100644 index 0000000000000000000000000000000000000000..ca0e2ea2f9cb904edcfd3033dc26b7fda6cd9bc6 --- /dev/null +++ b/paddlespeech/audio/streamdata/autodecode.py @@ -0,0 +1,445 @@ +# +# Copyright (c) 2017-2021 NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# This file is part of the WebDataset library. +# See the LICENSE file for licensing terms (BSD-style). 
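# A minimal, hedged sketch (paths, symbol_table and the exact wiring are placeholders; the
# dataloader used by PaddleSpeech itself may differ): the filters exported from
# streamdata/__init__.py above can be chained over an iterable `samples` of dicts of the form
# {"fname": str, "wav": paddle.Tensor [1, T], "txt": str, "sample_rate": int}, mirroring the
# new streaming options in examples/wenetspeech/asr1/conf/conformer.yaml (shuffle_size,
# sort_size, batch_size, cmvn_file).
from paddlespeech.audio import streamdata

def build_pipeline(samples, symbol_table):
    stages = [
        streamdata.audio_tokenize(symbol_table),                        # txt -> tokens / label ids
        streamdata.audio_data_filter(max_length=1200, min_length=10),   # drop too long/short utts
        streamdata.audio_resample(resample_rate=16000),
        streamdata.audio_compute_fbank(num_mel_bins=80, frame_shift=10, dither=0.1),
        streamdata.shuffle(1500),                                        # buffered shuffle
        streamdata.sort(sort_size=1000),                                 # group similar lengths
        streamdata.batched(batch_size=32),
        streamdata.audio_padding(),                                      # -> (keys, feats, feat_lens, labels, label_lens)
        streamdata.audio_cmvn("data/mean_std.json"),
    ]
    for stage in stages:
        samples = stage(samples)   # each curried filter maps an iterator to an iterator
    return samples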
+# Modified from https://github.com/webdataset/webdataset +# + +"""Automatically decode webdataset samples.""" + +import io, json, os, pickle, re, tempfile +from functools import partial + +import numpy as np + +"""Extensions passed on to the image decoder.""" +image_extensions = "jpg jpeg png ppm pgm pbm pnm".split() + + +################################################################ +# handle basic datatypes +################################################################ + + +def paddle_loads(data): + """Load data using paddle.loads, importing paddle only if needed. + + :param data: data to be decoded + """ + import io + + import paddle + + stream = io.BytesIO(data) + return paddle.load(stream) + + +def tenbin_loads(data): + from . import tenbin + + return tenbin.decode_buffer(data) + + +def msgpack_loads(data): + import msgpack + + return msgpack.unpackb(data) + + +def npy_loads(data): + import numpy.lib.format + + stream = io.BytesIO(data) + return numpy.lib.format.read_array(stream) + + +def cbor_loads(data): + import cbor + + return cbor.loads(data) + + +decoders = { + "txt": lambda data: data.decode("utf-8"), + "text": lambda data: data.decode("utf-8"), + "transcript": lambda data: data.decode("utf-8"), + "cls": lambda data: int(data), + "cls2": lambda data: int(data), + "index": lambda data: int(data), + "inx": lambda data: int(data), + "id": lambda data: int(data), + "json": lambda data: json.loads(data), + "jsn": lambda data: json.loads(data), + "pyd": lambda data: pickle.loads(data), + "pickle": lambda data: pickle.loads(data), + "pdparams": lambda data: paddle_loads(data), + "ten": tenbin_loads, + "tb": tenbin_loads, + "mp": msgpack_loads, + "msg": msgpack_loads, + "npy": npy_loads, + "npz": lambda data: np.load(io.BytesIO(data)), + "cbor": cbor_loads, +} + + +def basichandlers(key, data): + """Handle basic file decoding. + + This function is usually part of the post= decoders. + This handles the following forms of decoding: + + - txt -> unicode string + - cls cls2 class count index inx id -> int + - json jsn -> JSON decoding + - pyd pickle -> pickle decoding + - pdparams -> paddle.loads + - ten tenbin -> fast tensor loading + - mp messagepack msg -> messagepack decoding + - npy -> Python NPY decoding + + :param key: file name extension + :param data: binary data to be decoded + """ + extension = re.sub(r".*[.]", "", key) + + if extension in decoders: + return decoders[extension](data) + + return None + + +################################################################ +# Generic extension handler. +################################################################ + + +def call_extension_handler(key, data, f, extensions): + """Call the function f with the given data if the key matches the extensions. + + :param key: actual key found in the sample + :param data: binary data + :param f: decoder function + :param extensions: list of matching extensions + """ + extension = key.lower().split(".") + for target in extensions: + target = target.split(".") + if len(target) > len(extension): + continue + if extension[-len(target) :] == target: + return f(data) + return None + + +def handle_extension(extensions, f): + """Return a decoder function for the list of extensions. + + Extensions can be a space separated list of extensions. + Extensions can contain dots, in which case the corresponding number + of extension components must be present in the key given to f. + Comparisons are case insensitive. 
+ + Examples: + handle_extension("jpg jpeg", my_decode_jpg) # invoked for any file.jpg + handle_extension("seg.jpg", special_case_jpg) # invoked only for file.seg.jpg + """ + extensions = extensions.lower().split() + return partial(call_extension_handler, f=f, extensions=extensions) + + +################################################################ +# handle images +################################################################ + +imagespecs = { + "l8": ("numpy", "uint8", "l"), + "rgb8": ("numpy", "uint8", "rgb"), + "rgba8": ("numpy", "uint8", "rgba"), + "l": ("numpy", "float", "l"), + "rgb": ("numpy", "float", "rgb"), + "rgba": ("numpy", "float", "rgba"), + "paddlel8": ("paddle", "uint8", "l"), + "paddlergb8": ("paddle", "uint8", "rgb"), + "paddlergba8": ("paddle", "uint8", "rgba"), + "paddlel": ("paddle", "float", "l"), + "paddlergb": ("paddle", "float", "rgb"), + "paddle": ("paddle", "float", "rgb"), + "paddlergba": ("paddle", "float", "rgba"), + "pill": ("pil", None, "l"), + "pil": ("pil", None, "rgb"), + "pilrgb": ("pil", None, "rgb"), + "pilrgba": ("pil", None, "rgba"), +} + + +class ImageHandler: + """Decode image data using the given `imagespec`. + + The `imagespec` specifies whether the image is decoded + to numpy/paddle/pi, decoded to uint8/float, and decoded + to l/rgb/rgba: + + - l8: numpy uint8 l + - rgb8: numpy uint8 rgb + - rgba8: numpy uint8 rgba + - l: numpy float l + - rgb: numpy float rgb + - rgba: numpy float rgba + - paddlel8: paddle uint8 l + - paddlergb8: paddle uint8 rgb + - paddlergba8: paddle uint8 rgba + - paddlel: paddle float l + - paddlergb: paddle float rgb + - paddle: paddle float rgb + - paddlergba: paddle float rgba + - pill: pil None l + - pil: pil None rgb + - pilrgb: pil None rgb + - pilrgba: pil None rgba + + """ + + def __init__(self, imagespec, extensions=image_extensions): + """Create an image handler. + + :param imagespec: short string indicating the type of decoding + :param extensions: list of extensions the image handler is invoked for + """ + if imagespec not in list(imagespecs.keys()): + raise ValueError("Unknown imagespec: %s" % imagespec) + self.imagespec = imagespec.lower() + self.extensions = extensions + + def __call__(self, key, data): + """Perform image decoding. + + :param key: file name extension + :param data: binary data + """ + import PIL.Image + + extension = re.sub(r".*[.]", "", key) + if extension.lower() not in self.extensions: + return None + imagespec = self.imagespec + atype, etype, mode = imagespecs[imagespec] + with io.BytesIO(data) as stream: + img = PIL.Image.open(stream) + img.load() + img = img.convert(mode.upper()) + if atype == "pil": + return img + elif atype == "numpy": + result = np.asarray(img) + if result.dtype != np.uint8: + raise ValueError("ImageHandler: numpy image must be uint8") + if etype == "uint8": + return result + else: + return result.astype("f") / 255.0 + elif atype == "paddle": + import paddle + + result = np.asarray(img) + if result.dtype != np.uint8: + raise ValueError("ImageHandler: paddle image must be uint8") + if etype == "uint8": + result = np.array(result.transpose(2, 0, 1)) + return paddle.tensor(result) + else: + result = np.array(result.transpose(2, 0, 1)) + return paddle.tensor(result) / 255.0 + return None + + +def imagehandler(imagespec, extensions=image_extensions): + """Create an image handler. + + This is just a lower case alias for ImageHander. 
+ + :param imagespec: textual image spec + :param extensions: list of extensions the handler should be applied for + """ + return ImageHandler(imagespec, extensions) + + +################################################################ +# torch video +################################################################ + +''' +def torch_video(key, data): + """Decode video using the torchvideo library. + + :param key: file name extension + :param data: data to be decoded + """ + extension = re.sub(r".*[.]", "", key) + if extension not in "mp4 ogv mjpeg avi mov h264 mpg webm wmv".split(): + return None + + import torchvision.io + + with tempfile.TemporaryDirectory() as dirname: + fname = os.path.join(dirname, f"file.{extension}") + with open(fname, "wb") as stream: + stream.write(data) + return torchvision.io.read_video(fname, pts_unit="sec") +''' + + +################################################################ +# paddlespeech.audio +################################################################ + + +def paddle_audio(key, data): + """Decode audio using the paddlespeech.audio library. + + :param key: file name extension + :param data: data to be decoded + """ + extension = re.sub(r".*[.]", "", key) + if extension not in ["flac", "mp3", "sox", "wav", "m4a", "ogg", "wma"]: + return None + + import paddlespeech.audio + + with tempfile.TemporaryDirectory() as dirname: + fname = os.path.join(dirname, f"file.{extension}") + with open(fname, "wb") as stream: + stream.write(data) + return paddlespeech.audio.load(fname) + + +################################################################ +# special class for continuing decoding +################################################################ + + +class Continue: + """Special class for continuing decoding. + + This is mostly used for decompression, as in: + + def decompressor(key, data): + if key.endswith(".gz"): + return Continue(key[:-3], decompress(data)) + return None + """ + + def __init__(self, key, data): + """__init__. + + :param key: + :param data: + """ + self.key, self.data = key, data + + +def gzfilter(key, data): + """Decode .gz files. + + This decodes compressed files and the continues decoding. + + :param key: file name extension + :param data: binary data + """ + import gzip + + if not key.endswith(".gz"): + return None + decompressed = gzip.open(io.BytesIO(data)).read() + return Continue(key[:-3], decompressed) + + +################################################################ +# decode entire training amples +################################################################ + + +default_pre_handlers = [gzfilter] +default_post_handlers = [basichandlers] + + +class Decoder: + """Decode samples using a list of handlers. + + For each key/data item, this iterates through the list of + handlers until some handler returns something other than None. + """ + + def __init__(self, handlers, pre=None, post=None, only=None, partial=False): + """Create a Decoder. 
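# Hedged illustration (made-up sample dict) of how the handlers above combine in the Decoder
# defined here: gzfilter runs first and re-dispatches *.gz entries via Continue, paddle_audio
# handles waveform extensions, and basichandlers finally decodes txt/json/npy style fields.
dec = Decoder([paddle_audio], pre=[gzfilter], post=[basichandlers])
sample = dec({"__key__": "utt_0001", "txt": b"hello world", "json": b'{"dur": 1.2}'})
# -> {"__key__": "utt_0001", "txt": "hello world", "json": {"dur": 1.2}}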
+ + :param handlers: main list of handlers + :param pre: handlers called before the main list (.gz handler by default) + :param post: handlers called after the main list (default handlers by default) + :param only: a list of extensions; when give, only ignores files with those extensions + :param partial: allow partial decoding (i.e., don't decode fields that aren't of type bytes) + """ + if isinstance(only, str): + only = only.split() + self.only = only if only is None else set(only) + if pre is None: + pre = default_pre_handlers + if post is None: + post = default_post_handlers + assert all(callable(h) for h in handlers), f"one of {handlers} not callable" + assert all(callable(h) for h in pre), f"one of {pre} not callable" + assert all(callable(h) for h in post), f"one of {post} not callable" + self.handlers = pre + handlers + post + self.partial = partial + + def decode1(self, key, data): + """Decode a single field of a sample. + + :param key: file name extension + :param data: binary data + """ + key = "." + key + for f in self.handlers: + result = f(key, data) + if isinstance(result, Continue): + key, data = result.key, result.data + continue + if result is not None: + return result + return data + + def decode(self, sample): + """Decode an entire sample. + + :param sample: the sample, a dictionary of key value pairs + """ + result = {} + assert isinstance(sample, dict), sample + for k, v in list(sample.items()): + if k[0] == "_": + if isinstance(v, bytes): + v = v.decode("utf-8") + result[k] = v + continue + if self.only is not None and k not in self.only: + result[k] = v + continue + assert v is not None + if self.partial: + if isinstance(v, bytes): + result[k] = self.decode1(k, v) + else: + result[k] = v + else: + assert isinstance(v, bytes) + result[k] = self.decode1(k, v) + return result + + def __call__(self, sample): + """Decode an entire sample. + + :param sample: the sample + """ + assert isinstance(sample, dict), (len(sample), sample) + return self.decode(sample) diff --git a/paddlespeech/audio/streamdata/cache.py b/paddlespeech/audio/streamdata/cache.py new file mode 100644 index 0000000000000000000000000000000000000000..e7bbffa1bad19b0d8f40424981c206b94d549f11 --- /dev/null +++ b/paddlespeech/audio/streamdata/cache.py @@ -0,0 +1,190 @@ +# Copyright (c) 2017-2019 NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# See the LICENSE file for licensing terms (BSD-style). +# Modified from https://github.com/webdataset/webdataset +import itertools, os, random, re, sys +from urllib.parse import urlparse + +from . import filters +from . 
import gopen +from .handlers import reraise_exception +from .tariterators import tar_file_and_group_expander + +default_cache_dir = os.environ.get("WDS_CACHE", "./_cache") +default_cache_size = float(os.environ.get("WDS_CACHE_SIZE", "1e18")) + + +def lru_cleanup(cache_dir, cache_size, keyfn=os.path.getctime, verbose=False): + """Performs cleanup of the file cache in cache_dir using an LRU strategy, + keeping the total size of all remaining files below cache_size.""" + if not os.path.exists(cache_dir): + return + total_size = 0 + for dirpath, dirnames, filenames in os.walk(cache_dir): + for filename in filenames: + total_size += os.path.getsize(os.path.join(dirpath, filename)) + if total_size <= cache_size: + return + # sort files by last access time + files = [] + for dirpath, dirnames, filenames in os.walk(cache_dir): + for filename in filenames: + files.append(os.path.join(dirpath, filename)) + files.sort(key=keyfn, reverse=True) + # delete files until we're under the cache size + while len(files) > 0 and total_size > cache_size: + fname = files.pop() + total_size -= os.path.getsize(fname) + if verbose: + print("# deleting %s" % fname, file=sys.stderr) + os.remove(fname) + + +def download(url, dest, chunk_size=1024 ** 2, verbose=False): + """Download a file from `url` to `dest`.""" + temp = dest + f".temp{os.getpid()}" + with gopen.gopen(url) as stream: + with open(temp, "wb") as f: + while True: + data = stream.read(chunk_size) + if not data: + break + f.write(data) + os.rename(temp, dest) + + +def pipe_cleaner(spec): + """Guess the actual URL from a "pipe:" specification.""" + if spec.startswith("pipe:"): + spec = spec[5:] + words = spec.split(" ") + for word in words: + if re.match(r"^(https?|gs|ais|s3)", word): + return word + return spec + + +def get_file_cached( + spec, + cache_size=-1, + cache_dir=None, + url_to_name=pipe_cleaner, + verbose=False, +): + if cache_size == -1: + cache_size = default_cache_size + if cache_dir is None: + cache_dir = default_cache_dir + url = url_to_name(spec) + parsed = urlparse(url) + dirname, filename = os.path.split(parsed.path) + dirname = dirname.lstrip("/") + dirname = re.sub(r"[:/|;]", "_", dirname) + destdir = os.path.join(cache_dir, dirname) + os.makedirs(destdir, exist_ok=True) + dest = os.path.join(cache_dir, dirname, filename) + if not os.path.exists(dest): + if verbose: + print("# downloading %s to %s" % (url, dest), file=sys.stderr) + lru_cleanup(cache_dir, cache_size, verbose=verbose) + download(spec, dest, verbose=verbose) + return dest + + +def get_filetype(fname): + with os.popen("file '%s'" % fname) as f: + ftype = f.read() + return ftype + + +def check_tar_format(fname): + """Check whether a file is a tar archive.""" + ftype = get_filetype(fname) + return "tar archive" in ftype or "gzip compressed" in ftype + + +verbose_cache = int(os.environ.get("WDS_VERBOSE_CACHE", "0")) + + +def cached_url_opener( + data, + handler=reraise_exception, + cache_size=-1, + cache_dir=None, + url_to_name=pipe_cleaner, + validator=check_tar_format, + verbose=False, + always=False, +): + """Given a stream of url names (packaged in `dict(url=url)`), yield opened streams.""" + verbose = verbose or verbose_cache + for sample in data: + assert isinstance(sample, dict), sample + assert "url" in sample + url = sample["url"] + attempts = 5 + try: + if not always and os.path.exists(url): + dest = url + else: + dest = get_file_cached( + url, + cache_size=cache_size, + cache_dir=cache_dir, + url_to_name=url_to_name, + verbose=verbose, + ) + if verbose: + print("# 
opening %s" % dest, file=sys.stderr) + assert os.path.exists(dest) + if not validator(dest): + ftype = get_filetype(dest) + with open(dest, "rb") as f: + data = f.read(200) + os.remove(dest) + raise ValueError( + "%s (%s) is not a tar archive, but a %s, contains %s" + % (dest, url, ftype, repr(data)) + ) + try: + stream = open(dest, "rb") + sample.update(stream=stream) + yield sample + except FileNotFoundError as exn: + # dealing with race conditions in lru_cleanup + attempts -= 1 + if attempts > 0: + time.sleep(random.random() * 10) + continue + raise exn + except Exception as exn: + exn.args = exn.args + (url,) + if handler(exn): + continue + else: + break + + +def cached_tarfile_samples( + src, + handler=reraise_exception, + cache_size=-1, + cache_dir=None, + verbose=False, + url_to_name=pipe_cleaner, + always=False, +): + streams = cached_url_opener( + src, + handler=handler, + cache_size=cache_size, + cache_dir=cache_dir, + verbose=verbose, + url_to_name=url_to_name, + always=always, + ) + samples = tar_file_and_group_expander(streams, handler=handler) + return samples + + +cached_tarfile_to_samples = filters.pipelinefilter(cached_tarfile_samples) diff --git a/paddlespeech/audio/streamdata/compat.py b/paddlespeech/audio/streamdata/compat.py new file mode 100644 index 0000000000000000000000000000000000000000..deda53384e35ddd9ae919c6852d889e6d7479a0a --- /dev/null +++ b/paddlespeech/audio/streamdata/compat.py @@ -0,0 +1,170 @@ +# Copyright (c) 2017-2019 NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# See the LICENSE file for licensing terms (BSD-style). +# Modified from https://github.com/webdataset/webdataset +from dataclasses import dataclass +from itertools import islice +from typing import List + +import braceexpand, yaml + +from . import autodecode +from . 
import cache, filters, shardlists, tariterators +from .filters import reraise_exception +from .pipeline import DataPipeline +from .paddle_utils import DataLoader, IterableDataset + + +class FluidInterface: + def batched(self, batchsize): + return self.compose(filters.batched(batchsize)) + + def dynamic_batched(self, max_frames_in_batch): + return self.compose(filter.dynamic_batched(max_frames_in_batch)) + + def unbatched(self): + return self.compose(filters.unbatched()) + + def listed(self, batchsize, partial=True): + return self.compose(filters.batched(), batchsize=batchsize, collation_fn=None) + + def unlisted(self): + return self.compose(filters.unlisted()) + + def log_keys(self, logfile=None): + return self.compose(filters.log_keys(logfile)) + + def shuffle(self, size, **kw): + if size < 1: + return self + else: + return self.compose(filters.shuffle(size, **kw)) + + def map(self, f, handler=reraise_exception): + return self.compose(filters.map(f, handler=handler)) + + def decode(self, *args, pre=None, post=None, only=None, partial=False, handler=reraise_exception): + handlers = [autodecode.ImageHandler(x) if isinstance(x, str) else x for x in args] + decoder = autodecode.Decoder(handlers, pre=pre, post=post, only=only, partial=partial) + return self.map(decoder, handler=handler) + + def map_dict(self, handler=reraise_exception, **kw): + return self.compose(filters.map_dict(handler=handler, **kw)) + + def select(self, predicate, **kw): + return self.compose(filters.select(predicate, **kw)) + + def to_tuple(self, *args, handler=reraise_exception): + return self.compose(filters.to_tuple(*args, handler=handler)) + + def map_tuple(self, *args, handler=reraise_exception): + return self.compose(filters.map_tuple(*args, handler=handler)) + + def slice(self, *args): + return self.compose(filters.slice(*args)) + + def rename(self, **kw): + return self.compose(filters.rename(**kw)) + + def rsample(self, p=0.5): + return self.compose(filters.rsample(p)) + + def rename_keys(self, *args, **kw): + return self.compose(filters.rename_keys(*args, **kw)) + + def extract_keys(self, *args, **kw): + return self.compose(filters.extract_keys(*args, **kw)) + + def xdecode(self, *args, **kw): + return self.compose(filters.xdecode(*args, **kw)) + + def audio_data_filter(self, *args, **kw): + return self.compose(filters.audio_data_filter(*args, **kw)) + + def audio_tokenize(self, *args, **kw): + return self.compose(filters.audio_tokenize(*args, **kw)) + + def resample(self, *args, **kw): + return self.compose(filters.resample(*args, **kw)) + + def audio_compute_fbank(self, *args, **kw): + return self.compose(filters.audio_compute_fbank(*args, **kw)) + + def audio_spec_aug(self, *args, **kw): + return self.compose(filters.audio_spec_aug(*args, **kw)) + + def sort(self, size=500): + return self.compose(filters.sort(size)) + + def audio_padding(self): + return self.compose(filters.audio_padding()) + + def audio_cmvn(self, cmvn_file): + return self.compose(filters.audio_cmvn(cmvn_file)) + +class WebDataset(DataPipeline, FluidInterface): + """Small fluid-interface wrapper for DataPipeline.""" + + def __init__( + self, + urls, + handler=reraise_exception, + resampled=False, + repeat=False, + shardshuffle=None, + cache_size=0, + cache_dir=None, + detshuffle=False, + nodesplitter=shardlists.single_node_only, + verbose=False, + ): + super().__init__() + if isinstance(urls, IterableDataset): + assert not resampled + self.append(urls) + elif isinstance(urls, str) and (urls.endswith(".yaml") or urls.endswith(".yml")): + with 
(open(urls)) as stream: + spec = yaml.safe_load(stream) + assert "datasets" in spec + self.append(shardlists.MultiShardSample(spec)) + elif isinstance(urls, dict): + assert "datasets" in urls + self.append(shardlists.MultiShardSample(urls)) + elif resampled: + self.append(shardlists.ResampledShards(urls)) + else: + self.append(shardlists.SimpleShardList(urls)) + self.append(nodesplitter) + self.append(shardlists.split_by_worker) + if shardshuffle is True: + shardshuffle = 100 + if shardshuffle is not None: + if detshuffle: + self.append(filters.detshuffle(shardshuffle)) + else: + self.append(filters.shuffle(shardshuffle)) + if cache_size == 0: + self.append(tariterators.tarfile_to_samples(handler=handler)) + else: + assert cache_size == -1 or cache_size > 0 + self.append( + cache.cached_tarfile_to_samples( + handler=handler, + verbose=verbose, + cache_size=cache_size, + cache_dir=cache_dir, + ) + ) + + +class FluidWrapper(DataPipeline, FluidInterface): + """Small fluid-interface wrapper for DataPipeline.""" + + def __init__(self, initial): + super().__init__() + self.append(initial) + + +class WebLoader(DataPipeline, FluidInterface): + def __init__(self, *args, **kw): + super().__init__(DataLoader(*args, **kw)) diff --git a/paddlespeech/audio/streamdata/extradatasets.py b/paddlespeech/audio/streamdata/extradatasets.py new file mode 100644 index 0000000000000000000000000000000000000000..e6d6177243551b15272a614d300276a62f559b5c --- /dev/null +++ b/paddlespeech/audio/streamdata/extradatasets.py @@ -0,0 +1,141 @@ +# +# Copyright (c) 2017-2021 NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# This file is part of the WebDataset library. +# See the LICENSE file for licensing terms (BSD-style). +# Modified from https://github.com/webdataset/webdataset +# + + +"""Train PyTorch models directly from POSIX tar archive. + +Code works locally or over HTTP connections. +""" + +import itertools as itt +import os +import random +import sys + +import braceexpand + +from . import utils +from .paddle_utils import IterableDataset +from .utils import PipelineStage + + +class MockDataset(IterableDataset): + """MockDataset. + + A mock dataset for performance testing and unit testing. + """ + + def __init__(self, sample, length): + """Create a mock dataset instance. + + :param sample: the sample to be returned repeatedly + :param length: the length of the mock dataset + """ + self.sample = sample + self.length = length + + def __iter__(self): + """Return an iterator over this mock dataset.""" + for i in range(self.length): + yield self.sample + + +class repeatedly(IterableDataset, PipelineStage): + """Repeatedly yield samples from a dataset.""" + + def __init__(self, source, nepochs=None, nbatches=None, length=None): + """Create an instance of Repeatedly. + + :param nepochs: repeat for a maximum of nepochs + :param nbatches: repeat for a maximum of nbatches + """ + self.source = source + self.length = length + self.nbatches = nbatches + + def invoke(self, source): + """Return an iterator that iterates repeatedly over a source.""" + return utils.repeatedly( + source, + nepochs=self.nepochs, + nbatches=self.nbatches, + ) + + +class with_epoch(IterableDataset): + """Change the actual and nominal length of an IterableDataset. + + This will continuously iterate through the original dataset, but + impose new epoch boundaries at the given length/nominal. + This exists mainly as a workaround for the odd logic in DataLoader. 
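# Hedged sketch of the fluid interface from compat.py above; shard paths, symbol_table and
# loader arguments are placeholders, and the decode step that turns raw tar members into
# {fname, wav, txt, sample_rate} dicts is elided because its exact form is not shown here.
shard_urls = [ln.strip() for ln in open("data/train_l/data.list")]   # one tar shard per line
dataset = (WebDataset(shard_urls, shardshuffle=True)
           # ... decode/parse raw tar members into sample dicts here ...
           .audio_tokenize(symbol_table)
           .audio_data_filter(max_length=1200)
           .audio_compute_fbank(num_mel_bins=80, dither=0.1)
           .sort(1000)
           .batched(32)
           .audio_padding()
           .audio_cmvn("data/mean_std.json"))
loader = WebLoader(dataset, num_workers=8, batch_size=None)   # assumes the wrapped DataLoader
                                                              # accepts already-batched samples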
+ It is also useful for choosing smaller nominal epoch sizes with + very large datasets. + + """ + + def __init__(self, dataset, length): + """Chop the dataset to the given length. + + :param dataset: IterableDataset + :param length: declared length of the dataset + :param nominal: nominal length of dataset (if different from declared) + """ + super().__init__() + self.length = length + self.source = None + + def __getstate__(self): + """Return the pickled state of the dataset. + + This resets the dataset iterator, since that can't be pickled. + """ + result = dict(self.__dict__) + result["source"] = None + return result + + def invoke(self, dataset): + """Return an iterator over the dataset. + + This iterator returns as many samples as given by the `length` + parameter. + """ + if self.source is None: + self.source = iter(dataset) + for i in range(self.length): + try: + sample = next(self.source) + except StopIteration: + self.source = iter(dataset) + try: + sample = next(self.source) + except StopIteration: + return + yield sample + self.source = None + + +class with_length(IterableDataset, PipelineStage): + """Repeatedly yield samples from a dataset.""" + + def __init__(self, dataset, length): + """Create an instance of Repeatedly. + + :param dataset: source dataset + :param length: stated length + """ + super().__init__() + self.dataset = dataset + self.length = length + + def invoke(self, dataset): + """Return an iterator that iterates repeatedly over a source.""" + return iter(dataset) + + def __len__(self): + """Return the user specified length.""" + return self.length diff --git a/paddlespeech/audio/streamdata/filters.py b/paddlespeech/audio/streamdata/filters.py new file mode 100644 index 0000000000000000000000000000000000000000..82b9c6bab9197d6832d641b52289bbaf290dd122 --- /dev/null +++ b/paddlespeech/audio/streamdata/filters.py @@ -0,0 +1,935 @@ +# Copyright (c) 2017-2021 NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# This file is part of the WebDataset library. +# See the LICENSE file for licensing terms (BSD-style). +# + +# Modified from https://github.com/webdataset/webdataset +# Modified from wenet(https://github.com/wenet-e2e/wenet) +"""A collection of iterators for data transformations. + +These functions are plain iterator functions. You can find curried versions +in webdataset.filters, and you can find IterableDataset wrappers in +webdataset.processing. +""" + +import io +from fnmatch import fnmatch +import re +import itertools, os, random, sys, time +from functools import reduce, wraps + +import numpy as np + +from . import autodecode +from . import utils +from .paddle_utils import PaddleTensor +from .utils import PipelineStage + +from .. import backends +from ..compliance import kaldi +import paddle +from ..transform.cmvn import GlobalCMVN +from ..utils.tensor_utils import pad_sequence +from ..transform.spec_augment import time_warp +from ..transform.spec_augment import time_mask +from ..transform.spec_augment import freq_mask + +class FilterFunction(object): + """Helper class for currying pipeline stages. + + We use this roundabout construct becauce it can be pickled. 
+ """ + + def __init__(self, f, *args, **kw): + """Create a curried function.""" + self.f = f + self.args = args + self.kw = kw + + def __call__(self, data): + """Call the curried function with the given argument.""" + return self.f(data, *self.args, **self.kw) + + def __str__(self): + """Compute a string representation.""" + return f"<{self.f.__name__} {self.args} {self.kw}>" + + def __repr__(self): + """Compute a string representation.""" + return f"<{self.f.__name__} {self.args} {self.kw}>" + + +class RestCurried(object): + """Helper class for currying pipeline stages. + + We use this roundabout construct because it can be pickled. + """ + + def __init__(self, f): + """Store the function for future currying.""" + self.f = f + + def __call__(self, *args, **kw): + """Curry with the given arguments.""" + return FilterFunction(self.f, *args, **kw) + + +def pipelinefilter(f): + """Turn the decorated function into one that is partially applied for + all arguments other than the first.""" + result = RestCurried(f) + return result + + +def reraise_exception(exn): + """Reraises the given exception; used as a handler. + + :param exn: exception + """ + raise exn + + +def identity(x): + """Return the argument.""" + return x + + +def compose2(f, g): + """Compose two functions, g(f(x)).""" + return lambda x: g(f(x)) + + +def compose(*args): + """Compose a sequence of functions (left-to-right).""" + return reduce(compose2, args) + + +def pipeline(source, *args): + """Write an input pipeline; first argument is source, rest are filters.""" + if len(args) == 0: + return source + return compose(*args)(source) + + +def getfirst(a, keys, default=None, missing_is_error=True): + """Get the first matching key from a dictionary. + + Keys can be specified as a list, or as a string of keys separated by ';'. + """ + if isinstance(keys, str): + assert " " not in keys + keys = keys.split(";") + for k in keys: + if k in a: + return a[k] + if missing_is_error: + raise ValueError(f"didn't find {keys} in {list(a.keys())}") + return default + + +def parse_field_spec(fields): + """Parse a specification for a list of fields to be extracted. + + Keys are separated by spaces in the spec. Each key can itself + be composed of key alternatives separated by ';'. + """ + if isinstance(fields, str): + fields = fields.split() + return [field.split(";") for field in fields] + + +def transform_with(sample, transformers): + """Transform a list of values using a list of functions. + + sample: list of values + transformers: list of functions + + If there are fewer transformers than inputs, or if a transformer + function is None, then the identity function is used for the + corresponding sample fields. + """ + if transformers is None or len(transformers) == 0: + return sample + result = list(sample) + assert len(transformers) <= len(sample) + for i in range(len(transformers)): # skipcq: PYL-C0200 + f = transformers[i] + if f is not None: + result[i] = f(sample[i]) + return result + +### +# Iterators +### + +def _info(data, fmt=None, n=3, every=-1, width=50, stream=sys.stderr, name=""): + """Print information about the samples that are passing through. 
+ + :param data: source iterator + :param fmt: format statement (using sample dict as keyword) + :param n: when to stop + :param every: how often to print + :param width: maximum width + :param stream: output stream + :param name: identifier printed before any output + """ + for i, sample in enumerate(data): + if i < n or (every > 0 and (i + 1) % every == 0): + if fmt is None: + print("---", name, file=stream) + for k, v in sample.items(): + print(k, repr(v)[:width], file=stream) + else: + print(fmt.format(**sample), file=stream) + yield sample + + +info = pipelinefilter(_info) + + +def pick(buf, rng): + k = rng.randint(0, len(buf) - 1) + sample = buf[k] + buf[k] = buf[-1] + buf.pop() + return sample + + +def _shuffle(data, bufsize=1000, initial=100, rng=None, handler=None): + """Shuffle the data in the stream. + + This uses a buffer of size `bufsize`. Shuffling at + startup is less random; this is traded off against + yielding samples quickly. + + data: iterator + bufsize: buffer size for shuffling + returns: iterator + rng: either random module or random.Random instance + + """ + if rng is None: + rng = random.Random(int((os.getpid() + time.time()) * 1e9)) + initial = min(initial, bufsize) + buf = [] + for sample in data: + buf.append(sample) + if len(buf) < bufsize: + try: + buf.append(next(data)) # skipcq: PYL-R1708 + except StopIteration: + pass + if len(buf) >= initial: + yield pick(buf, rng) + while len(buf) > 0: + yield pick(buf, rng) + + +shuffle = pipelinefilter(_shuffle) + + +class detshuffle(PipelineStage): + def __init__(self, bufsize=1000, initial=100, seed=0, epoch=-1): + self.bufsize = bufsize + self.initial = initial + self.seed = seed + self.epoch = epoch + + def run(self, src): + self.epoch += 1 + rng = random.Random() + rng.seed((self.seed, self.epoch)) + return _shuffle(src, self.bufsize, self.initial, rng) + + +def _select(data, predicate): + """Select samples based on a predicate. 
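# The pipelinefilter / RestCurried / FilterFunction machinery above turns a plain generator
# such as _shuffle(data, ...) into a picklable, curried pipeline stage: calling the wrapper
# with everything except the data source returns a FilterFunction that is applied to the
# iterator later. Illustration only; `samples` is any iterable of sample dicts.
stage = shuffle(1500, initial=100)   # curried, no data source yet
for sample in stage(samples):        # FilterFunction.__call__ -> _shuffle(samples, 1500, initial=100)
    ...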
+ + :param data: source iterator + :param predicate: predicate (function) + """ + for sample in data: + if predicate(sample): + yield sample + + +select = pipelinefilter(_select) + + +def _log_keys(data, logfile=None): + import fcntl + + if logfile is None or logfile == "": + for sample in data: + yield sample + else: + with open(logfile, "a") as stream: + for i, sample in enumerate(data): + buf = f"{i}\t{sample.get('__worker__')}\t{sample.get('__rank__')}\t{sample.get('__key__')}\n" + try: + fcntl.flock(stream.fileno(), fcntl.LOCK_EX) + stream.write(buf) + finally: + fcntl.flock(stream.fileno(), fcntl.LOCK_UN) + yield sample + + +log_keys = pipelinefilter(_log_keys) + + +def _decode(data, *args, handler=reraise_exception, **kw): + """Decode data based on the decoding functions given as arguments.""" + + decoder = lambda x: autodecode.imagehandler(x) if isinstance(x, str) else x + handlers = [decoder(x) for x in args] + f = autodecode.Decoder(handlers, **kw) + + for sample in data: + assert isinstance(sample, dict), sample + try: + decoded = f(sample) + except Exception as exn: # skipcq: PYL-W0703 + if handler(exn): + continue + else: + break + yield decoded + + +decode = pipelinefilter(_decode) + + +def _map(data, f, handler=reraise_exception): + """Map samples.""" + for sample in data: + try: + result = f(sample) + except Exception as exn: + if handler(exn): + continue + else: + break + if result is None: + continue + if isinstance(sample, dict) and isinstance(result, dict): + result["__key__"] = sample.get("__key__") + yield result + + +map = pipelinefilter(_map) + + +def _rename(data, handler=reraise_exception, keep=True, **kw): + """Rename samples based on keyword arguments.""" + for sample in data: + try: + if not keep: + yield {k: getfirst(sample, v, missing_is_error=True) for k, v in kw.items()} + else: + + def listify(v): + return v.split(";") if isinstance(v, str) else v + + to_be_replaced = {x for v in kw.values() for x in listify(v)} + result = {k: v for k, v in sample.items() if k not in to_be_replaced} + result.update({k: getfirst(sample, v, missing_is_error=True) for k, v in kw.items()}) + yield result + except Exception as exn: + if handler(exn): + continue + else: + break + + +rename = pipelinefilter(_rename) + + +def _associate(data, associator, **kw): + """Associate additional data with samples.""" + for sample in data: + if callable(associator): + extra = associator(sample["__key__"]) + else: + extra = associator.get(sample["__key__"], {}) + sample.update(extra) # destructive + yield sample + + +associate = pipelinefilter(_associate) + + +def _map_dict(data, handler=reraise_exception, **kw): + """Map the entries in a dict sample with individual functions.""" + assert len(list(kw.keys())) > 0 + for key, f in kw.items(): + assert callable(f), (key, f) + + for sample in data: + assert isinstance(sample, dict) + try: + for k, f in kw.items(): + sample[k] = f(sample[k]) + except Exception as exn: + if handler(exn): + continue + else: + break + yield sample + + +map_dict = pipelinefilter(_map_dict) + + +def _to_tuple(data, *args, handler=reraise_exception, missing_is_error=True, none_is_error=None): + """Convert dict samples to tuples.""" + if none_is_error is None: + none_is_error = missing_is_error + if len(args) == 1 and isinstance(args[0], str) and " " in args[0]: + args = args[0].split() + + for sample in data: + try: + result = tuple([getfirst(sample, f, missing_is_error=missing_is_error) for f in args]) + if none_is_error and any(x is None for x in result): + raise 
ValueError(f"to_tuple {args} got {sample.keys()}") + yield result + except Exception as exn: + if handler(exn): + continue + else: + break + + +to_tuple = pipelinefilter(_to_tuple) + + +def _map_tuple(data, *args, handler=reraise_exception): + """Map the entries of a tuple with individual functions.""" + args = [f if f is not None else utils.identity for f in args] + for f in args: + assert callable(f), f + for sample in data: + assert isinstance(sample, (list, tuple)) + sample = list(sample) + n = min(len(args), len(sample)) + try: + for i in range(n): + sample[i] = args[i](sample[i]) + except Exception as exn: + if handler(exn): + continue + else: + break + yield tuple(sample) + + +map_tuple = pipelinefilter(_map_tuple) + + +def _unlisted(data): + """Turn batched data back into unbatched data.""" + for batch in data: + assert isinstance(batch, list), sample + for sample in batch: + yield sample + + +unlisted = pipelinefilter(_unlisted) + + +def _unbatched(data): + """Turn batched data back into unbatched data.""" + for sample in data: + assert isinstance(sample, (tuple, list)), sample + assert len(sample) > 0 + for i in range(len(sample[0])): + yield tuple(x[i] for x in sample) + + +unbatched = pipelinefilter(_unbatched) + + +def _rsample(data, p=0.5): + """Randomly subsample a stream of data.""" + assert p >= 0.0 and p <= 1.0 + for sample in data: + if random.uniform(0.0, 1.0) < p: + yield sample + + +rsample = pipelinefilter(_rsample) + +slice = pipelinefilter(itertools.islice) + + +def _extract_keys(source, *patterns, duplicate_is_error=True, ignore_missing=False): + for sample in source: + result = [] + for pattern in patterns: + pattern = pattern.split(";") if isinstance(pattern, str) else pattern + matches = [x for x in sample.keys() if any(fnmatch("." 
+ x, p) for p in pattern)] + if len(matches) == 0: + if ignore_missing: + continue + else: + raise ValueError(f"Cannot find {pattern} in sample keys {sample.keys()}.") + if len(matches) > 1 and duplicate_is_error: + raise ValueError(f"Multiple sample keys {sample.keys()} match {pattern}.") + value = sample[matches[0]] + result.append(value) + yield tuple(result) + + +extract_keys = pipelinefilter(_extract_keys) + + +def _rename_keys(source, *args, keep_unselected=False, must_match=True, duplicate_is_error=True, **kw): + renamings = [(pattern, output) for output, pattern in args] + renamings += [(pattern, output) for output, pattern in kw.items()] + for sample in source: + new_sample = {} + matched = {k: False for k, _ in renamings} + for path, value in sample.items(): + fname = re.sub(r".*/", "", path) + new_name = None + for pattern, name in renamings[::-1]: + if fnmatch(fname.lower(), pattern): + matched[pattern] = True + new_name = name + break + if new_name is None: + if keep_unselected: + new_sample[path] = value + continue + if new_name in new_sample: + if duplicate_is_error: + raise ValueError(f"Duplicate value in sample {sample.keys()} after rename.") + continue + new_sample[new_name] = value + if must_match and not all(matched.values()): + raise ValueError(f"Not all patterns ({matched}) matched sample keys ({sample.keys()}).") + + yield new_sample + + +rename_keys = pipelinefilter(_rename_keys) + + +def decode_bin(stream): + return stream.read() + + +def decode_text(stream): + binary = stream.read() + return binary.decode("utf-8") + + +def decode_pickle(stream): + return pickle.load(stream) + + +default_decoders = [ + ("*.bin", decode_bin), + ("*.txt", decode_text), + ("*.pyd", decode_pickle), +] + + +def find_decoder(decoders, path): + fname = re.sub(r".*/", "", path) + if fname.startswith("__"): + return lambda x: x + for pattern, fun in decoders[::-1]: + if fnmatch(fname.lower(), pattern) or fnmatch("." + fname.lower(), pattern): + return fun + return None + + +def _xdecode( + source, + *args, + must_decode=True, + defaults=default_decoders, + **kw, +): + decoders = list(defaults) + list(args) + decoders += [("*." + k, v) for k, v in kw.items()] + for sample in source: + new_sample = {} + for path, data in sample.items(): + if path.startswith("__"): + new_sample[path] = data + continue + decoder = find_decoder(decoders, path) + if decoder is False: + value = data + elif decoder is None: + if must_decode: + raise ValueError(f"No decoder found for {path}.") + value = data + else: + if isinstance(data, bytes): + data = io.BytesIO(data) + value = decoder(data) + new_sample[path] = value + yield new_sample + +xdecode = pipelinefilter(_xdecode) + + + +def _audio_data_filter(source, + frame_shift=10, + max_length=10240, + min_length=10, + token_max_length=200, + token_min_length=1, + min_output_input_ratio=0.0005, + max_output_input_ratio=1): + """ Filter sample according to feature and label length + Inplace operation. 
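# Small, hypothetical example of the glob-based key extraction defined above; keys of these
# tar samples are extension-like ("wav", "txt"), so the leading "." added before matching
# makes "*.wav" style patterns work.
pairs = extract_keys("*.wav;*.flac", "*.txt")(samples)   # yields (audio_bytes, transcript) tuples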
+ + Args:: + source: Iterable[{fname, wav, label, sample_rate}] + frame_shift: length of frame shift (ms) + max_length: drop utterance which is greater than max_length(10ms) + min_length: drop utterance which is less than min_length(10ms) + token_max_length: drop utterance which is greater than + token_max_length, especially when use char unit for + english modeling + token_min_length: drop utterance which is + less than token_max_length + min_output_input_ratio: minimal ration of + token_length / feats_length(10ms) + max_output_input_ratio: maximum ration of + token_length / feats_length(10ms) + + Returns: + Iterable[{fname, wav, label, sample_rate}] + """ + for sample in source: + assert 'sample_rate' in sample + assert 'wav' in sample + assert 'label' in sample + # sample['wav'] is paddle.Tensor, we have 100 frames every second (default) + num_frames = sample['wav'].shape[1] / sample['sample_rate'] * (1000 / frame_shift) + if num_frames < min_length: + continue + if num_frames > max_length: + continue + if len(sample['label']) < token_min_length: + continue + if len(sample['label']) > token_max_length: + continue + if num_frames != 0: + if len(sample['label']) / num_frames < min_output_input_ratio: + continue + if len(sample['label']) / num_frames > max_output_input_ratio: + continue + yield sample + +audio_data_filter = pipelinefilter(_audio_data_filter) + +def _audio_tokenize(source, + symbol_table, + bpe_model=None, + non_lang_syms=None, + split_with_space=False): + """ Decode text to chars or BPE + Inplace operation + + Args: + source: Iterable[{fname, wav, txt, sample_rate}] + + Returns: + Iterable[{fname, wav, txt, tokens, label, sample_rate}] + """ + if non_lang_syms is not None: + non_lang_syms_pattern = re.compile(r"(\[[^\[\]]+\]|<[^<>]+>|{[^{}]+})") + else: + non_lang_syms = {} + non_lang_syms_pattern = None + + if bpe_model is not None: + import sentencepiece as spm + sp = spm.SentencePieceProcessor() + sp.load(bpe_model) + else: + sp = None + + for sample in source: + assert 'txt' in sample + txt = sample['txt'].strip() + if non_lang_syms_pattern is not None: + parts = non_lang_syms_pattern.split(txt.upper()) + parts = [w for w in parts if len(w.strip()) > 0] + else: + parts = [txt] + + label = [] + tokens = [] + for part in parts: + if part in non_lang_syms: + tokens.append(part) + else: + if bpe_model is not None: + tokens.extend(__tokenize_by_bpe_model(sp, part)) + else: + if split_with_space: + part = part.split(" ") + for ch in part: + if ch == ' ': + ch = "" + tokens.append(ch) + + for ch in tokens: + if ch in symbol_table: + label.append(symbol_table[ch]) + elif '' in symbol_table: + label.append(symbol_table['']) + + sample['tokens'] = tokens + sample['label'] = label + yield sample + +audio_tokenize = pipelinefilter(_audio_tokenize) + +def _audio_resample(source, resample_rate=16000): + """ Resample data. + Inplace operation. 
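# Worked example of the length filter above, with assumed values: 10 s of 16 kHz audio is
# 160000 samples, i.e. 160000 / 16000 * (1000 / 10) = 1000 frames at a 10 ms frame shift.
sample_rate, num_samples, frame_shift = 16000, 160000, 10
num_frames = num_samples / sample_rate * (1000 / frame_shift)   # -> 1000.0
# 10 <= 1000 <= 10240, and a 20-token label gives 20 / 1000 = 0.02, which lies inside
# (min_output_input_ratio=0.0005, max_output_input_ratio=1), so such a sample is kept.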
+ + Args: + data: Iterable[{fname, wav, label, sample_rate}] + resample_rate: target resample rate + + Returns: + Iterable[{fname, wav, label, sample_rate}] + """ + for sample in source: + assert 'sample_rate' in sample + assert 'wav' in sample + sample_rate = sample['sample_rate'] + waveform = sample['wav'] + if sample_rate != resample_rate: + sample['sample_rate'] = resample_rate + sample['wav'] = paddle.to_tensor(backends.soundfile_backend.resample( + waveform.numpy(), src_sr = sample_rate, target_sr = resample_rate + )) + yield sample + +audio_resample = pipelinefilter(_audio_resample) + +def _audio_compute_fbank(source, + num_mel_bins=80, + frame_length=25, + frame_shift=10, + dither=0.0): + """ Extract fbank + + Args: + source: Iterable[{fname, wav, label, sample_rate}] + num_mel_bins: number of mel filter bank + frame_length: length of one frame (ms) + frame_shift: length of frame shift (ms) + dither: value of dither + + Returns: + Iterable[{fname, feat, label}] + """ + for sample in source: + assert 'sample_rate' in sample + assert 'wav' in sample + assert 'fname' in sample + assert 'label' in sample + sample_rate = sample['sample_rate'] + waveform = sample['wav'] + waveform = waveform * (1 << 15) + # Only keep fname, feat, label + mat = kaldi.fbank(waveform, + n_mels=num_mel_bins, + frame_length=frame_length, + frame_shift=frame_shift, + dither=dither, + energy_floor=0.0, + sr=sample_rate) + yield dict(fname=sample['fname'], label=sample['label'], feat=mat) + + +audio_compute_fbank = pipelinefilter(_audio_compute_fbank) + +def _audio_spec_aug(source, + max_w=5, + w_inplace=True, + w_mode="PIL", + max_f=30, + num_f_mask=2, + f_inplace=True, + f_replace_with_zero=False, + max_t=40, + num_t_mask=2, + t_inplace=True, + t_replace_with_zero=False,): + """ Do spec augmentation + Inplace operation + + Args: + source: Iterable[{fname, feat, label}] + max_w: max width of time warp + w_inplace: whether to inplace the original data while time warping + w_mode: time warp mode + max_f: max width of freq mask + num_f_mask: number of freq mask to apply + f_inplace: whether to inplace the original data while frequency masking + f_replace_with_zero: use zero to mask + max_t: max width of time mask + num_t_mask: number of time mask to apply + t_inplace: whether to inplace the original data while time masking + t_replace_with_zero: use zero to mask + + Returns + Iterable[{fname, feat, label}] + """ + for sample in source: + x = sample['feat'] + x = x.numpy() + x = time_warp(x, max_time_warp=max_w, inplace = w_inplace, mode= w_mode) + x = freq_mask(x, F = max_f, n_mask = num_f_mask, inplace = f_inplace, replace_with_zero = f_replace_with_zero) + x = time_mask(x, T = max_t, n_mask = num_t_mask, inplace = t_inplace, replace_with_zero = t_replace_with_zero) + sample['feat'] = paddle.to_tensor(x, dtype=paddle.float32) + yield sample + +audio_spec_aug = pipelinefilter(_audio_spec_aug) + + +def _sort(source, sort_size=500): + """ Sort the data by feature length. 
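# Note on _audio_compute_fbank above: waveforms decoded as float in [-1, 1] are rescaled by
# (1 << 15) = 32768 so the Kaldi-compatible fbank sees samples in the 16-bit integer range
# that Kaldi feature extraction conventionally assumes; e.g. a value of 0.5 becomes
# 0.5 * (1 << 15) = 16384.0 before kaldi.fbank is called. dither=0.1 matches the new
# examples/wenetspeech/asr1/conf/conformer.yaml setting.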
+ Sort is used after shuffle and before batch, so we can group + utts with similar lengths into a batch, and `sort_size` should + be less than `shuffle_size` + + Args: + source: Iterable[{fname, feat, label}] + sort_size: buffer size for sort + + Returns: + Iterable[{fname, feat, label}] + """ + + buf = [] + for sample in source: + buf.append(sample) + if len(buf) >= sort_size: + buf.sort(key=lambda x: x['feat'].shape[0]) + for x in buf: + yield x + buf = [] + # The sample left over + buf.sort(key=lambda x: x['feat'].shape[0]) + for x in buf: + yield x + +sort = pipelinefilter(_sort) + +def _batched(source, batch_size=16): + """ Static batch the data by `batch_size` + + Args: + data: Iterable[{fname, feat, label}] + batch_size: batch size + + Returns: + Iterable[List[{fname, feat, label}]] + """ + buf = [] + for sample in source: + buf.append(sample) + if len(buf) >= batch_size: + yield buf + buf = [] + if len(buf) > 0: + yield buf + +batched = pipelinefilter(_batched) + +def dynamic_batched(source, max_frames_in_batch=12000): + """ Dynamic batch the data until the total frames in batch + reach `max_frames_in_batch` + + Args: + source: Iterable[{fname, feat, label}] + max_frames_in_batch: max_frames in one batch + + Returns: + Iterable[List[{fname, feat, label}]] + """ + buf = [] + longest_frames = 0 + for sample in source: + assert 'feat' in sample + assert isinstance(sample['feat'], paddle.Tensor) + new_sample_frames = sample['feat'].size(0) + longest_frames = max(longest_frames, new_sample_frames) + frames_after_padding = longest_frames * (len(buf) + 1) + if frames_after_padding > max_frames_in_batch: + yield buf + buf = [sample] + longest_frames = new_sample_frames + else: + buf.append(sample) + if len(buf) > 0: + yield buf + + +def _audio_padding(source): + """ Padding the data into training data + + Args: + source: Iterable[List[{fname, feat, label}]] + + Returns: + Iterable[Tuple(fname, feats, labels, feats lengths, label lengths)] + """ + for sample in source: + assert isinstance(sample, list) + feats_length = paddle.to_tensor([x['feat'].shape[0] for x in sample], + dtype="int64") + order = paddle.argsort(feats_length, descending=True) + feats_lengths = paddle.to_tensor( + [sample[i]['feat'].shape[0] for i in order], dtype="int64") + sorted_feats = [sample[i]['feat'] for i in order] + sorted_keys = [sample[i]['fname'] for i in order] + sorted_labels = [ + paddle.to_tensor(sample[i]['label'], dtype="int32") for i in order + ] + label_lengths = paddle.to_tensor([x.shape[0] for x in sorted_labels], + dtype="int64") + padded_feats = pad_sequence(sorted_feats, + batch_first=True, + padding_value=0) + padding_labels = pad_sequence(sorted_labels, + batch_first=True, + padding_value=-1) + + yield (sorted_keys, padded_feats, feats_lengths, padding_labels, + label_lengths) + +audio_padding = pipelinefilter(_audio_padding) + +def _audio_cmvn(source, cmvn_file): + global_cmvn = GlobalCMVN(cmvn_file) + for batch in source: + sorted_keys, padded_feats, feats_lengths, padding_labels, label_lengths = batch + padded_feats = padded_feats.numpy() + padded_feats = global_cmvn(padded_feats) + padded_feats = paddle.to_tensor(padded_feats, dtype=paddle.float32) + yield (sorted_keys, padded_feats, feats_lengths, padding_labels, + label_lengths) + +audio_cmvn = pipelinefilter(_audio_cmvn) + +def _placeholder(source): + for data in source: + yield data + +placeholder = pipelinefilter(_placeholder) diff --git a/paddlespeech/audio/streamdata/gopen.py b/paddlespeech/audio/streamdata/gopen.py new file mode 
100644 index 0000000000000000000000000000000000000000..457d048a6df940369025ed306c68561f6846322b --- /dev/null +++ b/paddlespeech/audio/streamdata/gopen.py @@ -0,0 +1,340 @@ +# +# Copyright (c) 2017-2021 NVIDIA CORPORATION. All rights reserved. +# This file is part of the WebDataset library. +# See the LICENSE file for licensing terms (BSD-style). +# + + +"""Open URLs by calling subcommands.""" + +import os, sys, re +from subprocess import PIPE, Popen +from urllib.parse import urlparse + +# global used for printing additional node information during verbose output +info = {} + + +class Pipe: + """Wrapper class for subprocess.Pipe. + + This class looks like a stream from the outside, but it checks + subprocess status and handles timeouts with exceptions. + This way, clients of the class do not need to know that they are + dealing with subprocesses. + + :param *args: passed to `subprocess.Pipe` + :param **kw: passed to `subprocess.Pipe` + :param timeout: timeout for closing/waiting + :param ignore_errors: don't raise exceptions on subprocess errors + :param ignore_status: list of status codes to ignore + """ + + def __init__( + self, + *args, + mode=None, + timeout=7200.0, + ignore_errors=False, + ignore_status=[], + **kw, + ): + """Create an IO Pipe.""" + self.ignore_errors = ignore_errors + self.ignore_status = [0] + ignore_status + self.timeout = timeout + self.args = (args, kw) + if mode[0] == "r": + self.proc = Popen(*args, stdout=PIPE, **kw) + self.stream = self.proc.stdout + if self.stream is None: + raise ValueError(f"{args}: couldn't open") + elif mode[0] == "w": + self.proc = Popen(*args, stdin=PIPE, **kw) + self.stream = self.proc.stdin + if self.stream is None: + raise ValueError(f"{args}: couldn't open") + self.status = None + + def __str__(self): + return f"" + + def check_status(self): + """Poll the process and handle any errors.""" + status = self.proc.poll() + if status is not None: + self.wait_for_child() + + def wait_for_child(self): + """Check the status variable and raise an exception if necessary.""" + verbose = int(os.environ.get("GOPEN_VERBOSE", 0)) + if self.status is not None and verbose: + # print(f"(waiting again [{self.status} {os.getpid()}:{self.proc.pid}])", file=sys.stderr) + return + self.status = self.proc.wait() + if verbose: + print( + f"pipe exit [{self.status} {os.getpid()}:{self.proc.pid}] {self.args} {info}", + file=sys.stderr, + ) + if self.status not in self.ignore_status and not self.ignore_errors: + raise Exception(f"{self.args}: exit {self.status} (read) {info}") + + def read(self, *args, **kw): + """Wrap stream.read and checks status.""" + result = self.stream.read(*args, **kw) + self.check_status() + return result + + def write(self, *args, **kw): + """Wrap stream.write and checks status.""" + result = self.stream.write(*args, **kw) + self.check_status() + return result + + def readLine(self, *args, **kw): + """Wrap stream.readLine and checks status.""" + result = self.stream.readLine(*args, **kw) + self.status = self.proc.poll() + self.check_status() + return result + + def close(self): + """Wrap stream.close, wait for the subprocess, and handle errors.""" + self.stream.close() + self.status = self.proc.wait(self.timeout) + self.wait_for_child() + + def __enter__(self): + """Context handler.""" + return self + + def __exit__(self, etype, value, traceback): + """Context handler.""" + self.close() + + +def set_options( + obj, timeout=None, ignore_errors=None, ignore_status=None, handler=None +): + """Set options for Pipes. 
+ + This function can be called on any stream. It will set pipe options only + when its argument is a pipe. + + :param obj: any kind of stream + :param timeout: desired timeout + :param ignore_errors: desired ignore_errors setting + :param ignore_status: desired ignore_status setting + :param handler: desired error handler + """ + if not isinstance(obj, Pipe): + return False + if timeout is not None: + obj.timeout = timeout + if ignore_errors is not None: + obj.ignore_errors = ignore_errors + if ignore_status is not None: + obj.ignore_status = ignore_status + if handler is not None: + obj.handler = handler + return True + + +def gopen_file(url, mode="rb", bufsize=8192): + """Open a file. + + This works for local files, files over HTTP, and pipe: files. + + :param url: URL to be opened + :param mode: mode to open it with + :param bufsize: requested buffer size + """ + return open(url, mode) + + +def gopen_pipe(url, mode="rb", bufsize=8192): + """Use gopen to open a pipe. + + :param url: a pipe: URL + :param mode: desired mode + :param bufsize: desired buffer size + """ + assert url.startswith("pipe:") + cmd = url[5:] + if mode[0] == "r": + return Pipe( + cmd, + mode=mode, + shell=True, + bufsize=bufsize, + ignore_status=[141], + ) # skipcq: BAN-B604 + elif mode[0] == "w": + return Pipe( + cmd, + mode=mode, + shell=True, + bufsize=bufsize, + ignore_status=[141], + ) # skipcq: BAN-B604 + else: + raise ValueError(f"{mode}: unknown mode") + + +def gopen_curl(url, mode="rb", bufsize=8192): + """Open a URL with `curl`. + + :param url: url (usually, http:// etc.) + :param mode: file mode + :param bufsize: buffer size + """ + if mode[0] == "r": + cmd = f"curl -s -L '{url}'" + return Pipe( + cmd, + mode=mode, + shell=True, + bufsize=bufsize, + ignore_status=[141, 23], + ) # skipcq: BAN-B604 + elif mode[0] == "w": + cmd = f"curl -s -L -T - '{url}'" + return Pipe( + cmd, + mode=mode, + shell=True, + bufsize=bufsize, + ignore_status=[141, 26], + ) # skipcq: BAN-B604 + else: + raise ValueError(f"{mode}: unknown mode") + + +def gopen_htgs(url, mode="rb", bufsize=8192): + """Open a URL with `curl`. + + :param url: url (usually, http:// etc.) + :param mode: file mode + :param bufsize: buffer size + """ + if mode[0] == "r": + url = re.sub(r"(?i)^htgs://", "gs://", url) + cmd = f"curl -s -L '{url}'" + return Pipe( + cmd, + mode=mode, + shell=True, + bufsize=bufsize, + ignore_status=[141, 23], + ) # skipcq: BAN-B604 + elif mode[0] == "w": + raise ValueError(f"{mode}: cannot write") + else: + raise ValueError(f"{mode}: unknown mode") + + + +def gopen_gsutil(url, mode="rb", bufsize=8192): + """Open a URL with `curl`. + + :param url: url (usually, http:// etc.) + :param mode: file mode + :param bufsize: buffer size + """ + if mode[0] == "r": + cmd = f"gsutil cat '{url}'" + return Pipe( + cmd, + mode=mode, + shell=True, + bufsize=bufsize, + ignore_status=[141, 23], + ) # skipcq: BAN-B604 + elif mode[0] == "w": + cmd = f"gsutil cp - '{url}'" + return Pipe( + cmd, + mode=mode, + shell=True, + bufsize=bufsize, + ignore_status=[141, 26], + ) # skipcq: BAN-B604 + else: + raise ValueError(f"{mode}: unknown mode") + + + +def gopen_error(url, *args, **kw): + """Raise a value error. 
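Editor's note: the `gopen_*` helpers above all reduce to running a shell command (`curl`, `gsutil`, or an arbitrary `pipe:` command) and exposing its stdout/stdin as a file-like stream; the `Pipe` class adds status checking on top of `subprocess.Popen`. A minimal, stand-alone sketch of the reading path only, with no timeout handling or ignorable status codes.

```python
from subprocess import PIPE, Popen

class MiniPipe:
    """Read-only sketch of the Pipe idea: stream the stdout of a subcommand."""
    def __init__(self, cmd):
        self.proc = Popen(cmd, stdout=PIPE, shell=True)
        self.stream = self.proc.stdout

    def read(self, *args):
        data = self.stream.read(*args)
        if self.proc.poll() not in (None, 0):      # surface subprocess failures
            raise RuntimeError(f"command failed with {self.proc.returncode}")
        return data

    def close(self):
        self.stream.close()
        self.proc.wait()

# Roughly what gopen("pipe:echo hello", "rb") resolves to.
p = MiniPipe("echo hello")
print(p.read())     # b'hello\n'
p.close()
```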
+ + :param url: url + :param args: other arguments + :param kw: other keywords + """ + raise ValueError(f"{url}: no gopen handler defined") + + +"""A dispatch table mapping URL schemes to handlers.""" +gopen_schemes = dict( + __default__=gopen_error, + pipe=gopen_pipe, + http=gopen_curl, + https=gopen_curl, + sftp=gopen_curl, + ftps=gopen_curl, + scp=gopen_curl, + gs=gopen_gsutil, + htgs=gopen_htgs, +) + + +def gopen(url, mode="rb", bufsize=8192, **kw): + """Open the URL. + + This uses the `gopen_schemes` dispatch table to dispatch based + on scheme. + + Support for the following schemes is built-in: pipe, file, + http, https, sftp, ftps, scp. + + When no scheme is given the url is treated as a file. + + You can use the OPEN_VERBOSE argument to get info about + files being opened. + + :param url: the source URL + :param mode: the mode ("rb", "r") + :param bufsize: the buffer size + """ + global fallback_gopen + verbose = int(os.environ.get("GOPEN_VERBOSE", 0)) + if verbose: + print("GOPEN", url, info, file=sys.stderr) + assert mode in ["rb", "wb"], mode + if url == "-": + if mode == "rb": + return sys.stdin.buffer + elif mode == "wb": + return sys.stdout.buffer + else: + raise ValueError(f"unknown mode {mode}") + pr = urlparse(url) + if pr.scheme == "": + bufsize = int(os.environ.get("GOPEN_BUFFER", -1)) + return open(url, mode, buffering=bufsize) + if pr.scheme == "file": + bufsize = int(os.environ.get("GOPEN_BUFFER", -1)) + return open(pr.path, mode, buffering=bufsize) + handler = gopen_schemes["__default__"] + handler = gopen_schemes.get(pr.scheme, handler) + return handler(url, mode, bufsize, **kw) + + +def reader(url, **kw): + """Open url with gopen and mode "rb". + + :param url: source URL + :param kw: other keywords forwarded to gopen + """ + return gopen(url, "rb", **kw) diff --git a/paddlespeech/audio/streamdata/handlers.py b/paddlespeech/audio/streamdata/handlers.py new file mode 100644 index 0000000000000000000000000000000000000000..7f3d28b62e914e4b07e87c71f1a07f2cbeefbfca --- /dev/null +++ b/paddlespeech/audio/streamdata/handlers.py @@ -0,0 +1,47 @@ +# +# Copyright (c) 2017-2021 NVIDIA CORPORATION. All rights reserved. +# This file is part of the WebDataset library. +# See the LICENSE file for licensing terms (BSD-style). +# + +"""Pluggable exception handlers. + +These are functions that take an exception as an argument and then return... + +- the exception (in order to re-raise it) +- True (in order to continue and ignore the exception) +- False (in order to ignore the exception and stop processing) + +They are used as handler= arguments in much of the library. 
+""" + +import time, warnings + + +def reraise_exception(exn): + """Call in an exception handler to re-raise the exception.""" + raise exn + + +def ignore_and_continue(exn): + """Call in an exception handler to ignore any exception and continue.""" + return True + + +def warn_and_continue(exn): + """Call in an exception handler to ignore any exception, isssue a warning, and continue.""" + warnings.warn(repr(exn)) + time.sleep(0.5) + return True + + +def ignore_and_stop(exn): + """Call in an exception handler to ignore any exception and stop further processing.""" + return False + + +def warn_and_stop(exn): + """Call in an exception handler to ignore any exception and stop further processing.""" + warnings.warn(repr(exn)) + time.sleep(0.5) + return False diff --git a/paddlespeech/audio/streamdata/mix.py b/paddlespeech/audio/streamdata/mix.py new file mode 100644 index 0000000000000000000000000000000000000000..7d790f00f242584ef6bbca73f35e85c4803818ec --- /dev/null +++ b/paddlespeech/audio/streamdata/mix.py @@ -0,0 +1,85 @@ +# +# Copyright (c) 2017-2021 NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# This file is part of the WebDataset library. +# See the LICENSE file for licensing terms (BSD-style). +# Modified from https://github.com/webdataset/webdataset +# + +"""Classes for mixing samples from multiple sources.""" + +import itertools, os, random, time, sys +from functools import reduce, wraps + +import numpy as np + +from . import autodecode, utils +from .paddle_utils import PaddleTensor, IterableDataset +from .utils import PipelineStage + + +def round_robin_shortest(*sources): + i = 0 + while True: + try: + sample = next(sources[i % len(sources)]) + yield sample + except StopIteration: + break + i += 1 + + +def round_robin_longest(*sources): + i = 0 + while len(sources) > 0: + try: + sample = next(sources[i]) + i += 1 + yield sample + except StopIteration: + del sources[i] + + +class RoundRobin(IterableDataset): + def __init__(self, datasets, longest=False): + self.datasets = datasets + self.longest = longest + + def __iter__(self): + """Return an iterator over the sources.""" + sources = [iter(d) for d in self.datasets] + if self.longest: + return round_robin_longest(*sources) + else: + return round_robin_shortest(*sources) + + +def random_samples(sources, probs=None, longest=False): + if probs is None: + probs = [1] * len(sources) + else: + probs = list(probs) + while len(sources) > 0: + cum = (np.array(probs) / np.sum(probs)).cumsum() + r = random.random() + i = np.searchsorted(cum, r) + try: + yield next(sources[i]) + except StopIteration: + if longest: + del sources[i] + del probs[i] + else: + break + + +class RandomMix(IterableDataset): + def __init__(self, datasets, probs=None, longest=False): + self.datasets = datasets + self.probs = probs + self.longest = longest + + def __iter__(self): + """Return an iterator over the sources.""" + sources = [iter(d) for d in self.datasets] + return random_samples(sources, self.probs, longest=self.longest) diff --git a/paddlespeech/audio/streamdata/paddle_utils.py b/paddlespeech/audio/streamdata/paddle_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..02bc4c84155b033478f6d2de7a80f9c773da6abf --- /dev/null +++ b/paddlespeech/audio/streamdata/paddle_utils.py @@ -0,0 +1,33 @@ +# +# Copyright (c) 2017-2021 NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# This file is part of the WebDataset library. 
+# See the LICENSE file for licensing terms (BSD-style). +# Modified from https://github.com/webdataset/webdataset +# + +"""Mock implementations of paddle interfaces when paddle is not available.""" + + +try: + from paddle.io import DataLoader, IterableDataset +except ModuleNotFoundError: + + class IterableDataset: + """Empty implementation of IterableDataset when paddle is not available.""" + + pass + + class DataLoader: + """Empty implementation of DataLoader when paddle is not available.""" + + pass + +try: + from paddle import Tensor as PaddleTensor +except ModuleNotFoundError: + + class TorchTensor: + """Empty implementation of PaddleTensor when paddle is not available.""" + + pass diff --git a/paddlespeech/audio/streamdata/pipeline.py b/paddlespeech/audio/streamdata/pipeline.py new file mode 100644 index 0000000000000000000000000000000000000000..7339a762a0b935f7308521811155c06cb989cfd5 --- /dev/null +++ b/paddlespeech/audio/streamdata/pipeline.py @@ -0,0 +1,132 @@ +# Copyright (c) 2017-2019 NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# See the LICENSE file for licensing terms (BSD-style). +# Modified from https://github.com/webdataset/webdataset +#%% +import copy, os, random, sys, time +from dataclasses import dataclass +from itertools import islice +from typing import List + +import braceexpand, yaml + +from .handlers import reraise_exception +from .paddle_utils import DataLoader, IterableDataset +from .utils import PipelineStage + + +def add_length_method(obj): + def length(self): + return self.size + + Combined = type( + obj.__class__.__name__ + "_Length", + (obj.__class__, IterableDataset), + {"__len__": length}, + ) + obj.__class__ = Combined + return obj + + +class DataPipeline(IterableDataset, PipelineStage): + """A pipeline starting with an IterableDataset and a series of filters.""" + + def __init__(self, *args, **kwargs): + super().__init__() + self.pipeline = [] + self.length = -1 + self.repetitions = 1 + self.nsamples = -1 + for arg in args: + if arg is None: + continue + if isinstance(arg, list): + self.pipeline.extend(arg) + else: + self.pipeline.append(arg) + + def invoke(self, f, *args, **kwargs): + """Apply a pipeline stage, possibly to the output of a previous stage.""" + if isinstance(f, PipelineStage): + return f.run(*args, **kwargs) + if isinstance(f, (IterableDataset, DataLoader)) and len(args) == 0: + return iter(f) + if isinstance(f, list): + return iter(f) + if callable(f): + result = f(*args, **kwargs) + return result + raise ValueError(f"{f}: not a valid pipeline stage") + + def iterator1(self): + """Create an iterator through one epoch in the pipeline.""" + source = self.invoke(self.pipeline[0]) + for step in self.pipeline[1:]: + source = self.invoke(step, source) + return source + + def iterator(self): + """Create an iterator through the entire dataset, using the given number of repetitions.""" + for i in range(self.repetitions): + for sample in self.iterator1(): + yield sample + + def __iter__(self): + """Create an iterator through the pipeline, repeating and slicing as requested.""" + if self.repetitions != 1: + if self.nsamples > 0: + return islice(self.iterator(), self.nsamples) + else: + return self.iterator() + else: + return self.iterator() + + def stage(self, i): + """Return pipeline stage i.""" + return self.pipeline[i] + + def append(self, f): + """Append a pipeline stage (modifies the object).""" + self.pipeline.append(f) + return self + + def append_list(self, *args): + for arg in 
args: + self.pipeline.append(arg) + return self + + def compose(self, *args): + """Append a pipeline stage to a copy of the pipeline and returns the copy.""" + result = copy.copy(self) + for arg in args: + result.append(arg) + return result + + def with_length(self, n): + """Add a __len__ method returning the desired value. + + This does not change the actual number of samples in an epoch. + PyTorch IterableDataset should not have a __len__ method. + This is provided only as a workaround for some broken training environments + that require a __len__ method. + """ + self.size = n + return add_length_method(self) + + def with_epoch(self, nsamples=-1, nbatches=-1): + """Change the epoch to return the given number of samples/batches. + + The two arguments mean the same thing.""" + self.repetitions = sys.maxsize + self.nsamples = max(nsamples, nbatches) + return self + + def repeat(self, nepochs=-1, nbatches=-1): + """Repeat iterating through the dataset for the given #epochs up to the given #samples.""" + if nepochs > 0: + self.repetitions = nepochs + self.nsamples = nbatches + else: + self.repetitions = sys.maxsize + self.nsamples = nbatches + return self diff --git a/paddlespeech/audio/streamdata/shardlists.py b/paddlespeech/audio/streamdata/shardlists.py new file mode 100644 index 0000000000000000000000000000000000000000..cfaf9a64b27f6678402b425d191a98f6a600b6ac --- /dev/null +++ b/paddlespeech/audio/streamdata/shardlists.py @@ -0,0 +1,261 @@ +# +# Copyright (c) 2017-2021 NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# This file is part of the WebDataset library. +# See the LICENSE file for licensing terms (BSD-style). +# + +# Modified from https://github.com/webdataset/webdataset + +"""Train PyTorch models directly from POSIX tar archive. + +Code works locally or over HTTP connections. +""" + +import os, random, sys, time +from dataclasses import dataclass, field +from itertools import islice +from typing import List + +import braceexpand, yaml + +from . import utils +from .filters import pipelinefilter +from .paddle_utils import IterableDataset + + +from ..utils.log import Logger +logger = Logger(__name__) +def expand_urls(urls): + if isinstance(urls, str): + urllist = urls.split("::") + result = [] + for url in urllist: + result.extend(braceexpand.braceexpand(url)) + return result + else: + return list(urls) + + +class SimpleShardList(IterableDataset): + """An iterable dataset yielding a list of urls.""" + + def __init__(self, urls, seed=None): + """Iterate through the list of shards. 
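Editor's note: `DataPipeline` chains a source (anything iterable) with a series of stages, each of which wraps the iterator produced by the previous one; `with_epoch` then repeats the pipeline indefinitely and slices it to a fixed number of samples. A framework-free sketch of the composition idea; this is a simplified re-implementation for illustration, not the class above.

```python
from itertools import islice

class MiniPipeline:
    def __init__(self, *stages):
        self.stages = list(stages)

    def __iter__(self):
        source = iter(self.stages[0])          # first stage is the data source
        for stage in self.stages[1:]:
            source = stage(source)             # each later stage wraps the iterator
        return source

def drop_odd(source):
    return (x for x in source if x % 2 == 0)

def double(source):
    for x in source:
        yield 2 * x

pipe = MiniPipeline(range(10), drop_odd, double)
print(list(islice(pipe, 5)))                   # [0, 4, 8, 12, 16]
```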
+ + :param urls: a list of URLs as a Python list or brace notation string + """ + super().__init__() + urls = expand_urls(urls) + self.urls = urls + assert isinstance(self.urls[0], str) + self.seed = seed + + def __len__(self): + return len(self.urls) + + def __iter__(self): + """Return an iterator over the shards.""" + urls = self.urls.copy() + if self.seed is not None: + random.Random(self.seed).shuffle(urls) + for url in urls: + yield dict(url=url) + + +def split_by_node(src, group=None): + rank, world_size, worker, num_workers = utils.paddle_worker_info(group=group) + logger.info(f"world_size:{world_size}, rank:{rank}") + if world_size > 1: + for s in islice(src, rank, None, world_size): + yield s + else: + for s in src: + yield s + + +def single_node_only(src, group=None): + rank, world_size, worker, num_workers = utils.paddle_worker_info(group=group) + if world_size > 1: + raise ValueError("input pipeline needs to be reconfigured for multinode training") + for s in src: + yield s + + +def split_by_worker(src): + rank, world_size, worker, num_workers = utils.paddle_worker_info() + logger.info(f"num_workers:{num_workers}, worker:{worker}") + if num_workers > 1: + for s in islice(src, worker, None, num_workers): + yield s + else: + for s in src: + yield s + + +def resampled_(src, n=sys.maxsize): + import random + + seed = time.time() + try: + seed = open("/dev/random", "rb").read(20) + except Exception as exn: + print(repr(exn)[:50], file=sys.stderr) + rng = random.Random(seed) + print("# resampled loading", file=sys.stderr) + items = list(src) + print(f"# resampled got {len(items)} samples, yielding {n}", file=sys.stderr) + for i in range(n): + yield rng.choice(items) + + +resampled = pipelinefilter(resampled_) + + +def non_empty(src): + count = 0 + for s in src: + yield s + count += 1 + if count == 0: + raise ValueError("pipeline stage received no data at all and this was declared as an error") + + +@dataclass +class MSSource: + """Class representing a data source.""" + + name: str = "" + perepoch: int = -1 + resample: bool = False + urls: List[str] = field(default_factory=list) + + +default_rng = random.Random() + + +def expand(s): + return os.path.expanduser(os.path.expandvars(s)) + + +class MultiShardSample(IterableDataset): + def __init__(self, fname): + """Construct a shardlist from multiple sources using a YAML spec.""" + self.epoch = -1 +class MultiShardSample(IterableDataset): + def __init__(self, fname): + """Construct a shardlist from multiple sources using a YAML spec.""" + self.epoch = -1 + self.parse_spec(fname) + + def parse_spec(self, fname): + self.rng = default_rng # capture default_rng if we fork + if isinstance(fname, dict): + spec = fname + fname = "{dict}" + else: + with open(fname) as stream: + spec = yaml.safe_load(stream) + assert set(spec.keys()).issubset(set("prefix datasets buckets".split())), list(spec.keys()) + prefix = expand(spec.get("prefix", "")) + self.sources = [] + for ds in spec["datasets"]: + assert set(ds.keys()).issubset(set("buckets name shards resample choose".split())), list( + ds.keys() + ) + buckets = ds.get("buckets", spec.get("buckets", [])) + if isinstance(buckets, str): + buckets = [buckets] + buckets = [expand(s) for s in buckets] + if buckets == []: + buckets = [""] + assert len(buckets) == 1, f"{buckets}: FIXME support for multiple buckets unimplemented" + bucket = buckets[0] + name = ds.get("name", "@" + bucket) + urls = ds["shards"] + if isinstance(urls, str): + urls = [urls] + # urls = [u for url in urls for u in 
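Editor's note: `split_by_node` and `split_by_worker` above shard the URL stream with `islice(src, rank, None, world_size)`, so each rank (and each dataloader worker within a rank) sees a disjoint, interleaved subset of shards. A quick illustration with plain lists; the world size and shard names are made up for the example.

```python
from itertools import islice

def split(src, rank, world_size):
    """Each process takes items rank, rank+world_size, rank+2*world_size, ..."""
    return list(islice(src, rank, None, world_size))

shards = [f"shard-{i:03d}.tar" for i in range(8)]
for rank in range(3):                          # pretend world_size == 3
    print(rank, split(shards, rank, 3))
# 0 ['shard-000.tar', 'shard-003.tar', 'shard-006.tar']
# 1 ['shard-001.tar', 'shard-004.tar', 'shard-007.tar']
# 2 ['shard-002.tar', 'shard-005.tar']
```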
braceexpand.braceexpand(url)] + urls = [ + prefix + os.path.join(bucket, u) for url in urls for u in braceexpand.braceexpand(expand(url)) + ] + resample = ds.get("resample", -1) + nsample = ds.get("choose", -1) + if nsample > len(urls): + raise ValueError(f"perepoch {nsample} must be no greater than the number of shards") + if (nsample > 0) and (resample > 0): + raise ValueError("specify only one of perepoch or choose") + entry = MSSource(name=name, urls=urls, perepoch=nsample, resample=resample) + self.sources.append(entry) + print(f"# {name} {len(urls)} {nsample}", file=sys.stderr) + + def set_epoch(self, seed): + """Set the current epoch (for consistent shard selection among nodes).""" + self.rng = random.Random(seed) + + def get_shards_for_epoch(self): + result = [] + for source in self.sources: + if source.resample > 0: + # sample with replacement + l = self.rng.choices(source.urls, k=source.resample) + elif source.perepoch > 0: + # sample without replacement + l = list(source.urls) + self.rng.shuffle(l) + l = l[: source.perepoch] + else: + l = list(source.urls) + result += l + self.rng.shuffle(result) + return result + + def __iter__(self): + shards = self.get_shards_for_epoch() + for shard in shards: + yield dict(url=shard) + + +def shardspec(spec): + if spec.endswith(".yaml"): + return MultiShardSample(spec) + else: + return SimpleShardList(spec) + + +class ResampledShards(IterableDataset): + """An iterable dataset yielding a list of urls.""" + + def __init__( + self, + urls, + nshards=sys.maxsize, + worker_seed=None, + deterministic=False, + ): + """Sample shards from the shard list with replacement. + + :param urls: a list of URLs as a Python list or brace notation string + """ + super().__init__() + urls = expand_urls(urls) + self.urls = urls + assert isinstance(self.urls[0], str) + self.nshards = nshards + self.worker_seed = utils.paddle_worker_seed if worker_seed is None else worker_seed + self.deterministic = deterministic + self.epoch = -1 + + def __iter__(self): + """Return an iterator over the shards.""" + self.epoch += 1 + if self.deterministic: + seed = utils.make_seed(self.worker_seed(), self.epoch) + else: + seed = utils.make_seed(self.worker_seed(), self.epoch, os.getpid(), time.time_ns(), os.urandom(4)) + if os.environ.get("WDS_SHOW_SEED", "0") == "1": + print(f"# ResampledShards seed {seed}") + self.rng = random.Random(seed) + for _ in range(self.nshards): + index = self.rng.randint(0, len(self.urls) - 1) + yield dict(url=self.urls[index]) diff --git a/paddlespeech/audio/streamdata/tariterators.py b/paddlespeech/audio/streamdata/tariterators.py new file mode 100644 index 0000000000000000000000000000000000000000..b1616918ca52b2833ce636c33dcc278569fefff7 --- /dev/null +++ b/paddlespeech/audio/streamdata/tariterators.py @@ -0,0 +1,283 @@ +# +# Copyright (c) 2017-2021 NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# This file is part of the WebDataset library. +# See the LICENSE file for licensing terms (BSD-style). + +# Modified from https://github.com/webdataset/webdataset +# Modified from wenet(https://github.com/wenet-e2e/wenet) + +"""Low level iteration functions for tar archives.""" + +import random, re, tarfile + +import braceexpand + +from . import filters +from . 
import gopen +from .handlers import reraise_exception + +trace = False +meta_prefix = "__" +meta_suffix = "__" + +import paddlespeech +import paddle +import numpy as np + +AUDIO_FORMAT_SETS = set(['flac', 'mp3', 'm4a', 'ogg', 'opus', 'wav', 'wma']) + +def base_plus_ext(path): + """Split off all file extensions. + + Returns base, allext. + + :param path: path with extensions + :param returns: path with all extensions removed + + """ + match = re.match(r"^((?:.*/|)[^.]+)[.]([^/]*)$", path) + if not match: + return None, None + return match.group(1), match.group(2) + + +def valid_sample(sample): + """Check whether a sample is valid. + + :param sample: sample to be checked + """ + return ( + sample is not None + and isinstance(sample, dict) + and len(list(sample.keys())) > 0 + and not sample.get("__bad__", False) + ) + + +# FIXME: UNUSED +def shardlist(urls, *, shuffle=False): + """Given a list of URLs, yields that list, possibly shuffled.""" + if isinstance(urls, str): + urls = braceexpand.braceexpand(urls) + else: + urls = list(urls) + if shuffle: + random.shuffle(urls) + for url in urls: + yield dict(url=url) + + +def url_opener(data, handler=reraise_exception, **kw): + """Given a stream of url names (packaged in `dict(url=url)`), yield opened streams.""" + for sample in data: + assert isinstance(sample, dict), sample + assert "url" in sample + url = sample["url"] + try: + stream = gopen.gopen(url, **kw) + sample.update(stream=stream) + yield sample + except Exception as exn: + exn.args = exn.args + (url,) + if handler(exn): + continue + else: + break + + +def tar_file_iterator( + fileobj, skip_meta=r"__[^/]*__($|/)", handler=reraise_exception +): + """Iterate over tar file, yielding filename, content pairs for the given tar stream. + + :param fileobj: byte stream suitable for tarfile + :param skip_meta: regexp for keys that are skipped entirely (Default value = r"__[^/]*__($|/)") + + """ + stream = tarfile.open(fileobj=fileobj, mode="r:*") + for tarinfo in stream: + fname = tarinfo.name + try: + if not tarinfo.isreg(): + continue + if fname is None: + continue + if ( + "/" not in fname + and fname.startswith(meta_prefix) + and fname.endswith(meta_suffix) + ): + # skipping metadata for now + continue + if skip_meta is not None and re.match(skip_meta, fname): + continue + + name = tarinfo.name + pos = name.rfind('.') + assert pos > 0 + prefix, postfix = name[:pos], name[pos + 1:] + if postfix == 'wav': + waveform, sample_rate = paddlespeech.audio.load(stream.extractfile(tarinfo), normal=False) + result = dict(fname=prefix, wav=waveform, sample_rate = sample_rate) + else: + txt = stream.extractfile(tarinfo).read().decode('utf8').strip() + result = dict(fname=prefix, txt=txt) + #result = dict(fname=fname, data=data) + yield result + stream.members = [] + except Exception as exn: + if hasattr(exn, "args") and len(exn.args) > 0: + exn.args = (exn.args[0] + " @ " + str(fileobj),) + exn.args[1:] + if handler(exn): + continue + else: + break + del stream + +def tar_file_and_group_iterator( + fileobj, skip_meta=r"__[^/]*__($|/)", handler=reraise_exception +): + """ Expand a stream of open tar files into a stream of tar file contents. 
+ And groups the file with same prefix + + Args: + data: Iterable[{src, stream}] + + Returns: + Iterable[{key, wav, txt, sample_rate}] + """ + stream = tarfile.open(fileobj=fileobj, mode="r:*") + prev_prefix = None + example = {} + valid = True + for tarinfo in stream: + name = tarinfo.name + pos = name.rfind('.') + assert pos > 0 + prefix, postfix = name[:pos], name[pos + 1:] + if prev_prefix is not None and prefix != prev_prefix: + example['fname'] = prev_prefix + if valid: + yield example + example = {} + valid = True + with stream.extractfile(tarinfo) as file_obj: + try: + if postfix == 'txt': + example['txt'] = file_obj.read().decode('utf8').strip() + elif postfix in AUDIO_FORMAT_SETS: + waveform, sample_rate = paddlespeech.audio.load(file_obj, normal=False) + waveform = paddle.to_tensor(np.expand_dims(np.array(waveform),0), dtype=paddle.float32) + + example['wav'] = waveform + example['sample_rate'] = sample_rate + else: + example[postfix] = file_obj.read() + except Exception as exn: + if hasattr(exn, "args") and len(exn.args) > 0: + exn.args = (exn.args[0] + " @ " + str(fileobj),) + exn.args[1:] + if handler(exn): + continue + else: + break + valid = False + # logging.warning('error to parse {}'.format(name)) + prev_prefix = prefix + if prev_prefix is not None: + example['fname'] = prev_prefix + yield example + stream.close() + +def tar_file_expander(data, handler=reraise_exception): + """Expand a stream of open tar files into a stream of tar file contents. + + This returns an iterator over (filename, file_contents). + """ + for source in data: + url = source["url"] + try: + assert isinstance(source, dict) + assert "stream" in source + for sample in tar_file_iterator(source["stream"]): + assert ( + isinstance(sample, dict) and "data" in sample and "fname" in sample + ) + sample["__url__"] = url + yield sample + except Exception as exn: + exn.args = exn.args + (source.get("stream"), source.get("url")) + if handler(exn): + continue + else: + break + + + + +def tar_file_and_group_expander(data, handler=reraise_exception): + """Expand a stream of open tar files into a stream of tar file contents. + + This returns an iterator over (filename, file_contents). + """ + for source in data: + url = source["url"] + try: + assert isinstance(source, dict) + assert "stream" in source + for sample in tar_file_and_group_iterator(source["stream"]): + assert ( + isinstance(sample, dict) and "wav" in sample and "txt" in sample and "fname" in sample + ) + sample["__url__"] = url + yield sample + except Exception as exn: + exn.args = exn.args + (source.get("stream"), source.get("url")) + if handler(exn): + continue + else: + break + + +def group_by_keys(data, keys=base_plus_ext, lcase=True, suffixes=None, handler=None): + """Return function over iterator that groups key, value pairs into samples. 
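Editor's note: `tar_file_and_group_iterator` relies on the WebDataset naming convention that every file belonging to one utterance shares the same basename prefix, and merges consecutive tar members with the same prefix into one sample dict. A minimal sketch that builds a tiny tar in memory and groups its members by prefix; it keeps payloads as raw bytes and skips the audio decoding done above.

```python
import io
import tarfile

def make_tar(entries):
    """entries: member-name -> bytes. Returns a file object holding a tar archive."""
    buf = io.BytesIO()
    with tarfile.open(fileobj=buf, mode="w") as tar:
        for name, payload in entries.items():
            info = tarfile.TarInfo(name)
            info.size = len(payload)
            tar.addfile(info, io.BytesIO(payload))
    buf.seek(0)
    return buf

def group_by_prefix(fileobj):
    """Yield one dict per utterance, merging members that share a basename prefix."""
    stream = tarfile.open(fileobj=fileobj, mode="r:*")
    example, prev = {}, None
    for tarinfo in stream:
        prefix, _, suffix = tarinfo.name.rpartition(".")
        if prev is not None and prefix != prev:
            yield dict(fname=prev, **example)
            example = {}
        example[suffix] = stream.extractfile(tarinfo).read()
        prev = prefix
    if prev is not None:
        yield dict(fname=prev, **example)

tar = make_tar({"utt1.txt": b"hello", "utt1.meta": b"{}", "utt2.txt": b"world"})
for sample in group_by_prefix(tar):
    print(sample["fname"], sorted(k for k in sample if k != "fname"))
# utt1 ['meta', 'txt']
# utt2 ['txt']
```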
+ + :param keys: function that splits the key into key and extension (base_plus_ext) + :param lcase: convert suffixes to lower case (Default value = True) + """ + current_sample = None + for filesample in data: + assert isinstance(filesample, dict) + fname, value = filesample["fname"], filesample["data"] + prefix, suffix = keys(fname) + if trace: + print( + prefix, + suffix, + current_sample.keys() if isinstance(current_sample, dict) else None, + ) + if prefix is None: + continue + if lcase: + suffix = suffix.lower() + if current_sample is None or prefix != current_sample["__key__"]: + if valid_sample(current_sample): + yield current_sample + current_sample = dict(__key__=prefix, __url__=filesample["__url__"]) + if suffix in current_sample: + raise ValueError( + f"{fname}: duplicate file name in tar file {suffix} {current_sample.keys()}" + ) + if suffixes is None or suffix in suffixes: + current_sample[suffix] = value + if valid_sample(current_sample): + yield current_sample + + +def tarfile_samples(src, handler=reraise_exception): + streams = url_opener(src, handler=handler) + samples = tar_file_and_group_expander(streams, handler=handler) + return samples + + +tarfile_to_samples = filters.pipelinefilter(tarfile_samples) diff --git a/paddlespeech/audio/streamdata/utils.py b/paddlespeech/audio/streamdata/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..c7294f2bfb4fd29dfca8cb32c33ec1ec4abe5913 --- /dev/null +++ b/paddlespeech/audio/streamdata/utils.py @@ -0,0 +1,132 @@ +# +# Copyright (c) 2017-2021 NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# This file is part of the WebDataset library. +# See the LICENSE file for licensing terms (BSD-style). +# + +# Modified from https://github.com/webdataset/webdataset + +"""Miscellaneous utility functions.""" + +import importlib +import itertools as itt +import os +import re +import sys +from typing import Any, Callable, Iterator, Optional, Union + +from ..utils.log import Logger + +logger = Logger(__name__) + +def make_seed(*args): + seed = 0 + for arg in args: + seed = (seed * 31 + hash(arg)) & 0x7FFFFFFF + return seed + + +class PipelineStage: + def invoke(self, *args, **kw): + raise NotImplementedError + + +def identity(x: Any) -> Any: + """Return the argument as is.""" + return x + + +def safe_eval(s: str, expr: str = "{}"): + """Evaluate the given expression more safely.""" + if re.sub("[^A-Za-z0-9_]", "", s) != s: + raise ValueError(f"safe_eval: illegal characters in: '{s}'") + return eval(expr.format(s)) + + +def lookup_sym(sym: str, modules: list): + """Look up a symbol in a list of modules.""" + for mname in modules: + module = importlib.import_module(mname, package="webdataset") + result = getattr(module, sym, None) + if result is not None: + return result + return None + + +def repeatedly0( + loader: Iterator, nepochs: int = sys.maxsize, nbatches: int = sys.maxsize +): + """Repeatedly returns batches from a DataLoader.""" + for epoch in range(nepochs): + for sample in itt.islice(loader, nbatches): + yield sample + + +def guess_batchsize(batch: Union[tuple, list]): + """Guess the batch size by looking at the length of the first element in a tuple.""" + return len(batch[0]) + + +def repeatedly( + source: Iterator, + nepochs: int = None, + nbatches: int = None, + nsamples: int = None, + batchsize: Callable[..., int] = guess_batchsize, +): + """Repeatedly yield samples from an iterator.""" + epoch = 0 + batch = 0 + total = 0 + while True: + for sample in 
source: + yield sample + batch += 1 + if nbatches is not None and batch >= nbatches: + return + if nsamples is not None: + total += guess_batchsize(sample) + if total >= nsamples: + return + epoch += 1 + if nepochs is not None and epoch >= nepochs: + return + +def paddle_worker_info(group=None): + """Return node and worker info for PyTorch and some distributed environments.""" + rank = 0 + world_size = 1 + worker = 0 + num_workers = 1 + if "RANK" in os.environ and "WORLD_SIZE" in os.environ: + rank = int(os.environ["RANK"]) + world_size = int(os.environ["WORLD_SIZE"]) + else: + try: + import paddle.distributed + group = group or paddle.distributed.get_group() + rank = paddle.distributed.get_rank() + world_size = paddle.distributed.get_world_size() + except ModuleNotFoundError: + pass + if "WORKER" in os.environ and "NUM_WORKERS" in os.environ: + worker = int(os.environ["WORKER"]) + num_workers = int(os.environ["NUM_WORKERS"]) + else: + try: + from paddle.io import get_worker_info + worker_info = paddle.io.get_worker_info() + if worker_info is not None: + worker = worker_info.id + num_workers = worker_info.num_workers + except ModuleNotFoundError as E: + logger.info(f"not found {E}") + exit(-1) + + return rank, world_size, worker, num_workers + +def paddle_worker_seed(group=None): + """Compute a distinct, deterministic RNG seed for each worker and node.""" + rank, world_size, worker, num_workers = paddle_worker_info(group=group) + return rank * 1000 + worker diff --git a/paddlespeech/audio/streamdata/writer.py b/paddlespeech/audio/streamdata/writer.py new file mode 100644 index 0000000000000000000000000000000000000000..7d4f7703bb98607fc511976583728d9b012b26a6 --- /dev/null +++ b/paddlespeech/audio/streamdata/writer.py @@ -0,0 +1,450 @@ +# +# Copyright (c) 2017-2021 NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# This file is part of the WebDataset library. +# See the LICENSE file for licensing terms (BSD-style). +# Modified from https://github.com/webdataset/webdataset +# + +"""Classes and functions for writing tar files and WebDataset files.""" + +import io, json, pickle, re, tarfile, time +from typing import Any, Callable, Optional, Union + +import numpy as np + +from . import gopen + + +def imageencoder(image: Any, format: str = "PNG"): # skipcq: PYL-W0622 + """Compress an image using PIL and return it as a string. + + Can handle float or uint8 images. + + :param image: ndarray representing an image + :param format: compression format (PNG, JPEG, PPM) + + """ + import PIL + + assert isinstance(image, (PIL.Image.Image, np.ndarray)), type(image) + + if isinstance(image, np.ndarray): + if image.dtype in [np.dtype("f"), np.dtype("d")]: + if not (np.amin(image) > -0.001 and np.amax(image) < 1.001): + raise ValueError( + f"image values out of range {np.amin(image)} {np.amax(image)}" + ) + image = np.clip(image, 0.0, 1.0) + image = np.array(image * 255.0, "uint8") + assert image.ndim in [2, 3] + if image.ndim == 3: + assert image.shape[2] in [1, 3] + image = PIL.Image.fromarray(image) + if format.upper() == "JPG": + format = "JPEG" + elif format.upper() in ["IMG", "IMAGE"]: + format = "PPM" + if format == "JPEG": + opts = dict(quality=100) + else: + opts = {} + with io.BytesIO() as result: + image.save(result, format=format, **opts) + return result.getvalue() + + +def bytestr(data: Any): + """Convert data into a bytestring. + + Uses str and ASCII encoding for data that isn't already in string format. 
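Editor's note: `paddle_worker_info` resolves (rank, world_size, worker, num_workers) first from the `RANK`/`WORLD_SIZE` and `WORKER`/`NUM_WORKERS` environment variables and only then falls back to the framework, and `paddle_worker_seed` folds them into a per-worker seed of `rank * 1000 + worker`. A stand-alone sketch of the environment-variable path only.

```python
import os

def worker_info_from_env():
    """Resolve distributed info from env vars, defaulting to a single process."""
    rank = int(os.environ.get("RANK", 0))
    world_size = int(os.environ.get("WORLD_SIZE", 1))
    worker = int(os.environ.get("WORKER", 0))
    num_workers = int(os.environ.get("NUM_WORKERS", 1))
    return rank, world_size, worker, num_workers

def worker_seed():
    rank, _, worker, _ = worker_info_from_env()
    return rank * 1000 + worker                # distinct seed per (rank, worker) pair

os.environ.update(RANK="2", WORLD_SIZE="4", WORKER="1", NUM_WORKERS="8")
print(worker_info_from_env())                  # (2, 4, 1, 8)
print(worker_seed())                           # 2001
```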
+ + :param data: data + """ + if isinstance(data, bytes): + return data + if isinstance(data, str): + return data.encode("ascii") + return str(data).encode("ascii") + +def paddle_dumps(data: Any): + """Dump data into a bytestring using paddle.dumps. + + This delays importing paddle until needed. + + :param data: data to be dumped + """ + import io + + import paddle + + stream = io.BytesIO() + paddle.save(data, stream) + return stream.getvalue() + +def numpy_dumps(data: np.ndarray): + """Dump data into a bytestring using numpy npy format. + + :param data: data to be dumped + """ + import io + + import numpy.lib.format + + stream = io.BytesIO() + numpy.lib.format.write_array(stream, data) + return stream.getvalue() + + +def numpy_npz_dumps(data: np.ndarray): + """Dump data into a bytestring using numpy npz format. + + :param data: data to be dumped + """ + import io + + stream = io.BytesIO() + np.savez_compressed(stream, **data) + return stream.getvalue() + + +def tenbin_dumps(x): + from . import tenbin + + if isinstance(x, list): + return memoryview(tenbin.encode_buffer(x)) + else: + return memoryview(tenbin.encode_buffer([x])) + + +def cbor_dumps(x): + import cbor + + return cbor.dumps(x) + + +def mp_dumps(x): + import msgpack + + return msgpack.packb(x) + + +def add_handlers(d, keys, value): + if isinstance(keys, str): + keys = keys.split() + for k in keys: + d[k] = value + + +def make_handlers(): + """Create a list of handlers for encoding data.""" + handlers = {} + add_handlers( + handlers, "cls cls2 class count index inx id", lambda x: str(x).encode("ascii") + ) + add_handlers(handlers, "txt text transcript", lambda x: x.encode("utf-8")) + add_handlers(handlers, "html htm", lambda x: x.encode("utf-8")) + add_handlers(handlers, "pyd pickle", pickle.dumps) + add_handlers(handlers, "pdparams", paddle_dumps) + add_handlers(handlers, "npy", numpy_dumps) + add_handlers(handlers, "npz", numpy_npz_dumps) + add_handlers(handlers, "ten tenbin tb", tenbin_dumps) + add_handlers(handlers, "json jsn", lambda x: json.dumps(x).encode("utf-8")) + add_handlers(handlers, "mp msgpack msg", mp_dumps) + add_handlers(handlers, "cbor", cbor_dumps) + add_handlers(handlers, "jpg jpeg img image", lambda data: imageencoder(data, "jpg")) + add_handlers(handlers, "png", lambda data: imageencoder(data, "png")) + add_handlers(handlers, "pbm", lambda data: imageencoder(data, "pbm")) + add_handlers(handlers, "pgm", lambda data: imageencoder(data, "pgm")) + add_handlers(handlers, "ppm", lambda data: imageencoder(data, "ppm")) + return handlers + + +default_handlers = make_handlers() + + +def encode_based_on_extension1(data: Any, tname: str, handlers: dict): + """Encode data based on its extension and a dict of handlers. + + :param data: data + :param tname: file extension + :param handlers: handlers + """ + if tname[0] == "_": + if not isinstance(data, str): + raise ValueError("the values of metadata must be of string type") + return data + extension = re.sub(r".*\.", "", tname).lower() + if isinstance(data, bytes): + return data + if isinstance(data, str): + return data.encode("utf-8") + handler = handlers.get(extension) + if handler is None: + raise ValueError(f"no handler found for {extension}") + return handler(data) + + +def encode_based_on_extension(sample: dict, handlers: dict): + """Encode an entire sample with a collection of handlers. 
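Editor's note: `make_handlers` builds a table mapping field extensions ("txt", "npy", "json", "jpg", ...) to byte encoders, and `encode_based_on_extension` applies the matching encoder to each field of a sample before it is written to a tar; fields that are already bytes pass through, and metadata fields (leading underscore) must stay plain strings. A trimmed-down, self-contained version of that idea covering only a few extensions.

```python
import json
import pickle

HANDLERS = {
    "txt": lambda x: x.encode("utf-8"),
    "cls": lambda x: str(x).encode("ascii"),
    "json": lambda x: json.dumps(x).encode("utf-8"),
    "pyd": pickle.dumps,
}

def encode_field(value, name):
    if name.startswith("_"):                   # metadata stays a plain string
        return value
    if isinstance(value, bytes):
        return value                           # already encoded
    ext = name.rsplit(".", 1)[-1].lower()
    return HANDLERS[ext](value)

def encode_sample(sample):
    return {k: encode_field(v, k) for k, v in sample.items()}

sample = {"__key__": "utt1", "txt": "你好", "cls": 3, "meta.json": {"sr": 16000}}
print(encode_sample(sample))                   # every non-metadata field becomes bytes
```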
+ + :param sample: data sample (a dict) + :param handlers: handlers for encoding + """ + return { + k: encode_based_on_extension1(v, k, handlers) for k, v in list(sample.items()) + } + + +def make_encoder(spec: Union[bool, str, dict, Callable]): + """Make an encoder function from a specification. + + :param spec: specification + """ + if spec is False or spec is None: + + def encoder(x): + """Do not encode at all.""" + return x + + elif callable(spec): + encoder = spec + elif isinstance(spec, dict): + + def f(sample): + """Encode based on extension.""" + return encode_based_on_extension(sample, spec) + + encoder = f + + elif spec is True: + handlers = default_handlers + + def g(sample): + """Encode based on extension.""" + return encode_based_on_extension(sample, handlers) + + encoder = g + + else: + raise ValueError(f"{spec}: unknown decoder spec") + if not callable(encoder): + raise ValueError(f"{spec} did not yield a callable encoder") + return encoder + + +class TarWriter: + """A class for writing dictionaries to tar files. + + :param fileobj: fileobj: file name for tar file (.tgz/.tar) or open file descriptor + :param encoder: sample encoding (Default value = True) + :param compress: (Default value = None) + + `True` will use an encoder that behaves similar to the automatic + decoder for `Dataset`. `False` disables encoding and expects byte strings + (except for metadata, which must be strings). The `encoder` argument can + also be a `callable`, or a dictionary mapping extensions to encoders. + + The following code will add two file to the tar archive: `a/b.png` and + `a/b.output.png`. + + ```Python + tarwriter = TarWriter(stream) + image = imread("b.jpg") + image2 = imread("b.out.jpg") + sample = {"__key__": "a/b", "png": image, "output.png": image2} + tarwriter.write(sample) + ``` + """ + + def __init__( + self, + fileobj, + user: str = "bigdata", + group: str = "bigdata", + mode: int = 0o0444, + compress: Optional[bool] = None, + encoder: Union[None, bool, Callable] = True, + keep_meta: bool = False, + ): + """Create a tar writer. + + :param fileobj: stream to write data to + :param user: user for tar files + :param group: group for tar files + :param mode: mode for tar files + :param compress: desired compression + :param encoder: encoder function + :param keep_meta: keep metadata (entries starting with "_") + """ + if isinstance(fileobj, str): + if compress is False: + tarmode = "w|" + elif compress is True: + tarmode = "w|gz" + else: + tarmode = "w|gz" if fileobj.endswith("gz") else "w|" + fileobj = gopen.gopen(fileobj, "wb") + self.own_fileobj = fileobj + else: + tarmode = "w|gz" if compress is True else "w|" + self.own_fileobj = None + self.encoder = make_encoder(encoder) + self.keep_meta = keep_meta + self.stream = fileobj + self.tarstream = tarfile.open(fileobj=fileobj, mode=tarmode) + + self.user = user + self.group = group + self.mode = mode + self.compress = compress + + def __enter__(self): + """Enter context.""" + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + """Exit context.""" + self.close() + + def close(self): + """Close the tar file.""" + self.tarstream.close() + if self.own_fileobj is not None: + self.own_fileobj.close() + self.own_fileobj = None + + def write(self, obj): + """Write a dictionary to the tar file. 
+ + :param obj: dictionary of objects to be stored + :returns: size of the entry + + """ + total = 0 + obj = self.encoder(obj) + if "__key__" not in obj: + raise ValueError("object must contain a __key__") + for k, v in list(obj.items()): + if k[0] == "_": + continue + if not isinstance(v, (bytes, bytearray, memoryview)): + raise ValueError( + f"{k} doesn't map to a bytes after encoding ({type(v)})" + ) + key = obj["__key__"] + for k in sorted(obj.keys()): + if k == "__key__": + continue + if not self.keep_meta and k[0] == "_": + continue + v = obj[k] + if isinstance(v, str): + v = v.encode("utf-8") + now = time.time() + ti = tarfile.TarInfo(key + "." + k) + ti.size = len(v) + ti.mtime = now + ti.mode = self.mode + ti.uname = self.user + ti.gname = self.group + if not isinstance(v, (bytes, bytearray, memoryview)): + raise ValueError(f"converter didn't yield bytes: {k}, {type(v)}") + stream = io.BytesIO(v) + self.tarstream.addfile(ti, stream) + total += ti.size + return total + + +class ShardWriter: + """Like TarWriter but splits into multiple shards.""" + + def __init__( + self, + pattern: str, + maxcount: int = 100000, + maxsize: float = 3e9, + post: Optional[Callable] = None, + start_shard: int = 0, + **kw, + ): + """Create a ShardWriter. + + :param pattern: output file pattern + :param maxcount: maximum number of records per shard (Default value = 100000) + :param maxsize: maximum size of each shard (Default value = 3e9) + :param kw: other options passed to TarWriter + """ + self.verbose = 1 + self.kw = kw + self.maxcount = maxcount + self.maxsize = maxsize + self.post = post + + self.tarstream = None + self.shard = start_shard + self.pattern = pattern + self.total = 0 + self.count = 0 + self.size = 0 + self.fname = None + self.next_stream() + + def next_stream(self): + """Close the current stream and move to the next.""" + self.finish() + self.fname = self.pattern % self.shard + if self.verbose: + print( + "# writing", + self.fname, + self.count, + "%.1f GB" % (self.size / 1e9), + self.total, + ) + self.shard += 1 + stream = open(self.fname, "wb") + self.tarstream = TarWriter(stream, **self.kw) + self.count = 0 + self.size = 0 + + def write(self, obj): + """Write a sample. + + :param obj: sample to be written + """ + if ( + self.tarstream is None + or self.count >= self.maxcount + or self.size >= self.maxsize + ): + self.next_stream() + size = self.tarstream.write(obj) + self.count += 1 + self.total += 1 + self.size += size + + def finish(self): + """Finish all writing (use close instead).""" + if self.tarstream is not None: + self.tarstream.close() + assert self.fname is not None + if callable(self.post): + self.post(self.fname) + self.tarstream = None + + def close(self): + """Close the stream.""" + self.finish() + del self.tarstream + del self.shard + del self.count + del self.size + + def __enter__(self): + """Enter context.""" + return self + + def __exit__(self, *args, **kw): + """Exit context.""" + self.close() diff --git a/paddlespeech/audio/text/text_featurizer.py b/paddlespeech/audio/text/text_featurizer.py new file mode 100644 index 0000000000000000000000000000000000000000..91c4d75c35fe49a6d391ec21a1e5fd3df5b3f310 --- /dev/null +++ b/paddlespeech/audio/text/text_featurizer.py @@ -0,0 +1,235 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
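Editor's note: `ShardWriter` rolls over to a new tar file whenever `maxcount` samples or `maxsize` bytes have been written, which is how a large corpus gets packed into numbered shards. A usage sketch, assuming the module is importable as `paddlespeech.audio.streamdata.writer` (the path added in this diff); the payloads are bytes/str so the default extension encoder passes them through, and the waveform bytes are placeholders, not real audio.

```python
# A sketch, assuming the package layout added in this diff is installed.
from paddlespeech.audio.streamdata.writer import ShardWriter

samples = [
    {"__key__": f"utt{i:04d}",
     "wav": b"\x00\x00" * 16000,               # placeholder PCM bytes
     "txt": f"transcript {i}"}
    for i in range(25)
]

# Writes shard-0000.tar, shard-0001.tar, ... with at most 10 samples each.
with ShardWriter("shard-%04d.tar", maxcount=10) as writer:
    for sample in samples:
        writer.write(sample)
```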
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Contains the text featurizer class.""" +from pprint import pformat +from typing import Union + +import sentencepiece as spm + +from .utility import BLANK +from .utility import EOS +from .utility import load_dict +from .utility import MASKCTC +from .utility import SOS +from .utility import SPACE +from .utility import UNK +from ..utils.log import Logger + +logger = Logger(__name__) + +__all__ = ["TextFeaturizer"] + + +class TextFeaturizer(): + def __init__(self, unit_type, vocab, spm_model_prefix=None, maskctc=False): + """Text featurizer, for processing or extracting features from text. + + Currently, it supports char/word/sentence-piece level tokenizing and conversion into + a list of token indices. Note that the token indexing order follows the + given vocabulary file. + + Args: + unit_type (str): unit type, e.g. char, word, spm + vocab Option[str, list]: Filepath to load vocabulary for token indices conversion, or vocab list. + spm_model_prefix (str, optional): spm model prefix. Defaults to None. + """ + assert unit_type in ('char', 'spm', 'word') + self.unit_type = unit_type + self.unk = UNK + self.maskctc = maskctc + + if vocab: + self.vocab_dict, self._id2token, self.vocab_list, self.unk_id, self.eos_id, self.blank_id = self._load_vocabulary_from_file( + vocab, maskctc) + self.vocab_size = len(self.vocab_list) + else: + logger.warning("TextFeaturizer: not have vocab file or vocab list.") + + if unit_type == 'spm': + spm_model = spm_model_prefix + '.model' + self.sp = spm.SentencePieceProcessor() + self.sp.Load(spm_model) + + def tokenize(self, text, replace_space=True): + if self.unit_type == 'char': + tokens = self.char_tokenize(text, replace_space) + elif self.unit_type == 'word': + tokens = self.word_tokenize(text) + else: # spm + tokens = self.spm_tokenize(text) + return tokens + + def detokenize(self, tokens): + if self.unit_type == 'char': + text = self.char_detokenize(tokens) + elif self.unit_type == 'word': + text = self.word_detokenize(tokens) + else: # spm + text = self.spm_detokenize(tokens) + return text + + def featurize(self, text): + """Convert text string to a list of token indices. + + Args: + text (str): Text to process. + + Returns: + List[int]: List of token indices. + """ + tokens = self.tokenize(text) + ids = [] + for token in tokens: + if token not in self.vocab_dict: + logger.debug(f"Text Token: {token} -> {self.unk}") + token = self.unk + ids.append(self.vocab_dict[token]) + return ids + + def defeaturize(self, idxs): + """Convert a list of token indices to text string, + ignore index after eos_id. + + Args: + idxs (List[int]): List of token indices. + + Returns: + str: Text. + """ + tokens = [] + for idx in idxs: + if idx == self.eos_id: + break + tokens.append(self._id2token[idx]) + text = self.detokenize(tokens) + return text + + def char_tokenize(self, text, replace_space=True): + """Character tokenizer. + + Args: + text (str): text string. + replace_space (bool): False only used by build_vocab.py. + + Returns: + List[str]: tokens. 
+ """ + text = text.strip() + if replace_space: + text_list = [SPACE if item == " " else item for item in list(text)] + else: + text_list = list(text) + return text_list + + def char_detokenize(self, tokens): + """Character detokenizer. + + Args: + tokens (List[str]): tokens. + + Returns: + str: text string. + """ + tokens = [t.replace(SPACE, " ") for t in tokens] + return "".join(tokens) + + def word_tokenize(self, text): + """Word tokenizer, separate by .""" + return text.strip().split() + + def word_detokenize(self, tokens): + """Word detokenizer, separate by .""" + return " ".join(tokens) + + def spm_tokenize(self, text): + """spm tokenize. + + Args: + text (str): text string. + + Returns: + List[str]: sentence pieces str code + """ + stats = {"num_empty": 0, "num_filtered": 0} + + def valid(line): + return True + + def encode(l): + return self.sp.EncodeAsPieces(l) + + def encode_line(line): + line = line.strip() + if len(line) > 0: + line = encode(line) + if valid(line): + return line + else: + stats["num_filtered"] += 1 + else: + stats["num_empty"] += 1 + return None + + enc_line = encode_line(text) + return enc_line + + def spm_detokenize(self, tokens, input_format='piece'): + """spm detokenize. + + Args: + ids (List[str]): tokens. + + Returns: + str: text + """ + if input_format == "piece": + + def decode(l): + return "".join(self.sp.DecodePieces(l)) + elif input_format == "id": + + def decode(l): + return "".join(self.sp.DecodeIds(l)) + + return decode(tokens) + + def _load_vocabulary_from_file(self, vocab: Union[str, list], + maskctc: bool): + """Load vocabulary from file.""" + if isinstance(vocab, list): + vocab_list = vocab + else: + vocab_list = load_dict(vocab, maskctc) + assert vocab_list is not None + logger.debug(f"Vocab: {pformat(vocab_list)}") + + id2token = dict( + [(idx, token) for (idx, token) in enumerate(vocab_list)]) + token2id = dict( + [(token, idx) for (idx, token) in enumerate(vocab_list)]) + + blank_id = vocab_list.index(BLANK) if BLANK in vocab_list else -1 + maskctc_id = vocab_list.index(MASKCTC) if MASKCTC in vocab_list else -1 + unk_id = vocab_list.index(UNK) if UNK in vocab_list else -1 + eos_id = vocab_list.index(EOS) if EOS in vocab_list else -1 + sos_id = vocab_list.index(SOS) if SOS in vocab_list else -1 + space_id = vocab_list.index(SPACE) if SPACE in vocab_list else -1 + + logger.info(f"BLANK id: {blank_id}") + logger.info(f"UNK id: {unk_id}") + logger.info(f"EOS id: {eos_id}") + logger.info(f"SOS id: {sos_id}") + logger.info(f"SPACE id: {space_id}") + logger.info(f"MASKCTC id: {maskctc_id}") + return token2id, id2token, vocab_list, unk_id, eos_id, blank_id diff --git a/paddlespeech/audio/text/utility.py b/paddlespeech/audio/text/utility.py new file mode 100644 index 0000000000000000000000000000000000000000..d35785db6825761e8bc26aada4c2c4d9d8066b0c --- /dev/null +++ b/paddlespeech/audio/text/utility.py @@ -0,0 +1,393 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Contains data helper functions.""" +import json +import math +import tarfile +from collections import namedtuple +from typing import List +from typing import Optional +from typing import Text + +import jsonlines +import numpy as np + +from paddlespeech.s2t.utils.log import Log + +logger = Log(__name__).getlog() + +__all__ = [ + "load_dict", "load_cmvn", "read_manifest", "rms_to_db", "rms_to_dbfs", + "max_dbfs", "mean_dbfs", "gain_db_to_ratio", "normalize_audio", "SOS", + "EOS", "UNK", "BLANK", "MASKCTC", "SPACE", "convert_samples_to_float32", + "convert_samples_from_float32" +] + +IGNORE_ID = -1 +# `sos` and `eos` using same token +SOS = "" +EOS = SOS +UNK = "" +BLANK = "" +MASKCTC = "" +SPACE = "" + + +def load_dict(dict_path: Optional[Text], maskctc=False) -> Optional[List[Text]]: + if dict_path is None: + return None + + with open(dict_path, "r") as f: + dictionary = f.readlines() + # first token is `` + # multi line: ` 0\n` + # one line: `` + # space is relpace with + char_list = [entry[:-1].split(" ")[0] for entry in dictionary] + if BLANK not in char_list: + char_list.insert(0, BLANK) + if EOS not in char_list: + char_list.append(EOS) + # for non-autoregressive maskctc model + if maskctc and MASKCTC not in char_list: + char_list.append(MASKCTC) + return char_list + + +def read_manifest( + manifest_path, + max_input_len=float('inf'), + min_input_len=0.0, + max_output_len=float('inf'), + min_output_len=0.0, + max_output_input_ratio=float('inf'), + min_output_input_ratio=0.0, ): + """Load and parse manifest file. + + Args: + manifest_path ([type]): Manifest file to load and parse. + max_input_len ([type], optional): maximum output seq length, + in seconds for raw wav, in frame numbers for feature data. + Defaults to float('inf'). + min_input_len (float, optional): minimum input seq length, + in seconds for raw wav, in frame numbers for feature data. + Defaults to 0.0. + max_output_len (float, optional): maximum input seq length, + in modeling units. Defaults to 500.0. + min_output_len (float, optional): minimum input seq length, + in modeling units. Defaults to 0.0. + max_output_input_ratio (float, optional): + maximum output seq length/output seq length ratio. Defaults to 10.0. + min_output_input_ratio (float, optional): + minimum output seq length/output seq length ratio. Defaults to 0.05. + + Raises: + IOError: If failed to parse the manifest. + + Returns: + List[dict]: Manifest parsing results. + """ + manifest = [] + with jsonlines.open(manifest_path, 'r') as reader: + for json_data in reader: + feat_len = json_data["input"][0]["shape"][ + 0] if "input" in json_data and "shape" in json_data["input"][ + 0] else 1.0 + token_len = json_data["output"][0]["shape"][ + 0] if "output" in json_data and "shape" in json_data["output"][ + 0] else 1.0 + conditions = [ + feat_len >= min_input_len, + feat_len <= max_input_len, + token_len >= min_output_len, + token_len <= max_output_len, + token_len / feat_len >= min_output_input_ratio, + token_len / feat_len <= max_output_input_ratio, + ] + if all(conditions): + manifest.append(json_data) + return manifest + + +# Tar File read +TarLocalData = namedtuple('TarLocalData', ['tar2info', 'tar2object']) + + +def parse_tar(file): + """Parse a tar file to get a tarfile object + and a map containing tarinfoes + """ + result = {} + f = tarfile.open(file) + for tarinfo in f.getmembers(): + result[tarinfo.name] = tarinfo + return f, result + + +def subfile_from_tar(file, local_data=None): + """Get subfile object from tar. 
+ + tar:tarpath#filename + + It will return a subfile object from tar file + and cached tar file info for next reading request. + """ + tarpath, filename = file.split(':', 1)[1].split('#', 1) + + if local_data is None: + local_data = TarLocalData(tar2info={}, tar2object={}) + + assert isinstance(local_data, TarLocalData) + + if 'tar2info' not in local_data.__dict__: + local_data.tar2info = {} + if 'tar2object' not in local_data.__dict__: + local_data.tar2object = {} + + if tarpath not in local_data.tar2info: + fobj, infos = parse_tar(tarpath) + local_data.tar2info[tarpath] = infos + local_data.tar2object[tarpath] = fobj + else: + fobj = local_data.tar2object[tarpath] + infos = local_data.tar2info[tarpath] + return fobj.extractfile(infos[filename]) + + +def rms_to_db(rms: float): + """Root Mean Square to dB. + + Args: + rms ([float]): root mean square + + Returns: + float: dB + """ + return 20.0 * math.log10(max(1e-16, rms)) + + +def rms_to_dbfs(rms: float): + """Root Mean Square to dBFS. + https://fireattack.wordpress.com/2017/02/06/replaygain-loudness-normalization-and-applications/ + Audio is mix of sine wave, so 1 amp sine wave's Full scale is 0.7071, equal to -3.0103dB. + + dB = dBFS + 3.0103 + dBFS = db - 3.0103 + e.g. 0 dB = -3.0103 dBFS + + Args: + rms ([float]): root mean square + + Returns: + float: dBFS + """ + return rms_to_db(rms) - 3.0103 + + +def max_dbfs(sample_data: np.ndarray): + """Peak dBFS based on the maximum energy sample. + + Args: + sample_data ([np.ndarray]): float array, [-1, 1]. + + Returns: + float: dBFS + """ + # Peak dBFS based on the maximum energy sample. Will prevent overdrive if used for normalization. + return rms_to_dbfs(max(abs(np.min(sample_data)), abs(np.max(sample_data)))) + + +def mean_dbfs(sample_data): + """Peak dBFS based on the RMS energy. + + Args: + sample_data ([np.ndarray]): float array, [-1, 1]. + + Returns: + float: dBFS + """ + return rms_to_dbfs( + math.sqrt(np.mean(np.square(sample_data, dtype=np.float64)))) + + +def gain_db_to_ratio(gain_db: float): + """dB to ratio + + Args: + gain_db (float): gain in dB + + Returns: + float: scale in amp + """ + return math.pow(10.0, gain_db / 20.0) + + +def normalize_audio(sample_data: np.ndarray, dbfs: float=-3.0103): + """Nomalize audio to dBFS. + + Args: + sample_data (np.ndarray): input wave samples, [-1, 1]. + dbfs (float, optional): target dBFS. Defaults to -3.0103. 
+ + Returns: + np.ndarray: normalized wave + """ + return np.maximum( + np.minimum(sample_data * gain_db_to_ratio(dbfs - max_dbfs(sample_data)), + 1.0), -1.0) + + +def _load_json_cmvn(json_cmvn_file): + """ Load the json format cmvn stats file and calculate cmvn + + Args: + json_cmvn_file: cmvn stats file in json format + + Returns: + a numpy array of [means, vars] + """ + with open(json_cmvn_file) as f: + cmvn_stats = json.load(f) + + means = cmvn_stats['mean_stat'] + variance = cmvn_stats['var_stat'] + count = cmvn_stats['frame_num'] + for i in range(len(means)): + means[i] /= count + variance[i] = variance[i] / count - means[i] * means[i] + if variance[i] < 1.0e-20: + variance[i] = 1.0e-20 + variance[i] = 1.0 / math.sqrt(variance[i]) + cmvn = np.array([means, variance]) + return cmvn + + +def _load_kaldi_cmvn(kaldi_cmvn_file): + """ Load the kaldi format cmvn stats file and calculate cmvn + + Args: + kaldi_cmvn_file: kaldi text style global cmvn file, which + is generated by: + compute-cmvn-stats --binary=false scp:feats.scp global_cmvn + + Returns: + a numpy array of [means, vars] + """ + means = [] + variance = [] + with open(kaldi_cmvn_file, 'r') as fid: + # kaldi binary file start with '\0B' + if fid.read(2) == '\0B': + logger.error('kaldi cmvn binary file is not supported, please ' + 'recompute it by: compute-cmvn-stats --binary=false ' + ' scp:feats.scp global_cmvn') + sys.exit(1) + fid.seek(0) + arr = fid.read().split() + assert (arr[0] == '[') + assert (arr[-2] == '0') + assert (arr[-1] == ']') + feat_dim = int((len(arr) - 2 - 2) / 2) + for i in range(1, feat_dim + 1): + means.append(float(arr[i])) + count = float(arr[feat_dim + 1]) + for i in range(feat_dim + 2, 2 * feat_dim + 2): + variance.append(float(arr[i])) + + for i in range(len(means)): + means[i] /= count + variance[i] = variance[i] / count - means[i] * means[i] + if variance[i] < 1.0e-20: + variance[i] = 1.0e-20 + variance[i] = 1.0 / math.sqrt(variance[i]) + cmvn = np.array([means, variance]) + return cmvn + + +def load_cmvn(cmvn_file: str, filetype: str): + """load cmvn from file. + + Args: + cmvn_file (str): cmvn path. + filetype (str): file type, optional[npz, json, kaldi]. + + Raises: + ValueError: file type not support. + + Returns: + Tuple[np.ndarray, np.ndarray]: mean, istd + """ + assert filetype in ['npz', 'json', 'kaldi'], filetype + filetype = filetype.lower() + if filetype == "json": + cmvn = _load_json_cmvn(cmvn_file) + elif filetype == "kaldi": + cmvn = _load_kaldi_cmvn(cmvn_file) + elif filetype == "npz": + eps = 1e-14 + npzfile = np.load(cmvn_file) + mean = np.squeeze(npzfile["mean"]) + std = np.squeeze(npzfile["std"]) + istd = 1 / (std + eps) + cmvn = [mean, istd] + else: + raise ValueError(f"cmvn file type no support: {filetype}") + return cmvn[0], cmvn[1] + + +def convert_samples_to_float32(samples): + """Convert sample type to float32. + + Audio sample type is usually integer or float-point. + Integers will be scaled to [-1, 1] in float32. + + PCM16 -> PCM32 + """ + float32_samples = samples.astype('float32') + if samples.dtype in np.sctypes['int']: + bits = np.iinfo(samples.dtype).bits + float32_samples *= (1. / 2**(bits - 1)) + elif samples.dtype in np.sctypes['float']: + pass + else: + raise TypeError("Unsupported sample type: %s." % samples.dtype) + return float32_samples + + +def convert_samples_from_float32(samples, dtype): + """Convert sample type from float32 to dtype. + + Audio sample type is usually integer or float-point. 
For integer + type, float32 will be rescaled from [-1, 1] to the maximum range + supported by the integer type. + + PCM32 -> PCM16 + """ + dtype = np.dtype(dtype) + output_samples = samples.copy() + if dtype in np.sctypes['int']: + bits = np.iinfo(dtype).bits + output_samples *= (2**(bits - 1) / 1.) + min_val = np.iinfo(dtype).min + max_val = np.iinfo(dtype).max + output_samples[output_samples > max_val] = max_val + output_samples[output_samples < min_val] = min_val + elif samples.dtype in np.sctypes['float']: + min_val = np.finfo(dtype).min + max_val = np.finfo(dtype).max + output_samples[output_samples > max_val] = max_val + output_samples[output_samples < min_val] = min_val + else: + raise TypeError("Unsupported sample type: %s." % samples.dtype) + return output_samples.astype(dtype) diff --git a/paddlespeech/s2t/transform/__init__.py b/paddlespeech/audio/transform/__init__.py similarity index 100% rename from paddlespeech/s2t/transform/__init__.py rename to paddlespeech/audio/transform/__init__.py diff --git a/paddlespeech/s2t/transform/add_deltas.py b/paddlespeech/audio/transform/add_deltas.py similarity index 100% rename from paddlespeech/s2t/transform/add_deltas.py rename to paddlespeech/audio/transform/add_deltas.py diff --git a/paddlespeech/s2t/transform/channel_selector.py b/paddlespeech/audio/transform/channel_selector.py similarity index 100% rename from paddlespeech/s2t/transform/channel_selector.py rename to paddlespeech/audio/transform/channel_selector.py diff --git a/paddlespeech/s2t/transform/cmvn.py b/paddlespeech/audio/transform/cmvn.py similarity index 100% rename from paddlespeech/s2t/transform/cmvn.py rename to paddlespeech/audio/transform/cmvn.py diff --git a/paddlespeech/s2t/transform/functional.py b/paddlespeech/audio/transform/functional.py similarity index 94% rename from paddlespeech/s2t/transform/functional.py rename to paddlespeech/audio/transform/functional.py index ccb500819e171bada581811905737fbfd7af015d..271819adb982f390d1b89478d58248b39eaa61ea 100644 --- a/paddlespeech/s2t/transform/functional.py +++ b/paddlespeech/audio/transform/functional.py @@ -14,8 +14,8 @@ # Modified from espnet(https://github.com/espnet/espnet) import inspect -from paddlespeech.s2t.transform.transform_interface import TransformInterface -from paddlespeech.s2t.utils.check_kwargs import check_kwargs +from paddlespeech.audio.transform.transform_interface import TransformInterface +from paddlespeech.audio.utils.check_kwargs import check_kwargs class FuncTrans(TransformInterface): diff --git a/paddlespeech/s2t/transform/perturb.py b/paddlespeech/audio/transform/perturb.py similarity index 86% rename from paddlespeech/s2t/transform/perturb.py rename to paddlespeech/audio/transform/perturb.py index b18caefb8b978d426a1b105cb1480067176cc6ff..8044dc36fa74e410b37eff35c54020ca0fef5c9e 100644 --- a/paddlespeech/s2t/transform/perturb.py +++ b/paddlespeech/audio/transform/perturb.py @@ -17,8 +17,97 @@ import numpy import scipy import soundfile -from paddlespeech.s2t.io.reader import SoundHDF5File +import io +import os +import h5py +import numpy as np +class SoundHDF5File(): + """Collecting sound files to a HDF5 file + + >>> f = SoundHDF5File('a.flac.h5', mode='a') + >>> array = np.random.randint(0, 100, 100, dtype=np.int16) + >>> f['id'] = (array, 16000) + >>> array, rate = f['id'] + + + :param: str filepath: + :param: str mode: + :param: str format: The type used when saving wav. flac, nist, htk, etc. 
+ :param: str dtype: + + """ + + def __init__(self, + filepath, + mode="r+", + format=None, + dtype="int16", + **kwargs): + self.filepath = filepath + self.mode = mode + self.dtype = dtype + + self.file = h5py.File(filepath, mode, **kwargs) + if format is None: + # filepath = a.flac.h5 -> format = flac + second_ext = os.path.splitext(os.path.splitext(filepath)[0])[1] + format = second_ext[1:] + if format.upper() not in soundfile.available_formats(): + # If not found, flac is selected + format = "flac" + + # This format affects only saving + self.format = format + + def __repr__(self): + return ''.format( + self.filepath, self.mode, self.format, self.dtype) + + def create_dataset(self, name, shape=None, data=None, **kwds): + f = io.BytesIO() + array, rate = data + soundfile.write(f, array, rate, format=self.format) + self.file.create_dataset( + name, shape=shape, data=np.void(f.getvalue()), **kwds) + + def __setitem__(self, name, data): + self.create_dataset(name, data=data) + + def __getitem__(self, key): + data = self.file[key][()] + f = io.BytesIO(data.tobytes()) + array, rate = soundfile.read(f, dtype=self.dtype) + return array, rate + + def keys(self): + return self.file.keys() + + def values(self): + for k in self.file: + yield self[k] + + def items(self): + for k in self.file: + yield k, self[k] + + def __iter__(self): + return iter(self.file) + + def __contains__(self, item): + return item in self.file + + def __len__(self, item): + return len(self.file) + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.file.close() + + def close(self): + self.file.close() class SpeedPerturbation(): """SpeedPerturbation @@ -469,3 +558,4 @@ class RIRConvolve(): [scipy.convolve(x, r, mode="same") for r in rir], axis=-1) else: return scipy.convolve(x, rir, mode="same") + diff --git a/paddlespeech/s2t/transform/spec_augment.py b/paddlespeech/audio/transform/spec_augment.py similarity index 97% rename from paddlespeech/s2t/transform/spec_augment.py rename to paddlespeech/audio/transform/spec_augment.py index 5ce950851a4ee6dbaa2bcbe529cbc89ce714a60b..029e7b8f5a2f316e081df3b8e5b2f780f533e258 100644 --- a/paddlespeech/s2t/transform/spec_augment.py +++ b/paddlespeech/audio/transform/spec_augment.py @@ -14,12 +14,10 @@ # Modified from espnet(https://github.com/espnet/espnet) """Spec Augment module for preprocessing i.e., data augmentation""" import random - import numpy from PIL import Image -from PIL.Image import BICUBIC -from paddlespeech.s2t.transform.functional import FuncTrans +from .functional import FuncTrans def time_warp(x, max_time_warp=80, inplace=False, mode="PIL"): @@ -46,9 +44,10 @@ def time_warp(x, max_time_warp=80, inplace=False, mode="PIL"): warped = random.randrange(center - window, center + window) + 1 # 1 ... 
t - 1 - left = Image.fromarray(x[:center]).resize((x.shape[1], warped), BICUBIC) + left = Image.fromarray(x[:center]).resize((x.shape[1], warped), + Image.BICUBIC) right = Image.fromarray(x[center:]).resize((x.shape[1], t - warped), - BICUBIC) + Image.BICUBIC) if inplace: x[:warped] = left x[warped:] = right diff --git a/paddlespeech/s2t/transform/spectrogram.py b/paddlespeech/audio/transform/spectrogram.py similarity index 99% rename from paddlespeech/s2t/transform/spectrogram.py rename to paddlespeech/audio/transform/spectrogram.py index 19f0237bff0effeec29ee96e98928448e72f057d..864f3f9940b7a34d81c1836157c2740c9b85c1ca 100644 --- a/paddlespeech/s2t/transform/spectrogram.py +++ b/paddlespeech/audio/transform/spectrogram.py @@ -17,7 +17,7 @@ import numpy as np import paddle from python_speech_features import logfbank -import paddlespeech.audio.compliance.kaldi as kaldi +from ..compliance import kaldi def stft(x, diff --git a/paddlespeech/s2t/transform/transform_interface.py b/paddlespeech/audio/transform/transform_interface.py similarity index 100% rename from paddlespeech/s2t/transform/transform_interface.py rename to paddlespeech/audio/transform/transform_interface.py diff --git a/paddlespeech/s2t/transform/transformation.py b/paddlespeech/audio/transform/transformation.py similarity index 75% rename from paddlespeech/s2t/transform/transformation.py rename to paddlespeech/audio/transform/transformation.py index 3b433cb0bc50c7c3e3cbf847f2906d0f6b554d99..d24d6437c5bb78920bc33ae3fd98b3bebc32016c 100644 --- a/paddlespeech/s2t/transform/transformation.py +++ b/paddlespeech/audio/transform/transformation.py @@ -22,32 +22,32 @@ from inspect import signature import yaml -from paddlespeech.s2t.utils.dynamic_import import dynamic_import +from ..utils.dynamic_import import dynamic_import import_alias = dict( - identity="paddlespeech.s2t.transform.transform_interface:Identity", - time_warp="paddlespeech.s2t.transform.spec_augment:TimeWarp", - time_mask="paddlespeech.s2t.transform.spec_augment:TimeMask", - freq_mask="paddlespeech.s2t.transform.spec_augment:FreqMask", - spec_augment="paddlespeech.s2t.transform.spec_augment:SpecAugment", - speed_perturbation="paddlespeech.s2t.transform.perturb:SpeedPerturbation", - speed_perturbation_sox="paddlespeech.s2t.transform.perturb:SpeedPerturbationSox", - volume_perturbation="paddlespeech.s2t.transform.perturb:VolumePerturbation", - noise_injection="paddlespeech.s2t.transform.perturb:NoiseInjection", - bandpass_perturbation="paddlespeech.s2t.transform.perturb:BandpassPerturbation", - rir_convolve="paddlespeech.s2t.transform.perturb:RIRConvolve", - delta="paddlespeech.s2t.transform.add_deltas:AddDeltas", - cmvn="paddlespeech.s2t.transform.cmvn:CMVN", - utterance_cmvn="paddlespeech.s2t.transform.cmvn:UtteranceCMVN", - fbank="paddlespeech.s2t.transform.spectrogram:LogMelSpectrogram", - spectrogram="paddlespeech.s2t.transform.spectrogram:Spectrogram", - stft="paddlespeech.s2t.transform.spectrogram:Stft", - istft="paddlespeech.s2t.transform.spectrogram:IStft", - stft2fbank="paddlespeech.s2t.transform.spectrogram:Stft2LogMelSpectrogram", - wpe="paddlespeech.s2t.transform.wpe:WPE", - channel_selector="paddlespeech.s2t.transform.channel_selector:ChannelSelector", - fbank_kaldi="paddlespeech.s2t.transform.spectrogram:LogMelSpectrogramKaldi", - cmvn_json="paddlespeech.s2t.transform.cmvn:GlobalCMVN") + identity="paddlespeech.audio.transform.transform_interface:Identity", + time_warp="paddlespeech.audio.transform.spec_augment:TimeWarp", + 
time_mask="paddlespeech.audio.transform.spec_augment:TimeMask", + freq_mask="paddlespeech.audio.transform.spec_augment:FreqMask", + spec_augment="paddlespeech.audio.transform.spec_augment:SpecAugment", + speed_perturbation="paddlespeech.audio.transform.perturb:SpeedPerturbation", + speed_perturbation_sox="paddlespeech.audio.transform.perturb:SpeedPerturbationSox", + volume_perturbation="paddlespeech.audio.transform.perturb:VolumePerturbation", + noise_injection="paddlespeech.audio.transform.perturb:NoiseInjection", + bandpass_perturbation="paddlespeech.audio.transform.perturb:BandpassPerturbation", + rir_convolve="paddlespeech.audio.transform.perturb:RIRConvolve", + delta="paddlespeech.audio.transform.add_deltas:AddDeltas", + cmvn="paddlespeech.audio.transform.cmvn:CMVN", + utterance_cmvn="paddlespeech.audio.transform.cmvn:UtteranceCMVN", + fbank="paddlespeech.audio.transform.spectrogram:LogMelSpectrogram", + spectrogram="paddlespeech.audio.transform.spectrogram:Spectrogram", + stft="paddlespeech.audio.transform.spectrogram:Stft", + istft="paddlespeech.audio.transform.spectrogram:IStft", + stft2fbank="paddlespeech.audio.transform.spectrogram:Stft2LogMelSpectrogram", + wpe="paddlespeech.audio.transform.wpe:WPE", + channel_selector="paddlespeech.audio.transform.channel_selector:ChannelSelector", + fbank_kaldi="paddlespeech.audio.transform.spectrogram:LogMelSpectrogramKaldi", + cmvn_json="paddlespeech.audio.transform.cmvn:GlobalCMVN") class Transformation(): diff --git a/paddlespeech/s2t/transform/wpe.py b/paddlespeech/audio/transform/wpe.py similarity index 100% rename from paddlespeech/s2t/transform/wpe.py rename to paddlespeech/audio/transform/wpe.py diff --git a/paddlespeech/audio/utils/check_kwargs.py b/paddlespeech/audio/utils/check_kwargs.py new file mode 100644 index 0000000000000000000000000000000000000000..0aa839aca8bf177d39bd174db730413bfc8a3b90 --- /dev/null +++ b/paddlespeech/audio/utils/check_kwargs.py @@ -0,0 +1,35 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# Modified from espnet(https://github.com/espnet/espnet) +import inspect + + +def check_kwargs(func, kwargs, name=None): + """check kwargs are valid for func + + If kwargs are invalid, raise TypeError as same as python default + :param function func: function to be validated + :param dict kwargs: keyword arguments for func + :param str name: name used in TypeError (default is func name) + """ + try: + params = inspect.signature(func).parameters + except ValueError: + return + if name is None: + name = func.__name__ + for k in kwargs.keys(): + if k not in params: + raise TypeError( + f"{name}() got an unexpected keyword argument '{k}'") diff --git a/paddlespeech/audio/utils/dynamic_import.py b/paddlespeech/audio/utils/dynamic_import.py new file mode 100644 index 0000000000000000000000000000000000000000..99f93356fe0c3dfa31f468d412c113d9f3379d3d --- /dev/null +++ b/paddlespeech/audio/utils/dynamic_import.py @@ -0,0 +1,38 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from espnet(https://github.com/espnet/espnet) +import importlib + +__all__ = ["dynamic_import"] + + +def dynamic_import(import_path, alias=dict()): + """dynamic import module and class + + :param str import_path: syntax 'module_name:class_name' + e.g., 'paddlespeech.s2t.models.u2:U2Model' + :param dict alias: shortcut for registered class + :return: imported class + """ + if import_path not in alias and ":" not in import_path: + raise ValueError( + "import_path should be one of {} or " + 'include ":", e.g. "paddlespeech.s2t.models.u2:U2Model" : ' + "{}".format(set(alias), import_path)) + if ":" not in import_path: + import_path = alias[import_path] + + module_name, objname = import_path.split(":") + m = importlib.import_module(module_name) + return getattr(m, objname) diff --git a/paddlespeech/audio/utils/log.py b/paddlespeech/audio/utils/log.py index 5656b286a0370a16ace6b89222f8a85f9f23df69..0a25bbd5fe9edac3b62b0bc58621550e16d94f8e 100644 --- a/paddlespeech/audio/utils/log.py +++ b/paddlespeech/audio/utils/log.py @@ -65,6 +65,7 @@ class Logger(object): def __init__(self, name: str=None): name = 'PaddleAudio' if not name else name + self.name = name self.logger = logging.getLogger(name) for key, conf in log_config.items(): @@ -101,7 +102,7 @@ class Logger(object): if not self.is_enable: return - self.logger.log(log_level, msg) + self.logger.log(log_level, self.name + " | " + msg) @contextlib.contextmanager def use_terminator(self, terminator: str): diff --git a/paddlespeech/audio/utils/tensor_utils.py b/paddlespeech/audio/utils/tensor_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..16f60810e6a426d764685a66f0ecc8c0bacdc6b0 --- /dev/null +++ b/paddlespeech/audio/utils/tensor_utils.py @@ -0,0 +1,192 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Unility functions for Transformer.""" +from typing import List +from typing import Tuple + +import paddle + +from .log import Logger + +__all__ = ["pad_sequence", "add_sos_eos", "th_accuracy", "has_tensor"] + +logger = Logger(__name__) + + +def has_tensor(val): + if isinstance(val, (list, tuple)): + for item in val: + if has_tensor(item): + return True + elif isinstance(val, dict): + for k, v in val.items(): + print(k) + if has_tensor(v): + return True + else: + return paddle.is_tensor(val) + + +def pad_sequence(sequences: List[paddle.Tensor], + batch_first: bool=False, + padding_value: float=0.0) -> paddle.Tensor: + r"""Pad a list of variable length Tensors with ``padding_value`` + + ``pad_sequence`` stacks a list of Tensors along a new dimension, + and pads them to equal length. For example, if the input is list of + sequences with size ``L x *`` and if batch_first is False, and ``T x B x *`` + otherwise. + + `B` is batch size. It is equal to the number of elements in ``sequences``. + `T` is length of the longest sequence. + `L` is length of the sequence. + `*` is any number of trailing dimensions, including none. + + Example: + >>> from paddle.nn.utils.rnn import pad_sequence + >>> a = paddle.ones(25, 300) + >>> b = paddle.ones(22, 300) + >>> c = paddle.ones(15, 300) + >>> pad_sequence([a, b, c]).shape + paddle.Tensor([25, 3, 300]) + + Note: + This function returns a Tensor of size ``T x B x *`` or ``B x T x *`` + where `T` is the length of the longest sequence. This function assumes + trailing dimensions and type of all the Tensors in sequences are same. + + Args: + sequences (list[Tensor]): list of variable length sequences. + batch_first (bool, optional): output will be in ``B x T x *`` if True, or in + ``T x B x *`` otherwise + padding_value (float, optional): value for padded elements. Default: 0. + + Returns: + Tensor of size ``T x B x *`` if :attr:`batch_first` is ``False``. + Tensor of size ``B x T x *`` otherwise + """ + + # assuming trailing dimensions and type of all the Tensors + # in sequences are same and fetching those from sequences[0] + max_size = paddle.shape(sequences[0]) + # (TODO Hui Zhang): slice not supprot `end==start` + # trailing_dims = max_size[1:] + trailing_dims = tuple( + max_size[1:].numpy().tolist()) if sequences[0].ndim >= 2 else () + max_len = max([s.shape[0] for s in sequences]) + if batch_first: + out_dims = (len(sequences), max_len) + trailing_dims + else: + out_dims = (max_len, len(sequences)) + trailing_dims + out_tensor = paddle.full(out_dims, padding_value, sequences[0].dtype) + for i, tensor in enumerate(sequences): + length = tensor.shape[0] + # use index notation to prevent duplicate references to the tensor + if batch_first: + # TODO (Hui Zhang): set_value op not supprot `end==start` + # TODO (Hui Zhang): set_value op not support int16 + # TODO (Hui Zhang): set_varbase 2 rank not support [0,0,...] + # out_tensor[i, :length, ...] = tensor + if length != 0: + out_tensor[i, :length] = tensor + else: + out_tensor[i, length] = tensor + else: + # TODO (Hui Zhang): set_value op not supprot `end==start` + # out_tensor[:length, i, ...] 
= tensor + if length != 0: + out_tensor[:length, i] = tensor + else: + out_tensor[length, i] = tensor + + return out_tensor + + +def add_sos_eos(ys_pad: paddle.Tensor, sos: int, eos: int, + ignore_id: int) -> Tuple[paddle.Tensor, paddle.Tensor]: + """Add and labels. + Args: + ys_pad (paddle.Tensor): batch of padded target sequences (B, Lmax) + sos (int): index of + eos (int): index of + ignore_id (int): index of padding + Returns: + ys_in (paddle.Tensor) : (B, Lmax + 1) + ys_out (paddle.Tensor) : (B, Lmax + 1) + Examples: + >>> sos_id = 10 + >>> eos_id = 11 + >>> ignore_id = -1 + >>> ys_pad + tensor([[ 1, 2, 3, 4, 5], + [ 4, 5, 6, -1, -1], + [ 7, 8, 9, -1, -1]], dtype=paddle.int32) + >>> ys_in,ys_out=add_sos_eos(ys_pad, sos_id , eos_id, ignore_id) + >>> ys_in + tensor([[10, 1, 2, 3, 4, 5], + [10, 4, 5, 6, 11, 11], + [10, 7, 8, 9, 11, 11]]) + >>> ys_out + tensor([[ 1, 2, 3, 4, 5, 11], + [ 4, 5, 6, 11, -1, -1], + [ 7, 8, 9, 11, -1, -1]]) + """ + # TODO(Hui Zhang): using comment code, + #_sos = paddle.to_tensor( + # [sos], dtype=paddle.long, stop_gradient=True, place=ys_pad.place) + #_eos = paddle.to_tensor( + # [eos], dtype=paddle.long, stop_gradient=True, place=ys_pad.place) + #ys = [y[y != ignore_id] for y in ys_pad] # parse padded ys + #ys_in = [paddle.cat([_sos, y], dim=0) for y in ys] + #ys_out = [paddle.cat([y, _eos], dim=0) for y in ys] + #return pad_sequence(ys_in, padding_value=eos), pad_sequence(ys_out, padding_value=ignore_id) + B = ys_pad.shape[0] + _sos = paddle.ones([B, 1], dtype=ys_pad.dtype) * sos + _eos = paddle.ones([B, 1], dtype=ys_pad.dtype) * eos + ys_in = paddle.cat([_sos, ys_pad], dim=1) + mask_pad = (ys_in == ignore_id) + ys_in = ys_in.masked_fill(mask_pad, eos) + + ys_out = paddle.cat([ys_pad, _eos], dim=1) + ys_out = ys_out.masked_fill(mask_pad, eos) + mask_eos = (ys_out == ignore_id) + ys_out = ys_out.masked_fill(mask_eos, eos) + ys_out = ys_out.masked_fill(mask_pad, ignore_id) + return ys_in, ys_out + + +def th_accuracy(pad_outputs: paddle.Tensor, + pad_targets: paddle.Tensor, + ignore_label: int) -> float: + """Calculate accuracy. + Args: + pad_outputs (Tensor): Prediction tensors (B * Lmax, D). + pad_targets (LongTensor): Target label tensors (B, Lmax, D). + ignore_label (int): Ignore label id. + Returns: + float: Accuracy value (0.0 - 1.0). 
+ """ + pad_pred = pad_outputs.view(pad_targets.shape[0], pad_targets.shape[1], + pad_outputs.shape[1]).argmax(2) + mask = pad_targets != ignore_label + #TODO(Hui Zhang): sum not support bool type + # numerator = paddle.sum( + # pad_pred.masked_select(mask) == pad_targets.masked_select(mask)) + numerator = ( + pad_pred.masked_select(mask) == pad_targets.masked_select(mask)) + numerator = paddle.sum(numerator.type_as(pad_targets)) + #TODO(Hui Zhang): sum not support bool type + # denominator = paddle.sum(mask) + denominator = paddle.sum(mask.type_as(pad_targets)) + return float(numerator) / float(denominator) diff --git a/paddlespeech/cli/asr/infer.py b/paddlespeech/cli/asr/infer.py index 24839a8988ca8adcde8ad015f8abdd0ff4a3b9d9..76dfafb926135abc9295ca9a640648b837ebfe84 100644 --- a/paddlespeech/cli/asr/infer.py +++ b/paddlespeech/cli/asr/infer.py @@ -33,8 +33,8 @@ from ..log import logger from ..utils import CLI_TIMER from ..utils import stats_wrapper from ..utils import timer_register +from paddlespeech.audio.transform.transformation import Transformation from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer -from paddlespeech.s2t.transform.transformation import Transformation from paddlespeech.s2t.utils.utility import UpdateConfig __all__ = ['ASRExecutor'] @@ -133,11 +133,11 @@ class ASRExecutor(BaseExecutor): """ Init model and other resources from a specific path. """ - logger.info("start to init the model") + logger.debug("start to init the model") # default max_len: unit:second self.max_len = 50 if hasattr(self, 'model'): - logger.info('Model had been initialized.') + logger.debug('Model had been initialized.') return if cfg_path is None or ckpt_path is None: @@ -151,15 +151,15 @@ class ASRExecutor(BaseExecutor): self.ckpt_path = os.path.join( self.res_path, self.task_resource.res_dict['ckpt_path'] + ".pdparams") - logger.info(self.res_path) + logger.debug(self.res_path) else: self.cfg_path = os.path.abspath(cfg_path) self.ckpt_path = os.path.abspath(ckpt_path + ".pdparams") self.res_path = os.path.dirname( os.path.dirname(os.path.abspath(self.cfg_path))) - logger.info(self.cfg_path) - logger.info(self.ckpt_path) + logger.debug(self.cfg_path) + logger.debug(self.ckpt_path) #Init body. 
self.config = CfgNode(new_allowed=True) @@ -216,7 +216,7 @@ class ASRExecutor(BaseExecutor): max_len = self.config.encoder_conf.max_len self.max_len = frame_shift_ms * max_len * subsample_rate - logger.info( + logger.debug( f"The asr server limit max duration len: {self.max_len}") def preprocess(self, model_type: str, input: Union[str, os.PathLike]): @@ -227,15 +227,15 @@ class ASRExecutor(BaseExecutor): audio_file = input if isinstance(audio_file, (str, os.PathLike)): - logger.info("Preprocess audio_file:" + audio_file) + logger.debug("Preprocess audio_file:" + audio_file) # Get the object for feature extraction if "deepspeech2" in model_type or "conformer" in model_type or "transformer" in model_type: - logger.info("get the preprocess conf") + logger.debug("get the preprocess conf") preprocess_conf = self.config.preprocess_config preprocess_args = {"train": False} preprocessing = Transformation(preprocess_conf) - logger.info("read the audio file") + logger.debug("read the audio file") audio, audio_sample_rate = soundfile.read( audio_file, dtype="int16", always_2d=True) if self.change_format: @@ -255,7 +255,7 @@ class ASRExecutor(BaseExecutor): else: audio = audio[:, 0] - logger.info(f"audio shape: {audio.shape}") + logger.debug(f"audio shape: {audio.shape}") # fbank audio = preprocessing(audio, **preprocess_args) @@ -264,19 +264,19 @@ class ASRExecutor(BaseExecutor): self._inputs["audio"] = audio self._inputs["audio_len"] = audio_len - logger.info(f"audio feat shape: {audio.shape}") + logger.debug(f"audio feat shape: {audio.shape}") else: raise Exception("wrong type") - logger.info("audio feat process success") + logger.debug("audio feat process success") @paddle.no_grad() def infer(self, model_type: str): """ Model inference and result stored in self.output. """ - logger.info("start to infer the model to get the output") + logger.debug("start to infer the model to get the output") cfg = self.config.decode audio = self._inputs["audio"] audio_len = self._inputs["audio_len"] @@ -293,7 +293,7 @@ class ASRExecutor(BaseExecutor): self._outputs["result"] = result_transcripts[0] elif "conformer" in model_type or "transformer" in model_type: - logger.info( + logger.debug( f"we will use the transformer like model : {model_type}") try: result_transcripts = self.model.decode( @@ -352,7 +352,7 @@ class ASRExecutor(BaseExecutor): logger.error("Please input the right audio file path") return False - logger.info("checking the audio file format......") + logger.debug("checking the audio file format......") try: audio, audio_sample_rate = soundfile.read( audio_file, dtype="int16", always_2d=True) @@ -374,7 +374,7 @@ class ASRExecutor(BaseExecutor): sox input_audio.xx --rate 8k --bits 16 --channels 1 output_audio.wav \n \ ") return False - logger.info("The sample rate is %d" % audio_sample_rate) + logger.debug("The sample rate is %d" % audio_sample_rate) if audio_sample_rate != self.sample_rate: logger.warning("The sample rate of the input file is not {}.\n \ The program will resample the wav file to {}.\n \ @@ -383,28 +383,28 @@ class ASRExecutor(BaseExecutor): ".format(self.sample_rate, self.sample_rate)) if force_yes is False: while (True): - logger.info( + logger.debug( "Whether to change the sample rate and the channel. Y: change the sample. N: exit the prgream." 
) content = input("Input(Y/N):") if content.strip() == "Y" or content.strip( ) == "y" or content.strip() == "yes" or content.strip( ) == "Yes": - logger.info( + logger.debug( "change the sampele rate, channel to 16k and 1 channel" ) break elif content.strip() == "N" or content.strip( ) == "n" or content.strip() == "no" or content.strip( ) == "No": - logger.info("Exit the program") + logger.debug("Exit the program") return False else: logger.warning("Not regular input, please input again") self.change_format = True else: - logger.info("The audio file format is right") + logger.debug("The audio file format is right") self.change_format = False return True diff --git a/paddlespeech/cli/cls/infer.py b/paddlespeech/cli/cls/infer.py index 942dc3b9230b796057eb6fc9065d611867de6d4f..c869e28bfa30c61091df20d3cda2e4a3d56040cd 100644 --- a/paddlespeech/cli/cls/infer.py +++ b/paddlespeech/cli/cls/infer.py @@ -92,7 +92,7 @@ class CLSExecutor(BaseExecutor): Init model and other resources from a specific path. """ if hasattr(self, 'model'): - logger.info('Model had been initialized.') + logger.debug('Model had been initialized.') return if label_file is None or ckpt_path is None: @@ -135,14 +135,14 @@ class CLSExecutor(BaseExecutor): Input content can be a text(tts), a file(asr, cls) or a streaming(not supported yet). """ feat_conf = self._conf['feature'] - logger.info(feat_conf) + logger.debug(feat_conf) waveform, _ = load( file=audio_file, sr=feat_conf['sample_rate'], mono=True, dtype='float32') if isinstance(audio_file, (str, os.PathLike)): - logger.info("Preprocessing audio_file:" + audio_file) + logger.debug("Preprocessing audio_file:" + audio_file) # Feature extraction feature_extractor = LogMelSpectrogram( diff --git a/paddlespeech/cli/download.py b/paddlespeech/cli/download.py index ec72587470e8f0e211e453e3b2b2ea3d1f54f25b..5661f18f938eeffdb829f9f091bafc75baddb388 100644 --- a/paddlespeech/cli/download.py +++ b/paddlespeech/cli/download.py @@ -61,7 +61,7 @@ def _get_unique_endpoints(trainer_endpoints): continue ips.add(ip) unique_endpoints.add(endpoint) - logger.info("unique_endpoints {}".format(unique_endpoints)) + logger.debug("unique_endpoints {}".format(unique_endpoints)) return unique_endpoints @@ -96,7 +96,7 @@ def get_path_from_url(url, # data, and the same ip will only download data once. 
unique_endpoints = _get_unique_endpoints(ParallelEnv().trainer_endpoints[:]) if osp.exists(fullpath) and check_exist and _md5check(fullpath, md5sum): - logger.info("Found {}".format(fullpath)) + logger.debug("Found {}".format(fullpath)) else: if ParallelEnv().current_endpoint in unique_endpoints: fullpath = _download(url, root_dir, md5sum, method=method) @@ -118,7 +118,7 @@ def _get_download(url, fullname): try: req = requests.get(url, stream=True) except Exception as e: # requests.exceptions.ConnectionError - logger.info("Downloading {} from {} failed with exception {}".format( + logger.debug("Downloading {} from {} failed with exception {}".format( fname, url, str(e))) return False @@ -190,7 +190,7 @@ def _download(url, path, md5sum=None, method='get'): fullname = osp.join(path, fname) retry_cnt = 0 - logger.info("Downloading {} from {}".format(fname, url)) + logger.debug("Downloading {} from {}".format(fname, url)) while not (osp.exists(fullname) and _md5check(fullname, md5sum)): if retry_cnt < DOWNLOAD_RETRY_LIMIT: retry_cnt += 1 @@ -209,7 +209,7 @@ def _md5check(fullname, md5sum=None): if md5sum is None: return True - logger.info("File {} md5 checking...".format(fullname)) + logger.debug("File {} md5 checking...".format(fullname)) md5 = hashlib.md5() with open(fullname, 'rb') as f: for chunk in iter(lambda: f.read(4096), b""): @@ -217,8 +217,8 @@ def _md5check(fullname, md5sum=None): calc_md5sum = md5.hexdigest() if calc_md5sum != md5sum: - logger.info("File {} md5 check failed, {}(calc) != " - "{}(base)".format(fullname, calc_md5sum, md5sum)) + logger.debug("File {} md5 check failed, {}(calc) != " + "{}(base)".format(fullname, calc_md5sum, md5sum)) return False return True @@ -227,7 +227,7 @@ def _decompress(fname): """ Decompress for zip and tar file """ - logger.info("Decompressing {}...".format(fname)) + logger.debug("Decompressing {}...".format(fname)) # For protecting decompressing interupted, # decompress to fpath_tmp directory firstly, if decompress diff --git a/paddlespeech/cli/executor.py b/paddlespeech/cli/executor.py index d390f947d17cccc99a12eee75f634242e4bac9bb..d4187a51459498bcf3b5130cbfb03c97a4506077 100644 --- a/paddlespeech/cli/executor.py +++ b/paddlespeech/cli/executor.py @@ -217,7 +217,7 @@ class BaseExecutor(ABC): logging.getLogger(name) for name in logging.root.manager.loggerDict ] for l in loggers: - l.disabled = True + l.setLevel(logging.ERROR) def show_rtf(self, info: Dict[str, List[float]]): """ diff --git a/paddlespeech/cli/kws/infer.py b/paddlespeech/cli/kws/infer.py index e3f426f5776e3a280d21ff1cb7bf48a98c13107c..111cfd7542bc46a301862fb8df94934e1639dbfb 100644 --- a/paddlespeech/cli/kws/infer.py +++ b/paddlespeech/cli/kws/infer.py @@ -88,7 +88,7 @@ class KWSExecutor(BaseExecutor): Init model and other resources from a specific path. 
""" if hasattr(self, 'model'): - logger.info('Model had been initialized.') + logger.debug('Model had been initialized.') return if ckpt_path is None: @@ -141,7 +141,7 @@ class KWSExecutor(BaseExecutor): assert os.path.isfile(audio_file) waveform, _ = load(audio_file) if isinstance(audio_file, (str, os.PathLike)): - logger.info("Preprocessing audio_file:" + audio_file) + logger.debug("Preprocessing audio_file:" + audio_file) # Feature extraction waveform = paddle.to_tensor(waveform).unsqueeze(0) diff --git a/paddlespeech/cli/log.py b/paddlespeech/cli/log.py index 8644064c73ef407476e7870e65d1149019762723..8b33e71e100a1d56f17cc9839a004f49d8f2431d 100644 --- a/paddlespeech/cli/log.py +++ b/paddlespeech/cli/log.py @@ -49,7 +49,7 @@ class Logger(object): self.handler.setFormatter(self.format) self.logger.addHandler(self.handler) - self.logger.setLevel(logging.DEBUG) + self.logger.setLevel(logging.INFO) self.logger.propagate = False def __call__(self, log_level: str, msg: str): diff --git a/paddlespeech/cli/st/infer.py b/paddlespeech/cli/st/infer.py index 4e099c4021eca94bfea64eaefcd66267136eada1..bc2bdd1ac202a0a86ca6b50056047fba8146db25 100644 --- a/paddlespeech/cli/st/infer.py +++ b/paddlespeech/cli/st/infer.py @@ -110,7 +110,7 @@ class STExecutor(BaseExecutor): """ decompressed_path = download_and_decompress(self.kaldi_bins, MODEL_HOME) decompressed_path = os.path.abspath(decompressed_path) - logger.info("Kaldi_bins stored in: {}".format(decompressed_path)) + logger.debug("Kaldi_bins stored in: {}".format(decompressed_path)) if "LD_LIBRARY_PATH" in os.environ: os.environ["LD_LIBRARY_PATH"] += f":{decompressed_path}" else: @@ -128,7 +128,7 @@ class STExecutor(BaseExecutor): Init model and other resources from a specific path. """ if hasattr(self, 'model'): - logger.info('Model had been initialized.') + logger.debug('Model had been initialized.') return if cfg_path is None or ckpt_path is None: @@ -140,8 +140,8 @@ class STExecutor(BaseExecutor): self.ckpt_path = os.path.join( self.task_resource.res_dir, self.task_resource.res_dict['ckpt_path']) - logger.info(self.cfg_path) - logger.info(self.ckpt_path) + logger.debug(self.cfg_path) + logger.debug(self.ckpt_path) res_path = self.task_resource.res_dir else: self.cfg_path = os.path.abspath(cfg_path) @@ -192,7 +192,7 @@ class STExecutor(BaseExecutor): Input content can be a file(wav). """ audio_file = os.path.abspath(wav_file) - logger.info("Preprocess audio_file:" + audio_file) + logger.debug("Preprocess audio_file:" + audio_file) if "fat_st" in model_type: cmvn = self.config.cmvn_path diff --git a/paddlespeech/cli/text/infer.py b/paddlespeech/cli/text/infer.py index 7b8faf99c84691971744fbef291a714900dc60bc..24b8c9c2593ccdfe9a07cdb5d607a0416844e0a0 100644 --- a/paddlespeech/cli/text/infer.py +++ b/paddlespeech/cli/text/infer.py @@ -98,7 +98,7 @@ class TextExecutor(BaseExecutor): Init model and other resources from a specific path. """ if hasattr(self, 'model'): - logger.info('Model had been initialized.') + logger.debug('Model had been initialized.') return self.task = task diff --git a/paddlespeech/cli/tts/infer.py b/paddlespeech/cli/tts/infer.py index 4e0337bccea500c382f0782860cec36ad4897c46..ade8cdd6dc5f4f255a582b40fe6a7aa336b04fa0 100644 --- a/paddlespeech/cli/tts/infer.py +++ b/paddlespeech/cli/tts/infer.py @@ -173,16 +173,23 @@ class TTSExecutor(BaseExecutor): Init model and other resources from a specific path. 
""" if hasattr(self, 'am_inference') and hasattr(self, 'voc_inference'): - logger.info('Models had been initialized.') + logger.debug('Models had been initialized.') return + # am + if am_ckpt is None or am_config is None or am_stat is None or phones_dict is None: + use_pretrained_am = True + else: + use_pretrained_am = False + am_tag = am + '-' + lang self.task_resource.set_task_model( model_tag=am_tag, model_type=0, # am + skip_download=not use_pretrained_am, version=None, # default version ) - if am_ckpt is None or am_config is None or am_stat is None or phones_dict is None: + if use_pretrained_am: self.am_res_path = self.task_resource.res_dir self.am_config = os.path.join(self.am_res_path, self.task_resource.res_dict['config']) @@ -193,9 +200,9 @@ class TTSExecutor(BaseExecutor): # must have phones_dict in acoustic self.phones_dict = os.path.join( self.am_res_path, self.task_resource.res_dict['phones_dict']) - logger.info(self.am_res_path) - logger.info(self.am_config) - logger.info(self.am_ckpt) + logger.debug(self.am_res_path) + logger.debug(self.am_config) + logger.debug(self.am_ckpt) else: self.am_config = os.path.abspath(am_config) self.am_ckpt = os.path.abspath(am_ckpt) @@ -220,13 +227,19 @@ class TTSExecutor(BaseExecutor): self.speaker_dict = speaker_dict # voc + if voc_ckpt is None or voc_config is None or voc_stat is None: + use_pretrained_voc = True + else: + use_pretrained_voc = False + voc_tag = voc + '-' + lang self.task_resource.set_task_model( model_tag=voc_tag, model_type=1, # vocoder + skip_download=not use_pretrained_voc, version=None, # default version ) - if voc_ckpt is None or voc_config is None or voc_stat is None: + if use_pretrained_voc: self.voc_res_path = self.task_resource.voc_res_dir self.voc_config = os.path.join( self.voc_res_path, self.task_resource.voc_res_dict['config']) @@ -235,9 +248,9 @@ class TTSExecutor(BaseExecutor): self.voc_stat = os.path.join( self.voc_res_path, self.task_resource.voc_res_dict['speech_stats']) - logger.info(self.voc_res_path) - logger.info(self.voc_config) - logger.info(self.voc_ckpt) + logger.debug(self.voc_res_path) + logger.debug(self.voc_config) + logger.debug(self.voc_ckpt) else: self.voc_config = os.path.abspath(voc_config) self.voc_ckpt = os.path.abspath(voc_ckpt) @@ -254,21 +267,18 @@ class TTSExecutor(BaseExecutor): with open(self.phones_dict, "r") as f: phn_id = [line.strip().split() for line in f.readlines()] vocab_size = len(phn_id) - print("vocab_size:", vocab_size) tone_size = None if self.tones_dict: with open(self.tones_dict, "r") as f: tone_id = [line.strip().split() for line in f.readlines()] tone_size = len(tone_id) - print("tone_size:", tone_size) spk_num = None if self.speaker_dict: with open(self.speaker_dict, 'rt') as f: spk_id = [line.strip().split() for line in f.readlines()] spk_num = len(spk_id) - print("spk_num:", spk_num) # frontend if lang == 'zh': @@ -278,7 +288,6 @@ class TTSExecutor(BaseExecutor): elif lang == 'en': self.frontend = English(phone_vocab_path=self.phones_dict) - print("frontend done!") # acoustic model odim = self.am_config.n_mels @@ -311,7 +320,6 @@ class TTSExecutor(BaseExecutor): am_normalizer = ZScore(am_mu, am_std) self.am_inference = am_inference_class(am_normalizer, am) self.am_inference.eval() - print("acoustic model done!") # vocoder # model: {model_name}_{dataset} @@ -334,7 +342,6 @@ class TTSExecutor(BaseExecutor): voc_normalizer = ZScore(voc_mu, voc_std) self.voc_inference = voc_inference_class(voc_normalizer, voc) self.voc_inference.eval() - print("voc done!") def 
preprocess(self, input: Any, *args, **kwargs): """ @@ -375,7 +382,7 @@ class TTSExecutor(BaseExecutor): text, merge_sentences=merge_sentences) phone_ids = input_ids["phone_ids"] else: - print("lang should in {'zh', 'en'}!") + logger.error("lang should in {'zh', 'en'}!") self.frontend_time = time.time() - frontend_st self.am_time = 0 diff --git a/paddlespeech/cli/vector/infer.py b/paddlespeech/cli/vector/infer.py index 4bc8e135ad1226f41455d62f392099d283b53d08..48ca1f98dedb1ba1caa454720a5211bacddb7ad9 100644 --- a/paddlespeech/cli/vector/infer.py +++ b/paddlespeech/cli/vector/infer.py @@ -117,7 +117,7 @@ class VectorExecutor(BaseExecutor): # stage 2: read the input data and store them as a list task_source = self.get_input_source(parser_args.input) - logger.info(f"task source: {task_source}") + logger.debug(f"task source: {task_source}") # stage 3: process the audio one by one # we do action according the task type @@ -127,13 +127,13 @@ class VectorExecutor(BaseExecutor): try: # extract the speaker audio embedding if parser_args.task == "spk": - logger.info("do vector spk task") + logger.debug("do vector spk task") res = self(input_, model, sample_rate, config, ckpt_path, device) task_result[id_] = res elif parser_args.task == "score": - logger.info("do vector score task") - logger.info(f"input content {input_}") + logger.debug("do vector score task") + logger.debug(f"input content {input_}") if len(input_.split()) != 2: logger.error( f"vector score task input {input_} wav num is not two," @@ -142,7 +142,7 @@ class VectorExecutor(BaseExecutor): # get the enroll and test embedding enroll_audio, test_audio = input_.split() - logger.info( + logger.debug( f"score task, enroll audio: {enroll_audio}, test audio: {test_audio}" ) enroll_embedding = self(enroll_audio, model, sample_rate, @@ -158,8 +158,8 @@ class VectorExecutor(BaseExecutor): has_exceptions = True task_result[id_] = f'{e.__class__.__name__}: {e}' - logger.info("task result as follows: ") - logger.info(f"{task_result}") + logger.debug("task result as follows: ") + logger.debug(f"{task_result}") # stage 4: process the all the task results self.process_task_results(parser_args.input, task_result, @@ -207,7 +207,7 @@ class VectorExecutor(BaseExecutor): """ if not hasattr(self, "score_func"): self.score_func = paddle.nn.CosineSimilarity(axis=0) - logger.info("create the cosine score function ") + logger.debug("create the cosine score function ") score = self.score_func( paddle.to_tensor(enroll_embedding), @@ -244,7 +244,7 @@ class VectorExecutor(BaseExecutor): sys.exit(-1) # stage 1: set the paddle runtime host device - logger.info(f"device type: {device}") + logger.debug(f"device type: {device}") paddle.device.set_device(device) # stage 2: read the specific pretrained model @@ -283,7 +283,7 @@ class VectorExecutor(BaseExecutor): # stage 0: avoid to init the mode again self.task = task if hasattr(self, "model"): - logger.info("Model has been initialized") + logger.debug("Model has been initialized") return # stage 1: get the model and config path @@ -294,7 +294,7 @@ class VectorExecutor(BaseExecutor): sample_rate_str = "16k" if sample_rate == 16000 else "8k" tag = model_type + "-" + sample_rate_str self.task_resource.set_task_model(tag, version=None) - logger.info(f"load the pretrained model: {tag}") + logger.debug(f"load the pretrained model: {tag}") # get the model from the pretrained list # we download the pretrained model and store it in the res_path self.res_path = self.task_resource.res_dir @@ -312,19 +312,19 @@ class 
VectorExecutor(BaseExecutor): self.res_path = os.path.dirname( os.path.dirname(os.path.abspath(self.cfg_path))) - logger.info(f"start to read the ckpt from {self.ckpt_path}") - logger.info(f"read the config from {self.cfg_path}") - logger.info(f"get the res path {self.res_path}") + logger.debug(f"start to read the ckpt from {self.ckpt_path}") + logger.debug(f"read the config from {self.cfg_path}") + logger.debug(f"get the res path {self.res_path}") # stage 2: read and config and init the model body self.config = CfgNode(new_allowed=True) self.config.merge_from_file(self.cfg_path) # stage 3: get the model name to instance the model network with dynamic_import - logger.info("start to dynamic import the model class") + logger.debug("start to dynamic import the model class") model_name = model_type[:model_type.rindex('_')] model_class = self.task_resource.get_model_class(model_name) - logger.info(f"model name {model_name}") + logger.debug(f"model name {model_name}") model_conf = self.config.model backbone = model_class(**model_conf) model = SpeakerIdetification( @@ -333,11 +333,11 @@ class VectorExecutor(BaseExecutor): self.model.eval() # stage 4: load the model parameters - logger.info("start to set the model parameters to model") + logger.debug("start to set the model parameters to model") model_dict = paddle.load(self.ckpt_path) self.model.set_state_dict(model_dict) - logger.info("create the model instance success") + logger.debug("create the model instance success") @paddle.no_grad() def infer(self, model_type: str): @@ -349,14 +349,14 @@ class VectorExecutor(BaseExecutor): # stage 0: get the feat and length from _inputs feats = self._inputs["feats"] lengths = self._inputs["lengths"] - logger.info("start to do backbone network model forward") - logger.info( + logger.debug("start to do backbone network model forward") + logger.debug( f"feats shape:{feats.shape}, lengths shape: {lengths.shape}") # stage 1: get the audio embedding # embedding from (1, emb_size, 1) -> (emb_size) embedding = self.model.backbone(feats, lengths).squeeze().numpy() - logger.info(f"embedding size: {embedding.shape}") + logger.debug(f"embedding size: {embedding.shape}") # stage 2: put the embedding and dim info to _outputs property # the embedding type is numpy.array @@ -380,12 +380,13 @@ class VectorExecutor(BaseExecutor): """ audio_file = input_file if isinstance(audio_file, (str, os.PathLike)): - logger.info(f"Preprocess audio file: {audio_file}") + logger.debug(f"Preprocess audio file: {audio_file}") # stage 1: load the audio sample points # Note: this process must match the training process waveform, sr = load_audio(audio_file) - logger.info(f"load the audio sample points, shape is: {waveform.shape}") + logger.debug( + f"load the audio sample points, shape is: {waveform.shape}") # stage 2: get the audio feat # Note: Now we only support fbank feature @@ -396,9 +397,9 @@ class VectorExecutor(BaseExecutor): n_mels=self.config.n_mels, window_size=self.config.window_size, hop_length=self.config.hop_size) - logger.info(f"extract the audio feat, shape is: {feat.shape}") + logger.debug(f"extract the audio feat, shape is: {feat.shape}") except Exception as e: - logger.info(f"feat occurs exception {e}") + logger.debug(f"feat occurs exception {e}") sys.exit(-1) feat = paddle.to_tensor(feat).unsqueeze(0) @@ -411,11 +412,11 @@ class VectorExecutor(BaseExecutor): # stage 4: store the feat and length in the _inputs, # which will be used in other function - logger.info(f"feats shape: {feat.shape}") + logger.debug(f"feats shape: 
{feat.shape}") self._inputs["feats"] = feat self._inputs["lengths"] = lengths - logger.info("audio extract the feat success") + logger.debug("audio extract the feat success") def _check(self, audio_file: str, sample_rate: int): """Check if the model sample match the audio sample rate @@ -441,7 +442,7 @@ class VectorExecutor(BaseExecutor): logger.error("Please input the right audio file path") return False - logger.info("checking the aduio file format......") + logger.debug("checking the aduio file format......") try: audio, audio_sample_rate = soundfile.read( audio_file, dtype="float32", always_2d=True) @@ -458,7 +459,7 @@ class VectorExecutor(BaseExecutor): ") return False - logger.info(f"The sample rate is {audio_sample_rate}") + logger.debug(f"The sample rate is {audio_sample_rate}") if audio_sample_rate != self.sample_rate: logger.error("The sample rate of the input file is not {}.\n \ @@ -468,6 +469,6 @@ class VectorExecutor(BaseExecutor): ".format(self.sample_rate, self.sample_rate)) sys.exit(-1) else: - logger.info("The audio file format is right") + logger.debug("The audio file format is right") return True diff --git a/paddlespeech/resource/resource.py b/paddlespeech/resource/resource.py index 70f12b64c2dc5bbf6ef508b41872e0504855d6fb..8e9914b2e13912d34413f92ff042cd1f3cbd95d0 100644 --- a/paddlespeech/resource/resource.py +++ b/paddlespeech/resource/resource.py @@ -60,6 +60,7 @@ class CommonTaskResource: def set_task_model(self, model_tag: str, model_type: int=0, + skip_download: bool=False, version: Optional[str]=None): """Set model tag and version of current task. @@ -83,16 +84,18 @@ class CommonTaskResource: self.version = version self.res_dict = self.pretrained_models[model_tag][version] self._format_path(self.res_dict) - self.res_dir = self._fetch(self.res_dict, - self._get_model_dir(model_type)) + if not skip_download: + self.res_dir = self._fetch(self.res_dict, + self._get_model_dir(model_type)) else: assert self.task == 'tts', 'Vocoder will only be used in tts task.' 
self.voc_model_tag = model_tag self.voc_version = version self.voc_res_dict = self.pretrained_models[model_tag][version] self._format_path(self.voc_res_dict) - self.voc_res_dir = self._fetch(self.voc_res_dict, - self._get_model_dir(model_type)) + if not skip_download: + self.voc_res_dir = self._fetch(self.voc_res_dict, + self._get_model_dir(model_type)) @staticmethod def get_model_class(model_name) -> List[object]: diff --git a/paddlespeech/s2t/exps/deepspeech2/bin/export.py b/paddlespeech/s2t/exps/deepspeech2/bin/export.py index 049e7b688bc595e0e1eace1d0f1179b7c4e5f8ca..8acd46dfce48270f4fe6b0c402ecd91b693f9344 100644 --- a/paddlespeech/s2t/exps/deepspeech2/bin/export.py +++ b/paddlespeech/s2t/exps/deepspeech2/bin/export.py @@ -35,12 +35,6 @@ if __name__ == "__main__": # save jit model to parser.add_argument( "--export_path", type=str, help="path of the jit model to save") - parser.add_argument( - '--nxpu', - type=int, - default=0, - choices=[0, 1], - help="if nxpu == 0 and ngpu == 0, use cpu.") args = parser.parse_args() print_arguments(args) diff --git a/paddlespeech/s2t/exps/deepspeech2/bin/test.py b/paddlespeech/s2t/exps/deepspeech2/bin/test.py index a9828f6e71c8b2303f6ce70948d1175d274c4a77..030168a9ad510f9e90d947b7fa2fed52e697a871 100644 --- a/paddlespeech/s2t/exps/deepspeech2/bin/test.py +++ b/paddlespeech/s2t/exps/deepspeech2/bin/test.py @@ -35,12 +35,6 @@ if __name__ == "__main__": # save asr result to parser.add_argument( "--result_file", type=str, help="path of save the asr result") - parser.add_argument( - '--nxpu', - type=int, - default=0, - choices=[0, 1], - help="if nxpu == 0 and ngpu == 0, use cpu.") args = parser.parse_args() print_arguments(args, globals()) diff --git a/paddlespeech/s2t/exps/deepspeech2/bin/test_export.py b/paddlespeech/s2t/exps/deepspeech2/bin/test_export.py index 8db081e7bbb79c18666432743b53c9c249d86063..d7a9402b9182764ffaa80f22861b08756f61d275 100644 --- a/paddlespeech/s2t/exps/deepspeech2/bin/test_export.py +++ b/paddlespeech/s2t/exps/deepspeech2/bin/test_export.py @@ -38,12 +38,6 @@ if __name__ == "__main__": #load jit model from parser.add_argument( "--export_path", type=str, help="path of the jit model to save") - parser.add_argument( - '--nxpu', - type=int, - default=0, - choices=[0, 1], - help="if nxpu == 0 and ngpu == 0, use cpu.") parser.add_argument( "--enable-auto-log", action="store_true", help="use auto log") args = parser.parse_args() diff --git a/paddlespeech/s2t/exps/deepspeech2/bin/train.py b/paddlespeech/s2t/exps/deepspeech2/bin/train.py index fee7079d9abb4d514e756a1ca4bcd8cd4449bc66..2c9942f9b35af051471004c6b09312c619fa87c2 100644 --- a/paddlespeech/s2t/exps/deepspeech2/bin/train.py +++ b/paddlespeech/s2t/exps/deepspeech2/bin/train.py @@ -31,12 +31,6 @@ def main(config, args): if __name__ == "__main__": parser = default_argument_parser() - parser.add_argument( - '--nxpu', - type=int, - default=0, - choices=[0, 1], - help="if nxpu == 0 and ngpu == 0, use cpu.") args = parser.parse_args() print_arguments(args, globals()) diff --git a/paddlespeech/s2t/exps/deepspeech2/model.py b/paddlespeech/s2t/exps/deepspeech2/model.py index 511997a7c1cd637366f1f8fb9cc7b73c51c5acaa..7ab8cf853adb25ee03c8b3f4f41743d5ed4e2cfb 100644 --- a/paddlespeech/s2t/exps/deepspeech2/model.py +++ b/paddlespeech/s2t/exps/deepspeech2/model.py @@ -23,7 +23,7 @@ import paddle from paddle import distributed as dist from paddle import inference -from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer +from paddlespeech.audio.text.text_featurizer 
import TextFeaturizer from paddlespeech.s2t.io.dataloader import BatchDataLoader from paddlespeech.s2t.models.ds2 import DeepSpeech2InferModel from paddlespeech.s2t.models.ds2 import DeepSpeech2Model diff --git a/paddlespeech/s2t/exps/u2/bin/test_wav.py b/paddlespeech/s2t/exps/u2/bin/test_wav.py index 86c3db89f82878e093d71ec9a56740fb05767b7b..887ec7a6d20dda6e802c3aade15a381633efac16 100644 --- a/paddlespeech/s2t/exps/u2/bin/test_wav.py +++ b/paddlespeech/s2t/exps/u2/bin/test_wav.py @@ -20,10 +20,10 @@ import paddle import soundfile from yacs.config import CfgNode +from paddlespeech.audio.transform.transformation import Transformation from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer from paddlespeech.s2t.models.u2 import U2Model from paddlespeech.s2t.training.cli import default_argument_parser -from paddlespeech.s2t.transform.transformation import Transformation from paddlespeech.s2t.utils.log import Log from paddlespeech.s2t.utils.utility import UpdateConfig logger = Log(__name__).getlog() diff --git a/paddlespeech/s2t/exps/u2/model.py b/paddlespeech/s2t/exps/u2/model.py index efcc9629fdbf63981cfdc4cc5b91693e5f3a85ee..cdad3b8f7352f35bb0796063ccb1ebe6d3acb9e4 100644 --- a/paddlespeech/s2t/exps/u2/model.py +++ b/paddlespeech/s2t/exps/u2/model.py @@ -26,6 +26,8 @@ from paddle import distributed as dist from paddlespeech.s2t.frontend.featurizer import TextFeaturizer from paddlespeech.s2t.io.dataloader import BatchDataLoader +from paddlespeech.s2t.io.dataloader import StreamDataLoader +from paddlespeech.s2t.io.dataloader import DataLoaderFactory from paddlespeech.s2t.models.u2 import U2Model from paddlespeech.s2t.training.optimizer import OptimizerFactory from paddlespeech.s2t.training.reporter import ObsScope @@ -106,7 +108,8 @@ class U2Trainer(Trainer): @paddle.no_grad() def valid(self): self.model.eval() - logger.info(f"Valid Total Examples: {len(self.valid_loader.dataset)}") + if not self.use_streamdata: + logger.info(f"Valid Total Examples: {len(self.valid_loader.dataset)}") valid_losses = defaultdict(list) num_seen_utts = 1 total_loss = 0.0 @@ -132,7 +135,8 @@ class U2Trainer(Trainer): msg = f"Valid: Rank: {dist.get_rank()}, " msg += "epoch: {}, ".format(self.epoch) msg += "step: {}, ".format(self.iteration) - msg += "batch: {}/{}, ".format(i + 1, len(self.valid_loader)) + if not self.use_streamdata: + msg += "batch: {}/{}, ".format(i + 1, len(self.valid_loader)) msg += ', '.join('{}: {:>.6f}'.format(k, v) for k, v in valid_dump.items()) logger.info(msg) @@ -152,7 +156,8 @@ class U2Trainer(Trainer): self.before_train() - logger.info(f"Train Total Examples: {len(self.train_loader.dataset)}") + if not self.use_streamdata: + logger.info(f"Train Total Examples: {len(self.train_loader.dataset)}") while self.epoch < self.config.n_epoch: with Timer("Epoch-Train Time Cost: {}"): self.model.train() @@ -170,7 +175,8 @@ class U2Trainer(Trainer): self.train_batch(batch_index, batch, msg) self.after_train_batch() report('iter', batch_index + 1) - report('total', len(self.train_loader)) + if not self.use_streamdata: + report('total', len(self.train_loader)) report('reader_cost', dataload_time) observation['batch_cost'] = observation[ 'reader_cost'] + observation['step_cost'] @@ -191,7 +197,6 @@ class U2Trainer(Trainer): except Exception as e: logger.error(e) raise e - with Timer("Eval Time Cost: {}"): total_loss, num_seen_utts = self.valid() if dist.get_world_size() > 1: @@ -218,92 +223,16 @@ class U2Trainer(Trainer): def setup_dataloader(self): config = 
self.config.clone() - + self.use_streamdata = config.get("use_stream_data", False) if self.train: - # train/valid dataset, return token ids - self.train_loader = BatchDataLoader( - json_file=config.train_manifest, - train_mode=True, - sortagrad=config.sortagrad, - batch_size=config.batch_size, - maxlen_in=config.maxlen_in, - maxlen_out=config.maxlen_out, - minibatches=config.minibatches, - mini_batch_size=self.args.ngpu, - batch_count=config.batch_count, - batch_bins=config.batch_bins, - batch_frames_in=config.batch_frames_in, - batch_frames_out=config.batch_frames_out, - batch_frames_inout=config.batch_frames_inout, - preprocess_conf=config.preprocess_config, - n_iter_processes=config.num_workers, - subsampling_factor=1, - num_encs=1, - dist_sampler=config.get('dist_sampler', False), - shortest_first=False) - - self.valid_loader = BatchDataLoader( - json_file=config.dev_manifest, - train_mode=False, - sortagrad=False, - batch_size=config.batch_size, - maxlen_in=float('inf'), - maxlen_out=float('inf'), - minibatches=0, - mini_batch_size=self.args.ngpu, - batch_count='auto', - batch_bins=0, - batch_frames_in=0, - batch_frames_out=0, - batch_frames_inout=0, - preprocess_conf=config.preprocess_config, - n_iter_processes=config.num_workers, - subsampling_factor=1, - num_encs=1, - dist_sampler=config.get('dist_sampler', False), - shortest_first=False) + self.train_loader = DataLoaderFactory.get_dataloader('train', config, self.args) + self.valid_loader = DataLoaderFactory.get_dataloader('valid', config, self.args) logger.info("Setup train/valid Dataloader!") else: decode_batch_size = config.get('decode', dict()).get( 'decode_batch_size', 1) - # test dataset, return raw text - self.test_loader = BatchDataLoader( - json_file=config.test_manifest, - train_mode=False, - sortagrad=False, - batch_size=decode_batch_size, - maxlen_in=float('inf'), - maxlen_out=float('inf'), - minibatches=0, - mini_batch_size=1, - batch_count='auto', - batch_bins=0, - batch_frames_in=0, - batch_frames_out=0, - batch_frames_inout=0, - preprocess_conf=config.preprocess_config, - n_iter_processes=1, - subsampling_factor=1, - num_encs=1) - - self.align_loader = BatchDataLoader( - json_file=config.test_manifest, - train_mode=False, - sortagrad=False, - batch_size=decode_batch_size, - maxlen_in=float('inf'), - maxlen_out=float('inf'), - minibatches=0, - mini_batch_size=1, - batch_count='auto', - batch_bins=0, - batch_frames_in=0, - batch_frames_out=0, - batch_frames_inout=0, - preprocess_conf=config.preprocess_config, - n_iter_processes=1, - subsampling_factor=1, - num_encs=1) + self.test_loader = DataLoaderFactory.get_dataloader('test', config, self.args) + self.align_loader = DataLoaderFactory.get_dataloader('align', config, self.args) logger.info("Setup test/align Dataloader!") def setup_model(self): @@ -452,7 +381,8 @@ class U2Tester(U2Trainer): def test(self): assert self.args.result_file self.model.eval() - logger.info(f"Test Total Examples: {len(self.test_loader.dataset)}") + if not self.use_streamdata: + logger.info(f"Test Total Examples: {len(self.test_loader.dataset)}") stride_ms = self.config.stride_ms error_rate_type = None diff --git a/paddlespeech/s2t/exps/u2_kaldi/model.py b/paddlespeech/s2t/exps/u2_kaldi/model.py index bc995977ada577770612f99d05387ed0bb87d39e..cb015c116c8563c162a2ed5beaa2b89be91ab348 100644 --- a/paddlespeech/s2t/exps/u2_kaldi/model.py +++ b/paddlespeech/s2t/exps/u2_kaldi/model.py @@ -25,7 +25,7 @@ from paddle import distributed as dist from paddlespeech.s2t.frontend.featurizer import 
TextFeaturizer from paddlespeech.s2t.frontend.utility import load_dict -from paddlespeech.s2t.io.dataloader import BatchDataLoader +from paddlespeech.s2t.io.dataloader import DataLoaderFactory from paddlespeech.s2t.models.u2 import U2Model from paddlespeech.s2t.training.optimizer import OptimizerFactory from paddlespeech.s2t.training.scheduler import LRSchedulerFactory @@ -104,7 +104,8 @@ class U2Trainer(Trainer): @paddle.no_grad() def valid(self): self.model.eval() - logger.info(f"Valid Total Examples: {len(self.valid_loader.dataset)}") + if not self.use_streamdata: + logger.info(f"Valid Total Examples: {len(self.valid_loader.dataset)}") valid_losses = defaultdict(list) num_seen_utts = 1 total_loss = 0.0 @@ -131,7 +132,8 @@ class U2Trainer(Trainer): msg = f"Valid: Rank: {dist.get_rank()}, " msg += "epoch: {}, ".format(self.epoch) msg += "step: {}, ".format(self.iteration) - msg += "batch: {}/{}, ".format(i + 1, len(self.valid_loader)) + if not self.use_streamdata: + msg += "batch: {}/{}, ".format(i + 1, len(self.valid_loader)) msg += ', '.join('{}: {:>.6f}'.format(k, v) for k, v in valid_dump.items()) logger.info(msg) @@ -150,8 +152,8 @@ class U2Trainer(Trainer): # paddle.jit.save(script_model, script_model_path) self.before_train() - - logger.info(f"Train Total Examples: {len(self.train_loader.dataset)}") + if not self.use_streamdata: + logger.info(f"Train Total Examples: {len(self.train_loader.dataset)}") while self.epoch < self.config.n_epoch: with Timer("Epoch-Train Time Cost: {}"): self.model.train() @@ -162,7 +164,8 @@ class U2Trainer(Trainer): msg = "Train: Rank: {}, ".format(dist.get_rank()) msg += "epoch: {}, ".format(self.epoch) msg += "step: {}, ".format(self.iteration) - msg += "batch : {}/{}, ".format(batch_index + 1, + if not self.use_streamdata: + msg += "batch : {}/{}, ".format(batch_index + 1, len(self.train_loader)) msg += "lr: {:>.8f}, ".format(self.lr_scheduler()) msg += "data time: {:>.3f}s, ".format(dataload_time) @@ -198,87 +201,23 @@ class U2Trainer(Trainer): self.new_epoch() def setup_dataloader(self): - config = self.config.clone() - # train/valid dataset, return token ids - self.train_loader = BatchDataLoader( - json_file=config.train_manifest, - train_mode=True, - sortagrad=False, - batch_size=config.batch_size, - maxlen_in=float('inf'), - maxlen_out=float('inf'), - minibatches=0, - mini_batch_size=self.args.ngpu, - batch_count='auto', - batch_bins=0, - batch_frames_in=0, - batch_frames_out=0, - batch_frames_inout=0, - preprocess_conf=config.preprocess_config, - n_iter_processes=config.num_workers, - subsampling_factor=1, - num_encs=1) - - self.valid_loader = BatchDataLoader( - json_file=config.dev_manifest, - train_mode=False, - sortagrad=False, - batch_size=config.batch_size, - maxlen_in=float('inf'), - maxlen_out=float('inf'), - minibatches=0, - mini_batch_size=self.args.ngpu, - batch_count='auto', - batch_bins=0, - batch_frames_in=0, - batch_frames_out=0, - batch_frames_inout=0, - preprocess_conf=None, - n_iter_processes=config.num_workers, - subsampling_factor=1, - num_encs=1) - - decode_batch_size = config.get('decode', dict()).get( - 'decode_batch_size', 1) - # test dataset, return raw text - self.test_loader = BatchDataLoader( - json_file=config.test_manifest, - train_mode=False, - sortagrad=False, - batch_size=decode_batch_size, - maxlen_in=float('inf'), - maxlen_out=float('inf'), - minibatches=0, - mini_batch_size=1, - batch_count='auto', - batch_bins=0, - batch_frames_in=0, - batch_frames_out=0, - batch_frames_inout=0, - preprocess_conf=None, - 
n_iter_processes=1, - subsampling_factor=1, - num_encs=1) - - self.align_loader = BatchDataLoader( - json_file=config.test_manifest, - train_mode=False, - sortagrad=False, - batch_size=decode_batch_size, - maxlen_in=float('inf'), - maxlen_out=float('inf'), - minibatches=0, - mini_batch_size=1, - batch_count='auto', - batch_bins=0, - batch_frames_in=0, - batch_frames_out=0, - batch_frames_inout=0, - preprocess_conf=None, - n_iter_processes=1, - subsampling_factor=1, - num_encs=1) - logger.info("Setup train/valid/test/align Dataloader!") + self.use_streamdata = config.get("use_stream_data", False) + if self.train: + config = self.config.clone() + self.train_loader = DataLoaderFactory.get_dataloader('train', config, self.args) + config = self.config.clone() + config['preprocess_config'] = None + self.valid_loader = DataLoaderFactory.get_dataloader('valid', config, self.args) + logger.info("Setup train/valid Dataloader!") + else: + config = self.config.clone() + config['preprocess_config'] = None + self.test_loader = DataLoaderFactory.get_dataloader('test', config, self.args) + config = self.config.clone() + config['preprocess_config'] = None + self.align_loader = DataLoaderFactory.get_dataloader('align', config, self.args) + logger.info("Setup test/align Dataloader!") + def setup_model(self): config = self.config @@ -406,7 +345,8 @@ class U2Tester(U2Trainer): def test(self): assert self.args.result_file self.model.eval() - logger.info(f"Test Total Examples: {len(self.test_loader.dataset)}") + if not self.use_streamdata: + logger.info(f"Test Total Examples: {len(self.test_loader.dataset)}") stride_ms = self.config.stride_ms error_rate_type = None diff --git a/paddlespeech/s2t/exps/u2_st/model.py b/paddlespeech/s2t/exps/u2_st/model.py index 6a32eda7717cc4077a90eb561e0f01ac8a212f51..60382543527840e4516fcba967f2e5966a6aab36 100644 --- a/paddlespeech/s2t/exps/u2_st/model.py +++ b/paddlespeech/s2t/exps/u2_st/model.py @@ -25,7 +25,7 @@ import paddle from paddle import distributed as dist from paddlespeech.s2t.frontend.featurizer import TextFeaturizer -from paddlespeech.s2t.io.dataloader import BatchDataLoader +from paddlespeech.s2t.io.dataloader import DataLoaderFactory from paddlespeech.s2t.models.u2_st import U2STModel from paddlespeech.s2t.training.optimizer import OptimizerFactory from paddlespeech.s2t.training.reporter import ObsScope @@ -120,7 +120,8 @@ class U2STTrainer(Trainer): @paddle.no_grad() def valid(self): self.model.eval() - logger.info(f"Valid Total Examples: {len(self.valid_loader.dataset)}") + if not self.use_streamdata: + logger.info(f"Valid Total Examples: {len(self.valid_loader.dataset)}") valid_losses = defaultdict(list) num_seen_utts = 1 total_loss = 0.0 @@ -153,7 +154,8 @@ class U2STTrainer(Trainer): msg = f"Valid: Rank: {dist.get_rank()}, " msg += "epoch: {}, ".format(self.epoch) msg += "step: {}, ".format(self.iteration) - msg += "batch: {}/{}, ".format(i + 1, len(self.valid_loader)) + if not self.use_streamdata: + msg += "batch: {}/{}, ".format(i + 1, len(self.valid_loader)) msg += ', '.join('{}: {:>.6f}'.format(k, v) for k, v in valid_dump.items()) logger.info(msg) @@ -172,8 +174,8 @@ class U2STTrainer(Trainer): # paddle.jit.save(script_model, script_model_path) self.before_train() - - logger.info(f"Train Total Examples: {len(self.train_loader.dataset)}") + if not self.use_streamdata: + logger.info(f"Train Total Examples: {len(self.train_loader.dataset)}") while self.epoch < self.config.n_epoch: with Timer("Epoch-Train Time Cost: {}"): self.model.train() @@ -191,7 +193,8 
@@ class U2STTrainer(Trainer): self.train_batch(batch_index, batch, msg) self.after_train_batch() report('iter', batch_index + 1) - report('total', len(self.train_loader)) + if not self.use_streamdata: + report('total', len(self.train_loader)) report('reader_cost', dataload_time) observation['batch_cost'] = observation[ 'reader_cost'] + observation['step_cost'] @@ -241,79 +244,18 @@ class U2STTrainer(Trainer): load_transcript = True if config.model_conf.asr_weight > 0 else False + config = self.config.clone() + config['load_transcript'] = load_transcript + self.use_streamdata = config.get("use_stream_data", False) if self.train: - # train/valid dataset, return token ids - self.train_loader = BatchDataLoader( - json_file=config.train_manifest, - train_mode=True, - sortagrad=False, - batch_size=config.batch_size, - maxlen_in=config.maxlen_in, - maxlen_out=config.maxlen_out, - minibatches=0, - mini_batch_size=1, - batch_count='auto', - batch_bins=0, - batch_frames_in=0, - batch_frames_out=0, - batch_frames_inout=0, - preprocess_conf=config. - preprocess_config, # aug will be off when train_mode=False - n_iter_processes=config.num_workers, - subsampling_factor=1, - load_aux_output=load_transcript, - num_encs=1, - dist_sampler=True) - - self.valid_loader = BatchDataLoader( - json_file=config.dev_manifest, - train_mode=False, - sortagrad=False, - batch_size=config.batch_size, - maxlen_in=float('inf'), - maxlen_out=float('inf'), - minibatches=0, - mini_batch_size=1, - batch_count='auto', - batch_bins=0, - batch_frames_in=0, - batch_frames_out=0, - batch_frames_inout=0, - preprocess_conf=config. - preprocess_config, # aug will be off when train_mode=False - n_iter_processes=config.num_workers, - subsampling_factor=1, - load_aux_output=load_transcript, - num_encs=1, - dist_sampler=False) + self.train_loader = DataLoaderFactory.get_dataloader('train', config, self.args) + self.valid_loader = DataLoaderFactory.get_dataloader('valid', config, self.args) logger.info("Setup train/valid Dataloader!") else: - # test dataset, return raw text - decode_batch_size = config.get('decode', dict()).get( - 'decode_batch_size', 1) - self.test_loader = BatchDataLoader( - json_file=config.test_manifest, - train_mode=False, - sortagrad=False, - batch_size=decode_batch_size, - maxlen_in=float('inf'), - maxlen_out=float('inf'), - minibatches=0, - mini_batch_size=1, - batch_count='auto', - batch_bins=0, - batch_frames_in=0, - batch_frames_out=0, - batch_frames_inout=0, - preprocess_conf=config. 
- preprocess_config, # aug will be off when train_mode=False - n_iter_processes=config.num_workers, - subsampling_factor=1, - num_encs=1, - dist_sampler=False) - + self.test_loader = DataLoaderFactory.get_dataloader('test', config, self.args) logger.info("Setup test Dataloader!") + def setup_model(self): config = self.config model_conf = config @@ -468,7 +410,8 @@ class U2STTester(U2STTrainer): def test(self): assert self.args.result_file self.model.eval() - logger.info(f"Test Total Examples: {len(self.test_loader.dataset)}") + if not self.use_streamdata: + logger.info(f"Test Total Examples: {len(self.test_loader.dataset)}") decode_cfg = self.config.decode bleu_func = bleu_score.char_bleu if decode_cfg.error_rate_type == 'char-bleu' else bleu_score.bleu diff --git a/paddlespeech/s2t/frontend/augmentor/spec_augment.py b/paddlespeech/s2t/frontend/augmentor/spec_augment.py index e91cfdce42b621934fa25b69cc629ad03c7fec34..380712851e9b0d5fcb031366da91b7233e1c9ec5 100644 --- a/paddlespeech/s2t/frontend/augmentor/spec_augment.py +++ b/paddlespeech/s2t/frontend/augmentor/spec_augment.py @@ -16,7 +16,6 @@ import random import numpy as np from PIL import Image -from PIL.Image import BICUBIC from paddlespeech.s2t.frontend.augmentor.base import AugmentorBase from paddlespeech.s2t.utils.log import Log @@ -164,9 +163,9 @@ class SpecAugmentor(AugmentorBase): window) + 1 # 1 ... t - 1 left = Image.fromarray(x[:center]).resize((x.shape[1], warped), - BICUBIC) + Image.BICUBIC) right = Image.fromarray(x[center:]).resize((x.shape[1], t - warped), - BICUBIC) + Image.BICUBIC) if self.inplace: x[:warped] = left x[warped:] = right diff --git a/paddlespeech/s2t/frontend/featurizer/text_featurizer.py b/paddlespeech/s2t/frontend/featurizer/text_featurizer.py index 0c0fa5e2f63b05387cd6ce9af6fb0331c400cfb8..982c6b8fe47e51f3f94de8f47b9a4b6110544052 100644 --- a/paddlespeech/s2t/frontend/featurizer/text_featurizer.py +++ b/paddlespeech/s2t/frontend/featurizer/text_featurizer.py @@ -226,10 +226,10 @@ class TextFeaturizer(): sos_id = vocab_list.index(SOS) if SOS in vocab_list else -1 space_id = vocab_list.index(SPACE) if SPACE in vocab_list else -1 - logger.info(f"BLANK id: {blank_id}") - logger.info(f"UNK id: {unk_id}") - logger.info(f"EOS id: {eos_id}") - logger.info(f"SOS id: {sos_id}") - logger.info(f"SPACE id: {space_id}") - logger.info(f"MASKCTC id: {maskctc_id}") + logger.debug(f"BLANK id: {blank_id}") + logger.debug(f"UNK id: {unk_id}") + logger.debug(f"EOS id: {eos_id}") + logger.debug(f"SOS id: {sos_id}") + logger.debug(f"SPACE id: {space_id}") + logger.debug(f"MASKCTC id: {maskctc_id}") return token2id, id2token, vocab_list, unk_id, eos_id, blank_id diff --git a/paddlespeech/s2t/io/dataloader.py b/paddlespeech/s2t/io/dataloader.py index 55aa13ff10a1e34b3063a760544fdd33b150f61f..831830241dc77cc97977f5c4404acb98e7f09df5 100644 --- a/paddlespeech/s2t/io/dataloader.py +++ b/paddlespeech/s2t/io/dataloader.py @@ -18,6 +18,7 @@ from typing import Text import jsonlines import numpy as np +import paddle from paddle.io import BatchSampler from paddle.io import DataLoader from paddle.io import DistributedBatchSampler @@ -28,7 +29,11 @@ from paddlespeech.s2t.io.dataset import TransformDataset from paddlespeech.s2t.io.reader import LoadInputsAndTargets from paddlespeech.s2t.utils.log import Log -__all__ = ["BatchDataLoader"] +import paddlespeech.audio.streamdata as streamdata +from paddlespeech.audio.text.text_featurizer import TextFeaturizer +from yacs.config import CfgNode + +__all__ = ["BatchDataLoader", 
"StreamDataLoader"] logger = Log(__name__).getlog() @@ -56,6 +61,136 @@ def batch_collate(x): """ return x[0] +def read_preprocess_cfg(preprocess_conf_file): + augment_conf = dict() + preprocess_cfg = CfgNode(new_allowed=True) + preprocess_cfg.merge_from_file(preprocess_conf_file) + for idx, process in enumerate(preprocess_cfg["process"]): + opts = dict(process) + process_type = opts.pop("type") + if process_type == 'time_warp': + augment_conf['max_w'] = process['max_time_warp'] + augment_conf['w_inplace'] = process['inplace'] + augment_conf['w_mode'] = process['mode'] + if process_type == 'freq_mask': + augment_conf['max_f'] = process['F'] + augment_conf['num_f_mask'] = process['n_mask'] + augment_conf['f_inplace'] = process['inplace'] + augment_conf['f_replace_with_zero'] = process['replace_with_zero'] + if process_type == 'time_mask': + augment_conf['max_t'] = process['T'] + augment_conf['num_t_mask'] = process['n_mask'] + augment_conf['t_inplace'] = process['inplace'] + augment_conf['t_replace_with_zero'] = process['replace_with_zero'] + return augment_conf + +class StreamDataLoader(): + def __init__(self, + manifest_file: str, + train_mode: bool, + unit_type: str='char', + batch_size: int=0, + preprocess_conf=None, + num_mel_bins=80, + frame_length=25, + frame_shift=10, + dither=0.0, + minlen_in: float=0.0, + maxlen_in: float=float('inf'), + minlen_out: float=0.0, + maxlen_out: float=float('inf'), + resample_rate: int=16000, + shuffle_size: int=10000, + sort_size: int=1000, + n_iter_processes: int=1, + prefetch_factor: int=2, + dist_sampler: bool=False, + cmvn_file="data/mean_std.json", + vocab_filepath='data/lang_char/vocab.txt'): + self.manifest_file = manifest_file + self.train_model = train_mode + self.batch_size = batch_size + self.prefetch_factor = prefetch_factor + self.dist_sampler = dist_sampler + self.n_iter_processes = n_iter_processes + + text_featurizer = TextFeaturizer(unit_type, vocab_filepath) + symbol_table = text_featurizer.vocab_dict + self.feat_dim = num_mel_bins + self.vocab_size = text_featurizer.vocab_size + + augment_conf = read_preprocess_cfg(preprocess_conf) + + # The list of shard + shardlist = [] + with open(manifest_file, "r") as f: + for line in f.readlines(): + shardlist.append(line.strip()) + world_size = 1 + try: + world_size = paddle.distributed.get_world_size() + except Exception as e: + logger.warninig(e) + logger.warninig("can not get world_size using paddle.distributed.get_world_size(), use world_size=1") + assert(len(shardlist) >= world_size, "the length of shard list should >= number of gpus/xpus/...") + + update_n_iter_processes = int(max(min(len(shardlist)/world_size - 1, self.n_iter_processes), 0)) + logger.info(f"update_n_iter_processes {update_n_iter_processes}") + if update_n_iter_processes != self.n_iter_processes: + self.n_iter_processes = update_n_iter_processes + logger.info(f"change nun_workers to {self.n_iter_processes}") + + if self.dist_sampler: + base_dataset = streamdata.DataPipeline( + streamdata.SimpleShardList(shardlist), + streamdata.split_by_node if train_mode else streamdata.placeholder(), + streamdata.split_by_worker, + streamdata.tarfile_to_samples(streamdata.reraise_exception) + ) + else: + base_dataset = streamdata.DataPipeline( + streamdata.SimpleShardList(shardlist), + streamdata.split_by_worker, + streamdata.tarfile_to_samples(streamdata.reraise_exception) + ) + + self.dataset = base_dataset.append_list( + streamdata.audio_tokenize(symbol_table), + streamdata.audio_data_filter(frame_shift=frame_shift, 
max_length=maxlen_in, min_length=minlen_in, token_max_length=maxlen_out, token_min_length=minlen_out), + streamdata.audio_resample(resample_rate=resample_rate), + streamdata.audio_compute_fbank(num_mel_bins=num_mel_bins, frame_length=frame_length, frame_shift=frame_shift, dither=dither), + streamdata.audio_spec_aug(**augment_conf) if train_mode else streamdata.placeholder(), # num_t_mask=2, num_f_mask=2, max_t=40, max_f=30, max_w=80) + streamdata.shuffle(shuffle_size), + streamdata.sort(sort_size=sort_size), + streamdata.batched(batch_size), + streamdata.audio_padding(), + streamdata.audio_cmvn(cmvn_file) + ) + + if paddle.__version__ >= '2.3.2': + self.loader = streamdata.WebLoader( + self.dataset, + num_workers=self.n_iter_processes, + prefetch_factor = self.prefetch_factor, + batch_size=None + ) + else: + self.loader = streamdata.WebLoader( + self.dataset, + num_workers=self.n_iter_processes, + batch_size=None + ) + + def __iter__(self): + return self.loader.__iter__() + + def __call__(self): + return self.__iter__() + + def __len__(self): + logger.info("Stream dataloader does not support calculating the length of the dataset") + return -1 + class BatchDataLoader(): def __init__(self, @@ -199,3 +334,119 @@ class BatchDataLoader(): echo += f"shortest_first: {self.shortest_first}, " echo += f"file: {self.json_file}" return echo + + +class DataLoaderFactory(): + @staticmethod + def get_dataloader(mode: str, config, args): + config = config.clone() + use_streamdata = config.get("use_stream_data", False) + if use_streamdata: + if mode == 'train': + config['manifest'] = config.train_manifest + config['train_mode'] = True + elif mode == 'valid': + config['manifest'] = config.dev_manifest + config['train_mode'] = False + elif mode == 'test' or mode == 'align': + config['manifest'] = config.test_manifest + config['train_mode'] = False + config['dither'] = 0.0 + config['minlen_in'] = 0.0 + config['maxlen_in'] = float('inf') + config['minlen_out'] = 0 + config['maxlen_out'] = float('inf') + config['dist_sampler'] = False + else: + raise KeyError("not valid mode type!!, please input one of 'train, valid, test, align'") + return StreamDataLoader( + manifest_file=config.manifest, + train_mode=config.train_mode, + unit_type=config.unit_type, + preprocess_conf=config.preprocess_config, + batch_size=config.batch_size, + num_mel_bins=config.feat_dim, + frame_length=config.window_ms, + frame_shift=config.stride_ms, + dither=config.dither, + minlen_in=config.minlen_in, + maxlen_in=config.maxlen_in, + minlen_out=config.minlen_out, + maxlen_out=config.maxlen_out, + resample_rate=config.resample_rate, + shuffle_size=config.shuffle_size, + sort_size=config.sort_size, + n_iter_processes=config.num_workers, + prefetch_factor=config.prefetch_factor, + dist_sampler=config.dist_sampler, + cmvn_file=config.cmvn_file, + vocab_filepath=config.vocab_filepath, + ) + else: + if mode == 'train': + config['manifest'] = config.train_manifest + config['train_mode'] = True + config['mini_batch_size'] = args.ngpu + config['subsampling_factor'] = 1 + config['num_encs'] = 1 + elif mode == 'valid': + config['manifest'] = config.dev_manifest + config['train_mode'] = False + config['sortagrad'] = False + config['maxlen_in'] = float('inf') + config['maxlen_out'] = float('inf') + config['minibatches'] = 0 + config['mini_batch_size'] = args.ngpu + config['batch_count'] = 'auto' + config['batch_bins'] = 0 + config['batch_frames_in'] = 0 + config['batch_frames_out'] = 0 + config['batch_frames_inout'] = 0 + config['subsampling_factor'] =
1 + config['num_encs'] = 1 + config['shortest_first'] = False + elif mode == 'test' or mode == 'align': + config['manifest'] = config.test_manifest + config['train_mode'] = False + config['sortagrad'] = False + config['batch_size'] = config.get('decode', dict()).get( + 'decode_batch_size', 1) + config['maxlen_in'] = float('inf') + config['maxlen_out'] = float('inf') + config['minibatches'] = 0 + config['mini_batch_size'] = 1 + config['batch_count'] = 'auto' + config['batch_bins'] = 0 + config['batch_frames_in'] = 0 + config['batch_frames_out'] = 0 + config['batch_frames_inout'] = 0 + config['num_workers'] = 1 + config['subsampling_factor'] = 1 + config['num_encs'] = 1 + config['dist_sampler'] = False + config['shortest_first'] = False + else: + raise KeyError("not valid mode type!!, please input one of 'train, valid, test, align'") + + return BatchDataLoader( + json_file=config.manifest, + train_mode=config.train_mode, + sortagrad=config.sortagrad, + batch_size=config.batch_size, + maxlen_in=config.maxlen_in, + maxlen_out=config.maxlen_out, + minibatches=config.minibatches, + mini_batch_size=config.mini_batch_size, + batch_count=config.batch_count, + batch_bins=config.batch_bins, + batch_frames_in=config.batch_frames_in, + batch_frames_out=config.batch_frames_out, + batch_frames_inout=config.batch_frames_inout, + preprocess_conf=config.preprocess_config, + n_iter_processes=config.num_workers, + subsampling_factor=config.subsampling_factor, + load_aux_output=config.get('load_transcript', None), + num_encs=config.num_encs, + dist_sampler=config.dist_sampler, + shortest_first=config.shortest_first) + diff --git a/paddlespeech/s2t/io/reader.py b/paddlespeech/s2t/io/reader.py index 4e136bdce1d9b5490dadf58ab6359e6430121ced..5e018befb0e439a9f49df05cc9a57bb37dcbd2e7 100644 --- a/paddlespeech/s2t/io/reader.py +++ b/paddlespeech/s2t/io/reader.py @@ -19,7 +19,7 @@ import numpy as np import soundfile from .utility import feat_type -from paddlespeech.s2t.transform.transformation import Transformation +from paddlespeech.audio.transform.transformation import Transformation from paddlespeech.s2t.utils.log import Log # from paddlespeech.s2t.frontend.augmentor.augmentation import AugmentationPipeline as Transformation diff --git a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py index b4b61666f24f0fe67ea85d92565916617d5d20b2..100aca18b7dca8bafb2d2a03ffc7391d15b434f5 100644 --- a/paddlespeech/s2t/models/u2/u2.py +++ b/paddlespeech/s2t/models/u2/u2.py @@ -48,9 +48,9 @@ from paddlespeech.s2t.utils import checkpoint from paddlespeech.s2t.utils import layer_tools from paddlespeech.s2t.utils.ctc_utils import remove_duplicates_and_blank from paddlespeech.s2t.utils.log import Log -from paddlespeech.s2t.utils.tensor_utils import add_sos_eos -from paddlespeech.s2t.utils.tensor_utils import pad_sequence -from paddlespeech.s2t.utils.tensor_utils import th_accuracy +from paddlespeech.audio.utils.tensor_utils import add_sos_eos +from paddlespeech.audio.utils.tensor_utils import pad_sequence +from paddlespeech.audio.utils.tensor_utils import th_accuracy from paddlespeech.s2t.utils.utility import log_add from paddlespeech.s2t.utils.utility import UpdateConfig @@ -827,7 +827,7 @@ class U2Model(U2DecodeModel): # encoder encoder_type = configs.get('encoder', 'transformer') - logger.info(f"U2 Encoder type: {encoder_type}") + logger.debug(f"U2 Encoder type: {encoder_type}") if encoder_type == 'transformer': encoder = TransformerEncoder( input_dim, global_cmvn=global_cmvn, **configs['encoder_conf']) @@ -894,7 
+894,7 @@ class U2Model(U2DecodeModel): if checkpoint_path: infos = checkpoint.Checkpoint().load_parameters( model, checkpoint_path=checkpoint_path) - logger.info(f"checkpoint info: {infos}") + logger.debug(f"checkpoint info: {infos}") layer_tools.summary(model) return model diff --git a/paddlespeech/s2t/models/u2_st/u2_st.py b/paddlespeech/s2t/models/u2_st/u2_st.py index 6447753c50f0f27bbfc3ed87495ec8cd42d79c59..00ded9125d2b6f39ba19f7acdbfaf1bf661aab04 100644 --- a/paddlespeech/s2t/models/u2_st/u2_st.py +++ b/paddlespeech/s2t/models/u2_st/u2_st.py @@ -38,8 +38,8 @@ from paddlespeech.s2t.modules.mask import subsequent_mask from paddlespeech.s2t.utils import checkpoint from paddlespeech.s2t.utils import layer_tools from paddlespeech.s2t.utils.log import Log -from paddlespeech.s2t.utils.tensor_utils import add_sos_eos -from paddlespeech.s2t.utils.tensor_utils import th_accuracy +from paddlespeech.audio.utils.tensor_utils import add_sos_eos +from paddlespeech.audio.utils.tensor_utils import th_accuracy from paddlespeech.s2t.utils.utility import UpdateConfig __all__ = ["U2STModel", "U2STInferModel"] diff --git a/paddlespeech/s2t/modules/loss.py b/paddlespeech/s2t/modules/loss.py index c7d9bd45dd2bf005a575098456c435a173678d26..884fb70c10ffa024833c61cf35b413e446cbf9d0 100644 --- a/paddlespeech/s2t/modules/loss.py +++ b/paddlespeech/s2t/modules/loss.py @@ -37,9 +37,9 @@ class CTCLoss(nn.Layer): self.loss = nn.CTCLoss(blank=blank, reduction=reduction) self.batch_average = batch_average - logger.info( + logger.debug( f"CTCLoss Loss reduction: {reduction}, div-bs: {batch_average}") - logger.info(f"CTCLoss Grad Norm Type: {grad_norm_type}") + logger.debug(f"CTCLoss Grad Norm Type: {grad_norm_type}") assert grad_norm_type in ('instance', 'batch', 'frame', None) self.norm_by_times = False @@ -70,7 +70,8 @@ class CTCLoss(nn.Layer): param = {} self._kwargs = {k: v for k, v in kwargs.items() if k in param} _notin = {k: v for k, v in kwargs.items() if k not in param} - logger.info(f"{self.loss} kwargs:{self._kwargs}, not support: {_notin}") + logger.debug( + f"{self.loss} kwargs:{self._kwargs}, not support: {_notin}") def forward(self, logits, ys_pad, hlens, ys_lens): """Compute CTC loss. diff --git a/paddlespeech/s2t/training/cli.py b/paddlespeech/s2t/training/cli.py index bb85732a6f7f33ee9c5f2f7febabcd7912b78374..1b6bec8a801ecf739fc0efd9ce827df97b1c1591 100644 --- a/paddlespeech/s2t/training/cli.py +++ b/paddlespeech/s2t/training/cli.py @@ -82,6 +82,12 @@ def default_argument_parser(parser=None): type=int, default=1, help="number of parallel processes. 
0 for cpu.") + train_group.add_argument( + '--nxpu', + type=int, + default=0, + choices=[0, 1], + help="if nxpu == 0 and ngpu == 0, use cpu.") train_group.add_argument( "--config", metavar="CONFIG_FILE", help="config file.") train_group.add_argument( diff --git a/paddlespeech/s2t/utils/tensor_utils.py b/paddlespeech/s2t/utils/tensor_utils.py index f9a843ea15c58bf6351e141d16397429674f5eef..422d4f82a9a6a7421d4fab2a773b43f5ca31410b 100644 --- a/paddlespeech/s2t/utils/tensor_utils.py +++ b/paddlespeech/s2t/utils/tensor_utils.py @@ -94,7 +94,7 @@ def pad_sequence(sequences: List[paddle.Tensor], for i, tensor in enumerate(sequences): length = tensor.shape[0] # use index notation to prevent duplicate references to the tensor - logger.info( + logger.debug( f"length {length}, out_tensor {out_tensor.shape}, tensor {tensor.shape}" ) if batch_first: diff --git a/paddlespeech/server/bin/paddlespeech_client.py b/paddlespeech/server/bin/paddlespeech_client.py index ad201b95b3802e16664736c82aec94a92a8434c1..e8e57fff052d7e67403b60474110b60422372ba5 100644 --- a/paddlespeech/server/bin/paddlespeech_client.py +++ b/paddlespeech/server/bin/paddlespeech_client.py @@ -123,7 +123,6 @@ class TTSClientExecutor(BaseExecutor): time_end = time.time() time_consume = time_end - time_start response_dict = res.json() - logger.info(response_dict["message"]) logger.info("Save synthesized audio successfully on %s." % (output)) logger.info("Audio duration: %f s." % (response_dict['result']['duration'])) @@ -192,7 +191,10 @@ class TTSOnlineClientExecutor(BaseExecutor): self.parser.add_argument( '--spk_id', type=int, default=0, help='Speaker id') self.parser.add_argument( - '--output', type=str, default=None, help='Client saves synthesized audio') + '--output', + type=str, + default=None, + help='Client saves synthesized audio') self.parser.add_argument( "--play", type=bool, help="whether to play audio", default=False) @@ -677,7 +679,6 @@ class VectorClientExecutor(BaseExecutor): test_audio=args.test, task=task) time_end = time.time() - logger.info(f"The vector: {res}") logger.info("Response time %f s." 
% (time_end - time_start)) return True except Exception as e: diff --git a/paddlespeech/server/engine/acs/python/acs_engine.py b/paddlespeech/server/engine/acs/python/acs_engine.py index 930101ac91a8d236947de2f2e409507bbf90a40c..63964a82550c754f58209a5c966da78110210b44 100644 --- a/paddlespeech/server/engine/acs/python/acs_engine.py +++ b/paddlespeech/server/engine/acs/python/acs_engine.py @@ -30,7 +30,7 @@ class ACSEngine(BaseEngine): """The ACSEngine Engine """ super(ACSEngine, self).__init__() - logger.info("Create the ACSEngine Instance") + logger.debug("Create the ACSEngine Instance") self.word_list = [] def init(self, config: dict): @@ -42,7 +42,7 @@ class ACSEngine(BaseEngine): Returns: bool: The engine instance flag """ - logger.info("Init the acs engine") + logger.debug("Init the acs engine") try: self.config = config self.device = self.config.get("device", paddle.get_device()) @@ -50,7 +50,7 @@ class ACSEngine(BaseEngine): # websocket default ping timeout is 20 seconds self.ping_timeout = self.config.get("ping_timeout", 20) paddle.set_device(self.device) - logger.info(f"ACS Engine set the device: {self.device}") + logger.debug(f"ACS Engine set the device: {self.device}") except BaseException as e: logger.error( @@ -66,7 +66,9 @@ class ACSEngine(BaseEngine): self.url = "ws://" + self.config.asr_server_ip + ":" + str( self.config.asr_server_port) + "/paddlespeech/asr/streaming" - logger.info("Init the acs engine successfully") + logger.info("Initialize acs server engine successfully on device: %s." % + (self.device)) + return True def read_search_words(self): @@ -95,12 +97,12 @@ class ACSEngine(BaseEngine): Returns: _type_: _description_ """ - logger.info("send a message to the server") + logger.debug("send a message to the server") if self.url is None: logger.error("No asr server, please input valid ip and port") return "" ws = websocket.WebSocket() - logger.info(f"set the ping timeout: {self.ping_timeout} seconds") + logger.debug(f"set the ping timeout: {self.ping_timeout} seconds") ws.connect(self.url, ping_timeout=self.ping_timeout) audio_info = json.dumps( { @@ -123,7 +125,7 @@ class ACSEngine(BaseEngine): logger.info(f"audio result: {msg}") # 3. 
send chunk audio data to engine - logger.info("send the end signal") + logger.debug("send the end signal") audio_info = json.dumps( { "name": "test.wav", @@ -197,7 +199,7 @@ class ACSEngine(BaseEngine): start = max(time_stamp[m.start(0)]['bg'] - offset, 0) end = min(time_stamp[m.end(0) - 1]['ed'] + offset, max_ed) - logger.info(f'start: {start}, end: {end}') + logger.debug(f'start: {start}, end: {end}') acs_result.append({'w': w, 'bg': start, 'ed': end}) return acs_result, asr_result @@ -212,7 +214,7 @@ class ACSEngine(BaseEngine): Returns: acs_result, asr_result: the acs result and the asr result """ - logger.info("start to process the audio content search") + logger.debug("start to process the audio content search") msg = self.get_asr_content(io.BytesIO(audio_data)) acs_result, asr_result = self.get_macthed_word(msg) diff --git a/paddlespeech/server/engine/asr/online/onnx/asr_engine.py b/paddlespeech/server/engine/asr/online/onnx/asr_engine.py index 0679316437f8b050773c69986430640bbc83a6dd..ab4f113056ba7f635012f5e486c8d88fd083f9ca 100644 --- a/paddlespeech/server/engine/asr/online/onnx/asr_engine.py +++ b/paddlespeech/server/engine/asr/online/onnx/asr_engine.py @@ -26,7 +26,7 @@ from paddlespeech.cli.log import logger from paddlespeech.resource import CommonTaskResource from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer from paddlespeech.s2t.modules.ctc import CTCDecoder -from paddlespeech.s2t.transform.transformation import Transformation +from paddlespeech.audio.transform.transformation import Transformation from paddlespeech.s2t.utils.utility import UpdateConfig from paddlespeech.server.engine.base_engine import BaseEngine from paddlespeech.server.utils import onnx_infer @@ -44,7 +44,7 @@ class PaddleASRConnectionHanddler: asr_engine (ASREngine): the global asr engine """ super().__init__() - logger.info( + logger.debug( "create an paddle asr connection handler to process the websocket connection" ) self.config = asr_engine.config # server config @@ -152,12 +152,12 @@ class PaddleASRConnectionHanddler: self.output_reset() def extract_feat(self, samples: ByteString): - logger.info("Online ASR extract the feat") + logger.debug("Online ASR extract the feat") samples = np.frombuffer(samples, dtype=np.int16) assert samples.ndim == 1 self.num_samples += samples.shape[0] - logger.info( + logger.debug( f"This package receive {samples.shape[0]} pcm data. 
Global samples:{self.num_samples}" ) @@ -168,7 +168,7 @@ class PaddleASRConnectionHanddler: else: assert self.remained_wav.ndim == 1 # (T,) self.remained_wav = np.concatenate([self.remained_wav, samples]) - logger.info( + logger.debug( f"The concatenation of remain and now audio samples length is: {self.remained_wav.shape}" ) @@ -202,14 +202,14 @@ class PaddleASRConnectionHanddler: # update remained wav self.remained_wav = self.remained_wav[self.n_shift * num_frames:] - logger.info( + logger.debug( f"process the audio feature success, the cached feat shape: {self.cached_feat.shape}" ) - logger.info( + logger.debug( f"After extract feat, the cached remain the audio samples: {self.remained_wav.shape}" ) - logger.info(f"global samples: {self.num_samples}") - logger.info(f"global frames: {self.num_frames}") + logger.debug(f"global samples: {self.num_samples}") + logger.debug(f"global frames: {self.num_frames}") def decode(self, is_finished=False): """advance decoding @@ -237,7 +237,7 @@ class PaddleASRConnectionHanddler: return num_frames = self.cached_feat.shape[1] - logger.info( + logger.debug( f"Required decoding window {decoding_window} frames, and the connection has {num_frames} frames" ) @@ -355,7 +355,7 @@ class ASRServerExecutor(ASRExecutor): lm_url = self.task_resource.res_dict['lm_url'] lm_md5 = self.task_resource.res_dict['lm_md5'] - logger.info(f"Start to load language model {lm_url}") + logger.debug(f"Start to load language model {lm_url}") self.download_lm( lm_url, os.path.dirname(self.config.decode.lang_model_path), lm_md5) @@ -367,7 +367,7 @@ class ASRServerExecutor(ASRExecutor): if "deepspeech2" in self.model_type: # AM predictor - logger.info("ASR engine start to init the am predictor") + logger.debug("ASR engine start to init the am predictor") self.am_predictor = onnx_infer.get_sess( model_path=self.am_model, sess_conf=self.am_predictor_conf) else: @@ -400,7 +400,7 @@ class ASRServerExecutor(ASRExecutor): self.num_decoding_left_chunks = num_decoding_left_chunks # conf for paddleinference predictor or onnx self.am_predictor_conf = am_predictor_conf - logger.info(f"model_type: {self.model_type}") + logger.debug(f"model_type: {self.model_type}") sample_rate_str = '16k' if sample_rate == 16000 else '8k' tag = model_type + '-' + lang + '-' + sample_rate_str @@ -422,12 +422,11 @@ class ASRServerExecutor(ASRExecutor): # self.res_path, self.task_resource.res_dict[ # 'params']) if am_params is None else os.path.abspath(am_params) - logger.info("Load the pretrained model:") - logger.info(f" tag = {tag}") - logger.info(f" res_path: {self.res_path}") - logger.info(f" cfg path: {self.cfg_path}") - logger.info(f" am_model path: {self.am_model}") - # logger.info(f" am_params path: {self.am_params}") + logger.debug("Load the pretrained model:") + logger.debug(f" tag = {tag}") + logger.debug(f" res_path: {self.res_path}") + logger.debug(f" cfg path: {self.cfg_path}") + logger.debug(f" am_model path: {self.am_model}") #Init body. 
self.config = CfgNode(new_allowed=True) @@ -436,7 +435,7 @@ class ASRServerExecutor(ASRExecutor): if self.config.spm_model_prefix: self.config.spm_model_prefix = os.path.join( self.res_path, self.config.spm_model_prefix) - logger.info(f"spm model path: {self.config.spm_model_prefix}") + logger.debug(f"spm model path: {self.config.spm_model_prefix}") self.vocab = self.config.vocab_filepath @@ -450,7 +449,7 @@ class ASRServerExecutor(ASRExecutor): # AM predictor self.init_model() - logger.info(f"create the {model_type} model success") + logger.debug(f"create the {model_type} model success") return True @@ -501,7 +500,7 @@ class ASREngine(BaseEngine): "If all GPU or XPU is used, you can set the server to 'cpu'") sys.exit(-1) - logger.info(f"paddlespeech_server set the device: {self.device}") + logger.debug(f"paddlespeech_server set the device: {self.device}") if not self.init_model(): logger.error( @@ -509,7 +508,8 @@ class ASREngine(BaseEngine): ) return False - logger.info("Initialize ASR server engine successfully.") + logger.info("Initialize ASR server engine successfully on device: %s." % + (self.device)) return True def new_handler(self): diff --git a/paddlespeech/server/engine/asr/online/paddleinference/asr_engine.py b/paddlespeech/server/engine/asr/online/paddleinference/asr_engine.py index efb726aaff3813a456cfac37f641a057fd2d5ed6..182e64180709760dbd1639cbcb4a56537dd45aa5 100644 --- a/paddlespeech/server/engine/asr/online/paddleinference/asr_engine.py +++ b/paddlespeech/server/engine/asr/online/paddleinference/asr_engine.py @@ -24,9 +24,9 @@ from yacs.config import CfgNode from paddlespeech.cli.asr.infer import ASRExecutor from paddlespeech.cli.log import logger from paddlespeech.resource import CommonTaskResource +from paddlespeech.audio.transform.transformation import Transformation from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer from paddlespeech.s2t.modules.ctc import CTCDecoder -from paddlespeech.s2t.transform.transformation import Transformation from paddlespeech.s2t.utils.utility import UpdateConfig from paddlespeech.server.engine.base_engine import BaseEngine from paddlespeech.server.utils.paddle_predictor import init_predictor @@ -44,7 +44,7 @@ class PaddleASRConnectionHanddler: asr_engine (ASREngine): the global asr engine """ super().__init__() - logger.info( + logger.debug( "create an paddle asr connection handler to process the websocket connection" ) self.config = asr_engine.config # server config @@ -157,7 +157,7 @@ class PaddleASRConnectionHanddler: assert samples.ndim == 1 self.num_samples += samples.shape[0] - logger.info( + logger.debug( f"This package receive {samples.shape[0]} pcm data. 
Global samples:{self.num_samples}" ) @@ -168,7 +168,7 @@ class PaddleASRConnectionHanddler: else: assert self.remained_wav.ndim == 1 # (T,) self.remained_wav = np.concatenate([self.remained_wav, samples]) - logger.info( + logger.debug( f"The concatenation of remain and now audio samples length is: {self.remained_wav.shape}" ) @@ -202,14 +202,14 @@ class PaddleASRConnectionHanddler: # update remained wav self.remained_wav = self.remained_wav[self.n_shift * num_frames:] - logger.info( + logger.debug( f"process the audio feature success, the cached feat shape: {self.cached_feat.shape}" ) - logger.info( + logger.debug( f"After extract feat, the cached remain the audio samples: {self.remained_wav.shape}" ) - logger.info(f"global samples: {self.num_samples}") - logger.info(f"global frames: {self.num_frames}") + logger.debug(f"global samples: {self.num_samples}") + logger.debug(f"global frames: {self.num_frames}") def decode(self, is_finished=False): """advance decoding @@ -237,13 +237,13 @@ class PaddleASRConnectionHanddler: return num_frames = self.cached_feat.shape[1] - logger.info( + logger.debug( f"Required decoding window {decoding_window} frames, and the connection has {num_frames} frames" ) # the cached feat must be larger decoding_window if num_frames < decoding_window and not is_finished: - logger.info( + logger.debug( f"frame feat num is less than {decoding_window}, please input more pcm data" ) return None, None @@ -294,7 +294,7 @@ class PaddleASRConnectionHanddler: Returns: logprob: poster probability. """ - logger.info("start to decoce one chunk for deepspeech2") + logger.debug("start to decoce one chunk for deepspeech2") input_names = self.am_predictor.get_input_names() audio_handle = self.am_predictor.get_input_handle(input_names[0]) audio_len_handle = self.am_predictor.get_input_handle(input_names[1]) @@ -369,7 +369,7 @@ class ASRServerExecutor(ASRExecutor): lm_url = self.task_resource.res_dict['lm_url'] lm_md5 = self.task_resource.res_dict['lm_md5'] - logger.info(f"Start to load language model {lm_url}") + logger.debug(f"Start to load language model {lm_url}") self.download_lm( lm_url, os.path.dirname(self.config.decode.lang_model_path), lm_md5) @@ -381,7 +381,7 @@ class ASRServerExecutor(ASRExecutor): if "deepspeech2" in self.model_type: # AM predictor - logger.info("ASR engine start to init the am predictor") + logger.debug("ASR engine start to init the am predictor") self.am_predictor = init_predictor( model_file=self.am_model, params_file=self.am_params, @@ -415,7 +415,7 @@ class ASRServerExecutor(ASRExecutor): self.num_decoding_left_chunks = num_decoding_left_chunks # conf for paddleinference predictor or onnx self.am_predictor_conf = am_predictor_conf - logger.info(f"model_type: {self.model_type}") + logger.debug(f"model_type: {self.model_type}") sample_rate_str = '16k' if sample_rate == 16000 else '8k' tag = model_type + '-' + lang + '-' + sample_rate_str @@ -437,12 +437,12 @@ class ASRServerExecutor(ASRExecutor): self.res_path = os.path.dirname( os.path.dirname(os.path.abspath(self.cfg_path))) - logger.info("Load the pretrained model:") - logger.info(f" tag = {tag}") - logger.info(f" res_path: {self.res_path}") - logger.info(f" cfg path: {self.cfg_path}") - logger.info(f" am_model path: {self.am_model}") - logger.info(f" am_params path: {self.am_params}") + logger.debug("Load the pretrained model:") + logger.debug(f" tag = {tag}") + logger.debug(f" res_path: {self.res_path}") + logger.debug(f" cfg path: {self.cfg_path}") + logger.debug(f" am_model path: {self.am_model}") 
+ logger.debug(f" am_params path: {self.am_params}") #Init body. self.config = CfgNode(new_allowed=True) @@ -451,7 +451,7 @@ class ASRServerExecutor(ASRExecutor): if self.config.spm_model_prefix: self.config.spm_model_prefix = os.path.join( self.res_path, self.config.spm_model_prefix) - logger.info(f"spm model path: {self.config.spm_model_prefix}") + logger.debug(f"spm model path: {self.config.spm_model_prefix}") self.vocab = self.config.vocab_filepath @@ -465,7 +465,7 @@ class ASRServerExecutor(ASRExecutor): # AM predictor self.init_model() - logger.info(f"create the {model_type} model success") + logger.debug(f"create the {model_type} model success") return True @@ -516,7 +516,7 @@ class ASREngine(BaseEngine): "If all GPU or XPU is used, you can set the server to 'cpu'") sys.exit(-1) - logger.info(f"paddlespeech_server set the device: {self.device}") + logger.debug(f"paddlespeech_server set the device: {self.device}") if not self.init_model(): logger.error( @@ -524,7 +524,9 @@ class ASREngine(BaseEngine): ) return False - logger.info("Initialize ASR server engine successfully.") + logger.info("Initialize ASR server engine successfully on device: %s." % + (self.device)) + return True def new_handler(self): diff --git a/paddlespeech/server/engine/asr/online/python/asr_engine.py b/paddlespeech/server/engine/asr/online/python/asr_engine.py index daa9fc500d5092d63bbda449dc35f9f93559491a..2bacfecd6e7f4529b8b1a5ac9dde6841b9b776f6 100644 --- a/paddlespeech/server/engine/asr/online/python/asr_engine.py +++ b/paddlespeech/server/engine/asr/online/python/asr_engine.py @@ -24,9 +24,9 @@ from yacs.config import CfgNode from paddlespeech.cli.asr.infer import ASRExecutor from paddlespeech.cli.log import logger from paddlespeech.resource import CommonTaskResource +from paddlespeech.audio.transform.transformation import Transformation from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer from paddlespeech.s2t.modules.ctc import CTCDecoder -from paddlespeech.s2t.transform.transformation import Transformation from paddlespeech.s2t.utils.tensor_utils import add_sos_eos from paddlespeech.s2t.utils.tensor_utils import pad_sequence from paddlespeech.s2t.utils.utility import UpdateConfig @@ -49,7 +49,7 @@ class PaddleASRConnectionHanddler: asr_engine (ASREngine): the global asr engine """ super().__init__() - logger.info( + logger.debug( "create an paddle asr connection handler to process the websocket connection" ) self.config = asr_engine.config # server config @@ -107,7 +107,7 @@ class PaddleASRConnectionHanddler: # acoustic model self.model = self.asr_engine.executor.model self.continuous_decoding = self.config.continuous_decoding - logger.info(f"continue decoding: {self.continuous_decoding}") + logger.debug(f"continue decoding: {self.continuous_decoding}") # ctc decoding config self.ctc_decode_config = self.asr_engine.executor.config.decode @@ -207,7 +207,7 @@ class PaddleASRConnectionHanddler: assert samples.ndim == 1 self.num_samples += samples.shape[0] - logger.info( + logger.debug( f"This package receive {samples.shape[0]} pcm data. 
Global samples:{self.num_samples}" ) @@ -218,7 +218,7 @@ class PaddleASRConnectionHanddler: else: assert self.remained_wav.ndim == 1 # (T,) self.remained_wav = np.concatenate([self.remained_wav, samples]) - logger.info( + logger.debug( f"The concatenation of remain and now audio samples length is: {self.remained_wav.shape}" ) @@ -252,14 +252,14 @@ class PaddleASRConnectionHanddler: # update remained wav self.remained_wav = self.remained_wav[self.n_shift * num_frames:] - logger.info( + logger.debug( f"process the audio feature success, the cached feat shape: {self.cached_feat.shape}" ) - logger.info( + logger.debug( f"After extract feat, the cached remain the audio samples: {self.remained_wav.shape}" ) - logger.info(f"global samples: {self.num_samples}") - logger.info(f"global frames: {self.num_frames}") + logger.debug(f"global samples: {self.num_samples}") + logger.debug(f"global frames: {self.num_frames}") def decode(self, is_finished=False): """advance decoding @@ -283,24 +283,24 @@ class PaddleASRConnectionHanddler: stride = subsampling * decoding_chunk_size if self.cached_feat is None: - logger.info("no audio feat, please input more pcm data") + logger.debug("no audio feat, please input more pcm data") return num_frames = self.cached_feat.shape[1] - logger.info( + logger.debug( f"Required decoding window {decoding_window} frames, and the connection has {num_frames} frames" ) # the cached feat must be larger decoding_window if num_frames < decoding_window and not is_finished: - logger.info( + logger.debug( f"frame feat num is less than {decoding_window}, please input more pcm data" ) return None, None # if is_finished=True, we need at least context frames if num_frames < context: - logger.info( + logger.debug( "flast {num_frames} is less than context {context} frames, and we cannot do model forward" ) return None, None @@ -354,7 +354,7 @@ class PaddleASRConnectionHanddler: Returns: logprob: poster probability. 
""" - logger.info("start to decoce one chunk for deepspeech2") + logger.debug("start to decoce one chunk for deepspeech2") input_names = self.am_predictor.get_input_names() audio_handle = self.am_predictor.get_input_handle(input_names[0]) audio_len_handle = self.am_predictor.get_input_handle(input_names[1]) @@ -391,7 +391,7 @@ class PaddleASRConnectionHanddler: self.decoder.next(output_chunk_probs, output_chunk_lens) trans_best, trans_beam = self.decoder.decode() - logger.info(f"decode one best result for deepspeech2: {trans_best[0]}") + logger.debug(f"decode one best result for deepspeech2: {trans_best[0]}") return trans_best[0] @paddle.no_grad() @@ -402,7 +402,7 @@ class PaddleASRConnectionHanddler: # reset endpiont state self.endpoint_state = False - logger.info( + logger.debug( "Conformer/Transformer: start to decode with advanced_decoding method" ) cfg = self.ctc_decode_config @@ -427,25 +427,25 @@ class PaddleASRConnectionHanddler: stride = subsampling * decoding_chunk_size if self.cached_feat is None: - logger.info("no audio feat, please input more pcm data") + logger.debug("no audio feat, please input more pcm data") return # (B=1,T,D) num_frames = self.cached_feat.shape[1] - logger.info( + logger.debug( f"Required decoding window {decoding_window} frames, and the connection has {num_frames} frames" ) # the cached feat must be larger decoding_window if num_frames < decoding_window and not is_finished: - logger.info( + logger.debug( f"frame feat num is less than {decoding_window}, please input more pcm data" ) return None, None # if is_finished=True, we need at least context frames if num_frames < context: - logger.info( + logger.debug( "flast {num_frames} is less than context {context} frames, and we cannot do model forward" ) return None, None @@ -489,7 +489,7 @@ class PaddleASRConnectionHanddler: self.encoder_out = ys else: self.encoder_out = paddle.concat([self.encoder_out, ys], axis=1) - logger.info( + logger.debug( f"This connection handler encoder out shape: {self.encoder_out.shape}" ) @@ -513,7 +513,8 @@ class PaddleASRConnectionHanddler: if self.endpointer.endpoint_detected(ctc_probs.numpy(), decoding_something): self.endpoint_state = True - logger.info(f"Endpoint is detected at {self.num_frames} frame.") + logger.debug( + f"Endpoint is detected at {self.num_frames} frame.") # advance cache of feat assert self.cached_feat.shape[0] == 1 #(B=1,T,D) @@ -526,7 +527,7 @@ class PaddleASRConnectionHanddler: def update_result(self): """Conformer/Transformer hyps to result. """ - logger.info("update the final result") + logger.debug("update the final result") hyps = self.hyps # output results and tokenids @@ -560,16 +561,16 @@ class PaddleASRConnectionHanddler: only for conformer and transformer model. 
""" if "deepspeech2" in self.model_type: - logger.info("deepspeech2 not support rescoring decoding.") + logger.debug("deepspeech2 not support rescoring decoding.") return if "attention_rescoring" != self.ctc_decode_config.decoding_method: - logger.info( + logger.debug( f"decoding method not match: {self.ctc_decode_config.decoding_method}, need attention_rescoring" ) return - logger.info("rescoring the final result") + logger.debug("rescoring the final result") # last decoding for last audio self.searcher.finalize_search() @@ -685,7 +686,6 @@ class PaddleASRConnectionHanddler: "bg": global_offset_in_sec + start, "ed": global_offset_in_sec + end }) - # logger.info(f"{word_time_stamp[-1]}") self.word_time_stamp = word_time_stamp logger.info(f"word time stamp: {self.word_time_stamp}") @@ -707,13 +707,13 @@ class ASRServerExecutor(ASRExecutor): lm_url = self.task_resource.res_dict['lm_url'] lm_md5 = self.task_resource.res_dict['lm_md5'] - logger.info(f"Start to load language model {lm_url}") + logger.debug(f"Start to load language model {lm_url}") self.download_lm( lm_url, os.path.dirname(self.config.decode.lang_model_path), lm_md5) elif "conformer" in self.model_type or "transformer" in self.model_type: with UpdateConfig(self.config): - logger.info("start to create the stream conformer asr engine") + logger.debug("start to create the stream conformer asr engine") # update the decoding method if self.decode_method: self.config.decode.decoding_method = self.decode_method @@ -726,7 +726,7 @@ class ASRServerExecutor(ASRExecutor): if self.config.decode.decoding_method not in [ "ctc_prefix_beam_search", "attention_rescoring" ]: - logger.info( + logger.debug( "we set the decoding_method to attention_rescoring") self.config.decode.decoding_method = "attention_rescoring" @@ -739,7 +739,7 @@ class ASRServerExecutor(ASRExecutor): def init_model(self) -> None: if "deepspeech2" in self.model_type: # AM predictor - logger.info("ASR engine start to init the am predictor") + logger.debug("ASR engine start to init the am predictor") self.am_predictor = init_predictor( model_file=self.am_model, params_file=self.am_params, @@ -748,7 +748,7 @@ class ASRServerExecutor(ASRExecutor): # load model # model_type: {model_name}_{dataset} model_name = self.model_type[:self.model_type.rindex('_')] - logger.info(f"model name: {model_name}") + logger.debug(f"model name: {model_name}") model_class = self.task_resource.get_model_class(model_name) model = model_class.from_config(self.config) self.model = model @@ -782,7 +782,7 @@ class ASRServerExecutor(ASRExecutor): self.num_decoding_left_chunks = num_decoding_left_chunks # conf for paddleinference predictor or onnx self.am_predictor_conf = am_predictor_conf - logger.info(f"model_type: {self.model_type}") + logger.debug(f"model_type: {self.model_type}") sample_rate_str = '16k' if sample_rate == 16000 else '8k' tag = model_type + '-' + lang + '-' + sample_rate_str @@ -804,12 +804,12 @@ class ASRServerExecutor(ASRExecutor): self.res_path = os.path.dirname( os.path.dirname(os.path.abspath(self.cfg_path))) - logger.info("Load the pretrained model:") - logger.info(f" tag = {tag}") - logger.info(f" res_path: {self.res_path}") - logger.info(f" cfg path: {self.cfg_path}") - logger.info(f" am_model path: {self.am_model}") - logger.info(f" am_params path: {self.am_params}") + logger.debug("Load the pretrained model:") + logger.debug(f" tag = {tag}") + logger.debug(f" res_path: {self.res_path}") + logger.debug(f" cfg path: {self.cfg_path}") + logger.debug(f" am_model path: 
{self.am_model}") + logger.debug(f" am_params path: {self.am_params}") #Init body. self.config = CfgNode(new_allowed=True) @@ -818,7 +818,7 @@ class ASRServerExecutor(ASRExecutor): if self.config.spm_model_prefix: self.config.spm_model_prefix = os.path.join( self.res_path, self.config.spm_model_prefix) - logger.info(f"spm model path: {self.config.spm_model_prefix}") + logger.debug(f"spm model path: {self.config.spm_model_prefix}") self.vocab = self.config.vocab_filepath @@ -832,7 +832,7 @@ class ASRServerExecutor(ASRExecutor): # AM predictor self.init_model() - logger.info(f"create the {model_type} model success") + logger.debug(f"create the {model_type} model success") return True @@ -883,7 +883,7 @@ class ASREngine(BaseEngine): "If all GPU or XPU is used, you can set the server to 'cpu'") sys.exit(-1) - logger.info(f"paddlespeech_server set the device: {self.device}") + logger.debug(f"paddlespeech_server set the device: {self.device}") if not self.init_model(): logger.error( @@ -891,7 +891,9 @@ class ASREngine(BaseEngine): ) return False - logger.info("Initialize ASR server engine successfully.") + logger.info("Initialize ASR server engine successfully on device: %s." % + (self.device)) + return True def new_handler(self): diff --git a/paddlespeech/server/engine/asr/paddleinference/asr_engine.py b/paddlespeech/server/engine/asr/paddleinference/asr_engine.py index 572004eb8a7b707563ebceaefe58b98e68cfd12f..6df666ce8090703e0727827e07e0193ffc14cffe 100644 --- a/paddlespeech/server/engine/asr/paddleinference/asr_engine.py +++ b/paddlespeech/server/engine/asr/paddleinference/asr_engine.py @@ -65,10 +65,10 @@ class ASRServerExecutor(ASRExecutor): self.task_resource.res_dict['model']) self.am_params = os.path.join(self.res_path, self.task_resource.res_dict['params']) - logger.info(self.res_path) - logger.info(self.cfg_path) - logger.info(self.am_model) - logger.info(self.am_params) + logger.debug(self.res_path) + logger.debug(self.cfg_path) + logger.debug(self.am_model) + logger.debug(self.am_params) else: self.cfg_path = os.path.abspath(cfg_path) self.am_model = os.path.abspath(am_model) @@ -236,16 +236,16 @@ class PaddleASRConnectionHandler(ASRServerExecutor): if self._check( io.BytesIO(audio_data), self.asr_engine.config.sample_rate, self.asr_engine.config.force_yes): - logger.info("start running asr engine") + logger.debug("start running asr engine") self.preprocess(self.asr_engine.config.model_type, io.BytesIO(audio_data)) st = time.time() self.infer(self.asr_engine.config.model_type) infer_time = time.time() - st self.output = self.postprocess() # Retrieve result of asr. 
- logger.info("end inferring asr engine") + logger.debug("end inferring asr engine") else: - logger.info("file check failed!") + logger.error("file check failed!") self.output = None logger.info("inference time: {}".format(infer_time)) diff --git a/paddlespeech/server/engine/asr/python/asr_engine.py b/paddlespeech/server/engine/asr/python/asr_engine.py index f9cc3a6650cdaff91fdf5c52ffa285aa4d7f2d16..02c40fd128236b45c75db56378654a1dd4d1ae26 100644 --- a/paddlespeech/server/engine/asr/python/asr_engine.py +++ b/paddlespeech/server/engine/asr/python/asr_engine.py @@ -104,7 +104,7 @@ class PaddleASRConnectionHandler(ASRServerExecutor): if self._check( io.BytesIO(audio_data), self.asr_engine.config.sample_rate, self.asr_engine.config.force_yes): - logger.info("start run asr engine") + logger.debug("start run asr engine") self.preprocess(self.asr_engine.config.model, io.BytesIO(audio_data)) st = time.time() @@ -112,7 +112,7 @@ class PaddleASRConnectionHandler(ASRServerExecutor): infer_time = time.time() - st self.output = self.postprocess() # Retrieve result of asr. else: - logger.info("file check failed!") + logger.error("file check failed!") self.output = None logger.info("inference time: {}".format(infer_time)) diff --git a/paddlespeech/server/engine/cls/paddleinference/cls_engine.py b/paddlespeech/server/engine/cls/paddleinference/cls_engine.py index 389d56055ba617d1628b87e52aaf7301e9928c29..fa62ba67c7c4d07735ba87a7652d8b8e8387bffb 100644 --- a/paddlespeech/server/engine/cls/paddleinference/cls_engine.py +++ b/paddlespeech/server/engine/cls/paddleinference/cls_engine.py @@ -67,22 +67,22 @@ class CLSServerExecutor(CLSExecutor): self.params_path = os.path.abspath(params_path) self.label_file = os.path.abspath(label_file) - logger.info(self.cfg_path) - logger.info(self.model_path) - logger.info(self.params_path) - logger.info(self.label_file) + logger.debug(self.cfg_path) + logger.debug(self.model_path) + logger.debug(self.params_path) + logger.debug(self.label_file) # config with open(self.cfg_path, 'r') as f: self._conf = yaml.safe_load(f) - logger.info("Read cfg file successfully.") + logger.debug("Read cfg file successfully.") # labels self._label_list = [] with open(self.label_file, 'r') as f: for line in f: self._label_list.append(line.strip()) - logger.info("Read label file successfully.") + logger.debug("Read label file successfully.") # Create predictor self.predictor_conf = predictor_conf @@ -90,7 +90,7 @@ class CLSServerExecutor(CLSExecutor): model_file=self.model_path, params_file=self.params_path, predictor_conf=self.predictor_conf) - logger.info("Create predictor successfully.") + logger.debug("Create predictor successfully.") @paddle.no_grad() def infer(self): @@ -148,7 +148,8 @@ class CLSEngine(BaseEngine): logger.error(e) return False - logger.info("Initialize CLS server engine successfully.") + logger.info("Initialize CLS server engine successfully on device: %s." 
% + (self.device)) return True @@ -160,7 +161,7 @@ class PaddleCLSConnectionHandler(CLSServerExecutor): cls_engine (CLSEngine): The CLS engine """ super().__init__() - logger.info( + logger.debug( "Create PaddleCLSConnectionHandler to process the cls request") self._inputs = OrderedDict() @@ -183,7 +184,7 @@ class PaddleCLSConnectionHandler(CLSServerExecutor): self.infer() infer_time = time.time() - st - logger.info("inference time: {}".format(infer_time)) + logger.debug("inference time: {}".format(infer_time)) logger.info("cls engine type: inference") def postprocess(self, topk: int): diff --git a/paddlespeech/server/engine/cls/python/cls_engine.py b/paddlespeech/server/engine/cls/python/cls_engine.py index f8d8f20ef215da47c823e2bfef056b2c4ec4bb6d..210f4cbbb81b98b6a5a73a7b9bac155a188b6688 100644 --- a/paddlespeech/server/engine/cls/python/cls_engine.py +++ b/paddlespeech/server/engine/cls/python/cls_engine.py @@ -88,7 +88,7 @@ class PaddleCLSConnectionHandler(CLSServerExecutor): cls_engine (CLSEngine): The CLS engine """ super().__init__() - logger.info( + logger.debug( "Create PaddleCLSConnectionHandler to process the cls request") self._inputs = OrderedDict() @@ -110,7 +110,7 @@ class PaddleCLSConnectionHandler(CLSServerExecutor): self.infer() infer_time = time.time() - st - logger.info("inference time: {}".format(infer_time)) + logger.debug("inference time: {}".format(infer_time)) logger.info("cls engine type: python") def postprocess(self, topk: int): diff --git a/paddlespeech/server/engine/engine_warmup.py b/paddlespeech/server/engine/engine_warmup.py index 5f548f71dbe6c673564350a197b314a55710989f..12c760c6f61ccb8c67a06c79b42b75a6d108cbeb 100644 --- a/paddlespeech/server/engine/engine_warmup.py +++ b/paddlespeech/server/engine/engine_warmup.py @@ -45,7 +45,7 @@ def warm_up(engine_and_type: str, warm_up_time: int=3) -> bool: logger.error("Please check tte engine type.") try: - logger.info("Start to warm up tts engine.") + logger.debug("Start to warm up tts engine.") for i in range(warm_up_time): connection_handler = PaddleTTSConnectionHandler(tts_engine) if flag_online: @@ -53,7 +53,7 @@ def warm_up(engine_and_type: str, warm_up_time: int=3) -> bool: text=sentence, lang=tts_engine.lang, am=tts_engine.config.am): - logger.info( + logger.debug( f"The first response time of the {i} warm up: {connection_handler.first_response_time} s" ) break @@ -62,7 +62,7 @@ def warm_up(engine_and_type: str, warm_up_time: int=3) -> bool: st = time.time() connection_handler.infer(text=sentence) et = time.time() - logger.info( + logger.debug( f"The response time of the {i} warm up: {et - st} s") except Exception as e: logger.error("Failed to warm up on tts engine.") diff --git a/paddlespeech/server/engine/text/python/text_engine.py b/paddlespeech/server/engine/text/python/text_engine.py index 73cf8737beeecbffa5e3ce97eb4b010a1345d719..6167e7784993bbafac08d8fbbc89aca66960403a 100644 --- a/paddlespeech/server/engine/text/python/text_engine.py +++ b/paddlespeech/server/engine/text/python/text_engine.py @@ -28,7 +28,7 @@ class PaddleTextConnectionHandler: text_engine (TextEngine): The Text engine """ super().__init__() - logger.info( + logger.debug( "Create PaddleTextConnectionHandler to process the text request") self.text_engine = text_engine self.task = self.text_engine.executor.task @@ -130,7 +130,7 @@ class TextEngine(BaseEngine): """The Text Engine """ super(TextEngine, self).__init__() - logger.info("Create the TextEngine Instance") + logger.debug("Create the TextEngine Instance") def init(self, config: 
dict): """Init the Text Engine @@ -141,7 +141,7 @@ class TextEngine(BaseEngine): Returns: bool: The engine instance flag """ - logger.info("Init the text engine") + logger.debug("Init the text engine") try: self.config = config if self.config.device: @@ -150,7 +150,7 @@ class TextEngine(BaseEngine): self.device = paddle.get_device() paddle.set_device(self.device) - logger.info(f"Text Engine set the device: {self.device}") + logger.debug(f"Text Engine set the device: {self.device}") except BaseException as e: logger.error( "Set device failed, please check if device is already used and the parameter 'device' in the yaml file" @@ -168,5 +168,6 @@ class TextEngine(BaseEngine): ckpt_path=config.ckpt_path, vocab_file=config.vocab_file) - logger.info("Init the text engine successfully") + logger.info("Initialize Text server engine successfully on device: %s." + % (self.device)) return True diff --git a/paddlespeech/server/engine/tts/online/onnx/tts_engine.py b/paddlespeech/server/engine/tts/online/onnx/tts_engine.py index 64ea47274cb2a8958674ebfae71217f3a205be7d..0995a55daa7ef8c0450c96ef94ad54bbeb277d5a 100644 --- a/paddlespeech/server/engine/tts/online/onnx/tts_engine.py +++ b/paddlespeech/server/engine/tts/online/onnx/tts_engine.py @@ -19,8 +19,6 @@ from typing import Optional import numpy as np import paddle -import librosa -from scipy import signal from paddlespeech.cli.log import logger from paddlespeech.cli.tts.infer import TTSExecutor @@ -32,8 +30,6 @@ from paddlespeech.server.utils.util import denorm from paddlespeech.server.utils.util import get_chunks from paddlespeech.t2s.frontend import English from paddlespeech.t2s.frontend.zh_frontend import Frontend -from paddlespeech.server.utils.audio_process import change_speed -from paddlespeech.server.utils.exception import ServerBaseException __all__ = ['TTSEngine', 'PaddleTTSConnectionHandler'] @@ -66,10 +62,9 @@ class TTSServerExecutor(TTSExecutor): (hasattr(self, 'am_encoder_infer_sess') and hasattr(self, 'am_decoder_sess') and hasattr( self, 'am_postnet_sess'))) and hasattr(self, 'voc_inference'): - logger.info('Models had been initialized.') + logger.debug('Models had been initialized.') return - # am am_tag = am + '-' + lang if am == "fastspeech2_csmsc_onnx": @@ -91,8 +86,7 @@ class TTSServerExecutor(TTSExecutor): else: self.am_ckpt = os.path.abspath(am_ckpt[0]) self.phones_dict = os.path.abspath(phones_dict) - self.am_res_path = os.path.dirname( - os.path.abspath(am_ckpt)) + self.am_res_path = os.path.dirname(os.path.abspath(am_ckpt)) # create am sess self.am_sess = get_sess(self.am_ckpt, am_sess_conf) @@ -125,8 +119,7 @@ class TTSServerExecutor(TTSExecutor): self.am_postnet = os.path.abspath(am_ckpt[2]) self.phones_dict = os.path.abspath(phones_dict) self.am_stat = os.path.abspath(am_stat) - self.am_res_path = os.path.dirname( - os.path.abspath(am_ckpt[0])) + self.am_res_path = os.path.dirname(os.path.abspath(am_ckpt[0])) # create am sess self.am_encoder_infer_sess = get_sess(self.am_encoder_infer, @@ -136,13 +129,13 @@ class TTSServerExecutor(TTSExecutor): self.am_mu, self.am_std = np.load(self.am_stat) - logger.info(f"self.phones_dict: {self.phones_dict}") - logger.info(f"am model dir: {self.am_res_path}") - logger.info("Create am sess successfully.") + logger.debug(f"self.phones_dict: {self.phones_dict}") + logger.debug(f"am model dir: {self.am_res_path}") + logger.debug("Create am sess successfully.") # voc model info voc_tag = voc + '-' + lang - + if voc_ckpt is None: self.task_resource.set_task_model( model_tag=voc_tag, @@ 
-155,16 +148,16 @@ class TTSServerExecutor(TTSExecutor): else: self.voc_ckpt = os.path.abspath(voc_ckpt) self.voc_res_path = os.path.dirname(os.path.abspath(self.voc_ckpt)) - logger.info(self.voc_res_path) + logger.debug(self.voc_res_path) # create voc sess self.voc_sess = get_sess(self.voc_ckpt, voc_sess_conf) - logger.info("Create voc sess successfully.") + logger.debug("Create voc sess successfully.") with open(self.phones_dict, "r") as f: phn_id = [line.strip().split() for line in f.readlines()] self.vocab_size = len(phn_id) - logger.info(f"vocab_size: {self.vocab_size}") + logger.debug(f"vocab_size: {self.vocab_size}") # frontend self.tones_dict = None @@ -175,7 +168,7 @@ class TTSServerExecutor(TTSExecutor): elif lang == 'en': self.frontend = English(phone_vocab_path=self.phones_dict) - logger.info("frontend done!") + logger.debug("frontend done!") class TTSEngine(BaseEngine): @@ -275,7 +268,7 @@ class PaddleTTSConnectionHandler: tts_engine (TTSEngine): The TTS engine """ super().__init__() - logger.info( + logger.debug( "Create PaddleTTSConnectionHandler to process the tts request") self.tts_engine = tts_engine @@ -449,10 +442,7 @@ class PaddleTTSConnectionHandler: self.final_response_time = time.time() - frontend_st - - def run(self, - sentence: str, - spk_id: int=0): + def run(self, sentence: str, spk_id: int=0): """ run include inference and postprocess. Args: @@ -487,4 +477,4 @@ class PaddleTTSConnectionHandler: logger.info(f"RTF: {self.final_response_time / duration}") logger.info( f"Other info: front time: {self.frontend_time} s, first am infer time: {self.first_am_infer} s, first voc infer time: {self.first_voc_infer} s," - ) \ No newline at end of file + ) diff --git a/paddlespeech/server/engine/tts/online/python/tts_engine.py b/paddlespeech/server/engine/tts/online/python/tts_engine.py index f7c57fdabe50ddf0e188448a52448789fc073c45..a46b84bd969e56e0e0650990ca30560cfaadc902 100644 --- a/paddlespeech/server/engine/tts/online/python/tts_engine.py +++ b/paddlespeech/server/engine/tts/online/python/tts_engine.py @@ -102,16 +102,22 @@ class TTSServerExecutor(TTSExecutor): Init model and other resources from a specific path. 
""" if hasattr(self, 'am_inference') and hasattr(self, 'voc_inference'): - logger.info('Models had been initialized.') + logger.debug('Models had been initialized.') return # am model info + if am_ckpt is None or am_config is None or am_stat is None or phones_dict is None: + use_pretrained_am = True + else: + use_pretrained_am = False + am_tag = am + '-' + lang self.task_resource.set_task_model( model_tag=am_tag, model_type=0, # am + skip_download=not use_pretrained_am, version=None, # default version ) - if am_ckpt is None or am_config is None or am_stat is None or phones_dict is None: + if use_pretrained_am: self.am_res_path = self.task_resource.res_dir self.am_config = os.path.join(self.am_res_path, self.task_resource.res_dict['config']) @@ -122,29 +128,33 @@ class TTSServerExecutor(TTSExecutor): # must have phones_dict in acoustic self.phones_dict = os.path.join( self.am_res_path, self.task_resource.res_dict['phones_dict']) - print("self.phones_dict:", self.phones_dict) - logger.info(self.am_res_path) - logger.info(self.am_config) - logger.info(self.am_ckpt) + logger.debug(self.am_res_path) + logger.debug(self.am_config) + logger.debug(self.am_ckpt) else: self.am_config = os.path.abspath(am_config) self.am_ckpt = os.path.abspath(am_ckpt) self.am_stat = os.path.abspath(am_stat) self.phones_dict = os.path.abspath(phones_dict) self.am_res_path = os.path.dirname(os.path.abspath(self.am_config)) - print("self.phones_dict:", self.phones_dict) self.tones_dict = None self.speaker_dict = None # voc model info + if voc_ckpt is None or voc_config is None or voc_stat is None: + use_pretrained_voc = True + else: + use_pretrained_voc = False + voc_tag = voc + '-' + lang self.task_resource.set_task_model( model_tag=voc_tag, model_type=1, # vocoder + skip_download=not use_pretrained_voc, version=None, # default version ) - if voc_ckpt is None or voc_config is None or voc_stat is None: + if use_pretrained_voc: self.voc_res_path = self.task_resource.voc_res_dir self.voc_config = os.path.join( self.voc_res_path, self.task_resource.voc_res_dict['config']) @@ -153,9 +163,9 @@ class TTSServerExecutor(TTSExecutor): self.voc_stat = os.path.join( self.voc_res_path, self.task_resource.voc_res_dict['speech_stats']) - logger.info(self.voc_res_path) - logger.info(self.voc_config) - logger.info(self.voc_ckpt) + logger.debug(self.voc_res_path) + logger.debug(self.voc_config) + logger.debug(self.voc_ckpt) else: self.voc_config = os.path.abspath(voc_config) self.voc_ckpt = os.path.abspath(voc_ckpt) @@ -172,7 +182,6 @@ class TTSServerExecutor(TTSExecutor): with open(self.phones_dict, "r") as f: phn_id = [line.strip().split() for line in f.readlines()] self.vocab_size = len(phn_id) - print("vocab_size:", self.vocab_size) # frontend if lang == 'zh': @@ -182,7 +191,6 @@ class TTSServerExecutor(TTSExecutor): elif lang == 'en': self.frontend = English(phone_vocab_path=self.phones_dict) - print("frontend done!") # am infer info self.am_name = am[:am.rindex('_')] @@ -197,7 +205,6 @@ class TTSServerExecutor(TTSExecutor): self.am_name + '_inference') self.am_inference = am_inference_class(am_normalizer, am) self.am_inference.eval() - print("acoustic model done!") # voc infer info self.voc_name = voc[:voc.rindex('_')] @@ -208,7 +215,6 @@ class TTSServerExecutor(TTSExecutor): '_inference') self.voc_inference = voc_inference_class(voc_normalizer, voc) self.voc_inference.eval() - print("voc done!") class TTSEngine(BaseEngine): @@ -276,7 +282,6 @@ class TTSEngine(BaseEngine): logger.error(e) return False - assert ( 
self.executor.am_config.fs == self.executor.voc_config.fs ), "The sample rate of AM and Vocoder model are different, please check model." @@ -304,7 +309,7 @@ class PaddleTTSConnectionHandler: tts_engine (TTSEngine): The TTS engine """ super().__init__() - logger.info( + logger.debug( "Create PaddleTTSConnectionHandler to process the tts request") self.tts_engine = tts_engine @@ -364,7 +369,7 @@ class PaddleTTSConnectionHandler: text, merge_sentences=merge_sentences) phone_ids = input_ids["phone_ids"] else: - print("lang should in {'zh', 'en'}!") + logger.error("lang should in {'zh', 'en'}!") frontend_et = time.time() self.frontend_time = frontend_et - frontend_st @@ -465,11 +470,11 @@ class PaddleTTSConnectionHandler: ) self.final_response_time = time.time() - frontend_st - - def run(self, + def run( + self, sentence: str, - spk_id: int=0,): + spk_id: int=0, ): """ run include inference and postprocess. Args: diff --git a/paddlespeech/server/engine/tts/paddleinference/tts_engine.py b/paddlespeech/server/engine/tts/paddleinference/tts_engine.py index ab5b721ff0041c803c1b07fc4256a85040330909..43b0df407b74a50eefa9075ff6f4ef3c36f3f59a 100644 --- a/paddlespeech/server/engine/tts/paddleinference/tts_engine.py +++ b/paddlespeech/server/engine/tts/paddleinference/tts_engine.py @@ -65,16 +65,22 @@ class TTSServerExecutor(TTSExecutor): Init model and other resources from a specific path. """ if hasattr(self, 'am_predictor') and hasattr(self, 'voc_predictor'): - logger.info('Models had been initialized.') + logger.debug('Models had been initialized.') return # am + if am_model is None or am_params is None or phones_dict is None: + use_pretrained_am = True + else: + use_pretrained_am = False + am_tag = am + '-' + lang self.task_resource.set_task_model( model_tag=am_tag, model_type=0, # am + skip_download=not use_pretrained_am, version=None, # default version ) - if am_model is None or am_params is None or phones_dict is None: + if use_pretrained_am: self.am_res_path = self.task_resource.res_dir self.am_model = os.path.join(self.am_res_path, self.task_resource.res_dict['model']) @@ -85,16 +91,16 @@ class TTSServerExecutor(TTSExecutor): self.am_res_path, self.task_resource.res_dict['phones_dict']) self.am_sample_rate = self.task_resource.res_dict['sample_rate'] - logger.info(self.am_res_path) - logger.info(self.am_model) - logger.info(self.am_params) + logger.debug(self.am_res_path) + logger.debug(self.am_model) + logger.debug(self.am_params) else: self.am_model = os.path.abspath(am_model) self.am_params = os.path.abspath(am_params) self.phones_dict = os.path.abspath(phones_dict) self.am_sample_rate = am_sample_rate self.am_res_path = os.path.dirname(os.path.abspath(self.am_model)) - logger.info("self.phones_dict: {}".format(self.phones_dict)) + logger.debug("self.phones_dict: {}".format(self.phones_dict)) # for speedyspeech self.tones_dict = None @@ -113,13 +119,19 @@ class TTSServerExecutor(TTSExecutor): self.speaker_dict = speaker_dict # voc + if voc_model is None or voc_params is None: + use_pretrained_voc = True + else: + use_pretrained_voc = False + voc_tag = voc + '-' + lang self.task_resource.set_task_model( model_tag=voc_tag, model_type=1, # vocoder + skip_download=not use_pretrained_voc, version=None, # default version ) - if voc_model is None or voc_params is None: + if use_pretrained_voc: self.voc_res_path = self.task_resource.voc_res_dir self.voc_model = os.path.join( self.voc_res_path, self.task_resource.voc_res_dict['model']) @@ -127,9 +139,9 @@ class TTSServerExecutor(TTSExecutor): 
self.voc_res_path, self.task_resource.voc_res_dict['params']) self.voc_sample_rate = self.task_resource.voc_res_dict[ 'sample_rate'] - logger.info(self.voc_res_path) - logger.info(self.voc_model) - logger.info(self.voc_params) + logger.debug(self.voc_res_path) + logger.debug(self.voc_model) + logger.debug(self.voc_params) else: self.voc_model = os.path.abspath(voc_model) self.voc_params = os.path.abspath(voc_params) @@ -144,21 +156,21 @@ class TTSServerExecutor(TTSExecutor): with open(self.phones_dict, "r") as f: phn_id = [line.strip().split() for line in f.readlines()] vocab_size = len(phn_id) - logger.info("vocab_size: {}".format(vocab_size)) + logger.debug("vocab_size: {}".format(vocab_size)) tone_size = None if self.tones_dict: with open(self.tones_dict, "r") as f: tone_id = [line.strip().split() for line in f.readlines()] tone_size = len(tone_id) - logger.info("tone_size: {}".format(tone_size)) + logger.debug("tone_size: {}".format(tone_size)) spk_num = None if self.speaker_dict: with open(self.speaker_dict, 'rt') as f: spk_id = [line.strip().split() for line in f.readlines()] spk_num = len(spk_id) - logger.info("spk_num: {}".format(spk_num)) + logger.debug("spk_num: {}".format(spk_num)) # frontend if lang == 'zh': @@ -168,7 +180,7 @@ class TTSServerExecutor(TTSExecutor): elif lang == 'en': self.frontend = English(phone_vocab_path=self.phones_dict) - logger.info("frontend done!") + logger.debug("frontend done!") # Create am predictor self.am_predictor_conf = am_predictor_conf @@ -176,7 +188,7 @@ class TTSServerExecutor(TTSExecutor): model_file=self.am_model, params_file=self.am_params, predictor_conf=self.am_predictor_conf) - logger.info("Create AM predictor successfully.") + logger.debug("Create AM predictor successfully.") # Create voc predictor self.voc_predictor_conf = voc_predictor_conf @@ -184,7 +196,7 @@ class TTSServerExecutor(TTSExecutor): model_file=self.voc_model, params_file=self.voc_params, predictor_conf=self.voc_predictor_conf) - logger.info("Create Vocoder predictor successfully.") + logger.debug("Create Vocoder predictor successfully.") @paddle.no_grad() def infer(self, @@ -316,7 +328,8 @@ class TTSEngine(BaseEngine): logger.error(e) return False - logger.info("Initialize TTS server engine successfully.") + logger.info("Initialize TTS server engine successfully on device: %s." % + (self.device)) return True @@ -328,7 +341,7 @@ class PaddleTTSConnectionHandler(TTSServerExecutor): tts_engine (TTSEngine): The TTS engine """ super().__init__() - logger.info( + logger.debug( "Create PaddleTTSConnectionHandler to process the tts request") self.tts_engine = tts_engine @@ -366,23 +379,23 @@ class PaddleTTSConnectionHandler(TTSServerExecutor): if target_fs == 0 or target_fs > original_fs: target_fs = original_fs wav_tar_fs = wav - logger.info( + logger.debug( "The sample rate of synthesized audio is the same as model, which is {}Hz". format(original_fs)) else: wav_tar_fs = librosa.resample( np.squeeze(wav), original_fs, target_fs) - logger.info( + logger.debug( "The sample rate of model is {}Hz and the target sample rate is {}Hz. Converting the sample rate of the synthesized audio successfully.". 
format(original_fs, target_fs)) # transform volume wav_vol = wav_tar_fs * volume - logger.info("Transform the volume of the audio successfully.") + logger.debug("Transform the volume of the audio successfully.") # transform speed try: # windows not support soxbindings wav_speed = change_speed(wav_vol, speed, target_fs) - logger.info("Transform the speed of the audio successfully.") + logger.debug("Transform the speed of the audio successfully.") except ServerBaseException: raise ServerBaseException( ErrorCode.SERVER_INTERNAL_ERR, @@ -399,7 +412,7 @@ class PaddleTTSConnectionHandler(TTSServerExecutor): wavfile.write(buf, target_fs, wav_speed) base64_bytes = base64.b64encode(buf.read()) wav_base64 = base64_bytes.decode('utf-8') - logger.info("Audio to string successfully.") + logger.debug("Audio to string successfully.") # save audio if audio_path is not None: @@ -487,15 +500,15 @@ class PaddleTTSConnectionHandler(TTSServerExecutor): logger.error(e) sys.exit(-1) - logger.info("AM model: {}".format(self.config.am)) - logger.info("Vocoder model: {}".format(self.config.voc)) - logger.info("Language: {}".format(lang)) + logger.debug("AM model: {}".format(self.config.am)) + logger.debug("Vocoder model: {}".format(self.config.voc)) + logger.debug("Language: {}".format(lang)) logger.info("tts engine type: python") logger.info("audio duration: {}".format(duration)) - logger.info("frontend inference time: {}".format(self.frontend_time)) - logger.info("AM inference time: {}".format(self.am_time)) - logger.info("Vocoder inference time: {}".format(self.voc_time)) + logger.debug("frontend inference time: {}".format(self.frontend_time)) + logger.debug("AM inference time: {}".format(self.am_time)) + logger.debug("Vocoder inference time: {}".format(self.voc_time)) logger.info("total inference time: {}".format(infer_time)) logger.info( "postprocess (change speed, volume, target sample rate) time: {}". @@ -503,6 +516,6 @@ class PaddleTTSConnectionHandler(TTSServerExecutor): logger.info("total generate audio time: {}".format(infer_time + postprocess_time)) logger.info("RTF: {}".format(rtf)) - logger.info("device: {}".format(self.tts_engine.device)) + logger.debug("device: {}".format(self.tts_engine.device)) return lang, target_sample_rate, duration, wav_base64 diff --git a/paddlespeech/server/engine/tts/python/tts_engine.py b/paddlespeech/server/engine/tts/python/tts_engine.py index b048b01a49f1cf34a1edd4b10d5b85da74e579f4..4d1801006b87699cbbe19660b103ffcb4068c446 100644 --- a/paddlespeech/server/engine/tts/python/tts_engine.py +++ b/paddlespeech/server/engine/tts/python/tts_engine.py @@ -105,7 +105,7 @@ class PaddleTTSConnectionHandler(TTSServerExecutor): tts_engine (TTSEngine): The TTS engine """ super().__init__() - logger.info( + logger.debug( "Create PaddleTTSConnectionHandler to process the tts request") self.tts_engine = tts_engine @@ -143,23 +143,23 @@ class PaddleTTSConnectionHandler(TTSServerExecutor): if target_fs == 0 or target_fs > original_fs: target_fs = original_fs wav_tar_fs = wav - logger.info( + logger.debug( "The sample rate of synthesized audio is the same as model, which is {}Hz". format(original_fs)) else: wav_tar_fs = librosa.resample( np.squeeze(wav), original_fs, target_fs) - logger.info( + logger.debug( "The sample rate of model is {}Hz and the target sample rate is {}Hz. Converting the sample rate of the synthesized audio successfully.". 
format(original_fs, target_fs)) # transform volume wav_vol = wav_tar_fs * volume - logger.info("Transform the volume of the audio successfully.") + logger.debug("Transform the volume of the audio successfully.") # transform speed try: # windows not support soxbindings wav_speed = change_speed(wav_vol, speed, target_fs) - logger.info("Transform the speed of the audio successfully.") + logger.debug("Transform the speed of the audio successfully.") except ServerBaseException: raise ServerBaseException( ErrorCode.SERVER_INTERNAL_ERR, @@ -176,7 +176,7 @@ class PaddleTTSConnectionHandler(TTSServerExecutor): wavfile.write(buf, target_fs, wav_speed) base64_bytes = base64.b64encode(buf.read()) wav_base64 = base64_bytes.decode('utf-8') - logger.info("Audio to string successfully.") + logger.debug("Audio to string successfully.") # save audio if audio_path is not None: @@ -264,15 +264,15 @@ class PaddleTTSConnectionHandler(TTSServerExecutor): logger.error(e) sys.exit(-1) - logger.info("AM model: {}".format(self.config.am)) - logger.info("Vocoder model: {}".format(self.config.voc)) - logger.info("Language: {}".format(lang)) + logger.debug("AM model: {}".format(self.config.am)) + logger.debug("Vocoder model: {}".format(self.config.voc)) + logger.debug("Language: {}".format(lang)) logger.info("tts engine type: python") logger.info("audio duration: {}".format(duration)) - logger.info("frontend inference time: {}".format(self.frontend_time)) - logger.info("AM inference time: {}".format(self.am_time)) - logger.info("Vocoder inference time: {}".format(self.voc_time)) + logger.debug("frontend inference time: {}".format(self.frontend_time)) + logger.debug("AM inference time: {}".format(self.am_time)) + logger.debug("Vocoder inference time: {}".format(self.voc_time)) logger.info("total inference time: {}".format(infer_time)) logger.info( "postprocess (change speed, volume, target sample rate) time: {}". 
@@ -280,6 +280,6 @@ class PaddleTTSConnectionHandler(TTSServerExecutor): logger.info("total generate audio time: {}".format(infer_time + postprocess_time)) logger.info("RTF: {}".format(rtf)) - logger.info("device: {}".format(self.tts_engine.device)) + logger.debug("device: {}".format(self.tts_engine.device)) return lang, target_sample_rate, duration, wav_base64 diff --git a/paddlespeech/server/engine/vector/python/vector_engine.py b/paddlespeech/server/engine/vector/python/vector_engine.py index 3c72f55d4b61328db8ca91b976d4f34071974195..f7d60648d040e0bd3a60883e0a5a3900689b8754 100644 --- a/paddlespeech/server/engine/vector/python/vector_engine.py +++ b/paddlespeech/server/engine/vector/python/vector_engine.py @@ -33,7 +33,7 @@ class PaddleVectorConnectionHandler: vector_engine (VectorEngine): The Vector engine """ super().__init__() - logger.info( + logger.debug( "Create PaddleVectorConnectionHandler to process the vector request") self.vector_engine = vector_engine self.executor = self.vector_engine.executor @@ -54,7 +54,7 @@ class PaddleVectorConnectionHandler: Returns: str: the punctuation text """ - logger.info( + logger.debug( f"start to extract the do vector {self.task} from the http request") if self.task == "spk" and task == "spk": embedding = self.extract_audio_embedding(audio_data) @@ -81,17 +81,17 @@ class PaddleVectorConnectionHandler: Returns: float: the score between enroll and test audio """ - logger.info("start to extract the enroll audio embedding") + logger.debug("start to extract the enroll audio embedding") enroll_emb = self.extract_audio_embedding(enroll_audio) - logger.info("start to extract the test audio embedding") + logger.debug("start to extract the test audio embedding") test_emb = self.extract_audio_embedding(test_audio) - logger.info( + logger.debug( "start to get the score between the enroll and test embedding") score = self.executor.get_embeddings_score(enroll_emb, test_emb) - logger.info(f"get the enroll vs test score: {score}") + logger.debug(f"get the enroll vs test score: {score}") return score @paddle.no_grad() @@ -106,11 +106,12 @@ class PaddleVectorConnectionHandler: # because the soundfile will change the io.BytesIO(audio) to the end # thus we should convert the base64 string to io.BytesIO when we need the audio data if not self.executor._check(io.BytesIO(audio), sample_rate): - logger.info("check the audio sample rate occurs error") + logger.debug("check the audio sample rate occurs error") return np.array([0.0]) waveform, sr = load_audio(io.BytesIO(audio)) - logger.info(f"load the audio sample points, shape is: {waveform.shape}") + logger.debug( + f"load the audio sample points, shape is: {waveform.shape}") # stage 2: get the audio feat # Note: Now we only support fbank feature @@ -121,9 +122,9 @@ class PaddleVectorConnectionHandler: n_mels=self.config.n_mels, window_size=self.config.window_size, hop_length=self.config.hop_size) - logger.info(f"extract the audio feats, shape is: {feats.shape}") + logger.debug(f"extract the audio feats, shape is: {feats.shape}") except Exception as e: - logger.info(f"feats occurs exception {e}") + logger.error(f"feats occurs exception {e}") sys.exit(-1) feats = paddle.to_tensor(feats).unsqueeze(0) @@ -159,7 +160,7 @@ class VectorEngine(BaseEngine): """The Vector Engine """ super(VectorEngine, self).__init__() - logger.info("Create the VectorEngine Instance") + logger.debug("Create the VectorEngine Instance") def init(self, config: dict): """Init the Vector Engine @@ -170,7 +171,7 @@ class VectorEngine(BaseEngine): 
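
The speaker-verification hunks above only touch log levels; the flow itself is unchanged: extract an embedding for the enrolment audio, another for the test audio, then compare them with `executor.get_embeddings_score`. As a rough stand-in for that comparison step, assuming a cosine-similarity style score (the real executor may normalise or scale differently):

```python
import numpy as np

def embeddings_score(enroll_emb: np.ndarray, test_emb: np.ndarray) -> float:
    # Hedged stand-in for executor.get_embeddings_score: cosine similarity
    # between the enrolment and test speaker embeddings.
    enroll = enroll_emb / np.linalg.norm(enroll_emb)
    test = test_emb / np.linalg.norm(test_emb)
    return float(np.dot(enroll, test))
```
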
Returns: bool: The engine instance flag """ - logger.info("Init the vector engine") + logger.debug("Init the vector engine") try: self.config = config if self.config.device: @@ -179,7 +180,7 @@ class VectorEngine(BaseEngine): self.device = paddle.get_device() paddle.set_device(self.device) - logger.info(f"Vector Engine set the device: {self.device}") + logger.debug(f"Vector Engine set the device: {self.device}") except BaseException as e: logger.error( "Set device failed, please check if device is already used and the parameter 'device' in the yaml file" @@ -196,5 +197,7 @@ class VectorEngine(BaseEngine): ckpt_path=config.ckpt_path, task=config.task) - logger.info("Init the Vector engine successfully") + logger.info( + "Initialize Vector server engine successfully on device: %s." % + (self.device)) return True diff --git a/paddlespeech/server/restful/tts_api.py b/paddlespeech/server/restful/tts_api.py index 53fe159fdc02f75f76d438d8ab5876d440fc19c0..61e4c49f33818264bd999e873945ef2765abad7d 100644 --- a/paddlespeech/server/restful/tts_api.py +++ b/paddlespeech/server/restful/tts_api.py @@ -140,7 +140,9 @@ def tts(request_body: TTSRequest): @router.post("/paddlespeech/tts/streaming") async def stream_tts(request_body: TTSRequest): + # get params text = request_body.text + spk_id = request_body.spk_id engine_pool = get_engine_pool() tts_engine = engine_pool['tts'] @@ -156,4 +158,24 @@ async def stream_tts(request_body: TTSRequest): connection_handler = PaddleTTSConnectionHandler(tts_engine) - return StreamingResponse(connection_handler.run(sentence=text)) + return StreamingResponse( + connection_handler.run(sentence=text, spk_id=spk_id)) + + +@router.get("/paddlespeech/tts/streaming/samplerate") +def get_samplerate(): + try: + engine_pool = get_engine_pool() + tts_engine = engine_pool['tts'] + logger.info("Get tts engine successfully.") + sample_rate = tts_engine.sample_rate + + response = {"sample_rate": sample_rate} + + except ServerBaseException as e: + response = failed_response(e.error_code, e.msg) + except BaseException: + response = failed_response(ErrorCode.SERVER_UNKOWN_ERR) + traceback.print_exc() + + return response diff --git a/paddlespeech/server/utils/audio_handler.py b/paddlespeech/server/utils/audio_handler.py index 0f09069beb28c592d490c102f6f234f5afb23723..b5629037a838fff1b87922972f1cba4e30e71443 100644 --- a/paddlespeech/server/utils/audio_handler.py +++ b/paddlespeech/server/utils/audio_handler.py @@ -138,7 +138,7 @@ class ASRWsAudioHandler: Returns: str: the final asr result """ - logging.info("send a message to the server") + logging.debug("send a message to the server") if self.url is None: logger.error("No asr server, please input valid ip and port") @@ -160,7 +160,7 @@ class ASRWsAudioHandler: separators=(',', ': ')) await ws.send(audio_info) msg = await ws.recv() - logger.info("client receive msg={}".format(msg)) + logger.debug("client receive msg={}".format(msg)) # 3. send chunk audio data to engine for chunk_data in self.read_wave(wavfile_path): @@ -170,7 +170,7 @@ class ASRWsAudioHandler: if self.punc_server and len(msg["result"]) > 0: msg["result"] = self.punc_server.run(msg["result"]) - logger.info("client receive msg={}".format(msg)) + logger.debug("client receive msg={}".format(msg)) # 4. 
we must send finished signal to the server audio_info = json.dumps( @@ -299,10 +299,7 @@ class TTSWsHandler: self.buffer = b'' self.mutex.release() - async def run(self, - text: str, - spk_id=0, - output: str=None): + async def run(self, text: str, spk_id=0, output: str=None): """Send a text to online server Args: @@ -320,7 +317,7 @@ class TTSWsHandler: start_request = json.dumps({"task": "tts", "signal": "start"}) await ws.send(start_request) msg = await ws.recv() - logger.info(f"client receive msg={msg}") + logger.debug(f"client receive msg={msg}") msg = json.loads(msg) session = msg["session"] @@ -334,7 +331,7 @@ class TTSWsHandler: request = json.dumps(params) st = time.time() await ws.send(request) - logging.info("send a message to the server") + logging.debug("send a message to the server") # 4. Process the received response message = await ws.recv() @@ -359,7 +356,8 @@ class TTSWsHandler: duration = len(all_bytes) / 2.0 / self.sample_rate if output is not None: - save_audio_success = save_audio(all_bytes, output, self.sample_rate) + save_audio_success = save_audio(all_bytes, output, + self.sample_rate) else: save_audio_success = False @@ -377,7 +375,8 @@ class TTSWsHandler: receive_time_list.append(time.time()) audio = message["audio"] audio = base64.b64decode(audio) # bytes - chunk_duration_list.append(len(audio) / 2.0 / self.sample_rate) + chunk_duration_list.append( + len(audio) / 2.0 / self.sample_rate) all_bytes += audio if self.play: self.mutex.acquire() @@ -437,7 +436,7 @@ class TTSHttpHandler: output=True) self.mutex = threading.Lock() self.t = threading.Thread(target=self.play_audio) - + logger.info(f"endpoint: {self.url}") def play_audio(self): @@ -452,10 +451,7 @@ class TTSHttpHandler: self.buffer = b'' self.mutex.release() - def run(self, - text: str, - spk_id=0, - output: str=None): + def run(self, text: str, spk_id=0, output: str=None): """Send a text to tts online server Args: @@ -463,7 +459,7 @@ class TTSHttpHandler: spk_id (int, optional): speaker id. Defaults to 0. output (str, optional): client save audio path. Defaults to None. """ - + # 1. Create request params = { "text": text, @@ -556,7 +552,6 @@ class VectorHttpHandler: "sample_rate": sample_rate, } - logger.info(self.url) res = requests.post(url=self.url, data=json.dumps(data)) return res.json() diff --git a/paddlespeech/server/utils/audio_process.py b/paddlespeech/server/utils/audio_process.py index 416d77ac41d02794ce8bd5ec3de4f1fd8f5add9a..ae53839794877497c80175bb23bb4ad560dac61f 100644 --- a/paddlespeech/server/utils/audio_process.py +++ b/paddlespeech/server/utils/audio_process.py @@ -169,7 +169,7 @@ def save_audio(bytes_data, audio_path, sample_rate: int=24000) -> bool: sample_rate=sample_rate) os.remove("./tmp.pcm") else: - print("Only supports saved audio format is pcm or wav") + logger.error("Only supports saved audio format is pcm or wav") return False return True diff --git a/paddlespeech/server/utils/log.py b/paddlespeech/server/utils/log.py deleted file mode 100644 index 8644064c73ef407476e7870e65d1149019762723..0000000000000000000000000000000000000000 --- a/paddlespeech/server/utils/log.py +++ /dev/null @@ -1,59 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
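
The REST changes a little earlier in this patch add a `spk_id` field to the streaming synthesis request and a new GET `/paddlespeech/tts/streaming/samplerate` route that reports the engine's output sample rate. A hedged client sketch (the host, port, and the 16-bit PCM chunk format are assumptions, the latter based on how the websocket client in this patch measures duration):

```python
import requests

# Assumed local server address; adjust to the deployed host/port.
BASE = "http://127.0.0.1:8092"

# 1. Ask the server which sample rate the streaming TTS engine produces.
sample_rate = requests.get(
    f"{BASE}/paddlespeech/tts/streaming/samplerate").json()["sample_rate"]

# 2. Stream synthesized audio for a given text and speaker id.
resp = requests.post(
    f"{BASE}/paddlespeech/tts/streaming",
    json={"text": "您好，欢迎使用语音合成服务。", "spk_id": 0},
    stream=True)

pcm = b""
for chunk in resp.iter_content(chunk_size=None):
    pcm += chunk

# Assuming 16-bit mono PCM, as the websocket client in this patch does (len / 2 / sample_rate).
print(f"received {len(pcm) / 2.0 / sample_rate:.2f} s of audio at {sample_rate} Hz")
```
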
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import functools -import logging - -__all__ = [ - 'logger', -] - - -class Logger(object): - def __init__(self, name: str=None): - name = 'PaddleSpeech' if not name else name - self.logger = logging.getLogger(name) - - log_config = { - 'DEBUG': 10, - 'INFO': 20, - 'TRAIN': 21, - 'EVAL': 22, - 'WARNING': 30, - 'ERROR': 40, - 'CRITICAL': 50, - 'EXCEPTION': 100, - } - for key, level in log_config.items(): - logging.addLevelName(level, key) - if key == 'EXCEPTION': - self.__dict__[key.lower()] = self.logger.exception - else: - self.__dict__[key.lower()] = functools.partial(self.__call__, - level) - - self.format = logging.Formatter( - fmt='[%(asctime)-15s] [%(levelname)8s] - %(message)s') - - self.handler = logging.StreamHandler() - self.handler.setFormatter(self.format) - - self.logger.addHandler(self.handler) - self.logger.setLevel(logging.DEBUG) - self.logger.propagate = False - - def __call__(self, log_level: str, msg: str): - self.logger.log(log_level, msg) - - -logger = Logger() diff --git a/paddlespeech/server/utils/onnx_infer.py b/paddlespeech/server/utils/onnx_infer.py index 087eb9c006a1dc764cbbefa4756af49aa3389376..23d83c735a7b1aa0853c37db8e231656712110f2 100644 --- a/paddlespeech/server/utils/onnx_infer.py +++ b/paddlespeech/server/utils/onnx_infer.py @@ -20,7 +20,7 @@ from paddlespeech.cli.log import logger def get_sess(model_path: Optional[os.PathLike]=None, sess_conf: dict=None): - logger.info(f"ort sessconf: {sess_conf}") + logger.debug(f"ort sessconf: {sess_conf}") sess_options = ort.SessionOptions() sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL if sess_conf.get('graph_optimization_level', 99) == 0: @@ -34,7 +34,7 @@ def get_sess(model_path: Optional[os.PathLike]=None, sess_conf: dict=None): # fastspeech2/mb_melgan can't use trt now! 
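
With the server's standalone Logger wrapper above removed, the engines rely on the shared `paddlespeech.cli.log` logger, and most per-request progress messages in this patch drop from info to debug while failures are promoted to error. To see the debug chatter again during development, the logger level has to be lowered; a sketch assuming the shared logger keeps the 'PaddleSpeech' logger name used by the deleted wrapper:

```python
import logging

# Assumption: the shared CLI logger registers itself under the same
# 'PaddleSpeech' name as the wrapper deleted above; adjust if the deployment differs.
logger = logging.getLogger("PaddleSpeech")

logger.setLevel(logging.INFO)   # production-style: hides the debug chatter from this patch
logger.setLevel(logging.DEBUG)  # development: per-request progress messages show up again
```
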
if sess_conf.get("use_trt", 0): providers = ['TensorrtExecutionProvider'] - logger.info(f"ort providers: {providers}") + logger.debug(f"ort providers: {providers}") if 'cpu_threads' in sess_conf: sess_options.intra_op_num_threads = sess_conf.get("cpu_threads", 0) diff --git a/paddlespeech/server/utils/util.py b/paddlespeech/server/utils/util.py index 061b213c78360d523d1cc3cc180f93cfaac387ab..826d923ed0f255e5411e3192b6c4d680ef14c933 100644 --- a/paddlespeech/server/utils/util.py +++ b/paddlespeech/server/utils/util.py @@ -13,6 +13,8 @@ import base64 import math +from paddlespeech.cli.log import logger + def wav2base64(wav_file: str): """ @@ -61,7 +63,7 @@ def get_chunks(data, block_size, pad_size, step): elif step == "voc": data_len = data.shape[0] else: - print("Please set correct type to get chunks, am or voc") + logger.error("Please set correct type to get chunks, am or voc") chunks = [] n = math.ceil(data_len / block_size) @@ -73,7 +75,7 @@ def get_chunks(data, block_size, pad_size, step): elif step == "voc": chunks.append(data[start:end, :]) else: - print("Please set correct type to get chunks, am or voc") + logger.error("Please set correct type to get chunks, am or voc") return chunks diff --git a/paddlespeech/server/ws/tts_api.py b/paddlespeech/server/ws/tts_api.py index 3d8b222ead1f8568417e2ba005b04cd2ddd6fbef..275711f5879c04cf5d8a430115c55186a1f59a45 100644 --- a/paddlespeech/server/ws/tts_api.py +++ b/paddlespeech/server/ws/tts_api.py @@ -87,12 +87,12 @@ async def websocket_endpoint(websocket: WebSocket): # speech synthesis request elif 'text' in message: - text_bese64 = message["text"] - sentence = connection_handler.preprocess( - text_bese64=text_bese64) + text = message["text"] + spk_id = message["spk_id"] # run - wav_generator = connection_handler.run(sentence) + wav_generator = connection_handler.run( + sentence=text, spk_id=spk_id) while True: try: @@ -116,3 +116,22 @@ async def websocket_endpoint(websocket: WebSocket): except Exception as e: logger.error(e) + + +@router.get("/paddlespeech/tts/streaming/samplerate") +def get_samplerate(): + try: + engine_pool = get_engine_pool() + tts_engine = engine_pool['tts'] + logger.info("Get tts engine successfully.") + sample_rate = tts_engine.sample_rate + + response = {"sample_rate": sample_rate} + + except ServerBaseException as e: + response = failed_response(e.error_code, e.msg) + except BaseException: + response = failed_response(ErrorCode.SERVER_UNKOWN_ERR) + traceback.print_exc() + + return response diff --git a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py index 48595bb25ca2241b74ebe22be6325708564b5699..347a10e90a2b80d09ddee07d14a19db12377c4be 100644 --- a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py +++ b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py @@ -141,71 +141,133 @@ class FastSpeech2(nn.Layer): init_dec_alpha: float=1.0, ): """Initialize FastSpeech2 module. Args: - idim (int): Dimension of the inputs. - odim (int): Dimension of the outputs. - adim (int): Attention dimension. - aheads (int): Number of attention heads. - elayers (int): Number of encoder layers. - eunits (int): Number of encoder hidden units. - dlayers (int): Number of decoder layers. - dunits (int): Number of decoder hidden units. - postnet_layers (int): Number of postnet layers. - postnet_chans (int): Number of postnet channels. - postnet_filts (int): Kernel size of postnet. - postnet_dropout_rate (float): Dropout rate in postnet. 
- use_scaled_pos_enc (bool): Whether to use trainable scaled pos encoding. - use_batch_norm (bool): Whether to use batch normalization in encoder prenet. - encoder_normalize_before (bool): Whether to apply layernorm layer before encoder block. - decoder_normalize_before (bool): Whether to apply layernorm layer before decoder block. - encoder_concat_after (bool): Whether to concatenate attention layer's input and output in encoder. - decoder_concat_after (bool): Whether to concatenate attention layer's input and output in decoder. - reduction_factor (int): Reduction factor. - encoder_type (str): Encoder type ("transformer" or "conformer"). - decoder_type (str): Decoder type ("transformer" or "conformer"). - transformer_enc_dropout_rate (float): Dropout rate in encoder except attention and positional encoding. - transformer_enc_positional_dropout_rate (float): Dropout rate after encoder positional encoding. - transformer_enc_attn_dropout_rate (float): Dropout rate in encoder self-attention module. - transformer_dec_dropout_rate (float): Dropout rate in decoder except attention & positional encoding. - transformer_dec_positional_dropout_rate (float): Dropout rate after decoder positional encoding. - transformer_dec_attn_dropout_rate (float): Dropout rate in decoder self-attention module. - conformer_pos_enc_layer_type (str): Pos encoding layer type in conformer. - conformer_self_attn_layer_type (str): Self-attention layer type in conformer - conformer_activation_type (str): Activation function type in conformer. - use_macaron_style_in_conformer (bool): Whether to use macaron style FFN. - use_cnn_in_conformer (bool): Whether to use CNN in conformer. - zero_triu (bool): Whether to use zero triu in relative self-attention module. - conformer_enc_kernel_size (int): Kernel size of encoder conformer. - conformer_dec_kernel_size (int): Kernel size of decoder conformer. - duration_predictor_layers (int): Number of duration predictor layers. - duration_predictor_chans (int): Number of duration predictor channels. - duration_predictor_kernel_size (int): Kernel size of duration predictor. - duration_predictor_dropout_rate (float): Dropout rate in duration predictor. - pitch_predictor_layers (int): Number of pitch predictor layers. - pitch_predictor_chans (int): Number of pitch predictor channels. - pitch_predictor_kernel_size (int): Kernel size of pitch predictor. - pitch_predictor_dropout_rate (float): Dropout rate in pitch predictor. - pitch_embed_kernel_size (float): Kernel size of pitch embedding. - pitch_embed_dropout_rate (float): Dropout rate for pitch embedding. - stop_gradient_from_pitch_predictor (bool): Whether to stop gradient from pitch predictor to encoder. - energy_predictor_layers (int): Number of energy predictor layers. - energy_predictor_chans (int): Number of energy predictor channels. - energy_predictor_kernel_size (int): Kernel size of energy predictor. - energy_predictor_dropout_rate (float): Dropout rate in energy predictor. - energy_embed_kernel_size (float): Kernel size of energy embedding. - energy_embed_dropout_rate (float): Dropout rate for energy embedding. - stop_gradient_from_energy_predictor(bool): Whether to stop gradient from energy predictor to encoder. - spk_num (Optional[int]): Number of speakers. If not None, assume that the spk_embed_dim is not None, + idim (int): + Dimension of the inputs. + odim (int): + Dimension of the outputs. + adim (int): + Attention dimension. + aheads (int): + Number of attention heads. + elayers (int): + Number of encoder layers. 
+ eunits (int): + Number of encoder hidden units. + dlayers (int): + Number of decoder layers. + dunits (int): + Number of decoder hidden units. + postnet_layers (int): + Number of postnet layers. + postnet_chans (int): + Number of postnet channels. + postnet_filts (int): + Kernel size of postnet. + postnet_dropout_rate (float): + Dropout rate in postnet. + use_scaled_pos_enc (bool): + Whether to use trainable scaled pos encoding. + use_batch_norm (bool): + Whether to use batch normalization in encoder prenet. + encoder_normalize_before (bool): + Whether to apply layernorm layer before encoder block. + decoder_normalize_before (bool): + Whether to apply layernorm layer before decoder block. + encoder_concat_after (bool): + Whether to concatenate attention layer's input and output in encoder. + decoder_concat_after (bool): + Whether to concatenate attention layer's input and output in decoder. + reduction_factor (int): + Reduction factor. + encoder_type (str): + Encoder type ("transformer" or "conformer"). + decoder_type (str): + Decoder type ("transformer" or "conformer"). + transformer_enc_dropout_rate (float): + Dropout rate in encoder except attention and positional encoding. + transformer_enc_positional_dropout_rate (float): + Dropout rate after encoder positional encoding. + transformer_enc_attn_dropout_rate (float): + Dropout rate in encoder self-attention module. + transformer_dec_dropout_rate (float): + Dropout rate in decoder except attention & positional encoding. + transformer_dec_positional_dropout_rate (float): + Dropout rate after decoder positional encoding. + transformer_dec_attn_dropout_rate (float): + Dropout rate in decoder self-attention module. + conformer_pos_enc_layer_type (str): + Pos encoding layer type in conformer. + conformer_self_attn_layer_type (str): + Self-attention layer type in conformer + conformer_activation_type (str): + Activation function type in conformer. + use_macaron_style_in_conformer (bool): + Whether to use macaron style FFN. + use_cnn_in_conformer (bool): + Whether to use CNN in conformer. + zero_triu (bool): + Whether to use zero triu in relative self-attention module. + conformer_enc_kernel_size (int): + Kernel size of encoder conformer. + conformer_dec_kernel_size (int): + Kernel size of decoder conformer. + duration_predictor_layers (int): + Number of duration predictor layers. + duration_predictor_chans (int): + Number of duration predictor channels. + duration_predictor_kernel_size (int): + Kernel size of duration predictor. + duration_predictor_dropout_rate (float): + Dropout rate in duration predictor. + pitch_predictor_layers (int): + Number of pitch predictor layers. + pitch_predictor_chans (int): + Number of pitch predictor channels. + pitch_predictor_kernel_size (int): + Kernel size of pitch predictor. + pitch_predictor_dropout_rate (float): + Dropout rate in pitch predictor. + pitch_embed_kernel_size (float): + Kernel size of pitch embedding. + pitch_embed_dropout_rate (float): + Dropout rate for pitch embedding. + stop_gradient_from_pitch_predictor (bool): + Whether to stop gradient from pitch predictor to encoder. + energy_predictor_layers (int): + Number of energy predictor layers. + energy_predictor_chans (int): + Number of energy predictor channels. + energy_predictor_kernel_size (int): + Kernel size of energy predictor. + energy_predictor_dropout_rate (float): + Dropout rate in energy predictor. + energy_embed_kernel_size (float): + Kernel size of energy embedding. 
+ energy_embed_dropout_rate (float): + Dropout rate for energy embedding. + stop_gradient_from_energy_predictor(bool): + Whether to stop gradient from energy predictor to encoder. + spk_num (Optional[int]): + Number of speakers. If not None, assume that the spk_embed_dim is not None, spk_ids will be provided as the input and use spk_embedding_table. - spk_embed_dim (Optional[int]): Speaker embedding dimension. If not None, + spk_embed_dim (Optional[int]): + Speaker embedding dimension. If not None, assume that spk_emb will be provided as the input or spk_num is not None. - spk_embed_integration_type (str): How to integrate speaker embedding. - tone_num (Optional[int]): Number of tones. If not None, assume that the + spk_embed_integration_type (str): + How to integrate speaker embedding. + tone_num (Optional[int]): + Number of tones. If not None, assume that the tone_ids will be provided as the input and use tone_embedding_table. - tone_embed_dim (Optional[int]): Tone embedding dimension. If not None, assume that tone_num is not None. - tone_embed_integration_type (str): How to integrate tone embedding. - init_type (str): How to initialize transformer parameters. - init_enc_alpha (float): Initial value of alpha in scaled pos encoding of the encoder. - init_dec_alpha (float): Initial value of alpha in scaled pos encoding of the decoder. + tone_embed_dim (Optional[int]): + Tone embedding dimension. If not None, assume that tone_num is not None. + tone_embed_integration_type (str): + How to integrate tone embedding. + init_type (str): + How to initialize transformer parameters. + init_enc_alpha (float): + Initial value of alpha in scaled pos encoding of the encoder. + init_dec_alpha (float): + Initial value of alpha in scaled pos encoding of the decoder. """ assert check_argument_types() @@ -258,7 +320,6 @@ class FastSpeech2(nn.Layer): padding_idx=self.padding_idx) if encoder_type == "transformer": - print("encoder_type is transformer") self.encoder = TransformerEncoder( idim=idim, attention_dim=adim, @@ -275,7 +336,6 @@ class FastSpeech2(nn.Layer): positionwise_layer_type=positionwise_layer_type, positionwise_conv_kernel_size=positionwise_conv_kernel_size, ) elif encoder_type == "conformer": - print("encoder_type is conformer") self.encoder = ConformerEncoder( idim=idim, attention_dim=adim, @@ -362,7 +422,6 @@ class FastSpeech2(nn.Layer): # NOTE: we use encoder as decoder # because fastspeech's decoder is the same as encoder if decoder_type == "transformer": - print("decoder_type is transformer") self.decoder = TransformerEncoder( idim=0, attention_dim=adim, @@ -380,7 +439,6 @@ class FastSpeech2(nn.Layer): positionwise_layer_type=positionwise_layer_type, positionwise_conv_kernel_size=positionwise_conv_kernel_size, ) elif decoder_type == "conformer": - print("decoder_type is conformer") self.decoder = ConformerEncoder( idim=0, attention_dim=adim, @@ -453,20 +511,29 @@ class FastSpeech2(nn.Layer): """Calculate forward propagation. Args: - text(Tensor(int64)): Batch of padded token ids (B, Tmax). - text_lengths(Tensor(int64)): Batch of lengths of each input (B,). - speech(Tensor): Batch of padded target features (B, Lmax, odim). - speech_lengths(Tensor(int64)): Batch of the lengths of each target (B,). - durations(Tensor(int64)): Batch of padded durations (B, Tmax). - pitch(Tensor): Batch of padded token-averaged pitch (B, Tmax, 1). - energy(Tensor): Batch of padded token-averaged energy (B, Tmax, 1). - tone_id(Tensor, optional(int64)): Batch of padded tone ids (B, Tmax). 
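For orientation, a hedged sketch of constructing and running the FastSpeech2 module whose constructor arguments are listed above. The import path and the assumption that every argument other than idim and odim keeps its default are not confirmed by this diff, so verify them against the installed package:

```python
import paddle
from paddlespeech.t2s.models.fastspeech2 import FastSpeech2  # import path assumed, not shown in this diff

# idim = size of the token vocabulary, odim = number of mel bins; every other
# constructor argument is assumed to keep the default documented above.
model = FastSpeech2(idim=100, odim=80)
model.eval()

text = paddle.to_tensor([1, 5, 9, 3, 2], dtype='int64')      # (T,) token ids
with paddle.no_grad():
    # Generates the feature sequence for one utterance; the Returns section in
    # the docstring is left empty, so unpack the result per the installed version.
    outs = model.inference(text)
```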
- spk_emb(Tensor, optional): Batch of speaker embeddings (B, spk_embed_dim). - spk_id(Tnesor, optional(int64)): Batch of speaker ids (B,) + text(Tensor(int64)): + Batch of padded token ids (B, Tmax). + text_lengths(Tensor(int64)): + Batch of lengths of each input (B,). + speech(Tensor): + Batch of padded target features (B, Lmax, odim). + speech_lengths(Tensor(int64)): + Batch of the lengths of each target (B,). + durations(Tensor(int64)): + Batch of padded durations (B, Tmax). + pitch(Tensor): + Batch of padded token-averaged pitch (B, Tmax, 1). + energy(Tensor): + Batch of padded token-averaged energy (B, Tmax, 1). + tone_id(Tensor, optional(int64)): + Batch of padded tone ids (B, Tmax). + spk_emb(Tensor, optional): + Batch of speaker embeddings (B, spk_embed_dim). + spk_id(Tnesor, optional(int64)): + Batch of speaker ids (B,) Returns: - """ # input of embedding must be int64 @@ -662,20 +729,28 @@ class FastSpeech2(nn.Layer): """Generate the sequence of features given the sequences of characters. Args: - text(Tensor(int64)): Input sequence of characters (T,). - durations(Tensor, optional (int64)): Groundtruth of duration (T,). - pitch(Tensor, optional): Groundtruth of token-averaged pitch (T, 1). - energy(Tensor, optional): Groundtruth of token-averaged energy (T, 1). - alpha(float, optional): Alpha to control the speed. - use_teacher_forcing(bool, optional): Whether to use teacher forcing. + text(Tensor(int64)): + Input sequence of characters (T,). + durations(Tensor, optional (int64)): + Groundtruth of duration (T,). + pitch(Tensor, optional): + Groundtruth of token-averaged pitch (T, 1). + energy(Tensor, optional): + Groundtruth of token-averaged energy (T, 1). + alpha(float, optional): + Alpha to control the speed. + use_teacher_forcing(bool, optional): + Whether to use teacher forcing. If true, groundtruth of duration, pitch and energy will be used. - spk_emb(Tensor, optional, optional): peaker embedding vector (spk_embed_dim,). (Default value = None) - spk_id(Tensor, optional(int64), optional): spk ids (1,). (Default value = None) - tone_id(Tensor, optional(int64), optional): tone ids (T,). (Default value = None) + spk_emb(Tensor, optional, optional): + peaker embedding vector (spk_embed_dim,). (Default value = None) + spk_id(Tensor, optional(int64), optional): + spk ids (1,). (Default value = None) + tone_id(Tensor, optional(int64), optional): + tone ids (T,). (Default value = None) Returns: - """ # input of embedding must be int64 x = paddle.cast(text, 'int64') @@ -724,8 +799,10 @@ class FastSpeech2(nn.Layer): """Integrate speaker embedding with hidden states. Args: - hs(Tensor): Batch of hidden state sequences (B, Tmax, adim). - spk_emb(Tensor): Batch of speaker embeddings (B, spk_embed_dim). + hs(Tensor): + Batch of hidden state sequences (B, Tmax, adim). + spk_emb(Tensor): + Batch of speaker embeddings (B, spk_embed_dim). Returns: @@ -749,8 +826,10 @@ class FastSpeech2(nn.Layer): """Integrate speaker embedding with hidden states. Args: - hs(Tensor): Batch of hidden state sequences (B, Tmax, adim). - tone_embs(Tensor): Batch of speaker embeddings (B, Tmax, tone_embed_dim). + hs(Tensor): + Batch of hidden state sequences (B, Tmax, adim). + tone_embs(Tensor): + Batch of speaker embeddings (B, Tmax, tone_embed_dim). Returns: @@ -773,10 +852,12 @@ class FastSpeech2(nn.Layer): """Make masks for self-attention. Args: - ilens(Tensor): Batch of lengths (B,). + ilens(Tensor): + Batch of lengths (B,). Returns: - Tensor: Mask tensor for self-attention. 
dtype=paddle.bool + Tensor: + Mask tensor for self-attention. dtype=paddle.bool Examples: >>> ilens = [5, 3] @@ -858,19 +939,32 @@ class StyleFastSpeech2Inference(FastSpeech2Inference): """ Args: - text(Tensor(int64)): Input sequence of characters (T,). - durations(paddle.Tensor/np.ndarray, optional (int64)): Groundtruth of duration (T,), this will overwrite the set of durations_scale and durations_bias + text(Tensor(int64)): + Input sequence of characters (T,). + durations(paddle.Tensor/np.ndarray, optional (int64)): + Groundtruth of duration (T,), this will overwrite the set of durations_scale and durations_bias durations_scale(int/float, optional): + durations_bias(int/float, optional): - pitch(paddle.Tensor/np.ndarray, optional): Groundtruth of token-averaged pitch (T, 1), this will overwrite the set of pitch_scale and pitch_bias - pitch_scale(int/float, optional): In denormed HZ domain. - pitch_bias(int/float, optional): In denormed HZ domain. - energy(paddle.Tensor/np.ndarray, optional): Groundtruth of token-averaged energy (T, 1), this will overwrite the set of energy_scale and energy_bias - energy_scale(int/float, optional): In denormed domain. - energy_bias(int/float, optional): In denormed domain. - robot: bool: (Default value = False) - spk_emb: (Default value = None) - spk_id: (Default value = None) + + pitch(paddle.Tensor/np.ndarray, optional): + Groundtruth of token-averaged pitch (T, 1), this will overwrite the set of pitch_scale and pitch_bias + pitch_scale(int/float, optional): + In denormed HZ domain. + pitch_bias(int/float, optional): + In denormed HZ domain. + energy(paddle.Tensor/np.ndarray, optional): + Groundtruth of token-averaged energy (T, 1), this will overwrite the set of energy_scale and energy_bias + energy_scale(int/float, optional): + In denormed domain. + energy_bias(int/float, optional): + In denormed domain. + robot(bool) (Default value = False): + + spk_emb(Default value = None): + + spk_id(Default value = None): + Returns: Tensor: logmel @@ -949,8 +1043,10 @@ class FastSpeech2Loss(nn.Layer): use_weighted_masking: bool=False): """Initialize feed-forward Transformer loss module. Args: - use_masking (bool): Whether to apply masking for padded part in loss calculation. - use_weighted_masking (bool): Whether to weighted masking in loss calculation. + use_masking (bool): + Whether to apply masking for padded part in loss calculation. + use_weighted_masking (bool): + Whether to weighted masking in loss calculation. """ assert check_argument_types() super().__init__() @@ -982,17 +1078,28 @@ class FastSpeech2Loss(nn.Layer): """Calculate forward propagation. Args: - after_outs(Tensor): Batch of outputs after postnets (B, Lmax, odim). - before_outs(Tensor): Batch of outputs before postnets (B, Lmax, odim). - d_outs(Tensor): Batch of outputs of duration predictor (B, Tmax). - p_outs(Tensor): Batch of outputs of pitch predictor (B, Tmax, 1). - e_outs(Tensor): Batch of outputs of energy predictor (B, Tmax, 1). - ys(Tensor): Batch of target features (B, Lmax, odim). - ds(Tensor): Batch of durations (B, Tmax). - ps(Tensor): Batch of target token-averaged pitch (B, Tmax, 1). - es(Tensor): Batch of target token-averaged energy (B, Tmax, 1). - ilens(Tensor): Batch of the lengths of each input (B,). - olens(Tensor): Batch of the lengths of each target (B,). + after_outs(Tensor): + Batch of outputs after postnets (B, Lmax, odim). + before_outs(Tensor): + Batch of outputs before postnets (B, Lmax, odim). + d_outs(Tensor): + Batch of outputs of duration predictor (B, Tmax). 
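Both the self-attention source mask sketched in the example above (ilens = [5, 3]) and the use_masking option of the loss reduce to a boolean padding mask built from sequence lengths. A small self-contained sketch, not the library's helper, of building such a mask and using it for a masked L1 loss:

```python
import paddle

def length_to_mask(ilens, max_len=None):
    """Padding mask: True on real positions, False on padded ones (sketch, not the library helper)."""
    if max_len is None:
        max_len = max(ilens)
    rows = [[True] * n + [False] * (max_len - n) for n in ilens]
    return paddle.to_tensor(rows)                  # (B, Tmax), dtype=paddle.bool

ilens = [5, 3]                                     # the lengths from the example above
mask = length_to_mask(ilens)                       # mask[1, 3:] is False

# Masked L1 over a padded batch, the idea behind use_masking=True.
ys = paddle.randn([2, 5, 80])                      # padded targets      (B, Lmax, odim)
outs = paddle.randn([2, 5, 80])                    # padded predictions  (B, Lmax, odim)
w = mask.astype('float32').unsqueeze(-1)           # (B, Lmax, 1)
l1 = (paddle.abs(outs - ys) * w).sum() / (w.sum() * ys.shape[-1])
```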
+ p_outs(Tensor): + Batch of outputs of pitch predictor (B, Tmax, 1). + e_outs(Tensor): + Batch of outputs of energy predictor (B, Tmax, 1). + ys(Tensor): + Batch of target features (B, Lmax, odim). + ds(Tensor): + Batch of durations (B, Tmax). + ps(Tensor): + Batch of target token-averaged pitch (B, Tmax, 1). + es(Tensor): + Batch of target token-averaged energy (B, Tmax, 1). + ilens(Tensor): + Batch of the lengths of each input (B,). + olens(Tensor): + Batch of the lengths of each target (B,). Returns: diff --git a/paddlespeech/t2s/models/hifigan/hifigan.py b/paddlespeech/t2s/models/hifigan/hifigan.py index bea9dd9a3e6232fc014e7edec20cd07d2e299db3..7a01840e278ee93370716351fe7055c56d67ad3f 100644 --- a/paddlespeech/t2s/models/hifigan/hifigan.py +++ b/paddlespeech/t2s/models/hifigan/hifigan.py @@ -50,20 +50,34 @@ class HiFiGANGenerator(nn.Layer): init_type: str="xavier_uniform", ): """Initialize HiFiGANGenerator module. Args: - in_channels (int): Number of input channels. - out_channels (int): Number of output channels. - channels (int): Number of hidden representation channels. - global_channels (int): Number of global conditioning channels. - kernel_size (int): Kernel size of initial and final conv layer. - upsample_scales (list): List of upsampling scales. - upsample_kernel_sizes (list): List of kernel sizes for upsampling layers. - resblock_kernel_sizes (list): List of kernel sizes for residual blocks. - resblock_dilations (list): List of dilation list for residual blocks. - use_additional_convs (bool): Whether to use additional conv layers in residual blocks. - bias (bool): Whether to add bias parameter in convolution layers. - nonlinear_activation (str): Activation function module name. - nonlinear_activation_params (dict): Hyperparameters for activation function. - use_weight_norm (bool): Whether to use weight norm. + in_channels (int): + Number of input channels. + out_channels (int): + Number of output channels. + channels (int): + Number of hidden representation channels. + global_channels (int): + Number of global conditioning channels. + kernel_size (int): + Kernel size of initial and final conv layer. + upsample_scales (list): + List of upsampling scales. + upsample_kernel_sizes (list): + List of kernel sizes for upsampling layers. + resblock_kernel_sizes (list): + List of kernel sizes for residual blocks. + resblock_dilations (list): + List of dilation list for residual blocks. + use_additional_convs (bool): + Whether to use additional conv layers in residual blocks. + bias (bool): + Whether to add bias parameter in convolution layers. + nonlinear_activation (str): + Activation function module name. + nonlinear_activation_params (dict): + Hyperparameters for activation function. + use_weight_norm (bool): + Whether to use weight norm. If set to true, it will be applied to all of the conv layers. """ super().__init__() @@ -199,9 +213,10 @@ class HiFiGANGenerator(nn.Layer): def inference(self, c, g: Optional[paddle.Tensor]=None): """Perform inference. Args: - c (Tensor): Input tensor (T, in_channels). - normalize_before (bool): Whether to perform normalization. - g (Optional[Tensor]): Global conditioning tensor (global_channels, 1). + c (Tensor): + Input tensor (T, in_channels). + g (Optional[Tensor]): + Global conditioning tensor (global_channels, 1). Returns: Tensor: Output tensor (T ** prod(upsample_scales), out_channels). @@ -233,20 +248,33 @@ class HiFiGANPeriodDiscriminator(nn.Layer): """Initialize HiFiGANPeriodDiscriminator module. 
Args: - in_channels (int): Number of input channels. - out_channels (int): Number of output channels. - period (int): Period. - kernel_sizes (list): Kernel sizes of initial conv layers and the final conv layer. - channels (int): Number of initial channels. - downsample_scales (list): List of downsampling scales. - max_downsample_channels (int): Number of maximum downsampling channels. - use_additional_convs (bool): Whether to use additional conv layers in residual blocks. - bias (bool): Whether to add bias parameter in convolution layers. - nonlinear_activation (str): Activation function module name. - nonlinear_activation_params (dict): Hyperparameters for activation function. - use_weight_norm (bool): Whether to use weight norm. + in_channels (int): + Number of input channels. + out_channels (int): + Number of output channels. + period (int): + Period. + kernel_sizes (list): + Kernel sizes of initial conv layers and the final conv layer. + channels (int): + Number of initial channels. + downsample_scales (list): + List of downsampling scales. + max_downsample_channels (int): + Number of maximum downsampling channels. + use_additional_convs (bool): + Whether to use additional conv layers in residual blocks. + bias (bool): + Whether to add bias parameter in convolution layers. + nonlinear_activation (str): + Activation function module name. + nonlinear_activation_params (dict): + Hyperparameters for activation function. + use_weight_norm (bool): + Whether to use weight norm. If set to true, it will be applied to all of the conv layers. - use_spectral_norm (bool): Whether to use spectral norm. + use_spectral_norm (bool): + Whether to use spectral norm. If set to true, it will be applied to all of the conv layers. """ super().__init__() @@ -298,7 +326,8 @@ class HiFiGANPeriodDiscriminator(nn.Layer): """Calculate forward propagation. Args: - c (Tensor): Input tensor (B, in_channels, T). + c (Tensor): + Input tensor (B, in_channels, T). Returns: list: List of each layer's tensors. """ @@ -367,8 +396,10 @@ class HiFiGANMultiPeriodDiscriminator(nn.Layer): """Initialize HiFiGANMultiPeriodDiscriminator module. Args: - periods (list): List of periods. - discriminator_params (dict): Parameters for hifi-gan period discriminator module. + periods (list): + List of periods. + discriminator_params (dict): + Parameters for hifi-gan period discriminator module. The period parameter will be overwritten. """ super().__init__() @@ -385,7 +416,8 @@ class HiFiGANMultiPeriodDiscriminator(nn.Layer): """Calculate forward propagation. Args: - x (Tensor): Input noise signal (B, 1, T). + x (Tensor): + Input noise signal (B, 1, T). Returns: List: List of list of each discriminator outputs, which consists of each layer output tensors. """ @@ -417,16 +449,25 @@ class HiFiGANScaleDiscriminator(nn.Layer): """Initilize HiFiGAN scale discriminator module. Args: - in_channels (int): Number of input channels. - out_channels (int): Number of output channels. - kernel_sizes (list): List of four kernel sizes. The first will be used for the first conv layer, + in_channels (int): + Number of input channels. + out_channels (int): + Number of output channels. + kernel_sizes (list): + List of four kernel sizes. The first will be used for the first conv layer, and the second is for downsampling part, and the remaining two are for output layers. - channels (int): Initial number of channels for conv layer. - max_downsample_channels (int): Maximum number of channels for downsampling layers. 
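A hedged usage sketch for the generator's inference() documented above. The module path is taken from this diff; the assumption that the default configuration maps an 80-dimensional mel input to a single waveform channel should be checked against the installed version:

```python
import paddle
from paddlespeech.t2s.models.hifigan.hifigan import HiFiGANGenerator  # module path taken from this diff

# Assumes the default configuration accepts an 80-dim mel input and emits a
# 1-channel waveform; check the installed defaults before relying on this.
generator = HiFiGANGenerator()
generator.eval()

mel = paddle.randn([100, 80])              # (T, in_channels), as documented for inference()
with paddle.no_grad():
    wav = generator.inference(mel)         # (T * total upsampling factor, out_channels)
print(wav.shape)
```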
- bias (bool): Whether to add bias parameter in convolution layers. - downsample_scales (list): List of downsampling scales. - nonlinear_activation (str): Activation function module name. - nonlinear_activation_params (dict): Hyperparameters for activation function. + channels (int): + Initial number of channels for conv layer. + max_downsample_channels (int): + Maximum number of channels for downsampling layers. + bias (bool): + Whether to add bias parameter in convolution layers. + downsample_scales (list): + List of downsampling scales. + nonlinear_activation (str): + Activation function module name. + nonlinear_activation_params (dict): + Hyperparameters for activation function. use_weight_norm (bool): Whether to use weight norm. If set to true, it will be applied to all of the conv layers. use_spectral_norm (bool): Whether to use spectral norm. @@ -614,7 +655,8 @@ class HiFiGANMultiScaleDiscriminator(nn.Layer): """Calculate forward propagation. Args: - x (Tensor): Input noise signal (B, 1, T). + x (Tensor): + Input noise signal (B, 1, T). Returns: List: List of list of each discriminator outputs, which consists of each layer output tensors. """ @@ -675,14 +717,21 @@ class HiFiGANMultiScaleMultiPeriodDiscriminator(nn.Layer): """Initilize HiFiGAN multi-scale + multi-period discriminator module. Args: - scales (int): Number of multi-scales. - scale_downsample_pooling (str): Pooling module name for downsampling of the inputs. - scale_downsample_pooling_params (dict): Parameters for the above pooling module. - scale_discriminator_params (dict): Parameters for hifi-gan scale discriminator module. - follow_official_norm (bool): Whether to follow the norm setting of the official implementaion. + scales (int): + Number of multi-scales. + scale_downsample_pooling (str): + Pooling module name for downsampling of the inputs. + scale_downsample_pooling_params (dict): + Parameters for the above pooling module. + scale_discriminator_params (dict): + Parameters for hifi-gan scale discriminator module. + follow_official_norm (bool): + Whether to follow the norm setting of the official implementaion. The first discriminator uses spectral norm and the other discriminators use weight norm. - periods (list): List of periods. - period_discriminator_params (dict): Parameters for hifi-gan period discriminator module. + periods (list): + List of periods. + period_discriminator_params (dict): + Parameters for hifi-gan period discriminator module. The period parameter will be overwritten. """ super().__init__() @@ -704,7 +753,8 @@ class HiFiGANMultiScaleMultiPeriodDiscriminator(nn.Layer): """Calculate forward propagation. Args: - x (Tensor): Input noise signal (B, 1, T). + x (Tensor): + Input noise signal (B, 1, T). Returns: List: List of list of each discriminator outputs, diff --git a/paddlespeech/t2s/models/melgan/melgan.py b/paddlespeech/t2s/models/melgan/melgan.py index 22d8fd9e764c5c7f3c71ca1e2d17acc641a029cd..058cf40d9c25199bc7da9bdbdca8ca9c2c386673 100644 --- a/paddlespeech/t2s/models/melgan/melgan.py +++ b/paddlespeech/t2s/models/melgan/melgan.py @@ -53,24 +53,38 @@ class MelGANGenerator(nn.Layer): """Initialize MelGANGenerator module. Args: - in_channels (int): Number of input channels. - out_channels (int): Number of output channels, + in_channels (int): + Number of input channels. + out_channels (int): + Number of output channels, the number of sub-band is out_channels in multi-band melgan. - kernel_size (int): Kernel size of initial and final conv layer. 
- channels (int): Initial number of channels for conv layer. - bias (bool): Whether to add bias parameter in convolution layers. - upsample_scales (List[int]): List of upsampling scales. - stack_kernel_size (int): Kernel size of dilated conv layers in residual stack. - stacks (int): Number of stacks in a single residual stack. - nonlinear_activation (Optional[str], optional): Non linear activation in upsample network, by default None - nonlinear_activation_params (Dict[str, Any], optional): Parameters passed to the linear activation in the upsample network, - by default {} - pad (str): Padding function module name before dilated convolution layer. - pad_params (dict): Hyperparameters for padding function. - use_final_nonlinear_activation (nn.Layer): Activation function for the final layer. - use_weight_norm (bool): Whether to use weight norm. + kernel_size (int): + Kernel size of initial and final conv layer. + channels (int): + Initial number of channels for conv layer. + bias (bool): + Whether to add bias parameter in convolution layers. + upsample_scales (List[int]): + List of upsampling scales. + stack_kernel_size (int): + Kernel size of dilated conv layers in residual stack. + stacks (int): + Number of stacks in a single residual stack. + nonlinear_activation (Optional[str], optional): + Non linear activation in upsample network, by default None + nonlinear_activation_params (Dict[str, Any], optional): + Parameters passed to the linear activation in the upsample network, by default {} + pad (str): + Padding function module name before dilated convolution layer. + pad_params (dict): + Hyperparameters for padding function. + use_final_nonlinear_activation (nn.Layer): + Activation function for the final layer. + use_weight_norm (bool): + Whether to use weight norm. If set to true, it will be applied to all of the conv layers. - use_causal_conv (bool): Whether to use causal convolution. + use_causal_conv (bool): + Whether to use causal convolution. """ super().__init__() @@ -194,7 +208,8 @@ class MelGANGenerator(nn.Layer): """Calculate forward propagation. Args: - c (Tensor): Input tensor (B, in_channels, T). + c (Tensor): + Input tensor (B, in_channels, T). Returns: Tensor: Output tensor (B, out_channels, T ** prod(upsample_scales)). """ @@ -244,7 +259,8 @@ class MelGANGenerator(nn.Layer): """Perform inference. Args: - c (Union[Tensor, ndarray]): Input tensor (T, in_channels). + c (Union[Tensor, ndarray]): + Input tensor (T, in_channels). Returns: Tensor: Output tensor (out_channels*T ** prod(upsample_scales), 1). """ @@ -279,20 +295,30 @@ class MelGANDiscriminator(nn.Layer): """Initilize MelGAN discriminator module. Args: - in_channels (int): Number of input channels. - out_channels (int): Number of output channels. + in_channels (int): + Number of input channels. + out_channels (int): + Number of output channels. kernel_sizes (List[int]): List of two kernel sizes. The prod will be used for the first conv layer, and the first and the second kernel sizes will be used for the last two layers. For example if kernel_sizes = [5, 3], the first layer kernel size will be 5 * 3 = 15, the last two layers' kernel size will be 5 and 3, respectively. - channels (int): Initial number of channels for conv layer. - max_downsample_channels (int): Maximum number of channels for downsampling layers. - bias (bool): Whether to add bias parameter in convolution layers. - downsample_scales (List[int]): List of downsampling scales. - nonlinear_activation (str): Activation function module name. 
- nonlinear_activation_params (dict): Hyperparameters for activation function. - pad (str): Padding function module name before dilated convolution layer. - pad_params (dict): Hyperparameters for padding function. + channels (int): + Initial number of channels for conv layer. + max_downsample_channels (int): + Maximum number of channels for downsampling layers. + bias (bool): + Whether to add bias parameter in convolution layers. + downsample_scales (List[int]): + List of downsampling scales. + nonlinear_activation (str): + Activation function module name. + nonlinear_activation_params (dict): + Hyperparameters for activation function. + pad (str): + Padding function module name before dilated convolution layer. + pad_params (dict): + Hyperparameters for padding function. """ super().__init__() @@ -364,7 +390,8 @@ class MelGANDiscriminator(nn.Layer): def forward(self, x): """Calculate forward propagation. Args: - x (Tensor): Input noise signal (B, 1, T). + x (Tensor): + Input noise signal (B, 1, T). Returns: List: List of output tensors of each layer (for feat_match_loss). """ @@ -406,22 +433,37 @@ class MelGANMultiScaleDiscriminator(nn.Layer): """Initilize MelGAN multi-scale discriminator module. Args: - in_channels (int): Number of input channels. - out_channels (int): Number of output channels. - scales (int): Number of multi-scales. - downsample_pooling (str): Pooling module name for downsampling of the inputs. - downsample_pooling_params (dict): Parameters for the above pooling module. - kernel_sizes (List[int]): List of two kernel sizes. The sum will be used for the first conv layer, + in_channels (int): + Number of input channels. + out_channels (int): + Number of output channels. + scales (int): + Number of multi-scales. + downsample_pooling (str): + Pooling module name for downsampling of the inputs. + downsample_pooling_params (dict): + Parameters for the above pooling module. + kernel_sizes (List[int]): + List of two kernel sizes. The sum will be used for the first conv layer, and the first and the second kernel sizes will be used for the last two layers. - channels (int): Initial number of channels for conv layer. - max_downsample_channels (int): Maximum number of channels for downsampling layers. - bias (bool): Whether to add bias parameter in convolution layers. - downsample_scales (List[int]): List of downsampling scales. - nonlinear_activation (str): Activation function module name. - nonlinear_activation_params (dict): Hyperparameters for activation function. - pad (str): Padding function module name before dilated convolution layer. - pad_params (dict): Hyperparameters for padding function. - use_causal_conv (bool): Whether to use causal convolution. + channels (int): + Initial number of channels for conv layer. + max_downsample_channels (int): + Maximum number of channels for downsampling layers. + bias (bool): + Whether to add bias parameter in convolution layers. + downsample_scales (List[int]): + List of downsampling scales. + nonlinear_activation (str): + Activation function module name. + nonlinear_activation_params (dict): + Hyperparameters for activation function. + pad (str): + Padding function module name before dilated convolution layer. + pad_params (dict): + Hyperparameters for padding function. + use_causal_conv (bool): + Whether to use causal convolution. """ super().__init__() @@ -464,7 +506,8 @@ class MelGANMultiScaleDiscriminator(nn.Layer): def forward(self, x): """Calculate forward propagation. Args: - x (Tensor): Input noise signal (B, 1, T). 
+ x (Tensor): + Input noise signal (B, 1, T). Returns: List: List of list of each discriminator outputs, which consists of each layer output tensors. """ diff --git a/paddlespeech/t2s/models/melgan/style_melgan.py b/paddlespeech/t2s/models/melgan/style_melgan.py index 40a2f10096680b0dc0420c54ad0373d7f80f1912..d902a4b014ed372f81100930afc2514abe070744 100644 --- a/paddlespeech/t2s/models/melgan/style_melgan.py +++ b/paddlespeech/t2s/models/melgan/style_melgan.py @@ -54,20 +54,34 @@ class StyleMelGANGenerator(nn.Layer): """Initilize Style MelGAN generator. Args: - in_channels (int): Number of input noise channels. - aux_channels (int): Number of auxiliary input channels. - channels (int): Number of channels for conv layer. - out_channels (int): Number of output channels. - kernel_size (int): Kernel size of conv layers. - dilation (int): Dilation factor for conv layers. - bias (bool): Whether to add bias parameter in convolution layers. - noise_upsample_scales (list): List of noise upsampling scales. - noise_upsample_activation (str): Activation function module name for noise upsampling. - noise_upsample_activation_params (dict): Hyperparameters for the above activation function. - upsample_scales (list): List of upsampling scales. - upsample_mode (str): Upsampling mode in TADE layer. - gated_function (str): Gated function in TADEResBlock ("softmax" or "sigmoid"). - use_weight_norm (bool): Whether to use weight norm. + in_channels (int): + Number of input noise channels. + aux_channels (int): + Number of auxiliary input channels. + channels (int): + Number of channels for conv layer. + out_channels (int): + Number of output channels. + kernel_size (int): + Kernel size of conv layers. + dilation (int): + Dilation factor for conv layers. + bias (bool): + Whether to add bias parameter in convolution layers. + noise_upsample_scales (list): + List of noise upsampling scales. + noise_upsample_activation (str): + Activation function module name for noise upsampling. + noise_upsample_activation_params (dict): + Hyperparameters for the above activation function. + upsample_scales (list): + List of upsampling scales. + upsample_mode (str): + Upsampling mode in TADE layer. + gated_function (str): + Gated function in TADEResBlock ("softmax" or "sigmoid"). + use_weight_norm (bool): + Whether to use weight norm. If set to true, it will be applied to all of the conv layers. """ super().__init__() @@ -194,7 +208,8 @@ class StyleMelGANGenerator(nn.Layer): def inference(self, c): """Perform inference. Args: - c (Tensor): Input tensor (T, in_channels). + c (Tensor): + Input tensor (T, in_channels). Returns: Tensor: Output tensor (T ** prod(upsample_scales), out_channels). """ @@ -258,11 +273,16 @@ class StyleMelGANDiscriminator(nn.Layer): """Initilize Style MelGAN discriminator. Args: - repeats (int): Number of repititons to apply RWD. - window_sizes (list): List of random window sizes. - pqmf_params (list): List of list of Parameters for PQMF modules - discriminator_params (dict): Parameters for base discriminator module. - use_weight_nom (bool): Whether to apply weight normalization. + repeats (int): + Number of repititons to apply RWD. + window_sizes (list): + List of random window sizes. + pqmf_params (list): + List of list of Parameters for PQMF modules + discriminator_params (dict): + Parameters for base discriminator module. + use_weight_nom (bool): + Whether to apply weight normalization. 
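The multi-scale and multi-period discriminators above all return a list of per-discriminator lists of layer outputs. A common use of that nested structure is a feature-matching loss; the sketch below is generic and is not the repository's loss class:

```python
import paddle

def feature_matching_loss(real_feats, fake_feats):
    """L1 distance between corresponding intermediate feature maps.

    Both arguments are lists of lists of tensors, i.e. the structure returned
    by the multi-scale / multi-period discriminators' forward() above.
    Sketch only; the repository ships its own loss implementations.
    """
    loss, n = 0.0, 0
    for feats_r, feats_f in zip(real_feats, fake_feats):
        for r, f in zip(feats_r, feats_f):
            loss = loss + paddle.mean(paddle.abs(f - r.detach()))
            n += 1
    return loss / max(n, 1)
```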
""" super().__init__() @@ -299,7 +319,8 @@ class StyleMelGANDiscriminator(nn.Layer): def forward(self, x): """Calculate forward propagation. Args: - x (Tensor): Input tensor (B, 1, T). + x (Tensor): + Input tensor (B, 1, T). Returns: List: List of discriminator outputs, #items in the list will be equal to repeats * #discriminators. diff --git a/paddlespeech/t2s/models/parallel_wavegan/parallel_wavegan.py b/paddlespeech/t2s/models/parallel_wavegan/parallel_wavegan.py index cc8460e4d7131331e66d55e5119942c531923409..be306d9ccf036b5ade08aeb17f1511258d5e758f 100644 --- a/paddlespeech/t2s/models/parallel_wavegan/parallel_wavegan.py +++ b/paddlespeech/t2s/models/parallel_wavegan/parallel_wavegan.py @@ -32,29 +32,45 @@ class PWGGenerator(nn.Layer): """Wave Generator for Parallel WaveGAN Args: - in_channels (int, optional): Number of channels of the input waveform, by default 1 - out_channels (int, optional): Number of channels of the output waveform, by default 1 - kernel_size (int, optional): Kernel size of the residual blocks inside, by default 3 - layers (int, optional): Number of residual blocks inside, by default 30 - stacks (int, optional): The number of groups to split the residual blocks into, by default 3 + in_channels (int, optional): + Number of channels of the input waveform, by default 1 + out_channels (int, optional): + Number of channels of the output waveform, by default 1 + kernel_size (int, optional): + Kernel size of the residual blocks inside, by default 3 + layers (int, optional): + Number of residual blocks inside, by default 30 + stacks (int, optional): + The number of groups to split the residual blocks into, by default 3 Within each group, the dilation of the residual block grows exponentially. - residual_channels (int, optional): Residual channel of the residual blocks, by default 64 - gate_channels (int, optional): Gate channel of the residual blocks, by default 128 - skip_channels (int, optional): Skip channel of the residual blocks, by default 64 - aux_channels (int, optional): Auxiliary channel of the residual blocks, by default 80 - aux_context_window (int, optional): The context window size of the first convolution applied to the - auxiliary input, by default 2 - dropout (float, optional): Dropout of the residual blocks, by default 0. 
- bias (bool, optional): Whether to use bias in residual blocks, by default True - use_weight_norm (bool, optional): Whether to use weight norm in all convolutions, by default True - use_causal_conv (bool, optional): Whether to use causal padding in the upsample network and residual - blocks, by default False - upsample_scales (List[int], optional): Upsample scales of the upsample network, by default [4, 4, 4, 4] - nonlinear_activation (Optional[str], optional): Non linear activation in upsample network, by default None - nonlinear_activation_params (Dict[str, Any], optional): Parameters passed to the linear activation in the upsample network, - by default {} - interpolate_mode (str, optional): Interpolation mode of the upsample network, by default "nearest" - freq_axis_kernel_size (int, optional): Kernel size along the frequency axis of the upsample network, by default 1 + residual_channels (int, optional): + Residual channel of the residual blocks, by default 64 + gate_channels (int, optional): + Gate channel of the residual blocks, by default 128 + skip_channels (int, optional): + Skip channel of the residual blocks, by default 64 + aux_channels (int, optional): + Auxiliary channel of the residual blocks, by default 80 + aux_context_window (int, optional): + The context window size of the first convolution applied to the auxiliary input, by default 2 + dropout (float, optional): + Dropout of the residual blocks, by default 0. + bias (bool, optional): + Whether to use bias in residual blocks, by default True + use_weight_norm (bool, optional): + Whether to use weight norm in all convolutions, by default True + use_causal_conv (bool, optional): + Whether to use causal padding in the upsample network and residual blocks, by default False + upsample_scales (List[int], optional): + Upsample scales of the upsample network, by default [4, 4, 4, 4] + nonlinear_activation (Optional[str], optional): + Non linear activation in upsample network, by default None + nonlinear_activation_params (Dict[str, Any], optional): + Parameters passed to the linear activation in the upsample network, by default {} + interpolate_mode (str, optional): + Interpolation mode of the upsample network, by default "nearest" + freq_axis_kernel_size (int, optional): + Kernel size along the frequency axis of the upsample network, by default 1 """ def __init__( @@ -147,9 +163,11 @@ class PWGGenerator(nn.Layer): """Generate waveform. Args: - x(Tensor): Shape (N, C_in, T), The input waveform. - c(Tensor): Shape (N, C_aux, T'). The auxiliary input (e.g. spectrogram). It - is upsampled to match the time resolution of the input. + x(Tensor): + Shape (N, C_in, T), The input waveform. + c(Tensor): + Shape (N, C_aux, T'). The auxiliary input (e.g. spectrogram). + It is upsampled to match the time resolution of the input. Returns: Tensor: Shape (N, C_out, T), the generated waveform. @@ -195,8 +213,10 @@ class PWGGenerator(nn.Layer): """Waveform generation. This function is used for single instance inference. Args: - c(Tensor, optional, optional): Shape (T', C_aux), the auxiliary input, by default None - x(Tensor, optional): Shape (T, C_in), the noise waveform, by default None + c(Tensor, optional, optional): + Shape (T', C_aux), the auxiliary input, by default None + x(Tensor, optional): + Shape (T, C_in), the noise waveform, by default None Returns: Tensor: Shape (T, C_out), the generated waveform @@ -214,20 +234,28 @@ class PWGDiscriminator(nn.Layer): """A convolutional discriminator for audio. 
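A hedged sketch of single-instance vocoding with PWGGenerator.inference() as documented above, assuming the defaults listed here (aux_channels=80, upsample_scales=[4, 4, 4, 4], i.e. one auxiliary frame per 256 samples); the module path is taken from this diff:

```python
import paddle
from paddlespeech.t2s.models.parallel_wavegan.parallel_wavegan import PWGGenerator  # path taken from this diff

# With the defaults documented above (aux_channels=80, upsample_scales=[4, 4, 4, 4])
# each auxiliary frame is expanded to 4 * 4 * 4 * 4 = 256 samples.
generator = PWGGenerator()
generator.eval()

mel = paddle.randn([50, 80])               # (T', C_aux) auxiliary features
with paddle.no_grad():
    wav = generator.inference(c=mel)       # (T, C_out) generated waveform
print(wav.shape)                           # roughly [50 * 256, 1] with these defaults
```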
Args: - in_channels (int, optional): Number of channels of the input audio, by default 1 - out_channels (int, optional): Output feature size, by default 1 - kernel_size (int, optional): Kernel size of convolutional sublayers, by default 3 - layers (int, optional): Number of layers, by default 10 - conv_channels (int, optional): Feature size of the convolutional sublayers, by default 64 - dilation_factor (int, optional): The factor with which dilation of each convolutional sublayers grows + in_channels (int, optional): + Number of channels of the input audio, by default 1 + out_channels (int, optional): + Output feature size, by default 1 + kernel_size (int, optional): + Kernel size of convolutional sublayers, by default 3 + layers (int, optional): + Number of layers, by default 10 + conv_channels (int, optional): + Feature size of the convolutional sublayers, by default 64 + dilation_factor (int, optional): + The factor with which dilation of each convolutional sublayers grows exponentially if it is greater than 1, else the dilation of each convolutional sublayers grows linearly, by default 1 - nonlinear_activation (str, optional): The activation after each convolutional sublayer, by default "leakyrelu" - nonlinear_activation_params (Dict[str, Any], optional): The parameters passed to the activation's initializer, by default - {"negative_slope": 0.2} - bias (bool, optional): Whether to use bias in convolutional sublayers, by default True - use_weight_norm (bool, optional): Whether to use weight normalization at all convolutional sublayers, - by default True + nonlinear_activation (str, optional): + The activation after each convolutional sublayer, by default "leakyrelu" + nonlinear_activation_params (Dict[str, Any], optional): + The parameters passed to the activation's initializer, by default {"negative_slope": 0.2} + bias (bool, optional): + Whether to use bias in convolutional sublayers, by default True + use_weight_norm (bool, optional): + Whether to use weight normalization at all convolutional sublayers, by default True """ def __init__( @@ -290,7 +318,8 @@ class PWGDiscriminator(nn.Layer): """ Args: - x (Tensor): Shape (N, in_channels, num_samples), the input audio. + x (Tensor): + Shape (N, in_channels, num_samples), the input audio. Returns: Tensor: Shape (N, out_channels, num_samples), the predicted logits. @@ -318,24 +347,35 @@ class ResidualPWGDiscriminator(nn.Layer): """A wavenet-style discriminator for audio. 
Args: - in_channels (int, optional): Number of channels of the input audio, by default 1 - out_channels (int, optional): Output feature size, by default 1 - kernel_size (int, optional): Kernel size of residual blocks, by default 3 - layers (int, optional): Number of residual blocks, by default 30 - stacks (int, optional): Number of groups of residual blocks, within which the dilation + in_channels (int, optional): + Number of channels of the input audio, by default 1 + out_channels (int, optional): + Output feature size, by default 1 + kernel_size (int, optional): + Kernel size of residual blocks, by default 3 + layers (int, optional): + Number of residual blocks, by default 30 + stacks (int, optional): + Number of groups of residual blocks, within which the dilation of each residual blocks grows exponentially, by default 3 - residual_channels (int, optional): Residual channels of residual blocks, by default 64 - gate_channels (int, optional): Gate channels of residual blocks, by default 128 - skip_channels (int, optional): Skip channels of residual blocks, by default 64 - dropout (float, optional): Dropout probability of residual blocks, by default 0. - bias (bool, optional): Whether to use bias in residual blocks, by default True - use_weight_norm (bool, optional): Whether to use weight normalization in all convolutional layers, - by default True - use_causal_conv (bool, optional): Whether to use causal convolution in residual blocks, by default False - nonlinear_activation (str, optional): Activation after convolutions other than those in residual blocks, - by default "leakyrelu" - nonlinear_activation_params (Dict[str, Any], optional): Parameters to pass to the activation, - by default {"negative_slope": 0.2} + residual_channels (int, optional): + Residual channels of residual blocks, by default 64 + gate_channels (int, optional): + Gate channels of residual blocks, by default 128 + skip_channels (int, optional): + Skip channels of residual blocks, by default 64 + dropout (float, optional): + Dropout probability of residual blocks, by default 0. + bias (bool, optional): + Whether to use bias in residual blocks, by default True + use_weight_norm (bool, optional): + Whether to use weight normalization in all convolutional layers, by default True + use_causal_conv (bool, optional): + Whether to use causal convolution in residual blocks, by default False + nonlinear_activation (str, optional): + Activation after convolutions other than those in residual blocks, by default "leakyrelu" + nonlinear_activation_params (Dict[str, Any], optional): + Parameters to pass to the activation, by default {"negative_slope": 0.2} """ def __init__( @@ -405,7 +445,8 @@ class ResidualPWGDiscriminator(nn.Layer): def forward(self, x): """ Args: - x(Tensor): Shape (N, in_channels, num_samples), the input audio.↩ + x(Tensor): + Shape (N, in_channels, num_samples), the input audio.↩ Returns: Tensor: Shape (N, out_channels, num_samples), the predicted logits. diff --git a/paddlespeech/t2s/models/speedyspeech/speedyspeech.py b/paddlespeech/t2s/models/speedyspeech/speedyspeech.py index ed7c0b7e46733ef851d9f001aa463e5ea9c224ad..395ad69174d16886de9ea8a1c93a58f1edde577f 100644 --- a/paddlespeech/t2s/models/speedyspeech/speedyspeech.py +++ b/paddlespeech/t2s/models/speedyspeech/speedyspeech.py @@ -29,10 +29,14 @@ class ResidualBlock(nn.Layer): n: int=2): """SpeedySpeech encoder module. Args: - channels (int, optional): Feature size of the residual output(and also the input). 
- kernel_size (int, optional): Kernel size of the 1D convolution. - dilation (int, optional): Dilation of the 1D convolution. - n (int): Number of blocks. + channels (int, optional): + Feature size of the residual output(and also the input). + kernel_size (int, optional): + Kernel size of the 1D convolution. + dilation (int, optional): + Dilation of the 1D convolution. + n (int): + Number of blocks. """ super().__init__() @@ -57,7 +61,8 @@ class ResidualBlock(nn.Layer): def forward(self, x: paddle.Tensor): """Calculate forward propagation. Args: - x(Tensor): Batch of input sequences (B, hidden_size, Tmax). + x(Tensor): + Batch of input sequences (B, hidden_size, Tmax). Returns: Tensor: The residual output (B, hidden_size, Tmax). """ @@ -89,8 +94,10 @@ class TextEmbedding(nn.Layer): def forward(self, text: paddle.Tensor, tone: paddle.Tensor=None): """Calculate forward propagation. Args: - text(Tensor(int64)): Batch of padded token ids (B, Tmax). - tones(Tensor, optional(int64)): Batch of padded tone ids (B, Tmax). + text(Tensor(int64)): + Batch of padded token ids (B, Tmax). + tones(Tensor, optional(int64)): + Batch of padded tone ids (B, Tmax). Returns: Tensor: The residual output (B, Tmax, embedding_size). """ @@ -109,12 +116,18 @@ class TextEmbedding(nn.Layer): class SpeedySpeechEncoder(nn.Layer): """SpeedySpeech encoder module. Args: - vocab_size (int): Dimension of the inputs. - tone_size (Optional[int]): Number of tones. - hidden_size (int): Number of encoder hidden units. - kernel_size (int): Kernel size of encoder. - dilations (List[int]): Dilations of encoder. - spk_num (Optional[int]): Number of speakers. + vocab_size (int): + Dimension of the inputs. + tone_size (Optional[int]): + Number of tones. + hidden_size (int): + Number of encoder hidden units. + kernel_size (int): + Kernel size of encoder. + dilations (List[int]): + Dilations of encoder. + spk_num (Optional[int]): + Number of speakers. """ def __init__(self, @@ -161,9 +174,12 @@ class SpeedySpeechEncoder(nn.Layer): spk_id: paddle.Tensor=None): """Encoder input sequence. Args: - text(Tensor(int64)): Batch of padded token ids (B, Tmax). - tones(Tensor, optional(int64)): Batch of padded tone ids (B, Tmax). - spk_id(Tnesor, optional(int64)): Batch of speaker ids (B,) + text(Tensor(int64)): + Batch of padded token ids (B, Tmax). + tones(Tensor, optional(int64)): + Batch of padded tone ids (B, Tmax). + spk_id(Tnesor, optional(int64)): + Batch of speaker ids (B,) Returns: Tensor: Output tensor (B, Tmax, hidden_size). @@ -192,7 +208,8 @@ class DurationPredictor(nn.Layer): def forward(self, x: paddle.Tensor): """Calculate forward propagation. Args: - x(Tensor): Batch of input sequences (B, Tmax, hidden_size). + x(Tensor): + Batch of input sequences (B, Tmax, hidden_size). Returns: Tensor: Batch of predicted durations in log domain (B, Tmax). @@ -212,10 +229,14 @@ class SpeedySpeechDecoder(nn.Layer): ]): """SpeedySpeech decoder module. Args: - hidden_size (int): Number of decoder hidden units. - kernel_size (int): Kernel size of decoder. - output_size (int): Dimension of the outputs. - dilations (List[int]): Dilations of decoder. + hidden_size (int): + Number of decoder hidden units. + kernel_size (int): + Kernel size of decoder. + output_size (int): + Dimension of the outputs. + dilations (List[int]): + Dilations of decoder. """ super().__init__() res_blocks = [ @@ -230,7 +251,8 @@ class SpeedySpeechDecoder(nn.Layer): def forward(self, x): """Decoder input sequence. Args: - x(Tensor): Input tensor (B, time, hidden_size). 
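The residual block described above keeps the (B, channels, Tmax) shape while growing the receptive field through dilation. A generic sketch of that idea, not the repository's implementation:

```python
import paddle
from paddle import nn

class DilatedResidualBlock(nn.Layer):
    """Sketch of a dilated 1-D conv residual block in the spirit of the one above.

    Keeps the (B, channels, Tmax) shape; not the repository's implementation.
    """

    def __init__(self, channels=128, kernel_size=3, dilation=2):
        super().__init__()
        pad = (kernel_size - 1) // 2 * dilation          # keep the time length unchanged
        self.conv = nn.Conv1D(channels, channels, kernel_size,
                              dilation=dilation, padding=pad)
        self.norm = nn.BatchNorm1D(channels)
        self.act = nn.ReLU()

    def forward(self, x):                                # x: (B, channels, Tmax)
        return x + self.act(self.norm(self.conv(x)))

block = DilatedResidualBlock()
y = block(paddle.randn([2, 128, 50]))
print(y.shape)                                           # [2, 128, 50]
```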
+ x(Tensor): + Input tensor (B, time, hidden_size). Returns: Tensor: Output tensor (B, time, output_size). @@ -261,18 +283,30 @@ class SpeedySpeech(nn.Layer): positional_dropout_rate: int=0.1): """Initialize SpeedySpeech module. Args: - vocab_size (int): Dimension of the inputs. - encoder_hidden_size (int): Number of encoder hidden units. - encoder_kernel_size (int): Kernel size of encoder. - encoder_dilations (List[int]): Dilations of encoder. - duration_predictor_hidden_size (int): Number of duration predictor hidden units. - decoder_hidden_size (int): Number of decoder hidden units. - decoder_kernel_size (int): Kernel size of decoder. - decoder_dilations (List[int]): Dilations of decoder. - decoder_output_size (int): Dimension of the outputs. - tone_size (Optional[int]): Number of tones. - spk_num (Optional[int]): Number of speakers. - init_type (str): How to initialize transformer parameters. + vocab_size (int): + Dimension of the inputs. + encoder_hidden_size (int): + Number of encoder hidden units. + encoder_kernel_size (int): + Kernel size of encoder. + encoder_dilations (List[int]): + Dilations of encoder. + duration_predictor_hidden_size (int): + Number of duration predictor hidden units. + decoder_hidden_size (int): + Number of decoder hidden units. + decoder_kernel_size (int): + Kernel size of decoder. + decoder_dilations (List[int]): + Dilations of decoder. + decoder_output_size (int): + Dimension of the outputs. + tone_size (Optional[int]): + Number of tones. + spk_num (Optional[int]): + Number of speakers. + init_type (str): + How to initialize transformer parameters. """ super().__init__() @@ -304,14 +338,20 @@ class SpeedySpeech(nn.Layer): spk_id: paddle.Tensor=None): """Calculate forward propagation. Args: - text(Tensor(int64)): Batch of padded token ids (B, Tmax). - durations(Tensor(int64)): Batch of padded durations (B, Tmax). - tones(Tensor, optional(int64)): Batch of padded tone ids (B, Tmax). - spk_id(Tnesor, optional(int64)): Batch of speaker ids (B,) + text(Tensor(int64)): + Batch of padded token ids (B, Tmax). + durations(Tensor(int64)): + Batch of padded durations (B, Tmax). + tones(Tensor, optional(int64)): + Batch of padded tone ids (B, Tmax). + spk_id(Tnesor, optional(int64)): + Batch of speaker ids (B,) Returns: - Tensor: Output tensor (B, T_frames, decoder_output_size). - Tensor: Predicted durations (B, Tmax). + Tensor: + Output tensor (B, T_frames, decoder_output_size). + Tensor: + Predicted durations (B, Tmax). """ # input of embedding must be int64 text = paddle.cast(text, 'int64') @@ -336,10 +376,14 @@ class SpeedySpeech(nn.Layer): spk_id: paddle.Tensor=None): """Generate the sequence of features given the sequences of characters. Args: - text(Tensor(int64)): Input sequence of characters (T,). - tones(Tensor, optional(int64)): Batch of padded tone ids (T, ). - durations(Tensor, optional (int64)): Groundtruth of duration (T,). - spk_id(Tensor, optional(int64), optional): spk ids (1,). (Default value = None) + text(Tensor(int64)): + Input sequence of characters (T,). + tones(Tensor, optional(int64)): + Batch of padded tone ids (T, ). + durations(Tensor, optional (int64)): + Groundtruth of duration (T,). + spk_id(Tensor, optional(int64), optional): + spk ids (1,). (Default value = None) Returns: Tensor: logmel (T, decoder_output_size). 
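The decoder consumes one hidden vector per output frame, so the per-token durations are what turn Tmax token states into T_frames = sum(durations) frame states. A small sketch of that length regulation step, written with basic paddle ops rather than the library's own length regulator:

```python
import paddle

def length_regulate(hs, durations):
    """Repeat each token's hidden state durations[t] times along the time axis.

    hs:        (Tmax, hidden_size) encoder outputs for one utterance
    durations: (Tmax,) integer frame counts per token
    Sketch of the idea only; the library has its own length regulator.
    """
    frames = []
    for t in range(hs.shape[0]):
        d = int(durations[t])
        if d > 0:
            frames.append(hs[t:t + 1].expand([d, hs.shape[1]]))
    return paddle.concat(frames, axis=0)       # (sum(durations), hidden_size)

hs = paddle.randn([4, 128])                    # 4 tokens
durations = paddle.to_tensor([1, 3, 2, 4], dtype='int64')
frames = length_regulate(hs, durations)
print(frames.shape)                            # [10, 128], i.e. [sum(durations), hidden_size]
```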
diff --git a/paddlespeech/t2s/models/tacotron2/tacotron2.py b/paddlespeech/t2s/models/tacotron2/tacotron2.py index 7b306e4820de10db9ae8551fffe62ab50d055905..25b5c932ae7d406ee01b38f6d88990dd586828ca 100644 --- a/paddlespeech/t2s/models/tacotron2/tacotron2.py +++ b/paddlespeech/t2s/models/tacotron2/tacotron2.py @@ -83,38 +83,67 @@ class Tacotron2(nn.Layer): init_type: str="xavier_uniform", ): """Initialize Tacotron2 module. Args: - idim (int): Dimension of the inputs. - odim (int): Dimension of the outputs. - embed_dim (int): Dimension of the token embedding. - elayers (int): Number of encoder blstm layers. - eunits (int): Number of encoder blstm units. - econv_layers (int): Number of encoder conv layers. - econv_filts (int): Number of encoder conv filter size. - econv_chans (int): Number of encoder conv filter channels. - dlayers (int): Number of decoder lstm layers. - dunits (int): Number of decoder lstm units. - prenet_layers (int): Number of prenet layers. - prenet_units (int): Number of prenet units. - postnet_layers (int): Number of postnet layers. - postnet_filts (int): Number of postnet filter size. - postnet_chans (int): Number of postnet filter channels. - output_activation (str): Name of activation function for outputs. - adim (int): Number of dimension of mlp in attention. - aconv_chans (int): Number of attention conv filter channels. - aconv_filts (int): Number of attention conv filter size. - cumulate_att_w (bool): Whether to cumulate previous attention weight. - use_batch_norm (bool): Whether to use batch normalization. - use_concate (bool): Whether to concat enc outputs w/ dec lstm outputs. - reduction_factor (int): Reduction factor. - spk_num (Optional[int]): Number of speakers. If set to > 1, assume that the + idim (int): + Dimension of the inputs. + odim (int): + Dimension of the outputs. + embed_dim (int): + Dimension of the token embedding. + elayers (int): + Number of encoder blstm layers. + eunits (int): + Number of encoder blstm units. + econv_layers (int): + Number of encoder conv layers. + econv_filts (int): + Number of encoder conv filter size. + econv_chans (int): + Number of encoder conv filter channels. + dlayers (int): + Number of decoder lstm layers. + dunits (int): + Number of decoder lstm units. + prenet_layers (int): + Number of prenet layers. + prenet_units (int): + Number of prenet units. + postnet_layers (int): + Number of postnet layers. + postnet_filts (int): + Number of postnet filter size. + postnet_chans (int): + Number of postnet filter channels. + output_activation (str): + Name of activation function for outputs. + adim (int): + Number of dimension of mlp in attention. + aconv_chans (int): + Number of attention conv filter channels. + aconv_filts (int): + Number of attention conv filter size. + cumulate_att_w (bool): + Whether to cumulate previous attention weight. + use_batch_norm (bool): + Whether to use batch normalization. + use_concate (bool): + Whether to concat enc outputs w/ dec lstm outputs. + reduction_factor (int): + Reduction factor. + spk_num (Optional[int]): + Number of speakers. If set to > 1, assume that the sids will be provided as the input and use sid embedding layer. - lang_num (Optional[int]): Number of languages. If set to > 1, assume that the + lang_num (Optional[int]): + Number of languages. If set to > 1, assume that the lids will be provided as the input and use sid embedding layer. - spk_embed_dim (Optional[int]): Speaker embedding dimension. If set to > 0, + spk_embed_dim (Optional[int]): + Speaker embedding dimension. 
If set to > 0, assume that spk_emb will be provided as the input. - spk_embed_integration_type (str): How to integrate speaker embedding. - dropout_rate (float): Dropout rate. - zoneout_rate (float): Zoneout rate. + spk_embed_integration_type (str): + How to integrate speaker embedding. + dropout_rate (float): + Dropout rate. + zoneout_rate (float): + Zoneout rate. """ assert check_argument_types() super().__init__() @@ -230,18 +259,28 @@ class Tacotron2(nn.Layer): """Calculate forward propagation. Args: - text (Tensor(int64)): Batch of padded character ids (B, T_text). - text_lengths (Tensor(int64)): Batch of lengths of each input batch (B,). - speech (Tensor): Batch of padded target features (B, T_feats, odim). - speech_lengths (Tensor(int64)): Batch of the lengths of each target (B,). - spk_emb (Optional[Tensor]): Batch of speaker embeddings (B, spk_embed_dim). - spk_id (Optional[Tensor]): Batch of speaker IDs (B, 1). - lang_id (Optional[Tensor]): Batch of language IDs (B, 1). + text (Tensor(int64)): + Batch of padded character ids (B, T_text). + text_lengths (Tensor(int64)): + Batch of lengths of each input batch (B,). + speech (Tensor): + Batch of padded target features (B, T_feats, odim). + speech_lengths (Tensor(int64)): + Batch of the lengths of each target (B,). + spk_emb (Optional[Tensor]): + Batch of speaker embeddings (B, spk_embed_dim). + spk_id (Optional[Tensor]): + Batch of speaker IDs (B, 1). + lang_id (Optional[Tensor]): + Batch of language IDs (B, 1). Returns: - Tensor: Loss scalar value. - Dict: Statistics to be monitored. - Tensor: Weight value if not joint training else model outputs. + Tensor: + Loss scalar value. + Dict: + Statistics to be monitored. + Tensor: + Weight value if not joint training else model outputs. """ text = text[:, :text_lengths.max()] @@ -329,18 +368,30 @@ class Tacotron2(nn.Layer): """Generate the sequence of features given the sequences of characters. Args: - text (Tensor(int64)): Input sequence of characters (T_text,). - speech (Optional[Tensor]): Feature sequence to extract style (N, idim). - spk_emb (ptional[Tensor]): Speaker embedding (spk_embed_dim,). - spk_id (Optional[Tensor]): Speaker ID (1,). - lang_id (Optional[Tensor]): Language ID (1,). - threshold (float): Threshold in inference. - minlenratio (float): Minimum length ratio in inference. - maxlenratio (float): Maximum length ratio in inference. - use_att_constraint (bool): Whether to apply attention constraint. - backward_window (int): Backward window in attention constraint. - forward_window (int): Forward window in attention constraint. - use_teacher_forcing (bool): Whether to use teacher forcing. + text (Tensor(int64)): + Input sequence of characters (T_text,). + speech (Optional[Tensor]): + Feature sequence to extract style (N, idim). + spk_emb (ptional[Tensor]): + Speaker embedding (spk_embed_dim,). + spk_id (Optional[Tensor]): + Speaker ID (1,). + lang_id (Optional[Tensor]): + Language ID (1,). + threshold (float): + Threshold in inference. + minlenratio (float): + Minimum length ratio in inference. + maxlenratio (float): + Maximum length ratio in inference. + use_att_constraint (bool): + Whether to apply attention constraint. + backward_window (int): + Backward window in attention constraint. + forward_window (int): + Forward window in attention constraint. + use_teacher_forcing (bool): + Whether to use teacher forcing. 
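A hedged sketch of calling Tacotron2.inference() with the decoding knobs documented above. The module path is taken from this diff; the assumption that only idim and odim need to be passed to the constructor, and the keyword values themselves, are illustrative rather than recommended settings:

```python
import paddle
from paddlespeech.t2s.models.tacotron2.tacotron2 import Tacotron2  # module path taken from this diff

# Assumes only idim/odim are required and the remaining arguments keep their defaults.
model = Tacotron2(idim=70, odim=80)
model.eval()

text = paddle.to_tensor([7, 12, 3, 3, 9, 2], dtype='int64')   # (T_text,) character ids
with paddle.no_grad():
    outs = model.inference(
        text,
        threshold=0.5,            # stop-token probability threshold (value illustrative)
        maxlenratio=10.0,         # cap decoded length relative to the input length
        use_att_constraint=True,  # monotonic attention constraint
    )
```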
Returns: Dict[str, Tensor] diff --git a/paddlespeech/t2s/models/transformer_tts/transformer_tts.py b/paddlespeech/t2s/models/transformer_tts/transformer_tts.py index 92754c30a47e9619643b5f780205a5b47d971841..355fceb16108dcbff689b34d0777cb089b7ba1c8 100644 --- a/paddlespeech/t2s/models/transformer_tts/transformer_tts.py +++ b/paddlespeech/t2s/models/transformer_tts/transformer_tts.py @@ -49,66 +49,124 @@ class TransformerTTS(nn.Layer): https://arxiv.org/pdf/1809.08895.pdf Args: - idim (int): Dimension of the inputs. - odim (int): Dimension of the outputs. - embed_dim (int, optional): Dimension of character embedding. - eprenet_conv_layers (int, optional): Number of encoder prenet convolution layers. - eprenet_conv_chans (int, optional): Number of encoder prenet convolution channels. - eprenet_conv_filts (int, optional): Filter size of encoder prenet convolution. - dprenet_layers (int, optional): Number of decoder prenet layers. - dprenet_units (int, optional): Number of decoder prenet hidden units. - elayers (int, optional): Number of encoder layers. - eunits (int, optional): Number of encoder hidden units. - adim (int, optional): Number of attention transformation dimensions. - aheads (int, optional): Number of heads for multi head attention. - dlayers (int, optional): Number of decoder layers. - dunits (int, optional): Number of decoder hidden units. - postnet_layers (int, optional): Number of postnet layers. - postnet_chans (int, optional): Number of postnet channels. - postnet_filts (int, optional): Filter size of postnet. - use_scaled_pos_enc (pool, optional): Whether to use trainable scaled positional encoding. - use_batch_norm (bool, optional): Whether to use batch normalization in encoder prenet. - encoder_normalize_before (bool, optional): Whether to perform layer normalization before encoder block. - decoder_normalize_before (bool, optional): Whether to perform layer normalization before decoder block. - encoder_concat_after (bool, optional): Whether to concatenate attention layer's input and output in encoder. - decoder_concat_after (bool, optional): Whether to concatenate attention layer's input and output in decoder. - positionwise_layer_type (str, optional): Position-wise operation type. - positionwise_conv_kernel_size (int, optional): Kernel size in position wise conv 1d. - reduction_factor (int, optional): Reduction factor. - spk_embed_dim (int, optional): Number of speaker embedding dimenstions. - spk_embed_integration_type (str, optional): How to integrate speaker embedding. - use_gst (str, optional): Whether to use global style token. - gst_tokens (int, optional): The number of GST embeddings. - gst_heads (int, optional): The number of heads in GST multihead attention. - gst_conv_layers (int, optional): The number of conv layers in GST. - gst_conv_chans_list (Sequence[int], optional): List of the number of channels of conv layers in GST. - gst_conv_kernel_size (int, optional): Kernal size of conv layers in GST. - gst_conv_stride (int, optional): Stride size of conv layers in GST. - gst_gru_layers (int, optional): The number of GRU layers in GST. - gst_gru_units (int, optional): The number of GRU units in GST. - transformer_lr (float, optional): Initial value of learning rate. - transformer_warmup_steps (int, optional): Optimizer warmup steps. - transformer_enc_dropout_rate (float, optional): Dropout rate in encoder except attention and positional encoding. - transformer_enc_positional_dropout_rate (float, optional): Dropout rate after encoder positional encoding. 
- transformer_enc_attn_dropout_rate (float, optional): Dropout rate in encoder self-attention module. - transformer_dec_dropout_rate (float, optional): Dropout rate in decoder except attention & positional encoding. - transformer_dec_positional_dropout_rate (float, optional): Dropout rate after decoder positional encoding. - transformer_dec_attn_dropout_rate (float, optional): Dropout rate in deocoder self-attention module. - transformer_enc_dec_attn_dropout_rate (float, optional): Dropout rate in encoder-deocoder attention module. - init_type (str, optional): How to initialize transformer parameters. - init_enc_alpha (float, optional): Initial value of alpha in scaled pos encoding of the encoder. - init_dec_alpha (float, optional): Initial value of alpha in scaled pos encoding of the decoder. - eprenet_dropout_rate (float, optional): Dropout rate in encoder prenet. - dprenet_dropout_rate (float, optional): Dropout rate in decoder prenet. - postnet_dropout_rate (float, optional): Dropout rate in postnet. - use_masking (bool, optional): Whether to apply masking for padded part in loss calculation. - use_weighted_masking (bool, optional): Whether to apply weighted masking in loss calculation. - bce_pos_weight (float, optional): Positive sample weight in bce calculation (only for use_masking=true). - loss_type (str, optional): How to calculate loss. - use_guided_attn_loss (bool, optional): Whether to use guided attention loss. - num_heads_applied_guided_attn (int, optional): Number of heads in each layer to apply guided attention loss. - num_layers_applied_guided_attn (int, optional): Number of layers to apply guided attention loss. - List of module names to apply guided attention loss. + idim (int): + Dimension of the inputs. + odim (int): + Dimension of the outputs. + embed_dim (int, optional): + Dimension of character embedding. + eprenet_conv_layers (int, optional): + Number of encoder prenet convolution layers. + eprenet_conv_chans (int, optional): + Number of encoder prenet convolution channels. + eprenet_conv_filts (int, optional): + Filter size of encoder prenet convolution. + dprenet_layers (int, optional): + Number of decoder prenet layers. + dprenet_units (int, optional): + Number of decoder prenet hidden units. + elayers (int, optional): + Number of encoder layers. + eunits (int, optional): + Number of encoder hidden units. + adim (int, optional): + Number of attention transformation dimensions. + aheads (int, optional): + Number of heads for multi head attention. + dlayers (int, optional): + Number of decoder layers. + dunits (int, optional): + Number of decoder hidden units. + postnet_layers (int, optional): + Number of postnet layers. + postnet_chans (int, optional): + Number of postnet channels. + postnet_filts (int, optional): + Filter size of postnet. + use_scaled_pos_enc (pool, optional): + Whether to use trainable scaled positional encoding. + use_batch_norm (bool, optional): + Whether to use batch normalization in encoder prenet. + encoder_normalize_before (bool, optional): + Whether to perform layer normalization before encoder block. + decoder_normalize_before (bool, optional): + Whether to perform layer normalization before decoder block. + encoder_concat_after (bool, optional): + Whether to concatenate attention layer's input and output in encoder. + decoder_concat_after (bool, optional): + Whether to concatenate attention layer's input and output in decoder. + positionwise_layer_type (str, optional): + Position-wise operation type. 
+ positionwise_conv_kernel_size (int, optional): + Kernel size in position wise conv 1d. + reduction_factor (int, optional): + Reduction factor. + spk_embed_dim (int, optional): + Number of speaker embedding dimenstions. + spk_embed_integration_type (str, optional): + How to integrate speaker embedding. + use_gst (str, optional): + Whether to use global style token. + gst_tokens (int, optional): + The number of GST embeddings. + gst_heads (int, optional): + The number of heads in GST multihead attention. + gst_conv_layers (int, optional): + The number of conv layers in GST. + gst_conv_chans_list (Sequence[int], optional): + List of the number of channels of conv layers in GST. + gst_conv_kernel_size (int, optional): + Kernal size of conv layers in GST. + gst_conv_stride (int, optional): + Stride size of conv layers in GST. + gst_gru_layers (int, optional): + The number of GRU layers in GST. + gst_gru_units (int, optional): + The number of GRU units in GST. + transformer_lr (float, optional): + Initial value of learning rate. + transformer_warmup_steps (int, optional): + Optimizer warmup steps. + transformer_enc_dropout_rate (float, optional): + Dropout rate in encoder except attention and positional encoding. + transformer_enc_positional_dropout_rate (float, optional): + Dropout rate after encoder positional encoding. + transformer_enc_attn_dropout_rate (float, optional): + Dropout rate in encoder self-attention module. + transformer_dec_dropout_rate (float, optional): + Dropout rate in decoder except attention & positional encoding. + transformer_dec_positional_dropout_rate (float, optional): + Dropout rate after decoder positional encoding. + transformer_dec_attn_dropout_rate (float, optional): + Dropout rate in deocoder self-attention module. + transformer_enc_dec_attn_dropout_rate (float, optional): + Dropout rate in encoder-deocoder attention module. + init_type (str, optional): + How to initialize transformer parameters. + init_enc_alpha (float, optional): + Initial value of alpha in scaled pos encoding of the encoder. + init_dec_alpha (float, optional): + Initial value of alpha in scaled pos encoding of the decoder. + eprenet_dropout_rate (float, optional): + Dropout rate in encoder prenet. + dprenet_dropout_rate (float, optional): + Dropout rate in decoder prenet. + postnet_dropout_rate (float, optional): + Dropout rate in postnet. + use_masking (bool, optional): + Whether to apply masking for padded part in loss calculation. + use_weighted_masking (bool, optional): + Whether to apply weighted masking in loss calculation. + bce_pos_weight (float, optional): + Positive sample weight in bce calculation (only for use_masking=true). + loss_type (str, optional): + How to calculate loss. + use_guided_attn_loss (bool, optional): + Whether to use guided attention loss. + num_heads_applied_guided_attn (int, optional): + Number of heads in each layer to apply guided attention loss. + num_layers_applied_guided_attn (int, optional): + Number of layers to apply guided attention loss. """ def __init__( diff --git a/paddlespeech/t2s/models/waveflow.py b/paddlespeech/t2s/models/waveflow.py index 52e6005be3969e1ad89c0d634efbbb62dfc1a68e..8e2ce822fd294c4f7a3eff3716a7c7a827bb60fa 100644 --- a/paddlespeech/t2s/models/waveflow.py +++ b/paddlespeech/t2s/models/waveflow.py @@ -33,8 +33,10 @@ def fold(x, n_group): """Fold audio or spectrogram's temporal dimension in to groups. Args: - x(Tensor): The input tensor. shape=(*, time_steps) - n_group(int): The size of a group. + x(Tensor): + The input tensor. 
shape=(*, time_steps) + n_group(int): + The size of a group. Returns: Tensor: Folded tensor. shape=(*, time_steps // n_group, group) @@ -53,7 +55,8 @@ class UpsampleNet(nn.LayerList): on mel and time dimension. Args: - upscale_factors(List[int], optional): Time upsampling factors for each Conv2DTranspose Layer. + upscale_factors(List[int], optional): + Time upsampling factors for each Conv2DTranspose Layer. The ``UpsampleNet`` contains ``len(upscale_factor)`` Conv2DTranspose Layers. Each upscale_factor is used as the ``stride`` for the corresponding Conv2DTranspose. Defaults to [16, 16], this the default @@ -94,8 +97,10 @@ class UpsampleNet(nn.LayerList): """Forward pass of the ``UpsampleNet`` Args: - x(Tensor): The input spectrogram. shape=(batch_size, input_channels, time_steps) - trim_conv_artifact(bool, optional, optional): Trim deconvolution artifact at each layer. Defaults to False. + x(Tensor): + The input spectrogram. shape=(batch_size, input_channels, time_steps) + trim_conv_artifact(bool, optional, optional): + Trim deconvolution artifact at each layer. Defaults to False. Returns: Tensor: The upsampled spectrogram. shape=(batch_size, input_channels, time_steps * upsample_factor) @@ -123,10 +128,14 @@ class ResidualBlock(nn.Layer): and output. Args: - channels (int): Feature size of the input. - cond_channels (int): Featuer size of the condition. - kernel_size (Tuple[int]): Kernel size of the Convolution2d applied to the input. - dilations (int): Dilations of the Convolution2d applied to the input. + channels (int): + Feature size of the input. + cond_channels (int): + Featuer size of the condition. + kernel_size (Tuple[int]): + Kernel size of the Convolution2d applied to the input. + dilations (int): + Dilations of the Convolution2d applied to the input. """ def __init__(self, channels, cond_channels, kernel_size, dilations): @@ -173,12 +182,16 @@ class ResidualBlock(nn.Layer): """Compute output for a whole folded sequence. Args: - x (Tensor): The input. [shape=(batch_size, channel, height, width)] - condition (Tensor [shape=(batch_size, condition_channel, height, width)]): The local condition. + x (Tensor): + The input. [shape=(batch_size, channel, height, width)] + condition (Tensor [shape=(batch_size, condition_channel, height, width)]): + The local condition. Returns: - res (Tensor): The residual output. [shape=(batch_size, channel, height, width)] - skip (Tensor): The skip output. [shape=(batch_size, channel, height, width)] + res (Tensor): + The residual output. [shape=(batch_size, channel, height, width)] + skip (Tensor): + The skip output. [shape=(batch_size, channel, height, width)] """ x_in = x x = self.conv(x) @@ -216,12 +229,16 @@ class ResidualBlock(nn.Layer): """Compute the output for a row and update the buffer. Args: - x_row (Tensor): A row of the input. shape=(batch_size, channel, 1, width) - condition_row (Tensor): A row of the condition. shape=(batch_size, condition_channel, 1, width) + x_row (Tensor): + A row of the input. shape=(batch_size, channel, 1, width) + condition_row (Tensor): + A row of the condition. shape=(batch_size, condition_channel, 1, width) Returns: - res (Tensor): A row of the the residual output. shape=(batch_size, channel, 1, width) - skip (Tensor): A row of the skip output. shape=(batch_size, channel, 1, width) + res (Tensor): + A row of the the residual output. shape=(batch_size, channel, 1, width) + skip (Tensor): + A row of the skip output. 
shape=(batch_size, channel, 1, width) """ x_row_in = x_row @@ -258,11 +275,16 @@ class ResidualNet(nn.LayerList): """A stack of several ResidualBlocks. It merges condition at each layer. Args: - n_layer (int): Number of ResidualBlocks in the ResidualNet. - residual_channels (int): Feature size of each ResidualBlocks. - condition_channels (int): Feature size of the condition. - kernel_size (Tuple[int]): Kernel size of each ResidualBlock. - dilations_h (List[int]): Dilation in height dimension of every ResidualBlock. + n_layer (int): + Number of ResidualBlocks in the ResidualNet. + residual_channels (int): + Feature size of each ResidualBlocks. + condition_channels (int): + Feature size of the condition. + kernel_size (Tuple[int]): + Kernel size of each ResidualBlock. + dilations_h (List[int]): + Dilation in height dimension of every ResidualBlock. Raises: ValueError: If the length of dilations_h does not equals n_layers. @@ -288,11 +310,13 @@ class ResidualNet(nn.LayerList): """Comput the output of given the input and the condition. Args: - x (Tensor): The input. shape=(batch_size, channel, height, width) - condition (Tensor): The local condition. shape=(batch_size, condition_channel, height, width) + x (Tensor): + The input. shape=(batch_size, channel, height, width) + condition (Tensor): + The local condition. shape=(batch_size, condition_channel, height, width) Returns: - Tensor : The output, which is an aggregation of all the skip outputs. shape=(batch_size, channel, height, width) + Tensor: The output, which is an aggregation of all the skip outputs. shape=(batch_size, channel, height, width) """ skip_connections = [] @@ -312,12 +336,16 @@ class ResidualNet(nn.LayerList): """Compute the output for a row and update the buffers. Args: - x_row (Tensor): A row of the input. shape=(batch_size, channel, 1, width) - condition_row (Tensor): A row of the condition. shape=(batch_size, condition_channel, 1, width) + x_row (Tensor): + A row of the input. shape=(batch_size, channel, 1, width) + condition_row (Tensor): + A row of the condition. shape=(batch_size, condition_channel, 1, width) Returns: - res (Tensor): A row of the the residual output. shape=(batch_size, channel, 1, width) - skip (Tensor): A row of the skip output. shape=(batch_size, channel, 1, width) + res (Tensor): + A row of the the residual output. shape=(batch_size, channel, 1, width) + skip (Tensor): + A row of the skip output. shape=(batch_size, channel, 1, width) """ skip_connections = [] @@ -337,11 +365,16 @@ class Flow(nn.Layer): sampling. Args: - n_layers (int): Number of ResidualBlocks in the Flow. - channels (int): Feature size of the ResidualBlocks. - mel_bands (int): Feature size of the mel spectrogram (mel bands). - kernel_size (Tuple[int]): Kernel size of each ResisualBlocks in the Flow. - n_group (int): Number of timesteps to the folded into a group. + n_layers (int): + Number of ResidualBlocks in the Flow. + channels (int): + Feature size of the ResidualBlocks. + mel_bands (int): + Feature size of the mel spectrogram (mel bands). + kernel_size (Tuple[int]): + Kernel size of each ResisualBlocks in the Flow. + n_group (int): + Number of timesteps to the folded into a group. """ dilations_dict = { 8: [1, 1, 1, 1, 1, 1, 1, 1], @@ -393,11 +426,14 @@ class Flow(nn.Layer): a sample from p(X) into a sample from p(Z). Args: - x (Tensor): A input sample of the distribution p(X). shape=(batch, 1, height, width) - condition (Tensor): The local condition. 
shape=(batch, condition_channel, height, width) + x (Tensor): + A input sample of the distribution p(X). shape=(batch, 1, height, width) + condition (Tensor): + The local condition. shape=(batch, condition_channel, height, width) Returns: - z (Tensor): shape(batch, 1, height, width), the transformed sample. + z (Tensor): + shape(batch, 1, height, width), the transformed sample. Tuple[Tensor, Tensor]: The parameter of the transformation. logs (Tensor): shape(batch, 1, height - 1, width), the log scale of the transformation from x to z. @@ -433,8 +469,10 @@ class Flow(nn.Layer): p(Z) and transform the sample. It is a auto regressive transformation. Args: - z(Tensor): A sample of the distribution p(Z). shape=(batch, 1, time_steps - condition(Tensor): The local condition. shape=(batch, condition_channel, time_steps) + z(Tensor): + A sample of the distribution p(Z). shape=(batch, 1, time_steps + condition(Tensor): + The local condition. shape=(batch, condition_channel, time_steps) Returns: Tensor: The transformed sample. shape=(batch, 1, height, width) @@ -462,12 +500,18 @@ class WaveFlow(nn.LayerList): flows. Args: - n_flows (int): Number of flows in the WaveFlow model. - n_layers (int): Number of ResidualBlocks in each Flow. - n_group (int): Number of timesteps to fold as a group. - channels (int): Feature size of each ResidualBlock. - mel_bands (int): Feature size of mel spectrogram (mel bands). - kernel_size (Union[int, List[int]]): Kernel size of the convolution layer in each ResidualBlock. + n_flows (int): + Number of flows in the WaveFlow model. + n_layers (int): + Number of ResidualBlocks in each Flow. + n_group (int): + Number of timesteps to fold as a group. + channels (int): + Feature size of each ResidualBlock. + mel_bands (int): + Feature size of mel spectrogram (mel bands). + kernel_size (Union[int, List[int]]): + Kernel size of the convolution layer in each ResidualBlock. """ def __init__(self, n_flows, n_layers, n_group, channels, mel_bands, @@ -518,12 +562,16 @@ class WaveFlow(nn.LayerList): condition. Args: - x (Tensor): The audio. shape=(batch_size, time_steps) - condition (Tensor): The local condition (mel spectrogram here). shape=(batch_size, condition channel, time_steps) + x (Tensor): + The audio. shape=(batch_size, time_steps) + condition (Tensor): + The local condition (mel spectrogram here). shape=(batch_size, condition channel, time_steps) Returns: - Tensor: The transformed random variable. shape=(batch_size, time_steps) - Tensor: The log determinant of the jacobian of the transformation from x to z. shape=(1,) + Tensor: + The transformed random variable. shape=(batch_size, time_steps) + Tensor: + The log determinant of the jacobian of the transformation from x to z. shape=(1,) """ # x: (B, T) # condition: (B, C, T) upsampled condition @@ -559,12 +607,13 @@ class WaveFlow(nn.LayerList): autoregressive manner. Args: - z (Tensor): A sample of the distribution p(Z). shape=(batch, 1, time_steps - condition (Tensor): The local condition. shape=(batch, condition_channel, time_steps) + z (Tensor): + A sample of the distribution p(Z). shape=(batch, 1, time_steps + condition (Tensor): + The local condition. shape=(batch, condition_channel, time_steps) Returns: Tensor: The transformed sample (audio here). shape=(batch_size, time_steps) - """ z, condition = self._trim(z, condition) @@ -590,13 +639,20 @@ class ConditionalWaveFlow(nn.LayerList): """ConditionalWaveFlow, a UpsampleNet with a WaveFlow model. Args: - upsample_factors (List[int]): Upsample factors for the upsample net. 
- n_flows (int): Number of flows in the WaveFlow model. - n_layers (int): Number of ResidualBlocks in each Flow. - n_group (int): Number of timesteps to fold as a group. - channels (int): Feature size of each ResidualBlock. - n_mels (int): Feature size of mel spectrogram (mel bands). - kernel_size (Union[int, List[int]]): Kernel size of the convolution layer in each ResidualBlock. + upsample_factors (List[int]): + Upsample factors for the upsample net. + n_flows (int): + Number of flows in the WaveFlow model. + n_layers (int): + Number of ResidualBlocks in each Flow. + n_group (int): + Number of timesteps to fold as a group. + channels (int): + Feature size of each ResidualBlock. + n_mels (int): + Feature size of mel spectrogram (mel bands). + kernel_size (Union[int, List[int]]): + Kernel size of the convolution layer in each ResidualBlock. """ def __init__(self, @@ -622,12 +678,16 @@ class ConditionalWaveFlow(nn.LayerList): the determinant of the jacobian of the transformation from x to z. Args: - audio(Tensor): The audio. shape=(B, T) - mel(Tensor): The mel spectrogram. shape=(B, C_mel, T_mel) + audio(Tensor): + The audio. shape=(B, T) + mel(Tensor): + The mel spectrogram. shape=(B, C_mel, T_mel) Returns: - Tensor: The inversely transformed random variable z (x to z). shape=(B, T) - Tensor: the log of the determinant of the jacobian of the transformation from x to z. shape=(1,) + Tensor: + The inversely transformed random variable z (x to z). shape=(B, T) + Tensor: + the log of the determinant of the jacobian of the transformation from x to z. shape=(1,) """ condition = self.encoder(mel) z, log_det_jacobian = self.decoder(audio, condition) @@ -638,10 +698,12 @@ class ConditionalWaveFlow(nn.LayerList): """Generate raw audio given mel spectrogram. Args: - mel(np.ndarray): Mel spectrogram of an utterance(in log-magnitude). shape=(C_mel, T_mel) + mel(np.ndarray): + Mel spectrogram of an utterance(in log-magnitude). shape=(C_mel, T_mel) Returns: - Tensor: The synthesized audio, where``T <= T_mel * upsample_factors``. shape=(B, T) + Tensor: + The synthesized audio, where``T <= T_mel * upsample_factors``. shape=(B, T) """ start = time.time() condition = self.encoder(mel, trim_conv_artifact=True) # (B, C, T) @@ -657,7 +719,8 @@ class ConditionalWaveFlow(nn.LayerList): """Generate raw audio given mel spectrogram. Args: - mel(np.ndarray): Mel spectrogram of an utterance(in log-magnitude). shape=(C_mel, T_mel) + mel(np.ndarray): + Mel spectrogram of an utterance(in log-magnitude). shape=(C_mel, T_mel) Returns: np.ndarray: The synthesized audio. shape=(T,) @@ -673,8 +736,10 @@ class ConditionalWaveFlow(nn.LayerList): """Build a ConditionalWaveFlow model from a pretrained model. Args: - config(yacs.config.CfgNode): model configs - checkpoint_path(Path or str): the path of pretrained model checkpoint, without extension name + config(yacs.config.CfgNode): + model configs + checkpoint_path(Path or str): + the path of pretrained model checkpoint, without extension name Returns: ConditionalWaveFlow The model built from pretrained result. @@ -694,8 +759,8 @@ class WaveFlowLoss(nn.Layer): """Criterion of a WaveFlow model. Args: - sigma (float): The standard deviation of the gaussian noise used in WaveFlow, - by default 1.0. + sigma (float): + The standard deviation of the gaussian noise used in WaveFlow, by default 1.0. """ def __init__(self, sigma=1.0): @@ -708,8 +773,10 @@ class WaveFlowLoss(nn.Layer): log_det_jacobian of transformation from x to z. 
Args: - z(Tensor): The transformed random variable (x to z). shape=(B, T) - log_det_jacobian(Tensor): The log of the determinant of the jacobian matrix of the + z(Tensor): + The transformed random variable (x to z). shape=(B, T) + log_det_jacobian(Tensor): + The log of the determinant of the jacobian matrix of the transformation from x to z. shape=(1,) Returns: @@ -726,7 +793,8 @@ class ConditionalWaveFlow2Infer(ConditionalWaveFlow): """Generate raw audio given mel spectrogram. Args: - mel (np.ndarray): Mel spectrogram of an utterance(in log-magnitude). shape=(C_mel, T_mel) + mel (np.ndarray): + Mel spectrogram of an utterance(in log-magnitude). shape=(C_mel, T_mel) Returns: np.ndarray: The synthesized audio. shape=(T,) diff --git a/paddlespeech/t2s/models/wavernn/wavernn.py b/paddlespeech/t2s/models/wavernn/wavernn.py index eb892eda56e5f412b3bf20fca864dfec0ff150cc..254edbb2df0faec9d896ad1a1cc426e438fa27d1 100644 --- a/paddlespeech/t2s/models/wavernn/wavernn.py +++ b/paddlespeech/t2s/models/wavernn/wavernn.py @@ -165,19 +165,29 @@ class WaveRNN(nn.Layer): init_type: str="xavier_uniform", ): ''' Args: - rnn_dims (int, optional): Hidden dims of RNN Layers. - fc_dims (int, optional): Dims of FC Layers. - bits (int, optional): bit depth of signal. - aux_context_window (int, optional): The context window size of the first convolution applied to the - auxiliary input, by default 2 - upsample_scales (List[int], optional): Upsample scales of the upsample network. - aux_channels (int, optional): Auxiliary channel of the residual blocks. - compute_dims (int, optional): Dims of Conv1D in MelResNet. - res_out_dims (int, optional): Dims of output in MelResNet. - res_blocks (int, optional): Number of residual blocks. - mode (str, optional): Output mode of the WaveRNN vocoder. + rnn_dims (int, optional): + Hidden dims of RNN Layers. + fc_dims (int, optional): + Dims of FC Layers. + bits (int, optional): + bit depth of signal. + aux_context_window (int, optional): + The context window size of the first convolution applied to the auxiliary input, by default 2 + upsample_scales (List[int], optional): + Upsample scales of the upsample network. + aux_channels (int, optional): + Auxiliary channel of the residual blocks. + compute_dims (int, optional): + Dims of Conv1D in MelResNet. + res_out_dims (int, optional): + Dims of output in MelResNet. + res_blocks (int, optional): + Number of residual blocks. + mode (str, optional): + Output mode of the WaveRNN vocoder. `MOL` for Mixture of Logistic Distribution, and `RAW` for quantized bits as the model's output. - init_type (str): How to initialize parameters. + init_type (str): + How to initialize parameters. 
''' super().__init__() self.mode = mode @@ -226,8 +236,10 @@ class WaveRNN(nn.Layer): def forward(self, x, c): ''' Args: - x (Tensor): wav sequence, [B, T] - c (Tensor): mel spectrogram [B, C_aux, T'] + x (Tensor): + wav sequence, [B, T] + c (Tensor): + mel spectrogram [B, C_aux, T'] T = (T' - 2 * aux_context_window ) * hop_length Returns: @@ -280,10 +292,14 @@ class WaveRNN(nn.Layer): gen_display: bool=False): """ Args: - c(Tensor): input mels, (T', C_aux) - batched(bool): generate in batch or not - target(int): target number of samples to be generated in each batch entry - overlap(int): number of samples for crossfading between batches + c(Tensor): + input mels, (T', C_aux) + batched(bool): + generate in batch or not + target(int): + target number of samples to be generated in each batch entry + overlap(int): + number of samples for crossfading between batches mu_law(bool) Returns: wav sequence: Output (T' * prod(upsample_scales), out_channels, C_out). @@ -404,7 +420,8 @@ class WaveRNN(nn.Layer): def pad_tensor(self, x, pad, side='both'): ''' Args: - x(Tensor): mel, [1, n_frames, 80] + x(Tensor): + mel, [1, n_frames, 80] pad(int): side(str, optional): (Default value = 'both') @@ -428,12 +445,15 @@ class WaveRNN(nn.Layer): Overlap will be used for crossfading in xfade_and_unfold() Args: - x(Tensor): Upsampled conditioning features. mels or aux + x(Tensor): + Upsampled conditioning features. mels or aux shape=(1, T, features) mels: [1, T, 80] aux: [1, T, 128] - target(int): Target timesteps for each index of batch - overlap(int): Timesteps for both xfade and rnn warmup + target(int): + Target timesteps for each index of batch + overlap(int): + Timesteps for both xfade and rnn warmup Returns: Tensor: diff --git a/paddlespeech/t2s/modules/causal_conv.py b/paddlespeech/t2s/modules/causal_conv.py index 3abccc15f45e0911f18535efe8575177f735c66b..337ee2383a69c4e773f1a345c2582c8b929a0a24 100644 --- a/paddlespeech/t2s/modules/causal_conv.py +++ b/paddlespeech/t2s/modules/causal_conv.py @@ -42,7 +42,8 @@ class CausalConv1D(nn.Layer): def forward(self, x): """Calculate forward propagation. Args: - x (Tensor): Input tensor (B, in_channels, T). + x (Tensor): + Input tensor (B, in_channels, T). Returns: Tensor: Output tensor (B, out_channels, T). """ @@ -67,7 +68,8 @@ class CausalConv1DTranspose(nn.Layer): def forward(self, x): """Calculate forward propagation. Args: - x (Tensor): Input tensor (B, in_channels, T_in). + x (Tensor): + Input tensor (B, in_channels, T_in). Returns: Tensor: Output tensor (B, out_channels, T_out). """ diff --git a/paddlespeech/t2s/modules/conformer/convolution.py b/paddlespeech/t2s/modules/conformer/convolution.py index 185c62fb3c804f9ce495323f590878072d8bafa6..dadda064075d10a79e946258a2bd72d5904b7862 100644 --- a/paddlespeech/t2s/modules/conformer/convolution.py +++ b/paddlespeech/t2s/modules/conformer/convolution.py @@ -20,8 +20,10 @@ class ConvolutionModule(nn.Layer): """ConvolutionModule in Conformer model. Args: - channels (int): The number of channels of conv layers. - kernel_size (int): Kernerl size of conv layers. + channels (int): + The number of channels of conv layers. + kernel_size (int): + Kernerl size of conv layers. """ def __init__(self, channels, kernel_size, activation=nn.ReLU(), bias=True): @@ -59,7 +61,8 @@ class ConvolutionModule(nn.Layer): """Compute convolution module. Args: - x (Tensor): Input tensor (#batch, time, channels). + x (Tensor): + Input tensor (#batch, time, channels). Returns: Tensor: Output tensor (#batch, time, channels). 
""" diff --git a/paddlespeech/t2s/modules/conformer/encoder_layer.py b/paddlespeech/t2s/modules/conformer/encoder_layer.py index 61c32612527630ec66941b882335a208a50d1b11..26a354565a8b62e3f8941319b69961b794d18fbb 100644 --- a/paddlespeech/t2s/modules/conformer/encoder_layer.py +++ b/paddlespeech/t2s/modules/conformer/encoder_layer.py @@ -23,25 +23,34 @@ class EncoderLayer(nn.Layer): """Encoder layer module. Args: - size (int): Input dimension. - self_attn (nn.Layer): Self-attention module instance. + size (int): + Input dimension. + self_attn (nn.Layer): + Self-attention module instance. `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` instance can be used as the argument. - feed_forward (nn.Layer): Feed-forward module instance. + feed_forward (nn.Layer): + Feed-forward module instance. `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance can be used as the argument. - feed_forward_macaron (nn.Layer): Additional feed-forward module instance. + feed_forward_macaron (nn.Layer): + Additional feed-forward module instance. `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance can be used as the argument. - conv_module (nn.Layer): Convolution module instance. + conv_module (nn.Layer): + Convolution module instance. `ConvlutionModule` instance can be used as the argument. - dropout_rate (float): Dropout rate. - normalize_before (bool): Whether to use layer_norm before the first block. - concat_after (bool): Whether to concat attention layer's input and output. + dropout_rate (float): + Dropout rate. + normalize_before (bool): + Whether to use layer_norm before the first block. + concat_after (bool): + Whether to concat attention layer's input and output. if True, additional linear will be applied. i.e. x -> x + linear(concat(x, att(x))) if False, no additional linear will be applied. i.e. x -> x + att(x) - stochastic_depth_rate (float): Proability to skip this layer. + stochastic_depth_rate (float): + Proability to skip this layer. During training, the layer may skip residual computation and return input as-is with given probability. """ @@ -86,15 +95,19 @@ class EncoderLayer(nn.Layer): """Compute encoded features. Args: - x_input(Union[Tuple, Tensor]): Input tensor w/ or w/o pos emb. + x_input(Union[Tuple, Tensor]): + Input tensor w/ or w/o pos emb. - w/ pos emb: Tuple of tensors [(#batch, time, size), (1, time, size)]. - w/o pos emb: Tensor (#batch, time, size). - mask(Tensor): Mask tensor for the input (#batch, time). + mask(Tensor): + Mask tensor for the input (#batch, time). cache (Tensor): Returns: - Tensor: Output tensor (#batch, time, size). - Tensor: Mask tensor (#batch, time). + Tensor: + Output tensor (#batch, time, size). + Tensor: + Mask tensor (#batch, time). """ if isinstance(x_input, tuple): x, pos_emb = x_input[0], x_input[1] diff --git a/paddlespeech/t2s/modules/conv.py b/paddlespeech/t2s/modules/conv.py index aa875bd500124e5bd3d3807b10f63ed8442d3800..922af03f2d1a87094f31d86e5d645fa94be163ff 100644 --- a/paddlespeech/t2s/modules/conv.py +++ b/paddlespeech/t2s/modules/conv.py @@ -42,13 +42,19 @@ class Conv1dCell(nn.Conv1D): class. Args: - in_channels (int): The feature size of the input. - out_channels (int): The feature size of the output. - kernel_size (int or Tuple[int]): The size of the kernel. 
- dilation (int or Tuple[int]): The dilation of the convolution, by default 1 - weight_attr (ParamAttr, Initializer, str or bool, optional) : The parameter attribute of the convolution kernel, + in_channels (int): + The feature size of the input. + out_channels (int): + The feature size of the output. + kernel_size (int or Tuple[int]): + The size of the kernel. + dilation (int or Tuple[int]): + The dilation of the convolution, by default 1 + weight_attr (ParamAttr, Initializer, str or bool, optional): + The parameter attribute of the convolution kernel, by default None. - bias_attr (ParamAttr, Initializer, str or bool, optional):The parameter attribute of the bias. + bias_attr (ParamAttr, Initializer, str or bool, optional): + The parameter attribute of the bias. If ``False``, this layer does not have a bias, by default None. Examples: @@ -122,7 +128,8 @@ class Conv1dCell(nn.Conv1D): """Initialize the buffer for the step input. Args: - x_t (Tensor): The step input. shape=(batch_size, in_channels) + x_t (Tensor): + The step input. shape=(batch_size, in_channels) """ batch_size, _ = x_t.shape @@ -134,7 +141,8 @@ class Conv1dCell(nn.Conv1D): """Shift the buffer by one step. Args: - x_t (Tensor): The step input. shape=(batch_size, in_channels) + x_t (Tensor): + The step input. shape=(batch_size, in_channels) """ self._buffer = paddle.concat( @@ -144,10 +152,12 @@ class Conv1dCell(nn.Conv1D): """Add step input and compute step output. Args: - x_t (Tensor): The step input. shape=(batch_size, in_channels) + x_t (Tensor): + The step input. shape=(batch_size, in_channels) Returns: - y_t (Tensor): The step output. shape=(batch_size, out_channels) + y_t (Tensor): + The step output. shape=(batch_size, out_channels) """ batch_size = x_t.shape[0] @@ -173,10 +183,14 @@ class Conv1dBatchNorm(nn.Layer): """A Conv1D Layer followed by a BatchNorm1D. Args: - in_channels (int): The feature size of the input. - out_channels (int): The feature size of the output. - kernel_size (int): The size of the convolution kernel. - stride (int, optional): The stride of the convolution, by default 1. + in_channels (int): + The feature size of the input. + out_channels (int): + The feature size of the output. + kernel_size (int): + The size of the convolution kernel. + stride (int, optional): + The stride of the convolution, by default 1. padding (int, str or Tuple[int], optional): The padding of the convolution. If int, a symmetrical padding is applied before convolution; @@ -189,9 +203,12 @@ class Conv1dBatchNorm(nn.Layer): bias_attr (ParamAttr, Initializer, str or bool, optional): The parameter attribute of the bias of the convolution, by defaultNone. - data_format (str ["NCL" or "NLC"], optional): The data layout of the input, by default "NCL" - momentum (float, optional): The momentum of the BatchNorm1D layer, by default 0.9 - epsilon (float, optional): The epsilon of the BatchNorm1D layer, by default 1e-05 + data_format (str ["NCL" or "NLC"], optional): + The data layout of the input, by default "NCL" + momentum (float, optional): + The momentum of the BatchNorm1D layer, by default 0.9 + epsilon (float, optional): + The epsilon of the BatchNorm1D layer, by default 1e-05 """ def __init__(self, @@ -225,12 +242,13 @@ class Conv1dBatchNorm(nn.Layer): """Forward pass of the Conv1dBatchNorm layer. Args: - x (Tensor): The input tensor. Its data layout depends on ``data_format``. - shape=(B, C_in, T_in) or (B, T_in, C_in) + x (Tensor): + The input tensor. Its data layout depends on ``data_format``.
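The buffer methods above are what make step-by-step (streaming) convolution possible; a minimal sketch, assuming the start_sequence()/add_input() protocol shown in the class's Examples section:

```python
import paddle
from paddlespeech.t2s.modules.conv import Conv1dCell

cell = Conv1dCell(in_channels=80, out_channels=128, kernel_size=3, dilation=1)
cell.eval()            # incremental decoding is an inference-time feature
cell.start_sequence()  # reset the internal buffer before a new utterance
x_t = paddle.randn([4, 80])   # one step: (batch_size, in_channels)
y_t = cell.add_input(x_t)     # one step out: (batch_size, out_channels)
```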
+ shape=(B, C_in, T_in) or (B, T_in, C_in) Returns: - Tensor: The output tensor. - shape=(B, C_out, T_out) or (B, T_out, C_out) + Tensor: + The output tensor. shape=(B, C_out, T_out) or (B, T_out, C_out) """ x = self.conv(x) diff --git a/paddlespeech/t2s/modules/geometry.py b/paddlespeech/t2s/modules/geometry.py index 01eb5ad0ab2479cff21d210c3b2f1aa5742fbd4c..80c872a817d6e16c4117dc941580c038e72fe83c 100644 --- a/paddlespeech/t2s/modules/geometry.py +++ b/paddlespeech/t2s/modules/geometry.py @@ -19,8 +19,10 @@ def shuffle_dim(x, axis, perm=None): """Permute input tensor along aixs given the permutation or randomly. Args: - x (Tensor): The input tensor. - axis (int): The axis to shuffle. + x (Tensor): + The input tensor. + axis (int): + The axis to shuffle. perm (List[int], ndarray, optional): The order to reorder the tensor along the ``axis``-th dimension. It is a permutation of ``[0, d)``, where d is the size of the diff --git a/paddlespeech/t2s/modules/layer_norm.py b/paddlespeech/t2s/modules/layer_norm.py index 088b98e02cf3fc987da54b881cf8060dfe15ecf2..9e2add29334a646fb4191eeb2d8bd6a8041531d8 100644 --- a/paddlespeech/t2s/modules/layer_norm.py +++ b/paddlespeech/t2s/modules/layer_norm.py @@ -19,8 +19,10 @@ from paddle import nn class LayerNorm(nn.LayerNorm): """Layer normalization module. Args: - nout (int): Output dim size. - dim (int): Dimension to be normalized. + nout (int): + Output dim size. + dim (int): + Dimension to be normalized. """ def __init__(self, nout, dim=-1): @@ -32,7 +34,8 @@ class LayerNorm(nn.LayerNorm): """Apply layer normalization. Args: - x (Tensor):Input tensor. + x (Tensor): + Input tensor. Returns: Tensor: Normalized tensor. diff --git a/paddlespeech/t2s/modules/losses.py b/paddlespeech/t2s/modules/losses.py index 4726f40ecf1ee3c8208bead0919f348cb679de4a..b2a31a32145afb981bff432579ddc513b937bd6f 100644 --- a/paddlespeech/t2s/modules/losses.py +++ b/paddlespeech/t2s/modules/losses.py @@ -269,8 +269,10 @@ class GuidedAttentionLoss(nn.Layer): """Make masks indicating non-padded part. Args: - ilens(Tensor(int64) or List): Batch of lengths (B,). - olens(Tensor(int64) or List): Batch of lengths (B,). + ilens(Tensor(int64) or List): + Batch of lengths (B,). + olens(Tensor(int64) or List): + Batch of lengths (B,). Returns: Tensor: Mask tensor indicating non-padded part. @@ -322,9 +324,12 @@ class GuidedMultiHeadAttentionLoss(GuidedAttentionLoss): """Calculate forward propagation. Args: - att_ws(Tensor): Batch of multi head attention weights (B, H, T_max_out, T_max_in). - ilens(Tensor): Batch of input lenghts (B,). - olens(Tensor): Batch of output lenghts (B,). + att_ws(Tensor): + Batch of multi head attention weights (B, H, T_max_out, T_max_in). + ilens(Tensor): + Batch of input lenghts (B,). + olens(Tensor): + Batch of output lenghts (B,). Returns: Tensor: Guided attention loss value. @@ -354,9 +359,12 @@ class Tacotron2Loss(nn.Layer): """Initialize Tactoron2 loss module. Args: - use_masking (bool): Whether to apply masking for padded part in loss calculation. - use_weighted_masking (bool): Whether to apply weighted masking in loss calculation. - bce_pos_weight (float): Weight of positive sample of stop token. + use_masking (bool): + Whether to apply masking for padded part in loss calculation. + use_weighted_masking (bool): + Whether to apply weighted masking in loss calculation. + bce_pos_weight (float): + Weight of positive sample of stop token. 
""" super().__init__() assert (use_masking != use_weighted_masking) or not use_masking @@ -374,17 +382,25 @@ class Tacotron2Loss(nn.Layer): """Calculate forward propagation. Args: - after_outs(Tensor): Batch of outputs after postnets (B, Lmax, odim). - before_outs(Tensor): Batch of outputs before postnets (B, Lmax, odim). - logits(Tensor): Batch of stop logits (B, Lmax). - ys(Tensor): Batch of padded target features (B, Lmax, odim). - stop_labels(Tensor(int64)): Batch of the sequences of stop token labels (B, Lmax). + after_outs(Tensor): + Batch of outputs after postnets (B, Lmax, odim). + before_outs(Tensor): + Batch of outputs before postnets (B, Lmax, odim). + logits(Tensor): + Batch of stop logits (B, Lmax). + ys(Tensor): + Batch of padded target features (B, Lmax, odim). + stop_labels(Tensor(int64)): + Batch of the sequences of stop token labels (B, Lmax). olens(Tensor(int64)): Returns: - Tensor: L1 loss value. - Tensor: Mean square error loss value. - Tensor: Binary cross entropy loss value. + Tensor: + L1 loss value. + Tensor: + Mean square error loss value. + Tensor: + Binary cross entropy loss value. """ # make mask and apply it if self.use_masking: @@ -437,16 +453,24 @@ def stft(x, pad_mode='reflect'): """Perform STFT and convert to magnitude spectrogram. Args: - x(Tensor): Input signal tensor (B, T). - fft_size(int): FFT size. - hop_size(int): Hop size. - win_length(int, optional): window : str, optional (Default value = None) - window(str, optional): Name of window function, see `scipy.signal.get_window` for more - details. Defaults to "hann". - center(bool, optional, optional): center (bool, optional): Whether to pad `x` to make that the + x(Tensor): + Input signal tensor (B, T). + fft_size(int): + FFT size. + hop_size(int): + Hop size. + win_length(int, optional): + window (str, optional): + (Default value = None) + window(str, optional): + Name of window function, see `scipy.signal.get_window` for more details. Defaults to "hann". + center(bool, optional, optional): center (bool, optional): + Whether to pad `x` to make that the :math:`t \times hop\\_length` at the center of :math:`t`-th frame. Default: `True`. - pad_mode(str, optional, optional): (Default value = 'reflect') - hop_length: (Default value = None) + pad_mode(str, optional, optional): + (Default value = 'reflect') + hop_length: + (Default value = None) Returns: Tensor: Magnitude spectrogram (B, #frames, fft_size // 2 + 1). @@ -480,8 +504,10 @@ class SpectralConvergenceLoss(nn.Layer): def forward(self, x_mag, y_mag): """Calculate forward propagation. Args: - x_mag (Tensor): Magnitude spectrogram of predicted signal (B, #frames, #freq_bins). - y_mag (Tensor): Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins). + x_mag (Tensor): + Magnitude spectrogram of predicted signal (B, #frames, #freq_bins). + y_mag (Tensor): + Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins). Returns: Tensor: Spectral convergence loss value. """ @@ -501,8 +527,10 @@ class LogSTFTMagnitudeLoss(nn.Layer): def forward(self, x_mag, y_mag): """Calculate forward propagation. Args: - x_mag (Tensor): Magnitude spectrogram of predicted signal (B, #frames, #freq_bins). - y_mag (Tensor): Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins). + x_mag (Tensor): + Magnitude spectrogram of predicted signal (B, #frames, #freq_bins). + y_mag (Tensor): + Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins). Returns: Tensor: Log STFT magnitude loss value. 
""" @@ -531,11 +559,15 @@ class STFTLoss(nn.Layer): def forward(self, x, y): """Calculate forward propagation. Args: - x (Tensor): Predicted signal (B, T). - y (Tensor): Groundtruth signal (B, T). + x (Tensor): + Predicted signal (B, T). + y (Tensor): + Groundtruth signal (B, T). Returns: - Tensor: Spectral convergence loss value. - Tensor: Log STFT magnitude loss value. + Tensor: + Spectral convergence loss value. + Tensor: + Log STFT magnitude loss value. """ x_mag = stft(x, self.fft_size, self.shift_size, self.win_length, self.window) @@ -558,10 +590,14 @@ class MultiResolutionSTFTLoss(nn.Layer): window="hann", ): """Initialize Multi resolution STFT loss module. Args: - fft_sizes (list): List of FFT sizes. - hop_sizes (list): List of hop sizes. - win_lengths (list): List of window lengths. - window (str): Window function type. + fft_sizes (list): + List of FFT sizes. + hop_sizes (list): + List of hop sizes. + win_lengths (list): + List of window lengths. + window (str): + Window function type. """ super().__init__() assert len(fft_sizes) == len(hop_sizes) == len(win_lengths) @@ -573,11 +609,15 @@ class MultiResolutionSTFTLoss(nn.Layer): """Calculate forward propagation. Args: - x (Tensor): Predicted signal (B, T) or (B, #subband, T). - y (Tensor): Groundtruth signal (B, T) or (B, #subband, T). + x (Tensor): + Predicted signal (B, T) or (B, #subband, T). + y (Tensor): + Groundtruth signal (B, T) or (B, #subband, T). Returns: - Tensor: Multi resolution spectral convergence loss value. - Tensor: Multi resolution log STFT magnitude loss value. + Tensor: + Multi resolution spectral convergence loss value. + Tensor: + Multi resolution log STFT magnitude loss value. """ if len(x.shape) == 3: # (B, C, T) -> (B x C, T) @@ -615,9 +655,11 @@ class GeneratorAdversarialLoss(nn.Layer): def forward(self, outputs): """Calcualate generator adversarial loss. Args: - outputs (Tensor or List): Discriminator outputs or list of discriminator outputs. + outputs (Tensor or List): + Discriminator outputs or list of discriminator outputs. Returns: - Tensor: Generator adversarial loss value. + Tensor: + Generator adversarial loss value. """ if isinstance(outputs, (tuple, list)): adv_loss = 0.0 @@ -659,13 +701,15 @@ class DiscriminatorAdversarialLoss(nn.Layer): """Calcualate discriminator adversarial loss. Args: - outputs_hat (Tensor or list): Discriminator outputs or list of - discriminator outputs calculated from generator outputs. - outputs (Tensor or list): Discriminator outputs or list of - discriminator outputs calculated from groundtruth. + outputs_hat (Tensor or list): + Discriminator outputs or list of discriminator outputs calculated from generator outputs. + outputs (Tensor or list): + Discriminator outputs or list of discriminator outputs calculated from groundtruth. Returns: - Tensor: Discriminator real loss value. - Tensor: Discriminator fake loss value. + Tensor: + Discriminator real loss value. + Tensor: + Discriminator fake loss value. """ if isinstance(outputs, (tuple, list)): real_loss = 0.0 @@ -766,9 +810,12 @@ def masked_l1_loss(prediction, target, mask): """Compute maksed L1 loss. Args: - prediction(Tensor): The prediction. - target(Tensor): The target. The shape should be broadcastable to ``prediction``. - mask(Tensor): The mask. The shape should be broadcatable to the broadcasted shape of + prediction(Tensor): + The prediction. + target(Tensor): + The target. The shape should be broadcastable to ``prediction``. + mask(Tensor): + The mask. 
The shape should be broadcatable to the broadcasted shape of ``prediction`` and ``target``. Returns: @@ -916,8 +963,10 @@ class MelSpectrogramLoss(nn.Layer): def forward(self, y_hat, y): """Calculate Mel-spectrogram loss. Args: - y_hat(Tensor): Generated single tensor (B, 1, T). - y(Tensor): Groundtruth single tensor (B, 1, T). + y_hat(Tensor): + Generated single tensor (B, 1, T). + y(Tensor): + Groundtruth single tensor (B, 1, T). Returns: Tensor: Mel-spectrogram loss value. @@ -947,9 +996,11 @@ class FeatureMatchLoss(nn.Layer): """Calcualate feature matching loss. Args: - feats_hat(list): List of list of discriminator outputs + feats_hat(list): + List of list of discriminator outputs calcuated from generater outputs. - feats(list): List of list of discriminator outputs + feats(list): + List of list of discriminator outputs Returns: Tensor: Feature matching loss value. @@ -986,11 +1037,16 @@ class KLDivergenceLoss(nn.Layer): """Calculate KL divergence loss. Args: - z_p (Tensor): Flow hidden representation (B, H, T_feats). - logs_q (Tensor): Posterior encoder projected scale (B, H, T_feats). - m_p (Tensor): Expanded text encoder projected mean (B, H, T_feats). - logs_p (Tensor): Expanded text encoder projected scale (B, H, T_feats). - z_mask (Tensor): Mask tensor (B, 1, T_feats). + z_p (Tensor): + Flow hidden representation (B, H, T_feats). + logs_q (Tensor): + Posterior encoder projected scale (B, H, T_feats). + m_p (Tensor): + Expanded text encoder projected mean (B, H, T_feats). + logs_p (Tensor): + Expanded text encoder projected scale (B, H, T_feats). + z_mask (Tensor): + Mask tensor (B, 1, T_feats). Returns: Tensor: KL divergence loss. diff --git a/paddlespeech/t2s/modules/nets_utils.py b/paddlespeech/t2s/modules/nets_utils.py index 8cf17a6a15244e2b4a65f4c259d04c67df27436e..a3d5d1354f114952b784ba16b71c9344ef28c9d8 100644 --- a/paddlespeech/t2s/modules/nets_utils.py +++ b/paddlespeech/t2s/modules/nets_utils.py @@ -25,8 +25,10 @@ def pad_list(xs, pad_value): """Perform padding for the list of tensors. Args: - xs (List[Tensor]): List of Tensors [(T_1, `*`), (T_2, `*`), ..., (T_B, `*`)]. - pad_value (float): Value for padding. + xs (List[Tensor]): + List of Tensors [(T_1, `*`), (T_2, `*`), ..., (T_B, `*`)]. + pad_value (float): + Value for padding. Returns: Tensor: Padded tensor (B, Tmax, `*`). @@ -55,10 +57,13 @@ def make_pad_mask(lengths, xs=None, length_dim=-1): """Make mask tensor containing indices of padded part. Args: - lengths (Tensor(int64)): Batch of lengths (B,). - xs (Tensor, optional): The reference tensor. + lengths (Tensor(int64)): + Batch of lengths (B,). + xs (Tensor, optional): + The reference tensor. If set, masks will be the same shape as this tensor. - length_dim (int, optional): Dimension indicator of the above tensor. + length_dim (int, optional): + Dimension indicator of the above tensor. See the example. Returns: @@ -166,14 +171,18 @@ def make_non_pad_mask(lengths, xs=None, length_dim=-1): """Make mask tensor containing indices of non-padded part. Args: - lengths (Tensor(int64) or List): Batch of lengths (B,). - xs (Tensor, optional): The reference tensor. + lengths (Tensor(int64) or List): + Batch of lengths (B,). + xs (Tensor, optional): + The reference tensor. If set, masks will be the same shape as this tensor. - length_dim (int, optional): Dimension indicator of the above tensor. + length_dim (int, optional): + Dimension indicator of the above tensor. See the example. Returns: - Tensor(bool): mask tensor containing indices of padded part bool. 
+ Tensor(bool): + mask tensor containing indices of padded part bool. Examples: With only lengths. @@ -257,8 +266,10 @@ def initialize(model: nn.Layer, init: str): Custom initialization routines can be implemented into submodules Args: - model (nn.Layer): Target. - init (str): Method of initialization. + model (nn.Layer): + Target. + init (str): + Method of initialization. """ assert check_argument_types() @@ -285,12 +296,17 @@ def get_random_segments( segment_size: int, ) -> Tuple[paddle.Tensor, paddle.Tensor]: """Get random segments. Args: - x (Tensor): Input tensor (B, C, T). - x_lengths (Tensor): Length tensor (B,). - segment_size (int): Segment size. + x (Tensor): + Input tensor (B, C, T). + x_lengths (Tensor): + Length tensor (B,). + segment_size (int): + Segment size. Returns: - Tensor: Segmented tensor (B, C, segment_size). - Tensor: Start index tensor (B,). + Tensor: + Segmented tensor (B, C, segment_size). + Tensor: + Start index tensor (B,). """ b, c, t = paddle.shape(x) max_start_idx = x_lengths - segment_size @@ -306,9 +322,12 @@ def get_segments( segment_size: int, ) -> paddle.Tensor: """Get segments. Args: - x (Tensor): Input tensor (B, C, T). - start_idxs (Tensor): Start index tensor (B,). - segment_size (int): Segment size. + x (Tensor): + Input tensor (B, C, T). + start_idxs (Tensor): + Start index tensor (B,). + segment_size (int): + Segment size. Returns: Tensor: Segmented tensor (B, C, segment_size). """ @@ -353,14 +372,20 @@ def phones_masking(xs_pad: paddle.Tensor, span_bdy: paddle.Tensor=None): ''' Args: - xs_pad (paddle.Tensor): input speech (B, Tmax, D). - src_mask (paddle.Tensor): mask of speech (B, 1, Tmax). - align_start (paddle.Tensor): frame level phone alignment start (B, Tmax2). - align_end (paddle.Tensor): frame level phone alignment end (B, Tmax2). - align_start_lens (paddle.Tensor): length of align_start (B, ). + xs_pad (paddle.Tensor): + input speech (B, Tmax, D). + src_mask (paddle.Tensor): + mask of speech (B, 1, Tmax). + align_start (paddle.Tensor): + frame level phone alignment start (B, Tmax2). + align_end (paddle.Tensor): + frame level phone alignment end (B, Tmax2). + align_start_lens (paddle.Tensor): + length of align_start (B, ). mlm_prob (float): mean_phn_span (int): - span_bdy (paddle.Tensor): masked mel boundary of input speech (B, 2). + span_bdy (paddle.Tensor): + masked mel boundary of input speech (B, 2). Returns: paddle.Tensor[bool]: masked position of input speech (B, Tmax). ''' @@ -416,19 +441,29 @@ def phones_text_masking(xs_pad: paddle.Tensor, span_bdy: paddle.Tensor=None): ''' Args: - xs_pad (paddle.Tensor): input speech (B, Tmax, D). - src_mask (paddle.Tensor): mask of speech (B, 1, Tmax). - text_pad (paddle.Tensor): input text (B, Tmax2). - text_mask (paddle.Tensor): mask of text (B, 1, Tmax2). - align_start (paddle.Tensor): frame level phone alignment start (B, Tmax2). - align_end (paddle.Tensor): frame level phone alignment end (B, Tmax2). - align_start_lens (paddle.Tensor): length of align_start (B, ). + xs_pad (paddle.Tensor): + input speech (B, Tmax, D). + src_mask (paddle.Tensor): + mask of speech (B, 1, Tmax). + text_pad (paddle.Tensor): + input text (B, Tmax2). + text_mask (paddle.Tensor): + mask of text (B, 1, Tmax2). + align_start (paddle.Tensor): + frame level phone alignment start (B, Tmax2). + align_end (paddle.Tensor): + frame level phone alignment end (B, Tmax2). + align_start_lens (paddle.Tensor): + length of align_start (B, ). 
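The masking and segmentation helpers documented above can be exercised with toy tensors; a minimal sketch (all sizes are arbitrary):

```python
import paddle
from paddlespeech.t2s.modules.nets_utils import get_random_segments, make_non_pad_mask

lengths = paddle.to_tensor([5, 3, 4], dtype='int64')
mask = make_non_pad_mask(lengths)        # bool (3, 5), True on non-padded positions

feats = paddle.randn([3, 192, 100])      # (B, C, T)
feat_lengths = paddle.to_tensor([100, 80, 90], dtype='int64')
segs, start_idxs = get_random_segments(feats, feat_lengths, segment_size=32)
# segs: (3, 192, 32), start_idxs: (3,)
```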
mlm_prob (float): mean_phn_span (int): - span_bdy (paddle.Tensor): masked mel boundary of input speech (B, 2). + span_bdy (paddle.Tensor): + masked mel boundary of input speech (B, 2). Returns: - paddle.Tensor[bool]: masked position of input speech (B, Tmax). - paddle.Tensor[bool]: masked position of input text (B, Tmax2). + paddle.Tensor[bool]: + masked position of input speech (B, Tmax). + paddle.Tensor[bool]: + masked position of input text (B, Tmax2). ''' bz, sent_len, _ = paddle.shape(xs_pad) masked_pos = paddle.zeros((bz, sent_len)) @@ -488,12 +523,18 @@ def get_seg_pos(speech_pad: paddle.Tensor, seg_emb: bool=False): ''' Args: - speech_pad (paddle.Tensor): input speech (B, Tmax, D). - text_pad (paddle.Tensor): input text (B, Tmax2). - align_start (paddle.Tensor): frame level phone alignment start (B, Tmax2). - align_end (paddle.Tensor): frame level phone alignment end (B, Tmax2). - align_start_lens (paddle.Tensor): length of align_start (B, ). - seg_emb (bool): whether to use segment embedding. + speech_pad (paddle.Tensor): + input speech (B, Tmax, D). + text_pad (paddle.Tensor): + input text (B, Tmax2). + align_start (paddle.Tensor): + frame level phone alignment start (B, Tmax2). + align_end (paddle.Tensor): + frame level phone alignment end (B, Tmax2). + align_start_lens (paddle.Tensor): + length of align_start (B, ). + seg_emb (bool): + whether to use segment embedding. Returns: paddle.Tensor[int]: n-th phone of each mel, 0<=n<=Tmax2 (B, Tmax). eg: @@ -579,8 +620,10 @@ def random_spans_noise_mask(length: int, def _random_seg(num_items, num_segs): """Partition a sequence of items randomly into non-empty segments. Args: - num_items: an integer scalar > 0 - num_segs: an integer scalar in [1, num_items] + num_items: + an integer scalar > 0 + num_segs: + an integer scalar in [1, num_items] Returns: a Tensor with shape [num_segs] containing positive integers that add up to num_items diff --git a/paddlespeech/t2s/modules/pqmf.py b/paddlespeech/t2s/modules/pqmf.py index 9860da906094ad930a7791ca527b44cc2a3e51d1..7b42409d8250ed3a2c7f6815f03193f081986243 100644 --- a/paddlespeech/t2s/modules/pqmf.py +++ b/paddlespeech/t2s/modules/pqmf.py @@ -26,9 +26,12 @@ def design_prototype_filter(taps=62, cutoff_ratio=0.142, beta=9.0): filters of cosine modulated filterbanks`_. Args: - taps (int): The number of filter taps. - cutoff_ratio (float): Cut-off frequency ratio. - beta (float): Beta coefficient for kaiser window. + taps (int): + The number of filter taps. + cutoff_ratio (float): + Cut-off frequency ratio. + beta (float): + Beta coefficient for kaiser window. Returns: ndarray: Impluse response of prototype filter (taps + 1,). @@ -66,10 +69,14 @@ class PQMF(nn.Layer): See dicussion in https://github.com/kan-bayashi/ParallelWaveGAN/issues/195. Args: - subbands (int): The number of subbands. - taps (int): The number of filter taps. - cutoff_ratio (float): Cut-off frequency ratio. - beta (float): Beta coefficient for kaiser window. + subbands (int): + The number of subbands. + taps (int): + The number of filter taps. + cutoff_ratio (float): + Cut-off frequency ratio. + beta (float): + Beta coefficient for kaiser window. """ super().__init__() @@ -103,7 +110,8 @@ class PQMF(nn.Layer): def analysis(self, x): """Analysis with PQMF. Args: - x (Tensor): Input tensor (B, 1, T). + x (Tensor): + Input tensor (B, 1, T). Returns: Tensor: Output tensor (B, subbands, T // subbands). """ @@ -113,7 +121,8 @@ class PQMF(nn.Layer): def synthesis(self, x): """Synthesis with PQMF. 
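The analysis/synthesis pair above forms a near-perfect-reconstruction filterbank; a minimal round-trip sketch with four subbands:

```python
import paddle
from paddlespeech.t2s.modules.pqmf import PQMF

pqmf = PQMF(subbands=4)            # taps / cutoff_ratio / beta keep their defaults
x = paddle.randn([1, 1, 8192])     # (B, 1, T); T should be divisible by subbands
subband = pqmf.analysis(x)         # (B, 4, T // 4)
recon = pqmf.synthesis(subband)    # (B, 1, T)
```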
Args: - x (Tensor): Input tensor (B, subbands, T // subbands). + x (Tensor): + Input tensor (B, subbands, T // subbands). Returns: Tensor: Output tensor (B, 1, T). """ diff --git a/paddlespeech/t2s/modules/predictor/duration_predictor.py b/paddlespeech/t2s/modules/predictor/duration_predictor.py index 33ed575b4245506438e439fff5d5b8a6ff1b238a..cb38fd5b4e37b7fa139fef335ce62130a280bf6c 100644 --- a/paddlespeech/t2s/modules/predictor/duration_predictor.py +++ b/paddlespeech/t2s/modules/predictor/duration_predictor.py @@ -50,12 +50,18 @@ class DurationPredictor(nn.Layer): """Initilize duration predictor module. Args: - idim (int):Input dimension. - n_layers (int, optional): Number of convolutional layers. - n_chans (int, optional): Number of channels of convolutional layers. - kernel_size (int, optional): Kernel size of convolutional layers. - dropout_rate (float, optional): Dropout rate. - offset (float, optional): Offset value to avoid nan in log domain. + idim (int): + Input dimension. + n_layers (int, optional): + Number of convolutional layers. + n_chans (int, optional): + Number of channels of convolutional layers. + kernel_size (int, optional): + Kernel size of convolutional layers. + dropout_rate (float, optional): + Dropout rate. + offset (float, optional): + Offset value to avoid nan in log domain. """ super().__init__() @@ -99,8 +105,10 @@ class DurationPredictor(nn.Layer): def forward(self, xs, x_masks=None): """Calculate forward propagation. Args: - xs(Tensor): Batch of input sequences (B, Tmax, idim). - x_masks(ByteTensor, optional, optional): Batch of masks indicating padded part (B, Tmax). (Default value = None) + xs(Tensor): + Batch of input sequences (B, Tmax, idim). + x_masks(ByteTensor, optional, optional): + Batch of masks indicating padded part (B, Tmax). (Default value = None) Returns: Tensor: Batch of predicted durations in log domain (B, Tmax). @@ -110,8 +118,10 @@ class DurationPredictor(nn.Layer): def inference(self, xs, x_masks=None): """Inference duration. Args: - xs(Tensor): Batch of input sequences (B, Tmax, idim). - x_masks(Tensor(bool), optional, optional): Batch of masks indicating padded part (B, Tmax). (Default value = None) + xs(Tensor): + Batch of input sequences (B, Tmax, idim). + x_masks(Tensor(bool), optional, optional): + Batch of masks indicating padded part (B, Tmax). (Default value = None) Returns: Tensor: Batch of predicted durations in linear domain int64 (B, Tmax). @@ -140,8 +150,10 @@ class DurationPredictorLoss(nn.Layer): """Calculate forward propagation. Args: - outputs(Tensor): Batch of prediction durations in log domain (B, T) - targets(Tensor): Batch of groundtruth durations in linear domain (B, T) + outputs(Tensor): + Batch of prediction durations in log domain (B, T) + targets(Tensor): + Batch of groundtruth durations in linear domain (B, T) Returns: Tensor: Mean squared error loss value. diff --git a/paddlespeech/t2s/modules/predictor/length_regulator.py b/paddlespeech/t2s/modules/predictor/length_regulator.py index e4fbf54916ed98948fffe8bf8325a312928efa57..bdfa18391c6bf6a9ed83ac9bb79c6567b3947dae 100644 --- a/paddlespeech/t2s/modules/predictor/length_regulator.py +++ b/paddlespeech/t2s/modules/predictor/length_regulator.py @@ -36,7 +36,8 @@ class LengthRegulator(nn.Layer): """Initilize length regulator module. Args: - pad_value (float, optional): Value used for padding. + pad_value (float, optional): + Value used for padding. """ super().__init__() @@ -97,9 +98,12 @@ class LengthRegulator(nn.Layer): """Calculate forward propagation. 
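The `DurationPredictor` docstrings above note that training compares durations in the log domain while inference returns linear-domain int64 values, with `offset` guarding against log(0) for zero-length phones. A small sketch of that round trip under those assumptions (illustrative, not the module's code):

```python
import numpy as np

offset = 1.0  # same role as the `offset` argument: avoid log(0) for zero durations

# training: ground-truth frame counts per phone are compared in log domain
d_target = np.array([3, 0, 7, 2], dtype=np.float32)
d_target_log = np.log(d_target + offset)

# inference: the network predicts log durations; map back and clamp at zero
d_pred_log = np.array([1.40, 0.05, 2.05, 1.10], dtype=np.float32)
d_pred = np.clip(np.round(np.exp(d_pred_log) - offset), 0, None).astype(np.int64)
print(d_pred)  # [3 0 7 2]
```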
Args: - xs (Tensor): Batch of sequences of char or phoneme embeddings (B, Tmax, D). - ds (Tensor(int64)): Batch of durations of each frame (B, T). - alpha (float, optional): Alpha value to control speed of speech. + xs (Tensor): + Batch of sequences of char or phoneme embeddings (B, Tmax, D). + ds (Tensor(int64)): + Batch of durations of each frame (B, T). + alpha (float, optional): + Alpha value to control speed of speech. Returns: Tensor: replicated input tensor based on durations (B, T*, D). diff --git a/paddlespeech/t2s/modules/predictor/variance_predictor.py b/paddlespeech/t2s/modules/predictor/variance_predictor.py index 8afbf2576d158c9df7a56800f7fdea386bb0ae2b..4c2a67cc4ecd60e77210677e87042bf6d3a554c8 100644 --- a/paddlespeech/t2s/modules/predictor/variance_predictor.py +++ b/paddlespeech/t2s/modules/predictor/variance_predictor.py @@ -43,11 +43,16 @@ class VariancePredictor(nn.Layer): """Initilize duration predictor module. Args: - idim (int): Input dimension. - n_layers (int, optional): Number of convolutional layers. - n_chans (int, optional): Number of channels of convolutional layers. - kernel_size (int, optional): Kernel size of convolutional layers. - dropout_rate (float, optional): Dropout rate. + idim (int): + Input dimension. + n_layers (int, optional): + Number of convolutional layers. + n_chans (int, optional): + Number of channels of convolutional layers. + kernel_size (int, optional): + Kernel size of convolutional layers. + dropout_rate (float, optional): + Dropout rate. """ assert check_argument_types() super().__init__() @@ -74,11 +79,14 @@ class VariancePredictor(nn.Layer): """Calculate forward propagation. Args: - xs (Tensor): Batch of input sequences (B, Tmax, idim). - x_masks (Tensor(bool), optional): Batch of masks indicating padded part (B, Tmax, 1). + xs (Tensor): + Batch of input sequences (B, Tmax, idim). + x_masks (Tensor(bool), optional): + Batch of masks indicating padded part (B, Tmax, 1). Returns: - Tensor: Batch of predicted sequences (B, Tmax, 1). + Tensor: + Batch of predicted sequences (B, Tmax, 1). """ # (B, idim, Tmax) xs = xs.transpose([0, 2, 1]) diff --git a/paddlespeech/t2s/modules/residual_block.py b/paddlespeech/t2s/modules/residual_block.py index 5965a72032720b69ca494fe9ee42a8c7bae17c63..f21eedecb5f3a546ea50942c02394d6fc9e21a0d 100644 --- a/paddlespeech/t2s/modules/residual_block.py +++ b/paddlespeech/t2s/modules/residual_block.py @@ -29,15 +29,24 @@ class WaveNetResidualBlock(nn.Layer): refer to `WaveNet: A Generative Model for Raw Audio `_. Args: - kernel_size (int, optional): Kernel size of the 1D convolution, by default 3 - residual_channels (int, optional): Feature size of the residual output(and also the input), by default 64 - gate_channels (int, optional): Output feature size of the 1D convolution, by default 128 - skip_channels (int, optional): Feature size of the skip output, by default 64 - aux_channels (int, optional): Feature size of the auxiliary input (e.g. spectrogram), by default 80 - dropout (float, optional): Probability of the dropout before the 1D convolution, by default 0. 
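The `LengthRegulator.forward` docstring above describes expanding phoneme-level embeddings to frame level by repeating each embedding according to its predicted duration, with `alpha` rescaling durations to control speaking rate. A single-sequence numpy sketch of that expansion (hedged: the real module batches sequences and pads with `pad_value`):

```python
import numpy as np

def length_regulate(xs, ds, alpha=1.0):
    """xs: (T_phone, D) embeddings, ds: (T_phone,) integer durations."""
    if alpha != 1.0:
        ds = np.round(ds.astype(np.float32) * alpha).astype(np.int64)
    # repeat each phoneme embedding ds[i] times along the time axis
    return np.repeat(xs, ds, axis=0)            # (sum(ds), D)

xs = np.arange(6, dtype=np.float32).reshape(3, 2)   # 3 phones, dim 2
ds = np.array([2, 0, 3])                            # frames per phone
print(length_regulate(xs, ds).shape)                # (5, 2)
```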
- dilation (int, optional): Dilation of the 1D convolution, by default 1 - bias (bool, optional): Whether to use bias in the 1D convolution, by default True - use_causal_conv (bool, optional): Whether to use causal padding for the 1D convolution, by default False + kernel_size (int, optional): + Kernel size of the 1D convolution, by default 3 + residual_channels (int, optional): + Feature size of the residual output(and also the input), by default 64 + gate_channels (int, optional): + Output feature size of the 1D convolution, by default 128 + skip_channels (int, optional): + Feature size of the skip output, by default 64 + aux_channels (int, optional): + Feature size of the auxiliary input (e.g. spectrogram), by default 80 + dropout (float, optional): + Probability of the dropout before the 1D convolution, by default 0. + dilation (int, optional): + Dilation of the 1D convolution, by default 1 + bias (bool, optional): + Whether to use bias in the 1D convolution, by default True + use_causal_conv (bool, optional): + Whether to use causal padding for the 1D convolution, by default False """ def __init__(self, @@ -81,13 +90,17 @@ class WaveNetResidualBlock(nn.Layer): def forward(self, x, c): """ Args: - x (Tensor): the input features. Shape (N, C_res, T) - c (Tensor): the auxiliary input. Shape (N, C_aux, T) + x (Tensor): + the input features. Shape (N, C_res, T) + c (Tensor): + the auxiliary input. Shape (N, C_aux, T) Returns: - res (Tensor): Shape (N, C_res, T), the residual output, which is used as the + res (Tensor): + Shape (N, C_res, T), the residual output, which is used as the input of the next ResidualBlock in a stack of ResidualBlocks. - skip (Tensor): Shape (N, C_skip, T), the skip output, which is collected among + skip (Tensor): + Shape (N, C_skip, T), the skip output, which is collected among each layer in a stack of ResidualBlocks. """ x_input = x @@ -121,13 +134,20 @@ class HiFiGANResidualBlock(nn.Layer): ): """Initialize HiFiGANResidualBlock module. Args: - kernel_size (int): Kernel size of dilation convolution layer. - channels (int): Number of channels for convolution layer. - dilations (List[int]): List of dilation factors. - use_additional_convs (bool): Whether to use additional convolution layers. - bias (bool): Whether to add bias parameter in convolution layers. - nonlinear_activation (str): Activation function module name. - nonlinear_activation_params (dict): Hyperparameters for activation function. + kernel_size (int): + Kernel size of dilation convolution layer. + channels (int): + Number of channels for convolution layer. + dilations (List[int]): + List of dilation factors. + use_additional_convs (bool): + Whether to use additional convolution layers. + bias (bool): + Whether to add bias parameter in convolution layers. + nonlinear_activation (str): + Activation function module name. + nonlinear_activation_params (dict): + Hyperparameters for activation function. """ super().__init__() @@ -167,7 +187,8 @@ class HiFiGANResidualBlock(nn.Layer): def forward(self, x): """Calculate forward propagation. Args: - x (Tensor): Input tensor (B, channels, T). + x (Tensor): + Input tensor (B, channels, T). Returns: Tensor: Output tensor (B, channels, T). 
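The `WaveNetResidualBlock` arguments above (residual/gate/skip/aux channels) revolve around the gated activation unit: the dilated-conv output and the projected auxiliary features are split in half channel-wise and combined as tanh(a) * sigmoid(b). A shape-level sketch of just that gating, assuming the conditioning has already been projected to `gate_channels`; the residual and skip outputs then come from 1x1 convolutions on this gated tensor.

```python
import numpy as np

def gated_activation(conv_out, cond):
    """conv_out, cond: (N, gate_channels, T) -> (N, gate_channels // 2, T)."""
    a, b = np.split(conv_out + cond, 2, axis=1)       # split channels in half
    return np.tanh(a) * (1.0 / (1.0 + np.exp(-b)))    # tanh gate * sigmoid gate

z = gated_activation(np.random.randn(1, 128, 10), np.random.randn(1, 128, 10))
print(z.shape)  # (1, 64, 10): gate_channels=128 -> residual/skip width 64
```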
""" diff --git a/paddlespeech/t2s/modules/residual_stack.py b/paddlespeech/t2s/modules/residual_stack.py index 0d949b5635329819a613a748e34015964d2fed5c..98f5db3cf6820f3571d97a35b7b6eb8da6b1bc5f 100644 --- a/paddlespeech/t2s/modules/residual_stack.py +++ b/paddlespeech/t2s/modules/residual_stack.py @@ -39,15 +39,24 @@ class ResidualStack(nn.Layer): """Initialize ResidualStack module. Args: - kernel_size (int): Kernel size of dilation convolution layer. - channels (int): Number of channels of convolution layers. - dilation (int): Dilation factor. - bias (bool): Whether to add bias parameter in convolution layers. - nonlinear_activation (str): Activation function module name. - nonlinear_activation_params (Dict[str,Any]): Hyperparameters for activation function. - pad (str): Padding function module name before dilated convolution layer. - pad_params (Dict[str, Any]): Hyperparameters for padding function. - use_causal_conv (bool): Whether to use causal convolution. + kernel_size (int): + Kernel size of dilation convolution layer. + channels (int): + Number of channels of convolution layers. + dilation (int): + Dilation factor. + bias (bool): + Whether to add bias parameter in convolution layers. + nonlinear_activation (str): + Activation function module name. + nonlinear_activation_params (Dict[str,Any]): + Hyperparameters for activation function. + pad (str): + Padding function module name before dilated convolution layer. + pad_params (Dict[str, Any]): + Hyperparameters for padding function. + use_causal_conv (bool): + Whether to use causal convolution. """ super().__init__() # for compatibility @@ -95,7 +104,8 @@ class ResidualStack(nn.Layer): """Calculate forward propagation. Args: - c (Tensor): Input tensor (B, channels, T). + c (Tensor): + Input tensor (B, channels, T). Returns: Tensor: Output tensor (B, chennels, T). """ diff --git a/paddlespeech/t2s/modules/style_encoder.py b/paddlespeech/t2s/modules/style_encoder.py index 49091eac8215898d1428b937a353adb037f774c6..b558e7693aeee59c6351f78c1a47fd9ad4934249 100644 --- a/paddlespeech/t2s/modules/style_encoder.py +++ b/paddlespeech/t2s/modules/style_encoder.py @@ -32,16 +32,26 @@ class StyleEncoder(nn.Layer): Speech Synthesis`: https://arxiv.org/abs/1803.09017 Args: - idim (int, optional): Dimension of the input mel-spectrogram. - gst_tokens (int, optional): The number of GST embeddings. - gst_token_dim (int, optional): Dimension of each GST embedding. - gst_heads (int, optional): The number of heads in GST multihead attention. - conv_layers (int, optional): The number of conv layers in the reference encoder. - conv_chans_list (Sequence[int], optional): List of the number of channels of conv layers in the referece encoder. - conv_kernel_size (int, optional): Kernal size of conv layers in the reference encoder. - conv_stride (int, optional): Stride size of conv layers in the reference encoder. - gru_layers (int, optional): The number of GRU layers in the reference encoder. - gru_units (int, optional):The number of GRU units in the reference encoder. + idim (int, optional): + Dimension of the input mel-spectrogram. + gst_tokens (int, optional): + The number of GST embeddings. + gst_token_dim (int, optional): + Dimension of each GST embedding. + gst_heads (int, optional): + The number of heads in GST multihead attention. + conv_layers (int, optional): + The number of conv layers in the reference encoder. + conv_chans_list (Sequence[int], optional): + List of the number of channels of conv layers in the referece encoder. 
+ conv_kernel_size (int, optional): + Kernal size of conv layers in the reference encoder. + conv_stride (int, optional): + Stride size of conv layers in the reference encoder. + gru_layers (int, optional): + The number of GRU layers in the reference encoder. + gru_units (int, optional): + The number of GRU units in the reference encoder. Todo: * Support manual weight specification in inference. @@ -82,7 +92,8 @@ class StyleEncoder(nn.Layer): """Calculate forward propagation. Args: - speech (Tensor): Batch of padded target features (B, Lmax, odim). + speech (Tensor): + Batch of padded target features (B, Lmax, odim). Returns: Tensor: Style token embeddings (B, token_dim). @@ -104,13 +115,20 @@ class ReferenceEncoder(nn.Layer): Speech Synthesis`: https://arxiv.org/abs/1803.09017 Args: - idim (int, optional): Dimension of the input mel-spectrogram. - conv_layers (int, optional): The number of conv layers in the reference encoder. - conv_chans_list: (Sequence[int], optional): List of the number of channels of conv layers in the referece encoder. - conv_kernel_size (int, optional): Kernal size of conv layers in the reference encoder. - conv_stride (int, optional): Stride size of conv layers in the reference encoder. - gru_layers (int, optional): The number of GRU layers in the reference encoder. - gru_units (int, optional): The number of GRU units in the reference encoder. + idim (int, optional): + Dimension of the input mel-spectrogram. + conv_layers (int, optional): + The number of conv layers in the reference encoder. + conv_chans_list: (Sequence[int], optional): + List of the number of channels of conv layers in the referece encoder. + conv_kernel_size (int, optional): + Kernal size of conv layers in the reference encoder. + conv_stride (int, optional): + Stride size of conv layers in the reference encoder. + gru_layers (int, optional): + The number of GRU layers in the reference encoder. + gru_units (int, optional): + The number of GRU units in the reference encoder. """ @@ -168,7 +186,8 @@ class ReferenceEncoder(nn.Layer): def forward(self, speech: paddle.Tensor) -> paddle.Tensor: """Calculate forward propagation. Args: - speech (Tensor): Batch of padded target features (B, Lmax, idim). + speech (Tensor): + Batch of padded target features (B, Lmax, idim). Returns: Tensor: Reference embedding (B, gru_units) @@ -200,11 +219,16 @@ class StyleTokenLayer(nn.Layer): .. _`Style Tokens: Unsupervised Style Modeling, Control and Transfer in End-to-End Speech Synthesis`: https://arxiv.org/abs/1803.09017 Args: - ref_embed_dim (int, optional): Dimension of the input reference embedding. - gst_tokens (int, optional): The number of GST embeddings. - gst_token_dim (int, optional): Dimension of each GST embedding. - gst_heads (int, optional): The number of heads in GST multihead attention. - dropout_rate (float, optional): Dropout rate in multi-head attention. + ref_embed_dim (int, optional): + Dimension of the input reference embedding. + gst_tokens (int, optional): + The number of GST embeddings. + gst_token_dim (int, optional): + Dimension of each GST embedding. + gst_heads (int, optional): + The number of heads in GST multihead attention. + dropout_rate (float, optional): + Dropout rate in multi-head attention. """ @@ -236,7 +260,8 @@ class StyleTokenLayer(nn.Layer): """Calculate forward propagation. Args: - ref_embs (Tensor): Reference embeddings (B, ref_embed_dim). + ref_embs (Tensor): + Reference embeddings (B, ref_embed_dim). Returns: Tensor: Style token embeddings (B, gst_token_dim). 
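The GST pieces documented above fit together as follows: the reference encoder compresses a padded mel spectrogram (B, Lmax, idim) into one embedding (B, gru_units), and the style token layer attends over a small bank of learned token embeddings to produce the style embedding (B, gst_token_dim). A single-head, shape-level sketch of that token attention; the projection `W_q` and the dimensions are placeholders, and the actual `StyleTokenLayer` uses multi-head attention.

```python
import numpy as np

def softmax(x, axis=-1):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

B, ref_dim, n_tokens, token_dim = 2, 128, 10, 256
ref_embs = np.random.randn(B, ref_dim)               # reference encoder output (B, gru_units)
gst_tokens = np.random.randn(n_tokens, token_dim)    # learned style token bank

W_q = np.random.randn(ref_dim, token_dim)            # query projection (placeholder)
scores = softmax(ref_embs @ W_q @ gst_tokens.T / np.sqrt(token_dim))  # (B, n_tokens)
style_embs = scores @ gst_tokens                                      # (B, token_dim)
print(style_embs.shape)  # (2, 256)
```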
diff --git a/paddlespeech/t2s/modules/tacotron2/attentions.py b/paddlespeech/t2s/modules/tacotron2/attentions.py index a6fde742d98f90d4db06f734e5f7f4508848d989..cdaef4608a09a283054d8222d44f69ffe2d7c048 100644 --- a/paddlespeech/t2s/modules/tacotron2/attentions.py +++ b/paddlespeech/t2s/modules/tacotron2/attentions.py @@ -31,10 +31,14 @@ def _apply_attention_constraint(e, Text-to-Speech with Convolutional Sequence Learning`_. Args: - e(Tensor): Attention energy before applying softmax (1, T). - last_attended_idx(int): The index of the inputs of the last attended [0, T]. - backward_window(int, optional, optional): Backward window size in attention constraint. (Default value = 1) - forward_window(int, optional, optional): Forward window size in attetion constraint. (Default value = 3) + e(Tensor): + Attention energy before applying softmax (1, T). + last_attended_idx(int): + The index of the inputs of the last attended [0, T]. + backward_window(int, optional, optional): + Backward window size in attention constraint. (Default value = 1) + forward_window(int, optional, optional): + Forward window size in attetion constraint. (Default value = 3) Returns: Tensor: Monotonic constrained attention energy (1, T). @@ -62,12 +66,18 @@ class AttLoc(nn.Layer): (https://arxiv.org/pdf/1506.07503.pdf) Args: - eprojs (int): projection-units of encoder - dunits (int): units of decoder - att_dim (int): attention dimension - aconv_chans (int): channels of attention convolution - aconv_filts (int): filter size of attention convolution - han_mode (bool): flag to swith on mode of hierarchical attention and not store pre_compute_enc_h + eprojs (int): + projection-units of encoder + dunits (int): + units of decoder + att_dim (int): + attention dimension + aconv_chans (int): + channels of attention convolution + aconv_filts (int): + filter size of attention convolution + han_mode (bool): + flag to swith on mode of hierarchical attention and not store pre_compute_enc_h """ def __init__(self, @@ -117,18 +127,29 @@ class AttLoc(nn.Layer): forward_window=3, ): """Calculate AttLoc forward propagation. 
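The `_apply_attention_constraint` docstring above describes the monotonic window from Deep Voice 3: energies outside [last_attended_idx - backward_window, last_attended_idx + forward_window] are suppressed before the softmax, so the alignment can only advance a few input steps per decoder frame. A numpy sketch of that masking (illustrative; the real function works on paddle Tensors):

```python
import numpy as np

def apply_attention_constraint(e, last_attended_idx, backward_window=1, forward_window=3):
    """e: (1, T) attention energies before softmax; returns constrained energies."""
    T = e.shape[1]
    lo = max(last_attended_idx - backward_window, 0)
    hi = min(last_attended_idx + forward_window, T - 1)
    out = np.full_like(e, -np.inf)      # everything outside the window is suppressed
    out[:, lo:hi + 1] = e[:, lo:hi + 1]
    return out

e = np.random.randn(1, 8)
print(apply_attention_constraint(e, last_attended_idx=4))  # only columns 3..7 stay finite
```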
Args: - enc_hs_pad(Tensor): padded encoder hidden state (B, T_max, D_enc) - enc_hs_len(Tensor): padded encoder hidden state length (B) - dec_z(Tensor dec_z): decoder hidden state (B, D_dec) - att_prev(Tensor): previous attention weight (B, T_max) - scaling(float, optional): scaling parameter before applying softmax (Default value = 2.0) - forward_window(Tensor, optional): forward window size when constraining attention (Default value = 3) - last_attended_idx(int, optional): index of the inputs of the last attended (Default value = None) - backward_window(int, optional): backward window size in attention constraint (Default value = 1) - forward_window(int, optional): forward window size in attetion constraint (Default value = 3) + enc_hs_pad(Tensor): + padded encoder hidden state (B, T_max, D_enc) + enc_hs_len(Tensor): + padded encoder hidden state length (B) + dec_z(Tensor dec_z): + decoder hidden state (B, D_dec) + att_prev(Tensor): + previous attention weight (B, T_max) + scaling(float, optional): + scaling parameter before applying softmax (Default value = 2.0) + forward_window(Tensor, optional): + forward window size when constraining attention (Default value = 3) + last_attended_idx(int, optional): + index of the inputs of the last attended (Default value = None) + backward_window(int, optional): + backward window size in attention constraint (Default value = 1) + forward_window(int, optional): + forward window size in attetion constraint (Default value = 3) Returns: - Tensor: attention weighted encoder state (B, D_enc) - Tensor: previous attention weights (B, T_max) + Tensor: + attention weighted encoder state (B, D_enc) + Tensor: + previous attention weights (B, T_max) """ batch = paddle.shape(enc_hs_pad)[0] # pre-compute all h outside the decoder loop @@ -192,11 +213,16 @@ class AttForward(nn.Layer): (https://arxiv.org/pdf/1807.06736.pdf) Args: - eprojs (int): projection-units of encoder - dunits (int): units of decoder - att_dim (int): attention dimension - aconv_chans (int): channels of attention convolution - aconv_filts (int): filter size of attention convolution + eprojs (int): + projection-units of encoder + dunits (int): + units of decoder + att_dim (int): + attention dimension + aconv_chans (int): + channels of attention convolution + aconv_filts (int): + filter size of attention convolution """ def __init__(self, eprojs, dunits, att_dim, aconv_chans, aconv_filts): @@ -239,18 +265,28 @@ class AttForward(nn.Layer): """Calculate AttForward forward propagation. 
Args: - enc_hs_pad(Tensor): padded encoder hidden state (B, T_max, D_enc) - enc_hs_len(list): padded encoder hidden state length (B,) - dec_z(Tensor): decoder hidden state (B, D_dec) - att_prev(Tensor): attention weights of previous step (B, T_max) - scaling(float, optional): scaling parameter before applying softmax (Default value = 1.0) - last_attended_idx(int, optional): index of the inputs of the last attended (Default value = None) - backward_window(int, optional): backward window size in attention constraint (Default value = 1) - forward_window(int, optional): (Default value = 3) + enc_hs_pad(Tensor): + padded encoder hidden state (B, T_max, D_enc) + enc_hs_len(list): + padded encoder hidden state length (B,) + dec_z(Tensor): + decoder hidden state (B, D_dec) + att_prev(Tensor): + attention weights of previous step (B, T_max) + scaling(float, optional): + scaling parameter before applying softmax (Default value = 1.0) + last_attended_idx(int, optional): + index of the inputs of the last attended (Default value = None) + backward_window(int, optional): + backward window size in attention constraint (Default value = 1) + forward_window(int, optional): + (Default value = 3) Returns: - Tensor: attention weighted encoder state (B, D_enc) - Tensor: previous attention weights (B, T_max) + Tensor: + attention weighted encoder state (B, D_enc) + Tensor: + previous attention weights (B, T_max) """ batch = len(enc_hs_pad) # pre-compute all h outside the decoder loop @@ -321,12 +357,18 @@ class AttForwardTA(nn.Layer): (https://arxiv.org/pdf/1807.06736.pdf) Args: - eunits (int): units of encoder - dunits (int): units of decoder - att_dim (int): attention dimension - aconv_chans (int): channels of attention convolution - aconv_filts (int): filter size of attention convolution - odim (int): output dimension + eunits (int): + units of encoder + dunits (int): + units of decoder + att_dim (int): + attention dimension + aconv_chans (int): + channels of attention convolution + aconv_filts (int): + filter size of attention convolution + odim (int): + output dimension """ def __init__(self, eunits, dunits, att_dim, aconv_chans, aconv_filts, odim): @@ -372,19 +414,30 @@ class AttForwardTA(nn.Layer): """Calculate AttForwardTA forward propagation. 
Args: - enc_hs_pad(Tensor): padded encoder hidden state (B, Tmax, eunits) - enc_hs_len(list Tensor): padded encoder hidden state length (B,) - dec_z(Tensor): decoder hidden state (B, dunits) - att_prev(Tensor): attention weights of previous step (B, T_max) - out_prev(Tensor): decoder outputs of previous step (B, odim) - scaling(float, optional): scaling parameter before applying softmax (Default value = 1.0) - last_attended_idx(int, optional): index of the inputs of the last attended (Default value = None) - backward_window(int, optional): backward window size in attention constraint (Default value = 1) - forward_window(int, optional): (Default value = 3) + enc_hs_pad(Tensor): + padded encoder hidden state (B, Tmax, eunits) + enc_hs_len(list Tensor): + padded encoder hidden state length (B,) + dec_z(Tensor): + decoder hidden state (B, dunits) + att_prev(Tensor): + attention weights of previous step (B, T_max) + out_prev(Tensor): + decoder outputs of previous step (B, odim) + scaling(float, optional): + scaling parameter before applying softmax (Default value = 1.0) + last_attended_idx(int, optional): + index of the inputs of the last attended (Default value = None) + backward_window(int, optional): + backward window size in attention constraint (Default value = 1) + forward_window(int, optional): + (Default value = 3) Returns: - Tensor: attention weighted encoder state (B, dunits) - Tensor: previous attention weights (B, Tmax) + Tensor: + attention weighted encoder state (B, dunits) + Tensor: + previous attention weights (B, Tmax) """ batch = len(enc_hs_pad) # pre-compute all h outside the decoder loop diff --git a/paddlespeech/t2s/modules/tacotron2/decoder.py b/paddlespeech/t2s/modules/tacotron2/decoder.py index ebdfa387989828eb4c92df8a1d6bbf215a50b775..41c94b63f3fadbcff0397144e0a395058738f69d 100644 --- a/paddlespeech/t2s/modules/tacotron2/decoder.py +++ b/paddlespeech/t2s/modules/tacotron2/decoder.py @@ -45,10 +45,14 @@ class Prenet(nn.Layer): """Initialize prenet module. Args: - idim (int): Dimension of the inputs. - odim (int): Dimension of the outputs. - n_layers (int, optional): The number of prenet layers. - n_units (int, optional): The number of prenet units. + idim (int): + Dimension of the inputs. + odim (int): + Dimension of the outputs. + n_layers (int, optional): + The number of prenet layers. + n_units (int, optional): + The number of prenet units. """ super().__init__() self.dropout_rate = dropout_rate @@ -62,7 +66,8 @@ class Prenet(nn.Layer): """Calculate forward propagation. Args: - x (Tensor): Batch of input tensors (B, ..., idim). + x (Tensor): + Batch of input tensors (B, ..., idim). Returns: Tensor: Batch of output tensors (B, ..., odim). @@ -212,7 +217,8 @@ class ZoneOutCell(nn.Layer): """Calculate forward propagation. Args: - inputs (Tensor): Batch of input tensor (B, input_size). + inputs (Tensor): + Batch of input tensor (B, input_size). hidden (tuple): - Tensor: Batch of initial hidden states (B, hidden_size). - Tensor: Batch of initial cell states (B, hidden_size). @@ -277,26 +283,39 @@ class Decoder(nn.Layer): """Initialize Tacotron2 decoder module. Args: - idim (int): Dimension of the inputs. - odim (int): Dimension of the outputs. - att (nn.Layer): Instance of attention class. - dlayers (int, optional): The number of decoder lstm layers. - dunits (int, optional): The number of decoder lstm units. - prenet_layers (int, optional): The number of prenet layers. - prenet_units (int, optional): The number of prenet units. 
- postnet_layers (int, optional): The number of postnet layers. - postnet_filts (int, optional): The number of postnet filter size. - postnet_chans (int, optional): The number of postnet filter channels. - output_activation_fn (nn.Layer, optional): Activation function for outputs. - cumulate_att_w (bool, optional): Whether to cumulate previous attention weight. - use_batch_norm (bool, optional): Whether to use batch normalization. - use_concate : bool, optional + idim (int): + Dimension of the inputs. + odim (int): + Dimension of the outputs. + att (nn.Layer): + Instance of attention class. + dlayers (int, optional): + The number of decoder lstm layers. + dunits (int, optional): + The number of decoder lstm units. + prenet_layers (int, optional): + The number of prenet layers. + prenet_units (int, optional): + The number of prenet units. + postnet_layers (int, optional): + The number of postnet layers. + postnet_filts (int, optional): + The number of postnet filter size. + postnet_chans (int, optional): + The number of postnet filter channels. + output_activation_fn (nn.Layer, optional): + Activation function for outputs. + cumulate_att_w (bool, optional): + Whether to cumulate previous attention weight. + use_batch_norm (bool, optional): + Whether to use batch normalization. + use_concate (bool, optional): Whether to concatenate encoder embedding with decoder lstm outputs. - dropout_rate : float, optional + dropout_rate (float, optional): Dropout rate. - zoneout_rate : float, optional + zoneout_rate (float, optional): Zoneout rate. - reduction_factor : int, optional + reduction_factor (int, optional): Reduction factor. """ super().__init__() @@ -363,15 +382,22 @@ class Decoder(nn.Layer): """Calculate forward propagation. Args: - hs (Tensor): Batch of the sequences of padded hidden states (B, Tmax, idim). - hlens (Tensor(int64) padded): Batch of lengths of each input batch (B,). - ys (Tensor): Batch of the sequences of padded target features (B, Lmax, odim). + hs (Tensor): + Batch of the sequences of padded hidden states (B, Tmax, idim). + hlens (Tensor(int64) padded): + Batch of lengths of each input batch (B,). + ys (Tensor): + Batch of the sequences of padded target features (B, Lmax, odim). Returns: - Tensor: Batch of output tensors after postnet (B, Lmax, odim). - Tensor: Batch of output tensors before postnet (B, Lmax, odim). - Tensor: Batch of logits of stop prediction (B, Lmax). - Tensor: Batch of attention weights (B, Lmax, Tmax). + Tensor: + Batch of output tensors after postnet (B, Lmax, odim). + Tensor: + Batch of output tensors before postnet (B, Lmax, odim). + Tensor: + Batch of logits of stop prediction (B, Lmax). + Tensor: + Batch of attention weights (B, Lmax, Tmax). Note: This computation is performed in teacher-forcing manner. @@ -471,20 +497,30 @@ class Decoder(nn.Layer): forward_window=None, ): """Generate the sequence of features given the sequences of characters. Args: - h(Tensor): Input sequence of encoder hidden states (T, C). - threshold(float, optional, optional): Threshold to stop generation. (Default value = 0.5) - minlenratio(float, optional, optional): Minimum length ratio. If set to 1.0 and the length of input is 10, + h(Tensor): + Input sequence of encoder hidden states (T, C). + threshold(float, optional, optional): + Threshold to stop generation. (Default value = 0.5) + minlenratio(float, optional, optional): + Minimum length ratio. If set to 1.0 and the length of input is 10, the minimum length of outputs will be 10 * 1 = 10. 
(Default value = 0.0) - maxlenratio(float, optional, optional): Minimum length ratio. If set to 10 and the length of input is 10, + maxlenratio(float, optional, optional): + Minimum length ratio. If set to 10 and the length of input is 10, the maximum length of outputs will be 10 * 10 = 100. (Default value = 0.0) - use_att_constraint(bool, optional): Whether to apply attention constraint introduced in `Deep Voice 3`_. (Default value = False) - backward_window(int, optional): Backward window size in attention constraint. (Default value = None) - forward_window(int, optional): (Default value = None) + use_att_constraint(bool, optional): + Whether to apply attention constraint introduced in `Deep Voice 3`_. (Default value = False) + backward_window(int, optional): + Backward window size in attention constraint. (Default value = None) + forward_window(int, optional): + (Default value = None) Returns: - Tensor: Output sequence of features (L, odim). - Tensor: Output sequence of stop probabilities (L,). - Tensor: Attention weights (L, T). + Tensor: + Output sequence of features (L, odim). + Tensor: + Output sequence of stop probabilities (L,). + Tensor: + Attention weights (L, T). Note: This computation is performed in auto-regressive manner. @@ -625,9 +661,12 @@ class Decoder(nn.Layer): """Calculate all of the attention weights. Args: - hs (Tensor): Batch of the sequences of padded hidden states (B, Tmax, idim). - hlens (Tensor(int64)): Batch of lengths of each input batch (B,). - ys (Tensor): Batch of the sequences of padded target features (B, Lmax, odim). + hs (Tensor): + Batch of the sequences of padded hidden states (B, Tmax, idim). + hlens (Tensor(int64)): + Batch of lengths of each input batch (B,). + ys (Tensor): + Batch of the sequences of padded target features (B, Lmax, odim). Returns: numpy.ndarray: diff --git a/paddlespeech/t2s/modules/tacotron2/encoder.py b/paddlespeech/t2s/modules/tacotron2/encoder.py index db102a115a067a0c9872cf0bebceb355711da482..224c82400d2b2815ee5669d225c720633651b5ab 100644 --- a/paddlespeech/t2s/modules/tacotron2/encoder.py +++ b/paddlespeech/t2s/modules/tacotron2/encoder.py @@ -46,17 +46,28 @@ class Encoder(nn.Layer): padding_idx=0, ): """Initialize Tacotron2 encoder module. Args: - idim (int): Dimension of the inputs. - input_layer (str): Input layer type. - embed_dim (int, optional): Dimension of character embedding. - elayers (int, optional): The number of encoder blstm layers. - eunits (int, optional): The number of encoder blstm units. - econv_layers (int, optional): The number of encoder conv layers. - econv_filts (int, optional): The number of encoder conv filter size. - econv_chans (int, optional): The number of encoder conv filter channels. - use_batch_norm (bool, optional): Whether to use batch normalization. - use_residual (bool, optional): Whether to use residual connection. - dropout_rate (float, optional): Dropout rate. + idim (int): + Dimension of the inputs. + input_layer (str): + Input layer type. + embed_dim (int, optional): + Dimension of character embedding. + elayers (int, optional): + The number of encoder blstm layers. + eunits (int, optional): + The number of encoder blstm units. + econv_layers (int, optional): + The number of encoder conv layers. + econv_filts (int, optional): + The number of encoder conv filter size. + econv_chans (int, optional): + The number of encoder conv filter channels. + use_batch_norm (bool, optional): + Whether to use batch normalization. 
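The Tacotron2 `Decoder.inference` docstring above describes an auto-regressive loop bounded by `threshold`, `minlenratio`, and `maxlenratio` (e.g. maxlenratio=10 with 10 input steps caps the output at 100 frames). A control-flow-only sketch of that loop; `step_fn` is a stand-in for one decoder step and not part of the paddlespeech API.

```python
import numpy as np

def autoregressive_decode(step_fn, T_in, threshold=0.5, minlenratio=0.0, maxlenratio=10.0):
    """step_fn(prev_frame, t) -> (frame, stop_prob); illustrative control flow only."""
    minlen, maxlen = int(T_in * minlenratio), int(T_in * maxlenratio)
    outs, prev = [], None
    for t in range(maxlen):
        frame, stop_prob = step_fn(prev, t)
        outs.append(frame)
        prev = frame
        if t + 1 >= minlen and stop_prob >= threshold:
            break                      # stop token fired after the minimum length
    return np.stack(outs)              # (L, odim)

dummy = lambda prev, t: (np.zeros(80), 1.0 if t >= 19 else 0.0)
print(autoregressive_decode(dummy, T_in=10).shape)  # (20, 80)
```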
+ use_residual (bool, optional): + Whether to use residual connection. + dropout_rate (float, optional): + Dropout rate. """ super().__init__() @@ -127,14 +138,18 @@ class Encoder(nn.Layer): """Calculate forward propagation. Args: - xs (Tensor): Batch of the padded sequence. Either character ids (B, Tmax) + xs (Tensor): + Batch of the padded sequence. Either character ids (B, Tmax) or acoustic feature (B, Tmax, idim * encoder_reduction_factor). Padded value should be 0. - ilens (Tensor(int64)): Batch of lengths of each input batch (B,). + ilens (Tensor(int64)): + Batch of lengths of each input batch (B,). Returns: - Tensor: Batch of the sequences of encoder states(B, Tmax, eunits). - Tensor(int64): Batch of lengths of each sequence (B,) + Tensor: + Batch of the sequences of encoder states(B, Tmax, eunits). + Tensor(int64): + Batch of lengths of each sequence (B,) """ xs = self.embed(xs).transpose([0, 2, 1]) if self.convs is not None: @@ -161,8 +176,8 @@ class Encoder(nn.Layer): """Inference. Args: - x (Tensor): The sequeunce of character ids (T,) - or acoustic feature (T, idim * encoder_reduction_factor). + x (Tensor): + The sequeunce of character ids (T,) or acoustic feature (T, idim * encoder_reduction_factor). Returns: Tensor: The sequences of encoder states(T, eunits). diff --git a/paddlespeech/t2s/modules/tade_res_block.py b/paddlespeech/t2s/modules/tade_res_block.py index b2275e2361405c81542042d92ea161c5dc6bb4bf..799cbe9fd4041363571ede7a743e6c74f97c1aa3 100644 --- a/paddlespeech/t2s/modules/tade_res_block.py +++ b/paddlespeech/t2s/modules/tade_res_block.py @@ -60,11 +60,15 @@ class TADELayer(nn.Layer): def forward(self, x, c): """Calculate forward propagation. Args: - x (Tensor): Input tensor (B, in_channels, T). - c (Tensor): Auxiliary input tensor (B, aux_channels, T). + x (Tensor): + Input tensor (B, in_channels, T). + c (Tensor): + Auxiliary input tensor (B, aux_channels, T). Returns: - Tensor: Output tensor (B, in_channels, T * upsample_factor). - Tensor: Upsampled aux tensor (B, in_channels, T * upsample_factor). + Tensor: + Output tensor (B, in_channels, T * upsample_factor). + Tensor: + Upsampled aux tensor (B, in_channels, T * upsample_factor). """ x = self.norm(x) @@ -138,11 +142,15 @@ class TADEResBlock(nn.Layer): """Calculate forward propagation. Args: - x (Tensor): Input tensor (B, in_channels, T). - c (Tensor): Auxiliary input tensor (B, aux_channels, T). + x (Tensor): + Input tensor (B, in_channels, T). + c (Tensor): + Auxiliary input tensor (B, aux_channels, T). Returns: - Tensor: Output tensor (B, in_channels, T * upsample_factor). - Tensor: Upsampled auxirialy tensor (B, in_channels, T * upsample_factor). + Tensor: + Output tensor (B, in_channels, T * upsample_factor). + Tensor: + Upsampled auxirialy tensor (B, in_channels, T * upsample_factor). """ residual = x x, c = self.tade1(x, c) diff --git a/paddlespeech/t2s/modules/transformer/attention.py b/paddlespeech/t2s/modules/transformer/attention.py index 538a36b6bfa3c8a1ec82798ef1b7923f4d4bdfb5..d7a032445304fe5a6d6b9294912665f29ff53858 100644 --- a/paddlespeech/t2s/modules/transformer/attention.py +++ b/paddlespeech/t2s/modules/transformer/attention.py @@ -25,9 +25,12 @@ from paddlespeech.t2s.modules.masked_fill import masked_fill class MultiHeadedAttention(nn.Layer): """Multi-Head Attention layer. Args: - n_head (int): The number of heads. - n_feat (int): The number of features. - dropout_rate (float): Dropout rate. + n_head (int): + The number of heads. + n_feat (int): + The number of features. 
+ dropout_rate (float): + Dropout rate. """ def __init__(self, n_head, n_feat, dropout_rate): @@ -48,14 +51,20 @@ class MultiHeadedAttention(nn.Layer): """Transform query, key and value. Args: - query(Tensor): query tensor (#batch, time1, size). - key(Tensor): Key tensor (#batch, time2, size). - value(Tensor): Value tensor (#batch, time2, size). + query(Tensor): + query tensor (#batch, time1, size). + key(Tensor): + Key tensor (#batch, time2, size). + value(Tensor): + Value tensor (#batch, time2, size). Returns: - Tensor: Transformed query tensor (#batch, n_head, time1, d_k). - Tensor: Transformed key tensor (#batch, n_head, time2, d_k). - Tensor: Transformed value tensor (#batch, n_head, time2, d_k). + Tensor: + Transformed query tensor (#batch, n_head, time1, d_k). + Tensor: + Transformed key tensor (#batch, n_head, time2, d_k). + Tensor: + Transformed value tensor (#batch, n_head, time2, d_k). """ n_batch = paddle.shape(query)[0] @@ -77,9 +86,12 @@ class MultiHeadedAttention(nn.Layer): """Compute attention context vector. Args: - value(Tensor): Transformed value (#batch, n_head, time2, d_k). - scores(Tensor): Attention score (#batch, n_head, time1, time2). - mask(Tensor, optional): Mask (#batch, 1, time2) or (#batch, time1, time2). (Default value = None) + value(Tensor): + Transformed value (#batch, n_head, time2, d_k). + scores(Tensor): + Attention score (#batch, n_head, time1, time2). + mask(Tensor, optional): + Mask (#batch, 1, time2) or (#batch, time1, time2). (Default value = None) Returns: Tensor: Transformed value (#batch, time1, d_model) weighted by the attention score (#batch, time1, time2). @@ -113,10 +125,14 @@ class MultiHeadedAttention(nn.Layer): """Compute scaled dot product attention. Args: - query(Tensor): Query tensor (#batch, time1, size). - key(Tensor): Key tensor (#batch, time2, size). - value(Tensor): Value tensor (#batch, time2, size). - mask(Tensor, optional): Mask tensor (#batch, 1, time2) or (#batch, time1, time2). (Default value = None) + query(Tensor): + Query tensor (#batch, time1, size). + key(Tensor): + Key tensor (#batch, time2, size). + value(Tensor): + Value tensor (#batch, time2, size). + mask(Tensor, optional): + Mask tensor (#batch, 1, time2) or (#batch, time1, time2). (Default value = None) Returns: Tensor: Output tensor (#batch, time1, d_model). @@ -134,10 +150,14 @@ class RelPositionMultiHeadedAttention(MultiHeadedAttention): Paper: https://arxiv.org/abs/1901.02860 Args: - n_head (int): The number of heads. - n_feat (int): The number of features. - dropout_rate (float): Dropout rate. - zero_triu (bool): Whether to zero the upper triangular part of attention matrix. + n_head (int): + The number of heads. + n_feat (int): + The number of features. + dropout_rate (float): + Dropout rate. + zero_triu (bool): + Whether to zero the upper triangular part of attention matrix. """ def __init__(self, n_head, n_feat, dropout_rate, zero_triu=False): @@ -161,10 +181,11 @@ class RelPositionMultiHeadedAttention(MultiHeadedAttention): def rel_shift(self, x): """Compute relative positional encoding. Args: - x(Tensor): Input tensor (batch, head, time1, 2*time1-1). + x(Tensor): + Input tensor (batch, head, time1, 2*time1-1). Returns: - Tensor:Output tensor. + Tensor: Output tensor. """ b, h, t1, t2 = paddle.shape(x) zero_pad = paddle.zeros((b, h, t1, 1)) @@ -183,11 +204,16 @@ class RelPositionMultiHeadedAttention(MultiHeadedAttention): """Compute 'Scaled Dot Product Attention' with rel. positional encoding. Args: - query(Tensor): Query tensor (#batch, time1, size). 
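The `MultiHeadedAttention` docstrings above describe standard scaled dot-product attention; per head, the computation reduces to softmax(QKᵀ/√d_k)V with masked positions suppressed before the softmax. A single-head numpy sketch of that math (illustrative, not the layer itself, which also applies the linear projections and dropout):

```python
import numpy as np

def scaled_dot_product_attention(q, k, v, mask=None):
    """q: (time1, d_k), k/v: (time2, d_k); mask: (time1, time2) bool, True = keep."""
    d_k = q.shape[-1]
    scores = q @ k.T / np.sqrt(d_k)                 # (time1, time2)
    if mask is not None:
        scores = np.where(mask, scores, -np.inf)    # suppress padded positions
    w = np.exp(scores - scores.max(axis=-1, keepdims=True))
    w = w / w.sum(axis=-1, keepdims=True)           # softmax over time2
    return w @ v                                    # (time1, d_k)

q, k, v = (np.random.randn(5, 64) for _ in range(3))
print(scaled_dot_product_attention(q, k, v).shape)  # (5, 64)
```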
- key(Tensor): Key tensor (#batch, time2, size). - value(Tensor): Value tensor (#batch, time2, size). - pos_emb(Tensor): Positional embedding tensor (#batch, 2*time1-1, size). - mask(Tensor): Mask tensor (#batch, 1, time2) or (#batch, time1, time2). + query(Tensor): + Query tensor (#batch, time1, size). + key(Tensor): + Key tensor (#batch, time2, size). + value(Tensor): + Value tensor (#batch, time2, size). + pos_emb(Tensor): + Positional embedding tensor (#batch, 2*time1-1, size). + mask(Tensor): + Mask tensor (#batch, 1, time2) or (#batch, time1, time2). Returns: Tensor: Output tensor (#batch, time1, d_model). @@ -228,10 +254,14 @@ class LegacyRelPositionMultiHeadedAttention(MultiHeadedAttention): Paper: https://arxiv.org/abs/1901.02860 Args: - n_head (int): The number of heads. - n_feat (int): The number of features. - dropout_rate (float): Dropout rate. - zero_triu (bool): Whether to zero the upper triangular part of attention matrix. + n_head (int): + The number of heads. + n_feat (int): + The number of features. + dropout_rate (float): + Dropout rate. + zero_triu (bool): + Whether to zero the upper triangular part of attention matrix. """ def __init__(self, n_head, n_feat, dropout_rate, zero_triu=False): @@ -255,8 +285,8 @@ class LegacyRelPositionMultiHeadedAttention(MultiHeadedAttention): def rel_shift(self, x): """Compute relative positional encoding. Args: - x(Tensor): Input tensor (batch, head, time1, time2). - + x(Tensor): + Input tensor (batch, head, time1, time2). Returns: Tensor:Output tensor. """ diff --git a/paddlespeech/t2s/modules/transformer/decoder.py b/paddlespeech/t2s/modules/transformer/decoder.py index a8db7345ad07b336debee14ff692cfe4a363a1dd..e68487678560702e484844b804a3d916ee40c838 100644 --- a/paddlespeech/t2s/modules/transformer/decoder.py +++ b/paddlespeech/t2s/modules/transformer/decoder.py @@ -37,28 +37,46 @@ class Decoder(nn.Layer): """Transfomer decoder module. Args: - odim (int): Output diminsion. - self_attention_layer_type (str): Self-attention layer type. - attention_dim (int): Dimention of attention. - attention_heads (int): The number of heads of multi head attention. - conv_wshare (int): The number of kernel of convolution. Only used in + odim (int): + Output diminsion. + self_attention_layer_type (str): + Self-attention layer type. + attention_dim (int): + Dimention of attention. + attention_heads (int): + The number of heads of multi head attention. + conv_wshare (int): + The number of kernel of convolution. Only used in self_attention_layer_type == "lightconv*" or "dynamiconv*". - conv_kernel_length (Union[int, str]):Kernel size str of convolution + conv_kernel_length (Union[int, str]): + Kernel size str of convolution (e.g. 71_71_71_71_71_71). Only used in self_attention_layer_type == "lightconv*" or "dynamiconv*". - conv_usebias (bool): Whether to use bias in convolution. Only used in + conv_usebias (bool): + Whether to use bias in convolution. Only used in self_attention_layer_type == "lightconv*" or "dynamiconv*". - linear_units(int): The number of units of position-wise feed forward. - num_blocks (int): The number of decoder blocks. - dropout_rate (float): Dropout rate. - positional_dropout_rate (float): Dropout rate after adding positional encoding. - self_attention_dropout_rate (float): Dropout rate in self-attention. - src_attention_dropout_rate (float): Dropout rate in source-attention. - input_layer (Union[str, nn.Layer]): Input layer type. - use_output_layer (bool): Whether to use output layer. 
- pos_enc_class (nn.Layer): Positional encoding module class. + linear_units(int): + The number of units of position-wise feed forward. + num_blocks (int): + The number of decoder blocks. + dropout_rate (float): + Dropout rate. + positional_dropout_rate (float): + Dropout rate after adding positional encoding. + self_attention_dropout_rate (float): + Dropout rate in self-attention. + src_attention_dropout_rate (float): + Dropout rate in source-attention. + input_layer (Union[str, nn.Layer]): + Input layer type. + use_output_layer (bool): + Whether to use output layer. + pos_enc_class (nn.Layer): + Positional encoding module class. `PositionalEncoding `or `ScaledPositionalEncoding` - normalize_before (bool): Whether to use layer_norm before the first block. - concat_after (bool): Whether to concat attention layer's input and output. + normalize_before (bool): + Whether to use layer_norm before the first block. + concat_after (bool): + Whether to concat attention layer's input and output. if True, additional linear will be applied. i.e. x -> x + linear(concat(x, att(x))) if False, no additional linear will be applied. i.e. x -> x + att(x) @@ -143,17 +161,22 @@ class Decoder(nn.Layer): def forward(self, tgt, tgt_mask, memory, memory_mask): """Forward decoder. Args: - tgt(Tensor): Input token ids, int64 (#batch, maxlen_out) if input_layer == "embed". + tgt(Tensor): + Input token ids, int64 (#batch, maxlen_out) if input_layer == "embed". In the other case, input tensor (#batch, maxlen_out, odim). - tgt_mask(Tensor): Input token mask (#batch, maxlen_out). - memory(Tensor): Encoded memory, float32 (#batch, maxlen_in, feat). - memory_mask(Tensor): Encoded memory mask (#batch, maxlen_in). + tgt_mask(Tensor): + Input token mask (#batch, maxlen_out). + memory(Tensor): + Encoded memory, float32 (#batch, maxlen_in, feat). + memory_mask(Tensor): + Encoded memory mask (#batch, maxlen_in). Returns: Tensor: Decoded token score before softmax (#batch, maxlen_out, odim) if use_output_layer is True. In the other case,final block outputs (#batch, maxlen_out, attention_dim). - Tensor: Score mask before softmax (#batch, maxlen_out). + Tensor: + Score mask before softmax (#batch, maxlen_out). """ x = self.embed(tgt) @@ -169,14 +192,20 @@ class Decoder(nn.Layer): """Forward one step. Args: - tgt(Tensor): Input token ids, int64 (#batch, maxlen_out). - tgt_mask(Tensor): Input token mask (#batch, maxlen_out). - memory(Tensor): Encoded memory, float32 (#batch, maxlen_in, feat). - cache((List[Tensor]), optional): List of cached tensors. (Default value = None) + tgt(Tensor): + Input token ids, int64 (#batch, maxlen_out). + tgt_mask(Tensor): + Input token mask (#batch, maxlen_out). + memory(Tensor): + Encoded memory, float32 (#batch, maxlen_in, feat). + cache((List[Tensor]), optional): + List of cached tensors. (Default value = None) Returns: - Tensor: Output tensor (batch, maxlen_out, odim). - List[Tensor]: List of cache tensors of each decoder layer. + Tensor: + Output tensor (batch, maxlen_out, odim). + List[Tensor]: + List of cache tensors of each decoder layer. """ x = self.embed(tgt) @@ -219,9 +248,12 @@ class Decoder(nn.Layer): """Score new token batch (required). Args: - ys(Tensor): paddle.int64 prefix tokens (n_batch, ylen). - states(List[Any]): Scorer states for prefix tokens. - xs(Tensor): The encoder feature that generates ys (n_batch, xlen, n_feat). + ys(Tensor): + paddle.int64 prefix tokens (n_batch, ylen). + states(List[Any]): + Scorer states for prefix tokens. 
+ xs(Tensor): + The encoder feature that generates ys (n_batch, xlen, n_feat). Returns: tuple[Tensor, List[Any]]: diff --git a/paddlespeech/t2s/modules/transformer/decoder_layer.py b/paddlespeech/t2s/modules/transformer/decoder_layer.py index 9a13cd794c52cdfab8e7e5ae4cc3aa7842a71688..0a79e95480871288a8cc2c3d7c5b7af4b8f1ff90 100644 --- a/paddlespeech/t2s/modules/transformer/decoder_layer.py +++ b/paddlespeech/t2s/modules/transformer/decoder_layer.py @@ -24,16 +24,23 @@ class DecoderLayer(nn.Layer): Args: - size (int): Input dimension. - self_attn (nn.Layer): Self-attention module instance. + size (int): + Input dimension. + self_attn (nn.Layer): + Self-attention module instance. `MultiHeadedAttention` instance can be used as the argument. - src_attn (nn.Layer): Self-attention module instance. + src_attn (nn.Layer): + Self-attention module instance. `MultiHeadedAttention` instance can be used as the argument. - feed_forward (nn.Layer): Feed-forward module instance. + feed_forward (nn.Layer): + Feed-forward module instance. `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance can be used as the argument. - dropout_rate (float): Dropout rate. - normalize_before (bool): Whether to use layer_norm before the first block. - concat_after (bool): Whether to concat attention layer's input and output. + dropout_rate (float): + Dropout rate. + normalize_before (bool): + Whether to use layer_norm before the first block. + concat_after (bool): + Whether to concat attention layer's input and output. if True, additional linear will be applied. i.e. x -> x + linear(concat(x, att(x))) if False, no additional linear will be applied. i.e. x -> x + att(x) @@ -69,11 +76,16 @@ class DecoderLayer(nn.Layer): """Compute decoded features. Args: - tgt(Tensor): Input tensor (#batch, maxlen_out, size). - tgt_mask(Tensor): Mask for input tensor (#batch, maxlen_out). - memory(Tensor): Encoded memory, float32 (#batch, maxlen_in, size). - memory_mask(Tensor): Encoded memory mask (#batch, maxlen_in). - cache(List[Tensor], optional): List of cached tensors. + tgt(Tensor): + Input tensor (#batch, maxlen_out, size). + tgt_mask(Tensor): + Mask for input tensor (#batch, maxlen_out). + memory(Tensor): + Encoded memory, float32 (#batch, maxlen_in, size). + memory_mask(Tensor): + Encoded memory mask (#batch, maxlen_in). + cache(List[Tensor], optional): + List of cached tensors. Each tensor shape should be (#batch, maxlen_out - 1, size). (Default value = None) Returns: Tensor diff --git a/paddlespeech/t2s/modules/transformer/embedding.py b/paddlespeech/t2s/modules/transformer/embedding.py index 9524f07ee6db6cc6e9ac64998b3f1adb3d7b9ee4..7ba301cbd6af16ccda30dbfba7b432a15ec74c2c 100644 --- a/paddlespeech/t2s/modules/transformer/embedding.py +++ b/paddlespeech/t2s/modules/transformer/embedding.py @@ -23,11 +23,16 @@ class PositionalEncoding(nn.Layer): """Positional encoding. Args: - d_model (int): Embedding dimension. - dropout_rate (float): Dropout rate. - max_len (int): Maximum input length. - reverse (bool): Whether to reverse the input position. - type (str): dtype of param + d_model (int): + Embedding dimension. + dropout_rate (float): + Dropout rate. + max_len (int): + Maximum input length. + reverse (bool): + Whether to reverse the input position. + type (str): + dtype of param """ def __init__(self, @@ -68,7 +73,8 @@ class PositionalEncoding(nn.Layer): """Add positional encoding. Args: - x (Tensor): Input tensor (batch, time, `*`). + x (Tensor): + Input tensor (batch, time, `*`). 
Returns: Tensor: Encoded tensor (batch, time, `*`). @@ -84,10 +90,14 @@ class ScaledPositionalEncoding(PositionalEncoding): See Sec. 3.2 https://arxiv.org/abs/1809.08895 Args: - d_model (int): Embedding dimension. - dropout_rate (float): Dropout rate. - max_len (int): Maximum input length. - dtype (str): dtype of param + d_model (int): + Embedding dimension. + dropout_rate (float): + Dropout rate. + max_len (int): + Maximum input length. + dtype (str): + dtype of param """ def __init__(self, d_model, dropout_rate, max_len=5000, dtype="float32"): @@ -111,7 +121,8 @@ class ScaledPositionalEncoding(PositionalEncoding): """Add positional encoding. Args: - x (Tensor): Input tensor (batch, time, `*`). + x (Tensor): + Input tensor (batch, time, `*`). Returns: Tensor: Encoded tensor (batch, time, `*`). """ @@ -127,9 +138,12 @@ class RelPositionalEncoding(nn.Layer): See : Appendix B in https://arxiv.org/abs/1901.02860 Args: - d_model (int): Embedding dimension. - dropout_rate (float): Dropout rate. - max_len (int): Maximum input length. + d_model (int): + Embedding dimension. + dropout_rate (float): + Dropout rate. + max_len (int): + Maximum input length. """ def __init__(self, d_model, dropout_rate, max_len=5000, dtype="float32"): @@ -175,7 +189,8 @@ class RelPositionalEncoding(nn.Layer): def forward(self, x: paddle.Tensor): """Add positional encoding. Args: - x (Tensor):Input tensor (batch, time, `*`). + x (Tensor): + Input tensor (batch, time, `*`). Returns: Tensor: Encoded tensor (batch, time, `*`). """ @@ -195,18 +210,24 @@ class LegacyRelPositionalEncoding(PositionalEncoding): See : Appendix B in https://arxiv.org/abs/1901.02860 Args: - d_model (int): Embedding dimension. - dropout_rate (float): Dropout rate. - max_len (int): Maximum input length. + d_model (int): + Embedding dimension. + dropout_rate (float): + Dropout rate. + max_len (int): + Maximum input length. """ def __init__(self, d_model: int, dropout_rate: float, max_len: int=5000): """ Args: - d_model (int): Embedding dimension. - dropout_rate (float): Dropout rate. - max_len (int, optional): [Maximum input length.]. Defaults to 5000. + d_model (int): + Embedding dimension. + dropout_rate (float): + Dropout rate. + max_len (int, optional): + [Maximum input length.]. Defaults to 5000. """ super().__init__(d_model, dropout_rate, max_len, reverse=True) @@ -234,10 +255,13 @@ class LegacyRelPositionalEncoding(PositionalEncoding): def forward(self, x: paddle.Tensor): """Compute positional encoding. Args: - x (paddle.Tensor): Input tensor (batch, time, `*`). + x (Tensor): + Input tensor (batch, time, `*`). Returns: - paddle.Tensor: Encoded tensor (batch, time, `*`). - paddle.Tensor: Positional embedding tensor (1, time, `*`). + Tensor: + Encoded tensor (batch, time, `*`). + Tensor: + Positional embedding tensor (1, time, `*`). """ self.extend_pe(x) x = x * self.xscale diff --git a/paddlespeech/t2s/modules/transformer/encoder.py b/paddlespeech/t2s/modules/transformer/encoder.py index 11986360a30ed7dfd7dfcaed4a50bdf259e26983..f2aed58926d7c44392ffa4a69038d32c1506b7bf 100644 --- a/paddlespeech/t2s/modules/transformer/encoder.py +++ b/paddlespeech/t2s/modules/transformer/encoder.py @@ -38,32 +38,55 @@ class BaseEncoder(nn.Layer): """Base Encoder module. Args: - idim (int): Input dimension. - attention_dim (int): Dimention of attention. - attention_heads (int): The number of heads of multi head attention. - linear_units (int): The number of units of position-wise feed forward. - num_blocks (int): The number of decoder blocks. 
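The positional encoding classes documented above all add a sinusoidal table to the (batch, time, d_model) input; `PositionalEncoding` scales the input by √d_model before adding it, while `ScaledPositionalEncoding` instead learns a scalar weight on the encoding term. A numpy sketch of the table and the plain (non-scaled) usage, under those assumptions:

```python
import numpy as np

def sinusoidal_positional_encoding(max_len, d_model):
    """pe[pos, 2i] = sin(pos / 10000^(2i/d)), pe[pos, 2i+1] = cos(...)."""
    pos = np.arange(max_len, dtype=np.float64)[:, None]                    # (T, 1)
    div = np.exp(np.arange(0, d_model, 2) * (-np.log(10000.0) / d_model))  # (d/2,)
    pe = np.zeros((max_len, d_model))
    pe[:, 0::2] = np.sin(pos * div)
    pe[:, 1::2] = np.cos(pos * div)
    return pe                                        # (max_len, d_model)

x = np.random.randn(2, 50, 256)                      # (batch, time, d_model)
y = x * np.sqrt(256) + sinusoidal_positional_encoding(50, 256)[None]
print(y.shape)                                       # (2, 50, 256)
```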
- dropout_rate (float): Dropout rate. - positional_dropout_rate (float): Dropout rate after adding positional encoding. - attention_dropout_rate (float): Dropout rate in attention. - input_layer (Union[str, nn.Layer]): Input layer type. - normalize_before (bool): Whether to use layer_norm before the first block. - concat_after (bool): Whether to concat attention layer's input and output. + idim (int): + Input dimension. + attention_dim (int): + Dimension of attention. + attention_heads (int): + The number of heads of multi-head attention. + linear_units (int): + The number of units of position-wise feed forward. + num_blocks (int): + The number of decoder blocks. + dropout_rate (float): + Dropout rate. + positional_dropout_rate (float): + Dropout rate after adding positional encoding. + attention_dropout_rate (float): + Dropout rate in attention. + input_layer (Union[str, nn.Layer]): + Input layer type. + normalize_before (bool): + Whether to use layer_norm before the first block. + concat_after (bool): + Whether to concat attention layer's input and output. if True, additional linear will be applied. i.e. x -> x + linear(concat(x, att(x))) if False, no additional linear will be applied. i.e. x -> x + att(x) - positionwise_layer_type (str): "linear", "conv1d", or "conv1d-linear". - positionwise_conv_kernel_size (int): Kernel size of positionwise conv1d layer. - macaron_style (bool): Whether to use macaron style for positionwise layer. - pos_enc_layer_type (str): Encoder positional encoding layer type. - selfattention_layer_type (str): Encoder attention layer type. - activation_type (str): Encoder activation function type. - use_cnn_module (bool): Whether to use convolution module. - zero_triu (bool): Whether to zero the upper triangular part of attention matrix. - cnn_module_kernel (int): Kernerl size of convolution module. - padding_idx (int): Padding idx for input_layer=embed. - stochastic_depth_rate (float): Maximum probability to skip the encoder layer. - intermediate_layers (Union[List[int], None]): indices of intermediate CTC layer. + positionwise_layer_type (str): + "linear", "conv1d", or "conv1d-linear". + positionwise_conv_kernel_size (int): + Kernel size of positionwise conv1d layer. + macaron_style (bool): + Whether to use macaron style for positionwise layer. + pos_enc_layer_type (str): + Encoder positional encoding layer type. + selfattention_layer_type (str): + Encoder attention layer type. + activation_type (str): + Encoder activation function type. + use_cnn_module (bool): + Whether to use convolution module. + zero_triu (bool): + Whether to zero the upper triangular part of attention matrix. + cnn_module_kernel (int): + Kernel size of convolution module. + padding_idx (int): + Padding idx for input_layer=embed. + stochastic_depth_rate (float): + Maximum probability to skip the encoder layer. + intermediate_layers (Union[List[int], None]): + indices of intermediate CTC layers. indices start from 1. if not None, intermediate outputs are returned (which changes return type signature.) @@ -266,12 +289,16 @@ class BaseEncoder(nn.Layer): """Encode input sequence. Args: - xs (Tensor): Input tensor (#batch, time, idim). - masks (Tensor): Mask tensor (#batch, 1, time). + xs (Tensor): + Input tensor (#batch, time, idim). + masks (Tensor): + Mask tensor (#batch, 1, time). Returns: - Tensor: Output tensor (#batch, time, attention_dim). - Tensor: Mask tensor (#batch, 1, time). + Tensor: + Output tensor (#batch, time, attention_dim). + Tensor: + Mask tensor (#batch, 1, time).
""" xs = self.embed(xs) xs, masks = self.encoders(xs, masks) @@ -284,26 +311,43 @@ class TransformerEncoder(BaseEncoder): """Transformer encoder module. Args: - idim (int): Input dimension. - attention_dim (int): Dimention of attention. - attention_heads (int): The number of heads of multi head attention. - linear_units (int): The number of units of position-wise feed forward. - num_blocks (int): The number of decoder blocks. - dropout_rate (float): Dropout rate. - positional_dropout_rate (float): Dropout rate after adding positional encoding. - attention_dropout_rate (float): Dropout rate in attention. - input_layer (Union[str, paddle.nn.Layer]): Input layer type. - pos_enc_layer_type (str): Encoder positional encoding layer type. - normalize_before (bool): Whether to use layer_norm before the first block. - concat_after (bool): Whether to concat attention layer's input and output. + idim (int): + Input dimension. + attention_dim (int): + Dimension of attention. + attention_heads (int): + The number of heads of multi-head attention. + linear_units (int): + The number of units of position-wise feed forward. + num_blocks (int): + The number of decoder blocks. + dropout_rate (float): + Dropout rate. + positional_dropout_rate (float): + Dropout rate after adding positional encoding. + attention_dropout_rate (float): + Dropout rate in attention. + input_layer (Union[str, paddle.nn.Layer]): + Input layer type. + pos_enc_layer_type (str): + Encoder positional encoding layer type. + normalize_before (bool): + Whether to use layer_norm before the first block. + concat_after (bool): + Whether to concat attention layer's input and output. if True, additional linear will be applied. i.e. x -> x + linear(concat(x, att(x))) if False, no additional linear will be applied. i.e. x -> x + att(x) - positionwise_layer_type (str): "linear", "conv1d", or "conv1d-linear". - positionwise_conv_kernel_size (int): Kernel size of positionwise conv1d layer. - selfattention_layer_type (str): Encoder attention layer type. - activation_type (str): Encoder activation function type. - padding_idx (int): Padding idx for input_layer=embed. + positionwise_layer_type (str): + "linear", "conv1d", or "conv1d-linear". + positionwise_conv_kernel_size (int): + Kernel size of positionwise conv1d layer. + selfattention_layer_type (str): + Encoder attention layer type. + activation_type (str): + Encoder activation function type. + padding_idx (int): + Padding idx for input_layer=embed. """ def __init__( @@ -350,12 +394,16 @@ class TransformerEncoder(BaseEncoder): """Encoder input sequence. Args: - xs(Tensor): Input tensor (#batch, time, idim). - masks(Tensor): Mask tensor (#batch, 1, time). + xs(Tensor): + Input tensor (#batch, time, idim). + masks(Tensor): + Mask tensor (#batch, 1, time). Returns: - Tensor: Output tensor (#batch, time, attention_dim). - Tensor: Mask tensor (#batch, 1, time). + Tensor: + Output tensor (#batch, time, attention_dim). + Tensor: + Mask tensor (#batch, 1, time). """ xs = self.embed(xs) xs, masks = self.encoders(xs, masks) @@ -367,14 +415,20 @@ class TransformerEncoder(BaseEncoder): """Encode input frame. Args: - xs (Tensor): Input tensor. - masks (Tensor): Mask tensor. - cache (List[Tensor]): List of cache tensors. + xs (Tensor): + Input tensor. + masks (Tensor): + Mask tensor. + cache (List[Tensor]): + List of cache tensors. Returns: - Tensor: Output tensor. - Tensor: Mask tensor. - List[Tensor]: List of new cache tensors. + Tensor: + Output tensor. + Tensor: + Mask tensor.
+ List[Tensor]: + List of new cache tensors. """ xs = self.embed(xs) @@ -393,32 +447,55 @@ class ConformerEncoder(BaseEncoder): """Conformer encoder module. Args: - idim (int): Input dimension. - attention_dim (int): Dimention of attention. - attention_heads (int): The number of heads of multi head attention. - linear_units (int): The number of units of position-wise feed forward. - num_blocks (int): The number of decoder blocks. - dropout_rate (float): Dropout rate. - positional_dropout_rate (float): Dropout rate after adding positional encoding. - attention_dropout_rate (float): Dropout rate in attention. - input_layer (Union[str, nn.Layer]): Input layer type. - normalize_before (bool): Whether to use layer_norm before the first block. - concat_after (bool):Whether to concat attention layer's input and output. + idim (int): + Input dimension. + attention_dim (int): + Dimension of attention. + attention_heads (int): + The number of heads of multi-head attention. + linear_units (int): + The number of units of position-wise feed forward. + num_blocks (int): + The number of decoder blocks. + dropout_rate (float): + Dropout rate. + positional_dropout_rate (float): + Dropout rate after adding positional encoding. + attention_dropout_rate (float): + Dropout rate in attention. + input_layer (Union[str, nn.Layer]): + Input layer type. + normalize_before (bool): + Whether to use layer_norm before the first block. + concat_after (bool): + Whether to concat attention layer's input and output. if True, additional linear will be applied. i.e. x -> x + linear(concat(x, att(x))) if False, no additional linear will be applied. i.e. x -> x + att(x) - positionwise_layer_type (str): "linear", "conv1d", or "conv1d-linear". - positionwise_conv_kernel_size (int): Kernel size of positionwise conv1d layer. - macaron_style (bool): Whether to use macaron style for positionwise layer. - pos_enc_layer_type (str): Encoder positional encoding layer type. - selfattention_layer_type (str): Encoder attention layer type. - activation_type (str): Encoder activation function type. - use_cnn_module (bool): Whether to use convolution module. - zero_triu (bool): Whether to zero the upper triangular part of attention matrix. - cnn_module_kernel (int): Kernerl size of convolution module. - padding_idx (int): Padding idx for input_layer=embed. - stochastic_depth_rate (float): Maximum probability to skip the encoder layer. - intermediate_layers (Union[List[int], None]):indices of intermediate CTC layer. indices start from 1. + positionwise_layer_type (str): + "linear", "conv1d", or "conv1d-linear". + positionwise_conv_kernel_size (int): + Kernel size of positionwise conv1d layer. + macaron_style (bool): + Whether to use macaron style for positionwise layer. + pos_enc_layer_type (str): + Encoder positional encoding layer type. + selfattention_layer_type (str): + Encoder attention layer type. + activation_type (str): + Encoder activation function type. + use_cnn_module (bool): + Whether to use convolution module. + zero_triu (bool): + Whether to zero the upper triangular part of attention matrix. + cnn_module_kernel (int): + Kernel size of convolution module. + padding_idx (int): + Padding idx for input_layer=embed. + stochastic_depth_rate (float): + Maximum probability to skip the encoder layer. + intermediate_layers (Union[List[int], None]): + indices of intermediate CTC layers. indices start from 1. if not None, intermediate outputs are returned (which changes return type signature.)
""" @@ -478,11 +555,15 @@ class ConformerEncoder(BaseEncoder): """Encode input sequence. Args: - xs (Tensor): Input tensor (#batch, time, idim). - masks (Tensor): Mask tensor (#batch, 1, time). + xs (Tensor): + Input tensor (#batch, time, idim). + masks (Tensor): + Mask tensor (#batch, 1, time). Returns: - Tensor: Output tensor (#batch, time, attention_dim). - Tensor: Mask tensor (#batch, 1, time). + Tensor: + Output tensor (#batch, time, attention_dim). + Tensor: + Mask tensor (#batch, 1, time). """ if isinstance(self.embed, (Conv2dSubsampling)): xs, masks = self.embed(xs, masks) @@ -539,7 +620,8 @@ class Conv1dResidualBlock(nn.Layer): def forward(self, xs): """Encode input sequence. Args: - xs (Tensor): Input tensor (#batch, idim, T). + xs (Tensor): + Input tensor (#batch, idim, T). Returns: Tensor: Output tensor (#batch, odim, T). """ @@ -582,8 +664,10 @@ class CNNDecoder(nn.Layer): def forward(self, xs, masks=None): """Encode input sequence. Args: - xs (Tensor): Input tensor (#batch, time, idim). - masks (Tensor): Mask tensor (#batch, 1, time). + xs (Tensor): + Input tensor (#batch, time, idim). + masks (Tensor): + Mask tensor (#batch, 1, time). Returns: Tensor: Output tensor (#batch, time, odim). """ @@ -629,8 +713,10 @@ class CNNPostnet(nn.Layer): def forward(self, xs, masks=None): """Encode input sequence. Args: - xs (Tensor): Input tensor (#batch, odim, time). - masks (Tensor): Mask tensor (#batch, 1, time). + xs (Tensor): + Input tensor (#batch, odim, time). + masks (Tensor): + Mask tensor (#batch, 1, time). Returns: Tensor: Output tensor (#batch, odim, time). """ diff --git a/paddlespeech/t2s/modules/transformer/encoder_layer.py b/paddlespeech/t2s/modules/transformer/encoder_layer.py index 72372b69b92bcae4dab8485498f56d0ad639f91f..63494b0de8d3cbf16edc4dc7d8dd09b7cdec65d9 100644 --- a/paddlespeech/t2s/modules/transformer/encoder_layer.py +++ b/paddlespeech/t2s/modules/transformer/encoder_layer.py @@ -21,14 +21,20 @@ class EncoderLayer(nn.Layer): """Encoder layer module. Args: - size (int): Input dimension. - self_attn (nn.Layer): Self-attention module instance. + size (int): + Input dimension. + self_attn (nn.Layer): + Self-attention module instance. `MultiHeadedAttention` instance can be used as the argument. - feed_forward (nn.Layer): Feed-forward module instance. + feed_forward (nn.Layer): + Feed-forward module instance. `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance can be used as the argument. - dropout_rate (float): Dropout rate. - normalize_before (bool): Whether to use layer_norm before the first block. - concat_after (bool): Whether to concat attention layer's input and output. + dropout_rate (float): + Dropout rate. + normalize_before (bool): + Whether to use layer_norm before the first block. + concat_after (bool): + Whether to concat attention layer's input and output. if True, additional linear will be applied. i.e. x -> x + linear(concat(x, att(x))) if False, no additional linear will be applied. i.e. x -> x + att(x) @@ -59,13 +65,18 @@ class EncoderLayer(nn.Layer): """Compute encoded features. Args: - x(Tensor): Input tensor (#batch, time, size). - mask(Tensor): Mask tensor for the input (#batch, time). - cache(Tensor, optional): Cache tensor of the input (#batch, time - 1, size). + x(Tensor): + Input tensor (#batch, time, size). + mask(Tensor): + Mask tensor for the input (#batch, time). + cache(Tensor, optional): + Cache tensor of the input (#batch, time - 1, size). Returns: - Tensor: Output tensor (#batch, time, size). 
- Tensor: Mask tensor (#batch, time). + Tensor: + Output tensor (#batch, time, size). + Tensor: + Mask tensor (#batch, time). """ residual = x if self.normalize_before: diff --git a/paddlespeech/t2s/modules/transformer/lightconv.py b/paddlespeech/t2s/modules/transformer/lightconv.py index 9bcc1acfba021d91bec798f4eea41461cfffb81e..22217d50f512699a88cb649af11628a2e5d111bf 100644 --- a/paddlespeech/t2s/modules/transformer/lightconv.py +++ b/paddlespeech/t2s/modules/transformer/lightconv.py @@ -31,12 +31,18 @@ class LightweightConvolution(nn.Layer): https://github.com/pytorch/fairseq/tree/master/fairseq Args: - wshare (int): the number of kernel of convolution - n_feat (int): the number of features - dropout_rate (float): dropout_rate - kernel_size (int): kernel size (length) - use_kernel_mask (bool): Use causal mask or not for convolution kernel - use_bias (bool): Use bias term or not. + wshare (int): + the number of kernel of convolution + n_feat (int): + the number of features + dropout_rate (float): + dropout_rate + kernel_size (int): + kernel size (length) + use_kernel_mask (bool): + Use causal mask or not for convolution kernel + use_bias (bool): + Use bias term or not. """ @@ -94,10 +100,14 @@ class LightweightConvolution(nn.Layer): This is just for compatibility with self-attention layer (attention.py) Args: - query (Tensor): input tensor. (batch, time1, d_model) - key (Tensor): NOT USED. (batch, time2, d_model) - value (Tensor): NOT USED. (batch, time2, d_model) - mask : (Tensor): (batch, time1, time2) mask + query (Tensor): + input tensor. (batch, time1, d_model) + key (Tensor): + NOT USED. (batch, time2, d_model) + value (Tensor): + NOT USED. (batch, time2, d_model) + mask : (Tensor): + (batch, time1, time2) mask Return: Tensor: ouput. (batch, time1, d_model) diff --git a/paddlespeech/t2s/modules/transformer/mask.py b/paddlespeech/t2s/modules/transformer/mask.py index c10e6add2a0c37e052a9df9d3ae6a8535d03e942..71dd379756df892a729994022e84f1467118d814 100644 --- a/paddlespeech/t2s/modules/transformer/mask.py +++ b/paddlespeech/t2s/modules/transformer/mask.py @@ -19,8 +19,10 @@ def subsequent_mask(size, dtype=paddle.bool): """Create mask for subsequent steps (size, size). Args: - size (int): size of mask - dtype (paddle.dtype): result dtype + size (int): + size of mask + dtype (paddle.dtype): + result dtype Return: Tensor: >>> subsequent_mask(3) @@ -36,9 +38,12 @@ def target_mask(ys_in_pad, ignore_id, dtype=paddle.bool): """Create mask for decoder self-attention. Args: - ys_pad (Tensor): batch of padded target sequences (B, Lmax) - ignore_id (int): index of padding - dtype (paddle.dtype): result dtype + ys_pad (Tensor): + batch of padded target sequences (B, Lmax) + ignore_id (int): + index of padding + dtype (paddle.dtype): + result dtype Return: Tensor: (B, Lmax, Lmax) """ diff --git a/paddlespeech/t2s/modules/transformer/multi_layer_conv.py b/paddlespeech/t2s/modules/transformer/multi_layer_conv.py index d3285b65f3113c4aaa844d5ccb35d0399e3f6331..91d67ca58376967f4eeaf6d069691446c2714b79 100644 --- a/paddlespeech/t2s/modules/transformer/multi_layer_conv.py +++ b/paddlespeech/t2s/modules/transformer/multi_layer_conv.py @@ -32,10 +32,14 @@ class MultiLayeredConv1d(nn.Layer): """Initialize MultiLayeredConv1d module. Args: - in_chans (int): Number of input channels. - hidden_chans (int): Number of hidden channels. - kernel_size (int): Kernel size of conv1d. - dropout_rate (float): Dropout rate. + in_chans (int): + Number of input channels. 
+ hidden_chans (int): + Number of hidden channels. + kernel_size (int): + Kernel size of conv1d. + dropout_rate (float): + Dropout rate. """ super().__init__() @@ -58,7 +62,8 @@ class MultiLayeredConv1d(nn.Layer): """Calculate forward propagation. Args: - x (Tensor): Batch of input tensors (B, T, in_chans). + x (Tensor): + Batch of input tensors (B, T, in_chans). Returns: Tensor: Batch of output tensors (B, T, in_chans). @@ -79,10 +84,14 @@ class Conv1dLinear(nn.Layer): """Initialize Conv1dLinear module. Args: - in_chans (int): Number of input channels. - hidden_chans (int): Number of hidden channels. - kernel_size (int): Kernel size of conv1d. - dropout_rate (float): Dropout rate. + in_chans (int): + Number of input channels. + hidden_chans (int): + Number of hidden channels. + kernel_size (int): + Kernel size of conv1d. + dropout_rate (float): + Dropout rate. """ super().__init__() self.w_1 = nn.Conv1D( @@ -99,7 +108,8 @@ class Conv1dLinear(nn.Layer): """Calculate forward propagation. Args: - x (Tensor): Batch of input tensors (B, T, in_chans). + x (Tensor): + Batch of input tensors (B, T, in_chans). Returns: Tensor: Batch of output tensors (B, T, in_chans). diff --git a/paddlespeech/t2s/modules/transformer/positionwise_feed_forward.py b/paddlespeech/t2s/modules/transformer/positionwise_feed_forward.py index 92af6851c402b969a5e590be287ba5e7f9c5a262..45ea279bfdde9707a96e219270f9a24886c9c094 100644 --- a/paddlespeech/t2s/modules/transformer/positionwise_feed_forward.py +++ b/paddlespeech/t2s/modules/transformer/positionwise_feed_forward.py @@ -21,9 +21,12 @@ class PositionwiseFeedForward(nn.Layer): """Positionwise feed forward layer. Args: - idim (int): Input dimenstion. - hidden_units (int): The number of hidden units. - dropout_rate (float): Dropout rate. + idim (int): + Input dimension. + hidden_units (int): + The number of hidden units. + dropout_rate (float): + Dropout rate. """ def __init__(self, diff --git a/paddlespeech/t2s/modules/transformer/repeat.py b/paddlespeech/t2s/modules/transformer/repeat.py index 1e946adf7e469fd6c05c2a8c8d9e6f16f638524e..43d11e9f96ee203d003fabac046a8620edd0bdec 100644 --- a/paddlespeech/t2s/modules/transformer/repeat.py +++ b/paddlespeech/t2s/modules/transformer/repeat.py @@ -30,8 +30,10 @@ def repeat(N, fn): """Repeat module N times. Args: - N (int): Number of repeat time. - fn (Callable): Function to generate module. + N (int): + Number of times to repeat. + fn (Callable): + Function to generate module. Returns: MultiSequential: Repeated model instance. diff --git a/paddlespeech/t2s/modules/transformer/subsampling.py b/paddlespeech/t2s/modules/transformer/subsampling.py index 07439705a66cb6bc683bfa5a977aef0db379516c..a17278c0b52ee39d0157ad36e07a61d123bc1b9a 100644 --- a/paddlespeech/t2s/modules/transformer/subsampling.py +++ b/paddlespeech/t2s/modules/transformer/subsampling.py @@ -23,10 +23,14 @@ class Conv2dSubsampling(nn.Layer): """Convolutional 2D subsampling (to 1/4 length). Args: - idim (int): Input dimension. - odim (int): Output dimension. - dropout_rate (float): Dropout rate. - pos_enc (nn.Layer): Custom position encoding layer. + idim (int): + Input dimension. + odim (int): + Output dimension. + dropout_rate (float): + Dropout rate. + pos_enc (nn.Layer): + Custom position encoding layer. """ def __init__(self, idim, odim, dropout_rate, pos_enc=None): @@ -45,11 +49,15 @@ class Conv2dSubsampling(nn.Layer): def forward(self, x, x_mask): """Subsample x. Args: - x (Tensor): Input tensor (#batch, time, idim).
- x_mask (Tensor): Input mask (#batch, 1, time). + x (Tensor): + Input tensor (#batch, time, idim). + x_mask (Tensor): + Input mask (#batch, 1, time). Returns: - Tensor: Subsampled tensor (#batch, time', odim), where time' = time // 4. - Tensor: Subsampled mask (#batch, 1, time'), where time' = time // 4. + Tensor: + Subsampled tensor (#batch, time', odim), where time' = time // 4. + Tensor: + Subsampled mask (#batch, 1, time'), where time' = time // 4. """ # (b, c, t, f) x = x.unsqueeze(1) diff --git a/paddlespeech/t2s/modules/upsample.py b/paddlespeech/t2s/modules/upsample.py index 65e78a8928adcab69379c883a00bd1ab90bccbc0..164db65ddbf996cf13ffd0a726fb63ae214da649 100644 --- a/paddlespeech/t2s/modules/upsample.py +++ b/paddlespeech/t2s/modules/upsample.py @@ -28,9 +28,12 @@ class Stretch2D(nn.Layer): """Strech an image (or image-like object) with some interpolation. Args: - w_scale (int): Scalar of width. - h_scale (int): Scalar of the height. - mode (str, optional): Interpolation mode, modes suppored are "nearest", "bilinear", + w_scale (int): + Scalar of width. + h_scale (int): + Scalar of the height. + mode (str, optional): + Interpolation mode, modes supported are "nearest", "bilinear", "trilinear", "bicubic", "linear" and "area",by default "nearest" For more details about interpolation, see `paddle.nn.functional.interpolate `_. @@ -44,11 +47,12 @@ class Stretch2D(nn.Layer): """ Args: - x (Tensor): Shape (N, C, H, W) + x (Tensor): + Shape (N, C, H, W) Returns: - Tensor: The stretched image. - Shape (N, C, H', W'), where ``H'=h_scale * H``, ``W'=w_scale * W``. + Tensor: + The stretched image. Shape (N, C, H', W'), where ``H'=h_scale * H``, ``W'=w_scale * W``. """ out = F.interpolate( @@ -61,12 +65,18 @@ class UpsampleNet(nn.Layer): convolutions. Args: - upsample_scales (List[int]): Upsampling factors for each strech. - nonlinear_activation (Optional[str], optional): Activation after each convolution, by default None - nonlinear_activation_params (Dict[str, Any], optional): Parameters passed to construct the activation, by default {} - interpolate_mode (str, optional): Interpolation mode of the strech, by default "nearest" - freq_axis_kernel_size (int, optional): Convolution kernel size along the frequency axis, by default 1 - use_causal_conv (bool, optional): Whether to use causal padding before convolution, by default False + upsample_scales (List[int]): + Upsampling factors for each stretch. + nonlinear_activation (Optional[str], optional): + Activation after each convolution, by default None + nonlinear_activation_params (Dict[str, Any], optional): + Parameters passed to construct the activation, by default {} + interpolate_mode (str, optional): + Interpolation mode of the stretch, by default "nearest" + freq_axis_kernel_size (int, optional): + Convolution kernel size along the frequency axis, by default 1 + use_causal_conv (bool, optional): + Whether to use causal padding before convolution, by default False If True, Causal padding is used along the time axis, i.e. padding amount is ``receptive field - 1`` and 0 for before and after, respectively. If False, "same" padding is used along the time axis. @@ -106,7 +116,8 @@ class UpsampleNet(nn.Layer): def forward(self, c): """ Args: - c (Tensor): spectrogram. Shape (N, F, T) + c (Tensor): + spectrogram. Shape (N, F, T) Returns: Tensor: upsampled spectrogram. @@ -126,17 +137,25 @@ class ConvInUpsampleNet(nn.Layer): UpsampleNet. Args: - upsample_scales (List[int]): Upsampling factors for each strech.
- nonlinear_activation (Optional[str], optional): Activation after each convolution, by default None - nonlinear_activation_params (Dict[str, Any], optional): Parameters passed to construct the activation, by default {} - interpolate_mode (str, optional): Interpolation mode of the strech, by default "nearest" - freq_axis_kernel_size (int, optional): Convolution kernel size along the frequency axis, by default 1 - aux_channels (int, optional): Feature size of the input, by default 80 - aux_context_window (int, optional): Context window of the first 1D convolution applied to the input. It + upsample_scales (List[int]): + Upsampling factors for each strech. + nonlinear_activation (Optional[str], optional): + Activation after each convolution, by default None + nonlinear_activation_params (Dict[str, Any], optional): + Parameters passed to construct the activation, by default {} + interpolate_mode (str, optional): + Interpolation mode of the strech, by default "nearest" + freq_axis_kernel_size (int, optional): + Convolution kernel size along the frequency axis, by default 1 + aux_channels (int, optional): + Feature size of the input, by default 80 + aux_context_window (int, optional): + Context window of the first 1D convolution applied to the input. It related to the kernel size of the convolution, by default 0 If use causal convolution, the kernel size is ``window + 1``, else the kernel size is ``2 * window + 1``. - use_causal_conv (bool, optional): Whether to use causal padding before convolution, by default False + use_causal_conv (bool, optional): + Whether to use causal padding before convolution, by default False If True, Causal padding is used along the time axis, i.e. padding amount is ``receptive field - 1`` and 0 for before and after, respectively. If False, "same" padding is used along the time axis. @@ -171,7 +190,8 @@ class ConvInUpsampleNet(nn.Layer): def forward(self, c): """ Args: - c (Tensor): spectrogram. Shape (N, F, T) + c (Tensor): + spectrogram. Shape (N, F, T) Returns: Tensors: upsampled spectrogram. Shape (N, F, T'), where ``T' = upsample_factor * T``, diff --git a/paddlespeech/t2s/training/experiment.py b/paddlespeech/t2s/training/experiment.py index 05a363ff204511ae5c18390277612ad69496732f..1eba826df4785739c9fddf749fff0f212b89c48c 100644 --- a/paddlespeech/t2s/training/experiment.py +++ b/paddlespeech/t2s/training/experiment.py @@ -58,8 +58,10 @@ class ExperimentBase(object): need. Args: - config (yacs.config.CfgNode): The configuration used for the experiment. - args (argparse.Namespace): The parsed command line arguments. + config (yacs.config.CfgNode): + The configuration used for the experiment. + args (argparse.Namespace): + The parsed command line arguments. Examples: >>> def main_sp(config, args): diff --git a/paddlespeech/t2s/utils/checkpoint.py b/paddlespeech/t2s/utils/checkpoint.py index 1e222c50c12790f3ef5b63d24a6ebd1483122b1b..a3a19c0a022ae287bd5d9489e6c18683d06f72b9 100644 --- a/paddlespeech/t2s/utils/checkpoint.py +++ b/paddlespeech/t2s/utils/checkpoint.py @@ -25,7 +25,8 @@ def _load_latest_checkpoint(checkpoint_dir: str) -> int: """Get the iteration number corresponding to the latest saved checkpoint. Args: - checkpoint_dir (str): the directory where checkpoint is saved. + checkpoint_dir (str): + the directory where checkpoint is saved. Returns: int: the latest iteration number. @@ -46,8 +47,10 @@ def _save_checkpoint(checkpoint_dir: str, iteration: int): """Save the iteration number of the latest model to be checkpointed. 
Args: - checkpoint_dir (str): the directory where checkpoint is saved. - iteration (int): the latest iteration number. + checkpoint_dir (str): + the directory where checkpoint is saved. + iteration (int): + the latest iteration number. Returns: None @@ -65,11 +68,14 @@ def load_parameters(model, """Load a specific model checkpoint from disk. Args: - model (Layer): model to load parameters. - optimizer (Optimizer, optional): optimizer to load states if needed. - Defaults to None. - checkpoint_dir (str, optional): the directory where checkpoint is saved. - checkpoint_path (str, optional): if specified, load the checkpoint + model (Layer): + model to load parameters. + optimizer (Optimizer, optional): + optimizer to load states if needed. Defaults to None. + checkpoint_dir (str, optional): + the directory where checkpoint is saved. + checkpoint_path (str, optional): + if specified, load the checkpoint stored in the checkpoint_path and the argument 'checkpoint_dir' will be ignored. Defaults to None. @@ -113,11 +119,14 @@ def save_parameters(checkpoint_dir, iteration, model, optimizer=None): """Checkpoint the latest trained model parameters. Args: - checkpoint_dir (str): the directory where checkpoint is saved. - iteration (int): the latest iteration number. - model (Layer): model to be checkpointed. - optimizer (Optimizer, optional): optimizer to be checkpointed. - Defaults to None. + checkpoint_dir (str): + the directory where checkpoint is saved. + iteration (int): + the latest iteration number. + model (Layer): + model to be checkpointed. + optimizer (Optimizer, optional): + optimizer to be checkpointed. Defaults to None. Returns: None diff --git a/paddlespeech/t2s/utils/error_rate.py b/paddlespeech/t2s/utils/error_rate.py index 41b13b75f06eceefa1c35492fece64864037adc7..76a4f45bee0618668725e6ab88dfa223918b219e 100644 --- a/paddlespeech/t2s/utils/error_rate.py +++ b/paddlespeech/t2s/utils/error_rate.py @@ -71,10 +71,14 @@ def word_errors(reference, hypothesis, ignore_case=False, delimiter=' '): hypothesis sequence in word-level. Args: - reference (str): The reference sentence. - hypothesis (str): The hypothesis sentence. - ignore_case (bool): Whether case-sensitive or not. - delimiter (char(str)): Delimiter of input sentences. + reference (str): + The reference sentence. + hypothesis (str): + The hypothesis sentence. + ignore_case (bool): + Whether case-sensitive or not. + delimiter (char(str)): + Delimiter of input sentences. Returns: list: Levenshtein distance and word number of reference sentence. diff --git a/paddlespeech/t2s/utils/h5_utils.py b/paddlespeech/t2s/utils/h5_utils.py index 75c2e448820da8a6dc183e69e5b1e7683f258b28..7558e046a8a7d234cde64eaaa301fc1ec82144f2 100644 --- a/paddlespeech/t2s/utils/h5_utils.py +++ b/paddlespeech/t2s/utils/h5_utils.py @@ -24,8 +24,10 @@ import numpy as np def read_hdf5(filename: Union[Path, str], dataset_name: str) -> Any: """Read a dataset from a HDF5 file. Args: - filename (Union[Path, str]): Path of the HDF5 file. - dataset_name (str): Name of the dataset to read. + filename (Union[Path, str]): + Path of the HDF5 file. + dataset_name (str): + Name of the dataset to read. Returns: Any: The retrieved dataset. 
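The checkpoint helpers whose docstrings are reworked above (`load_parameters` / `save_parameters` in `paddlespeech/t2s/utils/checkpoint.py`) are easiest to read next to a short usage sketch. The snippet below is illustrative only: the toy model, optimizer, iteration number, and `exp/checkpoints` directory are placeholders, not part of this patch.

```python
# Illustrative sketch of the checkpoint helpers documented above.
# The tiny model, optimizer and directory are placeholders.
import os

import paddle
from paddlespeech.t2s.utils import checkpoint

model = paddle.nn.Linear(80, 80)
optimizer = paddle.optimizer.Adam(parameters=model.parameters())

ckpt_dir = "exp/checkpoints"
os.makedirs(ckpt_dir, exist_ok=True)

# Save parameters (and optimizer states) tagged with an iteration number.
checkpoint.save_parameters(ckpt_dir, 1000, model, optimizer=optimizer)

# Later: restore the latest checkpoint found in the same directory.
checkpoint.load_parameters(model, optimizer=optimizer, checkpoint_dir=ckpt_dir)
```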
diff --git a/paddlespeech/t2s/utils/internals.py b/paddlespeech/t2s/utils/internals.py index 6c10bd2d53ebb944e065ab8fac4fc1ffdfadd994..830e8a80fcb3f7186ba15c416bb310f4a4f17ed1 100644 --- a/paddlespeech/t2s/utils/internals.py +++ b/paddlespeech/t2s/utils/internals.py @@ -22,7 +22,8 @@ def convert_dtype_to_np_dtype_(dtype): Convert paddle's data type to corrsponding numpy data type. Args: - dtype(np.dtype): the data type in paddle. + dtype(np.dtype): + the data type in paddle. Returns: type: the data type in numpy. diff --git a/setup.py b/setup.py index 716c03bc5fd3e0cccd3a034d988ae8e127f8b735..a3ef753a026de9638491e4f42463d5988a38b04c 100644 --- a/setup.py +++ b/setup.py @@ -48,6 +48,7 @@ base = [ "pandas", "paddlenlp", "paddlespeech_feat", + "Pillow>=9.0.0", "praatio==5.0.0", "pypinyin", "pypinyin-dict", @@ -68,14 +69,16 @@ base = [ "prettytable", "zhon", "colorlog", - "pathos == 0.2.8" + "pathos == 0.2.8", + "braceexpand", + "pyyaml" ] server = [ "fastapi", "uvicorn", "pattern_singleton", - "websockets", + "websockets" ] requirements = { @@ -87,7 +90,6 @@ requirements = { "gpustat", "paddlespeech_ctcdecoders", "phkit", - "Pillow", "pybind11", "pypi-kenlm", "snakeviz", diff --git a/speechx/examples/ds2_ol/onnx/local/pd_infer_shape.py b/speechx/examples/ds2_ol/onnx/local/pd_infer_shape.py deleted file mode 100755 index c6e693c6bb53a9d354bfd5b8daf50866259cef59..0000000000000000000000000000000000000000 --- a/speechx/examples/ds2_ol/onnx/local/pd_infer_shape.py +++ /dev/null @@ -1,111 +0,0 @@ -#!/usr/bin/env python3 -W ignore::DeprecationWarning -# https://github.com/jiangjiajun/PaddleUtils/blob/main/paddle/README.md#2-%E4%BF%AE%E6%94%B9paddle%E6%A8%A1%E5%9E%8B%E8%BE%93%E5%85%A5shape -import argparse - -# paddle inference shape - - -def process_old_ops_desc(program): - """set matmul op head_number attr to 1 is not exist.
- - Args: - program (_type_): _description_ - """ - for i in range(len(program.blocks[0].ops)): - if program.blocks[0].ops[i].type == "matmul": - if not program.blocks[0].ops[i].has_attr("head_number"): - program.blocks[0].ops[i]._set_attr("head_number", 1) - - -def infer_shape(program, input_shape_dict): - # 2002002 - model_version = program.desc._version() - # 2.2.2 - paddle_version = paddle.__version__ - major_ver = model_version // 1000000 - minor_ver = (model_version - major_ver * 1000000) // 1000 - patch_ver = model_version - major_ver * 1000000 - minor_ver * 1000 - model_version = "{}.{}.{}".format(major_ver, minor_ver, patch_ver) - if model_version != paddle_version: - print( - f"[WARNING] The model is saved by paddlepaddle v{model_version}, but now your paddlepaddle is version of {paddle_version}, this difference may cause error, it is recommend you reinstall a same version of paddlepaddle for this model" - ) - - OP_WITHOUT_KERNEL_SET = { - 'feed', 'fetch', 'recurrent', 'go', 'rnn_memory_helper_grad', - 'conditional_block', 'while', 'send', 'recv', 'listen_and_serv', - 'fl_listen_and_serv', 'ncclInit', 'select', 'checkpoint_notify', - 'gen_bkcl_id', 'c_gen_bkcl_id', 'gen_nccl_id', 'c_gen_nccl_id', - 'c_comm_init', 'c_sync_calc_stream', 'c_sync_comm_stream', - 'queue_generator', 'dequeue', 'enqueue', 'heter_listen_and_serv', - 'c_wait_comm', 'c_wait_compute', 'c_gen_hccl_id', 'c_comm_init_hccl', - 'copy_cross_scope' - } - - for k, v in input_shape_dict.items(): - program.blocks[0].var(k).desc.set_shape(v) - - for i in range(len(program.blocks)): - for j in range(len(program.blocks[0].ops)): - # for ops - if program.blocks[i].ops[j].type in OP_WITHOUT_KERNEL_SET: - print(f"not infer: {program.blocks[i].ops[j].type} op") - continue - print(f"infer: {program.blocks[i].ops[j].type} op") - program.blocks[i].ops[j].desc.infer_shape(program.blocks[i].desc) - - -def parse_arguments(): - # python pd_infer_shape.py --model_dir data/exp/deepspeech2_online/checkpoints \ - # --model_filename avg_1.jit.pdmodel\ - # --params_filename avg_1.jit.pdiparams \ - # --save_dir . 
\ - # --input_shape_dict="{'audio_chunk':[1,-1,161], 'audio_chunk_lens':[1], 'chunk_state_c_box':[5, 1, 1024], 'chunk_state_h_box':[5,1,1024]}" - parser = argparse.ArgumentParser() - parser.add_argument( - '--model_dir', - required=True, - help='Path of directory saved the input model.') - parser.add_argument( - '--model_filename', required=True, help='model.pdmodel.') - parser.add_argument( - '--params_filename', required=True, help='model.pdiparams.') - parser.add_argument( - '--save_dir', - required=True, - help='directory to save the exported model.') - parser.add_argument( - '--input_shape_dict', required=True, help="The new shape information.") - return parser.parse_args() - - -if __name__ == '__main__': - args = parse_arguments() - - import paddle - paddle.enable_static() - import paddle.fluid as fluid - - input_shape_dict_str = args.input_shape_dict - input_shape_dict = eval(input_shape_dict_str) - - print("Start to load paddle model...") - exe = fluid.Executor(fluid.CPUPlace()) - - prog, ipts, outs = fluid.io.load_inference_model( - args.model_dir, - exe, - model_filename=args.model_filename, - params_filename=args.params_filename) - - process_old_ops_desc(prog) - infer_shape(prog, input_shape_dict) - - fluid.io.save_inference_model( - args.save_dir, - ipts, - outs, - exe, - prog, - model_filename=args.model_filename, - params_filename=args.params_filename) diff --git a/speechx/examples/ds2_ol/onnx/local/pd_prune_model.py b/speechx/examples/ds2_ol/onnx/local/pd_prune_model.py deleted file mode 100755 index 5386a971a352e0e529a458a6ccfd02e78153ba40..0000000000000000000000000000000000000000 --- a/speechx/examples/ds2_ol/onnx/local/pd_prune_model.py +++ /dev/null @@ -1,158 +0,0 @@ -#!/usr/bin/env python3 -W ignore::DeprecationWarning -# https://github.com/jiangjiajun/PaddleUtils/blob/main/paddle/README.md#1-%E8%A3%81%E5%89%AApaddle%E6%A8%A1%E5%9E%8B -import argparse -import sys -from typing import List - -# paddle prune model. - - -def prepend_feed_ops(program, - feed_target_names: List[str], - feed_holder_name='feed'): - import paddle.fluid.core as core - if len(feed_target_names) == 0: - return - - global_block = program.global_block() - feed_var = global_block.create_var( - name=feed_holder_name, - type=core.VarDesc.VarType.FEED_MINIBATCH, - persistable=True, ) - - for i, name in enumerate(feed_target_names, 0): - if not global_block.has_var(name): - print( - f"The input[{i}]: '{name}' doesn't exist in pruned inference program, which will be ignored in new saved model." - ) - continue - - out = global_block.var(name) - global_block._prepend_op( - type='feed', - inputs={'X': [feed_var]}, - outputs={'Out': [out]}, - attrs={'col': i}, ) - - -def append_fetch_ops(program, - fetch_target_names: List[str], - fetch_holder_name='fetch'): - """in the place, we will add the fetch op - - Args: - program (_type_): inference program - fetch_target_names (List[str]): target names - fetch_holder_name (str, optional): fetch op name. Defaults to 'fetch'. 
- """ - import paddle.fluid.core as core - global_block = program.global_block() - fetch_var = global_block.create_var( - name=fetch_holder_name, - type=core.VarDesc.VarType.FETCH_LIST, - persistable=True, ) - - print(f"the len of fetch_target_names: {len(fetch_target_names)}") - - for i, name in enumerate(fetch_target_names): - global_block.append_op( - type='fetch', - inputs={'X': [name]}, - outputs={'Out': [fetch_var]}, - attrs={'col': i}, ) - - -def insert_fetch(program, - fetch_target_names: List[str], - fetch_holder_name='fetch'): - """in the place, we will add the fetch op - - Args: - program (_type_): inference program - fetch_target_names (List[str]): target names - fetch_holder_name (str, optional): fetch op name. Defaults to 'fetch'. - """ - global_block = program.global_block() - - # remove fetch - need_to_remove_op_index = [] - for i, op in enumerate(global_block.ops): - if op.type == 'fetch': - need_to_remove_op_index.append(i) - - for index in reversed(need_to_remove_op_index): - global_block._remove_op(index) - - program.desc.flush() - - # append new fetch - append_fetch_ops(program, fetch_target_names, fetch_holder_name) - - -def parse_arguments(): - parser = argparse.ArgumentParser() - parser.add_argument( - '--model_dir', - required=True, - help='Path of directory saved the input model.') - parser.add_argument( - '--model_filename', required=True, help='model.pdmodel.') - parser.add_argument( - '--params_filename', required=True, help='model.pdiparams.') - parser.add_argument( - '--output_names', - required=True, - help='The outputs of model. sep by comma') - parser.add_argument( - '--save_dir', - required=True, - help='directory to save the exported model.') - parser.add_argument('--debug', default=False, help='output debug info.') - return parser.parse_args() - - -if __name__ == '__main__': - args = parse_arguments() - - args.output_names = args.output_names.split(",") - - if len(set(args.output_names)) < len(args.output_names): - print( - f"[ERROR] There's dumplicate name in --output_names {args.output_names}, which is not allowed." 
- ) - sys.exit(-1) - - import paddle - paddle.enable_static() - # hack prepend_feed_ops - paddle.fluid.io.prepend_feed_ops = prepend_feed_ops - - import paddle.fluid as fluid - - print("start to load paddle model") - exe = fluid.Executor(fluid.CPUPlace()) - prog, ipts, outs = fluid.io.load_inference_model( - args.model_dir, - exe, - model_filename=args.model_filename, - params_filename=args.params_filename) - - print("start to load insert fetch op") - new_outputs = [] - insert_fetch(prog, args.output_names) - for out_name in args.output_names: - new_outputs.append(prog.global_block().var(out_name)) - - # not equal to paddle.static.save_inference_model - fluid.io.save_inference_model( - args.save_dir, - ipts, - new_outputs, - exe, - prog, - model_filename=args.model_filename, - params_filename=args.params_filename) - - if args.debug: - for op in prog.global_block().ops: - print(op) diff --git a/speechx/examples/ds2_ol/onnx/local/prune.sh b/speechx/examples/ds2_ol/onnx/local/prune.sh deleted file mode 100755 index 64636bccf79043b6e6eac6a917fb522388d2b70b..0000000000000000000000000000000000000000 --- a/speechx/examples/ds2_ol/onnx/local/prune.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/bash - -set -e - -if [ $# != 5 ]; then - # local/prune.sh data/exp/deepspeech2_online/checkpoints avg_1.jit.pdmodel avg_1.jit.pdiparams softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 $PWD - echo "usage: $0 model_dir model_filename param_filename outputs_names save_dir" - exit 1 -fi - -dir=$1 -model=$2 -param=$3 -outputs=$4 -save_dir=$5 - - -python local/pd_prune_model.py \ - --model_dir $dir \ - --model_filename $model \ - --params_filename $param \ - --output_names $outputs \ - --save_dir $save_dir \ No newline at end of file diff --git a/speechx/examples/ds2_ol/onnx/run.sh b/speechx/examples/ds2_ol/onnx/run.sh index b4df9d94ff87104c9fad00d85d7b775d7587d01a..3dc5e9100da3d51173e08ac36ddd98b998951f96 100755 --- a/speechx/examples/ds2_ol/onnx/run.sh +++ b/speechx/examples/ds2_ol/onnx/run.sh @@ -39,41 +39,10 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ];then popd fi -output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 -if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ];then - # prune model by outputs - mkdir -p $exp/prune - - # prune model deps on output_names. 
- ./local/prune.sh $dir $model $param $output_names $exp/prune -fi - -# aishell rnn hidden is 1024 -# wenetspeech rnn hiddn is 2048 -if [ $model_type == 'aishell' ];then - input_shape_dict="{'audio_chunk':[1,-1,161], 'audio_chunk_lens':[1], 'chunk_state_c_box':[5, 1, 1024], 'chunk_state_h_box':[5,1,1024]}" -elif [ $model_type == 'wenetspeech' ];then - input_shape_dict="{'audio_chunk':[1,-1,161], 'audio_chunk_lens':[1], 'chunk_state_c_box':[5, 1, 2048], 'chunk_state_h_box':[5,1,2048]}" -else - echo "not support: $model_type" - exit -1 -fi -if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ];then - # infer shape by new shape - mkdir -p $exp/shape - echo $input_shape_dict - python3 local/pd_infer_shape.py \ - --model_dir $dir \ - --model_filename $model \ - --params_filename $param \ - --save_dir $exp/shape \ - --input_shape_dict="${input_shape_dict}" -fi - input_file=$exp/static_ds2online_inputs.pickle test -e $input_file -if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ];then +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ];then # to onnx ./local/tonnx.sh $dir $model $param $exp/model.onnx @@ -81,7 +50,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ];then fi -if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ] ;then +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ] ;then # ort graph optmize ./local/ort_opt.py --model_in $exp/model.onnx --opt_level 0 --model_out $exp/model.ort.opt.onnx @@ -89,7 +58,7 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ] ;then fi -if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ];then +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ];then # convert opset_num to 11 ./local/onnx_convert_opset.py --target-opset 11 --model-file $exp/model.ort.opt.onnx --save-model $exp/model.optset11.onnx
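With the prune and shape-inference stages removed, `run.sh` now goes straight from the Paddle checkpoint to ONNX export, ONNX Runtime graph optimization, and opset-11 conversion. A quick, optional sanity check of the final artifact is to load it with ONNX Runtime and print its input/output signature; the sketch below is illustrative only and assumes `onnxruntime` is installed and that the model sits at the `$exp/model.optset11.onnx` path used by the script.

```python
# Illustrative check of the opset-11 model produced by the last stage of run.sh.
# Assumes onnxruntime is installed; the path mirrors $exp/model.optset11.onnx.
import onnxruntime as ort

sess = ort.InferenceSession(
    "exp/model.optset11.onnx", providers=["CPUExecutionProvider"])
for inp in sess.get_inputs():
    print("input :", inp.name, inp.shape, inp.type)
for out in sess.get_outputs():
    print("output:", out.name, out.shape, out.type)
```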