diff --git a/.gitignore b/.gitignore index 8cbb734dfd364215a4436dd67ca4a74a314cdcf0..cc8fff8770b97a3f31eb49270ad32ac25af30fad 100644 --- a/.gitignore +++ b/.gitignore @@ -15,6 +15,7 @@ build docs/build/ +docs/topic/ctc/warp-ctc/ tools/venv tools/kenlm diff --git a/.mergify.yml b/.mergify.yml index 2c30721f70411f728784792a0bf248ea62e480bf..3347c6dc3e9c0864c991c64cebd7ea1b27f0dbb7 100644 --- a/.mergify.yml +++ b/.mergify.yml @@ -32,6 +32,12 @@ pull_request_rules: actions: label: remove: ["conflicts"] + - name: "auto add label=Dataset" + conditions: + - files~=^dataset/ + actions: + label: + add: ["Dataset"] - name: "auto add label=S2T" conditions: - files~=^paddlespeech/s2t/ @@ -50,18 +56,30 @@ pull_request_rules: actions: label: add: ["Audio"] - - name: "auto add label=TextProcess" + - name: "auto add label=Vector" + conditions: + - files~=^paddlespeech/vector/ + actions: + label: + add: ["Vector"] + - name: "auto add label=Text" conditions: - files~=^paddlespeech/text/ actions: label: - add: ["TextProcess"] + add: ["Text"] - name: "auto add label=Example" conditions: - files~=^examples/ actions: label: add: ["Example"] + - name: "auto add label=CLI" + conditions: + - files~=^paddlespeech/cli + actions: + label: + add: ["CLI"] - name: "auto add label=Demo" conditions: - files~=^demos/ @@ -70,13 +88,13 @@ pull_request_rules: add: ["Demo"] - name: "auto add label=README" conditions: - - files~=README.md + - files~=(README.md|README_cn.md) actions: label: add: ["README"] - name: "auto add label=Documentation" conditions: - - files~=^docs/ + - files~=^(docs/|CHANGELOG.md|paddleaudio/CHANGELOG.md) actions: label: add: ["Documentation"] @@ -88,10 +106,16 @@ pull_request_rules: add: ["CI"] - name: "auto add label=Installation" conditions: - - files~=^(tools/|setup.py|setup.sh) + - files~=^(tools/|setup.py|setup.cfg|setup_audio.py) actions: label: add: ["Installation"] + - name: "auto add label=Test" + conditions: + - files~=^(tests/) + actions: + label: + add: ["Test"] - name: "auto add label=mergify" conditions: - files~=^.mergify.yml diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000000000000000000000000000000000000..5ffe809841145f8403ae50b118c3e29c8b4899a9 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,11 @@ +# Changelog + + +Date: 2022-1-10, Author: Jackwaterveg. +Added features to CLI: + - Support English (librispeech/asr1/transformer). + - Support choosing `decode_method` for conformer and transformer models. + - Refactor the config, using the unified config. + - PR link: https://github.com/PaddlePaddle/PaddleSpeech/pull/1297 + +*** diff --git a/README.md b/README.md index 328508f1f48a5a4c9949ff7e501e687284d7d76d..cca1cb5399ef2e398718befb5ff501735c46a20f 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@

Quick Start - | Tutorials + | Documents | Models List @@ -25,14 +25,6 @@

- - **PaddleSpeech** is an open-source toolkit on [PaddlePaddle](https://github.com/PaddlePaddle/Paddle) platform for a variety of critical tasks in speech and audio, with the state-of-art and influential models. @@ -61,7 +53,6 @@ from https://github.com/18F/open-source-guide/blob/18f-pages/pages/making-readme 我认为跑步最重要的就是给我带来了身体健康。 - @@ -95,7 +86,7 @@ from https://github.com/18F/open-source-guide/blob/18f-pages/pages/making-readme - + @@ -114,6 +105,13 @@ from https://github.com/18F/open-source-guide/blob/18f-pages/pages/making-readme
+ + + +
Input Text Input Text Synthetic Audio
季姬寂,集鸡,鸡即棘鸡。棘鸡饥叽,季姬及箕稷济鸡。鸡既济,跻姬笈,季姬忌,急咭鸡,鸡急,继圾几,季姬急,即籍箕击鸡,箕疾击几伎,伎即齑,鸡叽集几基,季姬急极屐击鸡,鸡既殛,季姬激,即记《季姬击鸡记》。 + +
+
@@ -121,7 +119,39 @@ from https://github.com/18F/open-source-guide/blob/18f-pages/pages/making-readme For more synthesized audios, please refer to [PaddleSpeech Text-to-Speech samples](https://paddlespeech.readthedocs.io/en/latest/tts/demo.html). -### Features: +##### Punctuation Restoration +
+ + + + + + + + + + + + + +
Input Text Output Text
今天的天气真不错啊你下午有空吗我想约你一起去吃饭今天的天气真不错啊!你下午有空吗?我想约你一起去吃饭。
+ +
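The punctuation restoration shown above can also be driven from Python. A minimal sketch, assuming the `TextExecutor` exported by `paddlespeech.cli`; the keyword arguments mirror the `ASRExecutor` pattern used in `demos/speech_recognition` and are assumptions rather than a confirmed signature:

```python
from paddlespeech.cli import TextExecutor

text_executor = TextExecutor()
# task='punc' selects punctuation restoration; other kwargs keep their defaults.
result = text_executor(
    text='今天的天气真不错啊你下午有空吗我想约你一起去吃饭',
    task='punc',
    device='cpu')
print(result)  # expected: 今天的天气真不错啊!你下午有空吗?我想约你一起去吃饭。
```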
+ +### ⭐ Examples +- **[PaddleBoBo](https://github.com/JiehangXie/PaddleBoBo): Use PaddleSpeech TTS to generate a virtual human voice.** +
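Projects such as PaddleBoBo build on the TTS pipeline above. A minimal synthesis sketch, assuming the `TTSExecutor` exported by `paddlespeech.cli`; the `output` keyword and the default model selection are assumptions:

```python
from paddlespeech.cli import TTSExecutor

tts_executor = TTSExecutor()
# The default acoustic model and vocoder are used; only text and output path are set.
wav_file = tts_executor(
    text='你好,欢迎使用飞桨语音合成。',
    output='output.wav')
print('wave file saved to: {}'.format(wav_file))
```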
+ +### 🔥 Hot Activities + +- 2021.12.21~12.24 + + 4 Days Live Courses: In-depth interpretation of PaddleSpeech! + + **Course videos and related materials: https://aistudio.baidu.com/aistudio/education/group/info/25130** + +### Features Via the easy-to-use, efficient, flexible and scalable implementation, our vision is to empower both industrial application and academic research, including training, inference & testing modules, and deployment process. To be more specific, this toolkit features at: - 📦 **Ease of Use**: low barriers to install, and [CLI](#quick-start) is available to quick-start your journey. @@ -132,8 +162,7 @@ Via the easy-to-use, efficient, flexible and scalable implementation, our vision - 🔬 *Integration of mainstream models and datasets*: the toolkit implements modules that participate in the whole pipeline of the speech tasks, and uses mainstream datasets like LibriSpeech, LJSpeech, AIShell, CSMSC, etc. See also [model list](#model-list) for more details. - 🧩 *Cascaded models application*: as an extension of the typical traditional audio tasks, we combine the workflows of the aforementioned tasks with other fields like Natural language processing (NLP) and Computer Vision (CV). - -### Recent Update: +### Recent Update - @@ -313,7 +344,7 @@ The current hyperlinks redirect to [Previous Parakeet](https://github.com/Paddle -
VocoderVocoder WaveFlow LJSpeech @@ -333,7 +364,21 @@ The current hyperlinks redirect to [Previous Parakeet](https://github.com/Paddle Multi Band MelGAN-csmsc
Style MelGANCSMSC + Style MelGAN-csmsc +
HiFiGANCSMSC + HiFiGAN-csmsc +
Voice Cloning GE2E
+**Punctuation Restoration** + + + + + + + + + + + + + + + + + + + +
Task Dataset Model Type Link
Punctuation RestorationIWSLT2012_zhErnie Linear + iwslt2012-punc0 +
+ ## Documents Normally, [Speech SoTA](https://paperswithcode.com/area/speech), [Audio SoTA](https://paperswithcode.com/area/audio) and [Music SoTA](https://paperswithcode.com/area/music) give you an overview of the hot academic topics in the related area. To focus on the tasks in PaddleSpeech, you will find the following guidelines are helpful to grasp the core ideas. - [Installation](./docs/source/install.md) +- [Quick Start](#quickstart) +- [Some Demos](./demos/README.md) - Tutorials - [Automatic Speech Recognition](./docs/source/asr/quick_start.md) - [Introduction](./docs/source/asr/models_introduction.md) @@ -399,9 +470,12 @@ Normally, [Speech SoTA](https://paperswithcode.com/area/speech), [Audio SoTA](ht - [Advanced Usage](./docs/source/tts/advanced_usage.md) - [Chinese Rule Based Text Frontend](./docs/source/tts/zh_text_frontend.md) - [Test Audio Samples](https://paddlespeech.readthedocs.io/en/latest/tts/demo.html) - - Audio Classification - - Speech Translation + - [Audio Classification](./demos/audio_tagging/README.md) + - [Speech Translation](./demos/speech_translation/README.md) - [Released Models](./docs/source/released_model.md) +- [Community](#Community) +- [Welcome to contribute](#contribution) +- [License](#License) The Text-to-Speech module is originally called [Parakeet](https://github.com/PaddlePaddle/Parakeet), and now merged with this repository. If you are interested in academic research about this task, please see [TTS research overview](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/docs/source/tts#overview). Also, [this document](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/tts/models_introduction.md) is a good guideline for the pipeline components. @@ -416,7 +490,7 @@ howpublished = {\url{https://github.com/PaddlePaddle/PaddleSpeech}}, year={2021} } ``` - + ## Contribute to PaddleSpeech You are warmly welcome to submit questions in [discussions](https://github.com/PaddlePaddle/PaddleSpeech/discussions) and bug reports in [issues](https://github.com/PaddlePaddle/PaddleSpeech/issues)! Also, we highly appreciate if you are willing to contribute to this project! @@ -460,13 +534,16 @@ You are warmly welcome to submit questions in [discussions](https://github.com/P ## Acknowledgement -- Many thanks to [yeyupiaoling](https://github.com/yeyupiaoling) for years of attention, constructive advice and great help. + +- Many thanks to [yeyupiaoling](https://github.com/yeyupiaoling)/[PPASR](https://github.com/yeyupiaoling/PPASR)/[PaddlePaddle-DeepSpeech](https://github.com/yeyupiaoling/PaddlePaddle-DeepSpeech)/[VoiceprintRecognition-PaddlePaddle](https://github.com/yeyupiaoling/VoiceprintRecognition-PaddlePaddle)/[AudioClassification-PaddlePaddle](https://github.com/yeyupiaoling/AudioClassification-PaddlePaddle) for years of attention, constructive advice and great help. - Many thanks to [AK391](https://github.com/AK391) for TTS web demo on Huggingface Spaces using Gradio. - Many thanks to [mymagicpower](https://github.com/mymagicpower) for the Java implementation of ASR upon [short](https://github.com/mymagicpower/AIAS/tree/main/3_audio_sdks/asr_sdk) and [long](https://github.com/mymagicpower/AIAS/tree/main/3_audio_sdks/asr_long_audio_sdk) audio files. - +- Many thanks to [JiehangXie](https://github.com/JiehangXie)/[PaddleBoBo](https://github.com/JiehangXie/PaddleBoBo) for developing Virtual Uploader(VUP)/Virtual YouTuber(VTuber) with PaddleSpeech TTS function. 
+- Many thanks to [745165806](https://github.com/745165806)/[PaddleSpeechTask](https://github.com/745165806/PaddleSpeechTask) for contributing the Punctuation Restoration model. Besides, PaddleSpeech depends on a lot of open source repositories. See [references](./docs/source/reference.md) for more information. + ## License PaddleSpeech is provided under the [Apache-2.0 License](./LICENSE). diff --git a/README_cn.md b/README_cn.md index 551c9395b8244d7c8d2fdbad0a8d2cb5e0f029f8..ddf189c318f70c3ef86b6757af8f30ddfc1216fc 100644 --- a/README_cn.md +++ b/README_cn.md @@ -1,4 +1,4 @@ - (简体中文|[English](./README.md)) +(简体中文|[English](./README.md))

@@ -6,7 +6,7 @@

快速开始 - | 教程 + | 教程文档 | 模型列表 @@ -30,7 +30,7 @@ from https://github.com/18F/open-source-guide/blob/18f-pages/pages/making-readme 4.What is the goal of this project? --> -**PaddleSpeech** 是基于飞桨 [PaddlePaddle](https://github.com/PaddlePaddle/Paddle) 深度学习开源框架平台上的一个开源模型库,用于语音和音频中的各种关键任务的开发,包含大量前沿和有影响力的模型,一些典型的应用示例如下: +**PaddleSpeech** 是基于飞桨 [PaddlePaddle](https://github.com/PaddlePaddle/Paddle) 的语音方向的开源模型库,用于语音和音频中的各种关键任务的开发,包含大量基于深度学习前沿和有影响力的模型,一些典型的应用示例如下: ##### 语音识别
@@ -38,7 +38,7 @@ from https://github.com/18F/open-source-guide/blob/18f-pages/pages/making-readme 输入音频 - 识别结果 + 识别结果 @@ -68,8 +68,8 @@ from https://github.com/18F/open-source-guide/blob/18f-pages/pages/making-readme - - + + @@ -90,7 +90,7 @@ from https://github.com/18F/open-source-guide/blob/18f-pages/pages/making-readme
输入音频 翻译结果 输入音频 翻译结果
- + @@ -109,6 +109,13 @@ from https://github.com/18F/open-source-guide/blob/18f-pages/pages/making-readme
+ + + +
输入文本 输入文本 合成音频
季姬寂,集鸡,鸡即棘鸡。棘鸡饥叽,季姬及箕稷济鸡。鸡既济,跻姬笈,季姬忌,急咭鸡,鸡急,继圾几,季姬急,即籍箕击鸡,箕疾击几伎,伎即齑,鸡叽集几基,季姬急极屐击鸡,鸡既殛,季姬激,即记《季姬击鸡记》。 + +
+
@@ -116,7 +123,39 @@ from https://github.com/18F/open-source-guide/blob/18f-pages/pages/making-readme 更多合成音频,可以参考 [PaddleSpeech 语音合成音频示例](https://paddlespeech.readthedocs.io/en/latest/tts/demo.html)。 -### 特性: +##### 标点恢复 +
+ + + + + + + + + + + + + +
输入文本 输出文本
今天的天气真不错啊你下午有空吗我想约你一起去吃饭今天的天气真不错啊!你下午有空吗?我想约你一起去吃饭。
+ +
+ +### ⭐ 应用案例 +- **[PaddleBoBo](https://github.com/JiehangXie/PaddleBoBo): 使用 PaddleSpeech 的语音合成模块生成虚拟人的声音。** + +
+ + +### 🔥 热门活动 + +- 2021.12.21~12.24 + + 4 日直播课: 深度解读 PaddleSpeech 语音技术! + + **直播回放与课件资料: https://aistudio.baidu.com/aistudio/education/group/info/25130** +### 特性 本项目采用了易用、高效、灵活以及可扩展的实现,旨在为工业应用、学术研究提供更好的支持,实现的功能包含训练、推断以及测试模块,以及部署过程,主要包括 - 📦 **易用性**: 安装门槛低,可使用 [CLI](#quick-start) 快速开始。 @@ -127,7 +166,7 @@ from https://github.com/18F/open-source-guide/blob/18f-pages/pages/making-readme - 🔬 主流模型及数据集: 本工具包实现了参与整条语音任务流水线的各个模块,并且采用了主流数据集如 LibriSpeech、LJSpeech、AIShell、CSMSC,详情请见 [模型列表](#model-list)。 - 🧩 级联模型应用: 作为传统语音任务的扩展,我们结合了自然语言处理、计算机视觉等任务,实现更接近实际需求的产业级应用。 -### 近期更新: +### 近期更新 - - + @@ -254,6 +293,7 @@ The current hyperlinks redirect to [Previous Parakeet](https://github.com/Paddle
语音识别模块种类语音转文本模块类型 数据集 模型种类 链接
+ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声学模型和声码器。声学模型和声码器模型如下: @@ -261,8 +301,8 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声 - - + + @@ -302,7 +342,7 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声 - + - + + + + + + + + + + + @@ -348,6 +402,7 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声
语音合成模块类型 模型种类 数据集 链接 数据集 链接
声码器声码器 WaveFlow LJSpeech @@ -322,7 +362,21 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声 Multi Band MelGAN-csmsc
Style MelGANCSMSC + Style MelGAN-csmsc +
HiFiGANCSMSC + HiFiGAN-csmsc +
声音克隆 GE2E
+ **声音分类** @@ -373,25 +428,62 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声
-## 文档 +**标点恢复** + + + + + + + + + + + + + + + + + + + +
任务 数据集 模型种类 链接
标点恢复IWSLT2012_zhErnie Linear + iwslt2012-punc0 +
+ +## 教程文档 -[语音 SoTA](https://paperswithcode.com/area/speech)、[声音 SoTA](https://paperswithcode.com/area/audio)、[音乐 SoTA](https://paperswithcode.com/area/music) 概述了相关领域的热门学术话题。对于 PaddleSpeech 的所关注的任务,以下指南有助于掌握核心思想。 +对于 PaddleSpeech 所关注的任务,以下指南有助于开发者快速入门,了解语音相关核心思想。 -- [安装](./docs/source/install.md) -- 教程 - - [语音识别](./docs/source/asr/quick_start.md) +- [下载安装](./docs/source/install_cn.md) +- [快速开始](#快速开始) +- Notebook 基础教程 + - [声音分类](./docs/tutorial/cls/cls_tutorial.ipynb) + - [语音识别](./docs/tutorial/asr/tutorial_transformer.ipynb) + - [语音翻译](./docs/tutorial/st/st_tutorial.ipynb) + - [声音合成](./docs/tutorial/tts/tts_tutorial.ipynb) + - [示例 Demo](./demos/README.md) +- 进阶文档 + - [语音识别自定义训练](./docs/source/asr/quick_start.md) - [简介](./docs/source/asr/models_introduction.md) - [数据准备](./docs/source/asr/data_preparation.md) - [数据增强](./docs/source/asr/augmentation.md) - [Ngram 语言模型](./docs/source/asr/ngram_lm.md) - - [语音合成](./docs/source/tts/quick_start.md) + - [语音合成自定义训练](./docs/source/tts/quick_start.md) - [简介](./docs/source/tts/models_introduction.md) - [进阶用法](./docs/source/tts/advanced_usage.md) - [中文文本前端](./docs/source/tts/zh_text_frontend.md) - - [音频示例](https://paddlespeech.readthedocs.io/en/latest/tts/demo.html) - - 声音分类 - - 语音翻译 -- [模型](./docs/source/released_model.md) + - [测试语音样本](https://paddlespeech.readthedocs.io/en/latest/tts/demo.html) + - [声音分类](./demos/audio_tagging/README_cn.md) + - [语音翻译](./demos/speech_translation/README_cn.md) +- [模型列表](#模型列表) + - [语音识别](#语音识别模型) + - [语音合成](#语音合成模型) + - [声音分类](#声音分类模型) +- [技术交流群](#技术交流群) +- [欢迎贡献](#欢迎贡献) +- [License](#License) 语音合成模块最初被称为 [Parakeet](https://github.com/PaddlePaddle/Parakeet),现在与此仓库合并。如果您对该任务的学术研究感兴趣,请参阅 [TTS 研究概述](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/docs/source/tts#overview)。此外,[模型介绍](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/tts/models_introduction.md) 是了解语音合成流程的一个很好的指南。 @@ -408,9 +500,9 @@ year={2021} } ``` + ## 参与 PaddleSpeech 的开发 - 热烈欢迎您在[Discussions](https://github.com/PaddlePaddle/PaddleSpeech/discussions) 中提交问题,并在[Issues](https://github.com/PaddlePaddle/PaddleSpeech/issues) 中指出发现的 bug。此外,我们非常希望您参与到 PaddleSpeech 的开发中!
### 贡献者 @@ -452,10 +544,12 @@ year={2021} } ``` ## 致谢 -- 非常感谢 [yeyupiaoling](https://github.com/yeyupiaoling) 多年来的关注和建议,以及在诸多问题上的帮助。 +- 非常感谢 [yeyupiaoling](https://github.com/yeyupiaoling)/[PPASR](https://github.com/yeyupiaoling/PPASR)/[PaddlePaddle-DeepSpeech](https://github.com/yeyupiaoling/PaddlePaddle-DeepSpeech)/[VoiceprintRecognition-PaddlePaddle](https://github.com/yeyupiaoling/VoiceprintRecognition-PaddlePaddle)/[AudioClassification-PaddlePaddle](https://github.com/yeyupiaoling/AudioClassification-PaddlePaddle) 多年来的关注和建议,以及在诸多问题上的帮助。 - 非常感谢 [AK391](https://github.com/AK391) 在 Huggingface Spaces 上使用 Gradio 对我们的语音合成功能进行网页版演示。 - 非常感谢 [mymagicpower](https://github.com/mymagicpower) 采用PaddleSpeech 对 ASR 的[短语音](https://github.com/mymagicpower/AIAS/tree/main/3_audio_sdks/asr_sdk)及[长语音](https://github.com/mymagicpower/AIAS/tree/main/3_audio_sdks/asr_long_audio_sdk)进行 Java 实现。 - +- 非常感谢 [JiehangXie](https://github.com/JiehangXie)/[PaddleBoBo](https://github.com/JiehangXie/PaddleBoBo) 采用 PaddleSpeech 语音合成功能实现 Virtual Uploader(VUP)/Virtual YouTuber(VTuber) 虚拟主播。 +- 非常感谢 [745165806](https://github.com/745165806)/[PaddleSpeechTask](https://github.com/745165806/PaddleSpeechTask) 贡献标点重建相关模型。 + 此外,PaddleSpeech 依赖于许多开源存储库。有关更多信息,请参阅 [references](./docs/source/reference.md)。 ## License diff --git a/demos/README.md b/demos/README.md index 28bab8bb318cfe7a8fa47b32b922d0961e522b1f..4482aa191cb7bf8b79d39ee37bb75590c24fe3b1 100644 --- a/demos/README.md +++ b/demos/README.md @@ -1,10 +1,15 @@ # Speech Application based on PaddleSpeech +([简体中文](./README_cn.md)|English) + The directory contains many speech applications in multiple scenarios. -* audio tagging - tag audio label in vedio -* metaverse - 2D AR with TTS -* speech recogintion - vidio understanding +* audio tagging - multi-label tagging of an audio file +* automatic_video_subtitiles - generate subtitles from a video +* metaverse - 2D AR with TTS +* punctuation_restoration - restore punctuation from raw text +* speech recognition - recognize text of an audio file * speech translation - end to end speech translation * story talker - book reader based on OCR and TTS * style_fs2 - multi style control for FastSpeech2 model +* text_to_speech - convert text into speech diff --git a/demos/README_cn.md b/demos/README_cn.md new file mode 100644 index 0000000000000000000000000000000000000000..242b4f070d41e791577c526610a7ae78ea24df3e --- /dev/null +++ b/demos/README_cn.md @@ -0,0 +1,15 @@ +# PaddleSpeech 语音应用 Demo + +(简体中文|[English](./README.md)) + +该目录包含基于 PaddleSpeech 开发的不同场景的语音应用 Demo: + +* 声音分类 - 基于 AudioSet 的 527 类标签的音频多标签分类。 +* 视频字幕生成 - 识别视频中语音的文本,并进行文本后处理。 +* 元宇宙 - 基于语音合成的 2D 增强现实。 +* 标点恢复 - 通常作为语音识别的文本后处理任务,为一段无标点的纯文本添加相应的标点符号。 +* 语音识别 - 识别一段音频中包含的语音文字。 +* 语音翻译 - 实时识别音频中的语言,并同时翻译成目标语言。 +* 会说话的故事书 - 基于 OCR 和语音合成的会说话的故事书。 +* 个性化语音合成 - 基于 FastSpeech2 模型的个性化语音合成。 +* 语音合成 - 基于给定的文本生成语音音频。 diff --git a/demos/audio_tagging/README.md b/demos/audio_tagging/README.md index 88ff363750e0453e3766a9d6b465d26075e2c785..9d4af0be6c286ecb5ea09baa9178217d67c3f20c 100644 --- a/demos/audio_tagging/README.md +++ b/demos/audio_tagging/README.md @@ -9,9 +9,9 @@ This demo is an implementation to tag an audio file with 527 [AudioSet](https:// ## Usage ### 1. Installation -```bash -pip install paddlespeech -``` +See [installation](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install.md). + +You can choose one way from easy, medium and hard to install paddlespeech. ### 2. Prepare Input File The input of this demo should be a WAV file(`.wav`).
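Besides the command line, the tagging demo can be scripted. A minimal sketch, assuming the `CLSExecutor` exported by `paddlespeech.cli`; the `topk` keyword follows the CLI option of the same name and `./cat.wav` is a hypothetical input file:

```python
from paddlespeech.cli import CLSExecutor

cls_executor = CLSExecutor()
# Returns the highest-scoring AudioSet labels for the given wave file.
result = cls_executor(
    audio_file='./cat.wav',  # hypothetical input file
    topk=10)
print(result)
```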
diff --git a/demos/audio_tagging/README_cn.md b/demos/audio_tagging/README_cn.md index 3331a83a69625f178c685fbe18b565c2e70e3a03..79f87bf8c224e4d0c57368130d04fbc7b32d811e 100644 --- a/demos/audio_tagging/README_cn.md +++ b/demos/audio_tagging/README_cn.md @@ -9,9 +9,10 @@ ## 使用方法 ### 1. 安装 -```bash -pip install paddlespeech -``` +请看[安装文档](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install_cn.md)。 + +你可以从 easy,medium,hard 三种方式中选择一种方式安装。 + ### 2. 准备输入 这个 demo 的输入应该是一个 WAV 文件(`.wav`), diff --git a/demos/automatic_video_subtitiles/README.md b/demos/automatic_video_subtitiles/README.md index df4b0e264b1f1a5c64d896efe06bb44dc631630e..db6da40db0add1b61d95aecc389d645f001b735f 100644 --- a/demos/automatic_video_subtitiles/README.md +++ b/demos/automatic_video_subtitiles/README.md @@ -8,9 +8,9 @@ This demo is an implementation to automatic video subtitles from a video file. I ## Usage ### 1. Installation -```bash -pip install paddlespeech -``` +See [installation](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install.md). + +You can choose one way from easy, medium and hard to install paddlespeech. ### 2. Prepare Input Get a video file with the speech of the specific language: diff --git a/demos/automatic_video_subtitiles/README_cn.md b/demos/automatic_video_subtitiles/README_cn.md index b217f921d7d5203a09308c050a3b076bf977a8a6..fc7b2cf6a04203dee389f4e2d640d3a3de57cc3a 100644 --- a/demos/automatic_video_subtitiles/README_cn.md +++ b/demos/automatic_video_subtitiles/README_cn.md @@ -6,9 +6,10 @@ 这个 demo 是一个为视频自动生成字幕的实现,它可以通过使用 `PaddleSpeech` 的单个命令或 python 中的几行代码来实现。 ## 使用方法 ### 1. 安装 -```bash -pip install paddlespeech -``` +请看[安装文档](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install_cn.md)。 + +你可以从 easy,medium,hard 三种方式中选择一种方式安装。 + ### 2. 
准备输入 获取包含特定语言语音的视频文件: ```bash diff --git a/demos/metaverse/run.sh b/demos/metaverse/run.sh index e653dbb70dd445209481346666ea623d7c52b7c8..551f0b4e5f6bc8f61c61bb1c383c0cb64f60b6e7 100755 --- a/demos/metaverse/run.sh +++ b/demos/metaverse/run.sh @@ -37,17 +37,20 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # run tts CUDA_VISIBLE_DEVICES=${gpus} \ - python3 ${BIN_DIR}/synthesize_e2e.py \ - --fastspeech2-config=download/fastspeech2_nosil_baker_ckpt_0.4/default.yaml \ - --fastspeech2-checkpoint=download/fastspeech2_nosil_baker_ckpt_0.4/snapshot_iter_76000.pdz \ - --fastspeech2-stat=download/fastspeech2_nosil_baker_ckpt_0.4/speech_stats.npy \ - --pwg-config=download/pwg_baker_ckpt_0.4/pwg_default.yaml \ - --pwg-checkpoint=download/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \ - --pwg-stat=download/pwg_baker_ckpt_0.4/pwg_stats.npy \ + python3 ${BIN_DIR}/../synthesize_e2e.py \ + --am=fastspeech2_csmsc \ + --am_config=download/fastspeech2_nosil_baker_ckpt_0.4/default.yaml \ + --am_ckpt=download/fastspeech2_nosil_baker_ckpt_0.4/snapshot_iter_76000.pdz \ + --am_stat=download/fastspeech2_nosil_baker_ckpt_0.4/speech_stats.npy \ + --voc=pwgan_csmsc \ + --voc_config=download/pwg_baker_ckpt_0.4/pwg_default.yaml \ + --voc_ckpt=download/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \ + --voc_stat=download/pwg_baker_ckpt_0.4/pwg_stats.npy \ + --lang=zh \ --text=sentences.txt \ - --output-dir=output/wavs \ - --inference-dir=output/inference \ - --phones-dict=download/fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt + --output_dir=output/wavs \ + --inference_dir=output/inference \ + --phones_dict=download/fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt # output/inference is not needed here, which saves the static models rm -rf output/inference fi diff --git a/demos/punctuation_restoration/README.md b/demos/punctuation_restoration/README.md index 966e387c6431bdb8b8da63c48ea693817111f2f6..518d437dc20a34251e24f58298b0f49dcd57e967 100644 --- a/demos/punctuation_restoration/README.md +++ b/demos/punctuation_restoration/README.md @@ -7,9 +7,10 @@ This demo is an implementation to restore punctuation from raw text. It can be d ## Usage ### 1. Installation -```bash -pip install paddlespeech -``` +See [installation](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install.md). + +You can choose one way from easy, medium and hard to install paddlespeech. + ### 2. Prepare Input The input of this demo should be a text of the specific language that can be passed via argument. diff --git a/demos/punctuation_restoration/README_cn.md b/demos/punctuation_restoration/README_cn.md index 4f1e012391cfb899cecd375d33a12a6e80923985..9d4be8bf08a8d512cd4189367cca9cd8590a60f4 100644 --- a/demos/punctuation_restoration/README_cn.md +++ b/demos/punctuation_restoration/README_cn.md @@ -9,9 +9,10 @@ ## 使用方法 ### 1. 安装 -```bash -pip install paddlespeech -``` +请看[安装文档](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install_cn.md)。 + +你可以从 easy,medium,hard 三种方式中选择一种方式安装。 + ### 2. 准备输入 这个 demo 的输入是通过参数传递的特定语言的文本。 diff --git a/demos/speech_recognition/README.md b/demos/speech_recognition/README.md index 738acdc5ce6b4d59cc47fa930e911f64ecd90d55..c49afa35c2d8027011c333eb110eb22b1d08924d 100644 --- a/demos/speech_recognition/README.md +++ b/demos/speech_recognition/README.md @@ -8,9 +8,9 @@ This demo is an implementation to recognize text from a specific audio file. It ## Usage ### 1. 
Installation -```bash -pip install paddlespeech -``` +See [installation](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install.md). + +You can choose one way from easy, medium and hard to install paddlespeech. ### 2. Prepare Input File The input of this demo should be a WAV file(`.wav`), and the sample rate must be the same as the model. @@ -23,8 +23,13 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee ### 3. Usage - Command Line(Recommended) ```bash + # Chinese paddlespeech asr --input ./zh.wav + # English + paddlespeech asr --model transformer_librispeech --lang en --input ./en.wav ``` + (It doesn't matter if the package `paddlespeech-ctcdecoders` is not found; this package is optional.) + Usage: ```bash paddlespeech asr --help ``` @@ -36,11 +41,15 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee - `sample_rate`: Sample rate of the model. Default: `16000`. - `config`: Config of asr task. Use pretrained model when it is None. Default: `None`. - `ckpt_path`: Model checkpoint. Use pretrained model when it is None. Default: `None`. + - `yes`: No additional value required. Once this flag is set, you accept all requests from the program by default, which includes transforming the sample rate of the input audio. Default: `False`. - `device`: Choose device to execute model inference. Default: default device of paddlepaddle in current environment. Output: ```bash + # Chinese [2021-12-08 13:12:34,063] [ INFO] [utils.py] [L225] - ASR Result: 我认为跑步最重要的就是给我带来了身体健康 + # English + [2022-01-12 11:51:10,815] [ INFO] - ASR Result: i knocked at the door on the ancient side of the building ``` - Python API ```python @@ -56,6 +65,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee config=None, # Set `config` and `ckpt_path` to None to use pretrained model. ckpt_path=None, audio_file='./zh.wav', + force_yes=False, device=paddle.get_device()) print('ASR Result: \n{}'.format(text)) ``` @@ -73,4 +83,4 @@ Here is a list of pretrained models released by PaddleSpeech that can be used by | Model | Language | Sample Rate | :--- | :---: | :---: | | conformer_wenetspeech| zh| 16000 -| transformer_aishell| zh| 16000 +| transformer_librispeech| en| 16000 diff --git a/demos/speech_recognition/README_cn.md b/demos/speech_recognition/README_cn.md index 29e2343d38b80f177f8828c476b203a5df5be909..c2e38c91bc6b6374e8ab93f720b5c59330f3e05c 100644 --- a/demos/speech_recognition/README_cn.md +++ b/demos/speech_recognition/README_cn.md @@ -2,14 +2,15 @@ # 语音识别 ## 介绍 -语音识别解决让计算机程序自动转录语音的问题。 +语音识别是一项用计算机程序自动转录语音的技术。 这个 demo 是一个从给定音频文件识别文本的实现,它可以通过使用 `PaddleSpeech` 的单个命令或 python 中的几行代码来实现。 ## 使用方法 ### 1. 安装 -```bash -pip install paddlespeech -``` +请看[安装文档](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install_cn.md)。 + +你可以从 easy,medium,hard 三种方式中选择一种方式安装。 + ### 2. 准备输入 这个 demo 的输入应该是一个 WAV 文件(`.wav`),并且采样率必须与模型的采样率相同。 @@ -20,8 +21,13 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee ### 3. 
使用方法 - 命令行 (推荐使用) ```bash + # 中文 paddlespeech asr --input ./zh.wav + # 英文 + paddlespeech asr --model transformer_librispeech --lang en --input ./en.wav ``` + (如果显示 `paddlespeech-ctcdecoders` 这个 python 包没有找到的 Error,没有关系,这个包是非必须的。) + 使用方法: ```bash paddlespeech asr --help ``` @@ -33,11 +39,15 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee - `sample_rate`:音频采样率,默认值:`16000`。 - `config`:ASR 任务的参数文件,若不设置则使用预训练模型中的默认配置,默认值:`None`。 - `ckpt_path`:模型参数文件,若不设置则下载预训练模型使用,默认值:`None`。 + - `yes`:不需要设置额外的参数,一旦设置了该参数,说明你默认同意程序的所有请求,其中包括自动转换输入音频的采样率。默认值:`False`。 - `device`:执行预测的设备,默认值:当前系统下 paddlepaddle 的默认 device。 输出: ```bash + # 中文 [2021-12-08 13:12:34,063] [ INFO] [utils.py] [L225] - ASR Result: 我认为跑步最重要的就是给我带来了身体健康 + # 英文 + [2022-01-12 11:51:10,815] [ INFO] - ASR Result: i knocked at the door on the ancient side of the building ``` - Python API ```python @@ -53,6 +63,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee config=None, # Set `config` and `ckpt_path` to None to use pretrained model. ckpt_path=None, audio_file='./zh.wav', + force_yes=False, device=paddle.get_device()) print('ASR Result: \n{}'.format(text)) ``` @@ -69,4 +80,4 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee | 模型 | 语言 | 采样率 | :--- | :---: | :---: | | conformer_wenetspeech| zh| 16000 -| transformer_aishell| zh| 16000 +| transformer_librispeech| en| 16000 diff --git a/demos/speech_translation/README.md b/demos/speech_translation/README.md index a2041b0c9bff13f36327dc3abb3e1f780d970416..f675a4eda0a57b9c33633d9b9a9c619a99e8e712 100644 --- a/demos/speech_translation/README.md +++ b/demos/speech_translation/README.md @@ -7,9 +7,10 @@ This demo is an implementation to recognize text from a specific audio file and ## Usage ### 1. Installation -```bash -pip install paddlespeech -``` +See [installation](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install.md). + +You can choose one way from easy, medium and hard to install paddlespeech. + ### 2. Prepare Input File The input of this demo should be a WAV file(`.wav`). diff --git a/demos/speech_translation/README_cn.md b/demos/speech_translation/README_cn.md index affa82282798e6107998140c172b5f240cf8ffa2..bad9b392f9122d02c9fa3c365d5fc54bb82ed492 100644 --- a/demos/speech_translation/README_cn.md +++ b/demos/speech_translation/README_cn.md @@ -8,9 +8,10 @@ ## 使用方法 ### 1. 安装 -```bash -pip install paddlespeech -``` +请看[安装文档](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install_cn.md)。 + +你可以从 easy,medium,hard 三种方式中选择一种方式安装。 + ### 2. 
准备输入 这个 Demo 的输入是 WAV(`.wav`) 语音文件 diff --git a/demos/story_talker/run.sh b/demos/story_talker/run.sh index 142959b6bbeee2f6c268252d09f6a21db531cc2f..50335e73be7dfb29fcc21c95863e2ddae433f948 100755 --- a/demos/story_talker/run.sh +++ b/demos/story_talker/run.sh @@ -37,17 +37,20 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # run tts CUDA_VISIBLE_DEVICES=${gpus} \ - python3 ${BIN_DIR}/synthesize_e2e.py \ - --fastspeech2-config=download/fastspeech2_nosil_baker_ckpt_0.4/default.yaml \ - --fastspeech2-checkpoint=download/fastspeech2_nosil_baker_ckpt_0.4/snapshot_iter_76000.pdz \ - --fastspeech2-stat=download/fastspeech2_nosil_baker_ckpt_0.4/speech_stats.npy \ - --pwg-config=download/pwg_baker_ckpt_0.4/pwg_default.yaml \ - --pwg-checkpoint=download/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \ - --pwg-stat=download/pwg_baker_ckpt_0.4/pwg_stats.npy \ + python3 ${BIN_DIR}/../synthesize_e2e.py \ + --am=fastspeech2_csmsc \ + --am_config=download/fastspeech2_nosil_baker_ckpt_0.4/default.yaml \ + --am_ckpt=download/fastspeech2_nosil_baker_ckpt_0.4/snapshot_iter_76000.pdz \ + --am_stat=download/fastspeech2_nosil_baker_ckpt_0.4/speech_stats.npy \ + --voc=pwgan_csmsc \ + --voc_config=download/pwg_baker_ckpt_0.4/pwg_default.yaml \ + --voc_ckpt=download/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \ + --voc_stat=download/pwg_baker_ckpt_0.4/pwg_stats.npy \ + --lang=zh \ --text=output/sentences.txt \ - --output-dir=output/wavs \ - --inference-dir=output/inference \ - --phones-dict=download/fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt + --output_dir=output/wavs \ + --inference_dir=output/inference \ + --phones_dict=download/fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt # output/inference is not needed here, which saves the static models rm -rf output/inference fi diff --git a/demos/text_to_speech/README.md b/demos/text_to_speech/README.md index 785c2a623316e76ed0cc7aba18b927a54faa908f..9d3c4ac539a1afcd62a03c4f98b2dfe4cb622aae 100644 --- a/demos/text_to_speech/README.md +++ b/demos/text_to_speech/README.md @@ -8,9 +8,9 @@ This demo is an implementation to generate audio from the given text. It can be d ## Usage ### 1. Installation -```bash -pip install paddlespeech -``` +See [installation](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install.md). + +You can choose one way from easy, medium and hard to install paddlespeech. ### 2. Prepare Input The input of this demo should be a text of the specific language that can be passed via argument. diff --git a/demos/text_to_speech/README_cn.md b/demos/text_to_speech/README_cn.md index 0b2cd0b5dad69a8bdb7cea6f42666dfa05a837c4..f075efdafc1a236b4517764568b31499159c151b 100644 --- a/demos/text_to_speech/README_cn.md +++ b/demos/text_to_speech/README_cn.md @@ -9,9 +9,10 @@ ## 使用方法 ### 1. 安装 -```bash -pip install paddlespeech -``` +请看[安装文档](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install_cn.md)。 + +你可以从 easy,medium,hard 三种方式中选择一种方式安装。 + ### 2. 准备输入 这个 demo 的输入是通过参数传递的特定语言的文本。 diff --git a/docs/source/cls/custom_dataset.md b/docs/source/cls/custom_dataset.md new file mode 100644 index 0000000000000000000000000000000000000000..aaf5943c55fe7c2b7df8c9da06724eef42ba9134 --- /dev/null +++ b/docs/source/cls/custom_dataset.md @@ -0,0 +1,128 @@ +# Customize Dataset for Audio Classification + +Following this tutorial, you can customize your dataset for the audio classification task by using `paddlespeech` and `paddleaudio`.
+ +A base class of classification dataset is `paddleaudio.datasets.dataset.AudioClassificationDataset`. To customize your dataset, you should write a dataset class derived from `AudioClassificationDataset`. + +Assuming you have some wave files stored in your own directory, you should prepare a meta file with the information of file paths and labels. For example, its absolute path is `/PATH/TO/META_FILE.txt`: +``` +/PATH/TO/WAVE_FILE/1.wav cat +/PATH/TO/WAVE_FILE/2.wav cat +/PATH/TO/WAVE_FILE/3.wav dog +/PATH/TO/WAVE_FILE/4.wav dog +``` +Here is an example to build your custom dataset in `custom_dataset.py`: + +```python +from paddleaudio.datasets.dataset import AudioClassificationDataset + +class CustomDataset(AudioClassificationDataset): + meta_file = '/PATH/TO/META_FILE.txt' + # List all the class labels + label_list = [ + 'cat', + 'dog', + ] + + def __init__(self, **kwargs): + files, labels = self._get_data() + super(CustomDataset, self).__init__( + files=files, labels=labels, feat_type='raw', **kwargs) + + def _get_data(self): + ''' + This method offers the information of wave files and labels. + ''' + files = [] + labels = [] + + with open(self.meta_file) as f: + for line in f: + file, label_str = line.strip().split(' ') + files.append(file) + labels.append(self.label_list.index(label_str)) + + return files, labels +``` + +Then you can build the dataset and data loader from `CustomDataset`: +```python +import paddle +from paddleaudio.features import LogMelSpectrogram + +from custom_dataset import CustomDataset + +# Feature config should be aligned with the pretrained model +sample_rate = 32000 +feat_conf = { + 'sr': sample_rate, + 'n_fft': 1024, + 'hop_length': 320, + 'window': 'hann', + 'win_length': 1024, + 'f_min': 50.0, + 'f_max': 14000.0, + 'n_mels': 64, +} + +train_ds = CustomDataset(sample_rate=sample_rate) +feature_extractor = LogMelSpectrogram(**feat_conf) + +train_sampler = paddle.io.DistributedBatchSampler( + train_ds, batch_size=4, shuffle=True, drop_last=False) +train_loader = paddle.io.DataLoader( + train_ds, + batch_sampler=train_sampler, + return_list=True, + use_buffer_reader=True) +``` + +Train a model with `CustomDataset`: +```python +from paddlespeech.cls.models import cnn14 +from paddlespeech.cls.models import SoundClassifier + +backbone = cnn14(pretrained=True, extract_embedding=True) +model = SoundClassifier(backbone, num_class=len(train_ds.label_list)) +optimizer = paddle.optimizer.Adam( + learning_rate=1e-6, parameters=model.parameters()) +criterion = paddle.nn.loss.CrossEntropyLoss() + +steps_per_epoch = len(train_sampler) +epochs = 10 +for epoch in range(1, epochs + 1): + model.train() + + for batch_idx, batch in enumerate(train_loader): + waveforms, labels = batch + # Need a padding when lengths of waveforms differ in a batch.
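+ # (Assumption, not from the original doc: if clips in a batch have different + # lengths, pad each waveform with trailing zeros up to the longest clip, + # e.g. np.pad(w, (0, max_len - len(w))), so the feature extractor receives + # a rectangular [batch, samples] array.)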
+ feats = feature_extractor(waveforms) + feats = paddle.transpose(feats, [0, 2, 1]) + logits = model(feats) + loss = criterion(logits, labels) + loss.backward() + optimizer.step() + if isinstance(optimizer._learning_rate, + paddle.optimizer.lr.LRScheduler): + optimizer._learning_rate.step() + optimizer.clear_grad() + + # Calculate loss + avg_loss = loss.numpy()[0] + + # Calculate metrics + preds = paddle.argmax(logits, axis=1) + num_corrects = (preds == labels).numpy().sum() + num_samples = feats.shape[0] + + avg_acc = num_corrects / num_samples + + print_msg = 'Epoch={}/{}, Step={}/{}'.format( + epoch, epochs, batch_idx + 1, steps_per_epoch) + print_msg += ' loss={:.4f}'.format(avg_loss) + print_msg += ' acc={:.4f}'.format(avg_acc) + print_msg += ' lr={:.6f}'.format(optimizer.get_lr()) + print(print_msg) +``` + +If you want to save the model checkpoint and evaluate on a specific dataset, please see `paddlespeech/cli/exp/panns/train.py` for more details. diff --git a/docs/source/cls/quick_start.md b/docs/source/cls/quick_start.md new file mode 100644 index 0000000000000000000000000000000000000000..e173255cf9dfd1df5627c51e81c88c198f72bab6 --- /dev/null +++ b/docs/source/cls/quick_start.md @@ -0,0 +1,51 @@ +# Quick Start of Audio Classification +Several shell scripts provided in `./examples/esc50/cls0` will help us quickly give it a try, covering most major modules, including data preparation, model training and model evaluation, with the [ESC50](https://github.com/karolpiczak/ESC-50) dataset. + +Some of the scripts in `./examples` are not configured with GPUs. If you want to train with 8 GPUs, please modify `CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7`. If you don't have any GPU available, please set `CUDA_VISIBLE_DEVICES=` to use CPUs instead. + +Let's start an audio classification task with the following steps: + +- Go to the directory + + ```bash + cd examples/esc50/cls0 + ``` + +- Source env + ```bash + source path.sh + ``` + +- Main entry point + ```bash + CUDA_VISIBLE_DEVICES=0 ./run.sh 1 + ``` + +This demo includes fine-tuning, evaluating and deploying an audio classification model. More detailed information is provided in the following sections. + +## Fine-tuning a model +PANNs ([PANNs: Large-Scale Pretrained Audio Neural Networks for Audio Pattern Recognition](https://arxiv.org/pdf/1912.10211.pdf)) are models pretrained on [AudioSet](https://research.google.com/audioset/). They can be easily used to extract audio embeddings for the audio classification task. + +To start a model fine-tuning, please run: +```bash +ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') +feat_backend=numpy +./local/train.sh ${ngpu} ${feat_backend} +``` + +## Deploy a model +Once you save a model checkpoint, you can export it to a static graph and deploy it with a Python script: + +- Export to a static graph + ```bash + ./local/export.sh ${ckpt_dir} ./export + ``` + The argument `ckpt_dir` should be a directory in which a model checkpoint is stored, for example `checkpoint/epoch_50`. + + The static graph will be exported to `./export`. + +- Inference + ```bash + ./local/static_model_infer.sh ${infer_device} ./export ${audio_file} + ``` + The argument `infer_device` can be `cpu` or `gpu`, and it specifies which device to use for inference. And `audio_file` should be a wave file with name `*.wav`.
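For reference, the exported static graph can also be loaded directly with Paddle's inference API instead of the shell wrapper. A minimal sketch; the file names under `./export` and the dummy feature shape are assumptions:

```python
import numpy as np
from paddle.inference import Config, create_predictor

# Point Config at the exported program and weights.
config = Config('./export/inference.pdmodel', './export/inference.pdiparams')
config.disable_gpu()  # or config.enable_use_gpu(100, 0) to run on GPU 0
predictor = create_predictor(config)

# Feed a dummy log-mel feature batch; real inputs come from LogMelSpectrogram.
feats = np.random.randn(1, 100, 64).astype('float32')
input_handle = predictor.get_input_handle(predictor.get_input_names()[0])
input_handle.copy_from_cpu(feats)
predictor.run()

output_handle = predictor.get_output_handle(predictor.get_output_names()[0])
print(output_handle.copy_to_cpu())  # per-class scores
```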
diff --git a/docs/source/install.md b/docs/source/install.md index 8508477726c9b86d51074c1d06ded9325000b508..bdeb37cec23de92941cbff2dd971a9f301c702e0 100644 --- a/docs/source/install.md +++ b/docs/source/install.md @@ -6,16 +6,17 @@ There are 3 ways to use `PaddleSpeech`. According to the degree of difficulty, t |:---- |:----------------------------------------------------------- |:----| | Easy | (1) Use command-line functions of PaddleSpeech.
(2) Experience PaddleSpeech on Ai Studio. | Linux, Mac(not support M1 chip),Windows | | Medium | Support major functions ,such as using the` ready-made `examples and using PaddleSpeech to train your model. | Linux | -| Hard | Support full function of Paddlespeech,including training n-gram language model, Montreal-Forced-Aligner, and so on. And you are more able to be a developer! | Ubuntu | +| Hard | Support the full function of PaddleSpeech, including using the joint CTC decoder with kaldi, training an n-gram language model, Montreal-Forced-Aligner, and so on. And you are better equipped to become a developer! | Ubuntu | ## Prerequisites - Python >= 3.7 - PaddlePaddle latest version (please refer to the [Installation Guide] (https://www.paddlepaddle.org.cn/documentation/docs/en/beginners_guide/index_en.html)) - C++ compilation environment - Tip: For Linux and Mac, do not use command `sh` instead of command `bash` in installation document. +- Tip: We recommend you to install `paddlepaddle` from https://mirror.baidu.com/pypi/simple and install `paddlespeech` from https://pypi.tuna.tsinghua.edu.cn/simple. ## Easy: Get the Basic Function (Support Linux, Mac, and Windows) -- If you are newer to `PaddleSpeech` and want to experience it easily without your machine. We recommend you to use [AI Studio](https://aistudio.baidu.com/aistudio/index) to experience it. There is a step-by-step tutorial for `PaddleSpeech` and you can use the basic function of `PaddleSpeech` with a free machine. +- If you are new to `PaddleSpeech` and want to experience it easily without your own machine, we recommend you to use [AI Studio](https://aistudio.baidu.com/aistudio/index) to experience it. There is a step-by-step [tutorial](https://aistudio.baidu.com/aistudio/education/group/info/25130) for `PaddleSpeech`, and you can use the basic function of `PaddleSpeech` with a free machine. - If you want to use the command line function of Paddlespeech, you need to complete the following steps to install `PaddleSpeech`. For more information about how to use the command line function, you can see the [cli](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/cli). ### Install Conda Conda is a management system of the environment. You can go to [minicoda](https://docs.conda.io/en/latest/miniconda.html) (select a version py>=3.7) to download and install the conda. @@ -29,6 +30,10 @@ conda install -y -c conda-forge sox libsndfile bzip2 #### Windows You need to install `Visual Studio` to make the C++ compilation environment. +https://visualstudio.microsoft.com/visual-cpp-build-tools/ + +You can also see [#1195](https://github.com/PaddlePaddle/PaddleSpeech/discussions/1195) for more help.
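Once `paddlepaddle` is installed with the commands below, you can verify that the base framework works before installing `paddlespeech`; a minimal check using PaddlePaddle's built-in self-test:

```python
import paddle

# Runs a small program on the available device(s) and reports success or failure.
paddle.utils.run_check()
print(paddle.__version__)
```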
+ +#### Mac ```bash brew install gcc ``` @@ -47,10 +52,19 @@ sudo apt install build-essential conda install -y -c gcc_linux-64=8.4.0 gxx_linux-64=8.4.0 ``` ### Install PaddleSpeech -You can use the following command: +Some users may fail to install `kaldiio` due to the default download source; you can install `pytest-runner` first: +```bash +pip install pytest-runner -i https://pypi.tuna.tsinghua.edu.cn/simple +``` +Then you can use the following commands: ```bash -pip install paddlepaddle paddlespeech +pip install paddlepaddle -i https://mirror.baidu.com/pypi/simple +pip install paddlespeech -i https://pypi.tuna.tsinghua.edu.cn/simple ``` +> If you encounter problems with downloading **nltk_data** while using paddlespeech, it may be due to a poor network connection; we suggest you download the [nltk_data](https://paddlespeech.bj.bcebos.com/Parakeet/tools/nltk_data.tar.gz) provided by us and extract it to your `${HOME}`. + +> If you fail to install paddlespeech-ctcdecoders, it doesn't matter. + ## Medium: Get the Major Functions (Support Linux) If you want to get the major function of `paddlespeech`, you need to do following steps: ### Git clone PaddleSpeech @@ -105,13 +119,15 @@ conda install -y -c gcc_linux-64=8.4.0 gxx_linux-64=8.4.0 ### Install PaddlePaddle You can choose the `PaddlePaddle` version based on your system. For example, for CUDA 10.2, CuDNN7.5 install paddlepaddle-gpu 2.2.0: ```bash -python3 -m pip install paddlepaddle-gpu==2.2.0 +python3 -m pip install paddlepaddle-gpu==2.2.0 -i https://mirror.baidu.com/pypi/simple ``` ### Install PaddleSpeech You can install `paddlespeech` by the following command,then you can use the `ready-made` examples in `paddlespeech` : ```bash +# Some users may fail to install `kaldiio` due to the default download source; install `pytest-runner` first. +pip install pytest-runner -i https://pypi.tuna.tsinghua.edu.cn/simple # Make sure you are in the root directory of PaddleSpeech -pip install . +pip install . -i https://pypi.tuna.tsinghua.edu.cn/simple ``` ## Hard: Get the Full Function (Support Ubuntu) @@ -175,14 +191,17 @@ conda activate tools/venv conda install -y -c conda-forge sox libsndfile swig bzip2 libflac bc ``` ### Install PaddlePaddle +Some users may fail to install `kaldiio` due to the default download source; you can install `pytest-runner` first: +```bash +pip install pytest-runner -i https://pypi.tuna.tsinghua.edu.cn/simple +``` Make sure you have GPU and the paddlepaddle version is right. For example, for CUDA 10.2, CuDNN7.5 install paddle 2.2.0: - ```bash -python3 -m pip install paddlepaddle-gpu==2.2.0 +python3 -m pip install paddlepaddle-gpu==2.2.0 -i https://mirror.baidu.com/pypi/simple ``` ### Install PaddleSpeech in Developing Mode ```bash -pip install -e .[develop] +pip install -e .[develop] -i https://pypi.tuna.tsinghua.edu.cn/simple ``` ### Install the Kaldi (Optional) ```bash diff --git a/docs/source/install_cn.md b/docs/source/install_cn.md index 3ffe371d936d0fd10921a0b9d679c36f23955501..55fef93d554a78b4b8e083895846414ef44de62b 100644 --- a/docs/source/install_cn.md +++ b/docs/source/install_cn.md @@ -5,16 +5,18 @@ | :--- | :----------------------------------------------------------- | :------------------ | | 简单 | (1) 使用 PaddleSpeech 的命令行功能.
(2) 在 Aistudio上体验 PaddleSpeech. | Linux, Mac(不支持M1芯片),Windows | | 中等 | 支持 PaddleSpeech 主要功能,比如使用已有 examples 中的模型和使用 PaddleSpeech 来训练自己的模型. | Linux | -| 困难 | 支持 PaddleSpeech 的各项功能,包含训练语言模型,使用强制对齐等。并且你更能成为一名开发者! | Ubuntu | +| 困难 | 支持 PaddleSpeech 的各项功能,包含结合 kaldi 使用 join ctc decoder 方式解码,训练语言模型,使用强制对齐等。并且你更能成为一名开发者! | Ubuntu | ## 先决条件 - Python >= 3.7 - 最新版本的 PaddlePaddle (请看 [安装向导](https://www.paddlepaddle.org.cn/documentation/docs/en/beginners_guide/index_en.html)) - C++ 编译环境 - 提示: 对于 Linux 和 Mac,请不要使用 `sh` 代替安装文档中的 `bash` +- 提示: 我们建议在安装 `paddlepaddle` 的时候使用百度源 https://mirror.baidu.com/pypi/simple ,而在安装 `paddlespeech` 的时候使用清华源 https://pypi.tuna.tsinghua.edu.cn/simple 。 + ## 简单: 获取基本功能(支持 Linux,Mac 和 Windows) -- 如果你是一个刚刚接触 `PaddleSpeech` 的新人并且想要很方便地体验一下该项目。我们建议你 体验一下[AI Studio](https://aistudio.baidu.com/aistudio/index)。我们在AI Studio上面建立了一个让你一步一步运行体验来使用`PaddleSpeech`的教程。 +- 如果你是一个刚刚接触 `PaddleSpeech` 的新人并且想要很方便地体验一下该项目,我们建议你体验一下 [AI Studio](https://aistudio.baidu.com/aistudio/index)。我们在 AI Studio 上面建立了一个让你一步一步运行体验来使用 `PaddleSpeech` 的[教程](https://aistudio.baidu.com/aistudio/education/group/info/25130)。 - 如果你想使用 `PaddleSpeech` 的命令行功能,你需要跟随下面的步骤来安装 `PaddleSpeech`。如果你想了解更多关于使用 `PaddleSpeech` 命令行功能的信息,你可以参考 [cli](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/cli)。 -### 安装Conda Conda是一个包管理的环境。你可以前往 [minicoda](https://docs.conda.io/en/latest/miniconda.html) 去下载并安装 conda(请下载 py>=3.7 的版本)。 +### 安装 Conda Conda是一个包管理的环境。你可以前往 [minicoda](https://docs.conda.io/en/latest/miniconda.html) 去下载并安装 conda(请下载 py>=3.7 的版本)。 然后你需要安装 `paddlespeech` 的 conda 依赖: ```bash @@ -24,6 +26,11 @@ conda install -y -c conda-forge sox libsndfile bzip2 (如果你系统上已经安装了 C++ 编译环境,请忽略这一步。) #### Windows 对于 Windows 系统,需要安装 `Visual Studio` 来完成 C++ 编译环境的安装。 + +https://visualstudio.microsoft.com/visual-cpp-build-tools/ + +你可以前往讨论区[#1195](https://github.com/PaddlePaddle/PaddleSpeech/discussions/1195)获取更多帮助。 + #### Mac ```bash brew install gcc ``` @@ -42,19 +49,27 @@ sudo apt install build-essential conda install -y -c gcc_linux-64=8.4.0 gxx_linux-64=8.4.0 ``` ### 安装 PaddleSpeech -你可以使用如下命令: +部分用户系统由于默认源的问题,安装中会出现 kaldiio 安装出错的问题,建议首先安装 pytest-runner: +```bash +pip install pytest-runner -i https://pypi.tuna.tsinghua.edu.cn/simple +``` +然后你可以使用如下命令: ```bash -pip install paddlepaddle paddlespeech +pip install paddlepaddle -i https://mirror.baidu.com/pypi/simple +pip install paddlespeech -i https://pypi.tuna.tsinghua.edu.cn/simple ``` +> 如果您在使用 paddlespeech 的过程中遇到关于下载 **nltk_data** 的问题,可能是您的网络不佳,我们建议您下载我们提供的 [nltk_data](https://paddlespeech.bj.bcebos.com/Parakeet/tools/nltk_data.tar.gz) 并解压缩到您的 `${HOME}` 目录下。 + +> 如果出现 paddlespeech-ctcdecoders 无法安装的问题,无须担心,这不影响使用。 + ## 中等: 获取主要功能(支持 Linux) -如果你想要使用` paddlespeech` 的主要功能。你需要完成以下几个步骤 +如果你想要使用 `paddlespeech` 的主要功能,你需要完成以下几个步骤 ### Git clone PaddleSpeech -你需要先git clone本仓库 +你需要先 git clone 本仓库 ```bash git clone https://github.com/PaddlePaddle/PaddleSpeech.git cd PaddleSpeech ``` - ### 安装 Conda Conda 是一个包管理的环境。你可以前往 [minicoda](https://docs.conda.io/en/latest/miniconda.html) 去下载并安装 conda(请下载 py>=3.7 的版本)。你可以尝试自己安装,或者使用以下的命令: ```bash @@ -98,12 +113,15 @@ conda install -y -c gcc_linux-64=8.4.0 gxx_linux-64=8.4.0 ### 安装 PaddlePaddle 你可以根据系统配置选择 PaddlePaddle 版本,例如系统使用 CUDA 10.2, CuDNN7.5 ,你可以安装 paddlepaddle-gpu 2.2.0: ```bash -python3 -m pip install paddlepaddle-gpu==2.2.0 +python3 -m pip install paddlepaddle-gpu==2.2.0 -i https://mirror.baidu.com/pypi/simple ``` ### 安装 PaddleSpeech 最后安装 `paddlespeech`,这样你就可以使用 `paddlespeech` 中已有的 examples: ```bash -pip install . 
+# 部分用户系统由于默认源的问题,安装中会出现 kaldiio 安装出错的问题,建议首先安装 pytest-runner: +pip install pytest-runner -i https://pypi.tuna.tsinghua.edu.cn/simple +# 请确保目前处于 PaddleSpeech 项目的根目录 +pip install . -i https://pypi.tuna.tsinghua.edu.cn/simple ``` ## 困难: 获取所有功能(支持 Ubuntu) ### 先决条件 @@ -164,11 +182,16 @@ conda install -y -c conda-forge sox libsndfile swig bzip2 libflac bc ### 安装 PaddlePaddle 请确认你系统是否有 GPU,并且使用了正确版本的 paddlepaddle。例如系统使用 CUDA 10.2, CuDNN7.5 ,你可以安装 paddlepaddle-gpu 2.2.0: ```bash -python3 -m pip install paddlepaddle-gpu==2.2.0 +python3 -m pip install paddlepaddle-gpu==2.2.0 -i https://mirror.baidu.com/pypi/simple ``` ### 用开发者模式安装 PaddleSpeech +部分用户系统由于默认源的问题,安装中会出现 kaldiio 安装出错的问题,建议首先安装 pytest-runner: +```bash +pip install pytest-runner -i https://pypi.tuna.tsinghua.edu.cn/simple +``` +然后安装 PaddleSpeech: ```bash -pip install -e .[develop] +pip install -e .[develop] -i https://pypi.tuna.tsinghua.edu.cn/simple ``` ### 安装 Kaldi(可选) ```bash diff --git a/docs/source/released_model.md b/docs/source/released_model.md index 91ef6d166741dc946a916b1c0ffacf820430442f..3310bfb23cfb56acd3a9e84a41bbed2272dcc913 100644 --- a/docs/source/released_model.md +++ b/docs/source/released_model.md @@ -4,34 +4,32 @@ ### Speech Recognition Model Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER | Hours of speech | Example Link -:-------------:| :------------:| :-----: | -----: | :----------------- |:--------- | :---------- | :--------- | :----------- -[Ds2 Online Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/aishell_ds2_online_cer8.00_release.tar.gz) | Aishell Dataset | Char-based | 345 MB | 2 Conv + 5 LSTM layers with only forward direction | 0.080 |-| 151 h | [D2 Online Aishell ASR0](../../examples/aishell/asr0) -[Ds2 Offline Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/ds2.model.tar.gz)| Aishell Dataset | Char-based | 306 MB | 2 Conv + 3 bidirectional GRU layers| 0.064 |-| 151 h | [Ds2 Offline Aishell ASR0](../../examples/aishell/asr0) -[Conformer Online Aishell ASR1 Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s1/aishell.chunk.release.tar.gz) | Aishell Dataset | Char-based | 283 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0594 |-| 151 h | [Conformer Online Aishell ASR1](../../examples/aishell/asr1) -[Conformer Offline Aishell ASR1 Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s1/aishell.release.tar.gz) | Aishell Dataset | Char-based | 284 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0547 |-| 151 h | [Conformer Offline Aishell ASR1](../../examples/aishell/asr1) -[Transformer Aishell ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/transformer.model.tar.gz) | Aishell Dataset | Char-based | 128 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0538 || 151 h | [Transformer Aishell ASR1](../../examples/aishell/asr1) -[Conformer Librispeech ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr1/conformer.model.tar.gz) | Librispeech Dataset | subword-based | 191 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring |-| 0.0337 | 960 h | [Conformer Librispeech ASR1](../../example/librispeech/asr1) -[Transformer Librispeech ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr1/transformer.model.tar.gz) | Librispeech Dataset | subword-based | 131 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: Attention rescoring |-| 0.0381 |
960 h | [Transformer Librispeech ASR1](../../example/librispeech/asr1) -[Transformer Librispeech ASR2 Model](https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr2/transformer.model.tar.gz) | Librispeech Dataset | subword-based | 131 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: JoinCTC w/ LM |-| 0.0240 | 960 h | [Transformer Librispeech ASR2](../../example/librispeech/asr2) +:-------------:| :------------:| :-----: | -----: | :-----: |:-----:| :-----: | :-----: | :-----: +[Ds2 Online Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_ckpt_0.1.1.model.tar.gz) | Aishell Dataset | Char-based | 345 MB | 2 Conv + 5 LSTM layers with only forward direction | 0.080 |-| 151 h | [D2 Online Aishell ASR0](../../examples/aishell/asr0) +[Ds2 Offline Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_aishell_ckpt_0.1.1.model.tar.gz)| Aishell Dataset | Char-based | 306 MB | 2 Conv + 3 bidirectional GRU layers| 0.064 |-| 151 h | [Ds2 Offline Aishell ASR0](../../examples/aishell/asr0) +[Conformer Offline Aishell ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_conformer_aishell_ckpt_0.1.1.model.tar.gz) | Aishell Dataset | Char-based | 284 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.056 |-| 151 h | [Conformer Offline Aishell ASR1](../../examples/aishell/asr1) +[Transformer Aishell ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_transformer_aishell_ckpt_0.1.1.model.tar.gz) | Aishell Dataset | Char-based | 128 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0523 || 151 h | [Transformer Aishell ASR1](../../examples/aishell/asr1) +[Conformer Librispeech ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr1/asr1_conformer_librispeech_ckpt_0.1.1.model.tar.gz) | Librispeech Dataset | subword-based | 191 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring |-| 0.0337 | 960 h | [Conformer Librispeech ASR1](../../examples/librispeech/asr1) +[Transformer Librispeech ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr1/asr1_transformer_librispeech_ckpt_0.1.1.model.tar.gz) | Librispeech Dataset | subword-based | 131 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: Attention rescoring |-| 0.0381 | 960 h | [Transformer Librispeech ASR1](../../examples/librispeech/asr1) +[Transformer Librispeech ASR2 Model](https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr2/asr2_transformer_librispeech_ckpt_0.1.1.model.tar.gz) | Librispeech Dataset | subword-based | 131 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: JoinCTC w/ LM |-| 0.0240 | 960 h | [Transformer Librispeech ASR2](../../examples/librispeech/asr2) ### Language Model based on NGram Language Model | Training Data | Token-based | Size | Descriptions -:-------------:| :------------:| :-----: | -----: | :----------------- +:------------:| :------------:|:------------: | :------------: | :------------: [English LM](https://deepspeech.bj.bcebos.com/en_lm/common_crawl_00.prune01111.trie.klm) | [CommonCrawl(en.00)](http://web-language-models.s3-website-us-east-1.amazonaws.com/ngrams/en/deduped/en.00.deduped.xz) | Word-based | 8.3 GB | Pruned with 0 1 1 1 1;
About 1.85 billion n-grams;
'trie' binary with '-a 22 -q 8 -b 8' [Mandarin LM Small](https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm) | Baidu Internal Corpus | Char-based | 2.8 GB | Pruned with 0 1 2 4 4;
About 0.13 billion n-grams;
'probing' binary with default settings [Mandarin LM Large](https://deepspeech.bj.bcebos.com/zh_lm/zhidao_giga.klm) | Baidu Internal Corpus | Char-based | 70.4 GB | No Pruning;
About 3.7 billion n-grams;
'probing' binary with default settings ### Speech Translation Models -| Model | Training Data | Token-based | Size | Descriptions | BLEU | Example Link | -| ------------------------------------------------------------ | ------------- | ----------- | ---- | ------------------------------------------------------------ | ----- | ------------------------------------------------------------ | -| [Transformer FAT-ST MTL En-Zh](https://paddlespeech.bj.bcebos.com/s2t/ted_en_zh/st1/fat_st_ted-en-zh.tar.gz) | Ted-En-Zh | Spm | | Encoder:Transformer, Decoder:Transformer,
Decoding method: Attention | 20.80 | [Transformer Ted-En-Zh ST1](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/examples/ted_en_zh/st1) | - +| Model | Training Data | Token-based | Size | Descriptions | BLEU | Example Link | +| :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | +| [Transformer FAT-ST MTL En-Zh](https://paddlespeech.bj.bcebos.com/s2t/ted_en_zh/st1/st1_transformer_mtl_noam_ted-en-zh_ckpt_0.1.1.model.tar.gz) (only for CLI) | Ted-En-Zh | Spm | | Encoder:Transformer, Decoder:Transformer,&#10;
Decoding method: Attention | 20.80 | [Transformer Ted-En-Zh ST1](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/examples/ted_en_zh/st1) | ## Text-to-Speech Models ### Acoustic Models -Model Type | Dataset| Example Link | Pretrained Models|Static Models|Siize(static) +Model Type | Dataset| Example Link | Pretrained Models|Static Models|Size (static) :-------------:| :------------:| :-----: | :-----:| :-----:| :-----: Tacotron2|LJSpeech|[tacotron2-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts0)|[tacotron2_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_ljspeech_ckpt_0.3.zip)||| TransformerTTS| LJSpeech| [transformer-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts1)|[transformer_tts_ljspeech_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/transformer_tts/transformer_tts_ljspeech_ckpt_0.4.zip)||| @@ -43,14 +41,16 @@ FastSpeech2| LJSpeech |[fastspeech2-ljspeech](https://github.com/PaddlePaddle/Pa FastSpeech2| VCTK |[fastspeech2-vctk](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/tts3)|[fastspeech2_nosil_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_vctk_ckpt_0.5.zip)||| ### Vocoders -Model Type | Dataset| Example Link | Pretrained Models| Static Models|Size(static) -:-------------:| :------------:| :-----: | :-----:| :-----:| :-----: +Model Type | Dataset| Example Link | Pretrained Models| Static Models|Size (static) +:-----:| :-----:| :-----: | :-----:| :-----:| :-----: WaveFlow| LJSpeech |[waveflow-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc0)|[waveflow_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/waveflow/waveflow_ljspeech_ckpt_0.3.zip)||| Parallel WaveGAN| CSMSC |[PWGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc1)|[pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip)|[pwg_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_static_0.4.zip)|5.1MB| Parallel WaveGAN| LJSpeech |[PWGAN-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc1)|[pwg_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_ljspeech_ckpt_0.5.zip)||| Parallel WaveGAN|AISHELL-3 |[PWGAN-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/voc1)|[pwg_aishell3_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_aishell3_ckpt_0.5.zip)||| Parallel WaveGAN| VCTK |[PWGAN-vctk](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/voc1)|[pwg_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_vctk_ckpt_0.5.zip)||| -|Multi Band MelGAN |CSMSC|[MB MelGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc3) | [mb_melgan_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_ckpt_0.5.zip)&#10;
[mb_melgan_baker_finetune_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_finetune_ckpt_0.5.zip)|[mb_melgan_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_static_0.5.zip) |8.2MB| +|Multi Band MelGAN | CSMSC |[MB MelGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc3) | [mb_melgan_csmsc_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_ckpt_0.1.1.zip)
[mb_melgan_baker_finetune_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_finetune_ckpt_0.5.zip)|[mb_melgan_csmsc_static_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_static_0.1.1.zip) |8.2MB| +Style MelGAN | CSMSC |[Style MelGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc4)|[style_melgan_csmsc_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/style_melgan/style_melgan_csmsc_ckpt_0.1.1.zip)| | | +HiFiGAN | CSMSC |[HiFiGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc5)|[hifigan_csmsc_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_ckpt_0.1.1.zip)|[hifigan_csmsc_static_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_static_0.1.1.zip)|50MB| ### Voice Cloning Model Type | Dataset| Example Link | Pretrained Models :-------------:| :------------:| :-----: | :-----: @@ -64,14 +64,18 @@ GE2E + FastSpeech2 | AISHELL-3 |[ge2e-fastspeech2-aishell3](https://github.com/ Model Type | Dataset| Example Link | Pretrained Models :-------------:| :------------:| :-----: | :-----: -PANN | Audioset| [audioset_tagging_cnn](https://github.com/qiuqiangkong/audioset_tagging_cnn) | [panns_cnn6.pdparams](https://bj.bcebos.com/paddleaudio/models/panns_cnn6.pdparams),[panns_cnn10.pdparams](https://bj.bcebos.com/paddleaudio/models/panns_cnn10.pdparams),[panns_cnn14.pdparams](https://bj.bcebos.com/paddleaudio/models/panns_cnn14.pdparams) -PANN | ESC-50 |[pann-esc50]("./examples/esc50/cls0")|[panns_cnn6.tar.gz](https://paddlespeech.bj.bcebos.com/cls/panns_cnn6.tar.gz), [panns_cnn10](https://paddlespeech.bj.bcebos.com/cls/panns_cnn10.tar.gz), [panns_cnn14.tar.gz](https://paddlespeech.bj.bcebos.com/cls/panns_cnn14.tar.gz) +PANN | Audioset| [audioset_tagging_cnn](https://github.com/qiuqiangkong/audioset_tagging_cnn) | [panns_cnn6.pdparams](https://bj.bcebos.com/paddleaudio/models/panns_cnn6.pdparams), [panns_cnn10.pdparams](https://bj.bcebos.com/paddleaudio/models/panns_cnn10.pdparams), [panns_cnn14.pdparams](https://bj.bcebos.com/paddleaudio/models/panns_cnn14.pdparams) +PANN | ESC-50 |[pann-esc50](./examples/esc50/cls0)|[esc50_cnn6.tar.gz](https://paddlespeech.bj.bcebos.com/cls/esc50/esc50_cnn6.tar.gz), [esc50_cnn10.tar.gz](https://paddlespeech.bj.bcebos.com/cls/esc50/esc50_cnn10.tar.gz), [esc50_cnn14.tar.gz](https://paddlespeech.bj.bcebos.com/cls/esc50/esc50_cnn14.tar.gz) +## Punctuation Restoration Models +Model Type | Dataset| Example Link | Pretrained Models +:-------------:| :------------:| :-----: | :-----: +Ernie Linear | IWSLT2012_zh |[iwslt2012_punc0](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/iwslt2012/punc0)|[ernie_linear_p3_iwslt2012_zh_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/text/ernie_linear_p3_iwslt2012_zh_ckpt_0.1.1.zip) ## Speech Recognition Model from paddle 1.8 -| Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER | Hours of speech | -| :----------------------------------------------------------: | :----------------------------: | :---------: | -----: | :------------------------------------------------- | :----- | :----- | :-------------- | -| [Ds2 Offline Aishell model](https://deepspeech.bj.bcebos.com/mandarin_models/aishell_model_v1.8_to_v2.x.tar.gz) | Aishell Dataset | Char-based | 234 MB | 2 Conv + 3 bidirectional GRU layers | 0.0804 | - | 151 h | -| [Ds2 Offline Librispeech &#10;
model](https://deepspeech.bj.bcebos.com/eng_models/librispeech_v1.8_to_v2.x.tar.gz) | Librispeech Dataset | Word-based | 307 MB | 2 Conv + 3 bidirectional sharing weight RNN layers | - | 0.0685 | 960 h | -| [Ds2 Offline Baidu en8k model](https://deepspeech.bj.bcebos.com/eng_models/baidu_en8k_v1.8_to_v2.x.tar.gz) | Baidu Internal English Dataset | Word-based | 273 MB | 2 Conv + 3 bidirectional GRU layers | - | 0.0541 | 8628 h | +| Acoustic Model |Training Data| Token-based | Size | Descriptions | CER | WER | Hours of speech | +| :-----:| :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | +| [Ds2 Offline Aishell model](https://deepspeech.bj.bcebos.com/mandarin_models/aishell_model_v1.8_to_v2.x.tar.gz) | Aishell Dataset | Char-based | 234 MB | 2 Conv + 3 bidirectional GRU layers | 0.0804 | — | 151 h | +| [Ds2 Offline Librispeech model](https://deepspeech.bj.bcebos.com/eng_models/librispeech_v1.8_to_v2.x.tar.gz) | Librispeech Dataset | Word-based | 307 MB | 2 Conv + 3 bidirectional sharing weight RNN layers | — | 0.0685 | 960 h | +| [Ds2 Offline Baidu en8k model](https://deepspeech.bj.bcebos.com/eng_models/baidu_en8k_v1.8_to_v2.x.tar.gz) | Baidu Internal English Dataset | Word-based | 273 MB | 2 Conv + 3 bidirectional GRU layers |— | 0.0541 | 8628 h| diff --git a/docs/source/tts/tts_papers.md b/docs/source/tts/tts_papers.md new file mode 100644 index 0000000000000000000000000000000000000000..2b35b8852157295b4bcd7cde17ca67d7b72e2ede --- /dev/null +++ b/docs/source/tts/tts_papers.md @@ -0,0 +1,42 @@ +# TTS Papers +## Text Frontend +### Polyphone +- [【g2pM】g2pM: A Neural Grapheme-to-Phoneme Conversion Package for Mandarin Chinese Based on a New Open Benchmark Dataset](https://arxiv.org/abs/2004.03136) +- [Disambiguation of Chinese Polyphones in an End-to-End Framework with Semantic Features Extracted by Pre-trained BERT](https://www1.se.cuhk.edu.hk/~hccl/publications/pub/201909_INTERSPEECH_DongyangDAI.pdf) +### Text Normalization +#### English +- [applenob/text_normalization](https://github.com/applenob/text_normalization) +### G2P +#### English +- [cmusphinx/g2p-seq2seq](https://github.com/cmusphinx/g2p-seq2seq) + +## Acoustic Models +- [【AdaSpeech3】AdaSpeech 3: Adaptive Text to Speech for Spontaneous Style](https://arxiv.org/abs/2107.02530) +- [【AdaSpeech2】AdaSpeech 2: Adaptive Text to Speech with Untranscribed Data](https://arxiv.org/abs/2104.09715) +- [【AdaSpeech】AdaSpeech: Adaptive Text to Speech for Custom Voice](https://arxiv.org/abs/2103.00993) +- [【FastSpeech2】FastSpeech 2: Fast and High-Quality End-to-End Text to Speech](https://arxiv.org/abs/2006.04558) +- [【FastPitch】FastPitch: Parallel Text-to-speech with Pitch Prediction](https://arxiv.org/abs/2006.06873) +- [【SpeedySpeech】SpeedySpeech: Efficient Neural Speech Synthesis](https://arxiv.org/abs/2008.03802) +- [【FastSpeech】FastSpeech: Fast, Robust and Controllable Text to Speech](https://arxiv.org/abs/1905.09263) +- [【Transformer TTS】Neural Speech Synthesis with Transformer Network](https://arxiv.org/abs/1809.08895) +- [【Tacotron2】Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions](https://arxiv.org/abs/1712.05884) + +## Vocoders +- [【RefineGAN】RefineGAN: Universally Generating Waveform Better than Ground Truth with Highly Accurate Pitch and Intensity Responses](https://arxiv.org/abs/2111.00962) +- [【Fre-GAN】Fre-GAN: Adversarial Frequency-consistent Audio Synthesis](https://arxiv.org/abs/2106.02297) +- [【StyleMelGAN】StyleMelGAN: An Efficient High-Fidelity Adversarial Vocoder with Temporal 
Adaptive Normalization](https://arxiv.org/abs/2011.01557) +- [【Multi-band MelGAN】Multi-band MelGAN: Faster Waveform Generation for High-Quality Text-to-Speech](https://arxiv.org/abs/2005.05106) +- [【HiFi-GAN】HiFi-GAN: Generative Adversarial Networks for Efficient and High Fidelity Speech Synthesis](https://arxiv.org/abs/2010.05646) +- [【VocGAN】VocGAN: A High-Fidelity Real-time Vocoder with a Hierarchically-nested Adversarial Network](https://arxiv.org/abs/2007.15256) +- [【Parallel WaveGAN】Parallel WaveGAN: A fast waveform generation model based on generative adversarial networks with multi-resolution spectrogram](https://arxiv.org/abs/1910.11480) +- [【MelGAN】MelGAN: Generative Adversarial Networks for Conditional Waveform Synthesis](https://arxiv.org/abs/1910.06711) +- [【WaveFlow】WaveFlow: A Compact Flow-based Model for Raw Audio](https://arxiv.org/abs/1912.01219) +- [【LPCNet】LPCNet: Improving Neural Speech Synthesis Through Linear Prediction](https://arxiv.org/abs/1810.11846) +- [【WaveRNN】Efficient Neural Audio Synthesis](https://arxiv.org/abs/1802.08435) +## GAN TTS + +- [【GAN TTS】High Fidelity Speech Synthesis with Adversarial Networks](https://arxiv.org/abs/1909.11646) + +## Voice Cloning +- [【SV2TTS】Transfer Learning from Speaker Verification to Multispeaker Text-to-Speech Synthesis](https://arxiv.org/abs/1806.04558) +- [【GE2E】Generalized End-to-End Loss for Speaker Verification](https://arxiv.org/abs/1710.10467) diff --git a/docs/topic/ctc/ctc_loss_compare.ipynb b/docs/topic/ctc/ctc_loss_compare.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..95b2af5085b79a76ba7898089327c1b278921dac --- /dev/null +++ b/docs/topic/ctc/ctc_loss_compare.ipynb @@ -0,0 +1,520 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "ff6ff1e0", + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "33af5f76", + "metadata": {}, + "outputs": [], + "source": [ + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "9b566b73", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Cloning into 'warp-ctc'...\n", + "remote: Enumerating objects: 829, done.\u001b[K\n", + "remote: Total 829 (delta 0), reused 0 (delta 0), pack-reused 829\u001b[K\n", + "Receiving objects: 100% (829/829), 388.85 KiB | 140.00 KiB/s, done.\n", + "Resolving deltas: 100% (419/419), done.\n", + "Checking connectivity... 
done.\n" + ] + } + ], + "source": [ + "!git clone https://github.com/SeanNaren/warp-ctc.git" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "4a087a09", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/workspace/zhanghui/DeepSpeech-2.x/docs/topic/ctc/warp-ctc\n" + ] + } + ], + "source": [ + "%cd warp-ctc" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "f55dc29a", + "metadata": {}, + "outputs": [], + "source": [ + "mkdir -p build" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "fe79f4cf", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/workspace/zhanghui/DeepSpeech-2.x/docs/topic/ctc/warp-ctc/build\n" + ] + } + ], + "source": [ + "cd build" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "3d25c718", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "-- The C compiler identification is GNU 5.4.0\n", + "-- The CXX compiler identification is GNU 5.4.0\n", + "-- Check for working C compiler: /usr/bin/cc\n", + "-- Check for working C compiler: /usr/bin/cc -- works\n", + "-- Detecting C compiler ABI info\n", + "-- Detecting C compiler ABI info - done\n", + "-- Detecting C compile features\n", + "-- Detecting C compile features - done\n", + "-- Check for working CXX compiler: /usr/bin/c++\n", + "-- Check for working CXX compiler: /usr/bin/c++ -- works\n", + "-- Detecting CXX compiler ABI info\n", + "-- Detecting CXX compiler ABI info - done\n", + "-- Detecting CXX compile features\n", + "-- Detecting CXX compile features - done\n", + "-- Looking for pthread.h\n", + "-- Looking for pthread.h - found\n", + "-- Performing Test CMAKE_HAVE_LIBC_PTHREAD\n", + "-- Performing Test CMAKE_HAVE_LIBC_PTHREAD - Failed\n", + "-- Looking for pthread_create in pthreads\n", + "-- Looking for pthread_create in pthreads - not found\n", + "-- Looking for pthread_create in pthread\n", + "-- Looking for pthread_create in pthread - found\n", + "-- Found Threads: TRUE \n", + "-- Found CUDA: /usr/local/cuda (found suitable version \"10.2\", minimum required is \"6.5\") \n", + "-- cuda found TRUE\n", + "-- Building shared library with GPU support\n", + "-- Configuring done\n", + "-- Generating done\n", + "-- Build files have been written to: /workspace/zhanghui/DeepSpeech-2.x/docs/topic/ctc/warp-ctc/build\n" + ] + } + ], + "source": [ + "!cmake .." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "7a4238f1", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[ 11%] \u001b[34m\u001b[1mBuilding NVCC (Device) object CMakeFiles/warpctc.dir/src/warpctc_generated_reduce.cu.o\u001b[0m\n", + "[ 22%] \u001b[34m\u001b[1mBuilding NVCC (Device) object CMakeFiles/warpctc.dir/src/warpctc_generated_ctc_entrypoint.cu.o\u001b[0m\n", + "\u001b[35m\u001b[1mScanning dependencies of target warpctc\u001b[0m\n", + "[ 33%] \u001b[32m\u001b[1mLinking CXX shared library libwarpctc.so\u001b[0m\n", + "[ 33%] Built target warpctc\n", + "[ 44%] \u001b[34m\u001b[1mBuilding NVCC (Device) object CMakeFiles/test_gpu.dir/tests/test_gpu_generated_test_gpu.cu.o\u001b[0m\n", + "\u001b[35m\u001b[1mScanning dependencies of target test_cpu\u001b[0m\n", + "[ 55%] \u001b[32mBuilding CXX object CMakeFiles/test_cpu.dir/tests/test_cpu.cpp.o\u001b[0m\n", + "[ 66%] \u001b[32mBuilding CXX object CMakeFiles/test_cpu.dir/tests/random.cpp.o\u001b[0m\n", + "[ 77%] \u001b[32m\u001b[1mLinking CXX executable test_cpu\u001b[0m\n", + "[ 77%] Built target test_cpu\n", + "\u001b[35m\u001b[1mScanning dependencies of target test_gpu\u001b[0m\n", + "[ 88%] \u001b[32mBuilding CXX object CMakeFiles/test_gpu.dir/tests/random.cpp.o\u001b[0m\n", + "[100%] \u001b[32m\u001b[1mLinking CXX executable test_gpu\u001b[0m\n", + "[100%] Built target test_gpu\n" + ] + } + ], + "source": [ + "!make -j" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "31761a31", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/workspace/zhanghui/DeepSpeech-2.x/docs/topic/ctc/warp-ctc\n" + ] + } + ], + "source": [ + "cd .." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "f53316f6", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/workspace/zhanghui/DeepSpeech-2.x/docs/topic/ctc/warp-ctc/pytorch_binding\n" + ] + } + ], + "source": [ + "cd pytorch_binding" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "084f1e49", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "running install\n", + "running bdist_egg\n", + "running egg_info\n", + "creating warpctc_pytorch.egg-info\n", + "writing warpctc_pytorch.egg-info/PKG-INFO\n", + "writing dependency_links to warpctc_pytorch.egg-info/dependency_links.txt\n", + "writing top-level names to warpctc_pytorch.egg-info/top_level.txt\n", + "writing manifest file 'warpctc_pytorch.egg-info/SOURCES.txt'\n", + "writing manifest file 'warpctc_pytorch.egg-info/SOURCES.txt'\n", + "installing library code to build/bdist.linux-x86_64/egg\n", + "running install_lib\n", + "running build_py\n", + "creating build\n", + "creating build/lib.linux-x86_64-3.9\n", + "creating build/lib.linux-x86_64-3.9/warpctc_pytorch\n", + "copying warpctc_pytorch/__init__.py -> build/lib.linux-x86_64-3.9/warpctc_pytorch\n", + "running build_ext\n", + "building 'warpctc_pytorch._warp_ctc' extension\n", + "creating /workspace/zhanghui/DeepSpeech-2.x/docs/topic/ctc/warp-ctc/pytorch_binding/build/temp.linux-x86_64-3.9\n", + "creating /workspace/zhanghui/DeepSpeech-2.x/docs/topic/ctc/warp-ctc/pytorch_binding/build/temp.linux-x86_64-3.9/src\n", + "Emitting ninja build file /workspace/zhanghui/DeepSpeech-2.x/docs/topic/ctc/warp-ctc/pytorch_binding/build/temp.linux-x86_64-3.9/build.ninja...\n", + "Compiling objects...\n", + "Allowing ninja to set a default 
number of workers... (overridable by setting the environment variable MAX_JOBS=N)\n", + "[1/1] c++ -MMD -MF /workspace/zhanghui/DeepSpeech-2.x/docs/topic/ctc/warp-ctc/pytorch_binding/build/temp.linux-x86_64-3.9/src/binding.o.d -pthread -B /workspace/zhanghui/DeepSpeech-2.x/tools/venv/compiler_compat -Wl,--sysroot=/ -Wno-unused-result -Wsign-compare -DNDEBUG -O2 -Wall -fPIC -O2 -isystem /workspace/zhanghui/DeepSpeech-2.x/tools/venv/include -fPIC -O2 -isystem /workspace/zhanghui/DeepSpeech-2.x/tools/venv/include -fPIC -I/workspace/zhanghui/DeepSpeech-2.x/docs/topic/ctc/warp-ctc/include -I/workspace/zhanghui/DeepSpeech-2.x/tools/venv/lib/python3.9/site-packages/torch/include -I/workspace/zhanghui/DeepSpeech-2.x/tools/venv/lib/python3.9/site-packages/torch/include/torch/csrc/api/include -I/workspace/zhanghui/DeepSpeech-2.x/tools/venv/lib/python3.9/site-packages/torch/include/TH -I/workspace/zhanghui/DeepSpeech-2.x/tools/venv/lib/python3.9/site-packages/torch/include/THC -I/usr/local/cuda/include -I/workspace/zhanghui/DeepSpeech-2.x/tools/venv/include/python3.9 -c -c /workspace/zhanghui/DeepSpeech-2.x/docs/topic/ctc/warp-ctc/pytorch_binding/src/binding.cpp -o /workspace/zhanghui/DeepSpeech-2.x/docs/topic/ctc/warp-ctc/pytorch_binding/build/temp.linux-x86_64-3.9/src/binding.o -std=c++14 -fPIC -DWARPCTC_ENABLE_GPU -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE=\"_gcc\"' '-DPYBIND11_STDLIB=\"_libstdcpp\"' '-DPYBIND11_BUILD_ABI=\"_cxxabi1011\"' -DTORCH_EXTENSION_NAME=_warp_ctc -D_GLIBCXX_USE_CXX11_ABI=0\n", + "g++ -pthread -B /workspace/zhanghui/DeepSpeech-2.x/tools/venv/compiler_compat -Wl,--sysroot=/ -shared -Wl,-rpath,/workspace/zhanghui/DeepSpeech-2.x/tools/venv/lib -Wl,-rpath-link,/workspace/zhanghui/DeepSpeech-2.x/tools/venv/lib -L/workspace/zhanghui/DeepSpeech-2.x/tools/venv/lib -Wl,-rpath,/workspace/zhanghui/DeepSpeech-2.x/tools/venv/lib -Wl,-rpath-link,/workspace/zhanghui/DeepSpeech-2.x/tools/venv/lib -L/workspace/zhanghui/DeepSpeech-2.x/tools/venv/lib /workspace/zhanghui/DeepSpeech-2.x/docs/topic/ctc/warp-ctc/pytorch_binding/build/temp.linux-x86_64-3.9/src/binding.o -L/workspace/zhanghui/DeepSpeech-2.x/docs/topic/ctc/warp-ctc/build -L/workspace/zhanghui/DeepSpeech-2.x/tools/venv/lib/python3.9/site-packages/torch/lib -L/usr/local/cuda/lib64 -lwarpctc -lc10 -ltorch -ltorch_cpu -ltorch_python -lcudart -lc10_cuda -ltorch_cuda -o build/lib.linux-x86_64-3.9/warpctc_pytorch/_warp_ctc.cpython-39-x86_64-linux-gnu.so -Wl,-rpath,/workspace/zhanghui/DeepSpeech-2.x/docs/topic/ctc/warp-ctc/build\n", + "creating build/bdist.linux-x86_64\n", + "creating build/bdist.linux-x86_64/egg\n", + "creating build/bdist.linux-x86_64/egg/warpctc_pytorch\n", + "copying build/lib.linux-x86_64-3.9/warpctc_pytorch/__init__.py -> build/bdist.linux-x86_64/egg/warpctc_pytorch\n", + "copying build/lib.linux-x86_64-3.9/warpctc_pytorch/_warp_ctc.cpython-39-x86_64-linux-gnu.so -> build/bdist.linux-x86_64/egg/warpctc_pytorch\n", + "byte-compiling build/bdist.linux-x86_64/egg/warpctc_pytorch/__init__.py to __init__.cpython-39.pyc\n", + "creating stub loader for warpctc_pytorch/_warp_ctc.cpython-39-x86_64-linux-gnu.so\n", + "byte-compiling build/bdist.linux-x86_64/egg/warpctc_pytorch/_warp_ctc.py to _warp_ctc.cpython-39.pyc\n", + "creating build/bdist.linux-x86_64/egg/EGG-INFO\n", + "copying warpctc_pytorch.egg-info/PKG-INFO -> build/bdist.linux-x86_64/egg/EGG-INFO\n", + "copying warpctc_pytorch.egg-info/SOURCES.txt -> build/bdist.linux-x86_64/egg/EGG-INFO\n", + "copying warpctc_pytorch.egg-info/dependency_links.txt -> 
build/bdist.linux-x86_64/egg/EGG-INFO\n", + "copying warpctc_pytorch.egg-info/top_level.txt -> build/bdist.linux-x86_64/egg/EGG-INFO\n", + "writing build/bdist.linux-x86_64/egg/EGG-INFO/native_libs.txt\n", + "zip_safe flag not set; analyzing archive contents...\n", + "warpctc_pytorch.__pycache__._warp_ctc.cpython-39: module references __file__\n", + "creating dist\n", + "creating 'dist/warpctc_pytorch-0.1-py3.9-linux-x86_64.egg' and adding 'build/bdist.linux-x86_64/egg' to it\n", + "removing 'build/bdist.linux-x86_64/egg' (and everything under it)\n", + "Processing warpctc_pytorch-0.1-py3.9-linux-x86_64.egg\n", + "removing '/workspace/zhanghui/DeepSpeech-2.x/tools/venv/lib/python3.9/site-packages/warpctc_pytorch-0.1-py3.9-linux-x86_64.egg' (and everything under it)\n", + "creating /workspace/zhanghui/DeepSpeech-2.x/tools/venv/lib/python3.9/site-packages/warpctc_pytorch-0.1-py3.9-linux-x86_64.egg\n", + "Extracting warpctc_pytorch-0.1-py3.9-linux-x86_64.egg to /workspace/zhanghui/DeepSpeech-2.x/tools/venv/lib/python3.9/site-packages\n", + "warpctc-pytorch 0.1 is already the active version in easy-install.pth\n", + "\n", + "Installed /workspace/zhanghui/DeepSpeech-2.x/tools/venv/lib/python3.9/site-packages/warpctc_pytorch-0.1-py3.9-linux-x86_64.egg\n", + "Processing dependencies for warpctc-pytorch==0.1\n", + "Finished processing dependencies for warpctc-pytorch==0.1\n" + ] + } + ], + "source": [ + "!python setup.py install" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "ee4ca9e3", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Python 3.9.5\r\n" + ] + } + ], + "source": [ + "!python -V" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "59255ed8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/workspace/zhanghui/DeepSpeech-2.x/docs/topic/ctc/warp-ctc\n" + ] + } + ], + "source": [ + "cd .." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "1dae09b9", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "grep: warning: GREP_OPTIONS is deprecated; please use an alias or script\n" + ] + } + ], + "source": [ + "import torch\n", + "import torch.nn as nn\n", + "import warpctc_pytorch as wp\n", + "import paddle.nn as pn\n", + "import paddle" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "83d0762e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'1.10.0+cu102'" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "torch.__version__" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "62501e2c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'2.2.0'" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "paddle.__version__" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "9e8e0f40", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "torch.Size([2, 1, 5])\n", + "2.4628584384918213\n", + "[[[ 0.17703122 -0.70812464 0.17703122 0.17703122 0.17703122]]\n", + "\n", + " [[ 0.17703122 0.17703122 -0.70812464 0.17703122 0.17703122]]]\n" + ] + } + ], + "source": [ + "probs = torch.FloatTensor([[\n", + " [0.1, 0.6, 0.1, 0.1, 0.1], [0.1, 0.1, 0.6, 0.1, 0.1]\n", + " ]]).transpose(0, 1).contiguous()\n", + "print(probs.size())\n", + "labels = torch.IntTensor([1, 2])\n", + "label_sizes = torch.IntTensor([2])\n", + "probs_sizes = torch.IntTensor([2])\n", + "probs.requires_grad_(True)\n", + "bs = probs.size(1)\n", + "\n", + "ctc_loss = wp.CTCLoss(size_average=False, length_average=False)\n", + "cost = ctc_loss(probs, labels, probs_sizes, label_sizes)\n", + "cost = cost.sum() / bs\n", + "print(cost.item())\n", + "cost.backward()\n", + "print(probs.grad.numpy())" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "2cd46569", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2.4628584384918213\n", + "[[[ 0.1770312 -0.7081248 0.1770312 0.1770312 0.1770312]]\n", + "\n", + " [[ 0.1770312 0.1770312 -0.7081248 0.1770312 0.1770312]]]\n" + ] + } + ], + "source": [ + "probs = torch.FloatTensor([[\n", + " [0.1, 0.6, 0.1, 0.1, 0.1], [0.1, 0.1, 0.6, 0.1, 0.1]\n", + " ]]).transpose(0, 1).contiguous()\n", + "labels = torch.IntTensor([1, 2])\n", + "label_sizes = torch.IntTensor([2])\n", + "probs_sizes = torch.IntTensor([2])\n", + "probs.requires_grad_(True)\n", + "bs = probs.size(1)\n", + "\n", + "log_probs = torch.log_softmax(probs, axis=-1)\n", + "\n", + "ctc_loss1 = nn.CTCLoss(reduction='none')\n", + "cost = ctc_loss1(log_probs, labels, probs_sizes, label_sizes)\n", + "cost = cost.sum() / bs\n", + "print(cost.item())\n", + "cost.backward()\n", + "print(probs.grad.numpy())" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "85c3461a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2, 1, 5]\n", + "[1, 2]\n", + "2.4628584384918213\n", + "[[[ 0.17703122 -0.70812464 0.17703122 0.17703122 0.17703122]]\n", + "\n", + " [[ 0.17703122 0.17703122 -0.70812464 0.17703122 0.17703122]]]\n" + ] + } + ], + "source": [ + "paddle.set_device('cpu')\n", + "probs = paddle.to_tensor([[\n", + " [0.1, 0.6, 0.1, 0.1, 0.1], [0.1, 0.1, 0.6, 0.1, 0.1],\n", + " 
]]).transpose([1,0,2])\n", + "print(probs.shape) # (T, B, D)\n", + "labels = paddle.to_tensor([[1, 2]], dtype='int32') #(B,L)\n", + "print(labels.shape)\n", + "label_sizes = paddle.to_tensor([2], dtype='int64')\n", + "probs_sizes = paddle.to_tensor([2], dtype='int64')\n", + "bs = paddle.shape(probs)[1]\n", + "probs.stop_gradient=False\n", + "\n", + "ctc_loss = pn.CTCLoss(reduction='none')\n", + "cost = ctc_loss(probs, labels, probs_sizes, label_sizes)\n", + "cost = cost.sum() / bs\n", + "print(cost.item())\n", + "cost.backward()\n", + "print(probs.grad.numpy())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d390cd91", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/topic/frontend/g2p.md b/docs/topic/frontend/g2p.md new file mode 100644 index 0000000000000000000000000000000000000000..7713420a1c5dd0d96f3bcb99ccd006a27b87c4dd --- /dev/null +++ b/docs/topic/frontend/g2p.md @@ -0,0 +1,174 @@ +# g2p 字典设计 + +本文主要讲语音合成的 g2p (grapheme to phoneme) 部分。 + +代码: [generate_lexicon.py](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/examples/other/mfa/local/generate_lexicon.py) (代码可能与此处的描述有些许出入,以代码为准,生成的带 tone 带儿化的 pinyin 字典参考 [simple.lexicon](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/examples/csmsc/tts3/local/simple.lexicon)) + +## ARPAbet +对于英文 TTS,常用的 g2p 是通过查询 CMUDict 来实现,而 CMUDict 注音使用的系统是 ARPAbet,具体含义参见 [CMU 发音词典](http://www.speech.cs.cmu.edu/cgi-bin/cmudict/)。 + +它包含 39 个 phoneme, 不包含音词汇重音的变体: + +| Phoneme | Example | Translation | +|:-------------:|:-------:|:-----------:| +| AA | odd | AA D | +| AE | at | AE T | +| AH | hut | HH AH T | +| AO | ought | AO T | +| AW | cow | K AW | +| AY | hide | HH AY D | +| B | be | B IY | +| CH | cheese | CH IY Z | +| D | dee | D IY | +| DH | thee | DH IY | +| EH | Ed | EH D | +| ER | hurt | HH ER T | +| EY | ate | EY T | +| F | fee | F IY | +| G | green | G R IY N | +| HH | he | HH IY | +| IH | it | IH T | +| IY | eat | IY T | +| JH | gee | JH IY | +| K | key | K IY | +| L | lee | L IY | +| M | me | M IY | +| N | knee | N IY | +| NG | ping | P IH NG | +| OW | oat | OW T | +| OY | toy | T OY | +| P | pee | P IY | +| R | read | R IY D | +| S | sea | S IY | +| SH | she | SH IY | +| T | tea | T IY | +| TH | theta | TH EY T AH| +| UH | hood | HH UH D | +| UW | two | T UW | +| V | vee | V IY | +| W | we | W IY | +| Y | yield | Y IY L D | +| Z | zee | Z IY | +| ZH | seizure| S IY ZH ER| + +另外还包含三个重音标记, + +0 — No stress +1 — Primary stress +2 — Secondary stress + +其中重音标记附在元音后面。当只需要音标而不需要重音标记的时候也可以直接省略。 + +CMUDict 只是一个词典,当出现了不在词典中的词时(OOV),可以求助其他工具可以根据拼写得到对应的发音,如: + - [Lexicon Tool](http://www.speech.cs.cmu.edu/tools) + - [g2p-seq2seq](https://github.com/cmusphinx/g2p-seq2seq) + +## 中文注音系统 + +中文普通话的注音系统存在许多套,比如汉语拼音 (pinyin), 注音符号 (bopomofo), 国语注音符第二式, 威妥玛拼音等。而且有一些并非注音方案,是拉丁化方案,因此为了符号系统的经济性,会做一些互补符号的简并,比如汉语拼音中的 `i` 的代表了三个音位, `e` 代表了两个音位(单用的情况很少, 单用时写作 `ê`);也有一些简写,比如 `bpmf` 后的 `o` 是 `uo` 的简写, `ui` 是 `uei` 的简写,` iu` 是 `iou` 的简写, `un` 是 `uen` 的简写, `ao` 是为了书写避免形近而改掉的 `au`, `y` 和 `w` 是为了连续书写时作为分隔而产生的零声母, `ü` 在 `j`、 `q`、 `x` 后面省略两点(中国大陆使用美式键盘打字的时候,一般只有在“女”、 “律”、“略”和“虐”这一类的字里面用 `v` 
代替 `ü`,而在 `j`、 `q`、 `x` 后面的时候则仍用 `u` ),有鼻韵母 `uang` 而没有 `ueng`,但是又有 `weng` 这个音节之类的问题, 有 `ong` 韵母但是又没有单用的情形。其实这些都是汉语拼音作为拉丁化方案而做的一系列的修改。 + +另外,汉语的声调是用了特殊符号来标调型,用字母记录的时候常用 `12345` 或者 `1234`、轻声不标等手段。 + +另外还有两个比较突出的问题是**儿化**和**变调**(参考 [zh_text_frontend](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/tts/zh_text_frontend.md))。对于具体的数据集,也可能有不同的标注方案。一般我们为汉字标音是标字调而不标变调,但是**标贝数据集是标变调的**(但是也没有把所有的变调都正确标出来)。儿化在汉语书写和拼音中也是一个比较麻烦的事情,虽然正字法中说到可以用小字号的儿表示儿化,但是这种发音由字号这种排版要素来表达的手法未免过于崎岖,所以鲜见有人真的这么排版,只有在某些书籍中,强调此事的时候见过。另外,在儿化的标音方式上,鼻韵母需要去掉韵尾然后换成 r,这么一来,如果直接抽取拼音的字符串表示,那么可能出现的音节就会超过 1400, 甚至进入一种含糊的状态,不清楚一共有多少个有效音节,即使是韵母,也会因此扩展近一倍。 + +因为存在这样的情形,再考虑到不同的数据集自带的拼音 transcription 的风格可能不同,所以需要考虑进行转换,在内部转成统一的表示。既然这个过程是必要的,那么我们可以大胆设计一个内部方案。 + +这里设计的原则是: + +1. 有效符号集仅切分为声母和韵母,不作声母,介音,韵腹,韵尾的切分; + +2. 尽可能把不同的音用不同的符号表示,比如 `i` 和 `e` 会被分别拆分为 3 个和 2 个符号, `u` 和 `ü` 开头的韵母分开,这是出于对 TTS 系统的考虑:我们选择尽量反映语音的现实情况,而不把注音系统里面的奇怪规则留给模型去学习; + +3. 不包含零声母 `y`, `w` 之类的形式上的符号,因为如果这些符号不发声或者发声极短,那么可以不加入音符序列中,以期待 attention 更对角; + +4. 声调和韵母不结合为一个符号,而是分开,这样可以**减少词汇量**,使得符号的 embedding 得到更充分的训练,也更能反映声调语言的特点(数据集少时推荐这么做); + +5. 儿化的标音方式采用拆分的方式处理, 但是增设一个特殊符号 `&r` 来表示儿化的 `r`,它和一般的 `er` 不同,以区分实际读音上的差别。 + +6. 更加贴近注音符号,把 `in` 写作 `ien`,`ing` 写作 `ieng`, `un` 写作 `uen`, `ong` 写作 `ueng`, `iong` 写作 `üeng`。其中 `in` 和 `ing` 的转写纯属偏好,无论用什么符号写,都可以被转为一个 index, 只要它们的使用情况不发生变化就可以。而 `ong` 写作 `ueng` 则是有实际差别的,如果 `ong` 作为一个韵母,那么 `weng` 经过修改之后会变成 `ueng`, 就会同时有 `ueng` 和 `ong`。而如果不细究音值上的微妙差异,`ong` 就是 `ung` 的一种奇怪表示, 在注音符号中, 它就记作 `ㄨㄥ`。而 `iong` 则是 `ㄩㄥ`。 + +7. `ui`, `iu` 都展开为 `uei` 和 `iou`, 纯属偏好,对实际结果没有影响。`bpmf` 后的 `o` 展开为 `uo`,这个则是为了和单独的 `o` 区分开(“哦”和“波”里面的韵母的发音其实不同)。 + +8. 所有的 `ü` 都用 `v` 代替,无论是单独作韵母, 还是在复韵母和鼻韵母中。 + +9. 把停顿以 `#1` 等方式纳入其中, 把 `` `` `` `` 这些为了处理符号系列的特殊符号也加入其中,多一些特殊词汇并不会对 Embedding 产生什么影响。 + +于是我们可以通过一套规则系统,把标贝的**拼音标注**转换成我们需要的形式。(当然,如果别的数据集的实际标注不同,那么转换规则也要作一些修改) + +在实际使用中文数据集时,我们仅使用其提供的**拼音标注**,而不使用**音素标注**(PhoneLabel),因为不同的数据集有不同的标注规则,而且有的数据集是没有**音素标注**的(如 aishell3)。 + +我们的做法和维基百科上的[汉语拼音音节列表](https://zh.wikipedia.org/zh-hans/%E6%B1%89%E8%AF%AD%E6%8B%BC%E9%9F%B3%E9%9F%B3%E8%8A%82%E5%88%97%E8%A1%A8)更接近。 + +转换之后,符号列表是: + +声母基本没有什么争议,共 21 个: +|声母| +|:--:| +|b| +|p| +|m| +|f| +|d| +|t| +|n| +|l| +|g| +|k| +|h| +|j| +|q| +|x| +|zh| +|ch| +|sh| +|r| +|z| +|c| +|s| + +韵母和儿化韵尾(共 41 个): +|韵母|解释| +|:----:|:-----------: | +|ii |`zi`,`ci`, `si` 里面的韵母 `i`| +|iii |`zhi`, `chi`, `shi`, `ri` 里面的韵母 `i`| +|a |啊,卡| +|o |哦| +|e |恶,个| +|ea |ê| +|ai |爱,在| +|ei |诶,薇| +|ao |奥,脑| +|ou |欧,勾| +|an |安,单| +|en |恩,痕| +|ang |盎,刚| +|eng |嗯,更| +|er |儿| +|i |一| +|ia |鸦,家| +|io |哟| +|ie |叶,界| +|iai |崖(台语发音)| +|iao |要,教| +|iou |有,久| +|ian |言,眠| +|ien |因,新| +|iang |样,降| +|ieng |英,晶| +|u |无,卢| +|ua |哇,瓜| +|uo |我,波| +|uai |外,怪| +|uei |位,贵| +|uan |万,乱| +|uen |问,论| +|uang |网,光| +|ueng |翁,共| +|v |玉,曲,`ü`| +|ve |月,却| +|van |源,倦| +|ven |韵,君| +|veng |永,炯| +|&r |儿化韵尾| diff --git a/docs/topic/gan_vocoder/gan_vocoder.ipynb b/docs/topic/gan_vocoder/gan_vocoder.ipynb index d214a81e2071db2ed048c8bee6885af7d491ac4d..edb4eeb1dd9ed223851ebea77a95a892ef9da271 100644 --- a/docs/topic/gan_vocoder/gan_vocoder.ipynb +++ b/docs/topic/gan_vocoder/gan_vocoder.ipynb @@ -21,7 +21,11 @@ "|FB-RAWs|Filter Bank Random Window Discriminators|\n", "&#10;

\n", - "csmsc 数据集上 GAN Vocoder 整体对比\n", + "csmsc 数据集上 GAN Vocoder 整体对比如下, \n ", + "\n", + "测试机器:1 x Tesla V100-32G 40 core Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz\n ", + "\n", + "测试环境:Python 3.7.0, paddlepaddle 2.2.0\n", "\n", "Model|Date|Input|Generator
Loss|Discriminator
Loss|Need
Finetune|Training
Steps|Finetune
Steps|Batch
Size|ips
(gen only)
(gen + dis)|Static Model
Size (gen)|RTF
(GPU)|\n", ":-----:|:-----:|:-----:|:-----:|:-----:|:-----:|:-----:|:-----:|:-----:|:-----:|:-----:|:-----:|\n", diff --git a/docs/topic/package_release/python_package_release.md b/docs/topic/package_release/python_package_release.md new file mode 100644 index 0000000000000000000000000000000000000000..96d6f5f4ec502f0b82700d74c0897e18d45228ef --- /dev/null +++ b/docs/topic/package_release/python_package_release.md @@ -0,0 +1,173 @@ +# 简化安装与发包 + +## 问题: + +1. [如何去除 ubuntu 的 apt 安装依赖?](#conda-代替系统依赖) +2. [如何支持普通用户和开发者两种安装的需求,尽量减少普通用户所需的依赖?](#区分install模式和develop模式) +3. [如何进行 python 包的动态安装?](#python-包的动态安装) +4. [如何进行 python 项目编包?](#python-编包方法) +5. [发包前要有什么准备?](#关于发包前的准备工作) +6. [发 C++ 包需要注意的东西?](#manylinux) + + +## conda 代替系统依赖 + +conda 可以用来代替一些 apt-get 安装的系统依赖,这样可以让项目适用于除了 ubuntu 以外的系统。 + +使用 conda 可以安装 sox、 libsndfile、swig 等 paddlespeech 需要的依赖: + +```bash +conda install -y -c conda-forge sox libsndfile +``` + +部分系统会缺少 libbzip2 库,这个 paddlespeech 也是需要的,这也可以用 conda 安装: + +```bash +conda install -y -c bzip2 +``` + +conda 也可以安装 linux 的 C++ 的依赖: + +```bash +conda install -y -c gcc_linux-64=8.4.0 gxx_linux-64=8.4.0 +``` + +#### 剩余问题:使用 conda 环境编译 kenlm 失败。目前在 conda 环境下编译 kenlm 会出现链接失败的问题 + +目前知道需要的依赖: + +```bash +conda install -c conda-forge eigen boost cmake +``` + +## 区分install模式和develop模式 + +可以在 setup.py 中划分 install 的依赖(基本依赖)和 develop 的依赖 (开发者额外依赖)。 setup_info 中 `install_requires` 设置 install 的依赖,而在 `extras_require` 中设置 `develop` key 为 develop 的依赖。 +普通安装可以使用: + +```bash +pip install . +``` + +另外使用 pip 安装已发的包也是使用普通安装的: + +``` +pip install paddlespeech +``` + +而开发者可以使用如下方式安装,这样不仅会安装 install 的依赖,也会安装 develop 的依赖, 即:最后安装的依赖 = install 依赖 + develop 依赖: + +```bash +pip install -e .[develop] +``` + +## python 包的动态安装 + +可以使用 pip 包来实现动态安装: + +```python +import pip +if int(pip.__version__.split('.')[0]) > 9: + from pip._internal import main + else: + from pip import main + main(['install', package_name]) +``` + +## python 编包方法 + +#### 创建 pypi的账号 + +创建 pypi 账号 + +#### 下载 twine + +``` +pip install twine +``` + +#### python 编包 + +编写好 python 包的 setup.py, 然后使用如下命令编 wheel 包: + +```bash +python setup.py bdist_wheel +``` + +如果要编源码包,用如下命令: + +```bash +python setup.py sdist +``` + +#### 上传包 + +```bash +twine upload dist/wheel包 +``` + +输入账号和密码后就可以上传 wheel 包了 + +#### 关于python 包的发包信息 + +主要可以参考这个[文档](https://packaging.python.org/en/latest/guides/distributing-packages-using-setuptools/?highlight=find_packages) + + +## 关于发包前的准备工作 + +#### 拉分支 +在发包之前需要拉分支。例如需要发 0.1.0 版本的正式包,则需要拉一个 r0.1 的分支。并且在这个 r0.1 分支的包上面打 0.1.0 的tag。在拉分支之前可以选择性的使用 rc 版本发一个正式版前的试用包,例如0.1.0rc0,等到rc包测试通过后,再拉分支(如果是发 0.1.1 包,则 merge r0.1分支),打tag,完成发包。总体步骤可以总结为: + +- 用 develop 分支发 rc 包 +- rc 包通过后拉分支 +- 打 tag +- 发包 +- 编写 release note + + + +## ManyLinux + +为了让有 C++ 依赖的 pip wheel 包可以适用于更多的 linux 系统,需要降低其本身的 glibc 的依赖。这就需要让 pip wheel 包在 manylinux 的 docker 下编包。关于查看系统的 glibc 版本,可以使用命令:`ldd --version`。 + +### Manylinux + +关于 Manylinux,主要可以参考 Github 项目的说明[ github many linux](https://github.com/pypa/manylinux)。 +manylinux1 支持 Centos5以上, manylinux2010 支持 Centos 6 以上,manylinux2014 支持Centos 7 以上。 +目前使用 manylinux2010 基本可以满足所有的 linux 生产环境需求。(不建议使用manylinux1,系统较老,难度较大) + +### 拉取 manylinux2010 + +```bash +docker pull quay.io/pypa/manylinux1_x86_64 +``` + +### 使用 manylinux2010 + +启动 manylinux2010 docker。 + +```bash +docker run -it xxxxxx +``` + +在 manylinux2010 的docker环境自带 swig 和各种类型的 python 版本。这里注意不要自己下载 conda 来安装环境来编译 pip 包,要用 docker 本身的环境来编包。 +设置python: + +```bash +export PATH="/opt/python/cp37-cp37m/bin/:$PATH" +#export PATH="/opt/python/cp38-cp38/bin/:$PATH" +#export 
PATH="/opt/python/cp39-cp39/bin/:$PATH" +``` + +随后正常编包,编包后需要使用 [auditwheel](https://github.com/pypa/auditwheel) 来降低编好的wheel包的版本。 +显示 wheel 包的 glibc 依赖版本 + +```bash +auditwheel show wheel包 +``` + +降低 wheel包的版本 + +```bash +auditwheel repair wheel包 +``` diff --git a/docs/tutorial/cls/cls_tutorial.ipynb b/docs/tutorial/cls/cls_tutorial.ipynb index 9b8bfc11926e10d8c2158ab50d24b97e3ba23058..56b488adc167867bb2cdc7c6593e431a0c7d9699 100644 --- a/docs/tutorial/cls/cls_tutorial.ipynb +++ b/docs/tutorial/cls/cls_tutorial.ipynb @@ -2,9 +2,7 @@ "cells": [ { "cell_type": "markdown", - "metadata": { - "collapsed": false - }, + "metadata": {}, "source": [ "\"Fork\n", "\n", @@ -32,9 +30,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "%%HTML\n", @@ -45,9 +41,7 @@ }, { "cell_type": "markdown", - "metadata": { - "collapsed": false - }, + "metadata": {}, "source": [ "# 2. 音频和特征提取" ] @@ -55,9 +49,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "# 环境准备:安装paddlespeech和paddleaudio\n", @@ -67,9 +59,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "import warnings\n", @@ -82,9 +72,7 @@ }, { "cell_type": "markdown", - "metadata": { - "collapsed": false - }, + "metadata": {}, "source": [ "\n", "\n", @@ -98,9 +86,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "# 获取示例音频\n", @@ -111,9 +97,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "from paddleaudio import load\n", @@ -130,9 +114,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "!paddlespeech cls --input ./dog.wav" @@ -140,9 +122,7 @@ }, { "cell_type": "markdown", - "metadata": { - "collapsed": false - }, + "metadata": {}, "source": [ "## 2.2 音频特征提取\n", "\n", @@ -162,21 +142,20 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "import paddle\n", "import numpy as np\n", "\n", + "data, sr = load(file='./dog.wav', sr=32000, mono=True, dtype='float32')\n", "x = paddle.to_tensor(data)\n", "n_fft = 1024\n", "win_length = 1024\n", - "hop_length = 512\n", + "hop_length = 320\n", "\n", "# [D, T]\n", - "spectrogram = paddle.signal.stft(x, n_fft=1024, win_length=1024, hop_length=512, onesided=True) \n", + "spectrogram = paddle.signal.stft(x, n_fft=n_fft, win_length=win_length, hop_length=hop_length, onesided=True) \n", "print('spectrogram.shape: {}'.format(spectrogram.shape))\n", "print('spectrogram.dtype: {}'.format(spectrogram.dtype))\n", "\n", @@ -190,9 +169,7 @@ }, { "cell_type": "markdown", - "metadata": { - "collapsed": false - }, + "metadata": {}, "source": [ "### 2.2.2 LogFBank\n", "\n", @@ -220,13 +197,15 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "from paddleaudio.features import LogMelSpectrogram\n", "\n", + "f_min=50.0\n", + "f_max=14000.0\n", + "n_mels=64\n", + "\n", "# - sr: 音频文件的采样率。\n", "# - n_fft: FFT样本点个数。\n", "# - hop_length: 音频帧之间的间隔。\n", @@ -239,7 +218,9 @@ " hop_length=hop_length, \n", " win_length=win_length, \n", " window='hann', 
\n", - " n_mels=64)\n", + " f_min=f_min,\n", + " f_max=f_max,\n", + " n_mels=n_mels)\n", "\n", "x = paddle.to_tensor(data).unsqueeze(0) # [B, L]\n", "log_fbank = feature_extractor2(x) # [B, D, T]\n", @@ -253,9 +234,7 @@ }, { "cell_type": "markdown", - "metadata": { - "collapsed": false - }, + "metadata": {}, "source": [ "## 2.3 声音分类方法\n", "\n", @@ -272,9 +251,7 @@ }, { "cell_type": "markdown", - "metadata": { - "collapsed": false - }, + "metadata": {}, "source": [ "### 2.3.2 深度学习方法\n", "传统机器学习方法可以捕捉声音特征的差异(例如男声和女声的声音在音高上往往差异较大)并实现分类任务。\n", @@ -288,9 +265,7 @@ }, { "cell_type": "markdown", - "metadata": { - "collapsed": false - }, + "metadata": {}, "source": [ "### 2.3.3 Pretrain + Finetune\n", "\n", @@ -315,9 +290,7 @@ }, { "cell_type": "markdown", - "metadata": { - "collapsed": false - }, + "metadata": {}, "source": [ "# 3. 实践:环境声音分类\n", "\n", @@ -361,22 +334,18 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "from paddleaudio.datasets import ESC50\n", "\n", - "train_ds = ESC50(mode='train')\n", - "dev_ds = ESC50(mode='dev')" + "train_ds = ESC50(mode='train', sample_rate=sr)\n", + "dev_ds = ESC50(mode='dev', sample_rate=sr)" ] }, { "cell_type": "markdown", - "metadata": { - "collapsed": false - }, + "metadata": {}, "source": [ "### 3.1.2 特征提取\n", "通过下列代码,用 `paddleaudio.features.LogMelSpectrogram` 初始化一个音频特征提取器,在训练过程中实时提取音频的 LogFBank 特征,其中主要的参数如下: " @@ -385,19 +354,23 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ - "feature_extractor = LogMelSpectrogram(sr=44100, n_fft=n_fft, hop_length=hop_length, win_length=win_length, window='hann', n_mels=64)" + "feature_extractor = LogMelSpectrogram(\n", + " sr=sr, \n", + " n_fft=n_fft, \n", + " hop_length=hop_length, \n", + " win_length=win_length, \n", + " window='hann', \n", + " f_min=f_min,\n", + " f_max=f_max,\n", + " n_mels=n_mels)" ] }, { "cell_type": "markdown", - "metadata": { - "collapsed": false - }, + "metadata": {}, "source": [ "## 3.2 模型\n", "\n", @@ -409,9 +382,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "from paddlespeech.cls.models import cnn14\n", @@ -420,9 +391,7 @@ }, { "cell_type": "markdown", - "metadata": { - "collapsed": false - }, + "metadata": {}, "source": [ "### 3.2.2 构建分类模型\n", "\n", @@ -432,9 +401,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "import paddle.nn as nn\n", @@ -461,18 +428,14 @@ }, { "cell_type": "markdown", - "metadata": { - "collapsed": false - }, + "metadata": {}, "source": [ "## 3.3 Finetune" ] }, { "cell_type": "markdown", - "metadata": { - "collapsed": false - }, + "metadata": {}, "source": [ "1. 创建 DataLoader " ] @@ -480,9 +443,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "batch_size = 16\n", @@ -492,9 +453,7 @@ }, { "cell_type": "markdown", - "metadata": { - "collapsed": false - }, + "metadata": {}, "source": [ "2. 
定义优化器和 Loss" ] @@ -502,9 +461,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "optimizer = paddle.optimizer.Adam(learning_rate=1e-4, parameters=model.parameters())\n", @@ -513,19 +470,15 @@ }, { "cell_type": "markdown", - "metadata": { - "collapsed": false - }, + "metadata": {}, "source": [ "3. 启动模型训练 " ] }, { "cell_type": "code", - "execution_count": 15, - "metadata": { - "collapsed": false - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "from paddleaudio.utils import logger\n", @@ -603,9 +556,7 @@ }, { "cell_type": "markdown", - "metadata": { - "collapsed": false - }, + "metadata": {}, "source": [ "## 3.4 音频预测\n", "\n", @@ -615,16 +566,13 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "top_k = 10\n", "wav_file = './dog.wav'\n", "\n", - "waveform, sr = load(wav_file)\n", - "feature_extractor = LogMelSpectrogram(sr=sr, n_fft=n_fft, hop_length=hop_length, win_length=win_length, window='hann', n_mels=64)\n", + "waveform, _ = load(wav_file, sr)\n", "feats = feature_extractor(paddle.to_tensor(paddle.to_tensor(waveform).unsqueeze(0)))\n", "feats = paddle.transpose(feats, [0, 2, 1]) # [B, N, T] -> [B, T, N]\n", "print(feats.shape)\n", @@ -635,16 +583,14 @@ "sorted_indices = probs[0].argsort()\n", "\n", "msg = f'[{wav_file}]\\n'\n", - "for idx in sorted_indices[-top_k:]:\n", + "for idx in sorted_indices[-1:-top_k-1:-1]:\n", " msg += f'{ESC50.label_list[idx]}: {probs[0][idx]:.5f}\\n'\n", "print(msg)" ] }, { "cell_type": "markdown", - "metadata": { - "collapsed": false - }, + "metadata": {}, "source": [ "# 4. 作业\n", "1. 使用开发模式安装 [PaddleSpeech](https://github.com/PaddlePaddle/PaddleSpeech) \n", @@ -653,6 +599,7 @@ "1. 在 [MusicSpeech](http://marsyas.info/downloads/datasets.html) 数据集上完成 music/speech 二分类。 \n", "2. 在 [GTZAN Genre Collection](http://marsyas.info/downloads/datasets.html) 音乐分类数据集上利用 PANNs 预训练模型实现音乐类别十分类。\n", "\n", + "关于如何自定义分类数据集,请参考文档 [PaddleSpeech/docs/source/cls/custom_dataset.md](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/cls/custom_dataset.md)\n", "\n", "# 5. 
关注 PaddleSpeech\n", "\n", @@ -681,9 +628,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "py37", "language": "python", - "name": "py35-paddle1.2.0" + "name": "py37" }, "language_info": { "codemirror_mode": { @@ -695,7 +642,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.4" + "version": "3.7.7" } }, "nbformat": 4, diff --git a/examples/aishell/asr0/RESULTS.md b/examples/aishell/asr0/RESULTS.md index 8e85d1d58dab553ca25d17fcc48299b8452a0a59..5841a85220da92dd55ffb980e7e24ac398574709 100644 --- a/examples/aishell/asr0/RESULTS.md +++ b/examples/aishell/asr0/RESULTS.md @@ -1,12 +1,18 @@ # Aishell-1 +## Deepspeech2 Streaming + +| Model | Number of Params | Release | Config | Test set | Valid Loss | CER | +| --- | --- | --- | --- | --- | --- | --- | +| DeepSpeech2 | 45.18M | 2.2.0 | conf/deepspeech2_online.yaml + spec aug | test | 7.994938373565674 | 0.080 | + ## Deepspeech2 Non-Streaming -| Model | Params | Release | Config | Test set | Loss | CER | +| Model | Number of Params | Release | Config | Test set | Valid Loss | CER | | --- | --- | --- | --- | --- | --- | --- | | DeepSpeech2 | 58.4M | 2.2.0 | conf/deepspeech2.yaml + spec aug | test | 5.738585948944092 | 0.064000 | | DeepSpeech2 | 58.4M | 2.1.0 | conf/deepspeech2.yaml + spec aug | test | 7.483316898345947 | 0.077860 | | DeepSpeech2 | 58.4M | 2.1.0 | conf/deepspeech2.yaml | test | 7.299022197723389 | 0.078671 | | DeepSpeech2 | 58.4M | 2.0.0 | conf/deepspeech2.yaml | test | - | 0.078977 | | --- | --- | --- | --- | --- | --- | --- | -| DeepSpeech2 | 58.4M | 1.8.5 | - | test | - | 0.080447 | +| DeepSpeech2 | 58.4M | 1.8.5 | - | test | - | 0.080447 | diff --git a/examples/aishell/asr0/conf/deepspeech2.yaml b/examples/aishell/asr0/conf/deepspeech2.yaml index bdfa42199011973cdef445583663822273daac8f..fb69986474b853e81290623f066fc1c148533cf9 100644 --- a/examples/aishell/asr0/conf/deepspeech2.yaml +++ b/examples/aishell/asr0/conf/deepspeech2.yaml @@ -1,68 +1,64 @@ # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test - min_input_len: 0.0 - max_input_len: 27.0 # second - min_output_len: 0.0 - max_output_len: .inf - min_output_input_ratio: 0.00 - max_output_input_ratio: .inf +########################################### +# Data # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test +min_input_len: 0.0 +max_input_len: 27.0 # second +min_output_len: 0.0 +max_output_len: .inf +min_output_input_ratio: 0.00 +max_output_input_ratio: .inf -collator: - batch_size: 64 # one gpu - mean_std_filepath: data/mean_std.json - unit_type: char - vocab_filepath: data/lang_char/vocab.txt - augmentation_config: conf/augmentation.json - random_seed: 0 - spm_model_prefix: - spectrum_type: linear - feat_dim: - delta_delta: False - stride_ms: 10.0 - window_ms: 20.0 - n_fft: None - max_freq: None - target_sample_rate: 16000 - use_dB_normalization: True - target_dB: -20 - dither: 1.0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 +########################################### +# Dataloader # +########################################### +batch_size: 64 # one gpu +mean_std_filepath: data/mean_std.json +unit_type: char +vocab_filepath: data/lang_char/vocab.txt +augmentation_config: conf/augmentation.json +random_seed: 0 +spm_model_prefix: 
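+# note: feat_dim 161 on the next line follows from the dataloader settings below: a 20 ms window at target_sample_rate 16000 is 0.020 * 16000 = 320 samples, and a 320-point FFT yields 320 / 2 + 1 = 161 linear-spectrogram bins.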
+spectrum_type: linear +feat_dim: 161 +delta_delta: False +stride_ms: 10.0 +window_ms: 20.0 +n_fft: None +max_freq: None +target_sample_rate: 16000 +use_dB_normalization: True +target_dB: -20 +dither: 1.0 +keep_transcription_text: False +sortagrad: True +shuffle_method: batch_shuffle +num_workers: 2 -model: - num_conv_layers: 2 - num_rnn_layers: 3 - rnn_layer_size: 1024 - use_gru: True - share_rnn_weights: False - blank_id: 0 - ctc_grad_norm_type: instance +############################################ +# Network Architecture # +############################################ +num_conv_layers: 2 +num_rnn_layers: 3 +rnn_layer_size: 1024 +use_gru: True +share_rnn_weights: False +blank_id: 0 +ctc_grad_norm_type: instance -training: - n_epoch: 80 - accum_grad: 1 - lr: 2e-3 - lr_decay: 0.83 - weight_decay: 1e-06 - global_grad_clip: 3.0 - log_interval: 100 - checkpoint: - kbest_n: 50 - latest_n: 5 - -decoding: - batch_size: 128 - error_rate_type: cer - decoding_method: ctc_beam_search - lang_model_path: data/lm/zh_giga.no_cna_cmn.prune01244.klm - alpha: 1.9 - beta: 5.0 - beam_size: 300 - cutoff_prob: 0.99 - cutoff_top_n: 40 - num_proc_bsearch: 10 +########################################### +# Training # +########################################### +n_epoch: 80 +accum_grad: 1 +lr: 2.0e-3 +lr_decay: 0.83 +weight_decay: 1.0e-6 +global_grad_clip: 3.0 +log_interval: 100 +checkpoint: + kbest_n: 50 + latest_n: 5 diff --git a/examples/aishell/asr0/conf/deepspeech2_online.yaml b/examples/aishell/asr0/conf/deepspeech2_online.yaml index 2f63f4de0cc5101ce6b2cbe5d1153fbb85a25d1a..ef01ac595e12ea67a582cf1057f333e8b21e4741 100644 --- a/examples/aishell/asr0/conf/deepspeech2_online.yaml +++ b/examples/aishell/asr0/conf/deepspeech2_online.yaml @@ -1,70 +1,68 @@ # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test - min_input_len: 0.0 - max_input_len: 27.0 # second - min_output_len: 0.0 - max_output_len: .inf - min_output_input_ratio: 0.00 - max_output_input_ratio: .inf +########################################### +# Data # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test +min_input_len: 0.0 +max_input_len: 27.0 # second +min_output_len: 0.0 +max_output_len: .inf +min_output_input_ratio: 0.00 +max_output_input_ratio: .inf -collator: - batch_size: 64 # one gpu - mean_std_filepath: data/mean_std.json - unit_type: char - vocab_filepath: data/lang_char/vocab.txt - augmentation_config: conf/augmentation.json - random_seed: 0 - spm_model_prefix: - spectrum_type: linear #linear, mfcc, fbank - feat_dim: - delta_delta: False - stride_ms: 10.0 - window_ms: 20.0 - n_fft: None - max_freq: None - target_sample_rate: 16000 - use_dB_normalization: True - target_dB: -20 - dither: 1.0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 0 +########################################### +# Dataloader # +########################################### +batch_size: 64 # one gpu +mean_std_filepath: data/mean_std.json +unit_type: char +vocab_filepath: data/lang_char/vocab.txt +augmentation_config: conf/augmentation.json +random_seed: 0 +spm_model_prefix: +spectrum_type: linear #linear, mfcc, fbank +feat_dim: 161 +delta_delta: False +stride_ms: 10.0 +window_ms: 20.0 +n_fft: None +max_freq: None +target_sample_rate: 16000 +use_dB_normalization: True +target_dB: -20 +dither: 1.0 +keep_transcription_text: False 
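+# note: sortagrad on the next line enables the SortaGrad curriculum from the DeepSpeech2 paper, feeding utterances shortest-first early in training (the first epoch in the original recipe) to stabilize CTC; batch_shuffle then restores randomness.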
+sortagrad: True +shuffle_method: batch_shuffle +num_workers: 0 -model: - num_conv_layers: 2 - num_rnn_layers: 5 - rnn_layer_size: 1024 - rnn_direction: forward # [forward, bidirect] - num_fc_layers: 0 - fc_layers_size_list: -1, - use_gru: False - blank_id: 0 +############################################ +# Network Architecture # +############################################ +num_conv_layers: 2 +num_rnn_layers: 5 +rnn_layer_size: 1024 +rnn_direction: forward # [forward, bidirect] +num_fc_layers: 0 +fc_layers_size_list: -1, +use_gru: False +blank_id: 0 -training: - n_epoch: 65 - accum_grad: 1 - lr: 5e-4 - lr_decay: 0.93 - weight_decay: 1e-06 - global_grad_clip: 3.0 - log_interval: 100 - checkpoint: - kbest_n: 50 - latest_n: 5 +########################################### +# Training # +########################################### +n_epoch: 65 +accum_grad: 1 +lr: 5.0e-4 +lr_decay: 0.93 +weight_decay: 1.0e-6 +global_grad_clip: 3.0 +log_interval: 100 +checkpoint: + kbest_n: 50 + latest_n: 5 + -decoding: - batch_size: 32 - error_rate_type: cer - decoding_method: ctc_beam_search - lang_model_path: data/lm/zh_giga.no_cna_cmn.prune01244.klm - alpha: 2.2 #1.9 - beta: 4.3 - beam_size: 300 - cutoff_prob: 0.99 - cutoff_top_n: 40 - num_proc_bsearch: 10 diff --git a/examples/aishell/asr0/conf/tuning/chunk_decode.yaml b/examples/aishell/asr0/conf/tuning/chunk_decode.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9de06711cc7c605803776bf0aee53fd3b3bd4f95 --- /dev/null +++ b/examples/aishell/asr0/conf/tuning/chunk_decode.yaml @@ -0,0 +1,10 @@ +chunk_batch_size: 32 +error_rate_type: cer +decoding_method: ctc_beam_search +lang_model_path: data/lm/zh_giga.no_cna_cmn.prune01244.klm +alpha: 2.2 #1.9 +beta: 4.3 +beam_size: 300 +cutoff_prob: 0.99 +cutoff_top_n: 40 +num_proc_bsearch: 10 diff --git a/examples/aishell/asr0/conf/tuning/decode.yaml b/examples/aishell/asr0/conf/tuning/decode.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5778e65656ef4a4bc691c243b7ea97bd8903065c --- /dev/null +++ b/examples/aishell/asr0/conf/tuning/decode.yaml @@ -0,0 +1,10 @@ +decode_batch_size: 128 +error_rate_type: cer +decoding_method: ctc_beam_search +lang_model_path: data/lm/zh_giga.no_cna_cmn.prune01244.klm +alpha: 1.9 +beta: 5.0 +beam_size: 300 +cutoff_prob: 0.99 +cutoff_top_n: 40 +num_proc_bsearch: 10 diff --git a/examples/aishell/asr0/local/test.sh b/examples/aishell/asr0/local/test.sh index 8cbff23520a61523e5d8599e627f897ef583d127..463593ef389c0256344ba2c40b7cdff426af1248 100755 --- a/examples/aishell/asr0/local/test.sh +++ b/examples/aishell/asr0/local/test.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 3 ];then - echo "usage: ${0} config_path ckpt_path_prefix model_type" +if [ $# != 4 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix model_type" exit -1 fi @@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." 
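+# positional args (matching the usage string above): $1 config, $2 decode config, $3 checkpoint prefix, $4 model type (offline or online)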
config_path=$1 -ckpt_prefix=$2 -model_type=$3 +decode_config_path=$2 +ckpt_prefix=$3 +model_type=$4 # download language model bash local/download_lm_ch.sh @@ -21,6 +22,7 @@ fi python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ +--decode_cfg ${decode_config_path} \ --result_file ${ckpt_prefix}.rsl \ --checkpoint_path ${ckpt_prefix} \ --model_type ${model_type} diff --git a/examples/aishell/asr0/local/test_export.sh b/examples/aishell/asr0/local/test_export.sh index 4f5e5c8b6371da9e8ee19c939ccba47b815f5617..7a4b87f8c926608a8b97ff181704171cba0a6e4f 100755 --- a/examples/aishell/asr0/local/test_export.sh +++ b/examples/aishell/asr0/local/test_export.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 3 ];then - echo "usage: ${0} config_path ckpt_path_prefix model_type" +if [ $# != 4 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix model_type" exit -1 fi @@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." config_path=$1 -jit_model_export_path=$2 -model_type=$3 +decode_config_path=$2 +jit_model_export_path=$3 +model_type=$4 # download language model bash local/download_lm_ch.sh > /dev/null 2>&1 @@ -21,6 +22,7 @@ fi python3 -u ${BIN_DIR}/test_export.py \ --ngpu ${ngpu} \ --config ${config_path} \ +--decode_cfg ${decode_config_path} \ --result_file ${jit_model_export_path}.rsl \ --export_path ${jit_model_export_path} \ --model_type ${model_type} diff --git a/examples/aishell/asr0/local/test_wav.sh b/examples/aishell/asr0/local/test_wav.sh index 4a6d92fbe1be25960dcd58293682ca43cabe4cad..62b005a6a0bda1061978780274477bc442448e19 100755 --- a/examples/aishell/asr0/local/test_wav.sh +++ b/examples/aishell/asr0/local/test_wav.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 4 ];then - echo "usage: ${0} config_path ckpt_path_prefix model_type audio_file" +if [ $# != 5 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix model_type audio_file" exit -1 fi @@ -9,9 +9,10 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." 
config_path=$1 -ckpt_prefix=$2 -model_type=$3 -audio_file=$4 +decode_config_path=$2 +ckpt_prefix=$3 +model_type=$4 +audio_file=$5 mkdir -p data wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/zh/demo_01_03.wav -P data/ @@ -33,6 +34,7 @@ fi python3 -u ${BIN_DIR}/test_wav.py \ --ngpu ${ngpu} \ --config ${config_path} \ +--decode_cfg ${decode_config_path} \ --result_file ${ckpt_prefix}.rsl \ --checkpoint_path ${ckpt_prefix} \ --model_type ${model_type} \ diff --git a/examples/aishell/asr0/run.sh b/examples/aishell/asr0/run.sh index 270b88fc063ed1da690a2df3501c5f620dfb1677..15685f21f8ce05435fffa4e8f267f3723d46fa63 100755 --- a/examples/aishell/asr0/run.sh +++ b/examples/aishell/asr0/run.sh @@ -6,6 +6,7 @@ gpus=0,1,2,3 stage=0 stop_stage=100 conf_path=conf/deepspeech2.yaml #conf/deepspeech2.yaml or conf/deepspeeech2_online.yaml +decode_conf_path=conf/tuning/decode.yaml avg_num=1 model_type=offline # offline or online audio_file=data/demo_01_03.wav @@ -34,7 +35,7 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # test ckpt avg_n - CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type}|| exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type}|| exit -1 fi if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then @@ -44,11 +45,11 @@ fi if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then # test export ckpt avg_n - CUDA_VISIBLE_DEVICES=0 ./local/test_export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt}.jit ${model_type}|| exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/test_export.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt}.jit ${model_type}|| exit -1 fi # Optionally, you can add LM and test it with runtime. if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then # test a single .wav file - CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type} ${audio_file} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type} ${audio_file} || exit -1 fi diff --git a/examples/aishell/asr1/RESULTS.md b/examples/aishell/asr1/RESULTS.md index 783e179e008c3143c987fca6f72f43d8911e11e4..b68d69924d6d3db4306b6d866a3201e74e1536d6 100644 --- a/examples/aishell/asr1/RESULTS.md +++ b/examples/aishell/asr1/RESULTS.md @@ -25,7 +25,7 @@ Need set `decoding.decoding_chunk_size=16` when decoding. 
| Model | Params | Config | Augmentation| Test set | Decode method | Loss | CER |
| --- | --- | --- | --- | --- | --- | --- | --- |
-| transformer | 31.95M | conf/transformer.yaml | spec_aug | test | attention | 3.858648955821991 | 0.057293 |
-| transformer | 31.95M | conf/transformer.yaml | spec_aug | test | ctc_greedy_search | 3.858648955821991 | 0.061837 |
-| transformer | 31.95M | conf/transformer.yaml | spec_aug | test | ctc_prefix_beam_search | 3.858648955821991 | 0.061685 |
-| transformer | 31.95M | conf/transformer.yaml | spec_aug | test | attention_rescoring | 3.858648955821991 | 0.053844 |
+| transformer | 31.95M | conf/transformer.yaml | spec_aug | test | attention | 3.8103787302970886 | 0.056588 |
+| transformer | 31.95M | conf/transformer.yaml | spec_aug | test | ctc_greedy_search | 3.8103787302970886 | 0.059932 |
+| transformer | 31.95M | conf/transformer.yaml | spec_aug | test | ctc_prefix_beam_search | 3.8103787302970886 | 0.059989 |
+| transformer | 31.95M | conf/transformer.yaml | spec_aug | test | attention_rescoring | 3.8103787302970886 | 0.052273 |
diff --git a/examples/aishell/asr1/conf/chunk_conformer.yaml b/examples/aishell/asr1/conf/chunk_conformer.yaml
index 80b455878f5f293234ac9130c8c22f863115d9ae..68e852ba77770cd0de9b4c33e93ee3ed777fe674 100644
--- a/examples/aishell/asr1/conf/chunk_conformer.yaml
+++ b/examples/aishell/asr1/conf/chunk_conformer.yaml
@@ -1,122 +1,95 @@
-# https://yaml.org/type/float.html
-data:
-  train_manifest: data/manifest.train
-  dev_manifest: data/manifest.dev
-  test_manifest: data/manifest.test
-  min_input_len: 0.5
-  max_input_len: 20.0 # second
-  min_output_len: 0.0
-  max_output_len: 400.0
-  min_output_input_ratio: 0.05
-  max_output_input_ratio: 10.0
-
+############################################
+# Network Architecture                     #
+############################################
+cmvn_file:
+cmvn_file_type: "json"
+# encoder related
+encoder: conformer
+encoder_conf:
+  output_size: 256 # dimension of attention
+  attention_heads: 4
+  linear_units: 2048 # the number of units of position-wise feed forward
+  num_blocks: 12 # the number of encoder blocks
+  dropout_rate: 0.1
+  positional_dropout_rate: 0.1
+  attention_dropout_rate: 0.0
+  input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
+  normalize_before: True
+  cnn_module_kernel: 15
+  use_cnn_module: True
+  activation_type: 'swish'
+  pos_enc_layer_type: 'rel_pos'
+  selfattention_layer_type: 'rel_selfattn'
+  causal: true
+  use_dynamic_chunk: true
+  cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster
+  use_dynamic_left_chunk: false
+# decoder related
+decoder: transformer
+decoder_conf:
+  attention_heads: 4
+  linear_units: 2048
+  num_blocks: 6
+  dropout_rate: 0.1
+  positional_dropout_rate: 0.1
+  self_attention_dropout_rate: 0.0
+  src_attention_dropout_rate: 0.0
+# hybrid CTC/attention
+model_conf:
+  ctc_weight: 0.3
+  lsm_weight: 0.1 # label smoothing option
+  length_normalized_loss: false
-collator:
-  vocab_filepath: data/lang_char/vocab.txt
-  unit_type: 'char'
-  spm_model_prefix: ''
-  augmentation_config: conf/preprocess.yaml
-  batch_size: 32
-  raw_wav: True # use raw_wav or kaldi feature
-  spectrum_type: fbank #linear, mfcc, fbank
-  feat_dim: 80
-  delta_delta: False
-  dither: 1.0
-  target_sample_rate: 16000
-  max_freq: None
-  n_fft: None
-  stride_ms: 10.0
-  window_ms: 25.0
-  use_dB_normalization: True
-  target_dB: -20
-  random_seed: 0
-  keep_transcription_text: False
-  sortagrad: True
-  shuffle_method: batch_shuffle
-  num_workers: 2
+########################################### +# Data # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test -# network architecture -model: - cmvn_file: - cmvn_file_type: "json" - # encoder related - encoder: conformer - encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: True - use_cnn_module: True - cnn_module_kernel: 15 - activation_type: 'swish' - pos_enc_layer_type: 'rel_pos' - selfattention_layer_type: 'rel_selfattn' - causal: true - use_dynamic_chunk: true - cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster - use_dynamic_left_chunk: false - # decoder related - decoder: transformer - decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 +########################################### +# Dataloader # +########################################### - # hybrid CTC/attention - model_conf: - ctc_weight: 0.3 - ctc_dropoutrate: 0.0 - ctc_grad_norm_type: null - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false - - -training: - n_epoch: 240 - accum_grad: 4 - global_grad_clip: 5.0 - optim: adam - optim_conf: - lr: 0.001 - weight_decay: 1e-6 - scheduler: warmuplr - scheduler_conf: - warmup_steps: 25000 - lr_decay: 1.0 - log_interval: 100 - checkpoint: - kbest_n: 50 - latest_n: 5 - - -decoding: - batch_size: 128 - error_rate_type: cer - decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' - lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm - alpha: 2.5 - beta: 0.3 - beam_size: 10 - cutoff_prob: 1.0 - cutoff_top_n: 0 - num_proc_bsearch: 8 - ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. - decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: true # simulate streaming inference. Defaults to False. 
+vocab_filepath: data/lang_char/vocab.txt
+spm_model_prefix: ''
+unit_type: 'char'
+preprocess_config: conf/preprocess.yaml
+feat_dim: 80
+stride_ms: 10.0
+window_ms: 25.0
+sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
+batch_size: 64
+maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced
+maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced
+minibatches: 0 # for debug
+batch_count: auto
+batch_bins: 0
+batch_frames_in: 0
+batch_frames_out: 0
+batch_frames_inout: 0
+num_workers: 0
+subsampling_factor: 1
+num_encs: 1
+###########################################
+# Training                                #
+###########################################
+n_epoch: 240
+accum_grad: 2
+global_grad_clip: 5.0
+optim: adam
+optim_conf:
+  lr: 0.002
+  weight_decay: 1.0e-6
+scheduler: warmuplr
+scheduler_conf:
+  warmup_steps: 25000
+  lr_decay: 1.0
+log_interval: 100
+checkpoint:
+  kbest_n: 50
+  latest_n: 5
diff --git a/examples/aishell/asr1/conf/conformer.yaml b/examples/aishell/asr1/conf/conformer.yaml
index 154f44a25ca31a69a68fc195a2a4e3211632e333..775a4527d49925e6f0aaf73a2d9b6f7bc37657da 100644
--- a/examples/aishell/asr1/conf/conformer.yaml
+++ b/examples/aishell/asr1/conf/conformer.yaml
@@ -1,117 +1,89 @@
-# https://yaml.org/type/float.html
-data:
-  train_manifest: data/manifest.train
-  dev_manifest: data/manifest.dev
-  test_manifest: data/manifest.test
-  min_input_len: 0.5
-  max_input_len: 20.0 # second
-  min_output_len: 0.0
-  max_output_len: 400.0
-  min_output_input_ratio: 0.05
-  max_output_input_ratio: 10.0
+############################################
+# Network Architecture                     #
+############################################
+cmvn_file:
+cmvn_file_type: "json"
+# encoder related
+encoder: conformer
+encoder_conf:
+  output_size: 256 # dimension of attention
+  attention_heads: 4
+  linear_units: 2048 # the number of units of position-wise feed forward
+  num_blocks: 12 # the number of encoder blocks
+  dropout_rate: 0.1
+  positional_dropout_rate: 0.1
+  attention_dropout_rate: 0.0
+  input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
+  normalize_before: True
+  cnn_module_kernel: 15
+  use_cnn_module: True
+  activation_type: 'swish'
+  pos_enc_layer_type: 'rel_pos'
+  selfattention_layer_type: 'rel_selfattn'
+# decoder related
+decoder: transformer
+decoder_conf:
+  attention_heads: 4
+  linear_units: 2048
+  num_blocks: 6
+  dropout_rate: 0.1
+  positional_dropout_rate: 0.1
+  self_attention_dropout_rate: 0.0
+  src_attention_dropout_rate: 0.0
-collator:
-  vocab_filepath: data/lang_char/vocab.txt
-  unit_type: 'char'
-  spm_model_prefix: ''
-  augmentation_config: conf/preprocess.yaml
-  batch_size: 64
-  raw_wav: True # use raw_wav or kaldi feature
-  spectrum_type: fbank #linear, mfcc, fbank
-  feat_dim: 80
-  delta_delta: False
-  dither: 1.0
-  target_sample_rate: 16000
-  max_freq: None
-  n_fft: None
-  stride_ms: 10.0
-  window_ms: 25.0
-  use_dB_normalization: True
-  target_dB: -20
-  random_seed: 0
-  keep_transcription_text: False
-  sortagrad: True
-  shuffle_method: batch_shuffle
-  num_workers: 2
+# hybrid CTC/attention
+model_conf:
+  ctc_weight: 0.3
+  lsm_weight: 0.1 # label smoothing option
+  length_normalized_loss: false
-# network architecture
-model:
-  cmvn_file:
-  cmvn_file_type: "json"
-  # encoder related
-  encoder: conformer
-  encoder_conf:
-    output_size: 256 # dimension of attention
-    attention_heads: 4
-    linear_units: 2048 # the number of units of position-wise feed forward
-    num_blocks: 12 # the number of encoder blocks
-    dropout_rate: 0.1
-    positional_dropout_rate: 0.1
-    attention_dropout_rate: 0.0
-    input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8
-    normalize_before: True
-    use_cnn_module: True
-    cnn_module_kernel: 15
-    activation_type: 'swish'
-    pos_enc_layer_type: 'rel_pos'
-    selfattention_layer_type: 'rel_selfattn'
-
-  # decoder related
-  decoder: transformer
-  decoder_conf:
-    attention_heads: 4
-    linear_units: 2048
-    num_blocks: 6
-    dropout_rate: 0.1
-    positional_dropout_rate: 0.1
-    self_attention_dropout_rate: 0.0
-    src_attention_dropout_rate: 0.0
-
-  # hybrid CTC/attention
-  model_conf:
-    ctc_weight: 0.3
-    ctc_dropoutrate: 0.0
-    ctc_grad_norm_type: null
-    lsm_weight: 0.1 # label smoothing option
-    length_normalized_loss: false
-
-
-training:
-  n_epoch: 240
-  accum_grad: 2
-  global_grad_clip: 5.0
-  optim: adam
-  optim_conf:
-    lr: 0.002
-    weight_decay: 1e-6
-  scheduler: warmuplr # pytorch v1.1.0+ required
-  scheduler_conf:
-    warmup_steps: 25000
-    lr_decay: 1.0
-  log_interval: 100
-  checkpoint:
-    kbest_n: 50
-    latest_n: 5
-
-
-decoding:
-  batch_size: 128
-  error_rate_type: cer
-  decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
-  lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
-  alpha: 2.5
-  beta: 0.3
-  beam_size: 10
-  cutoff_prob: 1.0
-  cutoff_top_n: 0
-  num_proc_bsearch: 8
-  ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
-  decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
-  # <0: for decoding, use full chunk.
-  # >0: for decoding, use fixed chunk size as set.
-  # 0: used for training, it's prohibited here.
-  num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
-  simulate_streaming: False # simulate streaming inference. Defaults to False.
+########################################### +# Data # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test +########################################### +# Dataloader # +########################################### +vocab_filepath: data/lang_char/vocab.txt +spm_model_prefix: '' +unit_type: 'char' +preprocess_config: conf/preprocess.yaml +feat_dim: 80 +stride_ms: 10.0 +window_ms: 25.0 +sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs +batch_size: 64 +maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced +maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced +minibatches: 0 # for debug +batch_count: auto +batch_bins: 0 +batch_frames_in: 0 +batch_frames_out: 0 +batch_frames_inout: 0 +num_workers: 2 +subsampling_factor: 1 +num_encs: 1 +########################################### +# Training # +########################################### +n_epoch: 240 +accum_grad: 2 +global_grad_clip: 5.0 +optim: adam +optim_conf: + lr: 0.002 + weight_decay: 1.0e-6 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 +log_interval: 100 +checkpoint: + kbest_n: 50 + latest_n: 5 diff --git a/examples/aishell/asr1/conf/preprocess.yaml b/examples/aishell/asr1/conf/preprocess.yaml index dd4cfd273e0f654169955d29ed76c818ded9181c..f7f4c58d5228beea1b6462355bb5970d6f070ad4 100644 --- a/examples/aishell/asr1/conf/preprocess.yaml +++ b/examples/aishell/asr1/conf/preprocess.yaml @@ -5,7 +5,7 @@ process: n_mels: 80 n_shift: 160 win_length: 400 - dither: true + dither: 0.1 - type: cmvn_json cmvn_path: data/mean_std.json # these three processes are a.k.a. 
SpecAugument
diff --git a/examples/aishell/asr1/conf/transformer.yaml b/examples/aishell/asr1/conf/transformer.yaml
index 60ec01801c634afa29a4cb0d88ec664323919500..9d2946537b44ed55f59dbebc09de2ef7571324bf 100644
--- a/examples/aishell/asr1/conf/transformer.yaml
+++ b/examples/aishell/asr1/conf/transformer.yaml
@@ -1,112 +1,85 @@
-# https://yaml.org/type/float.html
-data:
-  train_manifest: data/manifest.train
-  dev_manifest: data/manifest.dev
-  test_manifest: data/manifest.test
-  min_input_len: 0.5
-  max_input_len: 20.0 # second
-  min_output_len: 0.0
-  max_output_len: 400.0
-  min_output_input_ratio: 0.05
-  max_output_input_ratio: 10.0
-
-
-collator:
-  vocab_filepath: data/lang_char/vocab.txt
-  unit_type: 'char'
-  spm_model_prefix: ''
-  augmentation_config: conf/preprocess.yaml
-  batch_size: 64
-  raw_wav: True # use raw_wav or kaldi feature
-  spectrum_type: fbank #linear, mfcc, fbank
-  feat_dim: 80
-  delta_delta: False
-  dither: 1.0
-  target_sample_rate: 16000
-  max_freq: None
-  n_fft: None
-  stride_ms: 10.0
-  window_ms: 25.0
-  use_dB_normalization: True
-  target_dB: -20
-  random_seed: 0
-  keep_transcription_text: False
-  sortagrad: True
-  shuffle_method: batch_shuffle
-  num_workers: 2
-
-# network architecture
-model:
-  cmvn_file:
-  cmvn_file_type: "json"
-  # encoder related
-  encoder: transformer
-  encoder_conf:
-    output_size: 256 # dimension of attention
-    attention_heads: 4
-    linear_units: 2048 # the number of units of position-wise feed forward
-    num_blocks: 12 # the number of encoder blocks
-    dropout_rate: 0.1
-    positional_dropout_rate: 0.1
-    attention_dropout_rate: 0.0
-    input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8
-    normalize_before: true
+############################################
+# Network Architecture                     #
+############################################
+cmvn_file:
+cmvn_file_type: "json"
+# encoder related
+encoder: transformer
+encoder_conf:
+  output_size: 256 # dimension of attention
+  attention_heads: 4
+  linear_units: 2048 # the number of units of position-wise feed forward
+  num_blocks: 12 # the number of encoder blocks
+  dropout_rate: 0.1
+  positional_dropout_rate: 0.1
+  attention_dropout_rate: 0.0
+  input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
+  normalize_before: true
+# decoder related
+decoder: transformer
+decoder_conf:
+  attention_heads: 4
+  linear_units: 2048
+  num_blocks: 6
+  dropout_rate: 0.1
+  positional_dropout_rate: 0.1
+  self_attention_dropout_rate: 0.0
+  src_attention_dropout_rate: 0.0
-  # decoder related
-  decoder: transformer
-  decoder_conf:
-    attention_heads: 4
-    linear_units: 2048
-    num_blocks: 6
-    dropout_rate: 0.1
-    positional_dropout_rate: 0.1
-    self_attention_dropout_rate: 0.0
-    src_attention_dropout_rate: 0.0
-
-  # hybrid CTC/attention
-  model_conf:
-    ctc_weight: 0.3
-    ctc_dropoutrate: 0.0
-    ctc_grad_norm_type: null
-    lsm_weight: 0.1 # label smoothing option
-    length_normalized_loss: false
-
-
-training:
-  n_epoch: 120
-  accum_grad: 2
-  global_grad_clip: 5.0
-  optim: adam
-  optim_conf:
-    lr: 0.002
-    weight_decay: 1e-6
-  scheduler: warmuplr
-  scheduler_conf:
-    warmup_steps: 25000
-    lr_decay: 1.0
-  log_interval: 100
-  checkpoint:
-    kbest_n: 50
-    latest_n: 5
+# hybrid CTC/attention
+model_conf:
+  ctc_weight: 0.3
+  lsm_weight: 0.1 # label smoothing option
+  length_normalized_loss: false
+###########################################
+# Data                                    #
+###########################################
+# https://yaml.org/type/float.html
+train_manifest: data/manifest.train
+dev_manifest:
data/manifest.dev +test_manifest: data/manifest.test -decoding: - batch_size: 128 - error_rate_type: cer - decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' - lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm - alpha: 2.5 - beta: 0.3 - beam_size: 10 - cutoff_prob: 1.0 - cutoff_top_n: 0 - num_proc_bsearch: 8 - ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. - decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: False # simulate streaming inference. Defaults to False. +########################################### +# Dataloader # +########################################### +unit_type: 'char' +vocab_filepath: data/lang_char/vocab.txt +spm_model_prefix: '' +feat_dim: 80 +stride_ms: 10.0 +window_ms: 25.0 +sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs +batch_size: 64 +maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced +maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced +minibatches: 0 # for debug +batch_count: auto +batch_bins: 0 +batch_frames_in: 0 +batch_frames_out: 0 +batch_frames_inout: 0 +preprocess_config: conf/preprocess.yaml +num_workers: 0 +subsampling_factor: 1 +num_encs: 1 +########################################### +# Training # +########################################### +n_epoch: 240 +accum_grad: 2 +global_grad_clip: 5.0 +optim: adam +optim_conf: + lr: 0.002 + weight_decay: 1.0e-6 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 +log_interval: 100 +checkpoint: + kbest_n: 50 + latest_n: 5 diff --git a/examples/aishell/asr1/conf/tuning/chunk_decode.yaml b/examples/aishell/asr1/conf/tuning/chunk_decode.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7e8afb7a8e16d2865d9f88826ddc8509acd7d1da --- /dev/null +++ b/examples/aishell/asr1/conf/tuning/chunk_decode.yaml @@ -0,0 +1,11 @@ +beam_size: 10 +decode_batch_size: 128 +error_rate_type: cer +decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' +ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. +decoding_chunk_size: 16 # decoding chunk size. Defaults to -1. + # <0: for decoding, use full chunk. + # >0: for decoding, use fixed chunk size as set. + # 0: used for training, it's prohibited here. +num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. +simulate_streaming: True # simulate streaming inference. Defaults to False. diff --git a/examples/aishell/asr1/conf/tuning/decode.yaml b/examples/aishell/asr1/conf/tuning/decode.yaml new file mode 100644 index 0000000000000000000000000000000000000000..72ede9272b190289daac57946c7e44d5407ef0b0 --- /dev/null +++ b/examples/aishell/asr1/conf/tuning/decode.yaml @@ -0,0 +1,11 @@ +beam_size: 10 +decode_batch_size: 128 +error_rate_type: cer +decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' +ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. +decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. + # <0: for decoding, use full chunk. + # >0: for decoding, use fixed chunk size as set. 
+ # 0: used for training, it's prohibited here. +num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. +simulate_streaming: False # simulate streaming inference. Defaults to False. diff --git a/examples/aishell/asr1/local/align.sh b/examples/aishell/asr1/local/align.sh index c65d611c43e870fe4024235c3a2f433452251d21..14d91d68702700179f5e81a843f08a2e264688e2 100755 --- a/examples/aishell/asr1/local/align.sh +++ b/examples/aishell/asr1/local/align.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 2 ];then - echo "usage: ${0} config_path ckpt_path_prefix" +if [ $# != 3 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix" exit -1 fi @@ -9,7 +9,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." config_path=$1 -ckpt_prefix=$2 +decode_config_path=$2 +ckpt_prefix=$3 batch_size=1 output_dir=${ckpt_prefix} @@ -20,9 +21,10 @@ mkdir -p ${output_dir} python3 -u ${BIN_DIR}/alignment.py \ --ngpu ${ngpu} \ --config ${config_path} \ +--decode_cfg ${decode_config_path} \ --result_file ${output_dir}/${type}.align \ --checkpoint_path ${ckpt_prefix} \ ---opts decoding.batch_size ${batch_size} +--opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in ctc alignment!" diff --git a/examples/aishell/asr1/local/test.sh b/examples/aishell/asr1/local/test.sh index da159de734a40996eed297beb147c23962b7fba3..65b884e512fa0ebf46c94d587e3ed4410c04343f 100755 --- a/examples/aishell/asr1/local/test.sh +++ b/examples/aishell/asr1/local/test.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 2 ];then - echo "usage: ${0} config_path ckpt_path_prefix" +if [ $# != 3 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix" exit -1 fi @@ -9,7 +9,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." config_path=$1 -ckpt_prefix=$2 +decode_config_path=$2 +ckpt_prefix=$3 chunk_mode=false if [[ ${config_path} =~ ^.*chunk_.*yaml$ ]];then @@ -36,10 +37,11 @@ for type in attention ctc_greedy_search; do python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_cfg ${decode_config_path} \ --result_file ${output_dir}/${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ - --opts decoding.batch_size ${batch_size} + --opts decode.decoding_method ${type} \ + --opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in evaluation!" @@ -55,10 +57,11 @@ for type in ctc_prefix_beam_search attention_rescoring; do python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_cfg ${decode_config_path} \ --result_file ${output_dir}/${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ - --opts decoding.batch_size ${batch_size} + --opts decode.decoding_method ${type} \ + --opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in evaluation!" 
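With this refactor, the options that used to live under the `decoding:` section of each model config are read from a standalone file under `conf/tuning/`, and each local script gains one positional argument for it. A minimal sketch of the new calling convention, reusing the paths from `run.sh` (the `avg_20` checkpoint name is illustrative, derived from `avg_num=20`):

```bash
# Evaluate an averaged checkpoint: the model config and the decode config
# are now two separate YAML files passed as the first two arguments.
CUDA_VISIBLE_DEVICES=0 ./local/test.sh \
    conf/conformer.yaml \
    conf/tuning/decode.yaml \
    exp/conformer/checkpoints/avg_20
```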
diff --git a/examples/aishell/asr1/local/test_wav.sh b/examples/aishell/asr1/local/test_wav.sh
index f85c1a47e82b5328ce070e1e85bdcbdadcc85de4..d029f2fdebdfb6340527c3ed20878114c7034afa 100755
--- a/examples/aishell/asr1/local/test_wav.sh
+++ b/examples/aishell/asr1/local/test_wav.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
-if [ $# != 3 ];then
-    echo "usage: ${0} config_path ckpt_path_prefix audio_file"
+if [ $# != 4 ];then
+    echo "usage: ${0} config_path decode_config_path ckpt_path_prefix audio_file"
     exit -1
 fi
@@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
 echo "using $ngpu gpus..."
 config_path=$1
-ckpt_prefix=$2
-audio_file=$3
+decode_config_path=$2
+ckpt_prefix=$3
+audio_file=$4
 mkdir -p data
 wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/zh/demo_01_03.wav -P data/
@@ -42,10 +43,11 @@ for type in attention_rescoring; do
     python3 -u ${BIN_DIR}/test_wav.py \
     --ngpu ${ngpu} \
     --config ${config_path} \
+    --decode_cfg ${decode_config_path} \
     --result_file ${output_dir}/${type}.rsl \
     --checkpoint_path ${ckpt_prefix} \
-    --opts decoding.decoding_method ${type} \
-    --opts decoding.batch_size ${batch_size} \
+    --opts decode.decoding_method ${type} \
+    --opts decode.decode_batch_size ${batch_size} \
     --audio_file ${audio_file}
     if [ $? -ne 0 ]; then
diff --git a/examples/aishell/asr1/run.sh b/examples/aishell/asr1/run.sh
index d07a4ed5ce404881a9e7aca311a891f3a990f4da..c54dae9cffd7bf3f1ad86833200531d64a4c6397 100644
--- a/examples/aishell/asr1/run.sh
+++ b/examples/aishell/asr1/run.sh
@@ -6,6 +6,7 @@ gpus=0,1,2,3
 stage=0
 stop_stage=50
 conf_path=conf/conformer.yaml
+decode_conf_path=conf/tuning/decode.yaml
 avg_num=20
 audio_file=data/demo_01_03.wav
@@ -32,18 +33,18 @@ fi
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
     # test ckpt avg_n
-    CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
+    CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
 fi
 if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
     # ctc alignment of test data
-    CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
+    CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
 fi
 # Optionally, you can add LM and test it with runtime.
 if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
     # test a single .wav file
-    CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1
+    CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1
 fi
diff --git a/examples/aishell3/README.md b/examples/aishell3/README.md
index b52950c47cff51d956c981b01af808a59734fb96..273f488e45406a6ca9a63eb2e8f251e293df3e88 100644
--- a/examples/aishell3/README.md
+++ b/examples/aishell3/README.md
@@ -8,4 +8,5 @@
 * voc1 - Parallel WaveGAN
 * voc2 - MelGAN
 * voc3 - MultiBand MelGAN
-* vc0 - Tactron2 Voice Clone with GE2E
+* vc0 - Tacotron2 Voice Cloning with GE2E
+* vc1 - FastSpeech2 Voice Cloning with GE2E
diff --git a/examples/aishell3/tts3/README.md b/examples/aishell3/tts3/README.md
index 8d1c2aa9cd896f4741b850fb87b8919784b57a22..2538e8f96e8f2a62568573f442f21c9164f09706 100644
--- a/examples/aishell3/tts3/README.md
+++ b/examples/aishell3/tts3/README.md
@@ -72,8 +72,8 @@ Here's the complete help message.
```text usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] - [--ngpu NGPU] [--verbose VERBOSE] [--phones-dict PHONES_DICT] - [--speaker-dict SPEAKER_DICT] + [--ngpu NGPU] [--phones-dict PHONES_DICT] + [--speaker-dict SPEAKER_DICT] [--voice-cloning VOICE_CLONING] Train a FastSpeech2 model. @@ -87,11 +87,12 @@ optional arguments: --output-dir OUTPUT_DIR output dir. --ngpu NGPU if ngpu=0, use cpu. - --verbose VERBOSE verbose. --phones-dict PHONES_DICT phone vocabulary file. --speaker-dict SPEAKER_DICT speaker id map file for multiple speaker model. + --voice-cloning VOICE_CLONING + whether training voice cloning model. ``` 1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`. 2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder. diff --git a/examples/aishell3/voc1/README.md b/examples/aishell3/voc1/README.md index 7da3946e3a6ff91a14850f36bca74cd8616738b3..dad464092d197de0065a3876ccc3605719de75ff 100644 --- a/examples/aishell3/voc1/README.md +++ b/examples/aishell3/voc1/README.md @@ -67,8 +67,8 @@ Here's the complete help message. ```text usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] - [--ngpu NGPU] [--verbose VERBOSE] [--batch-size BATCH_SIZE] - [--max-iter MAX_ITER] [--run-benchmark RUN_BENCHMARK] + [--ngpu NGPU] [--batch-size BATCH_SIZE] [--max-iter MAX_ITER] + [--run-benchmark RUN_BENCHMARK] [--profiler_options PROFILER_OPTIONS] Train a ParallelWaveGAN model. @@ -83,7 +83,6 @@ optional arguments: --output-dir OUTPUT_DIR output dir. --ngpu NGPU if ngpu == 0, use cpu. - --verbose VERBOSE verbose. benchmark: arguments related to benchmark. @@ -113,7 +112,6 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p usage: synthesize.py [-h] [--generator-type GENERATOR_TYPE] [--config CONFIG] [--checkpoint CHECKPOINT] [--test-metadata TEST_METADATA] [--output-dir OUTPUT_DIR] [--ngpu NGPU] - [--verbose VERBOSE] Synthesize with GANVocoder. @@ -130,7 +128,6 @@ optional arguments: --output-dir OUTPUT_DIR output dir. --ngpu NGPU if ngpu == 0, use cpu. - --verbose VERBOSE verbose. ``` 1. `--config` parallel wavegan config file. You should use the same config with which the model is trained. diff --git a/examples/aishell3/voc1/conf/default.yaml b/examples/aishell3/voc1/conf/default.yaml index 88968d6fccb03efc8073edb3b03a4765e41640bb..7fbffbdde0164ea2040b5060842c962fb021b679 100644 --- a/examples/aishell3/voc1/conf/default.yaml +++ b/examples/aishell3/voc1/conf/default.yaml @@ -72,10 +72,7 @@ lambda_adv: 4.0 # Loss balancing coefficient. ########################################################### batch_size: 8 # Batch size. batch_max_steps: 24000 # Length of each audio in batch. Make sure dividable by n_shift. -pin_memory: true # Whether to pin memory in Pytorch DataLoader. -num_workers: 4 # Number of workers in Pytorch DataLoader. -remove_short_samples: true # Whether to remove samples the length of which are less than batch_max_steps. -allow_cache: true # Whether to allow cache in dataset. If true, it requires cpu memory. +num_workers: 2 # Number of workers in DataLoader. 
########################################################### # OPTIMIZER & SCHEDULER SETTING # diff --git a/examples/callcenter/asr1/conf/chunk_conformer.yaml b/examples/callcenter/asr1/conf/chunk_conformer.yaml index 69959c68eeac1f998ce053b62f1fb6aed30dba9b..19e783a62a32300688236d666e8f14c63fcdedbb 100644 --- a/examples/callcenter/asr1/conf/chunk_conformer.yaml +++ b/examples/callcenter/asr1/conf/chunk_conformer.yaml @@ -1,120 +1,98 @@ # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test - min_input_len: 0.5 - max_input_len: 20.0 # second - min_output_len: 0.0 - max_output_len: 400.0 - min_output_input_ratio: 0.05 - max_output_input_ratio: 10.0 - - -collator: - vocab_filepath: data/lang_char/vocab.txt - unit_type: 'char' - spm_model_prefix: '' - augmentation_config: conf/preprocess.yaml - batch_size: 32 - raw_wav: True # use raw_wav or kaldi feature - spectrum_type: fbank #linear, mfcc, fbank - feat_dim: 80 - delta_delta: False - dither: 1.0 - target_sample_rate: 8000 - max_freq: None - n_fft: None - stride_ms: 10.0 - window_ms: 25.0 - use_dB_normalization: True - target_dB: -20 - random_seed: 0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 - - -# network architecture -model: - cmvn_file: - cmvn_file_type: "json" - # encoder related - encoder: conformer - encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: True - use_cnn_module: True - cnn_module_kernel: 15 - activation_type: 'swish' - pos_enc_layer_type: 'rel_pos' - selfattention_layer_type: 'rel_selfattn' - causal: true - use_dynamic_chunk: true - cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster - use_dynamic_left_chunk: false - - # decoder related - decoder: transformer - decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 - - # hybrid CTC/attention - model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false +########################################### +# Data # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test + +########################################### +# Dataloader # +########################################### +vocab_filepath: data/lang_char/vocab.txt +unit_type: 'char' +spm_model_prefix: '' +preprocess_config: conf/preprocess.yaml +batch_size: 32 +raw_wav: True # use raw_wav or kaldi feature +spectrum_type: fbank #linear, mfcc, fbank +feat_dim: 80 +delta_delta: False +dither: 1.0 +target_sample_rate: 8000 +max_freq: None +n_fft: None +stride_ms: 10.0 +window_ms: 25.0 +use_dB_normalization: True +target_dB: -20 +random_seed: 0 +keep_transcription_text: False +sortagrad: True +shuffle_method: batch_shuffle +num_workers: 2 -training: - n_epoch: 240 - accum_grad: 4 - global_grad_clip: 5.0 - optim: adam - optim_conf: - lr: 0.001 - weight_decay: 1e-6 - scheduler: warmuplr - scheduler_conf: - warmup_steps: 25000 - lr_decay: 1.0 - log_interval: 100 
-  checkpoint:
-    kbest_n: 50
-    latest_n: 5
+############################################
+# Network Architecture                     #
+############################################
+cmvn_file:
+cmvn_file_type: "json"
+# encoder related
+encoder: conformer
+encoder_conf:
+  output_size: 256 # dimension of attention
+  attention_heads: 4
+  linear_units: 2048 # the number of units of position-wise feed forward
+  num_blocks: 12 # the number of encoder blocks
+  dropout_rate: 0.1
+  positional_dropout_rate: 0.1
+  attention_dropout_rate: 0.0
+  input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
+  normalize_before: True
+  use_cnn_module: True
+  cnn_module_kernel: 15
+  activation_type: 'swish'
+  pos_enc_layer_type: 'rel_pos'
+  selfattention_layer_type: 'rel_selfattn'
+  causal: true
+  use_dynamic_chunk: true
+  cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster
+  use_dynamic_left_chunk: false
-decoding:
-  batch_size: 128
-  error_rate_type: cer
-  decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
-  lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
-  alpha: 2.5
-  beta: 0.3
-  beam_size: 10
-  cutoff_prob: 1.0
-  cutoff_top_n: 0
-  num_proc_bsearch: 8
-  ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
-  decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
-  # <0: for decoding, use full chunk.
-  # >0: for decoding, use fixed chunk size as set.
-  # 0: used for training, it's prohibited here.
-  num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
-  simulate_streaming: true # simulate streaming inference. Defaults to False.
+# decoder related
+decoder: transformer
+decoder_conf:
+  attention_heads: 4
+  linear_units: 2048
+  num_blocks: 6
+  dropout_rate: 0.1
+  positional_dropout_rate: 0.1
+  self_attention_dropout_rate: 0.0
+  src_attention_dropout_rate: 0.0
+# hybrid CTC/attention
+model_conf:
+  ctc_weight: 0.3
+  lsm_weight: 0.1 # label smoothing option
+  length_normalized_loss: false
+###########################################
+# Training                                #
+###########################################
+n_epoch: 240
+accum_grad: 4
+global_grad_clip: 5.0
+optim: adam
+optim_conf:
+  lr: 0.001
+  weight_decay: 1.0e-6
+scheduler: warmuplr
+scheduler_conf:
+  warmup_steps: 25000
+  lr_decay: 1.0
+log_interval: 100
+checkpoint:
+  kbest_n: 50
+  latest_n: 5
diff --git a/examples/callcenter/asr1/conf/conformer.yaml b/examples/callcenter/asr1/conf/conformer.yaml
index 80c15abb1c30dc514a9a523bb248cf5f75665117..f6fcb949888899e66639b80a126896afef534d0f 100644
--- a/examples/callcenter/asr1/conf/conformer.yaml
+++ b/examples/callcenter/asr1/conf/conformer.yaml
@@ -1,117 +1,92 @@
 # https://yaml.org/type/float.html
-data:
-  train_manifest: data/manifest.train
-  dev_manifest: data/manifest.dev
-  test_manifest: data/manifest.test
-  min_input_len: 0.5
-  max_input_len: 20.0 # second
-  min_output_len: 0.0
-  max_output_len: 400.0
-  min_output_input_ratio: 0.0
-  max_output_input_ratio: .inf
+###########################################
+# Data                                    #
+###########################################
+train_manifest: data/manifest.train
+dev_manifest: data/manifest.dev
+test_manifest: data/manifest.test
-collator:
-  vocab_filepath: data/lang_char/vocab.txt
-  unit_type: 'char'
-  spm_model_prefix: ''
-  augmentation_config: conf/preprocess.yaml
-  batch_size: 32
-  raw_wav: True # use raw_wav or kaldi feature
-  spectrum_type: fbank #linear, mfcc, fbank
-  feat_dim: 80
-  delta_delta: False
-  dither: 1.0
-  target_sample_rate: 8000
-  max_freq: None
-  n_fft: None
-  stride_ms: 10.0
-  window_ms: 25.0
-  use_dB_normalization: True
-  target_dB: -20
-  random_seed: 0
-  keep_transcription_text: False
-  sortagrad: True
-  shuffle_method: batch_shuffle
-  num_workers: 2
+###########################################
+# Dataloader                              #
+###########################################
+vocab_filepath: data/lang_char/vocab.txt
+unit_type: 'char'
+spm_model_prefix: ''
+preprocess_config: conf/preprocess.yaml
+feat_dim: 80
+stride_ms: 10.0
+window_ms: 25.0
+sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
+batch_size: 64
+maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced
+maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced
+minibatches: 0 # for debug
+batch_count: auto
+batch_bins: 0
+batch_frames_in: 0
+batch_frames_out: 0
+batch_frames_inout: 0
+num_workers: 0
+subsampling_factor: 1
+num_encs: 1
-# network architecture
-model:
-  cmvn_file:
-  cmvn_file_type: "json"
-  # encoder related
-  encoder: conformer
-  encoder_conf:
-    output_size: 256 # dimension of attention
-    attention_heads: 4
-    linear_units: 2048 # the number of units of position-wise feed forward
-    num_blocks: 12 # the number of encoder blocks
-    dropout_rate: 0.1
-    positional_dropout_rate: 0.1
-    attention_dropout_rate: 0.0
-    input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8
-    normalize_before: True
-    use_cnn_module: True
-    cnn_module_kernel: 15
-    activation_type: 'swish'
-    pos_enc_layer_type: 'rel_pos'
-    selfattention_layer_type: 'rel_selfattn'
+############################################
+# Network Architecture                     #
+############################################
+cmvn_file:
+cmvn_file_type: "json"
+# encoder related
+encoder: conformer
+encoder_conf:
+  output_size: 256 # dimension of attention
+  attention_heads: 4
+  linear_units: 2048 # the number of units of position-wise feed forward
+  num_blocks: 12 # the number of encoder blocks
+  dropout_rate: 0.1
+  positional_dropout_rate: 0.1
+  attention_dropout_rate: 0.0
+  input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
+  normalize_before: True
+  use_cnn_module: True
+  cnn_module_kernel: 15
+  activation_type: 'swish'
+  pos_enc_layer_type: 'rel_pos'
+  selfattention_layer_type: 'rel_selfattn'
-  # decoder related
-  decoder: transformer
-  decoder_conf:
-    attention_heads: 4
-    linear_units: 2048
-    num_blocks: 6
-    dropout_rate: 0.1
-    positional_dropout_rate: 0.1
-    self_attention_dropout_rate: 0.0
-    src_attention_dropout_rate: 0.0
+# decoder related
+decoder: transformer
+decoder_conf:
+  attention_heads: 4
+  linear_units: 2048
+  num_blocks: 6
+  dropout_rate: 0.1
+  positional_dropout_rate: 0.1
+  self_attention_dropout_rate: 0.0
+  src_attention_dropout_rate: 0.0
-  # hybrid CTC/attention
-  model_conf:
-    ctc_weight: 0.3
-    lsm_weight: 0.1 # label smoothing option
-    length_normalized_loss: false
-
-
-training:
-  n_epoch: 100 # 50 will be lowest
-  accum_grad: 4
-  global_grad_clip: 5.0
-  optim: adam
-  optim_conf:
-    lr: 0.002
-    weight_decay: 1e-6
-  scheduler: warmuplr
-  scheduler_conf:
-    warmup_steps: 25000
-    lr_decay: 1.0
-  log_interval: 100
-  checkpoint:
-    kbest_n: 50
-    latest_n: 5
-
-
-
-
-decoding:
-  batch_size: 128
-  error_rate_type: cer
-  decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
-  lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
-  alpha: 2.5
-  beta: 0.3
-  beam_size: 10
-  cutoff_prob: 1.0
-  cutoff_top_n: 0
-  num_proc_bsearch: 8
-  ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
-  decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
-  # <0: for decoding, use full chunk.
-  # >0: for decoding, use fixed chunk size as set.
-  # 0: used for training, it's prohibited here.
-  num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
-  simulate_streaming: False # simulate streaming inference. Defaults to False.
+# hybrid CTC/attention
+model_conf:
+  ctc_weight: 0.3
+  lsm_weight: 0.1 # label smoothing option
+  length_normalized_loss: false
+###########################################
+# Training                                #
+###########################################
+n_epoch: 100 # 50 will be lowest
+accum_grad: 4
+global_grad_clip: 5.0
+optim: adam
+optim_conf:
+  lr: 0.002
+  weight_decay: 1.0e-6
+scheduler: warmuplr
+scheduler_conf:
+  warmup_steps: 25000
+  lr_decay: 1.0
+log_interval: 100
+checkpoint:
+  kbest_n: 50
+  latest_n: 5
diff --git a/examples/callcenter/asr1/conf/preprocess.yaml b/examples/callcenter/asr1/conf/preprocess.yaml
index dd4cfd273e0f654169955d29ed76c818ded9181c..877e7d5a7d423d3685a0e526662ec371eb4db91a 100644
--- a/examples/callcenter/asr1/conf/preprocess.yaml
+++ b/examples/callcenter/asr1/conf/preprocess.yaml
@@ -1,11 +1,11 @@
 process:
   # extract kaldi fbank from PCM
   - type: fbank_kaldi
-    fs: 16000
+    fs: 8000
     n_mels: 80
     n_shift: 160
     win_length: 400
-    dither: true
+    dither: 0.1
   - type: cmvn_json
     cmvn_path: data/mean_std.json
   # these three processes are a.k.a. SpecAugument
diff --git a/examples/callcenter/asr1/conf/tuning/chunk_decode.yaml b/examples/callcenter/asr1/conf/tuning/chunk_decode.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..49a6a114cd2ca3181522523f7851d6b600e33c0e
--- /dev/null
+++ b/examples/callcenter/asr1/conf/tuning/chunk_decode.yaml
@@ -0,0 +1,11 @@
+decode_batch_size: 128
+error_rate_type: cer
+decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
+beam_size: 10
+ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
+decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
+  # <0: for decoding, use full chunk.
+  # >0: for decoding, use fixed chunk size as set.
+  # 0: used for training, it's prohibited here.
+num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
+simulate_streaming: true # simulate streaming inference. Defaults to False.
\ No newline at end of file
diff --git a/examples/callcenter/asr1/conf/tuning/decode.yaml b/examples/callcenter/asr1/conf/tuning/decode.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..d2e0b72dd94a5ff8a5915ccfc98af7e77133dccd
--- /dev/null
+++ b/examples/callcenter/asr1/conf/tuning/decode.yaml
@@ -0,0 +1,13 @@
+decode_batch_size: 128
+error_rate_type: cer
+decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
+beam_size: 10
+ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
+decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
+  # <0: for decoding, use full chunk.
+  # >0: for decoding, use fixed chunk size as set.
+  # 0: used for training, it's prohibited here.
+num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
+simulate_streaming: False # simulate streaming inference. Defaults to False.
+ + diff --git a/examples/callcenter/asr1/local/align.sh b/examples/callcenter/asr1/local/align.sh index 681c77edea276fd675e2a3bb01b46528944d7cc0..1397ae57d5c9e36d8455f28627c3942f7839138a 100755 --- a/examples/callcenter/asr1/local/align.sh +++ b/examples/callcenter/asr1/local/align.sh @@ -1,7 +1,7 @@ #! /usr/bin/env bash -if [ $# != 2 ];then - echo "usage: ${0} config_path ckpt_path_prefix" +if [ $# != 3 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix" exit -1 fi @@ -9,7 +9,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." config_path=$1 -ckpt_prefix=$2 +decode_config_path=$2 +ckpt_prefix=$3 ckpt_name=$(basename ${ckpt_prefxi}) @@ -25,9 +26,10 @@ mkdir -p ${output_dir} python3 -u ${BIN_DIR}/alignment.py \ --ngpu ${ngpu} \ --config ${config_path} \ +--decode_cfg ${decode_config_path} \ --result_file ${output_dir}/${type}.align \ --checkpoint_path ${ckpt_prefix} \ ---opts decoding.batch_size ${batch_size} +--opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in ctc alignment!" diff --git a/examples/callcenter/asr1/local/test.sh b/examples/callcenter/asr1/local/test.sh index fc43c5a20c867e9a34c3c726e27e78b6261bfc85..b7ff722a74babda901077faa106a35ee6ac7b808 100755 --- a/examples/callcenter/asr1/local/test.sh +++ b/examples/callcenter/asr1/local/test.sh @@ -1,7 +1,7 @@ #! /usr/bin/env bash -if [ $# != 2 ];then - echo "usage: ${0} config_path ckpt_path_prefix" +if [ $# != 3 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix" exit -1 fi @@ -9,7 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." config_path=$1 -ckpt_prefix=$2 +decode_config_path=$2 +ckpt_prefix=$3 + ckpt_name=$(basename ${ckpt_prefxi}) @@ -30,10 +32,11 @@ for type in attention ctc_greedy_search; do python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_cfg ${decode_config_path} \ --result_file ${output_dir}/${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ - --opts decoding.batch_size ${batch_size} + --opts decode.decoding_method ${type} \ + --opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in evaluation!" @@ -49,10 +52,11 @@ for type in ctc_prefix_beam_search attention_rescoring; do python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_cfg ${decode_config_path} \ --result_file ${output_dir}/${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ - --opts decoding.batch_size ${batch_size} + --opts decode.decoding_method ${type} \ + --opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in evaluation!" 
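Since the decode settings now live in their own `decode.*` namespace, individual values can still be overridden per run with repeated `--opts` pairs, exactly as the scripts above do for `decoding_method` and `decode_batch_size`. A sketch with placeholder paths; only the flag names come from this diff:

```bash
# Run the test entry point directly, overriding two decode settings on top
# of conf/tuning/decode.yaml; all file paths below are placeholders.
python3 -u ${BIN_DIR}/test.py \
    --ngpu 1 \
    --config conf/conformer.yaml \
    --decode_cfg conf/tuning/decode.yaml \
    --checkpoint_path exp/conformer/checkpoints/avg_20 \
    --result_file exp/conformer/checkpoints/avg_20.rsl \
    --opts decode.decoding_method ctc_prefix_beam_search \
    --opts decode.decode_batch_size 1
```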
diff --git a/examples/callcenter/asr1/run.sh b/examples/callcenter/asr1/run.sh index e9be3d03cb59d2944d60fe3821ab80b6d9541676..0c7ffc1e7990319754abaeaa3a6726d3b098fb2b 100644 --- a/examples/callcenter/asr1/run.sh +++ b/examples/callcenter/asr1/run.sh @@ -4,8 +4,9 @@ source path.sh gpus=0,1,2,3 stage=0 -stop_stage=100 +stop_stage=50 conf_path=conf/conformer.yaml +decode_conf_path=conf/tuning/decode.yaml avg_num=20 source ${MAIN_ROOT}/utils/parse_options.sh || exit 1; @@ -31,15 +32,15 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # test ckpt avg_n - CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then # ctc alignment of test data - CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi -if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then +if [ ${stage} -le 51 ] && [ ${stop_stage} -ge 51 ]; then # export ckpt avg_n CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit fi diff --git a/examples/csmsc/tts2/README.md b/examples/csmsc/tts2/README.md index 2c7a917e9d2dd98613e76ffa0b59d90d1231f917..5f31f7b369429a50a1a83185cf14eb687a88861a 100644 --- a/examples/csmsc/tts2/README.md +++ b/examples/csmsc/tts2/README.md @@ -60,8 +60,7 @@ Here's the complete help message. ```text usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] - [--ngpu NGPU] [--verbose VERBOSE] - [--use-relative-path USE_RELATIVE_PATH] + [--ngpu NGPU] [--use-relative-path USE_RELATIVE_PATH] [--phones-dict PHONES_DICT] [--tones-dict TONES_DICT] Train a Speedyspeech model with a single speaker dataset. @@ -76,7 +75,6 @@ optional arguments: --output-dir OUTPUT_DIR output dir. --ngpu NGPU if ngpu == 0, use cpu. - --verbose VERBOSE verbose. 
--use-relative-path USE_RELATIVE_PATH whether use relative path in metadata --phones-dict PHONES_DICT @@ -109,7 +107,7 @@ pwg_baker_ckpt_0.4 ```bash CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} ``` -``text +```text usage: synthesize.py [-h] [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}] [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] diff --git a/examples/csmsc/tts2/local/preprocess.sh b/examples/csmsc/tts2/local/preprocess.sh index f7f5ea74c742c4edfa1fc7084a14807f8e35fdae..c44f075db52c2b0ffc84adcda36206f6ab783575 100755 --- a/examples/csmsc/tts2/local/preprocess.sh +++ b/examples/csmsc/tts2/local/preprocess.sh @@ -45,6 +45,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then --stats=dump/train/feats_stats.npy \ --phones-dict=dump/phone_id_map.txt \ --tones-dict=dump/tone_id_map.txt \ + --speaker-dict=dump/speaker_id_map.txt \ --use-relative-path=True python3 ${BIN_DIR}/normalize.py \ @@ -53,6 +54,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then --stats=dump/train/feats_stats.npy \ --phones-dict=dump/phone_id_map.txt \ --tones-dict=dump/tone_id_map.txt \ + --speaker-dict=dump/speaker_id_map.txt \ --use-relative-path=True python3 ${BIN_DIR}/normalize.py \ @@ -61,6 +63,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then --stats=dump/train/feats_stats.npy \ --phones-dict=dump/phone_id_map.txt \ --tones-dict=dump/tone_id_map.txt \ + --speaker-dict=dump/speaker_id_map.txt \ --use-relative-path=True fi diff --git a/examples/csmsc/tts2/local/synthesize_e2e.sh b/examples/csmsc/tts2/local/synthesize_e2e.sh index a458bd5ff7cfe9a01162fa9f332edf639a79b0f5..0a4cf69bb4595ce57e25d4deef461c55d69eee16 100755 --- a/examples/csmsc/tts2/local/synthesize_e2e.sh +++ b/examples/csmsc/tts2/local/synthesize_e2e.sh @@ -38,9 +38,9 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ --am_stat=dump/train/feats_stats.npy \ --voc=mb_melgan_csmsc \ - --voc_config=mb_melgan_baker_finetune_ckpt_0.5/finetune.yaml \ - --voc_ckpt=mb_melgan_baker_finetune_ckpt_0.5/snapshot_iter_2000000.pdz\ - --voc_stat=mb_melgan_baker_finetune_ckpt_0.5/feats_stats.npy \ + --voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \ + --voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\ + --voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \ --lang=zh \ --text=${BIN_DIR}/../sentences.txt \ --output_dir=${train_output_path}/test_e2e \ @@ -61,9 +61,9 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ --am_stat=dump/train/feats_stats.npy \ --voc=style_melgan_csmsc \ - --voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \ - --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \ - --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \ + --voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \ + --voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \ + --voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \ --lang=zh \ --text=${BIN_DIR}/../sentences.txt \ --output_dir=${train_output_path}/test_e2e \ @@ -82,9 +82,9 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ --am_stat=dump/train/feats_stats.npy \ --voc=hifigan_csmsc \ - --voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \ - --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \ - --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \ + 
--voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \ + --voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \ + --voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \ --lang=zh \ --text=${BIN_DIR}/../sentences.txt \ --output_dir=${train_output_path}/test_e2e \ diff --git a/examples/csmsc/tts3/README.md b/examples/csmsc/tts3/README.md index 570dd28b800bdf3f8aa2d6d3e8c271639ea8b3db..13d291b5c390328ada4cbbbf580633ed4a194ffb 100644 --- a/examples/csmsc/tts3/README.md +++ b/examples/csmsc/tts3/README.md @@ -7,7 +7,7 @@ Download CSMSC from it's [Official Website](https://test.data-baker.com/data/ind ### Get MFA Result and Extract We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for fastspeech2. -You can download from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your MFA model reference to [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) of our repo. +You can download from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model by referring to the [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) in our repo. ## Get Started Assume the path to the dataset is `~/datasets/BZNSYP`. @@ -63,8 +63,8 @@ Here's the complete help message. ```text usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] - [--ngpu NGPU] [--verbose VERBOSE] [--phones-dict PHONES_DICT] - [--speaker-dict SPEAKER_DICT] + [--ngpu NGPU] [--phones-dict PHONES_DICT] + [--speaker-dict SPEAKER_DICT] [--voice-cloning VOICE_CLONING] Train a FastSpeech2 model. @@ -78,11 +78,12 @@ optional arguments: --output-dir OUTPUT_DIR output dir. --ngpu NGPU if ngpu=0, use cpu. - --verbose VERBOSE verbose. --phones-dict PHONES_DICT phone vocabulary file. --speaker-dict SPEAKER_DICT speaker id map file for multiple speaker model. + --voice-cloning VOICE_CLONING + whether training voice cloning model. ``` 1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`. 2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder.
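A side note on item 2 above: each metadata file is a `metadata.jsonl`, one JSON record per utterance. A minimal sketch of peeking at a few records, assuming only the `utt_id` and `feats` fields that the preprocessing scripts in this repo write (other fields vary by model):

```python
import jsonlines

# Peek at the first few records of the normalized train metadata.
# The path follows the dump layout described above; field names
# beyond utt_id/feats are illustrative, not authoritative.
with jsonlines.open("dump/train/norm/metadata.jsonl") as reader:
    for i, record in enumerate(reader):
        print(record["utt_id"], record.get("feats"))
        if i >= 4:  # only look at the first five utterances
            break
```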
@@ -259,5 +260,5 @@ python3 ${BIN_DIR}/../synthesize_e2e.py \ --text=${BIN_DIR}/../sentences.txt \ --output_dir=exp/default/test_e2e \ --inference_dir=exp/default/inference \ - --phones_dict=dump/phone_id_map.txt + --phones_dict=fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt ``` diff --git a/examples/csmsc/tts3/local/synthesize_e2e.sh b/examples/csmsc/tts3/local/synthesize_e2e.sh index 891ed041b3129dab88d1fe38b424ce099e0fdc1f..d4744486ca634bd85b0381f7e715147b78400d6f 100755 --- a/examples/csmsc/tts3/local/synthesize_e2e.sh +++ b/examples/csmsc/tts3/local/synthesize_e2e.sh @@ -37,9 +37,9 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ --am_stat=dump/train/speech_stats.npy \ --voc=mb_melgan_csmsc \ - --voc_config=mb_melgan_baker_finetune_ckpt_0.5/finetune.yaml \ - --voc_ckpt=mb_melgan_baker_finetune_ckpt_0.5/snapshot_iter_2000000.pdz\ - --voc_stat=mb_melgan_baker_finetune_ckpt_0.5/feats_stats.npy \ + --voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \ + --voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\ + --voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \ --lang=zh \ --text=${BIN_DIR}/../sentences.txt \ --output_dir=${train_output_path}/test_e2e \ @@ -59,9 +59,9 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ --am_stat=dump/train/speech_stats.npy \ --voc=style_melgan_csmsc \ - --voc_config=style_melgan_test/default.yaml \ - --voc_ckpt=style_melgan_test/snapshot_iter_935000.pdz \ - --voc_stat=style_melgan_test/feats_stats.npy \ + --voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \ + --voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \ + --voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \ --lang=zh \ --text=${BIN_DIR}/../sentences.txt \ --output_dir=${train_output_path}/test_e2e \ @@ -80,9 +80,9 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ --am_stat=dump/train/speech_stats.npy \ --voc=hifigan_csmsc \ - --voc_config=hifigan_test/default.yaml \ - --voc_ckpt=hifigan_test/snapshot_iter_1600000.pdz \ - --voc_stat=hifigan_test/feats_stats.npy \ + --voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \ + --voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \ + --voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \ --lang=zh \ --text=${BIN_DIR}/../sentences.txt \ --output_dir=${train_output_path}/test_e2e \ diff --git a/examples/csmsc/voc1/README.md b/examples/csmsc/voc1/README.md index 19a9c722ac45c90ee91f4e29d447b9183dc3266f..5527e80888c12456c2e3dbd13cad8e79a6e93da4 100644 --- a/examples/csmsc/voc1/README.md +++ b/examples/csmsc/voc1/README.md @@ -57,8 +57,8 @@ Here's the complete help message. ```text usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] - [--ngpu NGPU] [--verbose VERBOSE] [--batch-size BATCH_SIZE] - [--max-iter MAX_ITER] [--run-benchmark RUN_BENCHMARK] + [--ngpu NGPU] [--batch-size BATCH_SIZE] [--max-iter MAX_ITER] + [--run-benchmark RUN_BENCHMARK] [--profiler_options PROFILER_OPTIONS] Train a ParallelWaveGAN model. @@ -73,7 +73,6 @@ optional arguments: --output-dir OUTPUT_DIR output dir. --ngpu NGPU if ngpu == 0, use cpu. - --verbose VERBOSE verbose. benchmark: arguments related to benchmark. 
@@ -103,7 +102,6 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p usage: synthesize.py [-h] [--generator-type GENERATOR_TYPE] [--config CONFIG] [--checkpoint CHECKPOINT] [--test-metadata TEST_METADATA] [--output-dir OUTPUT_DIR] [--ngpu NGPU] - [--verbose VERBOSE] Synthesize with GANVocoder. @@ -120,7 +118,6 @@ optional arguments: --output-dir OUTPUT_DIR output dir. --ngpu NGPU if ngpu == 0, use cpu. - --verbose VERBOSE verbose. ``` 1. `--config` parallel wavegan config file. You should use the same config with which the model is trained. @@ -134,7 +131,7 @@ The pretrained model can be downloaded here [pwg_baker_ckpt_0.4.zip](https://pad The static model can be downloaded here [pwg_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_static_0.4.zip). -Model | Step | eval/generator_loss | eval/log_stft_magnitude_loss:| eval/spectral_convergence_loss +Model | Step | eval/generator_loss | eval/log_stft_magnitude_loss| eval/spectral_convergence_loss :-------------:| :------------:| :-----: | :-----: | :--------: default| 1(gpu) x 400000|1.948763|0.670098|0.248882 diff --git a/examples/csmsc/voc1/conf/default.yaml b/examples/csmsc/voc1/conf/default.yaml index 9ea81b8d3c2e225041d95cd0196ba1d9b606142b..28d218ff3655dc3a0e74adaf43d4efd2221d9447 100644 --- a/examples/csmsc/voc1/conf/default.yaml +++ b/examples/csmsc/voc1/conf/default.yaml @@ -79,10 +79,7 @@ lambda_adv: 4.0 # Loss balancing coefficient. ########################################################### batch_size: 8 # Batch size. batch_max_steps: 25500 # Length of each audio in batch. Make sure dividable by n_shift. -pin_memory: true # Whether to pin memory in Pytorch DataLoader. -num_workers: 2 # Number of workers in Pytorch DataLoader. -remove_short_samples: true # Whether to remove samples the length of which are less than batch_max_steps. -allow_cache: true # Whether to allow cache in dataset. If true, it requires cpu memory. +num_workers: 2 # Number of workers in DataLoader. ########################################################### # OPTIMIZER & SCHEDULER SETTING # diff --git a/examples/csmsc/voc3/README.md b/examples/csmsc/voc3/README.md index e4f6be4e8260972ed334d2356d845213de95d0a1..22104a8f215f2c1eca29889778b98ac08575e193 100644 --- a/examples/csmsc/voc3/README.md +++ b/examples/csmsc/voc3/README.md @@ -6,7 +6,7 @@ Download CSMSC from the [official website](https://www.data-baker.com/data/index ### Get MFA Result and Extract We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) results to cut the silence in the edge of audio. -You can download from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your MFA model reference to [mfa example](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/mfa) of our repo. +You can download from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model by referring to the [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) in our repo. ## Get Started Assume the path to the dataset is `~/datasets/BZNSYP`. @@ -57,7 +57,7 @@ Here's the complete help message. ```text usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] - [--ngpu NGPU] [--verbose VERBOSE] + [--ngpu NGPU] Train a Multi-Band MelGAN model.
@@ -71,7 +71,6 @@ optional arguments: --output-dir OUTPUT_DIR output dir. --ngpu NGPU if ngpu == 0, use cpu. - --verbose VERBOSE verbose. ``` 1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`. @@ -88,7 +87,6 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p usage: synthesize.py [-h] [--generator-type GENERATOR_TYPE] [--config CONFIG] [--checkpoint CHECKPOINT] [--test-metadata TEST_METADATA] [--output-dir OUTPUT_DIR] [--ngpu NGPU] - [--verbose VERBOSE] Synthesize with GANVocoder. @@ -105,7 +103,6 @@ optional arguments: --output-dir OUTPUT_DIR output dir. --ngpu NGPU if ngpu == 0, use cpu. - --verbose VERBOSE verbose. ``` 1. `--config` multi band melgan config file. You should use the same config with which the model is trained. @@ -155,22 +152,22 @@ TODO: The hyperparameter of `finetune.yaml` is not good enough, a smaller `learning_rate` should be used (more `milestones` should be set). ## Pretrained Models -The pretrained model can be downloaded here [mb_melgan_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_ckpt_0.5.zip). +The pretrained model can be downloaded here [mb_melgan_csmsc_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_ckpt_0.1.1.zip). The finetuned model can be downloaded here [mb_melgan_baker_finetune_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_finetune_ckpt_0.5.zip). -The static model can be downloaded here [mb_melgan_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_static_0.5.zip) +The static model can be downloaded here [mb_melgan_csmsc_static_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_static_0.1.1.zip) Model | Step | eval/generator_loss | eval/log_stft_magnitude_loss|eval/spectral_convergence_loss |eval/sub_log_stft_magnitude_loss|eval/sub_spectral_convergence_loss :-------------:| :------------:| :-----: | :-----: | :--------:| :--------:| :--------: -default| 1(gpu) x 1000000| ——|—— |—— |—— | ——| +default| 1(gpu) x 1000000| 2.4851|0.71778 |0.2761 |0.66334 |0.2777| finetune| 1(gpu) x 1000000|3.196967|0.977804| 0.778484| 0.889576 |0.776756 | Multi Band MelGAN checkpoint contains files listed below. 
```text -mb_melgan_baker_ckpt_0.5 +mb_melgan_csmsc_ckpt_0.1.1 ├── default.yaml # default config used to train multi band melgan ├── feats_stats.npy # statistics used to normalize spectrogram when training multi band melgan └── snapshot_iter_1000000.pdz # generator parameters of multi band melgan diff --git a/examples/csmsc/voc3/finetune.sh b/examples/csmsc/voc3/finetune.sh index 4ab10e5b3775af433ae9c9f6a2e82966ca3361ba..6719bd0bef340e3629c83f78f08fe91771b62ca7 100755 --- a/examples/csmsc/voc3/finetune.sh +++ b/examples/csmsc/voc3/finetune.sh @@ -15,11 +15,13 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then --fastspeech2-stat=fastspeech2_nosil_baker_ckpt_0.4/speech_stats.npy \ --dur-file=durations.txt \ --output-dir=dump_finetune \ - --phones-dict=fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt + --phones-dict=fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt \ + --dataset=baker \ + --rootdir=~/datasets/BZNSYP/ fi if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then - python3 local/link_wav.py \ + python3 ${MAIN_ROOT}/utils/link_wav.py \ --old-dump-dir=dump \ --dump-dir=dump_finetune fi diff --git a/examples/csmsc/voc4/README.md b/examples/csmsc/voc4/README.md index 57d88e0fc59a09f163bcdfb208eb6d561dd53795..b5c6873917602350d4e244fdcfd496d74e6192da 100644 --- a/examples/csmsc/voc4/README.md +++ b/examples/csmsc/voc4/README.md @@ -6,7 +6,7 @@ Download CSMSC from the [official website](https://www.data-baker.com/data/index ### Get MFA Result and Extract We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) results to cut the silence in the edge of audio. -You can download from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your MFA model reference to [mfa example](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/mfa) of our repo. +You can download from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model by referring to the [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) in our repo. ## Get Started Assume the path to the dataset is `~/datasets/BZNSYP`. @@ -57,9 +57,9 @@ Here's the complete help message. ```text usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] - [--ngpu NGPU] [--verbose VERBOSE] + [--ngpu NGPU] -Train a Multi-Band MelGAN model. +Train a Style MelGAN model. optional arguments: -h, --help show this help message and exit @@ -71,7 +71,6 @@ optional arguments: --output-dir OUTPUT_DIR output dir. --ngpu NGPU if ngpu == 0, use cpu. - --verbose VERBOSE verbose. ``` 1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`. @@ -88,7 +87,6 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p usage: synthesize.py [-h] [--generator-type GENERATOR_TYPE] [--config CONFIG] [--checkpoint CHECKPOINT] [--test-metadata TEST_METADATA] [--output-dir OUTPUT_DIR] [--ngpu NGPU] - [--verbose VERBOSE] Synthesize with GANVocoder. @@ -105,7 +103,6 @@ optional arguments: --output-dir OUTPUT_DIR output dir. --ngpu NGPU if ngpu == 0, use cpu. - --verbose VERBOSE verbose. ``` 1. `--config` style melgan config file. You should use the same config with which the model is trained. @@ -113,3 +110,20 @@ optional arguments: 3. `--test-metadata` is the metadata of the test dataset.
Use the `metadata.jsonl` in the `dev/norm` subfolder from the processed directory. 4. `--output-dir` is the directory to save the synthesized audio files. 5. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. + +## Pretrained Models +The pretrained model can be downloaded here [style_melgan_csmsc_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/style_melgan/style_melgan_csmsc_ckpt_0.1.1.zip). + +The static model of Style MelGAN is not available yet. + +Style MelGAN checkpoint contains files listed below. + +```text +style_melgan_csmsc_ckpt_0.1.1 +├── default.yaml # default config used to train style melgan +├── feats_stats.npy # statistics used to normalize spectrogram when training style melgan +└── snapshot_iter_1500000.pdz # generator parameters of style melgan +``` + +## Acknowledgement +We adapted some code from https://github.com/kan-bayashi/ParallelWaveGAN. diff --git a/examples/csmsc/voc4/conf/default.yaml b/examples/csmsc/voc4/conf/default.yaml index 6f7d0f2b326ddd244b8a28f7a9e577964b3b24da..c9abf78dc220ce0e976baecbb8f6987307fc17c1 100644 --- a/examples/csmsc/voc4/conf/default.yaml +++ b/examples/csmsc/voc4/conf/default.yaml @@ -88,7 +88,7 @@ discriminator_adv_loss_params: batch_size: 32 # Batch size. # batch_max_steps(24000) == prod(noise_upsample_scales)(80) * prod(upsample_scales)(300, n_shift) batch_max_steps: 24000 # Length of each audio in batch. Make sure dividable by n_shift. -num_workers: 2 # Number of workers in Pytorch DataLoader. +num_workers: 2 # Number of workers in DataLoader. ########################################################### # OPTIMIZER & SCHEDULER SETTING # diff --git a/examples/csmsc/voc5/README.md b/examples/csmsc/voc5/README.md index 2ced9f779a31a5086d5fc37ef813523040a953f4..21afe6eefad51678568b0b7ab961ceffe33ac779 100644 --- a/examples/csmsc/voc5/README.md +++ b/examples/csmsc/voc5/README.md @@ -6,7 +6,7 @@ Download CSMSC from the [official website](https://www.data-baker.com/data/index ### Get MFA Result and Extract We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) results to cut silence at the edge of audio. -You can download from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your MFA model reference to [mfa example](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/mfa) of our repo. +You can download from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model by referring to the [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) in our repo. ## Get Started Assume the path to the dataset is `~/datasets/BZNSYP`. @@ -57,7 +57,7 @@ Here's the complete help message. ```text usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] - [--ngpu NGPU] [--verbose VERBOSE] + [--ngpu NGPU] Train a HiFiGAN model. @@ -71,7 +71,6 @@ optional arguments: --output-dir OUTPUT_DIR output dir. --ngpu NGPU if ngpu == 0, use cpu. - --verbose VERBOSE verbose. ``` 1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`.
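A note on the `feats_stats.npy` files that appear in the vocoder checkpoints above: they hold the statistics used to normalize mel spectrograms. A minimal sketch of applying them, under the assumption that the file stores stacked per-bin mean and scale rows the way this repo's `normalize.py` writes them (worth verifying against your own dump):

```python
import numpy as np

# Assumed layout: row 0 = mean, row 1 = scale, each of shape (n_mels,).
stats = np.load("style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy")
mean, scale = stats[0], stats[1]

def normalize(mel: np.ndarray) -> np.ndarray:
    """Standardize a (n_frames, n_mels) mel spectrogram."""
    return (mel - mean) / scale

dummy_mel = np.random.rand(100, 80).astype(np.float32)  # illustrative input
print(normalize(dummy_mel).shape)  # (100, 80)
```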
@@ -88,7 +87,6 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p usage: synthesize.py [-h] [--generator-type GENERATOR_TYPE] [--config CONFIG] [--checkpoint CHECKPOINT] [--test-metadata TEST_METADATA] [--output-dir OUTPUT_DIR] [--ngpu NGPU] - [--verbose VERBOSE] Synthesize with GANVocoder. @@ -105,7 +103,6 @@ optional arguments: --output-dir OUTPUT_DIR output dir. --ngpu NGPU if ngpu == 0, use cpu. - --verbose VERBOSE verbose. ``` 1. `--config` config file. You should use the same config with which the model is trained. @@ -114,4 +111,23 @@ optional arguments: 4. `--output-dir` is the directory to save the synthesized audio files. 5. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. -## Fine-tuning +## Pretrained Models +The pretrained model can be downloaded here [hifigan_csmsc_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_ckpt_0.1.1.zip). + +The static model can be downloaded here [hifigan_csmsc_static_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_static_0.1.1.zip). + +Model | Step | eval/generator_loss | eval/mel_loss| eval/feature_matching_loss +:-------------:| :------------:| :-----: | :-----: | :--------: +default| 1(gpu) x 2500000|24.927|0.1262|7.554 + +HiFiGAN checkpoint contains files listed below. + +```text +hifigan_csmsc_ckpt_0.1.1 +├── default.yaml # default config used to train hifigan +├── feats_stats.npy # statistics used to normalize spectrogram when training hifigan +└── snapshot_iter_2500000.pdz # generator parameters of hifigan +``` + +## Acknowledgement +We adapted some code from https://github.com/kan-bayashi/ParallelWaveGAN. diff --git a/examples/csmsc/voc5/conf/default.yaml b/examples/csmsc/voc5/conf/default.yaml index 5192d3897f96673a5d6ef81ad93feba5be56af8c..f42fc385acf5bd527d6b2144c39242efe7ed800c 100644 --- a/examples/csmsc/voc5/conf/default.yaml +++ b/examples/csmsc/voc5/conf/default.yaml @@ -119,7 +119,7 @@ lambda_feat_match: 2.0 # Loss balancing coefficient for feat match loss.. ########################################################### batch_size: 16 # Batch size. batch_max_steps: 8400 # Length of each audio in batch. Make sure dividable by hop_size. -num_workers: 2 # Number of workers in Pytorch DataLoader. +num_workers: 2 # Number of workers in DataLoader. ########################################################### # OPTIMIZER & SCHEDULER SETTING # diff --git a/examples/csmsc/voc5/conf/finetune.yaml b/examples/csmsc/voc5/conf/finetune.yaml index 9876e93d01c8be8d9d9499a35b56f60c49654af9..73420625111641ffc088ecf34af528c5d13becd3 100644 --- a/examples/csmsc/voc5/conf/finetune.yaml +++ b/examples/csmsc/voc5/conf/finetune.yaml @@ -119,7 +119,7 @@ lambda_feat_match: 2.0 # Loss balancing coefficient for feat match loss.. ########################################################### batch_size: 16 # Batch size. batch_max_steps: 8400 # Length of each audio in batch. Make sure dividable by hop_size. -num_workers: 2 # Number of workers in Pytorch DataLoader. +num_workers: 2 # Number of workers in DataLoader. 
########################################################### # OPTIMIZER & SCHEDULER SETTING # diff --git a/examples/csmsc/voc5/finetune.sh b/examples/csmsc/voc5/finetune.sh index 4ab10e5b3775af433ae9c9f6a2e82966ca3361ba..6719bd0bef340e3629c83f78f08fe91771b62ca7 100755 --- a/examples/csmsc/voc5/finetune.sh +++ b/examples/csmsc/voc5/finetune.sh @@ -15,11 +15,13 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then --fastspeech2-stat=fastspeech2_nosil_baker_ckpt_0.4/speech_stats.npy \ --dur-file=durations.txt \ --output-dir=dump_finetune \ - --phones-dict=fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt + --phones-dict=fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt \ + --dataset=baker \ + --rootdir=~/datasets/BZNSYP/ fi if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then - python3 local/link_wav.py \ + python3 ${MAIN_ROOT}/utils/link_wav.py \ --old-dump-dir=dump \ --dump-dir=dump_finetune fi diff --git a/examples/csmsc/voc5/local/link_wav.py b/examples/csmsc/voc5/local/link_wav.py deleted file mode 100644 index c81e0d4b83320665b98720d09a940e9de6dc63cd..0000000000000000000000000000000000000000 --- a/examples/csmsc/voc5/local/link_wav.py +++ /dev/null @@ -1,85 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import argparse -import os -from operator import itemgetter -from pathlib import Path - -import jsonlines -import numpy as np - - -def main(): - # parse config and args - parser = argparse.ArgumentParser( - description="Preprocess audio and then extract features .") - - parser.add_argument( - "--old-dump-dir", - default=None, - type=str, - help="directory to dump feature files.") - parser.add_argument( - "--dump-dir", - type=str, - required=True, - help="directory to finetune dump feature files.") - args = parser.parse_args() - - old_dump_dir = Path(args.old_dump_dir).expanduser() - old_dump_dir = old_dump_dir.resolve() - dump_dir = Path(args.dump_dir).expanduser() - # use absolute path - dump_dir = dump_dir.resolve() - dump_dir.mkdir(parents=True, exist_ok=True) - - assert old_dump_dir.is_dir() - assert dump_dir.is_dir() - - for sub in ["train", "dev", "test"]: - # 把 old_dump_dir 里面的 *-wave.npy 软连接到 dump_dir 的对应位置 - output_dir = dump_dir / sub - output_dir.mkdir(parents=True, exist_ok=True) - results = [] - for name in os.listdir(output_dir / "raw"): - # 003918_feats.npy - utt_id = name.split("_")[0] - mel_path = output_dir / ("raw/" + name) - gen_mel = np.load(mel_path) - wave_name = utt_id + "_wave.npy" - wav = np.load(old_dump_dir / sub / ("raw/" + wave_name)) - os.symlink(old_dump_dir / sub / ("raw/" + wave_name), - output_dir / ("raw/" + wave_name)) - num_sample = wav.shape[0] - num_frames = gen_mel.shape[0] - wav_path = output_dir / ("raw/" + wave_name) - - record = { - "utt_id": utt_id, - "num_samples": num_sample, - "num_frames": num_frames, - "feats": str(mel_path), - "wave": str(wav_path), - } - results.append(record) - - results.sort(key=itemgetter("utt_id")) - - with jsonlines.open(output_dir / "raw/metadata.jsonl", 'w') as writer: - for item in results: - writer.write(item) - - -if __name__ == "__main__": - main() diff --git a/examples/esc50/README.md b/examples/esc50/README.md index 66409754d464b7a674bc28050350fef6c77ec48a..2ce57ae0641fad36132bd26e49f867e279a48a9c 100644 --- a/examples/esc50/README.md +++ b/examples/esc50/README.md @@ -17,21 +17,32 @@ PaddleAudio提供了PANNs的CNN14、CNN10和CNN6的预训练模型,可供用 - CNN6: 该模型主要包含4个卷积层和2个全连接层,模型参数的数量为4.5M,embbedding维度是512。 +## 数据集 + +[ESC-50: Dataset for Environmental Sound Classification](https://github.com/karolpiczak/ESC-50) 是一个包含有 2000 个带标签的环境声音样本,音频样本采样率为 44,100Hz 的单通道音频文件,所有样本根据标签被划分为 50 个类别,每个类别有 40 个样本。 + +## 模型指标 + +根据 `ESC-50` 提供的fold信息,对数据集进行 5-fold 的 fine-tune 训练和评估,平均准确率如下: + +|Model|Acc| +|--|--| +|CNN14| 0.9500 +|CNN10| 0.8975 +|CNN6| 0.8825 + ## 快速开始 ### 模型训练 -以环境声音分类数据集`ESC50`为示例,运行下面的命令,可在训练集上进行模型的finetune,支持单机的单卡训练和多卡训练。 +运行下面的命令,可在训练集上进行模型的finetune,支持单机的单卡训练和多卡训练。 启动训练: ```shell -$ CUDA_VISIBLE_DEVICES=0 ./run.sh 1 +$ CUDA_VISIBLE_DEVICES=0 ./run.sh 1 conf/panns.yaml ``` -`paddlespeech/cls/exps/panns/train.py` 脚本中可支持配置的参数: - -- `device`: 指定模型预测时使用的设备。 -- `feat_backend`: 选择提取特征的后端,可选`'numpy'`或`'paddle'`,默认为`'numpy'`。 +训练的参数可在 `conf/panns.yaml` 的 `training` 中配置,其中: - `epochs`: 训练轮次,默认为50。 - `learning_rate`: Fine-tune的学习率;默认为5e-5。 - `batch_size`: 批处理大小,请结合显存情况进行调整,若出现显存不足,请适当调低这一参数;默认为16。 @@ -40,36 +51,31 @@ $ CUDA_VISIBLE_DEVICES=0 ./run.sh 1 - `save_freq`: 训练过程中的模型保存频率,默认为10。 - `log_freq`: 训练过程中的信息打印频率,默认为10。 -示例代码中使用的预训练模型为`CNN14`,如果想更换为其他预训练模型,可通过以下方式执行: -```python -from paddleaudio.datasets import ESC50 -from paddlespeech.cls.models import SoundClassifier -from paddlespeech.cls.models import cnn14, cnn10, cnn6 - +示例代码中使用的预训练模型为`CNN14`,如果想更换为其他预训练模型,可通过修改 `conf/panns.yaml` 的 `model` 中配置: +```yaml # CNN14 -backbone = 
cnn14(pretrained=True, extract_embedding=True) -model = SoundClassifier(backbone, num_class=len(ESC50.label_list)) - +model: + backbone: 'paddlespeech.cls.models:cnn14' +``` +```yaml # CNN10 -backbone = cnn10(pretrained=True, extract_embedding=True) -model = SoundClassifier(backbone, num_class=len(ESC50.label_list)) - +model: + backbone: 'paddlespeech.cls.models:cnn10' +``` +```yaml # CNN6 -backbone = cnn6(pretrained=True, extract_embedding=True) -model = SoundClassifier(backbone, num_class=len(ESC50.label_list)) +model: + backbone: 'paddlespeech.cls.models:cnn6' ``` ### 模型预测 ```shell -$ CUDA_VISIBLE_DEVICES=0 ./run.sh 2 +$ CUDA_VISIBLE_DEVICES=0 ./run.sh 2 conf/panns.yaml ``` -`paddlespeech/cls/exps/panns/predict.py` 脚本中可支持配置的参数: - -- `device`: 指定模型预测时使用的设备。 -- `wav`: 指定预测的音频文件。 -- `feat_backend`: 选择提取特征的后端,可选`'numpy'`或`'paddle'`,默认为`'numpy'`。 +训练的参数可在 `conf/panns.yaml` 的 `predicting` 中配置,其中: +- `audio_file`: 指定预测的音频文件。 - `top_k`: 预测显示的top k标签的得分,默认为1。 - `checkpoint`: 模型参数checkpoint文件。 @@ -88,7 +94,7 @@ Cat: 6.579841738130199e-06 模型训练结束后,可以将已保存的动态图参数导出成静态图的模型和参数,然后实施静态图的部署。 ```shell -$ CUDA_VISIBLE_DEVICES=0 ./run.sh 3 +$ CUDA_VISIBLE_DEVICES=0 ./run.sh 3 ./checkpoint/epoch_50/model.pdparams ./export ``` `paddlespeech/cls/exps/panns/export_model.py` 脚本中可支持配置的参数: @@ -109,7 +115,7 @@ export `paddlespeech/cls/exps/panns/deploy/predict.py` 脚本使用了`paddle.inference`模块下的api,提供了python端部署的示例: ```shell -$ CUDA_VISIBLE_DEVICES=0 ./run.sh 4 +$ CUDA_VISIBLE_DEVICES=0 ./run.sh 4 cpu ./export /audio/dog.wav ``` `paddlespeech/cls/exps/panns/deploy/predict.py` 脚本中可支持配置的主要参数: diff --git a/examples/esc50/RESULTS.md b/examples/esc50/RESULTS.md new file mode 100644 index 0000000000000000000000000000000000000000..edbf07a3ffcef269f9e4a93cb58995d0b2788abd --- /dev/null +++ b/examples/esc50/RESULTS.md @@ -0,0 +1,9 @@ +## Metrics + +5-fold cross validation accuracy on [ESC-50](https://github.com/karolpiczak/ESC-50) dataset: + +|Model|Acc| +|--|--| +|CNN14| 0.9500 +|CNN10| 0.8975 +|CNN6| 0.8825 diff --git a/examples/esc50/cls0/conf/panns.yaml b/examples/esc50/cls0/conf/panns.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3a9d42aa5e52de31c6c461d471988bb2067e5cdf --- /dev/null +++ b/examples/esc50/cls0/conf/panns.yaml @@ -0,0 +1,36 @@ +data: + dataset: 'paddleaudio.datasets:ESC50' + num_classes: 50 + train: + mode: 'train' + split: 1 + dev: + mode: 'dev' + split: 1 + +model: + backbone: 'paddlespeech.cls.models:cnn14' + +feature: + sr: 32000 + n_fft: 1024 + hop_length: 320 + window: 'hann' + win_length: 1024 + f_min: 50.0 + f_max: 14000.0 + n_mels: 64 + +training: + epochs: 50 + learning_rate: 0.00005 + num_workers: 2 + batch_size: 16 + checkpoint_dir: './checkpoint' + save_freq: 10 + log_freq: 10 + +predicting: + audio_file: '/audio/dog.wav' + top_k: 10 + checkpoint: './checkpoint/epoch_50/model.pdparams' \ No newline at end of file diff --git a/examples/esc50/cls0/local/export.sh b/examples/esc50/cls0/local/export.sh index 160dc7432d28c650281684d0c7a539023bfd0fb0..9c854a19401f446565c50be47eb8731dab1684d3 100755 --- a/examples/esc50/cls0/local/export.sh +++ b/examples/esc50/cls0/local/export.sh @@ -1,8 +1,8 @@ #!/bin/bash -ckpt_dir=$1 +ckpt=$1 output_dir=$2 python3 ${BIN_DIR}/export_model.py \ ---checkpoint ${ckpt_dir}/model.pdparams \ +--checkpoint ${ckpt} \ --output_dir ${output_dir} diff --git a/examples/esc50/cls0/local/infer.sh b/examples/esc50/cls0/local/infer.sh index bc03d68106e6123e0f7b4dfff45bb0d9182f28c1..25d595be26e4a3f3a6f4808fb55f6c3d3f3cc638 100755 --- 
a/examples/esc50/cls0/local/infer.sh +++ b/examples/esc50/cls0/local/infer.sh @@ -1,11 +1,4 @@ #!/bin/bash -audio_file=$1 -ckpt_dir=$2 -feat_backend=$3 - python3 ${BIN_DIR}/predict.py \ ---wav ${audio_file} \ ---feat_backend ${feat_backend} \ ---top_k 10 \ ---checkpoint ${ckpt_dir}/model.pdparams +--cfg_path=$1 diff --git a/examples/esc50/cls0/local/train.sh b/examples/esc50/cls0/local/train.sh index 0f0f3d091a314fdadf39c27d4c35e9debaf0984c..cab547b84ec2b6da067d544e625ce4cd2760c94c 100755 --- a/examples/esc50/cls0/local/train.sh +++ b/examples/esc50/cls0/local/train.sh @@ -1,25 +1,12 @@ #!/bin/bash ngpu=$1 -feat_backend=$2 - -num_epochs=50 -batch_size=16 -ckpt_dir=./checkpoint -save_freq=10 +cfg_path=$2 if [ ${ngpu} -gt 0 ]; then python3 -m paddle.distributed.launch --gpus $CUDA_VISIBLE_DEVICES ${BIN_DIR}/train.py \ - --epochs ${num_epochs} \ - --feat_backend ${feat_backend} \ - --batch_size ${batch_size} \ - --checkpoint_dir ${ckpt_dir} \ - --save_freq ${save_freq} + --cfg_path ${cfg_path} else python3 ${BIN_DIR}/train.py \ - --epochs ${num_epochs} \ - --feat_backend ${feat_backend} \ - --batch_size ${batch_size} \ - --checkpoint_dir ${ckpt_dir} \ - --save_freq ${save_freq} + --cfg_path ${cfg_path} fi diff --git a/examples/esc50/cls0/run.sh b/examples/esc50/cls0/run.sh index 7283aa8d7cf85c2acd5bede3eaf77ffc03f848f3..0e407b40ed83db4ea6980228e9ea59fddf3515c1 100755 --- a/examples/esc50/cls0/run.sh +++ b/examples/esc50/cls0/run.sh @@ -6,28 +6,30 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') stage=$1 stop_stage=100 -feat_backend=numpy -audio_file=~/cat.wav -ckpt_dir=./checkpoint/epoch_50 -output_dir=./export -infer_device=cpu if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then - ./local/train.sh ${ngpu} ${feat_backend} || exit -1 + cfg_path=$2 + ./local/train.sh ${ngpu} ${cfg_path} || exit -1 exit 0 fi if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then - ./local/infer.sh ${audio_file} ${ckpt_dir} ${feat_backend} || exit -1 + cfg_path=$2 + ./local/infer.sh ${cfg_path} || exit -1 exit 0 fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then - ./local/export.sh ${ckpt_dir} ${output_dir} || exit -1 + ckpt=$2 + output_dir=$3 + ./local/export.sh ${ckpt} ${output_dir} || exit -1 exit 0 fi if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then - ./local/static_model_infer.sh ${infer_device} ${output_dir} ${audio_file} || exit -1 + infer_device=$2 + graph_dir=$3 + audio_file=$4 + ./local/static_model_infer.sh ${infer_device} ${graph_dir} ${audio_file} || exit -1 exit 0 fi diff --git a/examples/iwslt2012/punc0/README.md b/examples/iwslt2012/punc0/README.md index 1fcd954ca9bee8db56843a1adde526ebd6bd8fb8..74d599a21d9cc392abdbae60fcda81a65ff2e01d 100644 --- a/examples/iwslt2012/punc0/README.md +++ b/examples/iwslt2012/punc0/README.md @@ -1,35 +1,29 @@ -# 中文实验例程 -## 测试数据: -- IWLST2012中文:test2012 +# Punctuation Restoration with IWSLT2012-Zh -## 运行代码 -- 运行 `run.sh 0 0 conf/train_conf/bertBLSTM_zh.yaml 1 conf/data_conf/chinese.yaml ` +## Get Started +### Data Preprocessing +```bash +./run.sh --stage 0 --stop-stage 0 +``` +### Model Training +```bash +./run.sh --stage 1 --stop-stage 1 +``` +### Testing +```bash +./run.sh --stage 2 --stop-stage 2 +``` +### Punctuation Restoration +```bash +./run.sh --stage 3 --stop-stage 3 +``` +## Pretrained Model +The pretrained model can be downloaded here [ernie_linear_p3_iwslt2012_zh_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/text/ernie_linear_p3_iwslt2012_zh_ckpt_0.1.1.zip).
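For readers new to the task: `ErnieLinear` treats punctuation restoration as token classification — it predicts one of `num_classes` (here 4) labels after each input token, and the predicted marks are then spliced back into the unpunctuated text. A toy sketch of that splicing step; the tokens, label ids, and id-to-mark table are invented for illustration (the real mapping comes from `data/iwslt2012_zh/punc_vocab`):

```python
# Map predicted class ids back to punctuation marks (0 = no mark).
id_to_punc = {0: "", 1: ",", 2: "。", 3: "?"}  # hypothetical table

tokens = ["今天", "的", "天气", "真", "不错", "啊", "你", "下午", "有空", "吗"]
labels = [0, 0, 0, 0, 0, 1, 0, 0, 0, 3]  # one predicted class per token

restored = "".join(tok + id_to_punc[lab] for tok, lab in zip(tokens, labels))
print(restored)  # 今天的天气真不错啊,你下午有空吗?
```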
-## 实验结果: -- BertLinear - - 实验配置:conf/train_conf/bertLinear_zh.yaml - - 测试结果 - - | | COMMA | PERIOD | QUESTION | OVERALL | - |-----------|-----------|-----------|-----------|--------- | - |Precision | 0.425665 | 0.335190 | 0.698113 | 0.486323 | - |Recall | 0.511278 | 0.572108 | 0.787234 | 0.623540 | - |F1 | 0.464560 | 0.422717 | 0.740000 | 0.542426 | - -- BertBLSTM - - 实验配置:conf/train_conf/bertBLSTM_zh.yaml - - 测试结果 avg_1 - - | | COMMA | PERIOD | QUESTION | OVERALL | - |-----------|-----------|-----------|-----------|--------- | - |Precision | 0.469484 | 0.550604 | 0.801887 | 0.607325 | - |Recall | 0.580271 | 0.592408 | 0.817308 | 0.663329 | - |F1 | 0.519031 | 0.570741 | 0.809524 | 0.633099 | - - - BertBLSTM/avg_1测试标贝合成数据 - - | | COMMA | PERIOD | QUESTION | OVERALL | - |-----------|-----------|-----------|-----------|--------- | - |Precision | 0.217192 | 0.196339 | 0.820717 | 0.411416 | - |Recall | 0.205922 | 0.892531 | 0.416162 | 0.504872 | - |F1 | 0.211407 | 0.321873 | 0.552279 | 0.361853 | +### Test Result +- Ernie Linear + | |COMMA | PERIOD | QUESTION | OVERALL| + |:-----:|:-----:|:-----:|:-----:|:-----:| + |Precision |0.510955 |0.526462 |0.820755 |0.619391| + |Recall |0.517433 |0.564179 |0.861386 |0.647666| + |F1 |0.514173 |0.544669 |0.840580 |0.633141| diff --git a/examples/iwslt2012/punc0/conf/default.yaml b/examples/iwslt2012/punc0/conf/default.yaml new file mode 100644 index 0000000000000000000000000000000000000000..74ced99327b38dff4863718b75a7bff8096061c1 --- /dev/null +++ b/examples/iwslt2012/punc0/conf/default.yaml @@ -0,0 +1,44 @@ +########################################################### +# DATA SETTING # +########################################################### +dataset_type: Ernie +train_path: data/iwslt2012_zh/train.txt +dev_path: data/iwslt2012_zh/dev.txt +test_path: data/iwslt2012_zh/test.txt +batch_size: 64 +num_workers: 2 +data_params: + pretrained_token: ernie-1.0 + punc_path: data/iwslt2012_zh/punc_vocab + seq_len: 100 + + +########################################################### +# MODEL SETTING # +########################################################### +model_type: ErnieLinear +model: + pretrained_token: ernie-1.0 + num_classes: 4 + +########################################################### +# OPTIMIZER SETTING # +########################################################### +optimizer_params: + weight_decay: 1.0e-6 # weight decay coefficient. + +scheduler_params: + learning_rate: 1.0e-5 # learning rate. + gamma: 1.0 # scheduler gamma. 
+ +########################################################### +# TRAINING SETTING # +########################################################### +max_epoch: 20 +num_snapshots: 5 # max number of snapshots to keep while training + +########################################################### +# OTHER SETTING # +########################################################### +seed: 42 # random seed for paddle, random, and np.random diff --git a/examples/iwslt2012/punc0/conf/ernie_linear.yaml b/examples/iwslt2012/punc0/conf/ernie_linear.yaml deleted file mode 100644 index e00b793c1972941e32fb9c262156b4a0a763c1bf..0000000000000000000000000000000000000000 --- a/examples/iwslt2012/punc0/conf/ernie_linear.yaml +++ /dev/null @@ -1,36 +0,0 @@ -data: - dataset_type: Ernie - train_path: data/iwslt2012_zh/train.txt - dev_path: data/iwslt2012_zh/dev.txt - test_path: data/iwslt2012_zh/test.txt - data_params: - pretrained_token: ernie-1.0 - punc_path: data/iwslt2012_zh/punc_vocab - seq_len: 100 - batch_size: 64 - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 0 - -checkpoint: - kbest_n: 5 - latest_n: 10 - metric_type: F1 - -model_type: ErnieLinear - -model_params: - pretrained_token: ernie-1.0 - num_classes: 4 - -training: - n_epoch: 100 - lr: !!float 1e-5 - lr_decay: 1.0 - weight_decay: !!float 1e-06 - global_grad_clip: 5.0 - log_interval: 10 - log_path: log/train_ernie_linear.log - -testing: - log_path: log/test_ernie_linear.log diff --git a/examples/iwslt2012/punc0/local/avg.sh b/examples/iwslt2012/punc0/local/avg.sh deleted file mode 100644 index b8c14c6623cc30ca167cfb26fcc9ade349cad288..0000000000000000000000000000000000000000 --- a/examples/iwslt2012/punc0/local/avg.sh +++ /dev/null @@ -1,23 +0,0 @@ -#! /usr/bin/env bash - -if [ $# != 2 ]; then - echo "usage: ${0} ckpt_dir avg_num" - exit -1 -fi - -ckpt_dir=${1} -average_num=${2} -decode_checkpoint=${ckpt_dir}/avg_${average_num}.pdparams - -python3 -u ${BIN_DIR}/avg_model.py \ ---dst_model ${decode_checkpoint} \ ---ckpt_dir ${ckpt_dir} \ ---num ${average_num} \ ---val_best - -if [ $? -ne 0 ]; then - echo "Failed in avg ckpt!" - exit 1 -fi - -exit 0 \ No newline at end of file diff --git a/examples/iwslt2012/punc0/local/data.sh b/examples/iwslt2012/punc0/local/data.sh old mode 100644 new mode 100755 diff --git a/examples/iwslt2012/punc0/local/punc_restore.sh b/examples/iwslt2012/punc0/local/punc_restore.sh new file mode 100755 index 0000000000000000000000000000000000000000..30a4f12f8040cbb1617aa267ff68fe0248e01951 --- /dev/null +++ b/examples/iwslt2012/punc0/local/punc_restore.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +config_path=$1 +train_output_path=$2 +ckpt_name=$3 +text=$4 +ckpt_prefix=${ckpt_name%.*} + +python3 ${BIN_DIR}/punc_restore.py \ + --config=${config_path} \ + --checkpoint=${train_output_path}/checkpoints/${ckpt_name} \ + --text=${text} diff --git a/examples/iwslt2012/punc0/local/test.sh b/examples/iwslt2012/punc0/local/test.sh old mode 100644 new mode 100755 index ee02246222b7ed6c7e780cde443abf537da7f14c..94e508b5bff9ae0d7b3253fafe3ebb943d44a97a --- a/examples/iwslt2012/punc0/local/test.sh +++ b/examples/iwslt2012/punc0/local/test.sh @@ -1,26 +1,11 @@ - #!/bin/bash -if [ $# != 2 ];then - echo "usage: ${0} config_path ckpt_path_prefix" - exit -1 -fi - -ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') -echo "using $ngpu gpus..."
- config_path=$1 -ckpt_prefix=$2 - -python3 -u ${BIN_DIR}/test.py \ ---ngpu 1 \ ---config ${config_path} \ ---result_file ${ckpt_prefix}.rsl \ ---checkpoint_path ${ckpt_prefix} +train_output_path=$2 +ckpt_name=$3 -if [ $? -ne 0 ]; then - echo "Failed in evaluation!" - exit 1 -fi +ckpt_prefix=${ckpt_name%.*} -exit 0 +python3 ${BIN_DIR}/test.py \ + --config=${config_path} \ + --checkpoint=${train_output_path}/checkpoints/${ckpt_name} diff --git a/examples/iwslt2012/punc0/local/train.sh b/examples/iwslt2012/punc0/local/train.sh old mode 100644 new mode 100755 index 9fabb8f75b2b00364bd6b878d9d18f058b8d91b9..85227eacbe518f59655a3b8013363c97d192b396 --- a/examples/iwslt2012/punc0/local/train.sh +++ b/examples/iwslt2012/punc0/local/train.sh @@ -1,28 +1,9 @@ #!/bin/bash -if [ $# != 3 ];then - echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name log_dir" - exit -1 -fi - -ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') -echo "using $ngpu gpus..." - config_path=$1 -ckpt_name=$2 -log_dir=$3 - -mkdir -p exp - -python3 -u ${BIN_DIR}/train.py \ ---ngpu ${ngpu} \ ---config ${config_path} \ ---output_dir exp/${ckpt_name} \ ---log_dir ${log_dir} - -if [ $? -ne 0 ]; then - echo "Failed in training!" - exit 1 -fi +train_output_path=$2 -exit 0 +python3 ${BIN_DIR}/train.py \ + --config=${config_path} \ + --output-dir=${train_output_path} \ + --ngpu=1 diff --git a/examples/iwslt2012/punc0/path.sh b/examples/iwslt2012/punc0/path.sh old mode 100644 new mode 100755 index 8f67f9c938d36013bdb2f37f781ae3f5a9b43524..da790261f8da529f45cacf05c693c16dd20bd84b --- a/examples/iwslt2012/punc0/path.sh +++ b/examples/iwslt2012/punc0/path.sh @@ -10,5 +10,5 @@ export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/ -MODEL=$1 +MODEL=ernie_linear export BIN_DIR=${MAIN_ROOT}/paddlespeech/text/exps/${MODEL} diff --git a/examples/iwslt2012/punc0/run.sh b/examples/iwslt2012/punc0/run.sh index 8d786a198c5671b98dc615f3eed7a3c44eed64ab..0c14eb7e23a7c02204adb933381ec9343e7041b6 100755 --- a/examples/iwslt2012/punc0/run.sh +++ b/examples/iwslt2012/punc0/run.sh @@ -1,40 +1,35 @@ #!/bin/bash set -e +source path.sh -if [ $# -ne 4 ]; then - echo "usage: bash ./run.sh stage gpu train_config avg_num" - echo "eg: bash ./run.sh 1 0 train_config 1" - exit -1 -fi - -stage=$1 +gpus=0,1 +stage=0 stop_stage=100 -gpus=$2 -conf_path=$3 -avg_num=$4 -avg_ckpt=avg_${avg_num} -ckpt=$(basename ${conf_path} | awk -F'.' '{print $1}') -log_dir=log -source path.sh ${ckpt} +conf_path=conf/default.yaml +train_output_path=exp/default +ckpt_name=snapshot_iter_12840.pdz +text=今天的天气真不错啊你下午有空吗我想约你一起去吃饭 +# with the following command, you can choose the stage range you want to run +# such as `./run.sh --stage 0 --stop-stage 0` +# note that this cannot be mixed with positional arguments `$1`, `$2` ...
+source ${MAIN_ROOT}/utils/parse_options.sh || exit 1 -if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then # prepare data - bash ./local/data.sh + ./local/data.sh fi -if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then - # train model, all `ckpt` under `exp` dir - CUDA_VISIBLE_DEVICES=${gpus} bash ./local/train.sh ${conf_path} ${ckpt} ${log_dir} +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # train model, all `ckpt` under `train_output_path/checkpoints/` dir + CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1 fi -if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then - # avg n best model - bash ./local/avg.sh exp/${ckpt}/checkpoints ${avg_num} +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 fi -if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then - # test ckpt avg_n - CUDA_VISIBLE_DEVICES=${gpus} bash ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 -fi +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + CUDA_VISIBLE_DEVICES=${gpus} ./local/punc_restore.sh ${conf_path} ${train_output_path} ${ckpt_name} ${text}|| exit -1 +fi \ No newline at end of file diff --git a/examples/librispeech/asr0/conf/deepspeech2.yaml b/examples/librispeech/asr0/conf/deepspeech2.yaml index f3574e15025fea63a4d6434a2ee2a1ac25856758..0307b9f39ecee66d75fe3a0e3e81651e5f23518b 100644 --- a/examples/librispeech/asr0/conf/deepspeech2.yaml +++ b/examples/librispeech/asr0/conf/deepspeech2.yaml @@ -1,68 +1,65 @@ # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev-clean - test_manifest: data/manifest.test-clean - min_input_len: 0.0 - max_input_len: 30.0 # second - min_output_len: 0.0 - max_output_len: .inf - min_output_input_ratio: 0.00 - max_output_input_ratio: .inf +########################################### +# Data # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev-clean +test_manifest: data/manifest.test-clean +min_input_len: 0.0 +max_input_len: 30.0 # second +min_output_len: 0.0 +max_output_len: .inf +min_output_input_ratio: 0.00 +max_output_input_ratio: .inf -collator: - batch_size: 20 - mean_std_filepath: data/mean_std.json - unit_type: char - vocab_filepath: data/lang_char/vocab.txt - augmentation_config: conf/augmentation.json - random_seed: 0 - spm_model_prefix: - spectrum_type: linear - target_sample_rate: 16000 - max_freq: None - n_fft: None - stride_ms: 10.0 - window_ms: 20.0 - delta_delta: False - dither: 1.0 - use_dB_normalization: True - target_dB: -20 - random_seed: 0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 +########################################### +# Dataloader # +########################################### +batch_size: 20 +mean_std_filepath: data/mean_std.json +unit_type: char +vocab_filepath: data/lang_char/vocab.txt +augmentation_config: conf/augmentation.json +random_seed: 0 +spm_model_prefix: +spectrum_type: linear +feat_dim: +target_sample_rate: 16000 +max_freq: None +n_fft: None +stride_ms: 10.0 +window_ms: 20.0 +delta_delta: False +dither: 1.0 +use_dB_normalization: True +target_dB: -20 +random_seed: 0 +keep_transcription_text: False +sortagrad: True +shuffle_method: batch_shuffle +num_workers: 2 -model: - num_conv_layers: 2 - num_rnn_layers: 3 - 
rnn_layer_size: 2048 - use_gru: False - share_rnn_weights: True - blank_id: 0 +############################################ +# Network Architecture # +############################################ +num_conv_layers: 2 +num_rnn_layers: 3 +rnn_layer_size: 2048 +use_gru: False +share_rnn_weights: True +blank_id: 0 -training: - n_epoch: 50 - accum_grad: 1 - lr: 1e-3 - lr_decay: 0.83 - weight_decay: 1e-06 - global_grad_clip: 5.0 - log_interval: 100 - checkpoint: - kbest_n: 50 - latest_n: 5 - -decoding: - batch_size: 128 - error_rate_type: wer - decoding_method: ctc_beam_search - lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm - alpha: 1.9 - beta: 0.3 - beam_size: 500 - cutoff_prob: 1.0 - cutoff_top_n: 40 - num_proc_bsearch: 8 +########################################### +# Training # +########################################### +n_epoch: 50 +accum_grad: 1 +lr: 1.0e-3 +lr_decay: 0.83 +weight_decay: 1.0e-6 +global_grad_clip: 5.0 +log_interval: 100 +checkpoint: + kbest_n: 50 + latest_n: 5 diff --git a/examples/librispeech/asr0/conf/deepspeech2_online.yaml b/examples/librispeech/asr0/conf/deepspeech2_online.yaml index 0d16bc5714e46697719d218af696ed6be9c9478c..a0d2bcfe277e84852f3f8526a5cb6a35dee88079 100644 --- a/examples/librispeech/asr0/conf/deepspeech2_online.yaml +++ b/examples/librispeech/asr0/conf/deepspeech2_online.yaml @@ -1,70 +1,67 @@ # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev-clean - test_manifest: data/manifest.test-clean - min_input_len: 0.0 - max_input_len: 30.0 # second - min_output_len: 0.0 - max_output_len: .inf - min_output_input_ratio: 0.00 - max_output_input_ratio: .inf +########################################### +# Data # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev-clean +test_manifest: data/manifest.test-clean +min_input_len: 0.0 +max_input_len: 30.0 # second +min_output_len: 0.0 +max_output_len: .inf +min_output_input_ratio: 0.00 +max_output_input_ratio: .inf -collator: - batch_size: 15 - mean_std_filepath: data/mean_std.json - unit_type: char - vocab_filepath: data/lang_char/vocab.txt - augmentation_config: conf/augmentation.json - random_seed: 0 - spm_model_prefix: - spectrum_type: linear - target_sample_rate: 16000 - max_freq: None - n_fft: None - stride_ms: 10.0 - window_ms: 20.0 - delta_delta: False - dither: 1.0 - use_dB_normalization: True - target_dB: -20 - random_seed: 0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 0 +########################################### +# Dataloader # +########################################### +batch_size: 15 +mean_std_filepath: data/mean_std.json +unit_type: char +vocab_filepath: data/lang_char/vocab.txt +augmentation_config: conf/augmentation.json +random_seed: 0 +spm_model_prefix: +spectrum_type: linear +feat_dim: +target_sample_rate: 16000 +max_freq: None +n_fft: None +stride_ms: 10.0 +window_ms: 20.0 +delta_delta: False +dither: 1.0 +use_dB_normalization: True +target_dB: -20 +random_seed: 0 +keep_transcription_text: False +sortagrad: True +shuffle_method: batch_shuffle +num_workers: 0 -model: - num_conv_layers: 2 - num_rnn_layers: 3 - rnn_layer_size: 2048 - rnn_direction: forward - num_fc_layers: 2 - fc_layers_size_list: 512, 256 - use_gru: False - blank_id: 0 +############################################ +# Network Architecture # +############################################ +num_conv_layers: 2 +num_rnn_layers: 3 +rnn_layer_size: 
2048 +rnn_direction: forward +num_fc_layers: 2 +fc_layers_size_list: 512, 256 +use_gru: False +blank_id: 0 -training: - n_epoch: 50 - accum_grad: 4 - lr: 1e-3 - lr_decay: 0.83 - weight_decay: 1e-06 - global_grad_clip: 5.0 - log_interval: 100 - checkpoint: - kbest_n: 50 - latest_n: 5 - -decoding: - batch_size: 128 - error_rate_type: wer - decoding_method: ctc_beam_search - lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm - alpha: 1.9 - beta: 0.3 - beam_size: 500 - cutoff_prob: 1.0 - cutoff_top_n: 40 - num_proc_bsearch: 8 +########################################### +# Training # +########################################### +n_epoch: 50 +accum_grad: 4 +lr: 1.0e-3 +lr_decay: 0.83 +weight_decay: 1.0e-6 +global_grad_clip: 5.0 +log_interval: 100 +checkpoint: + kbest_n: 50 + latest_n: 5 diff --git a/examples/librispeech/asr0/conf/tuning/chunk_decode.yaml b/examples/librispeech/asr0/conf/tuning/chunk_decode.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e07026ba5376dbf47905268efc85b422b1c4b6f8 --- /dev/null +++ b/examples/librispeech/asr0/conf/tuning/chunk_decode.yaml @@ -0,0 +1,10 @@ +decode_batch_size: 128 +error_rate_type: wer +decoding_method: ctc_beam_search +lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm +alpha: 1.9 +beta: 0.3 +beam_size: 500 +cutoff_prob: 1.0 +cutoff_top_n: 40 +num_proc_bsearch: 8 \ No newline at end of file diff --git a/examples/librispeech/asr0/conf/tuning/decode.yaml b/examples/librispeech/asr0/conf/tuning/decode.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e07026ba5376dbf47905268efc85b422b1c4b6f8 --- /dev/null +++ b/examples/librispeech/asr0/conf/tuning/decode.yaml @@ -0,0 +1,10 @@ +decode_batch_size: 128 +error_rate_type: wer +decoding_method: ctc_beam_search +lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm +alpha: 1.9 +beta: 0.3 +beam_size: 500 +cutoff_prob: 1.0 +cutoff_top_n: 40 +num_proc_bsearch: 8 \ No newline at end of file diff --git a/examples/librispeech/asr0/local/test.sh b/examples/librispeech/asr0/local/test.sh index a627ef72274e3d67219701d34956fdf2b3a364b2..ea40046b10997ee425d4e654b89fedc732c8b3fe 100755 --- a/examples/librispeech/asr0/local/test.sh +++ b/examples/librispeech/asr0/local/test.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 3 ];then - echo "usage: ${0} config_path ckpt_path_prefix model_type" +if [ $# != 4 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix model_type" exit -1 fi @@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." 
 config_path=$1
-ckpt_prefix=$2
-model_type=$3
+decode_config_path=$2
+ckpt_prefix=$3
+model_type=$4
 
 # download language model
 bash local/download_lm_en.sh
@@ -21,6 +22,7 @@ fi
 python3 -u ${BIN_DIR}/test.py \
 --ngpu ${ngpu} \
 --config ${config_path} \
+--decode_cfg ${decode_config_path} \
 --result_file ${ckpt_prefix}.rsl \
 --checkpoint_path ${ckpt_prefix} \
 --model_type ${model_type}
diff --git a/examples/librispeech/asr0/local/test_wav.sh b/examples/librispeech/asr0/local/test_wav.sh
index e8337da7f699f35c1c6c6a74a8650deada03785d..25cfc45e3ad532ecb51f6116def2dac5899824d0 100755
--- a/examples/librispeech/asr0/local/test_wav.sh
+++ b/examples/librispeech/asr0/local/test_wav.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 
-if [ $# != 4 ];then
-    echo "usage: ${0} config_path ckpt_path_prefix model_type audio_file"
+if [ $# != 5 ];then
+    echo "usage: ${0} config_path decode_config_path ckpt_path_prefix model_type audio_file"
     exit -1
 fi
 
@@ -9,9 +9,10 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
 echo "using $ngpu gpus..."
 
 config_path=$1
-ckpt_prefix=$2
-model_type=$3
-audio_file=$4
+decode_config_path=$2
+ckpt_prefix=$3
+model_type=$4
+audio_file=$5
 
 mkdir -p data
 wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/en/demo_002_en.wav -P data/
@@ -33,6 +34,7 @@ fi
 python3 -u ${BIN_DIR}/test_wav.py \
 --ngpu ${ngpu} \
 --config ${config_path} \
+--decode_cfg ${decode_config_path} \
 --result_file ${ckpt_prefix}.rsl \
 --checkpoint_path ${ckpt_prefix} \
 --model_type ${model_type} \
diff --git a/examples/librispeech/asr0/run.sh b/examples/librispeech/asr0/run.sh
index 5d811b65324b7b7e4b6843a80fe5c6b6c07641bb..ca2c2b9da39d739520a6a6c4e00da6dc93cff1ef 100755
--- a/examples/librispeech/asr0/run.sh
+++ b/examples/librispeech/asr0/run.sh
@@ -6,6 +6,7 @@ gpus=0,1,2,3,4,5,6,7
 stage=0
 stop_stage=100
 conf_path=conf/deepspeech2.yaml
+decode_conf_path=conf/tuning/decode.yaml
 avg_num=30
 model_type=offline
 audio_file=data/demo_002_en.wav
@@ -33,7 +34,7 @@ fi
 
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
     # test ckpt avg_n
-    CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type} || exit -1
+    CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type} || exit -1
 fi
 
 if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
@@ -43,5 +44,5 @@ fi
 
 if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
     # test a single .wav file
-    CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type} ${audio_file} || exit -1
+    CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type} ${audio_file} || exit -1
 fi
diff --git a/examples/librispeech/asr1/conf/chunk_conformer.yaml b/examples/librispeech/asr1/conf/chunk_conformer.yaml
index 2872b69ef103a6f2e5b7ea1dbbcccf7c872ad2a8..72b9cb7be478f289b94504afe4df3267c3afba67 100644
--- a/examples/librispeech/asr1/conf/chunk_conformer.yaml
+++ b/examples/librispeech/asr1/conf/chunk_conformer.yaml
@@ -1,115 +1,99 @@
-# network architecture
-model:
-  cmvn_file:
-  cmvn_file_type: "json"
-  # encoder related
-  encoder: conformer
-  encoder_conf:
-    output_size: 256 # dimension of attention
-    attention_heads: 4
-    linear_units: 2048 # the number of units of position-wise feed forward
-    num_blocks: 12 # the number of encoder blocks
-    dropout_rate: 0.1
-    positional_dropout_rate: 0.1
-    attention_dropout_rate: 0.0
-    input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8
-    normalize_before: True
-    use_cnn_module: True
-    cnn_module_kernel: 15
-    activation_type: 'swish'
-    pos_enc_layer_type: 'rel_pos'
-    selfattention_layer_type: 'rel_selfattn'
-    causal: True
-    use_dynamic_chunk: true
-    cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster
-    use_dynamic_left_chunk: false
+############################################
+# Network Architecture                     #
+############################################
+cmvn_file:
+cmvn_file_type: "json"
+# encoder related
+encoder: conformer
+encoder_conf:
+  output_size: 256 # dimension of attention
+  attention_heads: 4
+  linear_units: 2048 # the number of units of position-wise feed forward
+  num_blocks: 12 # the number of encoder blocks
+  dropout_rate: 0.1
+  positional_dropout_rate: 0.1
+  attention_dropout_rate: 0.0
+  input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8
+  normalize_before: True
+  use_cnn_module: True
+  cnn_module_kernel: 15
+  activation_type: 'swish'
+  pos_enc_layer_type: 'rel_pos'
+  selfattention_layer_type: 'rel_selfattn'
+  causal: True
+  use_dynamic_chunk: true
+  cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster
+  use_dynamic_left_chunk: false
 
-  # decoder related
-  decoder: transformer
-  decoder_conf:
-    attention_heads: 4
-    linear_units: 2048
-    num_blocks: 6
-    dropout_rate: 0.1
-    positional_dropout_rate: 0.1
-    self_attention_dropout_rate: 0.0
-    src_attention_dropout_rate: 0.0
+# decoder related
+decoder: transformer
+decoder_conf:
+  attention_heads: 4
+  linear_units: 2048
+  num_blocks: 6
+  dropout_rate: 0.1
+  positional_dropout_rate: 0.1
+  self_attention_dropout_rate: 0.0
+  src_attention_dropout_rate: 0.0
 
-  # hybrid CTC/attention
-  model_conf:
-    ctc_weight: 0.3
-    lsm_weight: 0.1 # label smoothing option
-    length_normalized_loss: false
+# hybrid CTC/attention
+model_conf:
+  ctc_weight: 0.3
+  lsm_weight: 0.1 # label smoothing option
+  length_normalized_loss: false
 
-data:
-  train_manifest: data/manifest.train
-  dev_manifest: data/manifest.dev
-  test_manifest: data/manifest.test
+###########################################
+# Data                                    #
+###########################################
+train_manifest: data/manifest.train
+dev_manifest: data/manifest.dev
+test_manifest: data/manifest.test
 
+###########################################
+# Dataloader                              #
+###########################################
+vocab_filepath: data/lang_char/vocab.txt
+unit_type: 'spm'
+spm_model_prefix: 'data/lang_char/bpe_unigram_5000'
+mean_std_filepath: ""
+preprocess_config: conf/preprocess.yaml
+feat_dim: 80
+stride_ms: 10.0
+window_ms: 25.0
+sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
+batch_size: 16
+maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced
+maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced
+minibatches: 0 # for debug
+batch_count: auto
+batch_bins: 0
+batch_frames_in: 0
+batch_frames_out: 0
+batch_frames_inout: 0
+num_workers: 0
+subsampling_factor: 1
+num_encs: 1
 
-collator:
-  vocab_filepath: data/lang_char/vocab.txt
-  unit_type: 'spm'
-  spm_model_prefix: 'data/lang_char/bpe_unigram_5000'
-  mean_std_filepath: ""
-  augmentation_config: conf/preprocess.yaml
-  batch_size: 16
-  raw_wav: True # use raw_wav or kaldi feature
-  spectrum_type: fbank #linear, mfcc, fbank
-  feat_dim: 80
-  delta_delta: False
-  dither: 1.0
-  target_sample_rate: 16000
-  max_freq: None
-  n_fft: None
-  stride_ms: 10.0
-  window_ms: 25.0
-  use_dB_normalization: True
-  target_dB: -20
-  random_seed: 0
-  keep_transcription_text: False
-  sortagrad: True
-  shuffle_method: batch_shuffle
-  num_workers: 2
+###########################################
+# Training                                #
+###########################################
+n_epoch: 120
+accum_grad: 8
+global_grad_clip: 5.0
+optim: adam
+optim_conf:
+  lr: 0.001
+  weight_decay: 1.0e-06
+scheduler: warmuplr
+scheduler_conf:
+  warmup_steps: 25000
+  lr_decay: 1.0
+log_interval: 100
+checkpoint:
+  kbest_n: 50
+  latest_n: 5
 
-training:
-  n_epoch: 240
-  accum_grad: 8
-  global_grad_clip: 5.0
-  optim: adam
-  optim_conf:
-    lr: 0.001
-    weight_decay: 1e-06
-  scheduler: warmuplr
-  scheduler_conf:
-    warmup_steps: 25000
-    lr_decay: 1.0
-  log_interval: 100
-  checkpoint:
-    kbest_n: 50
-    latest_n: 5
-
-
-decoding:
-  batch_size: 128
-  error_rate_type: wer
-  decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
-  lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
-  alpha: 2.5
-  beta: 0.3
-  beam_size: 10
-  cutoff_prob: 1.0
-  cutoff_top_n: 0
-  num_proc_bsearch: 8
-  ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
-  decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
-                          # <0: for decoding, use full chunk.
-                          # >0: for decoding, use fixed chunk size as set.
-                          # 0: used for training, it's prohibited here.
-  num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
-  simulate_streaming: true # simulate streaming inference. Defaults to False.
-
diff --git a/examples/librispeech/asr1/conf/chunk_transformer.yaml b/examples/librispeech/asr1/conf/chunk_transformer.yaml
index 275e940aff930abe47da311abd3408d48cf8589b..19ade8ad27ec7a31903ba9f721f47ae91e59ed8a 100644
--- a/examples/librispeech/asr1/conf/chunk_transformer.yaml
+++ b/examples/librispeech/asr1/conf/chunk_transformer.yaml
@@ -1,106 +1,89 @@
-# network architecture
-model:
-  cmvn_file:
-  cmvn_file_type: "json"
-  # encoder related
-  encoder: transformer
-  encoder_conf:
-    output_size: 256 # dimension of attention
-    attention_heads: 4
-    linear_units: 2048 # the number of units of position-wise feed forward
-    num_blocks: 12 # the number of encoder blocks
-    dropout_rate: 0.1
-    positional_dropout_rate: 0.1
-    attention_dropout_rate: 0.0
-    input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8
-    normalize_before: true
-    use_dynamic_chunk: true
-    use_dynamic_left_chunk: false
+############################################
+# Network Architecture                     #
+############################################
+cmvn_file:
+cmvn_file_type: "json"
+# encoder related
+encoder: transformer
+encoder_conf:
+  output_size: 256 # dimension of attention
+  attention_heads: 4
+  linear_units: 2048 # the number of units of position-wise feed forward
+  num_blocks: 12 # the number of encoder blocks
+  dropout_rate: 0.1
+  positional_dropout_rate: 0.1
+  attention_dropout_rate: 0.0
+  input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8
+  normalize_before: true
+  use_dynamic_chunk: true
+  use_dynamic_left_chunk: false
 
-  # decoder related
-  decoder: transformer
-  decoder_conf:
-    attention_heads: 4
-    linear_units: 2048
-    num_blocks: 6
-    dropout_rate: 0.1
-    positional_dropout_rate: 0.1
-    self_attention_dropout_rate: 0.0
-    src_attention_dropout_rate: 0.0
+# decoder related
+decoder: transformer
+decoder_conf:
+  attention_heads: 4
+  linear_units: 2048
+  num_blocks: 6
+  dropout_rate: 0.1
+  positional_dropout_rate: 0.1
+  self_attention_dropout_rate: 0.0
+  src_attention_dropout_rate: 0.0
 
-  # hybrid CTC/attention
-  model_conf:
-    ctc_weight: 0.3
-    lsm_weight: 0.1 # label smoothing option
-    length_normalized_loss: false
+# hybrid CTC/attention
+model_conf:
+  ctc_weight: 0.3
+  lsm_weight: 0.1 # label smoothing option
+  length_normalized_loss: false
 
+###########################################
+# Data                                    #
+###########################################
+train_manifest: data/manifest.train
+dev_manifest: data/manifest.dev
+test_manifest: data/manifest.test
 
-data:
-  train_manifest: data/manifest.train
-  dev_manifest: data/manifest.dev
-  test_manifest: data/manifest.test
 
-collator:
-  vocab_filepath: data/lang_char/vocab.txt
-  unit_type: 'spm'
-  spm_model_prefix: 'data/lang_char/bpe_unigram_5000'
-  mean_std_filepath: ""
-  augmentation_config: conf/preprocess.yaml
-  batch_size: 64
-  raw_wav: True # use raw_wav or kaldi feature
-  spectrum_type: fbank #linear, mfcc, fbank
-  feat_dim: 80
-  delta_delta: False
-  dither: 1.0
-  target_sample_rate: 16000
-  max_freq: None
-  n_fft: None
-  stride_ms: 10.0
-  window_ms: 25.0
-  use_dB_normalization: True
-  target_dB: -20
-  random_seed: 0
-  keep_transcription_text: False
-  sortagrad: True
-  shuffle_method: batch_shuffle
-  num_workers: 2
-
-
-training:
-  n_epoch: 120
-  accum_grad: 1
-  global_grad_clip: 5.0
-  optim: adam
-  optim_conf:
-    lr: 0.001
-    weight_decay: 1e-06
-  scheduler: warmuplr
-  scheduler_conf:
-    warmup_steps: 25000
-    lr_decay: 1.0
-  log_interval: 100
-  checkpoint:
-    kbest_n: 50
-    latest_n: 5
-
-
-decoding:
-  batch_size: 64
-  error_rate_type: wer
-  decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
-  lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
-  alpha: 2.5
-  beta: 0.3
-  beam_size: 10
-  cutoff_prob: 1.0
-  cutoff_top_n: 0
-  num_proc_bsearch: 8
-  ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
-  decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
-                          # <0: for decoding, use full chunk.
-                          # >0: for decoding, use fixed chunk size as set.
-                          # 0: used for training, it's prohibited here.
-  num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
-  simulate_streaming: true # simulate streaming inference. Defaults to False.
+###########################################
+# Dataloader                              #
+###########################################
+vocab_filepath: data/lang_char/vocab.txt
+unit_type: 'spm'
+spm_model_prefix: 'data/lang_char/bpe_unigram_5000'
+mean_std_filepath: ""
+preprocess_config: conf/preprocess.yaml
+feat_dim: 80
+stride_ms: 10.0
+window_ms: 25.0
+sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
+batch_size: 64
+maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced
+maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced
+minibatches: 0 # for debug
+batch_count: auto
+batch_bins: 0
+batch_frames_in: 0
+batch_frames_out: 0
+batch_frames_inout: 0
+num_workers: 0
+subsampling_factor: 1
+num_encs: 1
 
+###########################################
+# Training                                #
+###########################################
+n_epoch: 120
+accum_grad: 1
+global_grad_clip: 5.0
+optim: adam
+optim_conf:
+  lr: 0.001
+  weight_decay: 1.0e-06
+scheduler: warmuplr
+scheduler_conf:
+  warmup_steps: 25000
+  lr_decay: 1.0
+log_interval: 100
+checkpoint:
+  kbest_n: 50
+  latest_n: 5
\ No newline at end of file
diff --git a/examples/librispeech/asr1/conf/conformer.yaml b/examples/librispeech/asr1/conf/conformer.yaml
index 1193f14b1e717d751815e548677db70b369f5987..4f7b759be05c50be68862e12cb6a6bc5adb1a02f 100644
--- a/examples/librispeech/asr1/conf/conformer.yaml
+++ b/examples/librispeech/asr1/conf/conformer.yaml
@@ -1,104 +1,96 @@
-# network architecture
-model:
-  cmvn_file:
-  cmvn_file_type: "json"
-  # encoder related
-  encoder: conformer
-  encoder_conf:
-    output_size: 256 # dimension of attention
-    attention_heads: 4
-    linear_units: 2048 # the number of units of position-wise feed forward
-    num_blocks: 12 # the number of encoder blocks
-    dropout_rate: 0.1
-    positional_dropout_rate: 0.1
-    attention_dropout_rate: 0.0
-    input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8
-    normalize_before: True
-    use_cnn_module: True
-    cnn_module_kernel: 15
-    activation_type: 'swish'
-    pos_enc_layer_type: 'rel_pos'
-    selfattention_layer_type: 'rel_selfattn'
+############################################
+# Network Architecture                     #
+############################################
+cmvn_file:
+cmvn_file_type: "json"
+# encoder related
+encoder: conformer
+encoder_conf:
+  output_size: 256 # dimension of attention
+  attention_heads: 4
+  linear_units: 2048 # the number of units of position-wise feed forward
+  num_blocks: 12 # the number of encoder blocks
+  dropout_rate: 0.1
+  positional_dropout_rate: 0.1
+  attention_dropout_rate: 0.0
+  input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8
+  normalize_before: True
+  use_cnn_module: True
+  cnn_module_kernel: 15
+  activation_type: 'swish'
+  pos_enc_layer_type: 'rel_pos'
+  selfattention_layer_type: 'rel_selfattn'
 
-  # decoder related
-  decoder: transformer
-  decoder_conf:
-    attention_heads: 4
-    linear_units: 2048
-    num_blocks: 6
-    dropout_rate: 0.1
-    positional_dropout_rate: 0.1
-    self_attention_dropout_rate: 0.0
-    src_attention_dropout_rate: 0.0
+# decoder related
+decoder: transformer
+decoder_conf:
+  attention_heads: 4
+  linear_units: 2048
+  num_blocks: 6
+  dropout_rate: 0.1
+  positional_dropout_rate: 0.1
+  self_attention_dropout_rate: 0.0
+  src_attention_dropout_rate: 0.0
 
-  # hybrid CTC/attention
-  model_conf:
-    ctc_weight: 0.3
-    lsm_weight: 0.1 # label smoothing option
-    length_normalized_loss: false
+# hybrid CTC/attention
+model_conf:
+  ctc_weight: 0.3
+  ctc_grad_norm_type: null
+  lsm_weight: 0.1 # label smoothing option
+  length_normalized_loss: false
 
-data:
-  train_manifest: data/manifest.train
-  dev_manifest: data/manifest.dev
-  test_manifest: data/manifest.test-clean
+###########################################
+# Data                                    #
+###########################################
+train_manifest: data/manifest.train
+dev_manifest: data/manifest.dev
+test_manifest: data/manifest.test-clean
 
-collator:
-  vocab_filepath: data/lang_char/vocab.txt
-  unit_type: 'spm'
-  spm_model_prefix: 'data/lang_char/bpe_unigram_5000'
-  mean_std_filepath: ""
-  augmentation_config: conf/preprocess.yaml
-  batch_size: 16
-  raw_wav: True # use raw_wav or kaldi feature
-  spectrum_type: fbank #linear, mfcc, fbank
-  feat_dim: 80
-  delta_delta: False
-  dither: 1.0
-  target_sample_rate: 16000
-  max_freq: None
-  n_fft: None
-  stride_ms: 10.0
-  window_ms: 25.0
-  use_dB_normalization: True
-  target_dB: -20
-  random_seed: 0
-  keep_transcription_text: False
-  sortagrad: True
-  shuffle_method: batch_shuffle
-  num_workers: 2
+###########################################
+# Dataloader                              #
+###########################################
+vocab_filepath: data/lang_char/vocab.txt
+unit_type: 'spm'
+spm_model_prefix: 'data/lang_char/bpe_unigram_5000'
+mean_std_filepath: ""
+preprocess_config: conf/preprocess.yaml
+feat_dim: 80
+stride_ms: 10.0
+window_ms: 25.0
+sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
+batch_size: 16
+maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced
+maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced
+minibatches: 0 # for debug
+batch_count: auto
+batch_bins: 0
+batch_frames_in: 0
+batch_frames_out: 0
+batch_frames_inout: 0
+num_workers: 0
+subsampling_factor: 1
+num_encs: 1
+
-
-training:
-  n_epoch: 70
-  accum_grad: 8
-  global_grad_clip: 3.0
-  optim: adam
-  optim_conf:
-    lr: 0.004
-    weight_decay: 1e-06
-  scheduler: warmuplr
-  scheduler_conf:
-    warmup_steps: 25000
-    lr_decay: 1.0
-  log_interval: 100
-  checkpoint:
-    kbest_n: 50
-    latest_n: 5
-
-
-decoding:
-  batch_size: 64
-  error_rate_type: wer
-  decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
-  beam_size: 10
-  ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
-  decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
-                          # <0: for decoding, use full chunk.
-                          # >0: for decoding, use fixed chunk size as set.
-                          # 0: used for training, it's prohibited here.
-  num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
-  simulate_streaming: False # simulate streaming inference. Defaults to False.
+###########################################
+# Training                                #
+###########################################
+n_epoch: 70
+accum_grad: 8
+global_grad_clip: 3.0
+optim: adam
+optim_conf:
+  lr: 0.004
+  weight_decay: 1.0e-06
+scheduler: warmuplr
+scheduler_conf:
+  warmup_steps: 25000
+  lr_decay: 1.0
+log_interval: 100
+checkpoint:
+  kbest_n: 50
+  latest_n: 5
diff --git a/examples/librispeech/asr1/conf/preprocess.yaml b/examples/librispeech/asr1/conf/preprocess.yaml
index 021ca4c58022f696c5218dedfe20e7244f09bd7f..d3992cb9fc3e46b2a8779d4e276223e1477570af 100644
--- a/examples/librispeech/asr1/conf/preprocess.yaml
+++ b/examples/librispeech/asr1/conf/preprocess.yaml
@@ -5,7 +5,7 @@ process:
     n_mels: 80
     n_shift: 160
    win_length: 400
-    dither: true
+    dither: 0.1
  - type: cmvn_json
    cmvn_path: data/mean_std.json
  # these three processes are a.k.a. SpecAugument
diff --git a/examples/librispeech/asr1/conf/transformer.yaml b/examples/librispeech/asr1/conf/transformer.yaml
index a90efe482df0b3a7ca779e9857926fd1a6fc16f7..740ce78f30b0fef9b27967aafb9f5d8c517814a8 100644
--- a/examples/librispeech/asr1/conf/transformer.yaml
+++ b/examples/librispeech/asr1/conf/transformer.yaml
@@ -1,111 +1,88 @@
-# network architecture
-model:
-  cmvn_file:
-  cmvn_file_type: "json"
-  # encoder related
-  encoder: transformer
-  encoder_conf:
-    output_size: 256 # dimension of attention
-    attention_heads: 4
-    linear_units: 2048 # the number of units of position-wise feed forward
-    num_blocks: 12 # the number of encoder blocks
-    dropout_rate: 0.1
-    positional_dropout_rate: 0.1
-    attention_dropout_rate: 0.0
-    input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8
-    normalize_before: true
+############################################
+# Network Architecture                     #
+############################################
+cmvn_file:
+cmvn_file_type: "json"
+# encoder related
+encoder: transformer
+encoder_conf:
+  output_size: 256 # dimension of attention
+  attention_heads: 4
+  linear_units: 2048 # the number of units of position-wise feed forward
+  num_blocks: 12 # the number of encoder blocks
+  dropout_rate: 0.1
+  positional_dropout_rate: 0.1
+  attention_dropout_rate: 0.0
+  input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8
+  normalize_before: true
 
-  # decoder related
-  decoder: transformer
-  decoder_conf:
-    attention_heads: 4
-    linear_units: 2048
-    num_blocks: 6
-    dropout_rate: 0.1
-    positional_dropout_rate: 0.1
-    self_attention_dropout_rate: 0.0
-    src_attention_dropout_rate: 0.0
+# decoder related
+decoder: transformer
+decoder_conf:
+  attention_heads: 4
+  linear_units: 2048
+  num_blocks: 6
+  dropout_rate: 0.1
+  positional_dropout_rate: 0.1
+  self_attention_dropout_rate: 0.0
+  src_attention_dropout_rate: 0.0
 
-  # hybrid CTC/attention
-  model_conf:
-    ctc_weight: 0.3
-    lsm_weight: 0.1 # label smoothing option
-    length_normalized_loss: false
+# hybrid CTC/attention
+model_conf:
+  ctc_weight: 0.3
+  lsm_weight: 0.1 # label smoothing option
+  length_normalized_loss: false # https://yaml.org/type/float.html
 
-data:
-  train_manifest: data/manifest.train
-  dev_manifest: data/manifest.dev
-  test_manifest: data/manifest.test-clean
-  min_input_len: 0.5 # second
-  max_input_len: 30.0 # second
-  min_output_len: 0.0 # tokens
-  max_output_len: 400.0 # tokens
-  min_output_input_ratio: 0.05
-  max_output_input_ratio: 100.0
+###########################################
+# Data                                    #
+###########################################
+train_manifest: data/manifest.train
+dev_manifest: data/manifest.dev
+test_manifest: data/manifest.test-clean
 
-collator:
-  vocab_filepath: data/lang_char/vocab.txt
-  unit_type: 'spm'
-  spm_model_prefix: 'data/lang_char/bpe_unigram_5000'
-  mean_std_filepath: ""
-  augmentation_config: conf/preprocess.yaml
-  batch_size: 32
-  raw_wav: True # use raw_wav or kaldi feature
-  spectrum_type: fbank #linear, mfcc, fbank
-  feat_dim: 80
-  delta_delta: False
-  dither: 1.0
-  target_sample_rate: 16000
-  max_freq: None
-  n_fft: None
-  stride_ms: 10.0
-  window_ms: 25.0
-  use_dB_normalization: True
-  target_dB: -20
-  random_seed: 0
-  keep_transcription_text: False
-  sortagrad: True
-  shuffle_method: batch_shuffle
-  num_workers: 2
-
-
-training:
-  n_epoch: 120
-  accum_grad: 4
-  global_grad_clip: 5.0
-  optim: adam
-  optim_conf:
-    lr: 0.004
-    weight_decay: 1e-06
-  scheduler: warmuplr
-  scheduler_conf:
-    warmup_steps: 25000
-    lr_decay: 1.0
-  log_interval: 100
-  checkpoint:
-    kbest_n: 50
-    latest_n: 5
-
-
-decoding:
-  batch_size: 64
-  error_rate_type: wer
-  decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
-  lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
-  alpha: 2.5
-  beta: 0.3
-  beam_size: 10
-  cutoff_prob: 1.0
-  cutoff_top_n: 0
-  num_proc_bsearch: 8
-  ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
-  decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
-                          # <0: for decoding, use full chunk.
-                          # >0: for decoding, use fixed chunk size as set.
-                          # 0: used for training, it's prohibited here.
-  num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
-  simulate_streaming: False # simulate streaming inference. Defaults to False.
+###########################################
+# Dataloader                              #
+###########################################
+vocab_filepath: data/lang_char/vocab.txt
+unit_type: 'spm'
+spm_model_prefix: 'data/lang_char/bpe_unigram_5000'
+mean_std_filepath: ""
+preprocess_config: conf/preprocess.yaml
+feat_dim: 80
+stride_ms: 10.0
+window_ms: 25.0
+sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
+batch_size: 32
+maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced
+maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced
+minibatches: 0 # for debug
+batch_count: auto
+batch_bins: 0
+batch_frames_in: 0
+batch_frames_out: 0
+batch_frames_inout: 0
+num_workers: 0
+subsampling_factor: 1
+num_encs: 1
 
+###########################################
+# Training                                #
+###########################################
+n_epoch: 120
+accum_grad: 4
+global_grad_clip: 5.0
+optim: adam
+optim_conf:
+  lr: 0.004
+  weight_decay: 1.0e-06
+scheduler: warmuplr
+scheduler_conf:
+  warmup_steps: 25000
+  lr_decay: 1.0
+log_interval: 100
+checkpoint:
+  kbest_n: 50
+  latest_n: 5
diff --git a/examples/librispeech/asr1/conf/tuning/chunk_decode.yaml b/examples/librispeech/asr1/conf/tuning/chunk_decode.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..0760e721e303884e48b703b90dae5da249696da7
--- /dev/null
+++ b/examples/librispeech/asr1/conf/tuning/chunk_decode.yaml
@@ -0,0 +1,11 @@
+decode_batch_size: 128
+error_rate_type: wer
+decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
+beam_size: 10
+ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
+decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
+                        # <0: for decoding, use full chunk.
+                        # >0: for decoding, use fixed chunk size as set.
+                        # 0: used for training, it's prohibited here.
+num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
+simulate_streaming: true # simulate streaming inference. Defaults to False.
diff --git a/examples/librispeech/asr1/conf/tuning/decode.yaml b/examples/librispeech/asr1/conf/tuning/decode.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..805dd02faa54721c993433168e0cfe7e1dd769d0
--- /dev/null
+++ b/examples/librispeech/asr1/conf/tuning/decode.yaml
@@ -0,0 +1,11 @@
+decode_batch_size: 64
+error_rate_type: wer
+decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
+beam_size: 10
+ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
+decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
+                        # <0: for decoding, use full chunk.
+                        # >0: for decoding, use fixed chunk size as set.
+                        # 0: used for training, it's prohibited here.
+num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
+simulate_streaming: False # simulate streaming inference. Defaults to False.
diff --git a/examples/librispeech/asr1/local/align.sh b/examples/librispeech/asr1/local/align.sh
index c65d611c43e870fe4024235c3a2f433452251d21..14d91d68702700179f5e81a843f08a2e264688e2 100755
--- a/examples/librispeech/asr1/local/align.sh
+++ b/examples/librispeech/asr1/local/align.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 
-if [ $# != 2 ];then
-    echo "usage: ${0} config_path ckpt_path_prefix"
+if [ $# != 3 ];then
+    echo "usage: ${0} config_path decode_config_path ckpt_path_prefix"
     exit -1
 fi
 
@@ -9,7 +9,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
 echo "using $ngpu gpus..."
 
 config_path=$1
-ckpt_prefix=$2
+decode_config_path=$2
+ckpt_prefix=$3
 
 batch_size=1
 output_dir=${ckpt_prefix}
@@ -20,9 +21,10 @@ mkdir -p ${output_dir}
 python3 -u ${BIN_DIR}/alignment.py \
 --ngpu ${ngpu} \
 --config ${config_path} \
+--decode_cfg ${decode_config_path} \
 --result_file ${output_dir}/${type}.align \
 --checkpoint_path ${ckpt_prefix} \
---opts decoding.batch_size ${batch_size}
+--opts decode.decode_batch_size ${batch_size}
 
 if [ $? -ne 0 ]; then
     echo "Failed in ctc alignment!"
diff --git a/examples/librispeech/asr1/local/test.sh b/examples/librispeech/asr1/local/test.sh
index aa06132e48245cc08b8eb89d453d8ade9db5f075..51ced18b2f765b443ba0ad9a6934be2a35d621fb 100755
--- a/examples/librispeech/asr1/local/test.sh
+++ b/examples/librispeech/asr1/local/test.sh
@@ -15,8 +15,8 @@ recog_set="test-clean"
 stage=0
 stop_stage=100
 
-if [ $# != 2 ];then
-    echo "usage: ${0} config_path ckpt_path_prefix"
+if [ $# != 3 ];then
+    echo "usage: ${0} config_path decode_config_path ckpt_path_prefix"
     exit -1
 fi
 
@@ -24,7 +24,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
 echo "using $ngpu gpus..."
 
 config_path=$1
-ckpt_prefix=$2
+decode_config_path=$2
+ckpt_prefix=$3
 
 chunk_mode=false
 if [[ ${config_path} =~ ^.*chunk_.*yaml$ ]];then
@@ -52,10 +53,11 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
         python3 -u ${BIN_DIR}/test.py \
         --ngpu ${ngpu} \
         --config ${config_path} \
+        --decode_cfg ${decode_config_path} \
         --result_file ${ckpt_prefix}.${type}.rsl \
         --checkpoint_path ${ckpt_prefix} \
-        --opts decoding.decoding_method ${type} \
-        --opts decoding.batch_size ${batch_size}
+        --opts decode.decoding_method ${type} \
+        --opts decode.decode_batch_size ${batch_size}
 
         if [ $? -ne 0 ]; then
             echo "Failed in evaluation!"
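(The `conf/tuning/*.yaml` files added above carry only decoding options; the training config stays flat in `conf/*.yaml`. At load time the two are merged into one `CfgNode`, with the decode options namespaced under `config.decode` — the same pattern as the `src_deepspeech2x/bin/test.py` change later in this diff — which is why the `--opts` overrides in these scripts now read `decode.decoding_method` and `decode.decode_batch_size`. A minimal sketch of that merge; the two config paths below are illustrative, pass the real `--config` / `--decode_cfg` values:)

```python
# Minimal sketch of the split-config merge used by bin/test.py in this PR.
from yacs.config import CfgNode

config = CfgNode(new_allowed=True)
config.merge_from_file("conf/transformer.yaml")          # flat model/data/training config

decode_confs = CfgNode(new_allowed=True)
decode_confs.merge_from_file("conf/tuning/decode.yaml")  # decoding-only options
config.decode = decode_confs                             # namespace them under `decode`

# --opts overrides address decoding keys through that namespace:
config.merge_from_list(["decode.decoding_method", "ctc_greedy_search"])
config.freeze()
print(config.decode.decoding_method)                     # -> ctc_greedy_search
```

(Under this layout, switching decoding strategies is a one-line change to the decode config, or an `--opts decode....` override, and never touches the training config.)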
@@ -76,10 +78,11 @@ for type in ctc_greedy_search; do
         python3 -u ${BIN_DIR}/test.py \
         --ngpu ${ngpu} \
         --config ${config_path} \
+        --decode_cfg ${decode_config_path} \
         --result_file ${ckpt_prefix}.${type}.rsl \
         --checkpoint_path ${ckpt_prefix} \
-        --opts decoding.decoding_method ${type} \
-        --opts decoding.batch_size ${batch_size}
+        --opts decode.decoding_method ${type} \
+        --opts decode.decode_batch_size ${batch_size}
 
         if [ $? -ne 0 ]; then
             echo "Failed in evaluation!"
@@ -96,10 +99,11 @@ for type in ctc_prefix_beam_search attention_rescoring; do
         python3 -u ${BIN_DIR}/test.py \
         --ngpu ${ngpu} \
         --config ${config_path} \
+        --decode_cfg ${decode_config_path} \
         --result_file ${ckpt_prefix}.${type}.rsl \
         --checkpoint_path ${ckpt_prefix} \
-        --opts decoding.decoding_method ${type} \
-        --opts decoding.batch_size ${batch_size}
+        --opts decode.decoding_method ${type} \
+        --opts decode.decode_batch_size ${batch_size}
 
         if [ $? -ne 0 ]; then
             echo "Failed in evaluation!"
diff --git a/examples/librispeech/asr1/local/test_wav.sh b/examples/librispeech/asr1/local/test_wav.sh
index ab6d685d8abd94acbc9057c60c0456b0991f9a0d..e70fc83c8cbc2d56eb20e86fa50467e83d4bd42a 100755
--- a/examples/librispeech/asr1/local/test_wav.sh
+++ b/examples/librispeech/asr1/local/test_wav.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 
-if [ $# != 3 ];then
-    echo "usage: ${0} config_path ckpt_path_prefix audio_file"
+if [ $# != 4 ];then
+    echo "usage: ${0} config_path decode_config_path ckpt_path_prefix audio_file"
     exit -1
 fi
 
@@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
 echo "using $ngpu gpus..."
 
 config_path=$1
-ckpt_prefix=$2
-audio_file=$3
+decode_config_path=$2
+ckpt_prefix=$3
+audio_file=$4
 
 mkdir -p data
 wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/en/demo_002_en.wav -P data/
@@ -49,10 +50,11 @@ for type in attention_rescoring; do
     python3 -u ${BIN_DIR}/test_wav.py \
     --ngpu ${ngpu} \
     --config ${config_path} \
+    --decode_cfg ${decode_config_path} \
     --result_file ${output_dir}/${type}.rsl \
     --checkpoint_path ${ckpt_prefix} \
-    --opts decoding.decoding_method ${type} \
-    --opts decoding.batch_size ${batch_size} \
+    --opts decode.decoding_method ${type} \
+    --opts decode.decode_batch_size ${batch_size} \
     --audio_file ${audio_file}
 
     #score_sclite.sh --bpe ${nbpe} --bpemodel ${bpemodel}.model --wer true ${expdir}/${decode_dir} ${dict}
diff --git a/examples/librispeech/asr1/run.sh b/examples/librispeech/asr1/run.sh
index f839e5af79a4caf8e8db838d365b124cef7aa7f7..116dae1265fe8833b4ef193a360ce0603dfe8293 100755
--- a/examples/librispeech/asr1/run.sh
+++ b/examples/librispeech/asr1/run.sh
@@ -8,6 +8,7 @@ gpus=0,1,2,3
 stage=0
 stop_stage=50
 conf_path=conf/transformer.yaml
+decode_conf_path=conf/tuning/decode.yaml
 avg_num=30
 audio_file=data/demo_002_en.wav
 
@@ -34,17 +35,17 @@ fi
 
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
     # test ckpt avg_n
-    CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
+    CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
 fi
 
 if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
     # ctc alignment of test data
-    CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
+    CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
 fi
 
 if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
     # test a single .wav file
-    CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1
+    CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1
 fi
 
 if [ ${stage} -le 51 ] && [ ${stop_stage} -ge 51 ]; then
diff --git a/examples/librispeech/asr2/conf/decode/decode_base.yaml b/examples/librispeech/asr2/conf/decode/decode_base.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..384ed197d731d1fc43814f091b041474b603a095
--- /dev/null
+++ b/examples/librispeech/asr2/conf/decode/decode_base.yaml
@@ -0,0 +1,11 @@
+decode_batch_size: 1
+error_rate_type: wer
+decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
+beam_size: 10
+ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
+decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
+                        # <0: for decoding, use full chunk.
+                        # >0: for decoding, use fixed chunk size as set.
+                        # 0: used for training, it's prohibited here.
+num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
+simulate_streaming: False # simulate streaming inference. Defaults to False.
diff --git a/examples/librispeech/asr2/conf/transformer.yaml b/examples/librispeech/asr2/conf/transformer.yaml
index a16563a59ddf1ac44ceb206e54d6f57e5e02faed..32d95b4145ac5bc4aab54046289cccb8c967c6b8 100644
--- a/examples/librispeech/asr2/conf/transformer.yaml
+++ b/examples/librispeech/asr2/conf/transformer.yaml
@@ -1,73 +1,80 @@
 # https://yaml.org/type/float.html
-# network architecture
-model:
-  cmvn_file:
-  cmvn_file_type: "json"
-  # encoder related
-  encoder: transformer
-  encoder_conf:
-    output_size: 256 # dimension of attention
-    attention_heads: 4
-    linear_units: 2048 # the number of units of position-wise feed forward
-    num_blocks: 12 # the number of encoder blocks
-    dropout_rate: 0.1
-    positional_dropout_rate: 0.1
-    attention_dropout_rate: 0.0
-    input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8
-    normalize_before: true
+############################################
+# Network Architecture                     #
+############################################
+cmvn_file:
+cmvn_file_type: "json"
+# encoder related
+encoder: transformer
+encoder_conf:
+  output_size: 256 # dimension of attention
+  attention_heads: 4
+  linear_units: 2048 # the number of units of position-wise feed forward
+  num_blocks: 12 # the number of encoder blocks
+  dropout_rate: 0.1
+  positional_dropout_rate: 0.1
+  attention_dropout_rate: 0.0
+  input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8
+  normalize_before: true
 
-  # decoder related
-  decoder: transformer
-  decoder_conf:
-    attention_heads: 4
-    linear_units: 2048
-    num_blocks: 6
-    dropout_rate: 0.1
-    positional_dropout_rate: 0.1
-    self_attention_dropout_rate: 0.0
-    src_attention_dropout_rate: 0.0
+# decoder related
+decoder: transformer
+decoder_conf:
+  attention_heads: 4
+  linear_units: 2048
+  num_blocks: 6
+  dropout_rate: 0.1
+  positional_dropout_rate: 0.1
+  self_attention_dropout_rate: 0.0
+  src_attention_dropout_rate: 0.0
 
-  # hybrid CTC/attention
-  model_conf:
-    ctc_weight: 0.3
-    lsm_weight: 0.1 # label smoothing option
-    length_normalized_loss: false
+# hybrid CTC/attention
+model_conf:
+  ctc_weight: 0.3
+  lsm_weight: 0.1 # label smoothing option
+  length_normalized_loss: false
 
-data:
-  train_manifest: data/manifest.train
-  dev_manifest: data/manifest.dev
-  test_manifest: data/manifest.test-clean
+###########################################
+# Data                                    #
+###########################################
+train_manifest: data/manifest.train
+dev_manifest: data/manifest.dev
+test_manifest: data/manifest.test-clean
 
-collator:
-  vocab_filepath: data/lang_char/train_960_unigram5000_units.txt
-  unit_type: spm
-  spm_model_prefix: data/lang_char/train_960_unigram5000
-  feat_dim: 83
-  stride_ms: 10.0
-  window_ms: 25.0
-  sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
-  batch_size: 30
-  maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced
-  maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced
-  minibatches: 0 # for debug
-  batch_count: auto
-  batch_bins: 0
-  batch_frames_in: 0
-  batch_frames_out: 0
-  batch_frames_inout: 0
-  augmentation_config: conf/preprocess.yaml
-  num_workers: 0
-  subsampling_factor: 1
-  num_encs: 1
+###########################################
+# Dataloader                              #
+###########################################
+vocab_filepath: data/lang_char/train_960_unigram5000_units.txt
+unit_type: spm
+spm_model_prefix: data/lang_char/train_960_unigram5000
+feat_dim: 83
+stride_ms: 10.0
+window_ms: 25.0
+sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
+batch_size: 30
+maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced
+maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced
+minibatches: 0 # for debug
+batch_count: auto
+batch_bins: 0
+batch_frames_in: 0
+batch_frames_out: 0
+batch_frames_inout: 0
+preprocess_config: conf/preprocess.yaml
+num_workers: 0
+subsampling_factor: 1
+num_encs: 1
 
-training:
-  n_epoch: 120
-  accum_grad: 2
-  log_interval: 100
-  checkpoint:
-    kbest_n: 50
-    latest_n: 5
+###########################################
+# Training                                #
+###########################################
+n_epoch: 120
+accum_grad: 2
+log_interval: 1
+checkpoint:
+  kbest_n: 50
+  latest_n: 5
 
 optim: adam
 optim_conf:
@@ -79,23 +86,5 @@ scheduler_conf:
   warmup_steps: 25000
   lr_decay: 1.0
 
-decoding:
-  batch_size: 1
-  error_rate_type: wer
-  decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
-  lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
-  alpha: 2.5
-  beta: 0.3
-  beam_size: 10
-  cutoff_prob: 1.0
-  cutoff_top_n: 0
-  num_proc_bsearch: 8
-  ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
-  decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
-                          # <0: for decoding, use full chunk.
-                          # >0: for decoding, use fixed chunk size as set.
-                          # 0: used for training, it's prohibited here.
-  num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
-  simulate_streaming: False # simulate streaming inference. Defaults to False.
diff --git a/examples/librispeech/asr2/local/align.sh b/examples/librispeech/asr2/local/align.sh
index 626c3574258490c3e62c7f334dddd9c672e23941..60a16f42b48120aef4b3f33ca05858acd0c6a2cf 100755
--- a/examples/librispeech/asr2/local/align.sh
+++ b/examples/librispeech/asr2/local/align.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 
-if [ $# != 3 ];then
-    echo "usage: ${0} config_path dict_path ckpt_path_prefix"
+if [ $# != 4 ];then
+    echo "usage: ${0} config_path decode_config_path dict_path ckpt_path_prefix"
     exit -1
 fi
 
@@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
 echo "using $ngpu gpus..."
 config_path=$1
-dict_path=$2
-ckpt_prefix=$3
+decode_config_path=$2
+dict_path=$3
+ckpt_prefix=$4
 
 batch_size=1
 output_dir=${ckpt_prefix}
@@ -24,9 +25,10 @@ python3 -u ${BIN_DIR}/test.py \
 --dict-path ${dict_path} \
 --ngpu ${ngpu} \
 --config ${config_path} \
+--decode_cfg ${decode_config_path} \
 --result-file ${output_dir}/${type}.align \
 --checkpoint_path ${ckpt_prefix} \
---opts decoding.batch_size ${batch_size}
+--opts decode.decode_batch_size ${batch_size}
 
 if [ $? -ne 0 ]; then
     echo "Failed in ctc alignment!"
diff --git a/examples/librispeech/asr2/local/test.sh b/examples/librispeech/asr2/local/test.sh
index d210f2a85e703f537da81dbf2ce840ab0abfb945..8cf3b52c080444521130cbce6ce9a4ee3bb5330c 100755
--- a/examples/librispeech/asr2/local/test.sh
+++ b/examples/librispeech/asr2/local/test.sh
@@ -19,6 +19,7 @@ bpeprefix=data/lang_char/${train_set}_${bpemode}${nbpe}
 bpemodel=${bpeprefix}.model
 
 config_path=conf/transformer.yaml
+decode_config_path=conf/decode/decode_base.yaml
 dict=data/lang_char/${train_set}_${bpemode}${nbpe}_units.txt
 ckpt_prefix=
 
@@ -79,11 +80,12 @@ for dmethd in attention ctc_greedy_search ctc_prefix_beam_search attention_resco
             --ngpu ${ngpu} \
             --dict-path ${dict} \
             --config ${config_path} \
+            --decode_cfg ${decode_config_path} \
             --checkpoint_path ${ckpt_prefix} \
             --result-file ${decode_dir}/data.JOB.json \
-            --opts decoding.decoding_method ${dmethd} \
-            --opts decoding.batch_size ${batch_size} \
-            --opts data.test_manifest ${feat_recog_dir}/split${nj}/JOB/manifest.${rtask}
+            --opts decode.decoding_method ${dmethd} \
+            --opts decode.decode_batch_size ${batch_size} \
+            --opts test_manifest ${feat_recog_dir}/split${nj}/JOB/manifest.${rtask}
 
         score_sclite.sh --bpe ${nbpe} --bpemodel ${bpemodel} --wer false ${decode_dir} ${dict}
 
diff --git a/examples/librispeech/asr2/run.sh b/examples/librispeech/asr2/run.sh
index 5b7596f2d3f19fd48e6c9846ba7534cf3711ca5c..c9a794e341881442403b2730316e18ca636978ad 100755
--- a/examples/librispeech/asr2/run.sh
+++ b/examples/librispeech/asr2/run.sh
@@ -9,7 +9,8 @@ gpus=0,1,2,3,4,5,6,7
 stage=0
 stop_stage=50
 conf_path=conf/transformer.yaml
-dict_path=lang_char/train_960_unigram5000_units.txt
+decode_conf_path=conf/decode/decode_base.yaml
+dict_path=data/lang_char/train_960_unigram5000_units.txt
 avg_num=10
 
 source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
@@ -35,7 +36,7 @@ fi
 
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    # attetion resocre decoder
-   ./local/test.sh ${conf_path} ${dict_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
+   ./local/test.sh ${conf_path} ${decode_conf_path} ${dict_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
 fi
 
 if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
@@ -45,7 +46,7 @@ fi
 
 if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
     # ctc alignment of test data
-    CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} ${dict_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
+    CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} ${decode_conf_path} ${dict_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
 fi
 
 if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
diff --git a/examples/ljspeech/tts1/README.md b/examples/ljspeech/tts1/README.md
index 5bb163e1d70aa9df20c2759f8068cb0cc7f24f70..4f7680e8456d077a6641d3de6f5d54a7c3dd37cf 100644
--- a/examples/ljspeech/tts1/README.md
+++ b/examples/ljspeech/tts1/README.md
@@ -55,7 +55,7 @@ Here's the complete help message.
 ```text
 usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA]
                 [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR]
-                [--ngpu NGPU] [--verbose VERBOSE] [--phones-dict PHONES_DICT]
+                [--ngpu NGPU] [--phones-dict PHONES_DICT]
 
 Train a TransformerTTS model with LJSpeech TTS dataset.
 
@@ -69,7 +69,6 @@ optional arguments:
   --output-dir OUTPUT_DIR
                         output dir.
   --ngpu NGPU           if ngpu == 0, use cpu.
-  --verbose VERBOSE     verbose.
   --phones-dict PHONES_DICT
                         phone vocabulary file.
 ```
@@ -103,7 +102,7 @@ usage: synthesize.py [-h] [--transformer-tts-config TRANSFORMER_TTS_CONFIG]
                      [--waveflow-checkpoint WAVEFLOW_CHECKPOINT]
                      [--phones-dict PHONES_DICT] [--test-metadata TEST_METADATA]
                      [--output-dir OUTPUT_DIR]
-                     [--ngpu NGPU] [--verbose VERBOSE]
+                     [--ngpu NGPU]
 
 Synthesize with transformer tts & waveflow.
 
@@ -127,7 +126,6 @@ optional arguments:
   --output-dir OUTPUT_DIR
                         output dir.
   --ngpu NGPU           if ngpu == 0, use cpu.
-  --verbose VERBOSE     verbose.
 ```
 `./local/synthesize_e2e.sh` calls `${BIN_DIR}/synthesize_e2e.py`, which can synthesize waveform from text file.
 ```bash
@@ -142,7 +140,6 @@ usage: synthesize_e2e.py [-h]
                          [--waveflow-checkpoint WAVEFLOW_CHECKPOINT]
                          [--phones-dict PHONES_DICT] [--text TEXT]
                          [--output-dir OUTPUT_DIR] [--ngpu NGPU]
-                         [--verbose VERBOSE]
 
 Synthesize with transformer tts & waveflow.
 
@@ -165,7 +162,6 @@ optional arguments:
   --output-dir OUTPUT_DIR
                         output dir.
   --ngpu NGPU           if ngpu == 0, use cpu.
-  --verbose VERBOSE     verbose.
 ```
 1. `--transformer-tts-config`, `--transformer-tts-checkpoint`, `--transformer-tts-stat` and `--phones-dict` are arguments for transformer_tts, which correspond to the 4 files in the transformer_tts pretrained model.
 2. `--waveflow-config`, `--waveflow-checkpoint` are arguments for waveflow, which correspond to the 2 files in the waveflow pretrained model.
diff --git a/examples/ljspeech/tts3/README.md b/examples/ljspeech/tts3/README.md
index 692c9746a09e66bebc3e1dbc0ed371cdc9de0e0c..f3602c3478697f61fcd830a5d4f1894d01e810a9 100644
--- a/examples/ljspeech/tts3/README.md
+++ b/examples/ljspeech/tts3/README.md
@@ -62,8 +62,8 @@ Here's the complete help message.
 ```text
 usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA]
                 [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR]
-                [--ngpu NGPU] [--verbose VERBOSE] [--phones-dict PHONES_DICT]
-                [--speaker-dict SPEAKER_DICT]
+                [--ngpu NGPU] [--phones-dict PHONES_DICT]
+                [--speaker-dict SPEAKER_DICT] [--voice-cloning VOICE_CLONING]
 
 Train a FastSpeech2 model.
 
@@ -77,11 +77,12 @@ optional arguments:
   --output-dir OUTPUT_DIR
                         output dir.
   --ngpu NGPU           if ngpu=0, use cpu.
-  --verbose VERBOSE     verbose.
   --phones-dict PHONES_DICT
                         phone vocabulary file.
   --speaker-dict SPEAKER_DICT
                         speaker id map file for multiple speaker model.
+  --voice-cloning VOICE_CLONING
+                        whether training voice cloning model.
 ```
 1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`.
 2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder.
diff --git a/examples/ljspeech/voc1/README.md b/examples/ljspeech/voc1/README.md
index 9dd0f5cc340ea445feeb7ab13f3cbb343a330f45..6fcb2a520d8167daaea24d7574bd3fa809a6090f 100644
--- a/examples/ljspeech/voc1/README.md
+++ b/examples/ljspeech/voc1/README.md
@@ -57,8 +57,8 @@ Here's the complete help message.
 ```text
 usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA]
                 [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR]
-                [--ngpu NGPU] [--verbose VERBOSE] [--batch-size BATCH_SIZE]
-                [--max-iter MAX_ITER] [--run-benchmark RUN_BENCHMARK]
+                [--ngpu NGPU] [--batch-size BATCH_SIZE] [--max-iter MAX_ITER]
+                [--run-benchmark RUN_BENCHMARK]
                 [--profiler_options PROFILER_OPTIONS]
 
 Train a ParallelWaveGAN model.
 
@@ -73,7 +73,6 @@ optional arguments:
   --output-dir OUTPUT_DIR
                         output dir.
   --ngpu NGPU           if ngpu == 0, use cpu.
-  --verbose VERBOSE     verbose.
 
 benchmark:
   arguments related to benchmark.
@@ -103,7 +102,6 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p
 usage: synthesize.py [-h] [--generator-type GENERATOR_TYPE] [--config CONFIG]
                      [--checkpoint CHECKPOINT] [--test-metadata TEST_METADATA]
                      [--output-dir OUTPUT_DIR] [--ngpu NGPU]
-                     [--verbose VERBOSE]
 
 Synthesize with GANVocoder.
 
@@ -120,7 +118,6 @@ optional arguments:
   --output-dir OUTPUT_DIR
                         output dir.
   --ngpu NGPU           if ngpu == 0, use cpu.
-  --verbose VERBOSE     verbose.
 ```
 
 1. `--config` parallel wavegan config file. You should use the same config with which the model is trained.
diff --git a/examples/ljspeech/voc1/conf/default.yaml b/examples/ljspeech/voc1/conf/default.yaml
index bef2d68149027125be74f09c523bafd29e49916f..2d39beb795dce888cb2fbc295bcdad8b22bd19f6 100644
--- a/examples/ljspeech/voc1/conf/default.yaml
+++ b/examples/ljspeech/voc1/conf/default.yaml
@@ -72,10 +72,7 @@ lambda_adv: 4.0 # Loss balancing coefficient.
 ###########################################################
 batch_size: 8 # Batch size.
 batch_max_steps: 25600 # Length of each audio in batch. Make sure dividable by n_shift.
-pin_memory: true # Whether to pin memory in Pytorch DataLoader.
-num_workers: 4 # Number of workers in Pytorch DataLoader.
-remove_short_samples: true # Whether to remove samples the length of which are less than batch_max_steps.
-allow_cache: true # Whether to allow cache in dataset. If true, it requires cpu memory.
+num_workers: 2 # Number of workers in DataLoader.
 
 ###########################################################
 #                OPTIMIZER & SCHEDULER SETTING            #
diff --git a/examples/other/1xt2x/aishell/conf/deepspeech2.yaml b/examples/other/1xt2x/aishell/conf/deepspeech2.yaml
index c2d6922638098d1f52517ebbb97049dddd89c02a..c2db2c7c2d20a232f330d6cd44136c4951981b72 100644
--- a/examples/other/1xt2x/aishell/conf/deepspeech2.yaml
+++ b/examples/other/1xt2x/aishell/conf/deepspeech2.yaml
@@ -1,67 +1,65 @@
 # https://yaml.org/type/float.html
-data:
-  train_manifest: data/manifest.train
-  dev_manifest: data/manifest.dev
-  test_manifest: data/manifest.test
-  min_input_len: 0.0
-  max_input_len: 27.0 # second
-  min_output_len: 0.0
-  max_output_len: .inf
-  min_output_input_ratio: 0.00
-  max_output_input_ratio: .inf
+###########################################
+# Data                                    #
+###########################################
+train_manifest: data/manifest.train
+dev_manifest: data/manifest.dev
+test_manifest: data/manifest.test
+min_input_len: 0.0
+max_input_len: 27.0 # second
+min_output_len: 0.0
+max_output_len: .inf
+min_output_input_ratio: 0.00
+max_output_input_ratio: .inf
 
-collator:
-  batch_size: 64 # one gpu
-  mean_std_filepath: data/mean_std.npz
-  unit_type: char
-  vocab_filepath: data/vocab.txt
-  augmentation_config: conf/augmentation.json
-  random_seed: 0
-  spm_model_prefix:
-  spectrum_type: linear
-  feat_dim:
-  delta_delta: False
-  stride_ms: 10.0
-  window_ms: 20.0
-  n_fft: None
-  max_freq: None
-  target_sample_rate: 16000
-  use_dB_normalization: True
-  target_dB: -20
-  dither: 1.0
-  keep_transcription_text: False
-  sortagrad: True
-  shuffle_method: batch_shuffle
-  num_workers: 2
+###########################################
+# Dataloader                              #
+###########################################
+batch_size: 64 # one gpu
+mean_std_filepath: data/mean_std.npz
+unit_type: char
+vocab_filepath: data/vocab.txt
+augmentation_config: conf/augmentation.json
+random_seed: 0
+spm_model_prefix:
+spectrum_type: linear
+feat_dim:
+delta_delta: False
+stride_ms: 10.0
+window_ms: 20.0
+n_fft: None
+max_freq: None
+target_sample_rate: 16000
+use_dB_normalization: True
+target_dB: -20
+dither: 1.0
+keep_transcription_text: False
+sortagrad: True
+shuffle_method: batch_shuffle
+num_workers: 2
 
-model:
-  num_conv_layers: 2
-  num_rnn_layers: 3
-  rnn_layer_size: 1024
-  use_gru: True
-  share_rnn_weights: False
-  blank_id: 4333
+############################################
+# Network Architecture                     #
+############################################
+num_conv_layers: 2
+num_rnn_layers: 3
+rnn_layer_size: 1024
+use_gru: True
+share_rnn_weights: False
+blank_id: 4333
 
-training:
-  n_epoch: 80
-  accum_grad: 1
-  lr: 2e-3
-  lr_decay: 0.83
-  weight_decay: 1e-06
-  global_grad_clip: 3.0
-  log_interval: 100
-  checkpoint:
-    kbest_n: 50
-    latest_n: 5
+###########################################
+# Training                                #
+###########################################
+n_epoch: 80
+accum_grad: 1
+lr: 2e-3
+lr_decay: 0.83
+weight_decay: 1e-06
+global_grad_clip: 3.0
+log_interval: 100
+checkpoint:
+  kbest_n: 50
+  latest_n: 5
+
 
-decoding:
-  batch_size: 32
-  error_rate_type: cer
-  decoding_method: ctc_beam_search
-  lang_model_path: data/lm/zh_giga.no_cna_cmn.prune01244.klm
-  alpha: 2.6
-  beta: 5.0
-  beam_size: 300
-  cutoff_prob: 0.99
-  cutoff_top_n: 40
-  num_proc_bsearch: 8
diff --git a/examples/other/1xt2x/aishell/conf/tuning/decode.yaml b/examples/other/1xt2x/aishell/conf/tuning/decode.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..b5283a934a235c676c8780af0c81e2fd1d231c5e
--- /dev/null
+++ b/examples/other/1xt2x/aishell/conf/tuning/decode.yaml
@@ -0,0 +1,10 @@
+decode_batch_size: 32
+error_rate_type: cer
+decoding_method: ctc_beam_search
+lang_model_path: data/lm/zh_giga.no_cna_cmn.prune01244.klm
+alpha: 2.6
+beta: 5.0
+beam_size: 300
+cutoff_prob: 0.99
+cutoff_top_n: 40
+num_proc_bsearch: 8
\ No newline at end of file
diff --git a/examples/other/1xt2x/aishell/local/test.sh b/examples/other/1xt2x/aishell/local/test.sh
index 8cbff23520a61523e5d8599e627f897ef583d127..463593ef389c0256344ba2c40b7cdff426af1248 100755
--- a/examples/other/1xt2x/aishell/local/test.sh
+++ b/examples/other/1xt2x/aishell/local/test.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 
-if [ $# != 3 ];then
-    echo "usage: ${0} config_path ckpt_path_prefix model_type"
+if [ $# != 4 ];then
+    echo "usage: ${0} config_path decode_config_path ckpt_path_prefix model_type"
     exit -1
 fi
 
@@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
 echo "using $ngpu gpus..."
 
 config_path=$1
-ckpt_prefix=$2
-model_type=$3
+decode_config_path=$2
+ckpt_prefix=$3
+model_type=$4
 
 # download language model
 bash local/download_lm_ch.sh
@@ -21,6 +22,7 @@ fi
 python3 -u ${BIN_DIR}/test.py \
 --ngpu ${ngpu} \
 --config ${config_path} \
+--decode_cfg ${decode_config_path} \
 --result_file ${ckpt_prefix}.rsl \
 --checkpoint_path ${ckpt_prefix} \
 --model_type ${model_type}
diff --git a/examples/other/1xt2x/aishell/run.sh b/examples/other/1xt2x/aishell/run.sh
index 1ccac1c355339126e9ac66c6487c909da5ef6123..89a634119ca76f96e8b59d38df0c822ff511d0ee 100755
--- a/examples/other/1xt2x/aishell/run.sh
+++ b/examples/other/1xt2x/aishell/run.sh
@@ -5,6 +5,7 @@ source path.sh
 stage=0
 stop_stage=100
 conf_path=conf/deepspeech2.yaml
+decode_conf_path=conf/tuning/decode.yaml
 avg_num=1
 model_type=offline
 gpus=2
@@ -23,6 +24,6 @@ fi
 
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     # test ckpt avg_n
-    CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${v18_ckpt} ${model_type}|| exit -1
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${v18_ckpt} ${model_type}|| exit -1
 fi
 
diff --git a/examples/other/1xt2x/baidu_en8k/conf/deepspeech2.yaml b/examples/other/1xt2x/baidu_en8k/conf/deepspeech2.yaml
index be51a9b909d9b01a8a8b4911441496a976769ea2..0c08fbc635dc574cc186c7494948d2d9df367290 100644
--- a/examples/other/1xt2x/baidu_en8k/conf/deepspeech2.yaml
+++ b/examples/other/1xt2x/baidu_en8k/conf/deepspeech2.yaml
@@ -1,67 +1,64 @@
 # https://yaml.org/type/float.html
-data:
-  train_manifest: data/manifest.train
-  dev_manifest: data/manifest.dev
-  test_manifest: data/manifest.test-clean
-  min_input_len: 0.0
-  max_input_len: .inf # second
-  min_output_len: 0.0
-  max_output_len: .inf
-  min_output_input_ratio: 0.00
-  max_output_input_ratio: .inf
+###########################################
+# Data                                    #
+###########################################
+train_manifest: data/manifest.train
+dev_manifest: data/manifest.dev
+test_manifest: data/manifest.test-clean
+min_input_len: 0.0
+max_input_len: .inf # second
+min_output_len: 0.0
+max_output_len: .inf
+min_output_input_ratio: 0.00
+max_output_input_ratio: .inf
 
-collator:
-  batch_size: 64 # one gpu
-  mean_std_filepath: data/mean_std.npz
-  unit_type: char
-  vocab_filepath: data/vocab.txt
-  augmentation_config: conf/augmentation.json
-  random_seed: 0
-  spm_model_prefix:
-  spectrum_type: linear
-  feat_dim:
-  delta_delta: False
-  stride_ms: 10.0
-  window_ms: 20.0
-  n_fft: None
-  max_freq: None
-  target_sample_rate: 16000
-  use_dB_normalization: True
-  target_dB: -20
-  dither: 1.0
-  keep_transcription_text: False
-  sortagrad: True
-  shuffle_method: batch_shuffle
-  num_workers: 2
+###########################################
+# Dataloader                              #
+###########################################
+batch_size: 64 # one gpu
+mean_std_filepath: data/mean_std.npz
+unit_type: char
+vocab_filepath: data/vocab.txt
+augmentation_config: conf/augmentation.json
+random_seed: 0
+spm_model_prefix:
+spectrum_type: linear
+feat_dim:
+delta_delta: False
+stride_ms: 10.0
+window_ms: 20.0
+n_fft: None
+max_freq: None
+target_sample_rate: 16000
+use_dB_normalization: True
+target_dB: -20
+dither: 1.0
+keep_transcription_text: False
+sortagrad: True
+shuffle_method: batch_shuffle
+num_workers: 2
 
-model:
-  num_conv_layers: 2
-  num_rnn_layers: 3
-  rnn_layer_size: 1024
-  use_gru: True
-  share_rnn_weights: False
-  blank_id: 28
+############################################
+# Network Architecture                     #
+############################################
+num_conv_layers: 2
+num_rnn_layers: 3
+rnn_layer_size: 1024
+use_gru: True
+share_rnn_weights: False
+blank_id: 28
+
+###########################################
+# Training                                #
+###########################################
+n_epoch: 80
+accum_grad: 1
+lr: 2e-3
+lr_decay: 0.83
+weight_decay: 1e-06
+global_grad_clip: 3.0
+log_interval: 100
+checkpoint:
+  kbest_n: 50
+  latest_n: 5
 
-training:
-  n_epoch: 80
-  accum_grad: 1
-  lr: 2e-3
-  lr_decay: 0.83
-  weight_decay: 1e-06
-  global_grad_clip: 3.0
-  log_interval: 100
-  checkpoint:
-    kbest_n: 50
-    latest_n: 5
-
-decoding:
-  batch_size: 32
-  error_rate_type: wer
-  decoding_method: ctc_beam_search
-  lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
-  alpha: 1.4
-  beta: 0.35
-  beam_size: 500
-  cutoff_prob: 1.0
-  cutoff_top_n: 40
-  num_proc_bsearch: 8
diff --git a/examples/other/1xt2x/baidu_en8k/conf/tuning/decode.yaml b/examples/other/1xt2x/baidu_en8k/conf/tuning/decode.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..f52dde320d62769f9bcea53295f6f24460279431
--- /dev/null
+++ b/examples/other/1xt2x/baidu_en8k/conf/tuning/decode.yaml
@@ -0,0 +1,10 @@
+decode_batch_size: 32
+error_rate_type: wer
+decoding_method: ctc_beam_search
+lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
+alpha: 1.4
+beta: 0.35
+beam_size: 500
+cutoff_prob: 1.0
+cutoff_top_n: 40
+num_proc_bsearch: 8
\ No newline at end of file
diff --git a/examples/other/1xt2x/baidu_en8k/local/test.sh b/examples/other/1xt2x/baidu_en8k/local/test.sh
index a627ef72274e3d67219701d34956fdf2b3a364b2..ea40046b10997ee425d4e654b89fedc732c8b3fe 100755
--- a/examples/other/1xt2x/baidu_en8k/local/test.sh
+++ b/examples/other/1xt2x/baidu_en8k/local/test.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 
-if [ $# != 3 ];then
-    echo "usage: ${0} config_path ckpt_path_prefix model_type"
+if [ $# != 4 ];then
+    echo "usage: ${0} config_path decode_config_path ckpt_path_prefix model_type"
     exit -1
 fi
 
@@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
 echo "using $ngpu gpus..."
 config_path=$1
-ckpt_prefix=$2
-model_type=$3
+decode_config_path=$2
+ckpt_prefix=$3
+model_type=$4
 
 # download language model
 bash local/download_lm_en.sh
@@ -21,6 +22,7 @@ fi
 python3 -u ${BIN_DIR}/test.py \
 --ngpu ${ngpu} \
 --config ${config_path} \
+--decode_cfg ${decode_config_path} \
 --result_file ${ckpt_prefix}.rsl \
 --checkpoint_path ${ckpt_prefix} \
 --model_type ${model_type}
diff --git a/examples/other/1xt2x/baidu_en8k/run.sh b/examples/other/1xt2x/baidu_en8k/run.sh
index b7f69f6b55c10d93c2b717fa1ad7d07ea35c2b84..82de56b094f32684c110a49b7ebb144263ae351e 100755
--- a/examples/other/1xt2x/baidu_en8k/run.sh
+++ b/examples/other/1xt2x/baidu_en8k/run.sh
@@ -5,6 +5,7 @@ source path.sh
 stage=0
 stop_stage=100
 conf_path=conf/deepspeech2.yaml
+decode_conf_path=conf/tuning/decode.yaml
 avg_num=1
 model_type=offline
 gpus=0
@@ -23,6 +24,6 @@ fi
 
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    # test ckpt avg_n
-    CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${v18_ckpt} ${model_type}|| exit -1
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${v18_ckpt} ${model_type}|| exit -1
 fi
 
diff --git a/examples/other/1xt2x/librispeech/conf/deepspeech2.yaml b/examples/other/1xt2x/librispeech/conf/deepspeech2.yaml
index ad7fb2c19f40bc04d2486c3ea728b726d7e9def4..a2a5649baa0ad80615b721d723d5a7672000fadc 100644
--- a/examples/other/1xt2x/librispeech/conf/deepspeech2.yaml
+++ b/examples/other/1xt2x/librispeech/conf/deepspeech2.yaml
@@ -1,67 +1,64 @@
 # https://yaml.org/type/float.html
-data:
-  train_manifest: data/manifest.train
-  dev_manifest: data/manifest.dev
-  test_manifest: data/manifest.test-clean
-  min_input_len: 0.0
-  max_input_len: 1000.0 # second
-  min_output_len: 0.0
-  max_output_len: .inf
-  min_output_input_ratio: 0.00
-  max_output_input_ratio: .inf
+###########################################
+# Data                                    #
+###########################################
+train_manifest: data/manifest.train
+dev_manifest: data/manifest.dev
+test_manifest: data/manifest.test-clean
+min_input_len: 0.0
+max_input_len: 1000.0 # second
+min_output_len: 0.0
+max_output_len: .inf
+min_output_input_ratio: 0.00
+max_output_input_ratio: .inf
 
-collator:
-  batch_size: 64 # one gpu
-  mean_std_filepath: data/mean_std.npz
-  unit_type: char
-  vocab_filepath: data/vocab.txt
-  augmentation_config: conf/augmentation.json
-  random_seed: 0
-  spm_model_prefix:
-  spectrum_type: linear
-  feat_dim:
-  delta_delta: False
-  stride_ms: 10.0
-  window_ms: 20.0
-  n_fft: None
-  max_freq: None
-  target_sample_rate: 16000
-  use_dB_normalization: True
-  target_dB: -20
-  dither: 1.0
-  keep_transcription_text: False
-  sortagrad: True
-  shuffle_method: batch_shuffle
-  num_workers: 2
+###########################################
+# Dataloader                              #
+###########################################
+batch_size: 64 # one gpu
+mean_std_filepath: data/mean_std.npz
+unit_type: char
+vocab_filepath: data/vocab.txt
+augmentation_config: conf/augmentation.json
+random_seed: 0
+spm_model_prefix:
+spectrum_type: linear
+feat_dim:
+delta_delta: False
+stride_ms: 10.0
+window_ms: 20.0
+n_fft: None
+max_freq: None
+target_sample_rate: 16000
+use_dB_normalization: True
+target_dB: -20
+dither: 1.0
+keep_transcription_text: False
+sortagrad: True
+shuffle_method: batch_shuffle
+num_workers: 2
 
-model:
-  num_conv_layers: 2
-  num_rnn_layers: 3
-  rnn_layer_size: 2048
-  use_gru: False
-  share_rnn_weights: True
-  blank_id: 28
+############################################
+# Network Architecture                     #
+############################################
+num_conv_layers: 2
+num_rnn_layers: 3
+rnn_layer_size: 2048
+use_gru: False
+share_rnn_weights: True
+blank_id: 28
+
+###########################################
+# Training                                #
+###########################################
+n_epoch: 80
+accum_grad: 1
+lr: 2e-3
+lr_decay: 0.83
+weight_decay: 1e-06
+global_grad_clip: 3.0
+log_interval: 100
+checkpoint:
+  kbest_n: 50
+  latest_n: 5
 
-training:
-  n_epoch: 80
-  accum_grad: 1
-  lr: 2e-3
-  lr_decay: 0.83
-  weight_decay: 1e-06
-  global_grad_clip: 3.0
-  log_interval: 100
-  checkpoint:
-    kbest_n: 50
-    latest_n: 5
-
-decoding:
-  batch_size: 32
-  error_rate_type: wer
-  decoding_method: ctc_beam_search
-  lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
-  alpha: 2.5
-  beta: 0.3
-  beam_size: 500
-  cutoff_prob: 1.0
-  cutoff_top_n: 40
-  num_proc_bsearch: 8
diff --git a/examples/other/1xt2x/librispeech/conf/tuning/decode.yaml b/examples/other/1xt2x/librispeech/conf/tuning/decode.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..f3b51defe10b89a32ca3f71621fafc6f1ad15c77
--- /dev/null
+++ b/examples/other/1xt2x/librispeech/conf/tuning/decode.yaml
@@ -0,0 +1,10 @@
+decode_batch_size: 32
+error_rate_type: wer
+decoding_method: ctc_beam_search
+lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
+alpha: 2.5
+beta: 0.3
+beam_size: 500
+cutoff_prob: 1.0
+cutoff_top_n: 40
+num_proc_bsearch: 8
\ No newline at end of file
diff --git a/examples/other/1xt2x/librispeech/local/test.sh b/examples/other/1xt2x/librispeech/local/test.sh
index a627ef72274e3d67219701d34956fdf2b3a364b2..ea40046b10997ee425d4e654b89fedc732c8b3fe 100755
--- a/examples/other/1xt2x/librispeech/local/test.sh
+++ b/examples/other/1xt2x/librispeech/local/test.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 
-if [ $# != 3 ];then
-    echo "usage: ${0} config_path ckpt_path_prefix model_type"
+if [ $# != 4 ];then
+    echo "usage: ${0} config_path decode_config_path ckpt_path_prefix model_type"
     exit -1
 fi
 
@@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
 echo "using $ngpu gpus..."
config_path=$1 -ckpt_prefix=$2 -model_type=$3 +decode_config_path=$2 +ckpt_prefix=$3 +model_type=$4 # download language model bash local/download_lm_en.sh @@ -21,6 +22,7 @@ fi python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ +--decode_cfg ${decode_config_path} \ --result_file ${ckpt_prefix}.rsl \ --checkpoint_path ${ckpt_prefix} \ --model_type ${model_type} diff --git a/examples/other/1xt2x/librispeech/run.sh b/examples/other/1xt2x/librispeech/run.sh index 8c667de2e3d67294ab1a9078f3fd3cd21fb0143c..8b614bbbfc8841cd6c60a4dcf5331a97349ce146 100755 --- a/examples/other/1xt2x/librispeech/run.sh +++ b/examples/other/1xt2x/librispeech/run.sh @@ -5,6 +5,7 @@ source path.sh stage=0 stop_stage=100 conf_path=conf/deepspeech2.yaml +decode_conf_path=conf/tuning/decode.yaml avg_num=1 model_type=offline gpus=1 @@ -23,5 +24,5 @@ fi if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then # test ckpt avg_n - CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${v18_ckpt} ${model_type}|| exit -1 + CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${v18_ckpt} ${model_type}|| exit -1 fi diff --git a/examples/other/1xt2x/src_deepspeech2x/bin/test.py b/examples/other/1xt2x/src_deepspeech2x/bin/test.py index b4f9cdf9dab02422818ee39f030515d917b2b343..88a13fdca31276f9d9aeeeb0bf0d42e7bec4d4c3 100644 --- a/examples/other/1xt2x/src_deepspeech2x/bin/test.py +++ b/examples/other/1xt2x/src_deepspeech2x/bin/test.py @@ -13,8 +13,8 @@ # limitations under the License. """Evaluation for DeepSpeech2 model.""" from src_deepspeech2x.test_model import DeepSpeech2Tester as Tester +from yacs.config import CfgNode -from paddlespeech.s2t.exps.deepspeech2.config import get_cfg_defaults from paddlespeech.s2t.training.cli import default_argument_parser from paddlespeech.s2t.utils.utility import print_arguments @@ -41,9 +41,13 @@ if __name__ == "__main__": print("model_type:{}".format(args.model_type)) # https://yaml.org/type/float.html - config = get_cfg_defaults(args.model_type) + config = CfgNode(new_allowed=True) if args.config: config.merge_from_file(args.config) + if args.decode_cfg: + decode_confs = CfgNode(new_allowed=True) + decode_confs.merge_from_file(args.decode_cfg) + config.decode = decode_confs if args.opts: config.merge_from_list(args.opts) config.freeze() diff --git a/examples/other/1xt2x/src_deepspeech2x/models/ds2/deepspeech2.py b/examples/other/1xt2x/src_deepspeech2x/models/ds2/deepspeech2.py index ad83a41db48c460db66700b33b5dff5f2aa805a6..fb8b321ced781c5246f01a7c19fec79df0cd9c85 100644 --- a/examples/other/1xt2x/src_deepspeech2x/models/ds2/deepspeech2.py +++ b/examples/other/1xt2x/src_deepspeech2x/models/ds2/deepspeech2.py @@ -12,12 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. """Deepspeech2 ASR Model""" -from typing import Optional - import paddle from paddle import nn from src_deepspeech2x.models.ds2.rnn import RNNStack -from yacs.config import CfgNode from paddlespeech.s2t.models.ds2.conv import ConvStack from paddlespeech.s2t.modules.ctc import CTCDecoder @@ -120,20 +117,6 @@ class DeepSpeech2Model(nn.Layer): :rtype: tuple of LayerOutput """ - @classmethod - def params(cls, config: Optional[CfgNode]=None) -> CfgNode: - default = CfgNode( - dict( - num_conv_layers=2, #Number of stacking convolution layers. - num_rnn_layers=3, #Number of stacking RNN layers. - rnn_layer_size=1024, #RNN layer size (number of RNN cells). 
- use_gru=True, #Use gru if set True. Use simple rnn if set False. - share_rnn_weights=True #Whether to share input-hidden weights between forward and backward directional RNNs.Notice that for GRU, weight sharing is not supported. - )) - if config is not None: - config.merge_from_other_cfg(default) - return default - def __init__(self, feat_size, dict_size, @@ -166,13 +149,13 @@ class DeepSpeech2Model(nn.Layer): """Compute Model loss Args: - audio (Tenosr): [B, T, D] + audio (Tensor): [B, T, D] audio_len (Tensor): [B] text (Tensor): [B, U] text_len (Tensor): [B] Returns: - loss (Tenosr): [1] + loss (Tensor): [1] """ eouts, eouts_len = self.encoder(audio, audio_len) loss = self.decoder(eouts, eouts_len, text, text_len) @@ -233,11 +216,11 @@ class DeepSpeech2Model(nn.Layer): """ model = cls(feat_size=dataloader.collate_fn.feature_size, dict_size=len(dataloader.collate_fn.vocab_list), - num_conv_layers=config.model.num_conv_layers, - num_rnn_layers=config.model.num_rnn_layers, - rnn_size=config.model.rnn_layer_size, - use_gru=config.model.use_gru, - share_rnn_weights=config.model.share_rnn_weights) + num_conv_layers=config.num_conv_layers, + num_rnn_layers=config.num_rnn_layers, + rnn_size=config.rnn_layer_size, + use_gru=config.use_gru, + share_rnn_weights=config.share_rnn_weights) infos = Checkpoint().load_parameters( model, checkpoint_path=checkpoint_path) logger.info(f"checkpoint info: {infos}") @@ -250,7 +233,7 @@ class DeepSpeech2Model(nn.Layer): Parameters config: yacs.config.CfgNode - config.model + config Returns ------- DeepSpeech2Model diff --git a/examples/other/1xt2x/src_deepspeech2x/test_model.py b/examples/other/1xt2x/src_deepspeech2x/test_model.py index 82e190d81bdcecb94fbc2a9626c1cb99bc5d9db4..2a38fb5cd9bf0fcb80494d0bc28525089440586b 100644 --- a/examples/other/1xt2x/src_deepspeech2x/test_model.py +++ b/examples/other/1xt2x/src_deepspeech2x/test_model.py @@ -15,8 +15,6 @@ import time from collections import defaultdict from contextlib import nullcontext -from pathlib import Path -from typing import Optional import numpy as np import paddle @@ -24,7 +22,6 @@ from paddle import distributed as dist from paddle.io import DataLoader from src_deepspeech2x.models.ds2 import DeepSpeech2InferModel from src_deepspeech2x.models.ds2 import DeepSpeech2Model -from yacs.config import CfgNode from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer from paddlespeech.s2t.io.collator import SpeechCollator @@ -44,27 +41,11 @@ logger = Log(__name__).getlog() class DeepSpeech2Trainer(Trainer): - @classmethod - def params(cls, config: Optional[CfgNode]=None) -> CfgNode: - # training config - default = CfgNode( - dict( - lr=5e-4, # learning rate - lr_decay=1.0, # learning rate decay - weight_decay=1e-6, # the coeff of weight decay - global_grad_clip=5.0, # the global norm clip - n_epoch=50, # train epochs - )) - - if config is not None: - config.merge_from_other_cfg(default) - return default - def __init__(self, config, args): super().__init__(config, args) def train_batch(self, batch_index, batch_data, msg): - train_conf = self.config.training + train_conf = self.config start = time.time() # forward @@ -98,7 +79,7 @@ class DeepSpeech2Trainer(Trainer): iteration_time = time.time() - start msg += "train time: {:>.3f}s, ".format(iteration_time) - msg += "batch size: {}, ".format(self.config.collator.batch_size) + msg += "batch size: {}, ".format(self.config.batch_size) msg += "accum: {}, ".format(train_conf.accum_grad) msg += ', '.join('{}: {:>.6f}'.format(k, v) for k, v in 
losses_np.items()) @@ -126,7 +107,7 @@ class DeepSpeech2Trainer(Trainer): total_loss += float(loss) * num_utts valid_losses['val_loss'].append(float(loss)) - if (i + 1) % self.config.training.log_interval == 0: + if (i + 1) % self.config.log_interval == 0: valid_dump = {k: np.mean(v) for k, v in valid_losses.items()} valid_dump['val_history_loss'] = total_loss / num_seen_utts @@ -146,15 +127,15 @@ class DeepSpeech2Trainer(Trainer): def setup_model(self): config = self.config.clone() config.defrost() - config.model.feat_size = self.train_loader.collate_fn.feature_size - #config.model.dict_size = self.train_loader.collate_fn.vocab_size - config.model.dict_size = len(self.train_loader.collate_fn.vocab_list) + config.feat_size = self.train_loader.collate_fn.feature_size + #config.dict_size = self.train_loader.collate_fn.vocab_size + config.dict_size = len(self.train_loader.collate_fn.vocab_list) config.freeze() if self.args.model_type == 'offline': - model = DeepSpeech2Model.from_config(config.model) + model = DeepSpeech2Model.from_config(config) elif self.args.model_type == 'online': - model = DeepSpeech2ModelOnline.from_config(config.model) + model = DeepSpeech2ModelOnline.from_config(config) else: raise Exception("wrong model type") if self.parallel: @@ -163,17 +144,13 @@ class DeepSpeech2Trainer(Trainer): logger.info(f"{model}") layer_tools.print_params(model, logger.info) - grad_clip = ClipGradByGlobalNormWithLog( - config.training.global_grad_clip) + grad_clip = ClipGradByGlobalNormWithLog(config.global_grad_clip) lr_scheduler = paddle.optimizer.lr.ExponentialDecay( - learning_rate=config.training.lr, - gamma=config.training.lr_decay, - verbose=True) + learning_rate=config.lr, gamma=config.lr_decay, verbose=True) optimizer = paddle.optimizer.Adam( learning_rate=lr_scheduler, parameters=model.parameters(), - weight_decay=paddle.regularizer.L2Decay( - config.training.weight_decay), + weight_decay=paddle.regularizer.L2Decay(config.weight_decay), grad_clip=grad_clip) self.model = model @@ -184,59 +161,59 @@ class DeepSpeech2Trainer(Trainer): def setup_dataloader(self): config = self.config.clone() config.defrost() - config.collator.keep_transcription_text = False + config.keep_transcription_text = False - config.data.manifest = config.data.train_manifest + config.manifest = config.train_manifest train_dataset = ManifestDataset.from_config(config) - config.data.manifest = config.data.dev_manifest + config.manifest = config.dev_manifest dev_dataset = ManifestDataset.from_config(config) - config.data.manifest = config.data.test_manifest + config.manifest = config.test_manifest test_dataset = ManifestDataset.from_config(config) if self.parallel: batch_sampler = SortagradDistributedBatchSampler( train_dataset, - batch_size=config.collator.batch_size, + batch_size=config.batch_size, num_replicas=None, rank=None, shuffle=True, drop_last=True, - sortagrad=config.collator.sortagrad, - shuffle_method=config.collator.shuffle_method) + sortagrad=config.sortagrad, + shuffle_method=config.shuffle_method) else: batch_sampler = SortagradBatchSampler( train_dataset, shuffle=True, - batch_size=config.collator.batch_size, + batch_size=config.batch_size, drop_last=True, - sortagrad=config.collator.sortagrad, - shuffle_method=config.collator.shuffle_method) + sortagrad=config.sortagrad, + shuffle_method=config.shuffle_method) collate_fn_train = SpeechCollator.from_config(config) - config.collator.augmentation_config = "" + config.augmentation_config = "" collate_fn_dev = SpeechCollator.from_config(config) - 
config.collator.keep_transcription_text = True - config.collator.augmentation_config = "" + config.keep_transcription_text = True + config.augmentation_config = "" collate_fn_test = SpeechCollator.from_config(config) self.train_loader = DataLoader( train_dataset, batch_sampler=batch_sampler, collate_fn=collate_fn_train, - num_workers=config.collator.num_workers) + num_workers=config.num_workers) self.valid_loader = DataLoader( dev_dataset, - batch_size=config.collator.batch_size, + batch_size=config.batch_size, shuffle=False, drop_last=False, collate_fn=collate_fn_dev) self.test_loader = DataLoader( test_dataset, - batch_size=config.decoding.batch_size, + batch_size=config.decode.decode_batch_size, shuffle=False, drop_last=False, collate_fn=collate_fn_test) @@ -250,31 +227,10 @@ class DeepSpeech2Trainer(Trainer): class DeepSpeech2Tester(DeepSpeech2Trainer): - @classmethod - def params(cls, config: Optional[CfgNode]=None) -> CfgNode: - # testing config - default = CfgNode( - dict( - alpha=2.5, # Coef of LM for beam search. - beta=0.3, # Coef of WC for beam search. - cutoff_prob=1.0, # Cutoff probability for pruning. - cutoff_top_n=40, # Cutoff number for pruning. - lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm', # Filepath for language model. - decoding_method='ctc_beam_search', # Decoding method. Options: ctc_beam_search, ctc_greedy - error_rate_type='wer', # Error rate type for evaluation. Options `wer`, 'cer' - num_proc_bsearch=8, # # of CPUs for beam search. - beam_size=500, # Beam search width. - batch_size=128, # decoding batch size - )) - - if config is not None: - config.merge_from_other_cfg(default) - return default - def __init__(self, config, args): self._text_featurizer = TextFeaturizer( - unit_type=config.collator.unit_type, vocab_filepath=None) + unit_type=config.unit_type, vocab=None) super().__init__(config, args) def ordid2token(self, texts, texts_len): @@ -293,7 +249,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): texts, texts_len, fout=None): - cfg = self.config.decoding + cfg = self.config.decode errors_sum, len_refs, num_ins = 0.0, 0, 0 errors_func = error_rate.char_errors if cfg.error_rate_type == 'cer' else error_rate.word_errors error_rate_func = error_rate.cer if cfg.error_rate_type == 'cer' else error_rate.wer @@ -399,31 +355,3 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): self.export() except KeyboardInterrupt: exit(-1) - - def setup(self): - """Setup the experiment. - """ - paddle.set_device('gpu' if self.args.ngpu > 0 else 'cpu') - - self.setup_output_dir() - self.setup_checkpointer() - - self.setup_dataloader() - self.setup_model() - - self.iteration = 0 - self.epoch = 0 - - def setup_output_dir(self): - """Create a directory used for output. - """ - # output dir - if self.args.output: - output_dir = Path(self.args.output).expanduser() - output_dir.mkdir(parents=True, exist_ok=True) - else: - output_dir = Path( - self.args.checkpoint_path).expanduser().parent.parent - output_dir.mkdir(parents=True, exist_ok=True) - - self.output_dir = output_dir diff --git a/examples/other/g2p/README.md b/examples/other/g2p/README.md index d734cc0ca2998519bee8b109c7bc609b8d46f1c5..c0f55bd42130a34a32ed21e34b5d5e297fff2f7c 100644 --- a/examples/other/g2p/README.md +++ b/examples/other/g2p/README.md @@ -10,11 +10,11 @@ Run the command below to get the results of the test. 
```bash ./run.sh ``` -The `avg WER` of g2p is: 0.027495061517943988 +The `avg WER` of g2p is: 0.027124048652822204 ```text ,--------------------------------------------------------------------. | | # Snt # Wrd | Corr Sub Del Ins Err S.Err | |--------+-----------------+-----------------------------------------| - | Sum/Avg| 9996 299181 | 97.3 2.7 0.0 0.0 2.7 52.5 | + | Sum/Avg| 9996 299181 | 97.3 2.7 0.0 0.0 2.7 52.2 | `--------------------------------------------------------------------' ``` diff --git a/examples/other/g2p/get_g2p_data.py b/examples/other/g2p/get_g2p_data.py index 9b43ca6209eb8a74c3fad65dcc3c3a0f9453c531..8fa3e53cdab4aa72b2c22fdfcf5d82e5caf4b7f8 100644 --- a/examples/other/g2p/get_g2p_data.py +++ b/examples/other/g2p/get_g2p_data.py @@ -28,7 +28,8 @@ def get_baker_data(root_dir): alignment_files = [f for f in alignment_files if f.stem not in exclude] data_dict = defaultdict(dict) for alignment_fp in alignment_files: - alignment = textgrid.openTextgrid(alignment_fp, includeEmptyIntervals=True) + alignment = textgrid.openTextgrid( + alignment_fp, includeEmptyIntervals=True) # only with baker's annotation utt_id = alignment.tierNameList[0].split(".")[0] intervals = alignment.tierDict[alignment.tierNameList[0]].entryList diff --git a/examples/other/tn/README.md b/examples/other/tn/README.md index 596b1815c8250b7ab4d695a928d4f72ad73b6a16..3b80de661e02ac32ea892bd1f0d169c7b9a8996a 100644 --- a/examples/other/tn/README.md +++ b/examples/other/tn/README.md @@ -7,11 +7,11 @@ Run the command below to get the results of the test. ```bash ./run.sh ``` -The `avg CER` of text normalization is: 0.006388318503308237 +The `avg CER` of text normalization is: 0.00730093543235227 ```text ,-----------------------------------------------------------------. | | # Snt # Wrd | Corr Sub Del Ins Err S.Err | |--------+--------------+-----------------------------------------| - | Sum/Avg| 125 2254 | 99.4 0.1 0.5 0.1 0.7 3.2 | + | Sum/Avg| 125 2254 | 99.4 0.1 0.5 0.2 0.8 4.8 | `-----------------------------------------------------------------' ``` diff --git a/examples/ted_en_zh/st0/conf/preprocess.yaml b/examples/ted_en_zh/st0/conf/preprocess.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d3992cb9fc3e46b2a8779d4e276223e1477570af --- /dev/null +++ b/examples/ted_en_zh/st0/conf/preprocess.yaml @@ -0,0 +1,25 @@ +process: + # extract kaldi fbank from PCM + - type: fbank_kaldi + fs: 16000 + n_mels: 80 + n_shift: 160 + win_length: 400 + dither: 0.1 + - type: cmvn_json + cmvn_path: data/mean_std.json + # these three processes are a.k.a. 
SpecAugment + - type: time_warp + max_time_warp: 5 + inplace: true + mode: PIL + - type: freq_mask + F: 30 + n_mask: 2 + inplace: true + replace_with_zero: false + - type: time_mask + T: 40 + n_mask: 2 + inplace: true + replace_with_zero: false diff --git a/examples/ted_en_zh/st0/conf/transformer.yaml b/examples/ted_en_zh/st0/conf/transformer.yaml index 36f287b10d0aa65262158f444bcf8dd057a846cf..d113fc943bca1413a49830e62f9bbc523d620fe5 100644 --- a/examples/ted_en_zh/st0/conf/transformer.yaml +++ b/examples/ted_en_zh/st0/conf/transformer.yaml @@ -1,109 +1,98 @@ # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.train.tiny - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test - min_input_len: 0.05 # second - max_input_len: 30.0 # second - min_output_len: 0.0 # tokens - max_output_len: 400.0 # tokens - min_output_input_ratio: 0.01 - max_output_input_ratio: 20.0 +########################################### +# Data                                    # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test +min_input_len: 0.05  # second +max_input_len: 30.0  # second +min_output_len: 0.0  # tokens +max_output_len: 400.0  # tokens +min_output_input_ratio: 0.01 +max_output_input_ratio: 20.0 -collator: - vocab_filepath: data/lang_char/vocab.txt - unit_type: 'spm' - spm_model_prefix: data/lang_char/bpe_unigram_8000 - mean_std_filepath: "" - # augmentation_config: conf/augmentation.json - batch_size: 10 - raw_wav: True # use raw_wav or kaldi feature - spectrum_type: fbank #linear, mfcc, fbank - feat_dim: 80 - delta_delta: False - dither: 1.0 - target_sample_rate: 16000 - max_freq: None - n_fft: None - stride_ms: 10.0 - window_ms: 25.0 - use_dB_normalization: True - target_dB: -20 - random_seed: 0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 +########################################### +# Dataloader                              # +########################################### +vocab_filepath: data/lang_char/vocab.txt +unit_type: 'spm' +spm_model_prefix: data/lang_char/bpe_unigram_8000 +mean_std_filepath: "" +preprocess_config: conf/preprocess.yaml +batch_size: 16 +maxlen_in: 5  # if input length  > maxlen-in, batchsize is automatically reduced +maxlen_out: 150  # if output length > maxlen-out, batchsize is automatically reduced +raw_wav: True  # use raw_wav or kaldi feature +spectrum_type: fbank #linear, mfcc, fbank +feat_dim: 80 +delta_delta: False +dither: 1.0 +target_sample_rate: 16000 +max_freq: None +n_fft: None +stride_ms: 10.0 +window_ms: 25.0 +use_dB_normalization: True +target_dB: -20 +random_seed: 0 +keep_transcription_text: False +sortagrad: True +shuffle_method: batch_shuffle +num_workers: 2 -# network architecture -model: - cmvn_file: "data/mean_std.json" - cmvn_file_type: "json" - # encoder related - encoder: transformer - encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: true +############################################ +# Network Architecture                     # +############################################ +cmvn_file: "data/mean_std.json" +cmvn_file_type: "json" +# encoder related +encoder: transformer +encoder_conf: + output_size: 256    # dimension of
attention + attention_heads: 4 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: true - # decoder related - decoder: transformer - decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 +# decoder related +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 - # hybrid CTC/attention - model_conf: - asr_weight: 0.0 - ctc_weight: 0.0 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false +# hybrid CTC/attention +model_conf: + asr_weight: 0.0 + ctc_weight: 0.0 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false - -training: - n_epoch: 120 - accum_grad: 2 - global_grad_clip: 5.0 - optim: adam - optim_conf: - lr: 0.004 - weight_decay: 1e-06 - scheduler: warmuplr - scheduler_conf: - warmup_steps: 25000 - lr_decay: 1.0 - log_interval: 5 - checkpoint: - kbest_n: 50 - latest_n: 5 - - -decoding: - batch_size: 5 - error_rate_type: char-bleu - decoding_method: fullsentence # 'fullsentence', 'simultaneous' - alpha: 2.5 - beta: 0.3 - beam_size: 10 - cutoff_prob: 1.0 - cutoff_top_n: 0 - num_proc_bsearch: 8 - ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. - decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: False # simulate streaming inference. Defaults to False. 
+########################################### +# Training # +########################################### +n_epoch: 120 +accum_grad: 2 +global_grad_clip: 5.0 +optim: adam +optim_conf: + lr: 2.5 + weight_decay: 1.0e-06 +scheduler: noam +scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 +log_interval: 50 +checkpoint: + kbest_n: 50 + latest_n: 5 diff --git a/examples/ted_en_zh/st0/conf/transformer_mtl_noam.yaml b/examples/ted_en_zh/st0/conf/transformer_mtl_noam.yaml index 78887d3cd95781efa54666d58c88ca75eb08520b..a01ec1a6d8bbd9d3425aee3ad89e2c655496ae28 100644 --- a/examples/ted_en_zh/st0/conf/transformer_mtl_noam.yaml +++ b/examples/ted_en_zh/st0/conf/transformer_mtl_noam.yaml @@ -1,112 +1,102 @@ # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test - min_input_len: 0.05 # second - max_input_len: 30.0 # second - min_output_len: 0.0 # tokens - max_output_len: 400.0 # tokens - min_output_input_ratio: 0.01 - max_output_input_ratio: 20.0 +########################################### +# Data # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test +min_input_len: 0.05 # second +max_input_len: 30.0 # second +min_output_len: 0.0 # tokens +max_output_len: 400.0 # tokens +min_output_input_ratio: 0.01 +max_output_input_ratio: 20.0 -collator: - vocab_filepath: data/lang_char/vocab.txt - unit_type: 'spm' - spm_model_prefix: data/lang_char/bpe_unigram_8000 - mean_std_filepath: "" - # augmentation_config: conf/augmentation.json - batch_size: 10 - raw_wav: True # use raw_wav or kaldi feature - spectrum_type: fbank #linear, mfcc, fbank - feat_dim: 80 - delta_delta: False - dither: 1.0 - target_sample_rate: 16000 - max_freq: None - n_fft: None - stride_ms: 10.0 - window_ms: 25.0 - use_dB_normalization: True - target_dB: -20 - random_seed: 0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 +########################################### +# Dataloader # +########################################### +vocab_filepath: data/lang_char/vocab.txt +unit_type: 'spm' +spm_model_prefix: data/lang_char/bpe_unigram_8000 +mean_std_filepath: "" +preprocess_config: conf/preprocess.yaml +batch_size: 16 +maxlen_in: 5 # if input length > maxlen-in, batchsize is automatically reduced +maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced +raw_wav: True # use raw_wav or kaldi feature +spectrum_type: fbank #linear, mfcc, fbank +feat_dim: 80 +delta_delta: False +dither: 1.0 +target_sample_rate: 16000 +max_freq: None +n_fft: None +stride_ms: 10.0 +window_ms: 25.0 +use_dB_normalization: True +target_dB: -20 +random_seed: 0 +keep_transcription_text: False +sortagrad: True +shuffle_method: batch_shuffle +num_workers: 2 -# network architecture -model: - cmvn_file: "data/mean_std.json" - cmvn_file_type: "json" - # encoder related - encoder: transformer - encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: true +############################################ +# Network Architecture # +############################################ +cmvn_file: 
"data/mean_std.json" +cmvn_file_type: "json" +# encoder related +encoder: transformer +encoder_conf: + output_size: 256 # dimension of attention + attention_heads: 4 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: true - # decoder related - decoder: transformer - decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 +# decoder related +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 - # hybrid CTC/attention - model_conf: - asr_weight: 0.5 - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false +# hybrid CTC/attention +model_conf: + asr_weight: 0.5 + ctc_weight: 0.3 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false -training: - n_epoch: 120 - accum_grad: 2 - global_grad_clip: 5.0 - optim: adam - optim_conf: - lr: 2.5 - weight_decay: 1e-06 - scheduler: noam - scheduler_conf: - warmup_steps: 25000 - lr_decay: 1.0 - log_interval: 50 - checkpoint: - kbest_n: 50 - latest_n: 5 +########################################### +# Training # +########################################### +n_epoch: 120 +accum_grad: 2 +global_grad_clip: 5.0 +optim: adam +optim_conf: + lr: 2.5 + weight_decay: 1.0e-06 +scheduler: noam +scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 +log_interval: 50 +checkpoint: + kbest_n: 50 + latest_n: 5 -decoding: - batch_size: 5 - error_rate_type: char-bleu - decoding_method: fullsentence # 'fullsentence', 'simultaneous' - alpha: 2.5 - beta: 0.3 - beam_size: 10 - word_reward: 0.7 - cutoff_prob: 1.0 - cutoff_top_n: 0 - num_proc_bsearch: 8 - ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. - decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: False # simulate streaming inference. Defaults to False. - diff --git a/examples/ted_en_zh/st0/conf/tuning/decode.yaml b/examples/ted_en_zh/st0/conf/tuning/decode.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ed081cf4ab8a2c8dad0ddead5824fd87aa08b58e --- /dev/null +++ b/examples/ted_en_zh/st0/conf/tuning/decode.yaml @@ -0,0 +1,11 @@ +batch_size: 5 +error_rate_type: char-bleu +decoding_method: fullsentence # 'fullsentence', 'simultaneous' +beam_size: 10 +word_reward: 0.7 +decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. + # <0: for decoding, use full chunk. + # >0: for decoding, use fixed chunk size as set. + # 0: used for training, it's prohibited here. +num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. +simulate_streaming: False # simulate streaming inference. Defaults to False. 
\ No newline at end of file diff --git a/examples/ted_en_zh/st0/local/test.sh b/examples/ted_en_zh/st0/local/test.sh index a9b18dd98810d56cb2c0c94141511bfd8343a346..904f95c4a273b2af77e50d5a7c682202abd02397 100755 --- a/examples/ted_en_zh/st0/local/test.sh +++ b/examples/ted_en_zh/st0/local/test.sh @@ -1,7 +1,7 @@ #! /usr/bin/env bash -if [ $# != 2 ];then - echo "usage: ${0} config_path ckpt_path_prefix" +if [ $# != 3 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix" exit -1 fi @@ -9,18 +9,18 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." config_path=$1 -ckpt_prefix=$2 +decode_config_path=$2 +ckpt_prefix=$3 for type in fullsentence; do echo "decoding ${type}" - batch_size=32 python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_cfg ${decode_config_path} \ --result_file ${ckpt_prefix}.${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ - --opts decoding.batch_size ${batch_size} + --opts decode.decoding_method ${type} \ if [ $? -ne 0 ]; then echo "Failed in evaluation!" diff --git a/examples/ted_en_zh/st0/run.sh b/examples/ted_en_zh/st0/run.sh index b85ba95a377df469a3d26bfa2e85491455488fdf..1746c025191127e139c297f46b5a04982c456f00 100755 --- a/examples/ted_en_zh/st0/run.sh +++ b/examples/ted_en_zh/st0/run.sh @@ -6,6 +6,7 @@ gpus=0,1,2,3 stage=0 stop_stage=50 conf_path=conf/transformer_mtl_noam.yaml +decode_conf_path=conf/tuning/decode.yaml avg_num=5 data_path=./TED_EnZh # path to unzipped data source ${MAIN_ROOT}/utils/parse_options.sh || exit 1; @@ -32,7 +33,7 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # test ckpt avg_n - CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi if [ ${stage} -le 51 ] && [ ${stop_stage} -ge 51 ]; then diff --git a/examples/ted_en_zh/st1/RESULTS.md b/examples/ted_en_zh/st1/RESULTS.md index e8aed53ecc6bca6485afa7a8ad420375a4363bbb..66dbce6cda50ad0480c4fe036d54c7e651b30eee 100644 --- a/examples/ted_en_zh/st1/RESULTS.md +++ b/examples/ted_en_zh/st1/RESULTS.md @@ -12,5 +12,5 @@ ## Transformer | Model | Params | Config | Val loss | Char-BLEU | | --- | --- | --- | --- | --- | -| FAT + Transformer+ASR MTL | 50.26M | conf/transformer_mtl_noam.yaml | 62.86 | 19.45 | +| FAT + Transformer+ASR MTL | 50.26M | conf/transformer_mtl_noam.yaml | 69.91 | 20.26 | | FAT + Transformer+ASR MTL with word reward | 50.26M | conf/transformer_mtl_noam.yaml | 62.86 | 20.80 | diff --git a/examples/ted_en_zh/st1/conf/preprocess.yaml b/examples/ted_en_zh/st1/conf/preprocess.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bc86d98c1aca87b69e66a3725321a44242c42c60 --- /dev/null +++ b/examples/ted_en_zh/st1/conf/preprocess.yaml @@ -0,0 +1,16 @@ +process: + # these three processes are a.k.a. 
SpecAugment + - type: time_warp + max_time_warp: 5 + inplace: true + mode: PIL + - type: freq_mask + F: 30 + n_mask: 2 + inplace: true + replace_with_zero: false + - type: time_mask + T: 40 + n_mask: 2 + inplace: true + replace_with_zero: false diff --git a/examples/ted_en_zh/st1/conf/transformer.yaml b/examples/ted_en_zh/st1/conf/transformer.yaml index 609c582407072f62bb5adc635201a92be5b2d6cd..515edee208c6e755218f80eb86c121df856bebbc 100644 --- a/examples/ted_en_zh/st1/conf/transformer.yaml +++ b/examples/ted_en_zh/st1/conf/transformer.yaml @@ -1,110 +1,90 @@ # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.train.tiny - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test - min_input_len: 5.0 # frame - max_input_len: 3000.0 # frame - min_output_len: 0.0 # tokens - max_output_len: 400.0 # tokens - min_output_input_ratio: 0.01 - max_output_input_ratio: 20.0 +########################################### +# Data                                    # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test -collator: - vocab_filepath: data/lang_char/vocab.txt - unit_type: 'spm' - spm_model_prefix: data/lang_char/bpe_unigram_8000 - mean_std_filepath: "" - # augmentation_config: conf/augmentation.json - batch_size: 10 - raw_wav: True # use raw_wav or kaldi feature - spectrum_type: fbank #linear, mfcc, fbank - feat_dim: 83 - delta_delta: False - dither: 1.0 - target_sample_rate: 16000 - max_freq: None - n_fft: None - stride_ms: 10.0 - window_ms: 25.0 - use_dB_normalization: True - target_dB: -20 - random_seed: 0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 +########################################### +# Dataloader                              # +########################################### +vocab_filepath: data/lang_char/ted_en_zh_bpe8000.txt +unit_type: 'spm' +spm_model_prefix: data/lang_char/ted_en_zh_bpe8000 +mean_std_filepath: "" +# preprocess_config: conf/augmentation.json +batch_size: 20 +feat_dim: 83 +stride_ms: 10.0 +window_ms: 25.0 +sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs +maxlen_in: 512 # if input length  > maxlen-in, batchsize is automatically reduced +maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced +minibatches: 0 # for debug +batch_count: auto +batch_bins: 0 +batch_frames_in: 0 +batch_frames_out: 0 +batch_frames_inout: 0 +preprocess_config: +num_workers: 0 +subsampling_factor: 1 +num_encs: 1 -# network architecture -model: - cmvn_file: None - cmvn_file_type: "json" - # encoder related - encoder: transformer - encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: true +############################################ +# Network Architecture                     # +############################################ +cmvn_file: None +cmvn_file_type: "json" +# encoder related +encoder: transformer +encoder_conf: + output_size: 256    # dimension of attention + attention_heads: 4 + linear_units: 2048  # the number of units of position-wise feed forward + num_blocks: 12      # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 +
attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: true - # decoder related - decoder: transformer - decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 +# decoder related +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 - # hybrid CTC/attention - model_conf: - asr_weight: 0.0 - ctc_weight: 0.0 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false +# hybrid CTC/attention +model_conf: + asr_weight: 0.0 + ctc_weight: 0.0 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false -training: - n_epoch: 20 - accum_grad: 2 - global_grad_clip: 5.0 - optim: adam - optim_conf: - lr: 0.004 - weight_decay: 1e-06 - scheduler: warmuplr - scheduler_conf: - warmup_steps: 25000 - lr_decay: 1.0 - log_interval: 5 - checkpoint: - kbest_n: 50 - latest_n: 5 - - -decoding: - batch_size: 5 - error_rate_type: char-bleu - decoding_method: fullsentence # 'fullsentence', 'simultaneous' - alpha: 2.5 - beta: 0.3 - beam_size: 10 - word_reward: 0.7 - cutoff_prob: 1.0 - cutoff_top_n: 0 - num_proc_bsearch: 8 - ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. - decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: False # simulate streaming inference. Defaults to False. +########################################### +# Training # +########################################### +n_epoch: 40 +accum_grad: 2 +global_grad_clip: 5.0 +optim: adam +optim_conf: + lr: 2.5 + weight_decay: 0. 
+scheduler: noam +scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 +log_interval: 50 +checkpoint: + kbest_n: 50 + latest_n: 5 diff --git a/examples/ted_en_zh/st1/conf/transformer_mtl_noam.yaml b/examples/ted_en_zh/st1/conf/transformer_mtl_noam.yaml index 10eccd1ebdcfd3d680ea4290f718ce53074dfa8d..a5f956fab71b223daa1dad0f348f2af5f9162ffb 100644 --- a/examples/ted_en_zh/st1/conf/transformer_mtl_noam.yaml +++ b/examples/ted_en_zh/st1/conf/transformer_mtl_noam.yaml @@ -1,110 +1,90 @@ # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test - min_input_len: 5.0 # frame - max_input_len: 3000.0 # frame - min_output_len: 0.0 # tokens - max_output_len: 400.0 # tokens - min_output_input_ratio: 0.01 - max_output_input_ratio: 20.0 +########################################### +# Data # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test -collator: - vocab_filepath: data/lang_char/ted_en_zh_bpe8000.txt - unit_type: 'spm' - spm_model_prefix: data/lang_char/ted_en_zh_bpe8000 - mean_std_filepath: "" - # augmentation_config: conf/augmentation.json - batch_size: 10 - raw_wav: True # use raw_wav or kaldi feature - spectrum_type: fbank #linear, mfcc, fbank - feat_dim: 83 - delta_delta: False - dither: 1.0 - target_sample_rate: 16000 - max_freq: None - n_fft: None - stride_ms: 10.0 - window_ms: 25.0 - use_dB_normalization: True - target_dB: -20 - random_seed: 0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 +########################################### +# Dataloader # +########################################### +vocab_filepath: data/lang_char/ted_en_zh_bpe8000.txt +unit_type: 'spm' +spm_model_prefix: data/lang_char/ted_en_zh_bpe8000 +mean_std_filepath: "" +# preprocess_config: conf/augmentation.json +batch_size: 20 +feat_dim: 83 +stride_ms: 10.0 +window_ms: 25.0 +sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs +maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced +maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced +minibatches: 0 # for debug +batch_count: auto +batch_bins: 0 +batch_frames_in: 0 +batch_frames_out: 0 +batch_frames_inout: 0 +preprocess_config: +num_workers: 0 +subsampling_factor: 1 +num_encs: 1 -# network architecture -model: - cmvn_file: None - cmvn_file_type: "json" - # encoder related - encoder: transformer - encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: true +############################################ +# Network Architecture # +############################################ +cmvn_file: None +cmvn_file_type: "json" +# encoder related +encoder: transformer +encoder_conf: + output_size: 256 # dimension of attention + attention_heads: 4 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, 
you can chose conv2d, conv2d6 and conv2d8 + normalize_before: true - # decoder related - decoder: transformer - decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 +# decoder related +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 - # hybrid CTC/attention - model_conf: - asr_weight: 0.5 - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false +# hybrid CTC/attention +model_conf: + asr_weight: 0.5 + ctc_weight: 0.3 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false -training: - n_epoch: 20 - accum_grad: 2 - global_grad_clip: 5.0 - optim: adam - optim_conf: - lr: 2.5 - weight_decay: 1e-06 - scheduler: noam - scheduler_conf: - warmup_steps: 25000 - lr_decay: 1.0 - log_interval: 5 - checkpoint: - kbest_n: 50 - latest_n: 5 - - -decoding: - batch_size: 5 - error_rate_type: char-bleu - decoding_method: fullsentence # 'fullsentence', 'simultaneous' - alpha: 2.5 - beta: 0.3 - beam_size: 10 - word_reward: 0.7 - cutoff_prob: 1.0 - cutoff_top_n: 0 - num_proc_bsearch: 8 - ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. - decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: False # simulate streaming inference. Defaults to False. \ No newline at end of file +########################################### +# Training # +########################################### +n_epoch: 40 +accum_grad: 2 +global_grad_clip: 5.0 +optim: adam +optim_conf: + lr: 2.5 + weight_decay: 0. +scheduler: noam +scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 +log_interval: 50 +checkpoint: + kbest_n: 50 + latest_n: 5 diff --git a/examples/ted_en_zh/st1/conf/tuning/decode.yaml b/examples/ted_en_zh/st1/conf/tuning/decode.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d6104dbce0181974625888bb5e5b90a098b4d7ce --- /dev/null +++ b/examples/ted_en_zh/st1/conf/tuning/decode.yaml @@ -0,0 +1,12 @@ + +batch_size: 5 +error_rate_type: char-bleu +decoding_method: fullsentence # 'fullsentence', 'simultaneous' +beam_size: 10 +word_reward: 0.7 +decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. + # <0: for decoding, use full chunk. + # >0: for decoding, use fixed chunk size as set. + # 0: used for training, it's prohibited here. +num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. +simulate_streaming: False # simulate streaming inference. Defaults to False. \ No newline at end of file diff --git a/examples/ted_en_zh/st1/local/test.sh b/examples/ted_en_zh/st1/local/test.sh index a9b18dd98810d56cb2c0c94141511bfd8343a346..904f95c4a273b2af77e50d5a7c682202abd02397 100755 --- a/examples/ted_en_zh/st1/local/test.sh +++ b/examples/ted_en_zh/st1/local/test.sh @@ -1,7 +1,7 @@ #! 
/usr/bin/env bash -if [ $# != 2 ];then - echo "usage: ${0} config_path ckpt_path_prefix" +if [ $# != 3 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix" exit -1 fi @@ -9,18 +9,18 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." config_path=$1 -ckpt_prefix=$2 +decode_config_path=$2 +ckpt_prefix=$3 for type in fullsentence; do echo "decoding ${type}" - batch_size=32 python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_cfg ${decode_config_path} \ --result_file ${ckpt_prefix}.${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ - --opts decoding.batch_size ${batch_size} + --opts decode.decoding_method ${type} \ if [ $? -ne 0 ]; then echo "Failed in evaluation!" diff --git a/examples/ted_en_zh/st1/run.sh b/examples/ted_en_zh/st1/run.sh index f6362a8b358b9020eb0ec79c79358ebd0ba0f5ad..1808e37b431402452a67d716b33038ec9ea25007 100755 --- a/examples/ted_en_zh/st1/run.sh +++ b/examples/ted_en_zh/st1/run.sh @@ -7,6 +7,7 @@ gpus=0,1,2,3 stage=1 stop_stage=4 conf_path=conf/transformer_mtl_noam.yaml +decode_conf_path=conf/tuning/decode.yaml ckpt_path= # paddle.98 # (finetune from FAT-ST pretrained model) avg_num=5 data_path=./TED_EnZh # path to unzipped data @@ -27,7 +28,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then if [ -n "${ckpt_path}" ]; then echo "Finetune from Pretrained Model" ${ckpt_path} ./local/download_pretrain.sh || exit -1 - fi + fi CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} "${ckpt_path}" fi @@ -38,5 +39,5 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # test ckpt avg_n - CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 -fi \ No newline at end of file + CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 +fi diff --git a/examples/timit/asr1/conf/preprocess.yaml b/examples/timit/asr1/conf/preprocess.yaml index dd4cfd273e0f654169955d29ed76c818ded9181c..f7f4c58d5228beea1b6462355bb5970d6f070ad4 100644 --- a/examples/timit/asr1/conf/preprocess.yaml +++ b/examples/timit/asr1/conf/preprocess.yaml @@ -5,7 +5,7 @@ process: n_mels: 80 n_shift: 160 win_length: 400 - dither: true + dither: 0.1 - type: cmvn_json cmvn_path: data/mean_std.json # these three processes are a.k.a. 
SpecAugument diff --git a/examples/timit/asr1/conf/transformer.yaml b/examples/timit/asr1/conf/transformer.yaml index 1c6059e4a94b43b537458f07e6e9a2cc75da20f9..4731395f0f695256bf25f1983e0e3bc767ed7425 100644 --- a/examples/timit/asr1/conf/transformer.yaml +++ b/examples/timit/asr1/conf/transformer.yaml @@ -1,110 +1,89 @@ # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test - min_input_len: 0.0 # second - max_input_len: 10.0 # second - min_output_len: 0.0 # tokens - max_output_len: 150.0 # tokens - min_output_input_ratio: 0.005 - max_output_input_ratio: 1000.0 +########################################### +# Data # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test -collator: - vocab_filepath: data/lang_char/vocab.txt - unit_type: "word" - mean_std_filepath: "" - augmentation_config: conf/preprocess.yaml - batch_size: 64 - raw_wav: True # use raw_wav or kaldi feature - spectrum_type: fbank #linear, mfcc, fbank - feat_dim: 80 - delta_delta: False - dither: 1.0 - target_sample_rate: 16000 - max_freq: None - n_fft: None - stride_ms: 10.0 - window_ms: 25.0 - use_dB_normalization: True - target_dB: -20 - random_seed: 0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 +########################################### +# Dataloader # +########################################### +vocab_filepath: data/lang_char/vocab.txt +spm_model_prefix: '' +unit_type: "word" +mean_std_filepath: "" +preprocess_config: conf/preprocess.yaml +feat_dim: 80 +stride_ms: 10.0 +window_ms: 25.0 +sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs +batch_size: 64 +maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced +maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced +minibatches: 0 # for debug +batch_count: auto +batch_bins: 0 +batch_frames_in: 0 +batch_frames_out: 0 +batch_frames_inout: 0 +num_workers: 0 +subsampling_factor: 1 +num_encs: 1 -# network architecture -model: - cmvn_file: - cmvn_file_type: "json" - # encoder related - encoder: transformer - encoder_conf: - output_size: 128 # dimension of attention - attention_heads: 4 - linear_units: 1024 # the number of units of position-wise feed forward - num_blocks: 6 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: true +############################################ +# Network Architecture # +############################################ +cmvn_file: +cmvn_file_type: "json" +# encoder related +encoder: transformer +encoder_conf: + output_size: 128 # dimension of attention + attention_heads: 4 + linear_units: 1024 # the number of units of position-wise feed forward + num_blocks: 6 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: true - # decoder related - decoder: transformer - decoder_conf: - attention_heads: 4 - linear_units: 1024 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 +# 
decoder related +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 1024 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 - # hybrid CTC/attention - model_conf: - ctc_weight: 0.5 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false +# hybrid CTC/attention +model_conf: + ctc_weight: 0.5 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false -training: - n_epoch: 50 - accum_grad: 1 - global_grad_clip: 5.0 - optim: adam - optim_conf: - lr: 0.004 - weight_decay: 1e-06 - scheduler: warmuplr - scheduler_conf: - warmup_steps: 1200 - lr_decay: 1.0 - log_interval: 10 - checkpoint: - kbest_n: 50 - latest_n: 5 - - -decoding: - batch_size: 64 - error_rate_type: wer - decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' - lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm - alpha: 2.5 - beta: 0.3 - beam_size: 10 - cutoff_prob: 1.0 - cutoff_top_n: 0 - num_proc_bsearch: 8 - ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. - decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: False # simulate streaming inference. Defaults to False. - +########################################### +# Training # +########################################### +n_epoch: 50 +accum_grad: 1 +global_grad_clip: 5.0 +optim: adam +optim_conf: + lr: 0.004 + weight_decay: 1.0e-6 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 1200 + lr_decay: 1.0 +log_interval: 10 +checkpoint: + kbest_n: 50 + latest_n: 5 diff --git a/examples/timit/asr1/conf/tuning/decode.yaml b/examples/timit/asr1/conf/tuning/decode.yaml new file mode 100644 index 0000000000000000000000000000000000000000..805dd02faa54721c993433168e0cfe7e1dd769d0 --- /dev/null +++ b/examples/timit/asr1/conf/tuning/decode.yaml @@ -0,0 +1,11 @@ +decode_batch_size: 64 +error_rate_type: wer +decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' +beam_size: 10 +ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. +decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. + # <0: for decoding, use full chunk. + # >0: for decoding, use fixed chunk size as set. + # 0: used for training, it's prohibited here. +num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. +simulate_streaming: False # simulate streaming inference. Defaults to False. diff --git a/examples/timit/asr1/local/align.sh b/examples/timit/asr1/local/align.sh index c65d611c43e870fe4024235c3a2f433452251d21..14d91d68702700179f5e81a843f08a2e264688e2 100755 --- a/examples/timit/asr1/local/align.sh +++ b/examples/timit/asr1/local/align.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 2 ];then - echo "usage: ${0} config_path ckpt_path_prefix" +if [ $# != 3 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix" exit -1 fi @@ -9,7 +9,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." 
config_path=$1 -ckpt_prefix=$2 +decode_config_path=$2 +ckpt_prefix=$3 batch_size=1 output_dir=${ckpt_prefix} @@ -20,9 +21,10 @@ mkdir -p ${output_dir} python3 -u ${BIN_DIR}/alignment.py \ --ngpu ${ngpu} \ --config ${config_path} \ +--decode_cfg ${decode_config_path} \ --result_file ${output_dir}/${type}.align \ --checkpoint_path ${ckpt_prefix} \ ---opts decoding.batch_size ${batch_size} +--opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in ctc alignment!" diff --git a/examples/timit/asr1/local/test.sh b/examples/timit/asr1/local/test.sh index 08ee0e36571f058367ab4921e498505a498155f7..88192c58396dfea8fb60325818a5a3712586b149 100755 --- a/examples/timit/asr1/local/test.sh +++ b/examples/timit/asr1/local/test.sh @@ -7,8 +7,8 @@ stop_stage=50 . ${MAIN_ROOT}/utils/parse_options.sh || exit 1; -if [ $# != 2 ];then - echo "usage: ${0} config_path ckpt_path_prefix" +if [ $# != 3 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix" exit -1 fi @@ -17,7 +17,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." config_path=$1 -ckpt_prefix=$2 +decode_config_path=$2 +ckpt_prefix=$3 chunk_mode=false if [[ ${config_path} =~ ^.*chunk_.*yaml$ ]];then @@ -43,10 +44,11 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_cfg ${decode_config_path} \ --result_file ${ckpt_prefix}.${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ - --opts decoding.batch_size ${batch_size} + --opts decode.decoding_method ${type} \ + --opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in evaluation!" @@ -63,10 +65,11 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_cfg ${decode_config_path} \ --result_file ${ckpt_prefix}.${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ - --opts decoding.batch_size ${batch_size} + --opts decode.decoding_method ${type} \ + --opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in evaluation!" @@ -82,10 +85,11 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_cfg ${decode_config_path} \ --result_file ${ckpt_prefix}.${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ - --opts decoding.batch_size ${batch_size} + --opts decode.decoding_method ${type} \ + --opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in evaluation!" 
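The same pattern recurs in every recipe touched by this patch: the nested `data`/`collator`/`model`/`training` groups become flat top-level keys, the old `decoding` group moves into a standalone `conf/tuning/decode.yaml`, and each `test.sh`/`align.sh` passes that file through the new `--decode_cfg` flag. The sketch below mirrors the config assembly shown in the `src_deepspeech2x/bin/test.py` hunk earlier in this patch; it is a minimal illustration only, and the YAML paths are placeholders rather than fixed repository paths.

```python
# Minimal sketch of the unified-config loading used by the refactored test
# entry points (cf. the bin/test.py hunk above). Paths are illustrative.
from yacs.config import CfgNode

config = CfgNode(new_allowed=True)               # accept keys not declared up front
config.merge_from_file("conf/transformer.yaml")  # flat Data/Dataloader/Network/Training keys

decode_config = CfgNode(new_allowed=True)
decode_config.merge_from_file("conf/tuning/decode.yaml")
config.decode = decode_config                    # decode options sit under the `decode` namespace

config.freeze()                                  # merged config is read-only from here on

# Call sites read flat keys directly after the refactor, e.g.:
#   config.batch_size              (was config.collator.batch_size)
#   config.num_conv_layers         (was config.model.num_conv_layers)
#   config.decode.decoding_method  (was config.decoding.decoding_method)
```

Command-line overrides still go through `--opts`, but against the new key layout, e.g. `--opts decode.decode_batch_size 1` as the updated `align.sh` above does.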
diff --git a/examples/timit/asr1/run.sh b/examples/timit/asr1/run.sh index a95b5f3ad7ce8333c0e21494dbd7fbae169a4fc2..0d84be9f3838ff68203496a1f47d0cc958a8303f 100755 --- a/examples/timit/asr1/run.sh +++ b/examples/timit/asr1/run.sh @@ -7,6 +7,7 @@ gpus=0,1,2,3 stage=0 stop_stage=50 conf_path=conf/transformer.yaml +decode_conf_path=conf/tuning/decode.yaml avg_num=10 TIMIT_path=/path/to/TIMIT @@ -34,15 +35,15 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # test ckpt avg_n - CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then # ctc alignment of test data - CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi -# if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then -# # export ckpt avg_n -# CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit -# fi +if [ ${stage} -le 51 ] && [ ${stop_stage} -ge 51 ]; then + # export ckpt avg_n + CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit +fi diff --git a/examples/tiny/asr0/conf/deepspeech2.yaml b/examples/tiny/asr0/conf/deepspeech2.yaml index 7d841d4743822c414e8d8dc4582d2a2b83211da9..64d432e2603c87eb9c77a197dc1be8b25378b5dd 100644 --- a/examples/tiny/asr0/conf/deepspeech2.yaml +++ b/examples/tiny/asr0/conf/deepspeech2.yaml @@ -1,70 +1,67 @@ # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.tiny - dev_manifest: data/manifest.tiny - test_manifest: data/manifest.tiny - min_input_len: 0.0 - max_input_len: 30.0 - min_output_len: 0.0 - max_output_len: 400.0 - min_output_input_ratio: 0.05 - max_output_input_ratio: 10.0 +########################################### +# Data # +########################################### +train_manifest: data/manifest.tiny +dev_manifest: data/manifest.tiny +test_manifest: data/manifest.tiny +min_input_len: 0.0 +max_input_len: 30.0 +min_output_len: 0.0 +max_output_len: 400.0 +min_output_input_ratio: 0.05 +max_output_input_ratio: 10.0 -collator: - mean_std_filepath: data/mean_std.json - unit_type: char - vocab_filepath: data/lang_char/vocab.txt - augmentation_config: conf/augmentation.json - random_seed: 0 - spm_model_prefix: - spectrum_type: linear - feat_dim: - delta_delta: False - stride_ms: 10.0 - window_ms: 20.0 - n_fft: None - max_freq: None - target_sample_rate: 16000 - use_dB_normalization: True - target_dB: -20 - dither: 1.0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 - batch_size: 4 +########################################### +# Dataloader # +########################################### +mean_std_filepath: data/mean_std.json +unit_type: char +vocab_filepath: data/lang_char/vocab.txt +augmentation_config: conf/augmentation.json +random_seed: 0 +spm_model_prefix: +spectrum_type: linear +feat_dim: 161 +delta_delta: False +stride_ms: 10.0 +window_ms: 20.0 +n_fft: None +max_freq: None +target_sample_rate: 16000 +use_dB_normalization: True +target_dB: -20 +dither: 1.0 +keep_transcription_text: False +sortagrad: True +shuffle_method: batch_shuffle +num_workers: 2 +batch_size: 4 -model: - 
num_conv_layers: 2 - num_rnn_layers: 3 - rnn_layer_size: 2048 - use_gru: False - share_rnn_weights: True - blank_id: 0 +############################################ +# Network Architecture # +############################################ +num_conv_layers: 2 +num_rnn_layers: 3 +rnn_layer_size: 2048 +use_gru: False +share_rnn_weights: True +blank_id: 0 -training: - n_epoch: 5 - accum_grad: 1 - lr: 1e-5 - lr_decay: 0.8 - weight_decay: 1e-06 - global_grad_clip: 5.0 - log_interval: 1 - checkpoint: - kbest_n: 3 - latest_n: 2 +########################################### +# Training # +########################################### +n_epoch: 5 +accum_grad: 1 +lr: 1.0e-5 +lr_decay: 0.8 +weight_decay: 1.0e-6 +global_grad_clip: 5.0 +log_interval: 1 +checkpoint: + kbest_n: 3 + latest_n: 2 -decoding: - batch_size: 128 - error_rate_type: wer - decoding_method: ctc_beam_search - lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm - alpha: 2.5 - beta: 0.3 - beam_size: 500 - cutoff_prob: 1.0 - cutoff_top_n: 40 - num_proc_bsearch: 8 diff --git a/examples/tiny/asr0/conf/deepspeech2_online.yaml b/examples/tiny/asr0/conf/deepspeech2_online.yaml index 393b6439f88c70997836aca494a3e894bffad445..74a4dc814d5f3183d0dc1b99994564850b801885 100644 --- a/examples/tiny/asr0/conf/deepspeech2_online.yaml +++ b/examples/tiny/asr0/conf/deepspeech2_online.yaml @@ -1,72 +1,68 @@ # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.tiny - dev_manifest: data/manifest.tiny - test_manifest: data/manifest.tiny - min_input_len: 0.0 - max_input_len: 30.0 - min_output_len: 0.0 - max_output_len: 400.0 - min_output_input_ratio: 0.05 - max_output_input_ratio: 10.0 +########################################### +# Data # +########################################### +train_manifest: data/manifest.tiny +dev_manifest: data/manifest.tiny +test_manifest: data/manifest.tiny +min_input_len: 0.0 +max_input_len: 30.0 +min_output_len: 0.0 +max_output_len: 400.0 +min_output_input_ratio: 0.05 +max_output_input_ratio: 10.0 -collator: - mean_std_filepath: data/mean_std.json - unit_type: char - vocab_filepath: data/lang_char/vocab.txt - augmentation_config: conf/augmentation.json - random_seed: 0 - spm_model_prefix: - spectrum_type: linear - feat_dim: - delta_delta: False - stride_ms: 10.0 - window_ms: 20.0 - n_fft: None - max_freq: None - target_sample_rate: 16000 - use_dB_normalization: True - target_dB: -20 - dither: 1.0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 0 - batch_size: 4 +########################################### +# Dataloader # +########################################### +mean_std_filepath: data/mean_std.json +unit_type: char +vocab_filepath: data/lang_char/vocab.txt +augmentation_config: conf/augmentation.json +random_seed: 0 +spm_model_prefix: +spectrum_type: linear +feat_dim: 161 +delta_delta: False +stride_ms: 10.0 +window_ms: 20.0 +n_fft: None +max_freq: None +target_sample_rate: 16000 +use_dB_normalization: True +target_dB: -20 +dither: 1.0 +keep_transcription_text: False +sortagrad: True +shuffle_method: batch_shuffle +num_workers: 0 +batch_size: 4 -model: - num_conv_layers: 2 - num_rnn_layers: 4 - rnn_layer_size: 2048 - rnn_direction: forward - num_fc_layers: 2 - fc_layers_size_list: 512, 256 - use_gru: True - blank_id: 0 +############################################ +# Network Architecture # +############################################ +num_conv_layers: 2 +num_rnn_layers: 4 +rnn_layer_size: 2048 +rnn_direction: forward +num_fc_layers: 2 
+fc_layers_size_list: 512, 256 +use_gru: True +blank_id: 0 -training: - n_epoch: 5 - accum_grad: 1 - lr: 1e-5 - lr_decay: 1.0 - weight_decay: 1e-06 - global_grad_clip: 5.0 - log_interval: 1 - checkpoint: - kbest_n: 3 - latest_n: 2 +########################################### +# Training # +########################################### +n_epoch: 5 +accum_grad: 1 +lr: 1.0e-5 +lr_decay: 1.0 +weight_decay: 1.0e-6 +global_grad_clip: 5.0 +log_interval: 1 +checkpoint: + kbest_n: 3 + latest_n: 2 - -decoding: - batch_size: 128 - error_rate_type: wer - decoding_method: ctc_beam_search - lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm - alpha: 2.5 - beta: 0.3 - beam_size: 500 - cutoff_prob: 1.0 - cutoff_top_n: 40 - num_proc_bsearch: 8 diff --git a/examples/tiny/asr0/conf/tuning/chunk_decode.yaml b/examples/tiny/asr0/conf/tuning/chunk_decode.yaml new file mode 100644 index 0000000000000000000000000000000000000000..94c3dbdeeb0210e3e8d84437c4d2e6c6596c580a --- /dev/null +++ b/examples/tiny/asr0/conf/tuning/chunk_decode.yaml @@ -0,0 +1,10 @@ +decode_batch_size: 128 +error_rate_type: wer +decoding_method: ctc_beam_search +lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm +alpha: 2.5 +beta: 0.3 +beam_size: 500 +cutoff_prob: 1.0 +cutoff_top_n: 40 +num_proc_bsearch: 8 diff --git a/examples/tiny/asr0/conf/tuning/decode.yaml b/examples/tiny/asr0/conf/tuning/decode.yaml new file mode 100644 index 0000000000000000000000000000000000000000..94c3dbdeeb0210e3e8d84437c4d2e6c6596c580a --- /dev/null +++ b/examples/tiny/asr0/conf/tuning/decode.yaml @@ -0,0 +1,10 @@ +decode_batch_size: 128 +error_rate_type: wer +decoding_method: ctc_beam_search +lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm +alpha: 2.5 +beta: 0.3 +beam_size: 500 +cutoff_prob: 1.0 +cutoff_top_n: 40 +num_proc_bsearch: 8 diff --git a/examples/tiny/asr0/local/test.sh b/examples/tiny/asr0/local/test.sh index a627ef72274e3d67219701d34956fdf2b3a364b2..ea40046b10997ee425d4e654b89fedc732c8b3fe 100755 --- a/examples/tiny/asr0/local/test.sh +++ b/examples/tiny/asr0/local/test.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 3 ];then - echo "usage: ${0} config_path ckpt_path_prefix model_type" +if [ $# != 4 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix model_type" exit -1 fi @@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." 
config_path=$1 -ckpt_prefix=$2 -model_type=$3 +decode_config_path=$2 +ckpt_prefix=$3 +model_type=$4 # download language model bash local/download_lm_en.sh @@ -21,6 +22,7 @@ fi python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ +--decode_cfg ${decode_config_path} \ --result_file ${ckpt_prefix}.rsl \ --checkpoint_path ${ckpt_prefix} \ --model_type ${model_type} diff --git a/examples/tiny/asr0/run.sh b/examples/tiny/asr0/run.sh index f39fb3fa022a2b35e72823671ed8bf3161f4ee18..25f04624526f2ea480492fad2ab53ee3eced0654 100755 --- a/examples/tiny/asr0/run.sh +++ b/examples/tiny/asr0/run.sh @@ -6,6 +6,7 @@ gpus=0 stage=0 stop_stage=100 conf_path=conf/deepspeech2.yaml +decode_conf_path=conf/tuning/decode.yaml avg_num=1 model_type=offline @@ -32,7 +33,7 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # test ckpt avg_n - CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type} || exit -1 + CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type} || exit -1 fi if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then diff --git a/examples/tiny/asr1/conf/chunk_confermer.yaml b/examples/tiny/asr1/conf/chunk_confermer.yaml index ad27478de66508e69596a7476f7ea037e3f60895..8f785121fa61922c8db9d99ee524938a811a7459 100644 --- a/examples/tiny/asr1/conf/chunk_confermer.yaml +++ b/examples/tiny/asr1/conf/chunk_confermer.yaml @@ -1,120 +1,97 @@ -# https://yaml.org/type/float.html -data: - train_manifest: data/manifest.tiny - dev_manifest: data/manifest.tiny - test_manifest: data/manifest.tiny - min_input_len: 0.5 # second - max_input_len: 30.0 # second - min_output_len: 0.0 # tokens - max_output_len: 400.0 # tokens - min_output_input_ratio: 0.05 - max_output_input_ratio: 10.0 - -collator: - mean_std_filepath: "" - vocab_filepath: data/lang_char/vocab.txt - unit_type: 'spm' - spm_model_prefix: 'data/lang_char/bpe_unigram_200' - augmentation_config: conf/preprocess.yaml - batch_size: 4 - raw_wav: True # use raw_wav or kaldi feature - spectrum_type: fbank #linear, mfcc, fbank - feat_dim: 80 - delta_delta: False - dither: 1.0 - target_sample_rate: 16000 - max_freq: None - n_fft: None - stride_ms: 10.0 - window_ms: 25.0 - use_dB_normalization: True - target_dB: -20 - random_seed: 0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 - - -# network architecture -model: - cmvn_file: "data/mean_std.json" - cmvn_file_type: "json" - # encoder related - encoder: conformer - encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: True - use_cnn_module: True - cnn_module_kernel: 15 - activation_type: 'swish' - pos_enc_layer_type: 'rel_pos' - selfattention_layer_type: 'rel_selfattn' - causal: True - use_dynamic_chunk: True - cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster - use_dynamic_left_chunk: false - - # decoder related - decoder: transformer - decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 
+############################################ +# Network Architecture # +############################################ +cmvn_file: "data/mean_std.json" +cmvn_file_type: "json" +# encoder related +encoder: conformer +encoder_conf: + output_size: 256 # dimension of attention + attention_heads: 4 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: True + use_cnn_module: True + cnn_module_kernel: 15 + activation_type: 'swish' + pos_enc_layer_type: 'rel_pos' + selfattention_layer_type: 'rel_selfattn' + causal: True + use_dynamic_chunk: True + cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster + use_dynamic_left_chunk: false - # hybrid CTC/attention - model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false +# decoder related +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 +# hybrid CTC/attention +model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false -training: - n_epoch: 5 - accum_grad: 1 - global_grad_clip: 5.0 - optim: adam - optim_conf: - lr: 0.001 - weight_decay: 1e-06 - scheduler: warmuplr - scheduler_conf: - warmup_steps: 25000 - lr_decay: 1.0 - log_interval: 1 - checkpoint: - kbest_n: 10 - latest_n: 1 +########################################### +# Data # +########################################### +train_manifest: data/manifest.tiny +dev_manifest: data/manifest.tiny +test_manifest: data/manifest.tiny -decoding: - batch_size: 64 - error_rate_type: wer - decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' - lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm - alpha: 2.5 - beta: 0.3 - beam_size: 10 - cutoff_prob: 1.0 - cutoff_top_n: 0 - num_proc_bsearch: 8 - ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. - decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: False # simulate streaming inference. Defaults to False. 
- + +########################################### +# Dataloader # +########################################### +mean_std_filepath: "" +vocab_filepath: data/lang_char/vocab.txt +unit_type: 'spm' +spm_model_prefix: 'data/lang_char/bpe_unigram_200' +feat_dim: 80 +stride_ms: 10.0 +window_ms: 25.0 +sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs +batch_size: 4 +maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced +maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced +minibatches: 0 # for debug +batch_count: auto +batch_bins: 0 +batch_frames_in: 0 +batch_frames_out: 0 +batch_frames_inout: 0 +preprocess_config: conf/preprocess.yaml +num_workers: 0 +subsampling_factor: 1 +num_encs: 1 + +########################################### +# Training # +########################################### +n_epoch: 5 +accum_grad: 1 +global_grad_clip: 5.0 +optim: adam +optim_conf: + lr: 0.001 + weight_decay: 1.0e-06 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 +log_interval: 1 +checkpoint: + kbest_n: 10 + latest_n: 1 diff --git a/examples/tiny/asr1/conf/chunk_transformer.yaml b/examples/tiny/asr1/conf/chunk_transformer.yaml index 298518fb556a6f2a8d6748faa6a2dcc6a032c04e..2570bb852f5b6a6b0e00f5f052b049a0da07037e 100644 --- a/examples/tiny/asr1/conf/chunk_transformer.yaml +++ b/examples/tiny/asr1/conf/chunk_transformer.yaml @@ -1,113 +1,91 @@ -# https://yaml.org/type/float.html -data: - train_manifest: data/manifest.tiny - dev_manifest: data/manifest.tiny - test_manifest: data/manifest.tiny - min_input_len: 0.5 # second - max_input_len: 20.0 # second - min_output_len: 0.0 # tokens - max_output_len: 400.0 # tokens - min_output_input_ratio: 0.05 - max_output_input_ratio: 10.0 - -collator: - mean_std_filepath: "" - vocab_filepath: data/lang_char/vocab.txt - unit_type: 'spm' - spm_model_prefix: 'data/lang_char/bpe_unigram_200' - augmentation_config: conf/preprocess.yaml - batch_size: 4 - raw_wav: True # use raw_wav or kaldi feature - spectrum_type: fbank #linear, mfcc, fbank - feat_dim: 80 - delta_delta: False - dither: 1.0 - target_sample_rate: 16000 - max_freq: None - n_fft: None - stride_ms: 10.0 - window_ms: 25.0 - use_dB_normalization: True - target_dB: -20 - random_seed: 0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 - - -# network architecture -model: - cmvn_file: "data/mean_std.json" - cmvn_file_type: "json" - # encoder related - encoder: transformer - encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: true - use_dynamic_chunk: true - use_dynamic_left_chunk: false +############################################ +# Network Architecture # +############################################ +cmvn_file: "data/mean_std.json" +cmvn_file_type: "json" +# encoder related +encoder: transformer +encoder_conf: + output_size: 256 # dimension of attention + attention_heads: 4 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + 
input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: true + use_dynamic_chunk: true + use_dynamic_left_chunk: false - # decoder related - decoder: transformer - decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 +# decoder related +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 - # hybrid CTC/attention - model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false +# hybrid CTC/attention +model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false -training: - n_epoch: 5 - accum_grad: 1 - global_grad_clip: 5.0 - optim: adam - optim_conf: - lr: 0.002 - weight_decay: 1e-06 - scheduler: warmuplr - scheduler_conf: - warmup_steps: 25000 - lr_decay: 1.0 - log_interval: 1 - checkpoint: - kbest_n: 10 - latest_n: 1 - +# https://yaml.org/type/float.html +########################################### +# Data # +########################################### +train_manifest: data/manifest.tiny +dev_manifest: data/manifest.tiny +test_manifest: data/manifest.tiny + +########################################### +# Dataloader # +########################################### +mean_std_filepath: "" +vocab_filepath: data/lang_char/vocab.txt +unit_type: 'spm' +spm_model_prefix: 'data/lang_char/bpe_unigram_200' +preprocess_config: conf/preprocess.yaml +feat_dim: 80 +stride_ms: 10.0 +window_ms: 25.0 +sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs +batch_size: 4 +maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced +maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced +minibatches: 0 # for debug +batch_count: auto +batch_bins: 0 +batch_frames_in: 0 +batch_frames_out: 0 +batch_frames_inout: 0 +num_workers: 0 +subsampling_factor: 1 +num_encs: 1 -decoding: - batch_size: 64 - error_rate_type: wer - decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' - lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm - alpha: 2.5 - beta: 0.3 - beam_size: 10 - cutoff_prob: 1.0 - cutoff_top_n: 0 - num_proc_bsearch: 8 - ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. - decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: False # simulate streaming inference. Defaults to False. 
+########################################### +# Training # +########################################### +n_epoch: 5 +accum_grad: 1 +global_grad_clip: 5.0 +optim: adam +optim_conf: + lr: 0.002 + weight_decay: 1.0e-06 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 +log_interval: 1 +checkpoint: + kbest_n: 10 + latest_n: 1 diff --git a/examples/tiny/asr1/conf/conformer.yaml b/examples/tiny/asr1/conf/conformer.yaml index eb8509024ffcd31ae35e12c866dba2848dc59267..eb8f0ab9f32f28a1b8dc16d8745ca1095a579a6f 100644 --- a/examples/tiny/asr1/conf/conformer.yaml +++ b/examples/tiny/asr1/conf/conformer.yaml @@ -1,116 +1,97 @@ # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.tiny - dev_manifest: data/manifest.tiny - test_manifest: data/manifest.tiny - min_input_len: 0.5 # second - max_input_len: 20.0 # second - min_output_len: 0.0 # tokens - max_output_len: 400.0 # tokens - min_output_input_ratio: 0.05 - max_output_input_ratio: 10.0 - -collator: - mean_std_filepath: "" - vocab_filepath: data/lang_char/vocab.txt - unit_type: 'spm' - spm_model_prefix: 'data/lang_char/bpe_unigram_200' - augmentation_config: conf/preprocess.yaml - batch_size: 4 - raw_wav: True # use raw_wav or kaldi feature - spectrum_type: fbank #linear, mfcc, fbank - feat_dim: 80 - delta_delta: False - dither: 1.0 - target_sample_rate: 16000 - max_freq: None - n_fft: None - stride_ms: 10.0 - window_ms: 25.0 - use_dB_normalization: True - target_dB: -20 - random_seed: 0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 +############################################ +# Network Architecture # +############################################ +cmvn_file: "data/mean_std.json" +cmvn_file_type: "json" +# encoder related +encoder: conformer +encoder_conf: + output_size: 256 # dimension of attention + attention_heads: 4 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: true + use_cnn_module: True + cnn_module_kernel: 15 + activation_type: 'swish' + pos_enc_layer_type: 'rel_pos' + selfattention_layer_type: 'rel_selfattn' +# decoder related +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 -# network architecture -model: - cmvn_file: "data/mean_std.json" - cmvn_file_type: "json" - # encoder related - encoder: conformer - encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: true - use_cnn_module: True - cnn_module_kernel: 15 - activation_type: 'swish' - pos_enc_layer_type: 'rel_pos' - selfattention_layer_type: 'rel_selfattn' +# hybrid CTC/attention +model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false - # decoder related - decoder: transformer - decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 
0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 - # hybrid CTC/attention - model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false +########################################### +# Data # +########################################### +train_manifest: data/manifest.tiny +dev_manifest: data/manifest.tiny +test_manifest: data/manifest.tiny + +########################################### +# Dataloader # +########################################### +mean_std_filepath: "" +vocab_filepath: data/lang_char/vocab.txt +unit_type: 'spm' +spm_model_prefix: 'data/lang_char/bpe_unigram_200' +preprocess_config: conf/preprocess.yaml +feat_dim: 80 +stride_ms: 10.0 +window_ms: 25.0 +sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs +batch_size: 4 +maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced +maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced +minibatches: 0 # for debug +batch_count: auto +batch_bins: 0 +batch_frames_in: 0 +batch_frames_out: 0 +batch_frames_inout: 0 +num_workers: 0 +subsampling_factor: 1 +num_encs: 1 -training: - n_epoch: 5 - accum_grad: 4 - global_grad_clip: 5.0 - optim: adam - optim_conf: - lr: 0.002 - weight_decay: 1e-06 - scheduler: warmuplr - scheduler_conf: - warmup_steps: 25000 - lr_decay: 1.0 - log_interval: 1 - checkpoint: - kbest_n: 10 - latest_n: 1 +########################################### +# Training # +########################################### +n_epoch: 5 +accum_grad: 4 +global_grad_clip: 5.0 +optim: adam +optim_conf: + lr: 0.002 + weight_decay: 1.0e-06 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 +log_interval: 1 +checkpoint: + kbest_n: 10 + latest_n: 1 -decoding: - batch_size: 64 - error_rate_type: wer - decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' - lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm - alpha: 2.5 - beta: 0.3 - beam_size: 10 - cutoff_prob: 1.0 - cutoff_top_n: 0 - num_proc_bsearch: 8 - ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. - decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: False # simulate streaming inference. Defaults to False. diff --git a/examples/tiny/asr1/conf/preprocess.yaml b/examples/tiny/asr1/conf/preprocess.yaml index dd4cfd273e0f654169955d29ed76c818ded9181c..f7f4c58d5228beea1b6462355bb5970d6f070ad4 100644 --- a/examples/tiny/asr1/conf/preprocess.yaml +++ b/examples/tiny/asr1/conf/preprocess.yaml @@ -5,7 +5,7 @@ process: n_mels: 80 n_shift: 160 win_length: 400 - dither: true + dither: 0.1 - type: cmvn_json cmvn_path: data/mean_std.json # these three processes are a.k.a. 
SpecAugument diff --git a/examples/tiny/asr1/conf/transformer.yaml b/examples/tiny/asr1/conf/transformer.yaml index c641d1f5b7d4eddf86e22bde8e8b1c22466086da..4e3068d1540359f88f461f4e77fc0b8ab9627079 100644 --- a/examples/tiny/asr1/conf/transformer.yaml +++ b/examples/tiny/asr1/conf/transformer.yaml @@ -1,110 +1,90 @@ # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.tiny - dev_manifest: data/manifest.tiny - test_manifest: data/manifest.tiny - min_input_len: 0.5 # second - max_input_len: 20.0 # second - min_output_len: 0.0 # tokens - max_output_len: 400.0 # tokens - min_output_input_ratio: 0.05 - max_output_input_ratio: 10.0 - -collator: - mean_std_filepath: data/mean_std.json - vocab_filepath: data/lang_char/vocab.txt - unit_type: 'spm' - spm_model_prefix: 'data/lang_char/bpe_unigram_200' - augmentation_config: conf/preprocess.yaml - batch_size: 4 - raw_wav: True # use raw_wav or kaldi feature - spectrum_type: fbank #linear, mfcc, fbank - feat_dim: 80 - delta_delta: False - dither: 1.0 - target_sample_rate: 16000 - max_freq: None - n_fft: None - stride_ms: 10.0 - window_ms: 25.0 - use_dB_normalization: True - target_dB: -20 - random_seed: 0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 - -# network architecture -model: - cmvn_file: - cmvn_file_type: "json" - # encoder related - encoder: transformer - encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: true +############################################ +# Network Architecture # +############################################ +cmvn_file: +cmvn_file_type: "json" +# encoder related +encoder: transformer +encoder_conf: + output_size: 256 # dimension of attention + attention_heads: 4 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: true - # decoder related - decoder: transformer - decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 +# decoder related +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 - # hybrid CTC/attention - model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false +# hybrid CTC/attention +model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false +########################################### +# Data # +########################################### +train_manifest: data/manifest.tiny +dev_manifest: data/manifest.tiny +test_manifest: data/manifest.tiny + +########################################### +# Dataloader # +########################################### +mean_std_filepath: data/mean_std.json +vocab_filepath: data/lang_char/vocab.txt +unit_type: 'spm' +spm_model_prefix: 
'data/lang_char/bpe_unigram_200' +preprocess_config: conf/preprocess.yaml +feat_dim: 80 +stride_ms: 10.0 +window_ms: 25.0 +sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs +batch_size: 4 +maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced +maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced +minibatches: 0 # for debug +batch_count: auto +batch_bins: 0 +batch_frames_in: 0 +batch_frames_out: 0 +batch_frames_inout: 0 +num_workers: 0 +subsampling_factor: 1 +num_encs: 1 -training: - n_epoch: 5 - accum_grad: 1 - global_grad_clip: 5.0 - optim: adam - optim_conf: - lr: 0.002 - weight_decay: 1e-06 - scheduler: warmuplr - scheduler_conf: - warmup_steps: 25000 - lr_decay: 1.0 - log_interval: 1 - checkpoint: - kbest_n: 2 - latest_n: 1 +########################################### +# Training # +########################################### +n_epoch: 5 +accum_grad: 1 +global_grad_clip: 5.0 +optim: adam +optim_conf: + lr: 0.002 + weight_decay: 1.0e-06 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 +log_interval: 1 +checkpoint: + kbest_n: 2 + latest_n: 1 -decoding: - batch_size: 8 #64 - error_rate_type: wer - decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' - lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm - alpha: 2.5 - beta: 0.3 - beam_size: 10 - cutoff_prob: 1.0 - cutoff_top_n: 0 - num_proc_bsearch: 8 - ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. - decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: False # simulate streaming inference. Defaults to False. diff --git a/examples/tiny/asr1/conf/tuning/chunk_decode.yaml b/examples/tiny/asr1/conf/tuning/chunk_decode.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c5b641dae8e45a78e66bd72c1989c1279996097b --- /dev/null +++ b/examples/tiny/asr1/conf/tuning/chunk_decode.yaml @@ -0,0 +1,11 @@ +decode_batch_size: 8 #64 +error_rate_type: wer +decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' +beam_size: 10 +ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. +decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. + # <0: for decoding, use full chunk. + # >0: for decoding, use fixed chunk size as set. + # 0: used for training, it's prohibited here. +num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. +simulate_streaming: False # simulate streaming inference. Defaults to False. \ No newline at end of file diff --git a/examples/tiny/asr1/conf/tuning/decode.yaml b/examples/tiny/asr1/conf/tuning/decode.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a0984f9ee1c37ae9137e5320cabb2801602c3b4f --- /dev/null +++ b/examples/tiny/asr1/conf/tuning/decode.yaml @@ -0,0 +1,11 @@ +decode_batch_size: 8 #64 +error_rate_type: wer +decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' +beam_size: 10 +ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. +decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. 
+ # <0: for decoding, use full chunk. + # >0: for decoding, use fixed chunk size as set. + # 0: used for training, it's prohibited here. +num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. +simulate_streaming: False # simulate streaming inference. Defaults to False. diff --git a/examples/tiny/asr1/local/align.sh b/examples/tiny/asr1/local/align.sh index c65d611c43e870fe4024235c3a2f433452251d21..14d91d68702700179f5e81a843f08a2e264688e2 100755 --- a/examples/tiny/asr1/local/align.sh +++ b/examples/tiny/asr1/local/align.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 2 ];then - echo "usage: ${0} config_path ckpt_path_prefix" +if [ $# != 3 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix" exit -1 fi @@ -9,7 +9,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." config_path=$1 -ckpt_prefix=$2 +decode_config_path=$2 +ckpt_prefix=$3 batch_size=1 output_dir=${ckpt_prefix} @@ -20,9 +21,10 @@ mkdir -p ${output_dir} python3 -u ${BIN_DIR}/alignment.py \ --ngpu ${ngpu} \ --config ${config_path} \ +--decode_cfg ${decode_config_path} \ --result_file ${output_dir}/${type}.align \ --checkpoint_path ${ckpt_prefix} \ ---opts decoding.batch_size ${batch_size} +--opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in ctc alignment!" diff --git a/examples/tiny/asr1/local/test.sh b/examples/tiny/asr1/local/test.sh index 190bacffc8b911e062299a00ddf2ba6b4701c337..79df969b430bbfa8243a170e5531627e45c589e9 100755 --- a/examples/tiny/asr1/local/test.sh +++ b/examples/tiny/asr1/local/test.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 2 ];then - echo "usage: ${0} config_path ckpt_path_prefix" +if [ $# != 3 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix" exit -1 fi @@ -9,7 +9,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." config_path=$1 -ckpt_prefix=$2 +decode_config_path=$2 +ckpt_prefix=$3 chunk_mode=false if [[ ${config_path} =~ ^.*chunk_.*yaml$ ]];then @@ -33,10 +34,11 @@ for type in attention ctc_greedy_search; do python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_cfg ${decode_config_path} \ --result_file ${ckpt_prefix}.${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ - --opts decoding.batch_size ${batch_size} + --opts decode.decoding_method ${type} \ + --opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in evaluation!" @@ -50,10 +52,11 @@ for type in ctc_prefix_beam_search attention_rescoring; do python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_cfg ${decode_config_path} \ --result_file ${ckpt_prefix}.${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ - --opts decoding.batch_size ${batch_size} + --opts decode.decoding_method ${type} \ + --opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in evaluation!" 
diff --git a/examples/tiny/asr1/run.sh b/examples/tiny/asr1/run.sh index ec9c5a567547defa7f73ffdac8d164271ec2854e..1651c034c3e76692f10d366a3c21ca85109fd600 100755 --- a/examples/tiny/asr1/run.sh +++ b/examples/tiny/asr1/run.sh @@ -6,6 +6,7 @@ gpus=0 stage=0 stop_stage=50 conf_path=conf/transformer.yaml +decode_conf_path=conf/tuning/decode.yaml avg_num=1 source ${MAIN_ROOT}/utils/parse_options.sh || exit 1; @@ -31,12 +32,12 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # test ckpt avg_n - CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then # ctc alignment of test data - CUDA_VISIBLE_DEVICES=${gpus} ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + CUDA_VISIBLE_DEVICES=${gpus} ./local/align.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi if [ ${stage} -le 51 ] && [ ${stop_stage} -ge 51 ]; then diff --git a/examples/vctk/tts3/README.md b/examples/vctk/tts3/README.md index 1f2c9338e43ea25d05f4902bc2c3575d450c7f2d..74c1086a0e45ac77846de059d2988432dcd6263b 100644 --- a/examples/vctk/tts3/README.md +++ b/examples/vctk/tts3/README.md @@ -65,8 +65,8 @@ Here's the complete help message. ```text usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] - [--ngpu NGPU] [--verbose VERBOSE] [--phones-dict PHONES_DICT] - [--speaker-dict SPEAKER_DICT] + [--ngpu NGPU] [--phones-dict PHONES_DICT] + [--speaker-dict SPEAKER_DICT] [--voice-cloning VOICE_CLONING] Train a FastSpeech2 model. @@ -80,11 +80,12 @@ optional arguments: --output-dir OUTPUT_DIR output dir. --ngpu NGPU if ngpu=0, use cpu. - --verbose VERBOSE verbose. --phones-dict PHONES_DICT phone vocabulary file. --speaker-dict SPEAKER_DICT speaker id map file for multiple speaker model. + --voice-cloning VOICE_CLONING + whether training voice cloning model. ``` 1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`. 2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder. @@ -94,16 +95,16 @@ optional arguments: ### Synthesizing We use [parallel wavegan](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/voc1) as the neural vocoder. -Download pretrained parallel wavegan model from [pwg_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_vctk_ckpt_0.5.zip)and unzip it. +Download pretrained parallel wavegan model from [pwg_vctk_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_vctk_ckpt_0.1.1.zip) and unzip it. ```bash -unzip pwg_vctk_ckpt_0.5.zip +unzip pwg_vctk_ckpt_0.1.1.zip ``` Parallel WaveGAN checkpoint contains files listed below. 
```text -pwg_vctk_ckpt_0.5 -├── pwg_default.yaml # default config used to train parallel wavegan -├── pwg_snapshot_iter_1000000.pdz # generator parameters of parallel wavegan -└── pwg_stats.npy # statistics used to normalize spectrogram when training parallel wavegan +pwg_vctk_ckpt_0.1.1 +├── default.yaml # default config used to train parallel wavegan +├── snapshot_iter_1500000.pdz # generator parameters of parallel wavegan +└── feats_stats.npy # statistics used to normalize spectrogram when training parallel wavegan ``` `./local/synthesize.sh` calls `${BIN_DIR}/../synthesize.py`, which can synthesize waveform from `metadata.jsonl`. ```bash @@ -245,7 +246,7 @@ python3 ${BIN_DIR}/../synthesize_e2e.py \ --lang=en \ --text=${BIN_DIR}/../sentences_en.txt \ --output_dir=exp/default/test_e2e \ - --phones_dict=dump/phone_id_map.txt \ - --speaker_dict=dump/speaker_id_map.txt \ + --phones_dict=fastspeech2_nosil_vctk_ckpt_0.5/phone_id_map.txt \ + --speaker_dict=fastspeech2_nosil_vctk_ckpt_0.5/speaker_id_map.txt \ --spk_id=0 ``` diff --git a/examples/vctk/tts3/local/synthesize.sh b/examples/vctk/tts3/local/synthesize.sh index a8aef034cf0e03b0449ba96a219847ededb47b73..8381af464e62a3e4def2cae1e5844dc7778f853d 100755 --- a/examples/vctk/tts3/local/synthesize.sh +++ b/examples/vctk/tts3/local/synthesize.sh @@ -12,9 +12,9 @@ python3 ${BIN_DIR}/../synthesize.py \ --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ --am_stat=dump/train/speech_stats.npy \ --voc=pwgan_vctk \ - --voc_config=pwg_vctk_ckpt_0.5/pwg_default.yaml \ - --voc_ckpt=pwg_vctk_ckpt_0.5/pwg_snapshot_iter_1000000.pdz \ - --voc_stat=pwg_vctk_ckpt_0.5/pwg_stats.npy \ + --voc_config=pwg_vctk_ckpt_0.1.1/default.yaml \ + --voc_ckpt=pwg_vctk_ckpt_0.1.1/snapshot_iter_1500000.pdz \ + --voc_stat=pwg_vctk_ckpt_0.1.1/feats_stats.npy \ --test_metadata=dump/test/norm/metadata.jsonl \ --output_dir=${train_output_path}/test \ --phones_dict=dump/phone_id_map.txt \ diff --git a/examples/vctk/tts3/local/synthesize_e2e.sh b/examples/vctk/tts3/local/synthesize_e2e.sh index 954e8cb9a77681929158ce29f86a73de2077bb39..51bb9e192e41d5ecf4a1897e1e286aca470682bb 100755 --- a/examples/vctk/tts3/local/synthesize_e2e.sh +++ b/examples/vctk/tts3/local/synthesize_e2e.sh @@ -12,9 +12,9 @@ python3 ${BIN_DIR}/../synthesize_e2e.py \ --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ --am_stat=dump/train/speech_stats.npy \ --voc=pwgan_vctk \ - --voc_config=pwg_vctk_ckpt_0.5/pwg_default.yaml \ - --voc_ckpt=pwg_vctk_ckpt_0.5/pwg_snapshot_iter_1000000.pdz \ - --voc_stat=pwg_vctk_ckpt_0.5/pwg_stats.npy \ + --voc_config=pwg_vctk_ckpt_0.1.1/default.yaml \ + --voc_ckpt=pwg_vctk_ckpt_0.1.1/snapshot_iter_1500000.pdz \ + --voc_stat=pwg_vctk_ckpt_0.1.1/feats_stats.npy \ --lang=en \ --text=${BIN_DIR}/../sentences_en.txt \ --output_dir=${train_output_path}/test_e2e \ diff --git a/examples/vctk/voc1/README.md b/examples/vctk/voc1/README.md index 78254d4e0412b8bff08600b4250e49813f359452..4714f28dc2c7e7aa06857349a24ec6ac2d0423f6 100644 --- a/examples/vctk/voc1/README.md +++ b/examples/vctk/voc1/README.md @@ -62,8 +62,8 @@ Here's the complete help message. ```text usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] - [--ngpu NGPU] [--verbose VERBOSE] [--batch-size BATCH_SIZE] - [--max-iter MAX_ITER] [--run-benchmark RUN_BENCHMARK] + [--ngpu NGPU] [--batch-size BATCH_SIZE] [--max-iter MAX_ITER] + [--run-benchmark RUN_BENCHMARK] [--profiler_options PROFILER_OPTIONS] Train a ParallelWaveGAN model. 
@@ -78,7 +78,6 @@ optional arguments: --output-dir OUTPUT_DIR output dir. --ngpu NGPU if ngpu == 0, use cpu. - --verbose VERBOSE verbose. benchmark: arguments related to benchmark. @@ -108,7 +107,6 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p usage: synthesize.py [-h] [--generator-type GENERATOR_TYPE] [--config CONFIG] [--checkpoint CHECKPOINT] [--test-metadata TEST_METADATA] [--output-dir OUTPUT_DIR] [--ngpu NGPU] - [--verbose VERBOSE] Synthesize with GANVocoder. @@ -125,7 +123,6 @@ optional arguments: --output-dir OUTPUT_DIR output dir. --ngpu NGPU if ngpu == 0, use cpu. - --verbose VERBOSE verbose. ``` 1. `--config` parallel wavegan config file. You should use the same config with which the model is trained. @@ -135,15 +132,15 @@ optional arguments: 5. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ## Pretrained Model -Pretrained models can be downloaded here [pwg_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_vctk_ckpt_0.5.zip). +Pretrained models can be downloaded here [pwg_vctk_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_vctk_ckpt_0.1.1.zip). Parallel WaveGAN checkpoint contains files listed below. ```text -pwg_vctk_ckpt_0.5 -├── pwg_default.yaml # default config used to train parallel wavegan -├── pwg_snapshot_iter_1000000.pdz # generator parameters of parallel wavegan -└── pwg_stats.npy # statistics used to normalize spectrogram when training parallel wavegan +pwg_vctk_ckpt_0.1.1 +├── default.yaml # default config used to train parallel wavegan +├── snapshot_iter_1500000.pdz # generator parameters of parallel wavegan +└── feats_stats.npy # statistics used to normalize spectrogram when training parallel wavegan ``` ## Acknowledgement We adapted some code from https://github.com/kan-bayashi/ParallelWaveGAN. diff --git a/examples/vctk/voc1/conf/default.yaml b/examples/vctk/voc1/conf/default.yaml index d95eaad9dfdd6453be5b3bd24da85cf75ba5e59d..59ce3825dcc55c9a7cc2e9299ebfa4a351189d79 100644 --- a/examples/vctk/voc1/conf/default.yaml +++ b/examples/vctk/voc1/conf/default.yaml @@ -70,12 +70,9 @@ lambda_adv: 4.0 # Loss balancing coefficient. ########################################################### # DATA LOADER SETTING # ########################################################### -batch_size: 8 # Batch size. +batch_size: 6 # Batch size. batch_max_steps: 24000 # Length of each audio in batch. Make sure dividable by n_shift. -pin_memory: true # Whether to pin memory in Pytorch DataLoader. -num_workers: 4 # Number of workers in Pytorch DataLoader. -remove_short_samples: true # Whether to remove samples the length of which are less than batch_max_steps. -allow_cache: true # Whether to allow cache in dataset. If true, it requires cpu memory. +num_workers: 2 # Number of workers in DataLoader. ########################################################### # OPTIMIZER & SCHEDULER SETTING # @@ -103,7 +100,7 @@ discriminator_grad_norm: 1 # Discriminator's gradient norm. # INTERVAL SETTING # ########################################################### discriminator_train_start_steps: 100000 # Number of steps to start to train discriminator. -train_max_steps: 1000000 # Number of training steps. +train_max_steps: 1500000 # Number of training steps. save_interval_steps: 5000 # Interval steps to save checkpoint. eval_interval_steps: 1000 # Interval steps to evaluate the network. 
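Editor's note: the VCTK Parallel WaveGAN references are bumped from `pwg_vctk_ckpt_0.5` to `pwg_vctk_ckpt_0.1.1`, the bundled files drop their `pwg_` prefix, and the generator snapshot moves from 1,000,000 to 1,500,000 steps, matching the new `train_max_steps` above. A sketch of fetching the renamed bundle before running the updated synthesize scripts:

```bash
# Download and unpack the renamed vocoder checkpoint consumed by
# local/synthesize.sh and local/synthesize_e2e.sh.
cd examples/vctk/tts3
wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_vctk_ckpt_0.1.1.zip
unzip pwg_vctk_ckpt_0.1.1.zip
ls pwg_vctk_ckpt_0.1.1
# default.yaml  feats_stats.npy  snapshot_iter_1500000.pdz
```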
diff --git a/examples/wenetspeech/asr1/conf/conformer.yaml b/examples/wenetspeech/asr1/conf/conformer.yaml index a438236d8b950800670da36c2f873df5a119df75..6c2bbca4166e06f0b64e1bb00a197fb0295252d1 100644 --- a/examples/wenetspeech/asr1/conf/conformer.yaml +++ b/examples/wenetspeech/asr1/conf/conformer.yaml @@ -1,111 +1,92 @@ -# network architecture -model: - # encoder related - encoder: conformer - encoder_conf: - output_size: 512 # dimension of attention - attention_heads: 8 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: True - use_cnn_module: True - cnn_module_kernel: 15 - cnn_module_norm: layer_norm - activation_type: swish - pos_enc_layer_type: rel_pos - selfattention_layer_type: rel_selfattn +############################################ +# Network Architecture # +############################################ +cmvn_file: +cmvn_file_type: "json" +# encoder related +encoder: conformer +encoder_conf: + output_size: 512 # dimension of attention + attention_heads: 8 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: True + use_cnn_module: True + cnn_module_kernel: 15 + cnn_module_norm: layer_norm + activation_type: swish + pos_enc_layer_type: rel_pos + selfattention_layer_type: rel_selfattn - # decoder related - decoder: transformer - decoder_conf: - attention_heads: 8 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 +# decoder related +decoder: transformer +decoder_conf: + attention_heads: 8 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 - # hybrid CTC/attention - model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false +# hybrid CTC/attention +model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test - min_input_len: 0.1 # second - max_input_len: 12.0 # second - min_output_len: 1.0 - max_output_len: 400.0 - min_output_input_ratio: 0.05 - max_output_input_ratio: 10.0 +########################################### +# Data # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test -collator: - vocab_filepath: data/lang_char/vocab.txt - unit_type: 'char' - spm_model_prefix: '' - augmentation_config: conf/preprocess.yaml - batch_size: 64 - raw_wav: True # use raw_wav or kaldi feature - spectrum_type: fbank #linear, mfcc, fbank - feat_dim: 80 - delta_delta: False - dither: 1.0 - target_sample_rate: 16000 - max_freq: None - n_fft: None - stride_ms: 10.0 - window_ms: 25.0 - use_dB_normalization: True - target_dB: -20 - random_seed: 0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 
+########################################### +# Dataloader # +########################################### +vocab_filepath: data/lang_char/vocab.txt +unit_type: 'char' +preprocess_config: conf/preprocess.yaml +spm_model_prefix: '' +feat_dim: 80 +stride_ms: 10.0 +window_ms: 25.0 +sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs +batch_size: 64 +maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced +maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced +minibatches: 0 # for debug +batch_count: auto +batch_bins: 0 +batch_frames_in: 0 +batch_frames_out: 0 +batch_frames_inout: 0 +num_workers: 0 +subsampling_factor: 1 +num_encs: 1 -training: - n_epoch: 240 - accum_grad: 16 - global_grad_clip: 5.0 - log_interval: 100 - checkpoint: - kbest_n: 50 - latest_n: 5 - optim: adam - optim_conf: - lr: 0.001 - weight_decay: 1e-6 - scheduler: warmuplr - scheduler_conf: - warmup_steps: 5000 - lr_decay: 1.0 - - -decoding: - batch_size: 128 - error_rate_type: cer - decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' - lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm - alpha: 2.5 - beta: 0.3 - beam_size: 10 - cutoff_prob: 1.0 - cutoff_top_n: 0 - num_proc_bsearch: 8 - ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. - decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: False # simulate streaming inference. Defaults to False. \ No newline at end of file +########################################### +# Training # +########################################### +n_epoch: 240 +accum_grad: 16 +global_grad_clip: 5.0 +log_interval: 100 +checkpoint: + kbest_n: 50 + latest_n: 5 +optim: adam +optim_conf: + lr: 0.001 + weight_decay: 1.0e-6 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 5000 + lr_decay: 1.0 diff --git a/examples/wenetspeech/asr1/conf/preprocess.yaml b/examples/wenetspeech/asr1/conf/preprocess.yaml index dd4cfd273e0f654169955d29ed76c818ded9181c..f7f4c58d5228beea1b6462355bb5970d6f070ad4 100644 --- a/examples/wenetspeech/asr1/conf/preprocess.yaml +++ b/examples/wenetspeech/asr1/conf/preprocess.yaml @@ -5,7 +5,7 @@ process: n_mels: 80 n_shift: 160 win_length: 400 - dither: true + dither: 0.1 - type: cmvn_json cmvn_path: data/mean_std.json # these three processes are a.k.a. SpecAugument diff --git a/examples/wenetspeech/asr1/conf/tuning/decode.yaml b/examples/wenetspeech/asr1/conf/tuning/decode.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6924bfa637a3bf81cc50a7be92d9337b9ac81460 --- /dev/null +++ b/examples/wenetspeech/asr1/conf/tuning/decode.yaml @@ -0,0 +1,11 @@ +decode_batch_size: 128 +error_rate_type: cer +decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' +beam_size: 10 +ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. +decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. + # <0: for decoding, use full chunk. + # >0: for decoding, use fixed chunk size as set. + # 0: used for training, it's prohibited here. +num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. 
+simulate_streaming: False # simulate streaming inference. Defaults to False. \ No newline at end of file diff --git a/examples/wenetspeech/asr1/local/data.sh b/examples/wenetspeech/asr1/local/data.sh index 7dd478d193c199df9d4d025006d9392668be0546..d216dd84abbf1fb0604ba4095d1833315e70a016 100755 --- a/examples/wenetspeech/asr1/local/data.sh +++ b/examples/wenetspeech/asr1/local/data.sh @@ -96,7 +96,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then python3 ${MAIN_ROOT}/utils/build_vocab.py \ --unit_type="char" \ --count_threshold=0 \ - --vocab_path="data/vocab.txt" \ + --vocab_path="data/lang_char/vocab.txt" \ --manifest_paths "data/manifest.train.raw" if [ $? -ne 0 ]; then diff --git a/examples/wenetspeech/asr1/local/test.sh b/examples/wenetspeech/asr1/local/test.sh index da159de734a40996eed297beb147c23962b7fba3..65b884e512fa0ebf46c94d587e3ed4410c04343f 100755 --- a/examples/wenetspeech/asr1/local/test.sh +++ b/examples/wenetspeech/asr1/local/test.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 2 ];then - echo "usage: ${0} config_path ckpt_path_prefix" +if [ $# != 3 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix" exit -1 fi @@ -9,7 +9,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." config_path=$1 -ckpt_prefix=$2 +decode_config_path=$2 +ckpt_prefix=$3 chunk_mode=false if [[ ${config_path} =~ ^.*chunk_.*yaml$ ]];then @@ -36,10 +37,11 @@ for type in attention ctc_greedy_search; do python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_cfg ${decode_config_path} \ --result_file ${output_dir}/${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ - --opts decoding.batch_size ${batch_size} + --opts decode.decoding_method ${type} \ + --opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in evaluation!" @@ -55,10 +57,11 @@ for type in ctc_prefix_beam_search attention_rescoring; do python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_cfg ${decode_config_path} \ --result_file ${output_dir}/${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ - --opts decoding.batch_size ${batch_size} + --opts decode.decoding_method ${type} \ + --opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in evaluation!" diff --git a/examples/wenetspeech/asr1/local/test_wav.sh b/examples/wenetspeech/asr1/local/test_wav.sh index 5c7794740e88b14df8e2171d10a8fd8560e994bc..474642624e0a138764ec3eae5a5ced1e4cd57076 100755 --- a/examples/wenetspeech/asr1/local/test_wav.sh +++ b/examples/wenetspeech/asr1/local/test_wav.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 3 ];then - echo "usage: ${0} config_path ckpt_path_prefix audio_file" +if [ $# != 4 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix audio_file" exit -1 fi @@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." 
config_path=$1 -ckpt_prefix=$2 -audio_file=$3 +decode_config_path=$2 +ckpt_prefix=$3 +audio_file=$4 mkdir -p data wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/zh/demo_01_03.wav -P data/ @@ -43,10 +44,11 @@ for type in attention_rescoring; do python3 -u ${BIN_DIR}/test_wav.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_cfg ${decode_config_path} \ --result_file ${output_dir}/${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ - --opts decoding.batch_size ${batch_size} \ + --opts decode.decoding_method ${type} \ + --opts decode.decode_batch_size ${batch_size} \ --audio_file ${audio_file} if [ $? -ne 0 ]; then diff --git a/examples/wenetspeech/asr1/run.sh b/examples/wenetspeech/asr1/run.sh index d77f409fda23616bdc0f31c3900d2183ad37b23a..9995bc63eb8f9c55252abdc2ac400ea01bf389d9 100644 --- a/examples/wenetspeech/asr1/run.sh +++ b/examples/wenetspeech/asr1/run.sh @@ -7,7 +7,7 @@ gpus=0,1,2,3,4,5,6,7 stage=0 stop_stage=100 conf_path=conf/conformer.yaml - +decode_conf_path=conf/tuning/decode.yaml average_checkpoint=true avg_num=10 @@ -36,12 +36,12 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # test ckpt avg_n - CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then # ctc alignment of test data - CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then @@ -51,5 +51,5 @@ fi if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then # test a single .wav file - CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1 fi diff --git a/paddleaudio/CHANGELOG.md b/paddleaudio/CHANGELOG.md new file mode 100644 index 0000000000000000000000000000000000000000..4dc68c6ff8e4ba0e0e4e14d5025b38d958d9c3b0 --- /dev/null +++ b/paddleaudio/CHANGELOG.md @@ -0,0 +1,2 @@ +# Changelog + diff --git a/paddleaudio/__init__.py b/paddleaudio/__init__.py index 2685cf57c600b5931f17f5257a9443796662b916..b717777d32de43f1856da6867567c63b1ef2f5a0 100644 --- a/paddleaudio/__init__.py +++ b/paddleaudio/__init__.py @@ -13,3 +13,5 @@ # limitations under the License. 
from .backends import * from .features import * + +__version__ = '0.1.0' diff --git a/paddleaudio/datasets/dataset.py b/paddleaudio/datasets/dataset.py index fb521beae0c68e4649a6cc593bc4457f2ac2fc5f..7a57fd6cc03ce7283c53116284872b2d5ea91485 100644 --- a/paddleaudio/datasets/dataset.py +++ b/paddleaudio/datasets/dataset.py @@ -36,6 +36,7 @@ class AudioClassificationDataset(paddle.io.Dataset): files: List[str], labels: List[int], feat_type: str='raw', + sample_rate: int=None, **kwargs): """ Args: @@ -55,6 +56,7 @@ class AudioClassificationDataset(paddle.io.Dataset): self.labels = labels self.feat_type = feat_type + self.sample_rate = sample_rate self.feat_config = kwargs # Pass keyword arguments to customize feature config def _get_data(self, input_file: str): @@ -63,7 +65,11 @@ class AudioClassificationDataset(paddle.io.Dataset): def _convert_to_record(self, idx): file, label = self.files[idx], self.labels[idx] - waveform, sample_rate = load_audio(file) + if self.sample_rate is None: + waveform, sample_rate = load_audio(file) + else: + waveform, sample_rate = load_audio(file, sr=self.sample_rate) + feat_func = feat_funcs[self.feat_type] record = {} diff --git a/paddlespeech/cli/README.md b/paddlespeech/cli/README.md index 34466ec2f0d7001a27a7eb4d329ad6c39e036291..5ac7a3bcaf1709b94020715d4480c08cf98cc3f0 100644 --- a/paddlespeech/cli/README.md +++ b/paddlespeech/cli/README.md @@ -1,5 +1,7 @@ # PaddleSpeech Command Line +([简体中文](./README_cn.md)|English) + The simplest approach to use PaddleSpeech models. ## Help @@ -28,3 +30,9 @@ paddlespeech tts --input "你好，欢迎使用百度飞桨深度学习框架!" --output output.wav ``` + ## Text Post-processing + +- Punctuation Restoration + ```bash + paddlespeech text --task punc --input 今天的天气真不错啊你下午有空吗我想约你一起去吃饭 + ``` diff --git a/paddlespeech/cli/README_cn.md b/paddlespeech/cli/README_cn.md new file mode 100644 index 0000000000000000000000000000000000000000..75ab9e41b10152446db762b1b4ed1c180cd49967 --- /dev/null +++ b/paddlespeech/cli/README_cn.md @@ -0,0 +1,39 @@ +# PaddleSpeech 命令行工具 + +(简体中文|[English](./README.md)) + +`paddlespeech.cli` 模块是 PaddleSpeech 的命令行工具，它提供了最简便的方式调用 PaddleSpeech 提供的不同语音应用场景的预训练模型，用一行命令就可以进行模型预测： + + ## 命令行使用帮助 + ```bash + paddlespeech help + ``` + + ## 声音分类 + ```bash + paddlespeech cls --input input.wav + ``` + + ## 语音识别 + ```bash + paddlespeech asr --lang zh --input input_16k.wav + ``` + + ## 语音翻译(英-中) + + (暂不支持Windows系统) + ```bash + paddlespeech st --input input_16k.wav + ``` + + ## 语音合成 + ```bash + paddlespeech tts --input "你好，欢迎使用百度飞桨深度学习框架！"
--output output.wav + ``` + + ## 文本后处理 + +- 标点恢复 + ```bash + paddlespeech text --task punc --input 今天的天气真不错啊你下午有空吗我想约你一起去吃饭 + ``` diff --git a/paddlespeech/cli/asr/infer.py b/paddlespeech/cli/asr/infer.py index 05fcc20a23a57eb183467c82177453d84016b0e0..aa4e31d9ecd1751d7e7b4561bf6e9626859c3e1f 100644 --- a/paddlespeech/cli/asr/infer.py +++ b/paddlespeech/cli/asr/infer.py @@ -31,6 +31,7 @@ from ..log import logger from ..utils import cli_register from ..utils import download_and_decompress from ..utils import MODEL_HOME +from ..utils import stats_wrapper from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer from paddlespeech.s2t.transform.transformation import Transformation from paddlespeech.s2t.utils.dynamic_import import dynamic_import @@ -45,19 +46,29 @@ pretrained_models = { # "paddlespeech asr --model conformer_wenetspeech --lang zh --sr 16000 --input ./input.wav" "conformer_wenetspeech-zh-16k": { 'url': - 'https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/conformer.model.tar.gz', + 'https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1_conformer_wenetspeech_ckpt_0.1.1.model.tar.gz', 'md5': - '54e7a558a6e020c2f5fb224874943f97', + '76cb19ed857e6623856b7cd7ebbfeda4', 'cfg_path': - 'conf/conformer.yaml', + 'model.yaml', 'ckpt_path': 'exp/conformer/checkpoints/wenetspeech', }, + "transformer_librispeech-en-16k": { + 'url': + 'https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr1/asr1_transformer_librispeech_ckpt_0.1.1.model.tar.gz', + 'md5': + '2c667da24922aad391eacafe37bc1660', + 'cfg_path': + 'model.yaml', + 'ckpt_path': + 'exp/transformer/checkpoints/avg_10', + }, } model_alias = { - "ds2_offline": "paddlespeech.s2t.models.ds2:DeepSpeech2Model", - "ds2_online": "paddlespeech.s2t.models.ds2_online:DeepSpeech2ModelOnline", + "deepspeech2offline": "paddlespeech.s2t.models.ds2:DeepSpeech2Model", + "deepspeech2online": "paddlespeech.s2t.models.ds2_online:DeepSpeech2ModelOnline", "conformer": "paddlespeech.s2t.models.u2:U2Model", "transformer": "paddlespeech.s2t.models.u2:U2Model", "wenetspeech": "paddlespeech.s2t.models.u2:U2Model", @@ -84,7 +95,7 @@ class ASRExecutor(BaseExecutor): '--lang', type=str, default='zh', - help='Choose model language. zh or en') + help='Choose model language. zh or en, zh:[conformer_wenetspeech-zh-16k], en:[transformer_librispeech-en-16k]') self.parser.add_argument( "--sample_rate", type=int, @@ -96,6 +107,12 @@ class ASRExecutor(BaseExecutor): type=str, default=None, help='Config of asr task. Use default config when it is None.') + self.parser.add_argument( + '--decode_method', + type=str, + default='attention_rescoring', + choices=['ctc_greedy_search', 'ctc_prefix_beam_search', 'attention', 'attention_rescoring'], + help='Only supported by transformer and conformer models.') self.parser.add_argument( '--ckpt_path', type=str, default=None, @@ -135,6 +152,7 @@ class ASRExecutor(BaseExecutor): lang: str='zh', sample_rate: int=16000, cfg_path: Optional[os.PathLike]=None, + decode_method: str='attention_rescoring', ckpt_path: Optional[os.PathLike]=None): """ Init model and other resources from a specific path. """ @@ -158,51 +176,36 @@ else: self.cfg_path = os.path.abspath(cfg_path) self.ckpt_path = os.path.abspath(ckpt_path + ".pdparams") - res_path = os.path.dirname( + self.res_path = os.path.dirname( os.path.dirname(os.path.abspath(self.cfg_path))) #Init body.
self.config = CfgNode(new_allowed=True) self.config.merge_from_file(self.cfg_path) - self.config.decoding.decoding_method = "attention_rescoring" with UpdateConfig(self.config): - if "ds2_online" in model_type or "ds2_offline" in model_type: + if "deepspeech2online" in model_type or "deepspeech2offline" in model_type: from paddlespeech.s2t.io.collator import SpeechCollator - self.config.collator.vocab_filepath = os.path.join( - res_path, self.config.collator.vocab_filepath) - self.config.collator.mean_std_filepath = os.path.join( - res_path, self.config.collator.cmvn_path) + self.vocab = self.config.vocab_filepath + self.config.decode.lang_model_path = os.path.join(self.res_path, self.config.decode.lang_model_path) self.collate_fn_test = SpeechCollator.from_config(self.config) - text_feature = TextFeaturizer( - unit_type=self.config.collator.unit_type, - vocab=self.config.collator.vocab_filepath, - spm_model_prefix=self.config.collator.spm_model_prefix) - self.config.model.input_dim = self.collate_fn_test.feature_size - self.config.model.output_dim = text_feature.vocab_size + self.text_feature = TextFeaturizer( + unit_type=self.config.unit_type, + vocab=self.vocab) elif "conformer" in model_type or "transformer" in model_type or "wenetspeech" in model_type: - self.config.collator.vocab_filepath = os.path.join( - res_path, self.config.collator.vocab_filepath) - self.config.collator.augmentation_config = os.path.join( - res_path, self.config.collator.augmentation_config) - self.config.collator.spm_model_prefix = os.path.join( - res_path, self.config.collator.spm_model_prefix) - text_feature = TextFeaturizer( - unit_type=self.config.collator.unit_type, - vocab=self.config.collator.vocab_filepath, - spm_model_prefix=self.config.collator.spm_model_prefix) - self.config.model.input_dim = self.config.collator.feat_dim - self.config.model.output_dim = text_feature.vocab_size + self.config.spm_model_prefix = os.path.join(self.res_path, self.config.spm_model_prefix) + self.text_feature = TextFeaturizer( + unit_type=self.config.unit_type, + vocab=self.config.vocab_filepath, + spm_model_prefix=self.config.spm_model_prefix) + self.config.decode.decoding_method = decode_method else: raise Exception("wrong type") - # Enter the path of model root - model_name = model_type[:model_type.rindex( '_')] # model_type: {model_name}_{dataset} model_class = dynamic_import(model_name, model_alias) - model_conf = self.config.model - logger.info(model_conf) + model_conf = self.config model = model_class.from_config(model_conf) self.model = model self.model.eval() @@ -221,32 +224,21 @@ class ASRExecutor(BaseExecutor): logger.info("Preprocess audio_file:" + audio_file) # Get the object for feature extraction - if "ds2_online" in model_type or "ds2_offline" in model_type: + if "deepspeech2online" in model_type or "deepspeech2offline" in model_type: audio, _ = self.collate_fn_test.process_utterance( audio_file=audio_file, transcript=" ") audio_len = audio.shape[0] audio = paddle.to_tensor(audio, dtype='float32') audio_len = paddle.to_tensor(audio_len) audio = paddle.unsqueeze(audio, axis=0) - vocab_list = collate_fn_test.vocab_list + # vocab_list = collate_fn_test.vocab_list self._inputs["audio"] = audio self._inputs["audio_len"] = audio_len logger.info(f"audio feat shape: {audio.shape}") elif "conformer" in model_type or "transformer" in model_type or "wenetspeech" in model_type: logger.info("get the preprocess conf") - preprocess_conf_file = self.config.collator.augmentation_config - # redirect the cmvn path - with
io.open(preprocess_conf_file, encoding="utf-8") as f: - preprocess_conf = yaml.safe_load(f) - for idx, process in enumerate(preprocess_conf["process"]): - if process['type'] == "cmvn_json": - preprocess_conf["process"][idx][ - "cmvn_path"] = os.path.join( - self.res_path, - preprocess_conf["process"][idx]["cmvn_path"]) - break - logger.info(preprocess_conf) + preprocess_conf = self.config.preprocess_config preprocess_args = {"train": False} preprocessing = Transformation(preprocess_conf) logger.info("read the audio file") @@ -274,10 +266,7 @@ class ASRExecutor(BaseExecutor): audio_len = paddle.to_tensor(audio.shape[0]) audio = paddle.to_tensor(audio, dtype='float32').unsqueeze(axis=0) - text_feature = TextFeaturizer( - unit_type=self.config.collator.unit_type, - vocab=self.config.collator.vocab_filepath, - spm_model_prefix=self.config.collator.spm_model_prefix) + self._inputs["audio"] = audio self._inputs["audio_len"] = audio_len logger.info(f"audio feat shape: {audio.shape}") @@ -290,18 +279,15 @@ class ASRExecutor(BaseExecutor): """ Model inference and result stored in self.output. """ - text_feature = TextFeaturizer( - unit_type=self.config.collator.unit_type, - vocab=self.config.collator.vocab_filepath, - spm_model_prefix=self.config.collator.spm_model_prefix) - cfg = self.config.decoding + + cfg = self.config.decode audio = self._inputs["audio"] audio_len = self._inputs["audio_len"] - if "ds2_online" in model_type or "ds2_offline" in model_type: + if "deepspeech2online" in model_type or "deepspeech2offline" in model_type: result_transcripts = self.model.decode( audio, audio_len, - text_feature.vocab_list, + self.text_feature.vocab_list, decoding_method=cfg.decoding_method, lang_model_path=cfg.lang_model_path, beam_alpha=cfg.alpha, @@ -316,7 +302,7 @@ class ASRExecutor(BaseExecutor): result_transcripts = self.model.decode( audio, audio_len, - text_feature=text_feature, + text_feature=self.text_feature, decoding_method=cfg.decoding_method, beam_size=cfg.beam_size, ctc_weight=cfg.ctc_weight, @@ -419,18 +405,20 @@ class ASRExecutor(BaseExecutor): config = parser_args.config ckpt_path = parser_args.ckpt_path audio_file = parser_args.input + decode_method = parser_args.decode_method force_yes = parser_args.yes device = parser_args.device try: res = self(audio_file, model, lang, sample_rate, config, ckpt_path, - force_yes, device) + decode_method, force_yes, device) logger.info('ASR Result: {}'.format(res)) return True except Exception as e: logger.exception(e) return False + @stats_wrapper def __call__(self, audio_file: os.PathLike, model: str='conformer_wenetspeech', @@ -438,6 +426,7 @@ class ASRExecutor(BaseExecutor): sample_rate: int=16000, config: os.PathLike=None, ckpt_path: os.PathLike=None, + decode_method: str='attention_rescoring', force_yes: bool=False, device=paddle.get_device()): """ @@ -446,7 +435,7 @@ class ASRExecutor(BaseExecutor): audio_file = os.path.abspath(audio_file) self._check(audio_file, sample_rate, force_yes) paddle.set_device(device) - self._init_from_path(model, lang, sample_rate, config, ckpt_path) + self._init_from_path(model, lang, sample_rate, config, decode_method, ckpt_path) self.preprocess(model, audio_file) self.infer(model) res = self.postprocess() # Retrieve result of asr. 
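The asr/infer.py changes above add an English pretrained model and thread the new `--decode_method` option through `_init_from_path` into `config.decode.decoding_method`. A minimal sketch of the refactored Python API under these changes; the WAV path is a placeholder, and calling the executor will download the pretrained model on first use:

```python
# Hypothetical usage sketch of the refactored ASRExecutor; 'input_16k.wav'
# is a placeholder path, not a file shipped with the repo.
from paddlespeech.cli.asr.infer import ASRExecutor

asr = ASRExecutor()
# decode_method is the new knob; per the --decode_method help text it only
# takes effect for transformer/conformer models.
text = asr(
    audio_file='input_16k.wav',
    model='transformer_librispeech',  # resolves to transformer_librispeech-en-16k
    lang='en',
    sample_rate=16000,
    decode_method='ctc_prefix_beam_search')
print(text)
```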
diff --git a/paddlespeech/cli/cls/infer.py b/paddlespeech/cli/cls/infer.py index c31ad36105e599596fc023b87496f37d993b399c..52bc1972df9b726c068f3a804ef7b0c37be88aca 100644 --- a/paddlespeech/cli/cls/infer.py +++ b/paddlespeech/cli/cls/infer.py @@ -26,6 +26,7 @@ from ..log import logger from ..utils import cli_register from ..utils import download_and_decompress from ..utils import MODEL_HOME +from ..utils import stats_wrapper from paddleaudio import load from paddleaudio.features import LogMelSpectrogram from paddlespeech.s2t.utils.dynamic_import import dynamic_import @@ -245,6 +246,7 @@ class CLSExecutor(BaseExecutor): logger.exception(e) return False + @stats_wrapper def __call__(self, audio_file: os.PathLike, model: str='panns_cnn14', diff --git a/paddlespeech/cli/log.py b/paddlespeech/cli/log.py index 891b71a9485efc30ff77ecb4cb3362afe8fd366f..8644064c73ef407476e7870e65d1149019762723 100644 --- a/paddlespeech/cli/log.py +++ b/paddlespeech/cli/log.py @@ -43,8 +43,7 @@ class Logger(object): level) self.format = logging.Formatter( - fmt='[%(asctime)-15s] [%(levelname)8s] [%(filename)s] [L%(lineno)d] - %(message)s' - ) + fmt='[%(asctime)-15s] [%(levelname)8s] - %(message)s') self.handler = logging.StreamHandler() self.handler.setFormatter(self.format) diff --git a/paddlespeech/cli/st/infer.py b/paddlespeech/cli/st/infer.py index 553b025f022b064e936a9312624249106c3c95bb..1276424c509c961cc8d4555a59076758e1f76e78 100644 --- a/paddlespeech/cli/st/infer.py +++ b/paddlespeech/cli/st/infer.py @@ -30,6 +30,7 @@ from ..log import logger from ..utils import cli_register from ..utils import download_and_decompress from ..utils import MODEL_HOME +from ..utils import stats_wrapper from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer from paddlespeech.s2t.utils.dynamic_import import dynamic_import from paddlespeech.s2t.utils.utility import UpdateConfig @@ -39,11 +40,11 @@ __all__ = ["STExecutor"] pretrained_models = { "fat_st_ted-en-zh": { "url": - "https://paddlespeech.bj.bcebos.com/s2t/ted_en_zh/st1/fat_st_ted-en-zh.tar.gz", + "https://paddlespeech.bj.bcebos.com/s2t/ted_en_zh/st1/st1_transformer_mtl_noam_ted-en-zh_ckpt_0.1.1.model.tar.gz", "md5": - "fa0a7425b91b4f8d259c70b2aca5ae67", + "d62063f35a16d91210a71081bd2dd557", "cfg_path": - "conf/transformer_mtl_noam.yaml", + "model.yaml", "ckpt_path": "exp/transformer_mtl_noam/checkpoints/fat_st_ted-en-zh.pdparams", } @@ -169,24 +170,19 @@ class STExecutor(BaseExecutor): #Init body. 
self.config = CfgNode(new_allowed=True) self.config.merge_from_file(self.cfg_path) - self.config.decoding.decoding_method = "fullsentence" + self.config.decode.decoding_method = "fullsentence" with UpdateConfig(self.config): - self.config.collator.vocab_filepath = os.path.join( - res_path, self.config.collator.vocab_filepath) - self.config.collator.cmvn_path = os.path.join( - res_path, self.config.collator.cmvn_path) - self.config.collator.spm_model_prefix = os.path.join( - res_path, self.config.collator.spm_model_prefix) + self.config.cmvn_path = os.path.join( + res_path, self.config.cmvn_path) + self.config.spm_model_prefix = os.path.join( + res_path, self.config.spm_model_prefix) self.text_feature = TextFeaturizer( - unit_type=self.config.collator.unit_type, - vocab=self.config.collator.vocab_filepath, - spm_model_prefix=self.config.collator.spm_model_prefix) - self.config.model.input_dim = self.config.collator.feat_dim - self.config.model.output_dim = self.text_feature.vocab_size - - model_conf = self.config.model - logger.info(model_conf) + unit_type=self.config.unit_type, + vocab=self.config.vocab_filepath, + spm_model_prefix=self.config.spm_model_prefix) + + model_conf = self.config model_name = model_type[:model_type.rindex( '_')] # model_type: {model_name}_{dataset} model_class = dynamic_import(model_name, model_alias) @@ -217,7 +213,7 @@ class STExecutor(BaseExecutor): logger.info("Preprocess audio_file:" + audio_file) if "fat_st" in model_type: - cmvn = self.config.collator.cmvn_path + cmvn = self.config.cmvn_path utt_name = "_tmp" # Get the object for feature extraction @@ -283,7 +279,7 @@ class STExecutor(BaseExecutor): """ Model inference and result stored in self.output. """ - cfg = self.config.decoding + cfg = self.config.decode audio = self._inputs["audio"] audio_len = self._inputs["audio_len"] if model_type == "fat_st_ted": @@ -334,6 +330,7 @@ class STExecutor(BaseExecutor): logger.exception(e) return False + @stats_wrapper def __call__(self, audio_file: os.PathLike, model: str='fat_st_ted', diff --git a/paddlespeech/cli/text/infer.py b/paddlespeech/cli/text/infer.py index da9c5fe05710d9262e8ab8461020820720ed481d..1cef8fcfd2919f1dc9ede0baa22b2f1e80eb9d2c 100644 --- a/paddlespeech/cli/text/infer.py +++ b/paddlespeech/cli/text/infer.py @@ -26,6 +26,7 @@ from ..log import logger from ..utils import cli_register from ..utils import download_and_decompress from ..utils import MODEL_HOME +from ..utils import stats_wrapper __all__ = ['TextExecutor'] @@ -272,6 +273,7 @@ class TextExecutor(BaseExecutor): logger.exception(e) return False + @stats_wrapper def __call__( self, text: str, diff --git a/paddlespeech/cli/tts/infer.py b/paddlespeech/cli/tts/infer.py index f60f422452a4298de4a3e99e160eeac79e7d0381..a39a5c4e64564b71eac9d49b7144dabc5df332cc 100644 --- a/paddlespeech/cli/tts/infer.py +++ b/paddlespeech/cli/tts/infer.py @@ -29,6 +29,7 @@ from ..log import logger from ..utils import cli_register from ..utils import download_and_decompress from ..utils import MODEL_HOME +from ..utils import stats_wrapper from paddlespeech.s2t.utils.dynamic_import import dynamic_import from paddlespeech.t2s.frontend import English from paddlespeech.t2s.frontend.zh_frontend import Frontend @@ -155,26 +156,52 @@ pretrained_models = { }, "pwgan_vctk-en": { 'url': - 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_vctk_ckpt_0.5.zip', + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_vctk_ckpt_0.1.1.zip', 'md5': - '322ca688aec9b127cec2788b65aa3d52', + 
'b3da1defcde3e578be71eb284cb89f2c', 'config': - 'pwg_default.yaml', + 'default.yaml', 'ckpt': - 'pwg_snapshot_iter_1000000.pdz', + 'snapshot_iter_1500000.pdz', 'speech_stats': - 'pwg_stats.npy', + 'feats_stats.npy', }, # mb_melgan "mb_melgan_csmsc-zh": { 'url': - 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_finetune_ckpt_0.5.zip', + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_ckpt_0.1.1.zip', 'md5': - 'b69322ab4ea766d955bd3d9af7dc5f2d', + 'ee5f0604e20091f0d495b6ec4618b90d', 'config': - 'finetune.yaml', + 'default.yaml', 'ckpt': - 'snapshot_iter_2000000.pdz', + 'snapshot_iter_1000000.pdz', + 'speech_stats': + 'feats_stats.npy', + }, + # style_melgan + "style_melgan_csmsc-zh": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/style_melgan/style_melgan_csmsc_ckpt_0.1.1.zip', + 'md5': + '5de2d5348f396de0c966926b8c462755', + 'config': + 'default.yaml', + 'ckpt': + 'snapshot_iter_1500000.pdz', + 'speech_stats': + 'feats_stats.npy', + }, + # hifigan + "hifigan_csmsc-zh": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_ckpt_0.1.1.zip', + 'md5': + 'dd40a3d88dfcf64513fba2f0f961ada6', + 'config': + 'default.yaml', + 'ckpt': + 'snapshot_iter_2500000.pdz', 'speech_stats': 'feats_stats.npy', }, @@ -199,6 +226,14 @@ model_alias = { "paddlespeech.t2s.models.melgan:MelGANGenerator", "mb_melgan_inference": "paddlespeech.t2s.models.melgan:MelGANInference", + "style_melgan": + "paddlespeech.t2s.models.melgan:StyleMelGANGenerator", + "style_melgan_inference": + "paddlespeech.t2s.models.melgan:StyleMelGANInference", + "hifigan": + "paddlespeech.t2s.models.hifigan:HiFiGANGenerator", + "hifigan_inference": + "paddlespeech.t2s.models.hifigan:HiFiGANInference", } @@ -266,7 +301,7 @@ class TTSExecutor(BaseExecutor): default='pwgan_csmsc', choices=[ 'pwgan_csmsc', 'pwgan_ljspeech', 'pwgan_aishell3', 'pwgan_vctk', - 'mb_melgan_csmsc' + 'mb_melgan_csmsc', 'style_melgan_csmsc', 'hifigan_csmsc' ], help='Choose vocoder type of tts task.') @@ -504,37 +539,47 @@ class TTSExecutor(BaseExecutor): am_name = am[:am.rindex('_')] am_dataset = am[am.rindex('_') + 1:] get_tone_ids = False + merge_sentences = False if am_name == 'speedyspeech': get_tone_ids = True if lang == 'zh': input_ids = self.frontend.get_input_ids( - text, merge_sentences=True, get_tone_ids=get_tone_ids) + text, + merge_sentences=merge_sentences, + get_tone_ids=get_tone_ids) phone_ids = input_ids["phone_ids"] - phone_ids = phone_ids[0] if get_tone_ids: tone_ids = input_ids["tone_ids"] - tone_ids = tone_ids[0] elif lang == 'en': - input_ids = self.frontend.get_input_ids(text) + input_ids = self.frontend.get_input_ids( + text, merge_sentences=merge_sentences) phone_ids = input_ids["phone_ids"] else: print("lang should in {'zh', 'en'}!") - # am - if am_name == 'speedyspeech': - mel = self.am_inference(phone_ids, tone_ids) - # fastspeech2 - else: - # multi speaker - if am_dataset in {"aishell3", "vctk"}: - mel = self.am_inference( - phone_ids, spk_id=paddle.to_tensor(spk_id)) + flags = 0 + for i in range(len(phone_ids)): + part_phone_ids = phone_ids[i] + # am + if am_name == 'speedyspeech': + part_tone_ids = tone_ids[i] + mel = self.am_inference(part_phone_ids, part_tone_ids) + # fastspeech2 else: - mel = self.am_inference(phone_ids) - - # voc - wav = self.voc_inference(mel) - self._outputs['wav'] = wav + # multi speaker + if am_dataset in {"aishell3", "vctk"}: + mel = self.am_inference( + part_phone_ids, 
spk_id=paddle.to_tensor(spk_id)) + else: + mel = self.am_inference(part_phone_ids) + # voc + wav = self.voc_inference(mel) + if flags == 0: + wav_all = wav + flags = 1 + else: + wav_all = paddle.concat([wav_all, wav]) + self._outputs['wav'] = wav_all def postprocess(self, output: str='output.wav') -> Union[str, os.PathLike]: """ @@ -601,6 +646,7 @@ class TTSExecutor(BaseExecutor): logger.exception(e) return False + @stats_wrapper def __call__(self, text: str, am: str='fastspeech2_csmsc', diff --git a/paddlespeech/cli/utils.py b/paddlespeech/cli/utils.py index ee31b771bd7a0e4cbbb2e5d29fa70ebc8e8a2de4..63b670c863111719c3158d82dc4517bf0dc29d6a 100644 --- a/paddlespeech/cli/utils.py +++ b/paddlespeech/cli/utils.py @@ -11,22 +11,36 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import hashlib +import inspect +import json import os import tarfile +import threading +import time +import uuid import zipfile from typing import Any from typing import Dict +import paddle +import paddleaudio +import requests +import yaml from paddle.framework import load from . import download +from .. import __version__ from .entry import commands +requests.adapters.DEFAULT_RETRIES = 3 + __all__ = [ 'cli_register', 'get_command', 'download_and_decompress', 'load_state_dict_from_url', + 'stats_wrapper', ] @@ -101,6 +115,13 @@ def download_and_decompress(archive: Dict[str, str], path: str) -> os.PathLike: if not os.path.isdir(uncompress_path): download._decompress(filepath) else: + StatsWorker( + task='download', + version=__version__, + extra_info={ + 'download_url': archive['url'], + 'paddle_version': paddle.__version__ + }).start() uncompress_path = download.get_path_from_url(archive['url'], path, archive['md5']) @@ -146,3 +167,171 @@ def _get_sub_home(directory): PPSPEECH_HOME = _get_paddlespcceh_home() MODEL_HOME = _get_sub_home('models') +CONF_HOME = _get_sub_home('conf') + + +def _md5(text: str): + '''Calculate the md5 value of the input text.''' + md5code = hashlib.md5(text.encode()) + return md5code.hexdigest() + + +class ConfigCache: + def __init__(self): + self._data = {} + self._initialize() + self.file = os.path.join(CONF_HOME, 'cache.yaml') + if not os.path.exists(self.file): + self.flush() + return + + with open(self.file, 'r') as file: + try: + cfg = yaml.load(file, Loader=yaml.FullLoader) + self._data.update(cfg) + except: + self.flush() + + @property + def cache_info(self): + return self._data['cache_info'] + + def _initialize(self): + # Set default configuration values. 
+ cache_info = _md5(str(uuid.uuid1())[-12:]) + "-" + str(int(time.time())) + self._data['cache_info'] = cache_info + + def flush(self): + '''Flush the current configuration into the configuration file.''' + with open(self.file, 'w') as file: + cfg = json.loads(json.dumps(self._data)) + yaml.dump(cfg, file) + + +stats_api = "http://paddlepaddle.org.cn/paddlehub/stat" +cache_info = ConfigCache().cache_info + + +class StatsWorker(threading.Thread): + def __init__(self, + task="asr", + model=None, + version=__version__, + extra_info={}): + threading.Thread.__init__(self) + self._task = task + self._model = model + self._version = version + self._extra_info = extra_info + + def run(self): + params = { + 'task': self._task, + 'version': self._version, + 'from': 'ppspeech' + } + if self._model: + params['model'] = self._model + + self._extra_info.update({ + 'cache_info': cache_info, + }) + params.update({"extra": json.dumps(self._extra_info)}) + + try: + requests.get(stats_api, params) + except Exception: + pass + + return + + +def _note_one_stat(cls_name, params={}): + task = cls_name.replace('Executor', '').lower() # XXExecutor + extra_info = { + 'paddle_version': paddle.__version__, + } + + if 'model' in params: + model = params['model'] + else: + model = None + + if 'audio_file' in params: + try: + _, sr = paddleaudio.load(params['audio_file']) + except Exception: + sr = -1 + + if task == 'asr': + extra_info.update({ + 'lang': params['lang'], + 'inp_sr': sr, + 'model_sr': params['sample_rate'], + }) + elif task == 'st': + extra_info.update({ + 'lang': + params['src_lang'] + '-' + params['tgt_lang'], + 'inp_sr': + sr, + 'model_sr': + params['sample_rate'], + }) + elif task == 'tts': + model = params['am'] + extra_info.update({ + 'lang': params['lang'], + 'vocoder': params['voc'], + }) + elif task == 'cls': + extra_info.update({ + 'inp_sr': sr, + }) + elif task == 'text': + extra_info.update({ + 'sub_task': params['task'], + 'lang': params['lang'], + }) + else: + return + + StatsWorker( + task=task, + model=model, + version=__version__, + extra_info=extra_info, ).start() + + +def _parse_args(func, *args, **kwargs): + # FullArgSpec(args, varargs, varkw, defaults, kwonlyargs, kwonlydefaults, annotations) + argspec = inspect.getfullargspec(func) + + keys = argspec[0] + if keys[0] == 'self': # Remove self pointer. + keys = keys[1:] + + default_values = argspec[3] + values = [None] * (len(keys) - len(default_values)) + values.extend(list(default_values)) + params = dict(zip(keys, values)) + + for idx, v in enumerate(args): + params[keys[idx]] = v + for k, v in kwargs.items(): + params[k] = v + + return params + + +def stats_wrapper(executor_func): + def _wrapper(self, *args, **kwargs): + try: + _note_one_stat( + type(self).__name__, _parse_args(executor_func, *args, + **kwargs)) + except Exception: + pass + return executor_func(self, *args, **kwargs) + + return _wrapper diff --git a/paddlespeech/cls/exps/panns/predict.py b/paddlespeech/cls/exps/panns/predict.py index 9cfd8b6ce44ab7318a218a52bd1c2eaee28d680a..ffe42d3904bc9b62c678b623147de320b287c071 100644 --- a/paddlespeech/cls/exps/panns/predict.py +++ b/paddlespeech/cls/exps/panns/predict.py @@ -12,58 +12,61 @@ # See the License for the specific language governing permissions and # limitations under the License.
import argparse +import os -import numpy as np import paddle import paddle.nn.functional as F +import yaml from paddleaudio.backends import load as load_audio -from paddleaudio.datasets import ESC50 from paddleaudio.features import LogMelSpectrogram -from paddleaudio.features import melspectrogram -from paddlespeech.cls.models import cnn14 +from paddleaudio.utils import logger from paddlespeech.cls.models import SoundClassifier +from paddlespeech.s2t.utils.dynamic_import import dynamic_import # yapf: disable parser = argparse.ArgumentParser(__doc__) -parser.add_argument("--wav", type=str, required=True, help="Audio file to infer.") -parser.add_argument("--feat_backend", type=str, choices=['numpy', 'paddle'], default='numpy', help="Choose backend to extract features from audio files.") -parser.add_argument("--top_k", type=int, default=1, help="Show top k predicted results") -parser.add_argument("--checkpoint", type=str, required=True, help="Checkpoint of model.") +parser.add_argument("--cfg_path", type=str, required=True) args = parser.parse_args() # yapf: enable -def extract_features(file: str, feat_backend: str='numpy', - **kwargs) -> paddle.Tensor: - waveform, sr = load_audio(file, sr=None) - - if args.feat_backend == 'numpy': - feat = melspectrogram(waveform, sr, **kwargs).transpose() - feat = np.expand_dims(feat, 0) - feat = paddle.to_tensor(feat) - else: - feature_extractor = LogMelSpectrogram(sr=sr, **kwargs) - feat = feature_extractor(paddle.to_tensor(waveform).unsqueeze(0)) - feat = paddle.transpose(feat, [0, 2, 1]) +def extract_features(file: str, **feat_conf) -> paddle.Tensor: + file = os.path.abspath(os.path.expanduser(file)) + waveform, _ = load_audio(file, sr=feat_conf['sr']) + feature_extractor = LogMelSpectrogram(**feat_conf) + feat = feature_extractor(paddle.to_tensor(waveform).unsqueeze(0)) + feat = paddle.transpose(feat, [0, 2, 1]) return feat if __name__ == '__main__': + args.cfg_path = os.path.abspath(os.path.expanduser(args.cfg_path)) + with open(args.cfg_path, 'r') as f: + config = yaml.safe_load(f) + + model_conf = config['model'] + data_conf = config['data'] + feat_conf = config['feature'] + predicting_conf = config['predicting'] + + ds_class = dynamic_import(data_conf['dataset']) + backbone_class = dynamic_import(model_conf['backbone']) + model = SoundClassifier( - backbone=cnn14(pretrained=False, extract_embedding=True), - num_class=len(ESC50.label_list)) - model.set_state_dict(paddle.load(args.checkpoint)) + backbone=backbone_class(pretrained=False, extract_embedding=True), + num_class=len(ds_class.label_list)) + model.set_state_dict(paddle.load(predicting_conf['checkpoint'])) model.eval() - feat = extract_features(args.wav, args.feat_backend) + feat = extract_features(predicting_conf['audio_file'], **feat_conf) logits = model(feat) probs = F.softmax(logits, axis=1).numpy() sorted_indices = (-probs[0]).argsort() - msg = f'[{args.wav}]\n' - for idx in sorted_indices[:args.top_k]: - msg += f'{ESC50.label_list[idx]}: {probs[0][idx]}\n' - print(msg) + msg = f"[{predicting_conf['audio_file']}]\n" + for idx in sorted_indices[:predicting_conf['top_k']]: + msg += f'{ds_class.label_list[idx]}: {probs[0][idx]}\n' + logger.info(msg) diff --git a/paddlespeech/cls/exps/panns/train.py b/paddlespeech/cls/exps/panns/train.py index 1213097899cee2594b5ecf4e91310c16b5f46841..7e292214827cdbbaaed51a51c175331cd159b098 100644 --- a/paddlespeech/cls/exps/panns/train.py +++ b/paddlespeech/cls/exps/panns/train.py @@ -15,24 +15,17 @@ import argparse import os import paddle +import yaml 
-from paddleaudio.datasets import ESC50 from paddleaudio.features import LogMelSpectrogram from paddleaudio.utils import logger from paddleaudio.utils import Timer -from paddlespeech.cls.models import cnn14 from paddlespeech.cls.models import SoundClassifier +from paddlespeech.s2t.utils.dynamic_import import dynamic_import # yapf: disable parser = argparse.ArgumentParser(__doc__) -parser.add_argument("--epochs", type=int, default=50, help="Number of epoches for fine-tuning.") -parser.add_argument("--feat_backend", type=str, choices=['numpy', 'paddle'], default='numpy', help="Choose backend to extract features from audio files.") -parser.add_argument("--learning_rate", type=float, default=5e-5, help="Learning rate used to train with warmup.") -parser.add_argument("--batch_size", type=int, default=16, help="Total examples' number in batch for training.") -parser.add_argument("--num_workers", type=int, default=0, help="Number of workers in dataloader.") -parser.add_argument("--checkpoint_dir", type=str, default='./checkpoint', help="Directory to save model checkpoints.") -parser.add_argument("--save_freq", type=int, default=10, help="Save checkpoint every n epoch.") -parser.add_argument("--log_freq", type=int, default=10, help="Log the training infomation every n steps.") +parser.add_argument("--cfg_path", type=str, required=True) args = parser.parse_args() # yapf: enable @@ -42,50 +35,60 @@ if __name__ == "__main__": paddle.distributed.init_parallel_env() local_rank = paddle.distributed.get_rank() - backbone = cnn14(pretrained=True, extract_embedding=True) - model = SoundClassifier(backbone, num_class=len(ESC50.label_list)) - model = paddle.DataParallel(model) - optimizer = paddle.optimizer.Adam( - learning_rate=args.learning_rate, parameters=model.parameters()) - criterion = paddle.nn.loss.CrossEntropyLoss() + args.cfg_path = os.path.abspath(os.path.expanduser(args.cfg_path)) + with open(args.cfg_path, 'r') as f: + config = yaml.safe_load(f) - if args.feat_backend == 'numpy': - train_ds = ESC50(mode='train', feat_type='melspectrogram') - dev_ds = ESC50(mode='dev', feat_type='melspectrogram') - else: - train_ds = ESC50(mode='train') - dev_ds = ESC50(mode='dev') - feature_extractor = LogMelSpectrogram(sr=16000) + model_conf = config['model'] + data_conf = config['data'] + feat_conf = config['feature'] + training_conf = config['training'] + # Dataset + ds_class = dynamic_import(data_conf['dataset']) + train_ds = ds_class(**data_conf['train']) + dev_ds = ds_class(**data_conf['dev']) train_sampler = paddle.io.DistributedBatchSampler( - train_ds, batch_size=args.batch_size, shuffle=True, drop_last=False) + train_ds, + batch_size=training_conf['batch_size'], + shuffle=True, + drop_last=False) train_loader = paddle.io.DataLoader( train_ds, batch_sampler=train_sampler, - num_workers=args.num_workers, + num_workers=training_conf['num_workers'], return_list=True, use_buffer_reader=True, ) + # Feature + feature_extractor = LogMelSpectrogram(**feat_conf) + + # Model + backbone_class = dynamic_import(model_conf['backbone']) + backbone = backbone_class(pretrained=True, extract_embedding=True) + model = SoundClassifier(backbone, num_class=data_conf['num_classes']) + model = paddle.DataParallel(model) + optimizer = paddle.optimizer.Adam( + learning_rate=training_conf['learning_rate'], + parameters=model.parameters()) + criterion = paddle.nn.loss.CrossEntropyLoss() + steps_per_epoch = len(train_sampler) - timer = Timer(steps_per_epoch * args.epochs) + timer = Timer(steps_per_epoch * training_conf['epochs']) 
timer.start() - for epoch in range(1, args.epochs + 1): + for epoch in range(1, training_conf['epochs'] + 1): model.train() avg_loss = 0 num_corrects = 0 num_samples = 0 for batch_idx, batch in enumerate(train_loader): - if args.feat_backend == 'numpy': - feats, labels = batch - else: - waveforms, labels = batch - feats = feature_extractor( - waveforms - ) # Need a padding when lengths of waveforms differ in a batch. - feats = paddle.transpose(feats, - [0, 2, 1]) # To [N, length, n_mels] + waveforms, labels = batch + feats = feature_extractor( + waveforms + ) # Need a padding when lengths of waveforms differ in a batch. + feats = paddle.transpose(feats, [0, 2, 1]) # To [N, length, n_mels] logits = model(feats) @@ -107,13 +110,15 @@ if __name__ == "__main__": timer.count() - if (batch_idx + 1) % args.log_freq == 0 and local_rank == 0: + if (batch_idx + 1 + ) % training_conf['log_freq'] == 0 and local_rank == 0: lr = optimizer.get_lr() - avg_loss /= args.log_freq + avg_loss /= training_conf['log_freq'] avg_acc = num_corrects / num_samples print_msg = 'Epoch={}/{}, Step={}/{}'.format( - epoch, args.epochs, batch_idx + 1, steps_per_epoch) + epoch, training_conf['epochs'], batch_idx + 1, + steps_per_epoch) print_msg += ' loss={:.4f}'.format(avg_loss) print_msg += ' acc={:.4f}'.format(avg_acc) print_msg += ' lr={:.6f} step/sec={:.2f} | ETA {}'.format( @@ -124,16 +129,17 @@ if __name__ == "__main__": num_corrects = 0 num_samples = 0 - if epoch % args.save_freq == 0 and batch_idx + 1 == steps_per_epoch and local_rank == 0: + if epoch % training_conf[ + 'save_freq'] == 0 and batch_idx + 1 == steps_per_epoch and local_rank == 0: dev_sampler = paddle.io.BatchSampler( dev_ds, - batch_size=args.batch_size, + batch_size=training_conf['batch_size'], shuffle=False, drop_last=False) dev_loader = paddle.io.DataLoader( dev_ds, batch_sampler=dev_sampler, - num_workers=args.num_workers, + num_workers=training_conf['num_workers'], return_list=True, ) model.eval() @@ -141,12 +147,9 @@ if __name__ == "__main__": num_samples = 0 with logger.processing('Evaluation on validation dataset'): for batch_idx, batch in enumerate(dev_loader): - if args.feat_backend == 'numpy': - feats, labels = batch - else: - waveforms, labels = batch - feats = feature_extractor(waveforms) - feats = paddle.transpose(feats, [0, 2, 1]) + waveforms, labels = batch + feats = feature_extractor(waveforms) + feats = paddle.transpose(feats, [0, 2, 1]) logits = model(feats) @@ -160,7 +163,7 @@ if __name__ == "__main__": logger.eval(print_msg) # Save model - save_dir = os.path.join(args.checkpoint_dir, + save_dir = os.path.join(training_conf['checkpoint_dir'], 'epoch_{}'.format(epoch)) logger.info('Saving model checkpoint to {}'.format(save_dir)) paddle.save(model.state_dict(), diff --git a/paddlespeech/s2t/decoders/ctcdecoder/scorer_deprecated.py b/paddlespeech/s2t/decoders/ctcdecoder/scorer_deprecated.py index d81fb2e3c7143edfa48720b5e5a5f1731dc90cda..362098fe65ec34106926e1804dfbb5abb273d97d 100644 --- a/paddlespeech/s2t/decoders/ctcdecoder/scorer_deprecated.py +++ b/paddlespeech/s2t/decoders/ctcdecoder/scorer_deprecated.py @@ -62,7 +62,7 @@ class Scorer(object): """Evaluation function, gathering all the different scores and return the final one. - :param sentence: The input sentence for evalutation + :param sentence: The input sentence for evaluation :type sentence: str :param log: Whether return the score in log representation. 
:type log: bool diff --git a/paddlespeech/s2t/decoders/recog.py b/paddlespeech/s2t/decoders/recog.py index 3e9939f02bc678217e1f1c61d06aa674ab403cf6..88955eacb16f0e8a042143d06193484f1440f5ca 100644 --- a/paddlespeech/s2t/decoders/recog.py +++ b/paddlespeech/s2t/decoders/recog.py @@ -85,7 +85,7 @@ def recog_v2(args): mode="asr", load_output=False, sort_in_input_length=False, - preprocess_conf=confs.collator.augmentation_config + preprocess_conf=confs.preprocess_config if args.preprocess_conf is None else args.preprocess_conf, preprocess_args={"train": False}, ) diff --git a/paddlespeech/s2t/decoders/scorers/ctc.py b/paddlespeech/s2t/decoders/scorers/ctc.py index ace80bd3eb8d3cf1639be03c244b43aacbba2ec4..81d8b078392eb0282d59cfbefbb72a2583647aae 100644 --- a/paddlespeech/s2t/decoders/scorers/ctc.py +++ b/paddlespeech/s2t/decoders/scorers/ctc.py @@ -154,7 +154,7 @@ class CTCPrefixScorer(BatchPartialScorerInterface): Args: state: The states of hyps - Returns: exteded state + Returns: extended state """ new_state = [] diff --git a/paddlespeech/s2t/decoders/scorers/ctc_prefix_score.py b/paddlespeech/s2t/decoders/scorers/ctc_prefix_score.py index 13429d491399e5b0f12268dfe6cde805691ba7cf..78b8fe36c8c0383d642740cab252ba7c89ba2ec0 100644 --- a/paddlespeech/s2t/decoders/scorers/ctc_prefix_score.py +++ b/paddlespeech/s2t/decoders/scorers/ctc_prefix_score.py @@ -11,7 +11,7 @@ class CTCPrefixScorePD(): which is based on Algorithm 2 in WATANABE et al. "HYBRID CTC/ATTENTION ARCHITECTURE FOR END-TO-END SPEECH RECOGNITION," - but extended to efficiently compute the label probablities for multiple + but extended to efficiently compute the label probabilities for multiple hypotheses simultaneously See also Seki et al. "Vectorized Beam Search for CTC-Attention-Based Speech Recognition," In INTERSPEECH (pp. 3825-3829), 2019. @@ -272,7 +272,7 @@ class CTCPrefixScore(): which is based on Algorithm 2 in WATANABE et al. 
"HYBRID CTC/ATTENTION ARCHITECTURE FOR END-TO-END SPEECH RECOGNITION," - but extended to efficiently compute the probablities of multiple labels + but extended to efficiently compute the probabilities of multiple labels simultaneously """ diff --git a/paddlespeech/s2t/exps/deepspeech2/bin/deploy/runtime.py b/paddlespeech/s2t/exps/deepspeech2/bin/deploy/runtime.py index 7ccb3a6c23f0bfad874bc7720fafc4514c1b971a..5755a5f101c18927aff975262d627f81a74fb783 100644 --- a/paddlespeech/s2t/exps/deepspeech2/bin/deploy/runtime.py +++ b/paddlespeech/s2t/exps/deepspeech2/bin/deploy/runtime.py @@ -19,8 +19,8 @@ import paddle from paddle.inference import Config from paddle.inference import create_predictor from paddle.io import DataLoader +from yacs.config import CfgNode -from paddlespeech.s2t.exps.deepspeech2.config import get_cfg_defaults from paddlespeech.s2t.io.collator import SpeechCollator from paddlespeech.s2t.io.dataset import ManifestDataset from paddlespeech.s2t.models.ds2 import DeepSpeech2Model @@ -80,13 +80,13 @@ def inference(config, args): def start_server(config, args): """Start the ASR server""" config.defrost() - config.data.manifest = config.data.test_manifest + config.manifest = config.test_manifest dataset = ManifestDataset.from_config(config) - config.collator.augmentation_config = "" - config.collator.keep_transcription_text = True - config.collator.batch_size = 1 - config.collator.num_workers = 0 + config.augmentation_config = "" + config.keep_transcription_text = True + config.batch_size = 1 + config.num_workers = 0 collate_fn = SpeechCollator.from_config(config) test_loader = DataLoader(dataset, collate_fn=collate_fn, num_workers=0) @@ -105,14 +105,14 @@ def start_server(config, args): paddle.to_tensor(audio), paddle.to_tensor(audio_len), vocab_list=test_loader.collate_fn.vocab_list, - decoding_method=config.decoding.decoding_method, - lang_model_path=config.decoding.lang_model_path, - beam_alpha=config.decoding.alpha, - beam_beta=config.decoding.beta, - beam_size=config.decoding.beam_size, - cutoff_prob=config.decoding.cutoff_prob, - cutoff_top_n=config.decoding.cutoff_top_n, - num_processes=config.decoding.num_proc_bsearch) + decoding_method=config.decode.decoding_method, + lang_model_path=config.decode.lang_model_path, + beam_alpha=config.decode.alpha, + beam_beta=config.decode.beta, + beam_size=config.decode.beam_size, + cutoff_prob=config.decode.cutoff_prob, + cutoff_top_n=config.decode.cutoff_top_n, + num_processes=config.decode.num_proc_bsearch) return result_transcript[0] # warming up with utterrances sampled from Librispeech @@ -176,15 +176,19 @@ if __name__ == "__main__": print_arguments(args, globals()) # https://yaml.org/type/float.html - config = get_cfg_defaults() + config = CfgNode(new_allowed=True) if args.config: config.merge_from_file(args.config) + if args.decode_cfg: + decode_confs = CfgNode(new_allowed=True) + decode_confs.merge_from_file(args.decode_cfg) + config.decode = decode_confs if args.opts: config.merge_from_list(args.opts) config.freeze() print(config) - args.warmup_manifest = config.data.test_manifest + args.warmup_manifest = config.test_manifest print_arguments(args, globals()) if args.dump_config: diff --git a/paddlespeech/s2t/exps/deepspeech2/bin/deploy/server.py b/paddlespeech/s2t/exps/deepspeech2/bin/deploy/server.py index 5c6eee3f6cf647bda222fc92442dec9d40ae0b0d..0d0b4f2197c05383285bbf590f53f334d134c969 100644 --- a/paddlespeech/s2t/exps/deepspeech2/bin/deploy/server.py +++ b/paddlespeech/s2t/exps/deepspeech2/bin/deploy/server.py @@ -17,8 
+17,8 @@ import functools import numpy as np import paddle from paddle.io import DataLoader +from yacs.config import CfgNode -from paddlespeech.s2t.exps.deepspeech2.config import get_cfg_defaults from paddlespeech.s2t.io.collator import SpeechCollator from paddlespeech.s2t.io.dataset import ManifestDataset from paddlespeech.s2t.models.ds2 import DeepSpeech2Model @@ -33,13 +33,13 @@ from paddlespeech.s2t.utils.utility import print_arguments def start_server(config, args): """Start the ASR server""" config.defrost() - config.data.manifest = config.data.test_manifest + config.manifest = config.test_manifest dataset = ManifestDataset.from_config(config) - config.collator.augmentation_config = "" - config.collator.keep_transcription_text = True - config.collator.batch_size = 1 - config.collator.num_workers = 0 + config.augmentation_config = "" + config.keep_transcription_text = True + config.batch_size = 1 + config.num_workers = 0 collate_fn = SpeechCollator.from_config(config) test_loader = DataLoader(dataset, collate_fn=collate_fn, num_workers=0) @@ -62,14 +62,14 @@ def start_server(config, args): paddle.to_tensor(audio), paddle.to_tensor(audio_len), vocab_list=test_loader.collate_fn.vocab_list, - decoding_method=config.decoding.decoding_method, - lang_model_path=config.decoding.lang_model_path, - beam_alpha=config.decoding.alpha, - beam_beta=config.decoding.beta, - beam_size=config.decoding.beam_size, - cutoff_prob=config.decoding.cutoff_prob, - cutoff_top_n=config.decoding.cutoff_top_n, - num_processes=config.decoding.num_proc_bsearch) + decoding_method=config.decode.decoding_method, + lang_model_path=config.decode.lang_model_path, + beam_alpha=config.decode.alpha, + beam_beta=config.decode.beta, + beam_size=config.decode.beam_size, + cutoff_prob=config.decode.cutoff_prob, + cutoff_top_n=config.decode.cutoff_top_n, + num_processes=config.decode.num_proc_bsearch) return result_transcript[0] # warming up with utterrances sampled from Librispeech @@ -111,15 +111,19 @@ if __name__ == "__main__": print_arguments(args, globals()) # https://yaml.org/type/float.html - config = get_cfg_defaults() + config = CfgNode(new_allowed=True) if args.config: config.merge_from_file(args.config) + if args.decode_cfg: + decode_confs = CfgNode(new_allowed=True) + decode_confs.merge_from_file(args.decode_cfg) + config.decode = decode_confs if args.opts: config.merge_from_list(args.opts) config.freeze() print(config) - args.warmup_manifest = config.data.test_manifest + args.warmup_manifest = config.test_manifest print_arguments(args, globals()) if args.dump_config: diff --git a/paddlespeech/s2t/exps/deepspeech2/bin/export.py b/paddlespeech/s2t/exps/deepspeech2/bin/export.py index 66042e8431c0079c8ca630c6eeb2ea83435a0210..ee013d79e6ed3d39516ee65d5c4df5ec30a24b42 100644 --- a/paddlespeech/s2t/exps/deepspeech2/bin/export.py +++ b/paddlespeech/s2t/exps/deepspeech2/bin/export.py @@ -12,7 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Export for DeepSpeech2 model.""" -from paddlespeech.s2t.exps.deepspeech2.config import get_cfg_defaults +from yacs.config import CfgNode + from paddlespeech.s2t.exps.deepspeech2.model import DeepSpeech2Tester as Tester from paddlespeech.s2t.training.cli import default_argument_parser from paddlespeech.s2t.utils.utility import print_arguments @@ -41,7 +42,7 @@ if __name__ == "__main__": print_arguments(args) # https://yaml.org/type/float.html - config = get_cfg_defaults(args.model_type) + config = CfgNode(new_allowed=True) if args.config: config.merge_from_file(args.config) if args.opts: diff --git a/paddlespeech/s2t/exps/deepspeech2/bin/test.py b/paddlespeech/s2t/exps/deepspeech2/bin/test.py index f52615faefedbef1b40abf1b94ab840f23907fe3..388b380d1c78aeb45970486091285f4c1248eb55 100644 --- a/paddlespeech/s2t/exps/deepspeech2/bin/test.py +++ b/paddlespeech/s2t/exps/deepspeech2/bin/test.py @@ -12,7 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. """Evaluation for DeepSpeech2 model.""" -from paddlespeech.s2t.exps.deepspeech2.config import get_cfg_defaults +from yacs.config import CfgNode + from paddlespeech.s2t.exps.deepspeech2.model import DeepSpeech2Tester as Tester from paddlespeech.s2t.training.cli import default_argument_parser from paddlespeech.s2t.utils.utility import print_arguments @@ -41,9 +42,13 @@ if __name__ == "__main__": print("model_type:{}".format(args.model_type)) # https://yaml.org/type/float.html - config = get_cfg_defaults(args.model_type) + config = CfgNode(new_allowed=True) if args.config: config.merge_from_file(args.config) + if args.decode_cfg: + decode_confs = CfgNode(new_allowed=True) + decode_confs.merge_from_file(args.decode_cfg) + config.decode = decode_confs if args.opts: config.merge_from_list(args.opts) config.freeze() diff --git a/paddlespeech/s2t/exps/deepspeech2/bin/test_export.py b/paddlespeech/s2t/exps/deepspeech2/bin/test_export.py index e073ebbf9823d75d87e950335bffd570ce7e8fb4..707eb9e1bc26204fe5b6a9070e02f7ad95d5f334 100644 --- a/paddlespeech/s2t/exps/deepspeech2/bin/test_export.py +++ b/paddlespeech/s2t/exps/deepspeech2/bin/test_export.py @@ -12,7 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Evaluation for DeepSpeech2 model.""" -from paddlespeech.s2t.exps.deepspeech2.config import get_cfg_defaults +from yacs.config import CfgNode + from paddlespeech.s2t.exps.deepspeech2.model import DeepSpeech2ExportTester as ExportTester from paddlespeech.s2t.training.cli import default_argument_parser from paddlespeech.s2t.utils.utility import print_arguments @@ -46,9 +47,13 @@ if __name__ == "__main__": print("model_type:{}".format(args.model_type)) # https://yaml.org/type/float.html - config = get_cfg_defaults(args.model_type) + config = CfgNode(new_allowed=True) if args.config: config.merge_from_file(args.config) + if args.decode_cfg: + decode_confs = CfgNode(new_allowed=True) + decode_confs.merge_from_file(args.decode_cfg) + config.decode = decode_confs if args.opts: config.merge_from_list(args.opts) config.freeze() diff --git a/paddlespeech/s2t/exps/deepspeech2/bin/test_wav.py b/paddlespeech/s2t/exps/deepspeech2/bin/test_wav.py index cf2ca0d642c11db133fe348b49e2c633da3a55c8..a909dd416a03766dba505129a20edaeb9bee0cd8 100644 --- a/paddlespeech/s2t/exps/deepspeech2/bin/test_wav.py +++ b/paddlespeech/s2t/exps/deepspeech2/bin/test_wav.py @@ -18,8 +18,8 @@ from pathlib import Path import paddle import soundfile +from yacs.config import CfgNode -from paddlespeech.s2t.exps.deepspeech2.config import get_cfg_defaults from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer from paddlespeech.s2t.io.collator import SpeechCollator from paddlespeech.s2t.models.ds2 import DeepSpeech2Model @@ -41,7 +41,7 @@ class DeepSpeech2Tester_hub(): self.audio_file = args.audio_file self.collate_fn_test = SpeechCollator.from_config(config) self._text_featurizer = TextFeaturizer( - unit_type=config.collator.unit_type, vocab=None) + unit_type=config.unit_type, vocab=None) def compute_result_transcripts(self, audio, audio_len, vocab_list, cfg): result_transcripts = self.model.decode( @@ -74,7 +74,7 @@ class DeepSpeech2Tester_hub(): audio = paddle.unsqueeze(audio, axis=0) vocab_list = collate_fn_test.vocab_list result_transcripts = self.compute_result_transcripts( - audio, audio_len, vocab_list, cfg.decoding) + audio, audio_len, vocab_list, cfg.decode) logger.info("result_transcripts: " + result_transcripts[0]) def run_test(self): @@ -110,13 +110,13 @@ class DeepSpeech2Tester_hub(): def setup_model(self): config = self.config.clone() with UpdateConfig(config): - config.model.input_dim = self.collate_fn_test.feature_size - config.model.output_dim = self.collate_fn_test.vocab_size + config.input_dim = self.collate_fn_test.feature_size + config.output_dim = self.collate_fn_test.vocab_size if self.args.model_type == 'offline': - model = DeepSpeech2Model.from_config(config.model) + model = DeepSpeech2Model.from_config(config) elif self.args.model_type == 'online': - model = DeepSpeech2ModelOnline.from_config(config.model) + model = DeepSpeech2ModelOnline.from_config(config) else: raise Exception("wrong model type") @@ -134,8 +134,8 @@ class DeepSpeech2Tester_hub(): self.checkpoint_dir = checkpoint_dir self.checkpoint = Checkpoint( - kbest_n=self.config.training.checkpoint.kbest_n, - latest_n=self.config.training.checkpoint.latest_n) + kbest_n=self.config.checkpoint.kbest_n, + latest_n=self.config.checkpoint.latest_n) def resume(self): """Resume from the checkpoint at checkpoints in the output @@ -187,9 +187,13 @@ if __name__ == "__main__": print("model_type:{}".format(args.model_type)) # https://yaml.org/type/float.html - config = get_cfg_defaults(args.model_type) + config = 
CfgNode(new_allowed=True) if args.config: config.merge_from_file(args.config) + if args.decode_cfg: + decode_confs = CfgNode(new_allowed=True) + decode_confs.merge_from_file(args.decode_cfg) + config.decode = decode_confs if args.opts: config.merge_from_list(args.opts) config.freeze() diff --git a/paddlespeech/s2t/exps/deepspeech2/bin/train.py b/paddlespeech/s2t/exps/deepspeech2/bin/train.py index 400538f9b2cd55ca1f2f1ed006e89f1437332c0e..09e8662f1ce75ff351d7981dc1c4382082f9b61a 100644 --- a/paddlespeech/s2t/exps/deepspeech2/bin/train.py +++ b/paddlespeech/s2t/exps/deepspeech2/bin/train.py @@ -13,8 +13,8 @@ # limitations under the License. """Trainer for DeepSpeech2 model.""" from paddle import distributed as dist +from yacs.config import CfgNode -from paddlespeech.s2t.exps.deepspeech2.config import get_cfg_defaults from paddlespeech.s2t.exps.deepspeech2.model import DeepSpeech2Trainer as Trainer from paddlespeech.s2t.training.cli import default_argument_parser from paddlespeech.s2t.utils.utility import print_arguments @@ -42,7 +42,7 @@ if __name__ == "__main__": print_arguments(args, globals()) # https://yaml.org/type/float.html - config = get_cfg_defaults(args.model_type) + config = CfgNode(new_allowed=True) if args.config: config.merge_from_file(args.config) if args.opts: diff --git a/paddlespeech/s2t/exps/deepspeech2/config.py b/paddlespeech/s2t/exps/deepspeech2/config.py deleted file mode 100644 index 58dc05ff6cbb5dbb1bcc0c89c22dad85b0dfe654..0000000000000000000000000000000000000000 --- a/paddlespeech/s2t/exps/deepspeech2/config.py +++ /dev/null @@ -1,39 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-from yacs.config import CfgNode - -from paddlespeech.s2t.exps.deepspeech2.model import DeepSpeech2Tester -from paddlespeech.s2t.exps.deepspeech2.model import DeepSpeech2Trainer -from paddlespeech.s2t.io.collator import SpeechCollator -from paddlespeech.s2t.io.dataset import ManifestDataset -from paddlespeech.s2t.models.ds2 import DeepSpeech2Model -from paddlespeech.s2t.models.ds2_online import DeepSpeech2ModelOnline - - -def get_cfg_defaults(model_type='offline'): - _C = CfgNode() - _C.data = ManifestDataset.params() - _C.collator = SpeechCollator.params() - _C.training = DeepSpeech2Trainer.params() - _C.decoding = DeepSpeech2Tester.params() - if model_type == 'offline': - _C.model = DeepSpeech2Model.params() - else: - _C.model = DeepSpeech2ModelOnline.params() - """Get a yacs CfgNode object with default values for my_project.""" - # Return a clone so that the defaults will not be altered - # This is for the "local variable" use pattern - config = _C.clone() - config.set_new_allowed(True) - return config diff --git a/paddlespeech/s2t/exps/deepspeech2/model.py b/paddlespeech/s2t/exps/deepspeech2/model.py index a0b69d64f58a66373435e8c8c28456cc6b86cb4a..049311c78147dd42540f7b915180111149892dcf 100644 --- a/paddlespeech/s2t/exps/deepspeech2/model.py +++ b/paddlespeech/s2t/exps/deepspeech2/model.py @@ -16,7 +16,6 @@ import os import time from collections import defaultdict from contextlib import nullcontext -from typing import Optional import jsonlines import numpy as np @@ -24,7 +23,6 @@ import paddle from paddle import distributed as dist from paddle import inference from paddle.io import DataLoader -from yacs.config import CfgNode from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer from paddlespeech.s2t.io.collator import SpeechCollator @@ -49,28 +47,12 @@ logger = Log(__name__).getlog() class DeepSpeech2Trainer(Trainer): - @classmethod - def params(cls, config: Optional[CfgNode]=None) -> CfgNode: - # training config - default = CfgNode( - dict( - lr=5e-4, # learning rate - lr_decay=1.0, # learning rate decay - weight_decay=1e-6, # the coeff of weight decay - global_grad_clip=5.0, # the global norm clip - n_epoch=50, # train epochs - )) - - if config is not None: - config.merge_from_other_cfg(default) - return default - def __init__(self, config, args): super().__init__(config, args) def train_batch(self, batch_index, batch_data, msg): - batch_size = self.config.collator.batch_size - accum_grad = self.config.training.accum_grad + batch_size = self.config.batch_size + accum_grad = self.config.accum_grad start = time.time() @@ -133,7 +115,7 @@ class DeepSpeech2Trainer(Trainer): total_loss += float(loss) * num_utts valid_losses['val_loss'].append(float(loss)) - if (i + 1) % self.config.training.log_interval == 0: + if (i + 1) % self.config.log_interval == 0: valid_dump = {k: np.mean(v) for k, v in valid_losses.items()} valid_dump['val_history_loss'] = total_loss / num_seen_utts @@ -154,16 +136,16 @@ class DeepSpeech2Trainer(Trainer): config = self.config.clone() with UpdateConfig(config): if self.train: - config.model.input_dim = self.train_loader.collate_fn.feature_size - config.model.output_dim = self.train_loader.collate_fn.vocab_size + config.input_dim = self.train_loader.collate_fn.feature_size + config.output_dim = self.train_loader.collate_fn.vocab_size else: - config.model.input_dim = self.test_loader.collate_fn.feature_size - config.model.output_dim = self.test_loader.collate_fn.vocab_size + config.input_dim = self.test_loader.collate_fn.feature_size + 
config.output_dim = self.test_loader.collate_fn.vocab_size if self.args.model_type == 'offline': - model = DeepSpeech2Model.from_config(config.model) + model = DeepSpeech2Model.from_config(config) elif self.args.model_type == 'online': - model = DeepSpeech2ModelOnline.from_config(config.model) + model = DeepSpeech2ModelOnline.from_config(config) else: raise Exception("wrong model type") if self.parallel: @@ -177,17 +159,13 @@ class DeepSpeech2Trainer(Trainer): if not self.train: return - grad_clip = ClipGradByGlobalNormWithLog( - config.training.global_grad_clip) + grad_clip = ClipGradByGlobalNormWithLog(config.global_grad_clip) lr_scheduler = paddle.optimizer.lr.ExponentialDecay( - learning_rate=config.training.lr, - gamma=config.training.lr_decay, - verbose=True) + learning_rate=config.lr, gamma=config.lr_decay, verbose=True) optimizer = paddle.optimizer.Adam( learning_rate=lr_scheduler, parameters=model.parameters(), - weight_decay=paddle.regularizer.L2Decay( - config.training.weight_decay), + weight_decay=paddle.regularizer.L2Decay(config.weight_decay), grad_clip=grad_clip) self.optimizer = optimizer self.lr_scheduler = lr_scheduler @@ -198,95 +176,75 @@ class DeepSpeech2Trainer(Trainer): config.defrost() if self.train: # train - config.data.manifest = config.data.train_manifest + config.manifest = config.train_manifest train_dataset = ManifestDataset.from_config(config) if self.parallel: batch_sampler = SortagradDistributedBatchSampler( train_dataset, - batch_size=config.collator.batch_size, + batch_size=config.batch_size, num_replicas=None, rank=None, shuffle=True, drop_last=True, - sortagrad=config.collator.sortagrad, - shuffle_method=config.collator.shuffle_method) + sortagrad=config.sortagrad, + shuffle_method=config.shuffle_method) else: batch_sampler = SortagradBatchSampler( train_dataset, shuffle=True, - batch_size=config.collator.batch_size, + batch_size=config.batch_size, drop_last=True, - sortagrad=config.collator.sortagrad, - shuffle_method=config.collator.shuffle_method) + sortagrad=config.sortagrad, + shuffle_method=config.shuffle_method) - config.collator.keep_transcription_text = False + config.keep_transcription_text = False collate_fn_train = SpeechCollator.from_config(config) self.train_loader = DataLoader( train_dataset, batch_sampler=batch_sampler, collate_fn=collate_fn_train, - num_workers=config.collator.num_workers) + num_workers=config.num_workers) # dev - config.data.manifest = config.data.dev_manifest + config.manifest = config.dev_manifest dev_dataset = ManifestDataset.from_config(config) - config.collator.augmentation_config = "" - config.collator.keep_transcription_text = False + config.augmentation_config = "" + config.keep_transcription_text = False collate_fn_dev = SpeechCollator.from_config(config) self.valid_loader = DataLoader( dev_dataset, - batch_size=int(config.collator.batch_size), + batch_size=int(config.batch_size), shuffle=False, drop_last=False, collate_fn=collate_fn_dev, - num_workers=config.collator.num_workers) + num_workers=config.num_workers) logger.info("Setup train/valid Dataloader!") else: # test - config.data.manifest = config.data.test_manifest + config.manifest = config.test_manifest test_dataset = ManifestDataset.from_config(config) - config.collator.augmentation_config = "" - config.collator.keep_transcription_text = True + config.augmentation_config = "" + config.keep_transcription_text = True collate_fn_test = SpeechCollator.from_config(config) - + decode_batch_size = config.get('decode', dict()).get( + 'decode_batch_size', 1) 
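The decode_batch_size lookup added above is deliberately defensive: only test-time entrypoints merge a decode config, so the decode section may be absent. Since yacs CfgNode subclasses dict, the chained .get() calls degrade to a default of 1; a quick illustration (values are made up):

    from yacs.config import CfgNode

    config = CfgNode(new_allowed=True)
    # No decode section merged (the train-time case): fall back to 1.
    assert config.get('decode', dict()).get('decode_batch_size', 1) == 1

    config.decode = CfgNode(dict(decode_batch_size=8))
    # Decode config merged (the test-time case): the configured value wins.
    assert config.get('decode', dict()).get('decode_batch_size', 1) == 8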
self.test_loader = DataLoader( test_dataset, - batch_size=config.decoding.batch_size, + batch_size=decode_batch_size, shuffle=False, drop_last=False, collate_fn=collate_fn_test, - num_workers=config.collator.num_workers) + num_workers=config.num_workers) logger.info("Setup test Dataloader!") class DeepSpeech2Tester(DeepSpeech2Trainer): - @classmethod - def params(cls, config: Optional[CfgNode]=None) -> CfgNode: - # testing config - default = CfgNode( - dict( - alpha=2.5, # Coef of LM for beam search. - beta=0.3, # Coef of WC for beam search. - cutoff_prob=1.0, # Cutoff probability for pruning. - cutoff_top_n=40, # Cutoff number for pruning. - lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm', # Filepath for language model. - decoding_method='ctc_beam_search', # Decoding method. Options: ctc_beam_search, ctc_greedy - error_rate_type='wer', # Error rate type for evaluation. Options `wer`, 'cer' - num_proc_bsearch=8, # # of CPUs for beam search. - beam_size=500, # Beam search width. - batch_size=128, # decoding batch size - )) - - if config is not None: - config.merge_from_other_cfg(default) - return default - def __init__(self, config, args): super().__init__(config, args) self._text_featurizer = TextFeaturizer( - unit_type=config.collator.unit_type, vocab=None) + unit_type=config.unit_type, vocab=None) def ordid2token(self, texts, texts_len): """ ord() id to chr() chr """ @@ -304,17 +262,17 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): texts, texts_len, fout=None): - cfg = self.config.decoding + decode_cfg = self.config.decode errors_sum, len_refs, num_ins = 0.0, 0, 0 - errors_func = error_rate.char_errors if cfg.error_rate_type == 'cer' else error_rate.word_errors - error_rate_func = error_rate.cer if cfg.error_rate_type == 'cer' else error_rate.wer + errors_func = error_rate.char_errors if decode_cfg.error_rate_type == 'cer' else error_rate.word_errors + error_rate_func = error_rate.cer if decode_cfg.error_rate_type == 'cer' else error_rate.wer vocab_list = self.test_loader.collate_fn.vocab_list target_transcripts = self.ordid2token(texts, texts_len) - result_transcripts = self.compute_result_transcripts(audio, audio_len, - vocab_list, cfg) + result_transcripts = self.compute_result_transcripts( + audio, audio_len, vocab_list, decode_cfg) for utt, target, result in zip(utts, target_transcripts, result_transcripts): @@ -327,29 +285,31 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): logger.info(f"Utt: {utt}") logger.info(f"Ref: {target}") logger.info(f"Hyp: {result}") - logger.info("Current error rate [%s] = %f" % - (cfg.error_rate_type, error_rate_func(target, result))) + logger.info( + "Current error rate [%s] = %f" % + (decode_cfg.error_rate_type, error_rate_func(target, result))) return dict( errors_sum=errors_sum, len_refs=len_refs, num_ins=num_ins, error_rate=errors_sum / len_refs, - error_rate_type=cfg.error_rate_type) + error_rate_type=decode_cfg.error_rate_type) - def compute_result_transcripts(self, audio, audio_len, vocab_list, cfg): + def compute_result_transcripts(self, audio, audio_len, vocab_list, + decode_cfg): result_transcripts = self.model.decode( audio, audio_len, vocab_list, - decoding_method=cfg.decoding_method, - lang_model_path=cfg.lang_model_path, - beam_alpha=cfg.alpha, - beam_beta=cfg.beta, - beam_size=cfg.beam_size, - cutoff_prob=cfg.cutoff_prob, - cutoff_top_n=cfg.cutoff_top_n, - num_processes=cfg.num_proc_bsearch) + decoding_method=decode_cfg.decoding_method, + lang_model_path=decode_cfg.lang_model_path, + beam_alpha=decode_cfg.alpha, + 
beam_beta=decode_cfg.beta, + beam_size=decode_cfg.beam_size, + cutoff_prob=decode_cfg.cutoff_prob, + cutoff_top_n=decode_cfg.cutoff_top_n, + num_processes=decode_cfg.num_proc_bsearch) return result_transcripts @@ -358,7 +318,6 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): def test(self): logger.info(f"Test Total Examples: {len(self.test_loader.dataset)}") self.model.eval() - cfg = self.config error_rate_type = None errors_sum, len_refs, num_ins = 0.0, 0, 0 with jsonlines.open(self.args.result_file, 'w') as fout: @@ -412,11 +371,10 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester): if self.args.enable_auto_log is True: from paddlespeech.s2t.utils.log import Autolog self.autolog = Autolog( - batch_size=self.config.decoding.batch_size, + batch_size=self.config.decode.decode_batch_size, model_name="deepspeech2", model_precision="fp32").getlog() self.model.eval() - cfg = self.config error_rate_type = None errors_sum, len_refs, num_ins = 0.0, 0, 0 with jsonlines.open(self.args.result_file, 'w') as fout: @@ -441,7 +399,8 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester): if self.args.enable_auto_log is True: self.autolog.report() - def compute_result_transcripts(self, audio, audio_len, vocab_list, cfg): + def compute_result_transcripts(self, audio, audio_len, vocab_list, + decode_cfg): if self.args.model_type == "online": output_probs, output_lens = self.static_forward_online(audio, audio_len) @@ -454,13 +413,15 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester): self.predictor.clear_intermediate_tensor() self.predictor.try_shrink_memory() - self.model.decoder.init_decode(cfg.alpha, cfg.beta, cfg.lang_model_path, - vocab_list, cfg.decoding_method) + self.model.decoder.init_decode(decode_cfg.alpha, decode_cfg.beta, + decode_cfg.lang_model_path, vocab_list, + decode_cfg.decoding_method) result_transcripts = self.model.decoder.decode_probs( - output_probs, output_lens, vocab_list, cfg.decoding_method, - cfg.lang_model_path, cfg.alpha, cfg.beta, cfg.beam_size, - cfg.cutoff_prob, cfg.cutoff_top_n, cfg.num_proc_bsearch) + output_probs, output_lens, vocab_list, decode_cfg.decoding_method, + decode_cfg.lang_model_path, decode_cfg.alpha, decode_cfg.beta, + decode_cfg.beam_size, decode_cfg.cutoff_prob, + decode_cfg.cutoff_top_n, decode_cfg.num_proc_bsearch) #replace the <space> with ' ' result_transcripts = [ self._text_featurizer.detokenize(sentence) @@ -531,12 +492,10 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester): num_chunk = int(num_chunk) chunk_state_h_box = np.zeros( - (self.config.model.num_rnn_layers, 1, - self.config.model.rnn_layer_size), + (self.config.num_rnn_layers, 1, self.config.rnn_layer_size), dtype=x.dtype) chunk_state_c_box = np.zeros( - (self.config.model.num_rnn_layers, 1, - self.config.model.rnn_layer_size), + (self.config.num_rnn_layers, 1, self.config.rnn_layer_size), dtype=x.dtype) input_names = self.predictor.get_input_names() diff --git a/paddlespeech/s2t/exps/u2/bin/alignment.py b/paddlespeech/s2t/exps/u2/bin/alignment.py index df95baeb9982ecacc9c249f7c9528b8193e24428..e3390feb1af359ede500e9fc8af582a4021060e3 100644 --- a/paddlespeech/s2t/exps/u2/bin/alignment.py +++ b/paddlespeech/s2t/exps/u2/bin/alignment.py @@ -12,7 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License.
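The compute_metrics hunks above keep the metric dispatch intact while renaming cfg to decode_cfg: 'cer' selects character-level scoring, anything else word-level. A hedged sketch of that dispatch, using the error_rate module these files already import (the sample strings and the stated return shapes are illustrative, matching how the accumulation above consumes them):

    from paddlespeech.s2t.utils import error_rate

    def pick_metrics(error_rate_type):
        # char_errors/word_errors are assumed to return (edit distance,
        # reference length); cer/wer return the normalized rate directly.
        if error_rate_type == 'cer':
            return error_rate.char_errors, error_rate.cer
        return error_rate.word_errors, error_rate.wer

    errors_func, error_rate_func = pick_metrics('wer')
    errors, ref_len = errors_func('the quick brown fox', 'the quick brown box')
    print(errors / ref_len)                                               # accumulated form
    print(error_rate_func('the quick brown fox', 'the quick brown box'))  # direct form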
"""Alignment for U2 model.""" -from paddlespeech.s2t.exps.u2.config import get_cfg_defaults +from yacs.config import CfgNode + from paddlespeech.s2t.exps.u2.model import U2Tester as Tester from paddlespeech.s2t.training.cli import default_argument_parser from paddlespeech.s2t.utils.utility import print_arguments @@ -31,16 +32,20 @@ def main(config, args): if __name__ == "__main__": parser = default_argument_parser() - # save asr result to + # save asr result to parser.add_argument( "--result_file", type=str, help="path of save the asr result") args = parser.parse_args() print_arguments(args, globals()) # https://yaml.org/type/float.html - config = get_cfg_defaults() + config = CfgNode(new_allowed=True) if args.config: config.merge_from_file(args.config) + if args.decode_cfg: + decode_confs = CfgNode(new_allowed=True) + decode_confs.merge_from_file(args.decode_cfg) + config.decode = decode_confs if args.opts: config.merge_from_list(args.opts) config.freeze() diff --git a/paddlespeech/s2t/exps/u2/bin/export.py b/paddlespeech/s2t/exps/u2/bin/export.py index 44fc7c3e56a1107e09154f2646576d20065b908a..592b12379be7648377ad3c450f2b608610121fbe 100644 --- a/paddlespeech/s2t/exps/u2/bin/export.py +++ b/paddlespeech/s2t/exps/u2/bin/export.py @@ -12,7 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. """Export for U2 model.""" -from paddlespeech.s2t.exps.u2.config import get_cfg_defaults +from yacs.config import CfgNode + from paddlespeech.s2t.exps.u2.model import U2Tester as Tester from paddlespeech.s2t.training.cli import default_argument_parser from paddlespeech.s2t.utils.utility import print_arguments @@ -31,14 +32,14 @@ def main(config, args): if __name__ == "__main__": parser = default_argument_parser() - # save jit model to + # save jit model to parser.add_argument( "--export_path", type=str, help="path of the jit model to save") args = parser.parse_args() print_arguments(args, globals()) # https://yaml.org/type/float.html - config = get_cfg_defaults() + config = CfgNode(new_allowed=True) if args.config: config.merge_from_file(args.config) if args.opts: diff --git a/paddlespeech/s2t/exps/u2/bin/test.py b/paddlespeech/s2t/exps/u2/bin/test.py index 48b0670d5fe6d6ceeff594bcb979afe97d9de57e..f14d804f188a1e6089f4c513974a3e324f3f34f4 100644 --- a/paddlespeech/s2t/exps/u2/bin/test.py +++ b/paddlespeech/s2t/exps/u2/bin/test.py @@ -14,12 +14,13 @@ """Evaluation for U2 model.""" import cProfile -from paddlespeech.s2t.exps.u2.config import get_cfg_defaults +from yacs.config import CfgNode + from paddlespeech.s2t.exps.u2.model import U2Tester as Tester from paddlespeech.s2t.training.cli import default_argument_parser from paddlespeech.s2t.utils.utility import print_arguments -# TODO(hui zhang): dynamic load +# TODO(hui zhang): dynamic load def main_sp(config, args): @@ -35,16 +36,20 @@ def main(config, args): if __name__ == "__main__": parser = default_argument_parser() - # save asr result to + # save asr result to parser.add_argument( "--result_file", type=str, help="path of save the asr result") args = parser.parse_args() print_arguments(args, globals()) # https://yaml.org/type/float.html - config = get_cfg_defaults() + config = CfgNode(new_allowed=True) if args.config: config.merge_from_file(args.config) + if args.decode_cfg: + decode_confs = CfgNode(new_allowed=True) + decode_confs.merge_from_file(args.decode_cfg) + config.decode = decode_confs if args.opts: config.merge_from_list(args.opts) config.freeze() diff --git 
a/paddlespeech/s2t/exps/u2/bin/test_wav.py b/paddlespeech/s2t/exps/u2/bin/test_wav.py index 556316ec0065f76e337a6ea00cf78313778ce04a..9904813a581cd4b3ea2dcbe873e5abec8e802491 100644 --- a/paddlespeech/s2t/exps/u2/bin/test_wav.py +++ b/paddlespeech/s2t/exps/u2/bin/test_wav.py @@ -18,8 +18,8 @@ from pathlib import Path import paddle import soundfile +from yacs.config import CfgNode -from paddlespeech.s2t.exps.u2.config import get_cfg_defaults from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer from paddlespeech.s2t.models.u2 import U2Model from paddlespeech.s2t.training.cli import default_argument_parser @@ -36,23 +36,22 @@ class U2Infer(): self.args = args self.config = config self.audio_file = args.audio_file - self.sr = config.collator.target_sample_rate - self.preprocess_conf = config.collator.augmentation_config + self.preprocess_conf = config.preprocess_config self.preprocess_args = {"train": False} self.preprocessing = Transformation(self.preprocess_conf) self.text_feature = TextFeaturizer( - unit_type=config.collator.unit_type, - vocab=config.collator.vocab_filepath, - spm_model_prefix=config.collator.spm_model_prefix) + unit_type=config.unit_type, + vocab=config.vocab_filepath, + spm_model_prefix=config.spm_model_prefix) paddle.set_device('gpu' if self.args.ngpu > 0 else 'cpu') # model - model_conf = config.model + model_conf = config with UpdateConfig(model_conf): - model_conf.input_dim = config.collator.feat_dim + model_conf.input_dim = config.feat_dim model_conf.output_dim = self.text_feature.vocab_size model = U2Model.from_config(model_conf) self.model = model @@ -70,10 +69,6 @@ class U2Infer(): # read audio, sample_rate = soundfile.read( self.audio_file, dtype="int16", always_2d=True) - if sample_rate != self.sr: - logger.error( - f"sample rate error: {sample_rate}, need {self.sr} ") - sys.exit(-1) audio = audio[:, 0] logger.info(f"audio shape: {audio.shape}") @@ -85,17 +80,17 @@ class U2Infer(): ilen = paddle.to_tensor(feat.shape[0]) xs = paddle.to_tensor(feat, dtype='float32').unsqueeze(axis=0) - cfg = self.config.decoding + decode_config = self.config.decode result_transcripts = self.model.decode( xs, ilen, text_feature=self.text_feature, - decoding_method=cfg.decoding_method, - beam_size=cfg.beam_size, - ctc_weight=cfg.ctc_weight, - decoding_chunk_size=cfg.decoding_chunk_size, - num_decoding_left_chunks=cfg.num_decoding_left_chunks, - simulate_streaming=cfg.simulate_streaming) + decoding_method=decode_config.decoding_method, + beam_size=decode_config.beam_size, + ctc_weight=decode_config.ctc_weight, + decoding_chunk_size=decode_config.decoding_chunk_size, + num_decoding_left_chunks=decode_config.num_decoding_left_chunks, + simulate_streaming=decode_config.simulate_streaming) rsl = result_transcripts[0][0] utt = Path(self.audio_file).name logger.info(f"hyp: {utt} {result_transcripts[0][0]}") @@ -133,9 +128,13 @@ if __name__ == "__main__": "--audio_file", type=str, help="path of the input audio file") args = parser.parse_args() - config = get_cfg_defaults() + config = CfgNode(new_allowed=True) if args.config: config.merge_from_file(args.config) + if args.decode_cfg: + decode_confs = CfgNode(new_allowed=True) + decode_confs.merge_from_file(args.decode_cfg) + config.decode = decode_confs if args.opts: config.merge_from_list(args.opts) config.freeze() diff --git a/paddlespeech/s2t/exps/u2/bin/train.py b/paddlespeech/s2t/exps/u2/bin/train.py index d6ee8b30773544cbf3584ba8b4062769783355b3..53c223283f180397e69b6a5290a4572e5a76cc41 100644 --- 
a/paddlespeech/s2t/exps/u2/bin/train.py +++ b/paddlespeech/s2t/exps/u2/bin/train.py @@ -16,8 +16,8 @@ import cProfile import os from paddle import distributed as dist +from yacs.config import CfgNode -from paddlespeech.s2t.exps.u2.config import get_cfg_defaults from paddlespeech.s2t.exps.u2.model import U2Trainer as Trainer from paddlespeech.s2t.training.cli import default_argument_parser from paddlespeech.s2t.utils.utility import print_arguments @@ -44,7 +44,7 @@ if __name__ == "__main__": print_arguments(args, globals()) # https://yaml.org/type/float.html - config = get_cfg_defaults() + config = CfgNode(new_allowed=True) if args.config: config.merge_from_file(args.config) if args.opts: diff --git a/paddlespeech/s2t/exps/u2/config.py b/paddlespeech/s2t/exps/u2/config.py deleted file mode 100644 index 898b0bb2504b96d7a9cdb89f8383ba4961a0247f..0000000000000000000000000000000000000000 --- a/paddlespeech/s2t/exps/u2/config.py +++ /dev/null @@ -1,41 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from yacs.config import CfgNode - -from paddlespeech.s2t.exps.u2.model import U2Tester -from paddlespeech.s2t.exps.u2.model import U2Trainer -from paddlespeech.s2t.io.collator import SpeechCollator -from paddlespeech.s2t.io.dataset import ManifestDataset -from paddlespeech.s2t.models.u2 import U2Model - -_C = CfgNode() - -_C.data = ManifestDataset.params() - -_C.collator = SpeechCollator.params() - -_C.model = U2Model.params() - -_C.training = U2Trainer.params() - -_C.decoding = U2Tester.params() - - -def get_cfg_defaults(): - """Get a yacs CfgNode object with default values for my_project.""" - # Return a clone so that the defaults will not be altered - # This is for the "local variable" use pattern - config = _C.clone() - config.set_new_allowed(True) - return config diff --git a/paddlespeech/s2t/exps/u2/model.py b/paddlespeech/s2t/exps/u2/model.py index 404058edcad00a4c6cc4833f772cba9b3b183a28..992be5cd42376c4b0bc7bbf857a115f0ca037353 100644 --- a/paddlespeech/s2t/exps/u2/model.py +++ b/paddlespeech/s2t/exps/u2/model.py @@ -18,13 +18,11 @@ import time from collections import defaultdict from collections import OrderedDict from contextlib import nullcontext -from typing import Optional import jsonlines import numpy as np import paddle from paddle import distributed as dist -from yacs.config import CfgNode from paddlespeech.s2t.frontend.featurizer import TextFeaturizer from paddlespeech.s2t.io.dataloader import BatchDataLoader @@ -46,38 +44,11 @@ logger = Log(__name__).getlog() class U2Trainer(Trainer): - @classmethod - def params(cls, config: Optional[CfgNode]=None) -> CfgNode: - # training config - default = CfgNode( - dict( - n_epoch=50, # train epochs - log_interval=100, # steps - accum_grad=1, # accum grad by # steps - global_grad_clip=5.0, # the global norm clip - )) - default.optim = 'adam' - default.optim_conf = CfgNode( - dict( - lr=5e-4, # learning rate - weight_decay=1e-6, # the coeff of weight decay - )) - 
default.scheduler = 'warmuplr' - default.scheduler_conf = CfgNode( - dict( - warmup_steps=25000, - lr_decay=1.0, # learning rate decay - )) - - if config is not None: - config.merge_from_other_cfg(default) - return default - def __init__(self, config, args): super().__init__(config, args) def train_batch(self, batch_index, batch_data, msg): - train_conf = self.config.training + train_conf = self.config start = time.time() # forward @@ -120,7 +91,7 @@ class U2Trainer(Trainer): for k, v in losses_np.items(): report(k, v) - report("batch_size", self.config.collator.batch_size) + report("batch_size", self.config.batch_size) report("accum", train_conf.accum_grad) report("step_cost", iteration_time) @@ -153,7 +124,7 @@ class U2Trainer(Trainer): if ctc_loss: valid_losses['val_ctc_loss'].append(float(ctc_loss)) - if (i + 1) % self.config.training.log_interval == 0: + if (i + 1) % self.config.log_interval == 0: valid_dump = {k: np.mean(v) for k, v in valid_losses.items()} valid_dump['val_history_loss'] = total_loss / num_seen_utts @@ -182,7 +153,7 @@ class U2Trainer(Trainer): self.before_train() logger.info(f"Train Total Examples: {len(self.train_loader.dataset)}") - while self.epoch < self.config.training.n_epoch: + while self.epoch < self.config.n_epoch: with Timer("Epoch-Train Time Cost: {}"): self.model.train() try: @@ -214,8 +185,7 @@ class U2Trainer(Trainer): k.split(',')) == 2 else "" msg += "," msg = msg[:-1] # remove the last "," - if (batch_index + 1 - ) % self.config.training.log_interval == 0: + if (batch_index + 1) % self.config.log_interval == 0: logger.info(msg) data_start_time = time.time() except Exception as e: @@ -252,30 +222,31 @@ class U2Trainer(Trainer): if self.train: # train/valid dataset, return token ids self.train_loader = BatchDataLoader( - json_file=config.data.train_manifest, + json_file=config.train_manifest, train_mode=True, - sortagrad=False, - batch_size=config.collator.batch_size, - maxlen_in=float('inf'), - maxlen_out=float('inf'), - minibatches=0, + sortagrad=config.sortagrad, + batch_size=config.batch_size, + maxlen_in=config.maxlen_in, + maxlen_out=config.maxlen_out, + minibatches=config.minibatches, mini_batch_size=self.args.ngpu, - batch_count='auto', - batch_bins=0, - batch_frames_in=0, - batch_frames_out=0, - batch_frames_inout=0, - preprocess_conf=config.collator. - augmentation_config, # aug will be off when train_mode=False - n_iter_processes=config.collator.num_workers, + batch_count=config.batch_count, + batch_bins=config.batch_bins, + batch_frames_in=config.batch_frames_in, + batch_frames_out=config.batch_frames_out, + batch_frames_inout=config.batch_frames_inout, + preprocess_conf=config.preprocess_config, + n_iter_processes=config.num_workers, subsampling_factor=1, - num_encs=1) + num_encs=1, + dist_sampler=False, + shortest_first=False) self.valid_loader = BatchDataLoader( - json_file=config.data.dev_manifest, + json_file=config.dev_manifest, train_mode=False, sortagrad=False, - batch_size=config.collator.batch_size, + batch_size=config.batch_size, maxlen_in=float('inf'), maxlen_out=float('inf'), minibatches=0, @@ -285,19 +256,22 @@ class U2Trainer(Trainer): batch_frames_in=0, batch_frames_out=0, batch_frames_inout=0, - preprocess_conf=config.collator. 
- augmentation_config, # aug will be off when train_mode=False - n_iter_processes=config.collator.num_workers, + preprocess_conf=config.preprocess_config, + n_iter_processes=config.num_workers, subsampling_factor=1, - num_encs=1) + num_encs=1, + dist_sampler=False, + shortest_first=False) logger.info("Setup train/valid Dataloader!") else: + decode_batch_size = config.get('decode', dict()).get( + 'decode_batch_size', 1) # test dataset, return raw text self.test_loader = BatchDataLoader( - json_file=config.data.test_manifest, + json_file=config.test_manifest, train_mode=False, sortagrad=False, - batch_size=config.decoding.batch_size, + batch_size=decode_batch_size, maxlen_in=float('inf'), maxlen_out=float('inf'), minibatches=0, @@ -307,17 +281,16 @@ class U2Trainer(Trainer): batch_frames_in=0, batch_frames_out=0, batch_frames_inout=0, - preprocess_conf=config.collator. - augmentation_config, # aug will be off when train_mode=False + preprocess_conf=config.preprocess_config, n_iter_processes=1, subsampling_factor=1, num_encs=1) self.align_loader = BatchDataLoader( - json_file=config.data.test_manifest, + json_file=config.test_manifest, train_mode=False, sortagrad=False, - batch_size=config.decoding.batch_size, + batch_size=decode_batch_size, maxlen_in=float('inf'), maxlen_out=float('inf'), minibatches=0, @@ -327,8 +300,7 @@ class U2Trainer(Trainer): batch_frames_in=0, batch_frames_out=0, batch_frames_inout=0, - preprocess_conf=config.collator. - augmentation_config, # aug will be off when train_mode=False + preprocess_conf=config.preprocess_config, n_iter_processes=1, subsampling_factor=1, num_encs=1) @@ -336,7 +308,7 @@ class U2Trainer(Trainer): def setup_model(self): config = self.config - model_conf = config.model + model_conf = config with UpdateConfig(model_conf): if self.train: @@ -359,7 +331,7 @@ class U2Trainer(Trainer): if not self.train: return - train_config = config.training + train_config = config optim_type = train_config.optim optim_conf = train_config.optim_conf scheduler_type = train_config.scheduler @@ -379,7 +351,7 @@ class U2Trainer(Trainer): config, parameters, lr_scheduler=None, ): - train_config = config.training + train_config = config optim_type = train_config.optim optim_conf = train_config.optim_conf scheduler_type = train_config.scheduler @@ -404,41 +376,12 @@ class U2Trainer(Trainer): class U2Tester(U2Trainer): - @classmethod - def params(cls, config: Optional[CfgNode]=None) -> CfgNode: - # decoding config - default = CfgNode( - dict( - alpha=2.5, # Coef of LM for beam search. - beta=0.3, # Coef of WC for beam search. - cutoff_prob=1.0, # Cutoff probability for pruning. - cutoff_top_n=40, # Cutoff number for pruning. - lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm', # Filepath for language model. - decoding_method='attention', # Decoding method. Options: 'attention', 'ctc_greedy_search', - # 'ctc_prefix_beam_search', 'attention_rescoring' - error_rate_type='wer', # Error rate type for evaluation. Options `wer`, 'cer' - num_proc_bsearch=8, # # of CPUs for beam search. - beam_size=10, # Beam search width. - batch_size=16, # decoding batch size - ctc_weight=0.0, # ctc weight for attention rescoring decode mode. - decoding_chunk_size=-1, # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks=-1, # number of left chunks for decoding. Defaults to -1. 
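setup_model above now patches input_dim and output_dim into what is the top-level (and normally frozen) config, through the UpdateConfig context manager. Assuming UpdateConfig does no more than defrost the yacs node on entry and re-freeze it on exit, its behaviour can be sketched as follows (update_config here is a stand-in, not the real import):

    from contextlib import contextmanager

    @contextmanager
    def update_config(config):
        """Sketch of UpdateConfig: make a frozen yacs CfgNode briefly writable."""
        config.defrost()
        try:
            yield config
        finally:
            config.freeze()

    # Usage mirroring setup_model after this refactor:
    #   with update_config(config):
    #       config.input_dim = feature_size   # e.g. train_loader.feat_dim
    #       config.output_dim = vocab_size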
- simulate_streaming=False, # simulate streaming inference. Defaults to False. - )) - - if config is not None: - config.merge_from_other_cfg(default) - return default - def __init__(self, config, args): super().__init__(config, args) self.text_feature = TextFeaturizer( - unit_type=self.config.collator.unit_type, - vocab=self.config.collator.vocab_filepath, - spm_model_prefix=self.config.collator.spm_model_prefix) + unit_type=self.config.unit_type, + vocab=self.config.vocab_filepath, + spm_model_prefix=self.config.spm_model_prefix) self.vocab_list = self.text_feature.vocab_list def id2token(self, texts, texts_len, text_feature): @@ -457,10 +400,10 @@ class U2Tester(U2Trainer): texts, texts_len, fout=None): - cfg = self.config.decoding + decode_config = self.config.decode errors_sum, len_refs, num_ins = 0.0, 0, 0 - errors_func = error_rate.char_errors if cfg.error_rate_type == 'cer' else error_rate.word_errors - error_rate_func = error_rate.cer if cfg.error_rate_type == 'cer' else error_rate.wer + errors_func = error_rate.char_errors if decode_config.error_rate_type == 'cer' else error_rate.word_errors + error_rate_func = error_rate.cer if decode_config.error_rate_type == 'cer' else error_rate.wer start_time = time.time() target_transcripts = self.id2token(texts, texts_len, self.text_feature) @@ -468,12 +411,12 @@ class U2Tester(U2Trainer): audio, audio_len, text_feature=self.text_feature, - decoding_method=cfg.decoding_method, - beam_size=cfg.beam_size, - ctc_weight=cfg.ctc_weight, - decoding_chunk_size=cfg.decoding_chunk_size, - num_decoding_left_chunks=cfg.num_decoding_left_chunks, - simulate_streaming=cfg.simulate_streaming) + decoding_method=decode_config.decoding_method, + beam_size=decode_config.beam_size, + ctc_weight=decode_config.ctc_weight, + decoding_chunk_size=decode_config.decoding_chunk_size, + num_decoding_left_chunks=decode_config.num_decoding_left_chunks, + simulate_streaming=decode_config.simulate_streaming) decode_time = time.time() - start_time for utt, target, result, rec_tids in zip( @@ -492,15 +435,15 @@ class U2Tester(U2Trainer): logger.info(f"Utt: {utt}") logger.info(f"Ref: {target}") logger.info(f"Hyp: {result}") - logger.info("One example error rate [%s] = %f" % - (cfg.error_rate_type, error_rate_func(target, result))) + logger.info("One example error rate [%s] = %f" % ( + decode_config.error_rate_type, error_rate_func(target, result))) return dict( errors_sum=errors_sum, len_refs=len_refs, num_ins=num_ins, # num examples error_rate=errors_sum / len_refs, - error_rate_type=cfg.error_rate_type, + error_rate_type=decode_config.error_rate_type, num_frames=audio_len.sum().numpy().item(), decode_time=decode_time) @@ -511,7 +454,7 @@ class U2Tester(U2Trainer): self.model.eval() logger.info(f"Test Total Examples: {len(self.test_loader.dataset)}") - stride_ms = self.config.collator.stride_ms + stride_ms = self.config.stride_ms error_rate_type = None errors_sum, len_refs, num_ins = 0.0, 0, 0 num_frames = 0.0 @@ -562,15 +505,15 @@ class U2Tester(U2Trainer): "ref_len": len_refs, "decode_method": - self.config.decoding.decoding_method, + self.config.decode.decoding_method, }) f.write(data + '\n') @paddle.no_grad() def align(self): ctc_utils.ctc_align(self.config, self.model, self.align_loader, - self.config.decoding.batch_size, - self.config.collator.stride_ms, self.vocab_list, + self.config.decode.decode_batch_size, + self.config.stride_ms, self.vocab_list, self.args.result_file) def load_inferspec(self): @@ -581,10 +524,10 @@ class U2Tester(U2Trainer): 
List[paddle.static.InputSpec]: input spec. """ from paddlespeech.s2t.models.u2 import U2InferModel - infer_model = U2InferModel.from_pretrained(self.test_loader, - self.config.model.clone(), + infer_model = U2InferModel.from_pretrained(self.train_loader, + self.config.clone(), self.args.checkpoint_path) - feat_dim = self.test_loader.feat_dim + feat_dim = self.train_loader.feat_dim input_spec = [ paddle.static.InputSpec(shape=[1, None, feat_dim], dtype='float32'), # audio, [B,T,D] diff --git a/paddlespeech/s2t/exps/u2/trainer.py b/paddlespeech/s2t/exps/u2/trainer.py index 22a0a3c58e44fdc55d2269c724d3ebdf898016b3..ab87c30d6cdb83053b53e080094f5c52ef8050f1 100644 --- a/paddlespeech/s2t/exps/u2/trainer.py +++ b/paddlespeech/s2t/exps/u2/trainer.py @@ -44,77 +44,75 @@ class U2Trainer(Trainer): def setup_dataloader(self): config = self.config.clone() config.defrost() - config.collator.keep_transcription_text = False + config.keep_transcription_text = False # train/valid dataset, return token ids - config.data.manifest = config.data.train_manifest + config.manifest = config.train_manifest train_dataset = ManifestDataset.from_config(config) - config.data.manifest = config.data.dev_manifest + config.manifest = config.dev_manifest dev_dataset = ManifestDataset.from_config(config) collate_fn_train = SpeechCollator.from_config(config) - config.collator.augmentation_config = "" collate_fn_dev = SpeechCollator.from_config(config) if self.parallel: batch_sampler = SortagradDistributedBatchSampler( train_dataset, - batch_size=config.collator.batch_size, + batch_size=config.batch_size, num_replicas=None, rank=None, shuffle=True, drop_last=True, - sortagrad=config.collator.sortagrad, - shuffle_method=config.collator.shuffle_method) + sortagrad=config.sortagrad, + shuffle_method=config.shuffle_method) else: batch_sampler = SortagradBatchSampler( train_dataset, shuffle=True, - batch_size=config.collator.batch_size, + batch_size=config.batch_size, drop_last=True, - sortagrad=config.collator.sortagrad, - shuffle_method=config.collator.shuffle_method) + sortagrad=config.sortagrad, + shuffle_method=config.shuffle_method) self.train_loader = DataLoader( train_dataset, batch_sampler=batch_sampler, collate_fn=collate_fn_train, - num_workers=config.collator.num_workers, ) + num_workers=config.num_workers, ) self.valid_loader = DataLoader( dev_dataset, - batch_size=config.collator.batch_size, + batch_size=config.batch_size, shuffle=False, drop_last=False, collate_fn=collate_fn_dev, - num_workers=config.collator.num_workers, ) + num_workers=config.num_workers, ) # test dataset, return raw text - config.data.manifest = config.data.test_manifest + config.manifest = config.test_manifest # filter test examples, will cause less examples, but no mismatch with training # and can use large batch size , save training time, so filter test egs now. 
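The trainer hunks above and below repeat one mechanical rename: every config.data.* and config.collator.* key moves to the top level of the flat config. An illustrative (not exhaustive) example of the resulting shape, with placeholder values chosen only to show the expected YAML keys:

    from yacs.config import CfgNode

    config = CfgNode(new_allowed=True)
    config.train_manifest = 'data/manifest.train'  # was config.data.train_manifest
    config.dev_manifest = 'data/manifest.dev'      # was config.data.dev_manifest
    config.test_manifest = 'data/manifest.test'    # was config.data.test_manifest
    config.batch_size = 64                         # was config.collator.batch_size
    config.sortagrad = True                        # was config.collator.sortagrad
    config.shuffle_method = 'batch_shuffle'        # was config.collator.shuffle_method
    config.num_workers = 2                         # was config.collator.num_workers
    config.keep_transcription_text = False         # set by the trainer, no longer namespaced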
- config.data.min_input_len = 0.0 # second - config.data.max_input_len = float('inf') # second - config.data.min_output_len = 0.0 # tokens - config.data.max_output_len = float('inf') # tokens - config.data.min_output_input_ratio = 0.00 - config.data.max_output_input_ratio = float('inf') + config.min_input_len = 0.0 # second + config.max_input_len = float('inf') # second + config.min_output_len = 0.0 # tokens + config.max_output_len = float('inf') # tokens + config.min_output_input_ratio = 0.00 + config.max_output_input_ratio = float('inf') test_dataset = ManifestDataset.from_config(config) # return text ord id - config.collator.keep_transcription_text = True - config.collator.augmentation_config = "" + config.keep_transcription_text = True self.test_loader = DataLoader( test_dataset, - batch_size=config.decoding.batch_size, + batch_size=config.decode.batch_size, shuffle=False, drop_last=False, collate_fn=SpeechCollator.from_config(config)) # return text token id - config.collator.keep_transcription_text = False + config.keep_transcription_text = False self.align_loader = DataLoader( test_dataset, - batch_size=config.decoding.batch_size, + batch_size=config.decode.batch_size, shuffle=False, drop_last=False, collate_fn=SpeechCollator.from_config(config)) @@ -122,7 +120,7 @@ class U2Trainer(Trainer): def setup_model(self): config = self.config - model_conf = config.model + model_conf = config with UpdateConfig(model_conf): model_conf.input_dim = self.train_loader.collate_fn.feature_size model_conf.output_dim = self.train_loader.collate_fn.vocab_size @@ -136,7 +134,7 @@ class U2Trainer(Trainer): logger.info(f"{model}") layer_tools.print_params(model, logger.info) - train_config = config.training + train_config = config optim_type = train_config.optim optim_conf = train_config.optim_conf scheduler_type = train_config.scheduler @@ -156,7 +154,7 @@ class U2Trainer(Trainer): config, parameters, lr_scheduler=None, ): - train_config = config.training + train_config = config optim_type = train_config.optim optim_conf = train_config.optim_conf scheduler_type = train_config.scheduler @@ -182,7 +180,7 @@ class U2Trainer(Trainer): def setup_updater(self): output_dir = self.output_dir - config = self.config.training + config = self.config updater = U2Updater( model=self.model, diff --git a/paddlespeech/s2t/exps/u2_kaldi/bin/test.py b/paddlespeech/s2t/exps/u2_kaldi/bin/test.py index 67bed349792930a0ae90180650e6de8d1e47502e..422483b9797059f8b836c8abe6ee4546e56619f7 100644 --- a/paddlespeech/s2t/exps/u2_kaldi/bin/test.py +++ b/paddlespeech/s2t/exps/u2_kaldi/bin/test.py @@ -69,6 +69,10 @@ if __name__ == "__main__": config = CfgNode() config.set_new_allowed(True) config.merge_from_file(args.config) + if args.decode_cfg: + decode_confs = CfgNode(new_allowed=True) + decode_confs.merge_from_file(args.decode_cfg) + config.decode = decode_confs if args.opts: config.merge_from_list(args.opts) config.freeze() diff --git a/paddlespeech/s2t/exps/u2_kaldi/model.py b/paddlespeech/s2t/exps/u2_kaldi/model.py index 9b8274ad64bbdd704e030c8f3f61d72d71c9c9b6..bc995977ada577770612f99d05387ed0bb87d39e 100644 --- a/paddlespeech/s2t/exps/u2_kaldi/model.py +++ b/paddlespeech/s2t/exps/u2_kaldi/model.py @@ -17,13 +17,11 @@ import os import time from collections import defaultdict from contextlib import nullcontext -from typing import Optional import jsonlines import numpy as np import paddle from paddle import distributed as dist -from yacs.config import CfgNode from paddlespeech.s2t.frontend.featurizer import TextFeaturizer from 
paddlespeech.s2t.frontend.utility import load_dict @@ -43,44 +41,12 @@ from paddlespeech.s2t.utils.utility import UpdateConfig logger = Log(__name__).getlog() -def get_cfg_defaults(): - """Get a yacs CfgNode object with default values for my_project.""" - # Return a clone so that the defaults will not be altered - # This is for the "local variable" use pattern - _C = CfgNode() - - _C.model = U2Model.params() - - _C.training = U2Trainer.params() - - _C.decoding = U2Tester.params() - - config = _C.clone() - config.set_new_allowed(True) - return config - - class U2Trainer(Trainer): - @classmethod - def params(cls, config: Optional[CfgNode]=None) -> CfgNode: - # training config - default = CfgNode( - dict( - n_epoch=50, # train epochs - log_interval=100, # steps - accum_grad=1, # accum grad by # steps - checkpoint=dict( - kbest_n=50, - latest_n=5, ), )) - if config is not None: - config.merge_from_other_cfg(default) - return default - def __init__(self, config, args): super().__init__(config, args) def train_batch(self, batch_index, batch_data, msg): - train_conf = self.config.training + train_conf = self.config start = time.time() # forward @@ -122,7 +88,7 @@ class U2Trainer(Trainer): if (batch_index + 1) % train_conf.log_interval == 0: msg += "train time: {:>.3f}s, ".format(iteration_time) - msg += "batch size: {}, ".format(self.config.collator.batch_size) + msg += "batch size: {}, ".format(self.config.batch_size) msg += "accum: {}, ".format(train_conf.accum_grad) msg += ', '.join('{}: {:>.6f}'.format(k, v) for k, v in losses_np.items()) @@ -157,7 +123,7 @@ class U2Trainer(Trainer): if ctc_loss: valid_losses['val_ctc_loss'].append(float(ctc_loss)) - if (i + 1) % self.config.training.log_interval == 0: + if (i + 1) % self.config.log_interval == 0: valid_dump = {k: np.mean(v) for k, v in valid_losses.items()} valid_dump['val_history_loss'] = total_loss / num_seen_utts @@ -186,7 +152,7 @@ class U2Trainer(Trainer): self.before_train() logger.info(f"Train Total Examples: {len(self.train_loader.dataset)}") - while self.epoch < self.config.training.n_epoch: + while self.epoch < self.config.n_epoch: with Timer("Epoch-Train Time Cost: {}"): self.model.train() try: @@ -235,10 +201,10 @@ class U2Trainer(Trainer): config = self.config.clone() # train/valid dataset, return token ids self.train_loader = BatchDataLoader( - json_file=config.data.train_manifest, + json_file=config.train_manifest, train_mode=True, sortagrad=False, - batch_size=config.collator.batch_size, + batch_size=config.batch_size, maxlen_in=float('inf'), maxlen_out=float('inf'), minibatches=0, @@ -248,16 +214,16 @@ class U2Trainer(Trainer): batch_frames_in=0, batch_frames_out=0, batch_frames_inout=0, - preprocess_conf=config.collator.augmentation_config, - n_iter_processes=config.collator.num_workers, + preprocess_conf=config.preprocess_config, + n_iter_processes=config.num_workers, subsampling_factor=1, num_encs=1) self.valid_loader = BatchDataLoader( - json_file=config.data.dev_manifest, + json_file=config.dev_manifest, train_mode=False, sortagrad=False, - batch_size=config.collator.batch_size, + batch_size=config.batch_size, maxlen_in=float('inf'), maxlen_out=float('inf'), minibatches=0, @@ -268,16 +234,18 @@ class U2Trainer(Trainer): batch_frames_out=0, batch_frames_inout=0, preprocess_conf=None, - n_iter_processes=config.collator.num_workers, + n_iter_processes=config.num_workers, subsampling_factor=1, num_encs=1) + decode_batch_size = config.get('decode', dict()).get( + 'decode_batch_size', 1) # test dataset, return raw text 
self.test_loader = BatchDataLoader( - json_file=config.data.test_manifest, + json_file=config.test_manifest, train_mode=False, sortagrad=False, - batch_size=config.decoding.batch_size, + batch_size=decode_batch_size, maxlen_in=float('inf'), maxlen_out=float('inf'), minibatches=0, @@ -293,10 +261,10 @@ class U2Trainer(Trainer): num_encs=1) self.align_loader = BatchDataLoader( - json_file=config.data.test_manifest, + json_file=config.test_manifest, train_mode=False, sortagrad=False, - batch_size=config.decoding.batch_size, + batch_size=decode_batch_size, maxlen_in=float('inf'), maxlen_out=float('inf'), minibatches=0, @@ -316,7 +284,7 @@ class U2Trainer(Trainer): config = self.config # model - model_conf = config.model + model_conf = config with UpdateConfig(model_conf): model_conf.input_dim = self.train_loader.feat_dim model_conf.output_dim = self.train_loader.vocab_size @@ -360,41 +328,12 @@ class U2Trainer(Trainer): class U2Tester(U2Trainer): - @classmethod - def params(cls, config: Optional[CfgNode]=None) -> CfgNode: - # decoding config - default = CfgNode( - dict( - alpha=2.5, # Coef of LM for beam search. - beta=0.3, # Coef of WC for beam search. - cutoff_prob=1.0, # Cutoff probability for pruning. - cutoff_top_n=40, # Cutoff number for pruning. - lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm', # Filepath for language model. - decoding_method='attention', # Decoding method. Options: 'attention', 'ctc_greedy_search', - # 'ctc_prefix_beam_search', 'attention_rescoring' - error_rate_type='wer', # Error rate type for evaluation. Options `wer`, 'cer' - num_proc_bsearch=8, # # of CPUs for beam search. - beam_size=10, # Beam search width. - batch_size=16, # decoding batch size - ctc_weight=0.0, # ctc weight for attention rescoring decode mode. - decoding_chunk_size=-1, # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks=-1, # number of left chunks for decoding. Defaults to -1. - simulate_streaming=False, # simulate streaming inference. Defaults to False. 
- )) - - if config is not None: - config.merge_from_other_cfg(default) - return default - def __init__(self, config, args): super().__init__(config, args) self.text_feature = TextFeaturizer( - unit_type=self.config.collator.unit_type, - vocab=self.config.collator.vocab_filepath, - spm_model_prefix=self.config.collator.spm_model_prefix) + unit_type=self.config.unit_type, + vocab=self.config.vocab_filepath, + spm_model_prefix=self.config.spm_model_prefix) self.vocab_list = self.text_feature.vocab_list def id2token(self, texts, texts_len, text_feature): @@ -413,10 +352,10 @@ class U2Tester(U2Trainer): texts, texts_len, fout=None): - cfg = self.config.decoding + decode_cfg = self.config.decode errors_sum, len_refs, num_ins = 0.0, 0, 0 - errors_func = error_rate.char_errors if cfg.error_rate_type == 'cer' else error_rate.word_errors - error_rate_func = error_rate.cer if cfg.error_rate_type == 'cer' else error_rate.wer + errors_func = error_rate.char_errors if decode_cfg.error_rate_type == 'cer' else error_rate.word_errors + error_rate_func = error_rate.cer if decode_cfg.error_rate_type == 'cer' else error_rate.wer start_time = time.time() target_transcripts = self.id2token(texts, texts_len, self.text_feature) @@ -424,12 +363,12 @@ class U2Tester(U2Trainer): audio, audio_len, text_feature=self.text_feature, - decoding_method=cfg.decoding_method, - beam_size=cfg.beam_size, - ctc_weight=cfg.ctc_weight, - decoding_chunk_size=cfg.decoding_chunk_size, - num_decoding_left_chunks=cfg.num_decoding_left_chunks, - simulate_streaming=cfg.simulate_streaming) + decoding_method=decode_cfg.decoding_method, + beam_size=decode_cfg.beam_size, + ctc_weight=decode_cfg.ctc_weight, + decoding_chunk_size=decode_cfg.decoding_chunk_size, + num_decoding_left_chunks=decode_cfg.num_decoding_left_chunks, + simulate_streaming=decode_cfg.simulate_streaming) decode_time = time.time() - start_time for i, (utt, target, result, rec_tids) in enumerate( @@ -449,15 +388,16 @@ class U2Tester(U2Trainer): logger.info(f"Utt: {utt}") logger.info(f"Ref: {target}") logger.info(f"Hyp: {result}") - logger.info("One example error rate [%s] = %f" % - (cfg.error_rate_type, error_rate_func(target, result))) + logger.info( + "One example error rate [%s] = %f" % + (decode_cfg.error_rate_type, error_rate_func(target, result))) return dict( errors_sum=errors_sum, len_refs=len_refs, num_ins=num_ins, # num examples error_rate=errors_sum / len_refs, - error_rate_type=cfg.error_rate_type, + error_rate_type=decode_cfg.error_rate_type, num_frames=audio_len.sum().numpy().item(), decode_time=decode_time) @@ -468,7 +408,7 @@ class U2Tester(U2Trainer): self.model.eval() logger.info(f"Test Total Examples: {len(self.test_loader.dataset)}") - stride_ms = self.config.collator.stride_ms + stride_ms = self.config.stride_ms error_rate_type = None errors_sum, len_refs, num_ins = 0.0, 0, 0 num_frames = 0.0 @@ -519,15 +459,15 @@ class U2Tester(U2Trainer): "ref_len": len_refs, "decode_method": - self.config.decoding.decoding_method, + self.config.decode.decoding_method, }) f.write(data + '\n') @paddle.no_grad() def align(self): ctc_utils.ctc_align(self.config, self.model, self.align_loader, - self.config.decoding.batch_size, - self.config.collator.stride_ms, self.vocab_list, + self.config.decode.decode_batch_size, + self.config.stride_ms, self.vocab_list, self.args.result_file) def load_inferspec(self): @@ -539,7 +479,7 @@ class U2Tester(U2Trainer): """ from paddlespeech.s2t.models.u2 import U2InferModel infer_model = U2InferModel.from_pretrained(self.test_loader, - 
self.config.model.clone(), + self.config.clone(), self.args.checkpoint_path) feat_dim = self.test_loader.feat_dim input_spec = [ diff --git a/paddlespeech/s2t/exps/u2_st/bin/export.py b/paddlespeech/s2t/exps/u2_st/bin/export.py index 69d9718f89e1c15354419f4f7f4e102cad6d30d0..c641152fe4bd75b5c72617370b02b29cc79f0435 100644 --- a/paddlespeech/s2t/exps/u2_st/bin/export.py +++ b/paddlespeech/s2t/exps/u2_st/bin/export.py @@ -12,7 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. """Export for U2 model.""" -from paddlespeech.s2t.exps.u2_st.config import get_cfg_defaults +from yacs.config import CfgNode + from paddlespeech.s2t.exps.u2_st.model import U2STTester as Tester from paddlespeech.s2t.training.cli import default_argument_parser from paddlespeech.s2t.utils.utility import print_arguments @@ -31,14 +32,14 @@ def main(config, args): if __name__ == "__main__": parser = default_argument_parser() - # save jit model to + # save jit model to parser.add_argument( "--export_path", type=str, help="path of the jit model to save") args = parser.parse_args() print_arguments(args, globals()) # https://yaml.org/type/float.html - config = get_cfg_defaults() + config = CfgNode(new_allowed=True) if args.config: config.merge_from_file(args.config) if args.opts: diff --git a/paddlespeech/s2t/exps/u2_st/bin/test.py b/paddlespeech/s2t/exps/u2_st/bin/test.py index 93c2fee0acf5caef52914f6fcaaba735a88afaf7..1d70a310347c6eecba7358124a58c9de9ef11c31 100644 --- a/paddlespeech/s2t/exps/u2_st/bin/test.py +++ b/paddlespeech/s2t/exps/u2_st/bin/test.py @@ -14,12 +14,13 @@ """Evaluation for U2 model.""" import cProfile -from paddlespeech.s2t.exps.u2_st.config import get_cfg_defaults +from yacs.config import CfgNode + from paddlespeech.s2t.exps.u2_st.model import U2STTester as Tester from paddlespeech.s2t.training.cli import default_argument_parser from paddlespeech.s2t.utils.utility import print_arguments -# TODO(hui zhang): dynamic load +# TODO(hui zhang): dynamic load def main_sp(config, args): @@ -35,16 +36,20 @@ def main(config, args): if __name__ == "__main__": parser = default_argument_parser() - # save asr result to + # save asr result to parser.add_argument( "--result_file", type=str, help="path of save the asr result") args = parser.parse_args() print_arguments(args, globals()) # https://yaml.org/type/float.html - config = get_cfg_defaults() + config = CfgNode(new_allowed=True) if args.config: config.merge_from_file(args.config) + if args.decode_cfg: + decode_conf = CfgNode(new_allowed=True) + decode_conf.merge_from_file(args.decode_cfg) + config.decode = decode_conf if args.opts: config.merge_from_list(args.opts) config.freeze() diff --git a/paddlespeech/s2t/exps/u2_st/bin/train.py b/paddlespeech/s2t/exps/u2_st/bin/train.py index 58496c8874b3dd433d006cdb4c40daa4be051ed7..4dec9ec8ae5da869f8b78282ea19c7a6964610db 100644 --- a/paddlespeech/s2t/exps/u2_st/bin/train.py +++ b/paddlespeech/s2t/exps/u2_st/bin/train.py @@ -16,8 +16,8 @@ import cProfile import os from paddle import distributed as dist +from yacs.config import CfgNode -from paddlespeech.s2t.exps.u2_st.config import get_cfg_defaults from paddlespeech.s2t.exps.u2_st.model import U2STTrainer as Trainer from paddlespeech.s2t.training.cli import default_argument_parser from paddlespeech.s2t.utils.utility import print_arguments @@ -42,7 +42,7 @@ if __name__ == "__main__": print_arguments(args, globals()) # https://yaml.org/type/float.html - config = get_cfg_defaults() + config = 
CfgNode(new_allowed=True) if args.config: config.merge_from_file(args.config) if args.opts: diff --git a/paddlespeech/s2t/exps/u2_st/config.py b/paddlespeech/s2t/exps/u2_st/config.py deleted file mode 100644 index a48f9106a8ddaa14c85e1f0c8ed450d27e931d11..0000000000000000000000000000000000000000 --- a/paddlespeech/s2t/exps/u2_st/config.py +++ /dev/null @@ -1,41 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from yacs.config import CfgNode - -from paddlespeech.s2t.exps.u2_st.model import U2STTester -from paddlespeech.s2t.exps.u2_st.model import U2STTrainer -from paddlespeech.s2t.io.collator import SpeechCollator -from paddlespeech.s2t.io.dataset import ManifestDataset -from paddlespeech.s2t.models.u2_st import U2STModel - -_C = CfgNode() - -_C.data = ManifestDataset.params() - -_C.collator = SpeechCollator.params() - -_C.model = U2STModel.params() - -_C.training = U2STTrainer.params() - -_C.decoding = U2STTester.params() - - -def get_cfg_defaults(): - """Get a yacs CfgNode object with default values for my_project.""" - # Return a clone so that the defaults will not be altered - # This is for the "local variable" use pattern - config = _C.clone() - config.set_new_allowed(True) - return config diff --git a/paddlespeech/s2t/exps/u2_st/model.py b/paddlespeech/s2t/exps/u2_st/model.py index a3b39df7cd4d8d6a5783aa874e0164b589aac01d..b03ca38b6684d0d32e63ba5d5a672870fa5c38ff 100644 --- a/paddlespeech/s2t/exps/u2_st/model.py +++ b/paddlespeech/s2t/exps/u2_st/model.py @@ -16,28 +16,24 @@ import json import os import time from collections import defaultdict +from collections import OrderedDict from contextlib import nullcontext -from typing import Optional import jsonlines import numpy as np import paddle from paddle import distributed as dist -from paddle.io import DataLoader -from yacs.config import CfgNode - -from paddlespeech.s2t.io.collator import SpeechCollator -from paddlespeech.s2t.io.collator import TripletSpeechCollator -from paddlespeech.s2t.io.dataset import ManifestDataset -from paddlespeech.s2t.io.sampler import SortagradBatchSampler -from paddlespeech.s2t.io.sampler import SortagradDistributedBatchSampler + +from paddlespeech.s2t.frontend.featurizer import TextFeaturizer +from paddlespeech.s2t.io.dataloader import BatchDataLoader from paddlespeech.s2t.models.u2_st import U2STModel -from paddlespeech.s2t.training.gradclip import ClipGradByGlobalNormWithLog -from paddlespeech.s2t.training.scheduler import WarmupLR +from paddlespeech.s2t.training.optimizer import OptimizerFactory +from paddlespeech.s2t.training.reporter import ObsScope +from paddlespeech.s2t.training.reporter import report +from paddlespeech.s2t.training.scheduler import LRSchedulerFactory from paddlespeech.s2t.training.timer import Timer from paddlespeech.s2t.training.trainer import Trainer from paddlespeech.s2t.utils import bleu_score -from paddlespeech.s2t.utils import ctc_utils from paddlespeech.s2t.utils import layer_tools from 
paddlespeech.s2t.utils import mp_tools from paddlespeech.s2t.utils.log import Log @@ -47,38 +43,11 @@ logger = Log(__name__).getlog() class U2STTrainer(Trainer): - @classmethod - def params(cls, config: Optional[CfgNode]=None) -> CfgNode: - # training config - default = CfgNode( - dict( - n_epoch=50, # train epochs - log_interval=100, # steps - accum_grad=1, # accum grad by # steps - global_grad_clip=5.0, # the global norm clip - )) - default.optim = 'adam' - default.optim_conf = CfgNode( - dict( - lr=5e-4, # learning rate - weight_decay=1e-6, # the coeff of weight decay - )) - default.scheduler = 'warmuplr' - default.scheduler_conf = CfgNode( - dict( - warmup_steps=25000, - lr_decay=1.0, # learning rate decay - )) - - if config is not None: - config.merge_from_other_cfg(default) - return default - def __init__(self, config, args): super().__init__(config, args) def train_batch(self, batch_index, batch_data, msg): - train_conf = self.config.training + train_conf = self.config start = time.time() # forward utt, audio, audio_len, text, text_len = batch_data @@ -96,6 +65,8 @@ class U2STTrainer(Trainer): # loss div by `batch_size * accum_grad` loss /= train_conf.accum_grad losses_np = {'loss': float(loss) * train_conf.accum_grad} + if st_loss: + losses_np['st_loss'] = float(st_loss) if attention_loss: losses_np['att_loss'] = float(attention_loss) if ctc_loss: @@ -125,9 +96,15 @@ class U2STTrainer(Trainer): iteration_time = time.time() - start + for k, v in losses_np.items(): + report(k, v) + report("batch_size", self.config.batch_size) + report("accum", train_conf.accum_grad) + report("step_cost", iteration_time) + if (batch_index + 1) % train_conf.log_interval == 0: msg += "train time: {:>.3f}s, ".format(iteration_time) - msg += "batch size: {}, ".format(self.config.collator.batch_size) + msg += "batch size: {}, ".format(self.config.batch_size) msg += "accum: {}, ".format(train_conf.accum_grad) msg += ', '.join('{}: {:>.6f}'.format(k, v) for k, v in losses_np.items()) @@ -168,7 +145,7 @@ class U2STTrainer(Trainer): if ctc_loss: valid_losses['val_ctc_loss'].append(float(ctc_loss)) - if (i + 1) % self.config.training.log_interval == 0: + if (i + 1) % self.config.log_interval == 0: valid_dump = {k: np.mean(v) for k, v in valid_losses.items()} valid_dump['val_history_st_loss'] = total_loss / num_seen_utts @@ -197,23 +174,40 @@ class U2STTrainer(Trainer): self.before_train() logger.info(f"Train Total Examples: {len(self.train_loader.dataset)}") - while self.epoch < self.config.training.n_epoch: + while self.epoch < self.config.n_epoch: with Timer("Epoch-Train Time Cost: {}"): self.model.train() try: data_start_time = time.time() for batch_index, batch in enumerate(self.train_loader): dataload_time = time.time() - data_start_time - msg = "Train: Rank: {}, ".format(dist.get_rank()) - msg += "epoch: {}, ".format(self.epoch) - msg += "step: {}, ".format(self.iteration) - msg += "batch : {}/{}, ".format(batch_index + 1, - len(self.train_loader)) - msg += "lr: {:>.8f}, ".format(self.lr_scheduler()) - msg += "data time: {:>.3f}s, ".format(dataload_time) - self.train_batch(batch_index, batch, msg) - self.after_train_batch() - data_start_time = time.time() + msg = "Train:" + observation = OrderedDict() + with ObsScope(observation): + report("Rank", dist.get_rank()) + report("epoch", self.epoch) + report('step', self.iteration) + report("lr", self.lr_scheduler()) + self.train_batch(batch_index, batch, msg) + self.after_train_batch() + report('iter', batch_index + 1) + report('total', len(self.train_loader)) 
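The rewritten u2_st train loop here adopts the reporter pattern already used by u2: report() calls made inside an ObsScope land in the supplied OrderedDict, and the loop then derives batch_cost and ips from them, as the hunk continues below. A minimal sketch with made-up timings, assuming report() simply records into the active scope:

    from collections import OrderedDict
    from paddlespeech.s2t.training.reporter import ObsScope, report

    observation = OrderedDict()
    with ObsScope(observation):
        report('epoch', 1)
        report('batch_size', 32)
        report('reader_cost', 0.012)  # seconds spent fetching the batch
        report('step_cost', 0.205)    # seconds spent in forward/backward

    # Derived metrics, as the loop computes them once the scope closes:
    observation['batch_cost'] = observation['reader_cost'] + observation['step_cost']
    observation['ips,sent./sec'] = observation['batch_size'] / observation['batch_cost']
    print(', '.join(f"{k.split(',')[0]}: {v}" for k, v in observation.items()))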
+ report('reader_cost', dataload_time) + observation['batch_cost'] = observation[ + 'reader_cost'] + observation['step_cost'] + observation['samples'] = observation['batch_size'] + observation['ips,sent./sec'] = observation[ + 'batch_size'] / observation['batch_cost'] + for k, v in observation.items(): + msg += f" {k.split(',')[0]}: " + msg += f"{v:>.8f}" if isinstance(v, + float) else f"{v}" + msg += f" {k.split(',')[1]}" if len( + k.split(',')) == 2 else "" + msg += "," + msg = msg[:-1] # remove the last "," + if (batch_index + 1) % self.config.log_interval == 0: + logger.info(msg) except Exception as e: logger.error(e) raise e @@ -244,95 +238,92 @@ class U2STTrainer(Trainer): def setup_dataloader(self): config = self.config.clone() - config.defrost() - config.collator.keep_transcription_text = False - - # train/valid dataset, return token ids - config.data.manifest = config.data.train_manifest - train_dataset = ManifestDataset.from_config(config) - - config.data.manifest = config.data.dev_manifest - dev_dataset = ManifestDataset.from_config(config) - if config.model.model_conf.asr_weight > 0.: - Collator = TripletSpeechCollator - TestCollator = SpeechCollator - else: - TestCollator = Collator = SpeechCollator - - collate_fn_train = Collator.from_config(config) - config.collator.augmentation_config = "" - collate_fn_dev = Collator.from_config(config) - - if self.parallel: - batch_sampler = SortagradDistributedBatchSampler( - train_dataset, - batch_size=config.collator.batch_size, - num_replicas=None, - rank=None, - shuffle=True, - drop_last=True, - sortagrad=config.collator.sortagrad, - shuffle_method=config.collator.shuffle_method) + load_transcript = True if config.model_conf.asr_weight > 0 else False + + if self.train: + # train/valid dataset, return token ids + self.train_loader = BatchDataLoader( + json_file=config.train_manifest, + train_mode=True, + sortagrad=False, + batch_size=config.batch_size, + maxlen_in=config.maxlen_in, + maxlen_out=config.maxlen_out, + minibatches=0, + mini_batch_size=1, + batch_count='auto', + batch_bins=0, + batch_frames_in=0, + batch_frames_out=0, + batch_frames_inout=0, + preprocess_conf=config. + preprocess_config, # aug will be off when train_mode=False + n_iter_processes=config.num_workers, + subsampling_factor=1, + load_aux_output=load_transcript, + num_encs=1, + dist_sampler=True) + + self.valid_loader = BatchDataLoader( + json_file=config.dev_manifest, + train_mode=False, + sortagrad=False, + batch_size=config.batch_size, + maxlen_in=float('inf'), + maxlen_out=float('inf'), + minibatches=0, + mini_batch_size=1, + batch_count='auto', + batch_bins=0, + batch_frames_in=0, + batch_frames_out=0, + batch_frames_inout=0, + preprocess_conf=config. 
+ preprocess_config, # aug will be off when train_mode=False + n_iter_processes=config.num_workers, + subsampling_factor=1, + load_aux_output=load_transcript, + num_encs=1, + dist_sampler=True) + logger.info("Setup train/valid Dataloader!") else: - batch_sampler = SortagradBatchSampler( - train_dataset, - shuffle=True, - batch_size=config.collator.batch_size, - drop_last=True, - sortagrad=config.collator.sortagrad, - shuffle_method=config.collator.shuffle_method) - self.train_loader = DataLoader( - train_dataset, - batch_sampler=batch_sampler, - collate_fn=collate_fn_train, - num_workers=config.collator.num_workers, ) - self.valid_loader = DataLoader( - dev_dataset, - batch_size=config.collator.batch_size, - shuffle=False, - drop_last=False, - collate_fn=collate_fn_dev, - num_workers=config.collator.num_workers, ) - - # test dataset, return raw text - config.data.manifest = config.data.test_manifest - # filter test examples, will cause less examples, but no mismatch with training - # and can use large batch size , save training time, so filter test egs now. - # config.data.min_input_len = 0.0 # second - # config.data.max_input_len = float('inf') # second - # config.data.min_output_len = 0.0 # tokens - # config.data.max_output_len = float('inf') # tokens - # config.data.min_output_input_ratio = 0.00 - # config.data.max_output_input_ratio = float('inf') - test_dataset = ManifestDataset.from_config(config) - # return text ord id - config.collator.keep_transcription_text = True - config.collator.augmentation_config = "" - self.test_loader = DataLoader( - test_dataset, - batch_size=config.decoding.batch_size, - shuffle=False, - drop_last=False, - collate_fn=TestCollator.from_config(config), - num_workers=config.collator.num_workers, ) - # return text token id - config.collator.keep_transcription_text = False - self.align_loader = DataLoader( - test_dataset, - batch_size=config.decoding.batch_size, - shuffle=False, - drop_last=False, - collate_fn=TestCollator.from_config(config), - num_workers=config.collator.num_workers, ) - logger.info("Setup train/valid/test/align Dataloader!") + # test dataset, return raw text + decode_batch_size = config.get('decode', dict()).get( + 'decode_batch_size', 1) + self.test_loader = BatchDataLoader( + json_file=config.test_manifest, + train_mode=False, + sortagrad=False, + batch_size=decode_batch_size, + maxlen_in=float('inf'), + maxlen_out=float('inf'), + minibatches=0, + mini_batch_size=1, + batch_count='auto', + batch_bins=0, + batch_frames_in=0, + batch_frames_out=0, + batch_frames_inout=0, + preprocess_conf=config. 
+                preprocess_config,  # aug will be off when train_mode=False
+                n_iter_processes=config.num_workers,
+                subsampling_factor=1,
+                num_encs=1,
+                dist_sampler=False)
+
+            logger.info("Setup test Dataloader!")
 
     def setup_model(self):
         config = self.config
-        model_conf = config.model
+        model_conf = config
         with UpdateConfig(model_conf):
-            model_conf.input_dim = self.train_loader.collate_fn.feature_size
-            model_conf.output_dim = self.train_loader.collate_fn.vocab_size
+            if self.train:
+                model_conf.input_dim = self.train_loader.feat_dim
+                model_conf.output_dim = self.train_loader.vocab_size
+            else:
+                model_conf.input_dim = self.test_loader.feat_dim
+                model_conf.output_dim = self.test_loader.vocab_size
 
         model = U2STModel.from_config(model_conf)
@@ -342,41 +333,44 @@ class U2STTrainer(Trainer):
         logger.info(f"{model}")
         layer_tools.print_params(model, logger.info)
 
-        train_config = config.training
+        train_config = config
         optim_type = train_config.optim
         optim_conf = train_config.optim_conf
         scheduler_type = train_config.scheduler
         scheduler_conf = train_config.scheduler_conf
 
-        if scheduler_type == 'expdecaylr':
-            lr_scheduler = paddle.optimizer.lr.ExponentialDecay(
-                learning_rate=optim_conf.lr,
-                gamma=scheduler_conf.lr_decay,
-                verbose=False)
-        elif scheduler_type == 'warmuplr':
-            lr_scheduler = WarmupLR(
-                learning_rate=optim_conf.lr,
-                warmup_steps=scheduler_conf.warmup_steps,
-                verbose=False)
-        elif scheduler_type == 'noam':
-            lr_scheduler = paddle.optimizer.lr.NoamDecay(
-                learning_rate=optim_conf.lr,
-                d_model=model_conf.encoder_conf.output_size,
-                warmup_steps=scheduler_conf.warmup_steps,
-                verbose=False)
-        else:
-            raise ValueError(f"Not support scheduler: {scheduler_type}")
-
-        grad_clip = ClipGradByGlobalNormWithLog(train_config.global_grad_clip)
-        weight_decay = paddle.regularizer.L2Decay(optim_conf.weight_decay)
-        if optim_type == 'adam':
-            optimizer = paddle.optimizer.Adam(
-                learning_rate=lr_scheduler,
-                parameters=model.parameters(),
-                weight_decay=weight_decay,
-                grad_clip=grad_clip)
-        else:
-            raise ValueError(f"Not support optim: {optim_type}")
+        scheduler_args = {
+            "learning_rate": optim_conf.lr,
+            "verbose": False,
+            "warmup_steps": scheduler_conf.warmup_steps,
+            "gamma": scheduler_conf.lr_decay,
+            "d_model": model_conf.encoder_conf.output_size,
+        }
+        lr_scheduler = LRSchedulerFactory.from_args(scheduler_type,
+                                                    scheduler_args)
+
+        def optimizer_args(
+                config,
+                parameters,
+                lr_scheduler=None, ):
+            train_config = config
+            optim_type = train_config.optim
+            optim_conf = train_config.optim_conf
+            scheduler_type = train_config.scheduler
+            scheduler_conf = train_config.scheduler_conf
+            return {
+                "grad_clip": train_config.global_grad_clip,
+                "weight_decay": optim_conf.weight_decay,
+                "learning_rate": lr_scheduler
+                if lr_scheduler else optim_conf.lr,
+                "parameters": parameters,
+                "epsilon": 1e-9 if optim_type == 'noam' else None,
+                "beta1": 0.9 if optim_type == 'noam' else None,
+                "beta2": 0.98 if optim_type == 'noam' else None,
+            }
+
+        optim_args = optimizer_args(config, model.parameters(), lr_scheduler)
+        optimizer = OptimizerFactory.from_args(optim_type, optim_args)
 
         self.model = model
         self.optimizer = optimizer
@@ -385,63 +379,38 @@ class U2STTrainer(Trainer):
 
 class U2STTester(U2STTrainer):
-    @classmethod
-    def params(cls, config: Optional[CfgNode]=None) -> CfgNode:
-        # decoding config
-        default = CfgNode(
-            dict(
-                alpha=2.5,  # Coef of LM for beam search.
-                beta=0.3,  # Coef of WC for beam search.
-                cutoff_prob=1.0,  # Cutoff probability for pruning.
- cutoff_top_n=40, # Cutoff number for pruning. - lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm', # Filepath for language model. - decoding_method='attention', # Decoding method. Options: 'attention', 'ctc_greedy_search', - # 'ctc_prefix_beam_search', 'attention_rescoring' - error_rate_type='bleu', # Error rate type for evaluation. Options `bleu`, 'char_bleu' - num_proc_bsearch=8, # # of CPUs for beam search. - beam_size=10, # Beam search width. - batch_size=16, # decoding batch size - ctc_weight=0.0, # ctc weight for attention rescoring decode mode. - decoding_chunk_size=-1, # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks=-1, # number of left chunks for decoding. Defaults to -1. - simulate_streaming=False, # simulate streaming inference. Defaults to False. - )) - - if config is not None: - config.merge_from_other_cfg(default) - return default - def __init__(self, config, args): super().__init__(config, args) + self.text_feature = TextFeaturizer( + unit_type=self.config.unit_type, + vocab=self.config.vocab_filepath, + spm_model_prefix=self.config.spm_model_prefix) + self.vocab_list = self.text_feature.vocab_list - def ordid2token(self, texts, texts_len): + def id2token(self, texts, texts_len, text_feature): """ ord() id to chr() chr """ trans = [] for text, n in zip(texts, texts_len): n = n.numpy().item() ids = text[:n] - trans.append(''.join([chr(i) for i in ids])) + trans.append(text_feature.defeaturize(ids.numpy().tolist())) return trans def translate(self, audio, audio_len): """"E2E translation from extracted audio feature""" - cfg = self.config.decoding - text_feature = self.test_loader.collate_fn.text_feature + decode_cfg = self.config.decode self.model.eval() hyps = self.model.decode( audio, audio_len, - text_feature=text_feature, - decoding_method=cfg.decoding_method, - beam_size=cfg.beam_size, - word_reward=cfg.word_reward, - decoding_chunk_size=cfg.decoding_chunk_size, - num_decoding_left_chunks=cfg.num_decoding_left_chunks, - simulate_streaming=cfg.simulate_streaming) + text_feature=self.text_feature, + decoding_method=decode_cfg.decoding_method, + beam_size=decode_cfg.beam_size, + word_reward=decode_cfg.word_reward, + decoding_chunk_size=decode_cfg.decoding_chunk_size, + num_decoding_left_chunks=decode_cfg.num_decoding_left_chunks, + simulate_streaming=decode_cfg.simulate_streaming) return hyps def compute_translation_metrics(self, @@ -452,27 +421,24 @@ class U2STTester(U2STTrainer): texts_len, bleu_func, fout=None): - cfg = self.config.decoding + decode_cfg = self.config.decode len_refs, num_ins = 0, 0 start_time = time.time() - text_feature = self.test_loader.collate_fn.text_feature - refs = [ - "".join(chr(t) for t in text[:text_len]) - for text, text_len in zip(texts, texts_len) - ] + refs = self.id2token(texts, texts_len, self.text_feature) hyps = self.model.decode( audio, audio_len, - text_feature=text_feature, - decoding_method=cfg.decoding_method, - beam_size=cfg.beam_size, - word_reward=cfg.word_reward, - decoding_chunk_size=cfg.decoding_chunk_size, - num_decoding_left_chunks=cfg.num_decoding_left_chunks, - simulate_streaming=cfg.simulate_streaming) + text_feature=self.text_feature, + decoding_method=decode_cfg.decoding_method, + beam_size=decode_cfg.beam_size, + word_reward=decode_cfg.word_reward, + decoding_chunk_size=decode_cfg.decoding_chunk_size, + 
num_decoding_left_chunks=decode_cfg.num_decoding_left_chunks,
+            simulate_streaming=decode_cfg.simulate_streaming)
+
         decode_time = time.time() - start_time
 
         for utt, target, result in zip(utts, refs, hyps):
@@ -502,10 +468,10 @@ class U2STTester(U2STTrainer):
         self.model.eval()
         logger.info(f"Test Total Examples: {len(self.test_loader.dataset)}")
 
-        cfg = self.config.decoding
-        bleu_func = bleu_score.char_bleu if cfg.error_rate_type == 'char-bleu' else bleu_score.bleu
+        decode_cfg = self.config.decode
+        bleu_func = bleu_score.char_bleu if decode_cfg.error_rate_type == 'char-bleu' else bleu_score.bleu
 
-        stride_ms = self.test_loader.collate_fn.stride_ms
+        stride_ms = self.config.stride_ms
 
         hyps, refs = [], []
         len_refs, num_ins = 0, 0
         num_frames = 0.0
@@ -522,7 +488,8 @@ class U2STTester(U2STTrainer):
             len_refs += metrics['len_refs']
             num_ins += metrics['num_ins']
             rtf = num_time / (num_frames * stride_ms)
-            logger.info("RTF: %f, BLEU (%d) = %f" % (rtf, num_ins, bleu))
+            logger.info("RTF: %f, instance (%d), batch BLEU = %f" %
+                        (rtf, num_ins, bleu))
 
         rtf = num_time / (num_frames * stride_ms)
         msg = "Test: "
@@ -549,17 +516,10 @@ class U2STTester(U2STTrainer):
                 "num_examples": num_ins,
                 "decode_method":
-                self.config.decoding.decoding_method,
+                self.config.decode.decoding_method,
             })
             f.write(data + '\n')
 
-    @paddle.no_grad()
-    def align(self):
-        ctc_utils.ctc_align(self.config, self.model, self.align_loader,
-                            self.config.decoding.batch_size,
-                            self.config.collator.stride_ms, self.vocab_list,
-                            self.args.result_file)
-
     def load_inferspec(self):
         """infer model and input spec.
 
@@ -567,11 +527,11 @@ class U2STTester(U2STTrainer):
             nn.Layer: inference model
             List[paddle.static.InputSpec]: input spec.
         """
-        from paddlespeech.s2t.models.u2 import U2InferModel
-        infer_model = U2InferModel.from_pretrained(self.test_loader,
-                                                   self.config.model.clone(),
-                                                   self.args.checkpoint_path)
-        feat_dim = self.test_loader.collate_fn.feature_size
+        from paddlespeech.s2t.models.u2_st import U2STInferModel
+        infer_model = U2STInferModel.from_pretrained(self.test_loader,
+                                                     self.config.clone(),
+                                                     self.args.checkpoint_path)
+        feat_dim = self.test_loader.feat_dim
         input_spec = [
             paddle.static.InputSpec(shape=[1, None, feat_dim], dtype='float32'),  # audio, [B,T,D]
diff --git a/paddlespeech/s2t/frontend/normalizer.py b/paddlespeech/s2t/frontend/normalizer.py
index 017851e6344a83932f92e7f20f6fa680b4d650f1..b596b2ab09ffc9dc60fd52bfb43ebaa1be7a730b 100644
--- a/paddlespeech/s2t/frontend/normalizer.py
+++ b/paddlespeech/s2t/frontend/normalizer.py
@@ -117,7 +117,8 @@ class FeatureNormalizer(object):
             self._compute_mean_std(manifest_path, featurize_func, num_samples,
                                    num_workers)
         else:
-            self._read_mean_std_from_file(mean_std_filepath)
+            mean_std = mean_std_filepath
+            self._read_mean_std_from_file(mean_std)
 
     def apply(self, features):
         """Normalize features to be of zero mean and unit stddev.
@@ -131,10 +132,14 @@ class FeatureNormalizer(object): """ return (features - self._mean) * self._istd - def _read_mean_std_from_file(self, filepath, eps=1e-20): + def _read_mean_std_from_file(self, mean_std, eps=1e-20): """Load mean and std from file.""" - filetype = filepath.split(".")[-1] - mean, istd = load_cmvn(filepath, filetype=filetype) + if isinstance(mean_std, list): + mean = mean_std[0]['cmvn_stats']['mean'] + istd = mean_std[0]['cmvn_stats']['istd'] + else: + filetype = mean_std.split(".")[-1] + mean, istd = load_cmvn(mean_std, filetype=filetype) self._mean = np.expand_dims(mean, axis=0) self._istd = np.expand_dims(istd, axis=0) diff --git a/paddlespeech/s2t/io/collator.py b/paddlespeech/s2t/io/collator.py index 5f2335496c6c1cfddb8a62874d328fd51084aec9..b99fc80c023f72e401726fc347066c5bd569b40a 100644 --- a/paddlespeech/s2t/io/collator.py +++ b/paddlespeech/s2t/io/collator.py @@ -12,10 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. import io -from typing import Optional import numpy as np -from yacs.config import CfgNode from paddlespeech.s2t.frontend.augmentor.augmentation import AugmentationPipeline from paddlespeech.s2t.frontend.featurizer.speech_featurizer import SpeechFeaturizer @@ -219,33 +217,6 @@ class SpeechCollatorBase(): class SpeechCollator(SpeechCollatorBase): - @classmethod - def params(cls, config: Optional[CfgNode]=None) -> CfgNode: - default = CfgNode( - dict( - augmentation_config="", - random_seed=0, - mean_std_filepath="", - unit_type="char", - vocab_filepath="", - spm_model_prefix="", - spectrum_type='linear', # 'linear', 'mfcc', 'fbank' - feat_dim=0, # 'mfcc', 'fbank' - delta_delta=False, # 'mfcc', 'fbank' - stride_ms=10.0, # ms - window_ms=20.0, # ms - n_fft=None, # fft points - max_freq=None, # None for samplerate/2 - target_sample_rate=16000, # target sample rate - use_dB_normalization=True, - target_dB=-20, - dither=1.0, # feature dither - keep_transcription_text=False)) - - if config is not None: - config.merge_from_other_cfg(default) - return default - @classmethod def from_config(cls, config): """Build a SpeechCollator object from a config. @@ -256,45 +227,43 @@ class SpeechCollator(SpeechCollatorBase): Returns: SpeechCollator: collator object. 
""" - assert 'augmentation_config' in config.collator - assert 'keep_transcription_text' in config.collator - assert 'mean_std_filepath' in config.collator - assert 'vocab_filepath' in config.collator - assert 'spectrum_type' in config.collator - assert 'n_fft' in config.collator - assert config.collator - - if isinstance(config.collator.augmentation_config, (str, bytes)): - if config.collator.augmentation_config: + assert 'augmentation_config' in config + assert 'keep_transcription_text' in config + assert 'mean_std_filepath' in config + assert 'vocab_filepath' in config + assert 'spectrum_type' in config + assert 'n_fft' in config + assert config + + if isinstance(config.augmentation_config, (str, bytes)): + if config.augmentation_config: aug_file = io.open( - config.collator.augmentation_config, - mode='r', - encoding='utf8') + config.augmentation_config, mode='r', encoding='utf8') else: aug_file = io.StringIO(initial_value='{}', newline='') else: - aug_file = config.collator.augmentation_config + aug_file = config.augmentation_config assert isinstance(aug_file, io.StringIO) speech_collator = cls( aug_file=aug_file, random_seed=0, - mean_std_filepath=config.collator.mean_std_filepath, - unit_type=config.collator.unit_type, - vocab_filepath=config.collator.vocab_filepath, - spm_model_prefix=config.collator.spm_model_prefix, - spectrum_type=config.collator.spectrum_type, - feat_dim=config.collator.feat_dim, - delta_delta=config.collator.delta_delta, - stride_ms=config.collator.stride_ms, - window_ms=config.collator.window_ms, - n_fft=config.collator.n_fft, - max_freq=config.collator.max_freq, - target_sample_rate=config.collator.target_sample_rate, - use_dB_normalization=config.collator.use_dB_normalization, - target_dB=config.collator.target_dB, - dither=config.collator.dither, - keep_transcription_text=config.collator.keep_transcription_text) + mean_std_filepath=config.mean_std_filepath, + unit_type=config.unit_type, + vocab_filepath=config.vocab_filepath, + spm_model_prefix=config.spm_model_prefix, + spectrum_type=config.spectrum_type, + feat_dim=config.feat_dim, + delta_delta=config.delta_delta, + stride_ms=config.stride_ms, + window_ms=config.window_ms, + n_fft=config.n_fft, + max_freq=config.max_freq, + target_sample_rate=config.target_sample_rate, + use_dB_normalization=config.use_dB_normalization, + target_dB=config.target_dB, + dither=config.dither, + keep_transcription_text=config.keep_transcription_text) return speech_collator diff --git a/paddlespeech/s2t/io/converter.py b/paddlespeech/s2t/io/converter.py index b217d2b1b4028bd0218561127bc4fa7409de6f45..a802ac7490ce57bed3529ce78e3eb6112e3dd492 100644 --- a/paddlespeech/s2t/io/converter.py +++ b/paddlespeech/s2t/io/converter.py @@ -31,11 +31,17 @@ class CustomConverter(): """ - def __init__(self, subsampling_factor=1, dtype=np.float32): + def __init__(self, + subsampling_factor=1, + dtype=np.float32, + load_aux_input=False, + load_aux_output=False): """Construct a CustomConverter object.""" self.subsampling_factor = subsampling_factor self.ignore_id = -1 self.dtype = dtype + self.load_aux_input = load_aux_input + self.load_aux_output = load_aux_output def __call__(self, batch): """Transform a batch and send it to a device. @@ -49,34 +55,53 @@ class CustomConverter(): """ # batch should be located in list assert len(batch) == 1 - (xs, ys), utts = batch[0] - assert xs[0] is not None, "please check Reader and Augmentation impl." 
- - # perform subsampling - if self.subsampling_factor > 1: - xs = [x[::self.subsampling_factor, :] for x in xs] - - # get batch of lengths of input sequences - ilens = np.array([x.shape[0] for x in xs]) - - # perform padding and convert to tensor - # currently only support real number - if xs[0].dtype.kind == "c": - xs_pad_real = pad_list([x.real for x in xs], 0).astype(self.dtype) - xs_pad_imag = pad_list([x.imag for x in xs], 0).astype(self.dtype) - # Note(kamo): - # {'real': ..., 'imag': ...} will be changed to ComplexTensor in E2E. - # Don't create ComplexTensor and give it E2E here - # because torch.nn.DataParellel can't handle it. - xs_pad = {"real": xs_pad_real, "imag": xs_pad_imag} - else: - xs_pad = pad_list(xs, 0).astype(self.dtype) + data, utts = batch[0] + xs_data, ys_data = [], [] + for ud in data: + if ud[0].ndim > 1: + # speech data (input): (speech_len, feat_dim) + xs_data.append(ud) + else: + # text data (output): (text_len, ) + ys_data.append(ud) + + assert xs_data[0][ + 0] is not None, "please check Reader and Augmentation impl." + + xs_pad, ilens = [], [] + for xs in xs_data: + # perform subsampling + if self.subsampling_factor > 1: + xs = [x[::self.subsampling_factor, :] for x in xs] + + # get batch of lengths of input sequences + ilens.append(np.array([x.shape[0] for x in xs])) + + # perform padding and convert to tensor + # currently only support real number + xs_pad.append(pad_list(xs, 0).astype(self.dtype)) + + if not self.load_aux_input: + xs_pad, ilens = xs_pad[0], ilens[0] + break # NOTE: this is for multi-output (e.g., speech translation) - ys_pad = pad_list( - [np.array(y[0][:]) if isinstance(y, tuple) else y for y in ys], - self.ignore_id) + ys_pad, olens = [], [] + + for ys in ys_data: + ys_pad.append( + pad_list([ + np.array(y[0][:]) if isinstance(y, tuple) else y for y in ys + ], self.ignore_id)) + + olens.append( + np.array([ + y[0].shape[0] if isinstance(y, tuple) else y.shape[0] + for y in ys + ])) + + if not self.load_aux_output: + ys_pad, olens = ys_pad[0], olens[0] + break - olens = np.array( - [y[0].shape[0] if isinstance(y, tuple) else y.shape[0] for y in ys]) return utts, xs_pad, ilens, ys_pad, olens diff --git a/paddlespeech/s2t/io/dataloader.py b/paddlespeech/s2t/io/dataloader.py index b8eb33679dfadfb155ae4be9cce54876fe991666..920de34fc2144aa00ab988257b502a73cae31f33 100644 --- a/paddlespeech/s2t/io/dataloader.py +++ b/paddlespeech/s2t/io/dataloader.py @@ -18,7 +18,9 @@ from typing import Text import jsonlines import numpy as np +from paddle.io import BatchSampler from paddle.io import DataLoader +from paddle.io import DistributedBatchSampler from paddlespeech.s2t.io.batchfy import make_batchset from paddlespeech.s2t.io.converter import CustomConverter @@ -73,7 +75,11 @@ class BatchDataLoader(): preprocess_conf=None, n_iter_processes: int=1, subsampling_factor: int=1, - num_encs: int=1): + load_aux_input: bool=False, + load_aux_output: bool=False, + num_encs: int=1, + dist_sampler: bool=False, + shortest_first: bool=False): self.json_file = json_file self.train_mode = train_mode self.use_sortagrad = sortagrad == -1 or sortagrad > 0 @@ -89,6 +95,10 @@ class BatchDataLoader(): self.num_encs = num_encs self.preprocess_conf = preprocess_conf self.n_iter_processes = n_iter_processes + self.load_aux_input = load_aux_input + self.load_aux_output = load_aux_output + self.dist_sampler = dist_sampler + self.shortest_first = shortest_first # read json data with jsonlines.open(json_file, 'r') as reader: @@ -105,7 +115,7 @@ class BatchDataLoader(): 
maxlen_out, minibatches, # for debug min_batch_size=mini_batch_size, - shortest_first=self.use_sortagrad, + shortest_first=self.shortest_first or self.use_sortagrad, count=batch_count, batch_bins=batch_bins, batch_frames_in=batch_frames_in, @@ -126,21 +136,36 @@ class BatchDataLoader(): # Setup a converter if num_encs == 1: self.converter = CustomConverter( - subsampling_factor=subsampling_factor, dtype=np.float32) + subsampling_factor=subsampling_factor, + dtype=np.float32, + load_aux_input=load_aux_input, + load_aux_output=load_aux_output) else: assert NotImplementedError("not impl CustomConverterMulEnc.") # hack to make batchsize argument as 1 # actual bathsize is included in a list - # default collate function converts numpy array to pytorch tensor + # default collate function converts numpy array to paddle tensor # we used an empty collate function instead which returns list self.dataset = TransformDataset(self.minibaches, self.converter, self.reader) + if self.dist_sampler: + self.batch_sampler = DistributedBatchSampler( + dataset=self.dataset, + batch_size=1, + shuffle=not self.use_sortagrad if self.train_mode else False, + drop_last=False, ) + else: + self.batch_sampler = BatchSampler( + dataset=self.dataset, + batch_size=1, + shuffle=not self.use_sortagrad if self.train_mode else False, + drop_last=False, ) + self.dataloader = DataLoader( dataset=self.dataset, - batch_size=1, - shuffle=not self.use_sortagrad if self.train_mode else False, + batch_sampler=self.batch_sampler, collate_fn=batch_collate, num_workers=self.n_iter_processes, ) @@ -168,5 +193,9 @@ class BatchDataLoader(): echo += f"subsampling_factor: {self.subsampling_factor}, " echo += f"num_encs: {self.num_encs}, " echo += f"num_workers: {self.n_iter_processes}, " + echo += f"load_aux_input: {self.load_aux_input}, " + echo += f"load_aux_output: {self.load_aux_output}, " + echo += f"dist_sampler: {self.dist_sampler}, " + echo += f"shortest_first: {self.shortest_first}, " echo += f"file: {self.json_file}" return echo diff --git a/paddlespeech/s2t/io/dataset.py b/paddlespeech/s2t/io/dataset.py index d64d7d3ec16527c7b1f18e0f7c439d14aff3b0ad..0e94f047bce7ad053ecd566f4a8d8c83a1b10a7c 100644 --- a/paddlespeech/s2t/io/dataset.py +++ b/paddlespeech/s2t/io/dataset.py @@ -13,11 +13,8 @@ # limitations under the License. # Modified from espnet(https://github.com/espnet/espnet) # Modified from wenet(https://github.com/wenet-e2e/wenet) -from typing import Optional - import jsonlines from paddle.io import Dataset -from yacs.config import CfgNode from paddlespeech.s2t.frontend.utility import read_manifest from paddlespeech.s2t.utils.log import Log @@ -28,22 +25,6 @@ logger = Log(__name__).getlog() class ManifestDataset(Dataset): - @classmethod - def params(cls, config: Optional[CfgNode]=None) -> CfgNode: - default = CfgNode( - dict( - manifest="", - max_input_len=27.0, - min_input_len=0.0, - max_output_len=float('inf'), - min_output_len=0.0, - max_output_input_ratio=float('inf'), - min_output_input_ratio=0.0, )) - - if config is not None: - config.merge_from_other_cfg(default) - return default - @classmethod def from_config(cls, config): """Build a ManifestDataset object from a config. @@ -54,17 +35,17 @@ class ManifestDataset(Dataset): Returns: ManifestDataset: dataet object. 
""" - assert 'manifest' in config.data - assert config.data.manifest + assert 'manifest' in config + assert config.manifest dataset = cls( - manifest_path=config.data.manifest, - max_input_len=config.data.max_input_len, - min_input_len=config.data.min_input_len, - max_output_len=config.data.max_output_len, - min_output_len=config.data.min_output_len, - max_output_input_ratio=config.data.max_output_input_ratio, - min_output_input_ratio=config.data.min_output_input_ratio, ) + manifest_path=config.manifest, + max_input_len=config.max_input_len, + min_input_len=config.min_input_len, + max_output_len=config.max_output_len, + min_output_len=config.min_output_len, + max_output_input_ratio=config.max_output_input_ratio, + min_output_input_ratio=config.min_output_input_ratio, ) return dataset def __init__(self, diff --git a/paddlespeech/s2t/io/reader.py b/paddlespeech/s2t/io/reader.py index 38ff1396389f55c5dfcd8f42656483e5981a3e54..4e136bdce1d9b5490dadf58ab6359e6430121ced 100644 --- a/paddlespeech/s2t/io/reader.py +++ b/paddlespeech/s2t/io/reader.py @@ -68,7 +68,7 @@ class LoadInputsAndTargets(): if mode not in ["asr"]: raise ValueError("Only asr are allowed: mode={}".format(mode)) - if preprocess_conf is not None: + if preprocess_conf: self.preprocessing = Transformation(preprocess_conf) logger.warning( "[Experimental feature] Some preprocessing will be done " @@ -82,12 +82,11 @@ class LoadInputsAndTargets(): self.load_output = load_output self.load_input = load_input self.sort_in_input_length = sort_in_input_length - if preprocess_args is None: - self.preprocess_args = {} - else: + if preprocess_args: assert isinstance(preprocess_args, dict), type(preprocess_args) self.preprocess_args = dict(preprocess_args) - + else: + self.preprocess_args = {} self.keep_all_data_on_mem = keep_all_data_on_mem def __call__(self, batch, return_uttid=False): diff --git a/paddlespeech/s2t/models/ds2/deepspeech2.py b/paddlespeech/s2t/models/ds2/deepspeech2.py index 0dfaec29cd7762904a63676a694f7478fcbbb093..4a4d67ce97492e2df5953411d22ecf879c526afd 100644 --- a/paddlespeech/s2t/models/ds2/deepspeech2.py +++ b/paddlespeech/s2t/models/ds2/deepspeech2.py @@ -12,11 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. """Deepspeech2 ASR Model""" -from typing import Optional - import paddle from paddle import nn -from yacs.config import CfgNode from paddlespeech.s2t.models.ds2.conv import ConvStack from paddlespeech.s2t.models.ds2.rnn import RNNStack @@ -120,20 +117,6 @@ class DeepSpeech2Model(nn.Layer): :rtype: tuple of LayerOutput """ - @classmethod - def params(cls, config: Optional[CfgNode]=None) -> CfgNode: - default = CfgNode( - dict( - num_conv_layers=2, #Number of stacking convolution layers. - num_rnn_layers=3, #Number of stacking RNN layers. - rnn_layer_size=1024, #RNN layer size (number of RNN cells). - use_gru=True, #Use gru if set True. Use simple rnn if set False. - share_rnn_weights=True, #Whether to share input-hidden weights between forward and backward directional RNNs.Notice that for GRU, weight sharing is not supported. 
-                ctc_grad_norm_type=None, ))
-        if config is not None:
-            config.merge_from_other_cfg(default)
-        return default
-
     def __init__(self,
                  feat_size,
                  dict_size,
@@ -168,13 +151,13 @@ class DeepSpeech2Model(nn.Layer):
         """Compute Model loss
 
         Args:
-            audio (Tenosr): [B, T, D]
+            audio (Tensor): [B, T, D]
             audio_len (Tensor): [B]
             text (Tensor): [B, U]
             text_len (Tensor): [B]
 
         Returns:
-            loss (Tenosr): [1]
+            loss (Tensor): [1]
         """
         eouts, eouts_len = self.encoder(audio, audio_len)
         loss = self.decoder(eouts, eouts_len, text, text_len)
@@ -221,12 +204,12 @@ class DeepSpeech2Model(nn.Layer):
         model = cls(
             feat_size=dataloader.collate_fn.feature_size,
             dict_size=dataloader.collate_fn.vocab_size,
-            num_conv_layers=config.model.num_conv_layers,
-            num_rnn_layers=config.model.num_rnn_layers,
-            rnn_size=config.model.rnn_layer_size,
-            use_gru=config.model.use_gru,
-            share_rnn_weights=config.model.share_rnn_weights,
-            blank_id=config.model.blank_id,
+            num_conv_layers=config.num_conv_layers,
+            num_rnn_layers=config.num_rnn_layers,
+            rnn_size=config.rnn_layer_size,
+            use_gru=config.use_gru,
+            share_rnn_weights=config.share_rnn_weights,
+            blank_id=config.blank_id,
             ctc_grad_norm_type=config.get('ctc_grad_norm_type', None), )
         infos = Checkpoint().load_parameters(
             model, checkpoint_path=checkpoint_path)
@@ -240,7 +223,7 @@ class DeepSpeech2Model(nn.Layer):
 
         Parameters
         config: yacs.config.CfgNode
-            config.model
+            config
         Returns
         -------
         DeepSpeech2Model
diff --git a/paddlespeech/s2t/models/ds2_online/deepspeech2.py b/paddlespeech/s2t/models/ds2_online/deepspeech2.py
index 85876bce8abd4162c2076e6cb2c3de0fce384f36..5e4981c06ce60257dc44dfa561a7b49ad9e785bc 100644
--- a/paddlespeech/s2t/models/ds2_online/deepspeech2.py
+++ b/paddlespeech/s2t/models/ds2_online/deepspeech2.py
@@ -12,12 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Deepspeech2 ASR Online Model"""
-from typing import Optional
-
 import paddle
 import paddle.nn.functional as F
 from paddle import nn
-from yacs.config import CfgNode
 
 from paddlespeech.s2t.models.ds2_online.conv import Conv2dSubsampling4Online
 from paddlespeech.s2t.modules.ctc import CTCDecoder
@@ -244,22 +241,6 @@ class DeepSpeech2ModelOnline(nn.Layer):
     :rtype: tuple of LayerOutput
     """
 
-    @classmethod
-    def params(cls, config: Optional[CfgNode]=None) -> CfgNode:
-        default = CfgNode(
-            dict(
-                num_conv_layers=2,  #Number of stacking convolution layers.
-                num_rnn_layers=4,  #Number of stacking RNN layers.
-                rnn_layer_size=1024,  #RNN layer size (number of RNN cells).
-                num_fc_layers=2,
-                fc_layers_size_list=[512, 256],
-                use_gru=True,  #Use gru if set True. Use simple rnn if set False.
- blank_id=0, # index of blank in vocob.txt - ctc_grad_norm_type=None, )) - if config is not None: - config.merge_from_other_cfg(default) - return default - def __init__( self, feat_size, @@ -298,13 +279,13 @@ class DeepSpeech2ModelOnline(nn.Layer): """Compute Model loss Args: - audio (Tenosr): [B, T, D] + audio (Tensor): [B, T, D] audio_len (Tensor): [B] text (Tensor): [B, U] text_len (Tensor): [B] Returns: - loss (Tenosr): [1] + loss (Tensor): [1] """ eouts, eouts_len, final_state_h_box, final_state_c_box = self.encoder( audio, audio_len, None, None) @@ -353,14 +334,14 @@ class DeepSpeech2ModelOnline(nn.Layer): model = cls( feat_size=dataloader.collate_fn.feature_size, dict_size=dataloader.collate_fn.vocab_size, - num_conv_layers=config.model.num_conv_layers, - num_rnn_layers=config.model.num_rnn_layers, - rnn_size=config.model.rnn_layer_size, - rnn_direction=config.model.rnn_direction, - num_fc_layers=config.model.num_fc_layers, - fc_layers_size_list=config.model.fc_layers_size_list, - use_gru=config.model.use_gru, - blank_id=config.model.blank_id, + num_conv_layers=config.num_conv_layers, + num_rnn_layers=config.num_rnn_layers, + rnn_size=config.rnn_layer_size, + rnn_direction=config.rnn_direction, + num_fc_layers=config.num_fc_layers, + fc_layers_size_list=config.fc_layers_size_list, + use_gru=config.use_gru, + blank_id=config.blank_id, ctc_grad_norm_type=config.get('ctc_grad_norm_type', None), ) infos = Checkpoint().load_parameters( model, checkpoint_path=checkpoint_path) @@ -374,7 +355,7 @@ class DeepSpeech2ModelOnline(nn.Layer): Parameters config: yacs.config.CfgNode - config.model + config Returns ------- DeepSpeech2ModelOnline diff --git a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py index 2d6fb21807792cc3ddd73b3cb065cfcbb7728e54..ff4012e8eefb5f44cbbcfa1adfb4da8ad4754062 100644 --- a/paddlespeech/s2t/models/u2/u2.py +++ b/paddlespeech/s2t/models/u2/u2.py @@ -26,7 +26,6 @@ from typing import Tuple import paddle from paddle import jit from paddle import nn -from yacs.config import CfgNode from paddlespeech.s2t.decoders.scorers.ctc import CTCPrefixScorer from paddlespeech.s2t.frontend.utility import IGNORE_ID @@ -60,56 +59,6 @@ logger = Log(__name__).getlog() class U2BaseModel(ASRInterface, nn.Layer): """CTC-Attention hybrid Encoder-Decoder model""" - @classmethod - def params(cls, config: Optional[CfgNode]=None) -> CfgNode: - # network architecture - default = CfgNode() - # allow add new item when merge_with_file - default.cmvn_file = "" - default.cmvn_file_type = "json" - default.input_dim = 0 - default.output_dim = 0 - # encoder related - default.encoder = 'transformer' - default.encoder_conf = CfgNode( - dict( - output_size=256, # dimension of attention - attention_heads=4, - linear_units=2048, # the number of units of position-wise feed forward - num_blocks=12, # the number of encoder blocks - dropout_rate=0.1, - positional_dropout_rate=0.1, - attention_dropout_rate=0.0, - input_layer='conv2d', # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before=True, - # use_cnn_module=True, - # cnn_module_kernel=15, - # activation_type='swish', - # pos_enc_layer_type='rel_pos', - # selfattention_layer_type='rel_selfattn', - )) - # decoder related - default.decoder = 'transformer' - default.decoder_conf = CfgNode( - dict( - attention_heads=4, - linear_units=2048, - num_blocks=6, - dropout_rate=0.1, - positional_dropout_rate=0.1, - self_attention_dropout_rate=0.0, - src_attention_dropout_rate=0.0, )) - # hybrid CTC/attention - 
default.model_conf = CfgNode( - dict( - ctc_weight=0.3, - lsm_weight=0.1, # label smoothing option - length_normalized_loss=False, )) - - if config is not None: - config.merge_from_other_cfg(default) - return default - def __init__(self, vocab_size: int, encoder: TransformerEncoder, @@ -584,8 +533,9 @@ class U2BaseModel(ASRInterface, nn.Layer): hyp_content = hyp[0] # Prevent the hyp is empty if len(hyp_content) == 0: - hyp_content = (self.ctc.blank_id,) - hyp_content = paddle.to_tensor(hyp_content, place=device, dtype=paddle.long) + hyp_content = (self.ctc.blank_id, ) + hyp_content = paddle.to_tensor( + hyp_content, place=device, dtype=paddle.long) hyp_list.append(hyp_content) hyps_pad = pad_sequence(hyp_list, True, self.ignore_id) hyps_lens = paddle.to_tensor( @@ -730,8 +680,8 @@ class U2BaseModel(ASRInterface, nn.Layer): """u2 decoding. Args: - feats (Tenosr): audio features, (B, T, D) - feats_lengths (Tenosr): (B) + feats (Tensor): audio features, (B, T, D) + feats_lengths (Tensor): (B) text_feature (TextFeaturizer): text feature object. decoding_method (str): decoding mode, e.g. 'attention', 'ctc_greedy_search', diff --git a/paddlespeech/s2t/models/u2_st/u2_st.py b/paddlespeech/s2t/models/u2_st/u2_st.py index 8b07e389d011cdc410c10c6abe1ba194e40389dd..79ca423f8740bde850e8c1b71a36856ea54f3d5e 100644 --- a/paddlespeech/s2t/models/u2_st/u2_st.py +++ b/paddlespeech/s2t/models/u2_st/u2_st.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """U2 ASR Model -Unified Streaming and Non-streaming Two-pass End-to-end Model for Speech Recognition +Unified Streaming and Non-streaming Two-pass End-to-end Model for Speech Recognition (https://arxiv.org/pdf/2012.05481.pdf) """ import time @@ -24,7 +24,6 @@ from typing import Tuple import paddle from paddle import jit from paddle import nn -from yacs.config import CfgNode from paddlespeech.s2t.frontend.utility import IGNORE_ID from paddlespeech.s2t.frontend.utility import load_cmvn @@ -52,57 +51,6 @@ logger = Log(__name__).getlog() class U2STBaseModel(nn.Layer): """CTC-Attention hybrid Encoder-Decoder model""" - @classmethod - def params(cls, config: Optional[CfgNode]=None) -> CfgNode: - # network architecture - default = CfgNode() - # allow add new item when merge_with_file - default.cmvn_file = "" - default.cmvn_file_type = "json" - default.input_dim = 0 - default.output_dim = 0 - # encoder related - default.encoder = 'transformer' - default.encoder_conf = CfgNode( - dict( - output_size=256, # dimension of attention - attention_heads=4, - linear_units=2048, # the number of units of position-wise feed forward - num_blocks=12, # the number of encoder blocks - dropout_rate=0.1, - positional_dropout_rate=0.1, - attention_dropout_rate=0.0, - input_layer='conv2d', # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before=True, - # use_cnn_module=True, - # cnn_module_kernel=15, - # activation_type='swish', - # pos_enc_layer_type='rel_pos', - # selfattention_layer_type='rel_selfattn', - )) - # decoder related - default.decoder = 'transformer' - default.decoder_conf = CfgNode( - dict( - attention_heads=4, - linear_units=2048, - num_blocks=6, - dropout_rate=0.1, - positional_dropout_rate=0.1, - self_attention_dropout_rate=0.0, - src_attention_dropout_rate=0.0, )) - # hybrid CTC/attention - default.model_conf = CfgNode( - dict( - asr_weight=0.0, - ctc_weight=0.0, - lsm_weight=0.1, # label smoothing option - length_normalized_loss=False, )) - - if config is not None: 
- config.merge_from_other_cfg(default) - return default - def __init__(self, vocab_size: int, encoder: TransformerEncoder, @@ -289,8 +237,8 @@ class U2STBaseModel(nn.Layer): simulate_streaming (bool, optional): streaming or not. Defaults to False. Returns: - Tuple[paddle.Tensor, paddle.Tensor]: - encoder hiddens (B, Tmax, D), + Tuple[paddle.Tensor, paddle.Tensor]: + encoder hiddens (B, Tmax, D), encoder hiddens mask (B, 1, Tmax). """ # Let's assume B = batch_size @@ -530,24 +478,24 @@ class U2STBaseModel(nn.Layer): """u2 decoding. Args: - feats (Tenosr): audio features, (B, T, D) - feats_lengths (Tenosr): (B) + feats (Tensor): audio features, (B, T, D) + feats_lengths (Tensor): (B) text_feature (TextFeaturizer): text feature object. - decoding_method (str): decoding mode, e.g. - 'fullsentence', + decoding_method (str): decoding mode, e.g. + 'fullsentence', 'simultaneous' beam_size (int): beam size for search decoding_chunk_size (int, optional): decoding chunk size. Defaults to -1. <0: for decoding, use full chunk. >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here. - num_decoding_left_chunks (int, optional): + 0: used for training, it's prohibited here. + num_decoding_left_chunks (int, optional): number of left chunks for decoding. Defaults to -1. simulate_streaming (bool, optional): simulate streaming inference. Defaults to False. Raises: ValueError: when not support decoding_method. - + Returns: List[List[int]]: transcripts. """ @@ -601,7 +549,7 @@ class U2STModel(U2STBaseModel): ValueError: raise when using not support encoder type. Returns: - int, nn.Layer, nn.Layer, nn.Layer: vocab size, encoder, decoder, ctc + int, nn.Layer, nn.Layer, nn.Layer: vocab size, encoder, decoder, ctc """ if configs['cmvn_file'] is not None: mean, istd = load_cmvn(configs['cmvn_file'], diff --git a/paddlespeech/s2t/modules/ctc.py b/paddlespeech/s2t/modules/ctc.py index ffc9f038736d89224abb0275d3fe24ceb4a3ed71..1f98380771eb1e67ca186b369e5ad13eb7f1c463 100644 --- a/paddlespeech/s2t/modules/ctc.py +++ b/paddlespeech/s2t/modules/ctc.py @@ -39,10 +39,6 @@ except ImportError: except Exception as e: logger.info("paddlespeech_ctcdecoders not installed!") -#try: -#except Exception as e: -# logger.info("ctcdecoder not installed!") - __all__ = ['CTCDecoder'] @@ -85,10 +81,10 @@ class CTCDecoderBase(nn.Layer): Args: hs_pad (Tensor): batch of padded hidden state sequences (B, Tmax, D) hlens (Tensor): batch of lengths of hidden state sequences (B) - ys_pad (Tenosr): batch of padded character id sequence tensor (B, Lmax) + ys_pad (Tensor): batch of padded character id sequence tensor (B, Lmax) ys_lens (Tensor): batch of lengths of character sequence (B) Returns: - loss (Tenosr): ctc loss value, scalar. + loss (Tensor): ctc loss value, scalar. """ logits = self.ctc_lo(self.dropout(hs_pad)) loss = self.criterion(logits, ys_pad, hlens, ys_lens) @@ -256,8 +252,8 @@ class CTCDecoder(CTCDecoderBase): """ctc decoding with probs. 
Args: - probs (Tenosr): activation after softmax - logits_lens (Tenosr): audio output lens + probs (Tensor): activation after softmax + logits_lens (Tensor): audio output lens vocab_list ([type]): [description] decoding_method ([type]): [description] lang_model_path ([type]): [description] diff --git a/paddlespeech/s2t/modules/mask.py b/paddlespeech/s2t/modules/mask.py index d6b63761b49b530db68a7ff0bb342675124c9fca..1f66c015acb4574bddab8276a7c9c4454206997e 100644 --- a/paddlespeech/s2t/modules/mask.py +++ b/paddlespeech/s2t/modules/mask.py @@ -54,7 +54,7 @@ def make_pad_mask(lengths: paddle.Tensor) -> paddle.Tensor: [0, 0, 0, 1, 1], [0, 0, 1, 1, 1]] """ - # (TODO: Hui Zhang): jit not support Tenosr.dim() and Tensor.ndim + # (TODO: Hui Zhang): jit not support Tensor.dim() and Tensor.ndim # assert lengths.dim() == 1 batch_size = int(lengths.shape[0]) max_len = int(lengths.max()) diff --git a/paddlespeech/s2t/training/cli.py b/paddlespeech/s2t/training/cli.py index 3ef871c5df08386e5a1e6ef8798d607b304df237..bb85732a6f7f33ee9c5f2f7febabcd7912b78374 100644 --- a/paddlespeech/s2t/training/cli.py +++ b/paddlespeech/s2t/training/cli.py @@ -97,6 +97,14 @@ def default_argument_parser(parser=None): train_group.add_argument( "--dump-config", metavar="FILE", help="dump config to `this` file.") + test_group = parser.add_argument_group( + title='Test Options', description=None) + + test_group.add_argument( + "--decode_cfg", + metavar="DECODE_CONFIG_FILE", + help="decode config file.") + profile_group = parser.add_argument_group( title='Benchmark Options', description=None) profile_group.add_argument( diff --git a/paddlespeech/s2t/training/scheduler.py b/paddlespeech/s2t/training/scheduler.py index 0222246e8649bd2b934cc60483e35c9c49224fef..b22f7ef85081032e0fdb370c5883e775b2c64693 100644 --- a/paddlespeech/s2t/training/scheduler.py +++ b/paddlespeech/s2t/training/scheduler.py @@ -67,18 +67,19 @@ class WarmupLR(LRScheduler): super().__init__(learning_rate, last_epoch, verbose) def __repr__(self): - return f"{self.__class__.__name__}(warmup_steps={self.warmup_steps})" + return f"{self.__class__.__name__}(warmup_steps={self.warmup_steps}, lr={self.base_lr}, last_epoch={self.last_epoch})" def get_lr(self): + # self.last_epoch start from zero step_num = self.last_epoch + 1 return self.base_lr * self.warmup_steps**0.5 * min( step_num**-0.5, step_num * self.warmup_steps**-1.5) def set_step(self, step: int=None): ''' - It will update the learning rate in optimizer according to current ``epoch`` . + It will update the learning rate in optimizer according to current ``epoch`` . The new learning rate will take effect on next ``optimizer.step`` . - + Args: step (int, None): specify current epoch. Default: None. Auto-increment from last_epoch=-1. Returns: @@ -94,7 +95,7 @@ class ConstantLR(LRScheduler): learning_rate (float): The initial learning rate. It is a python float number. last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . - + Returns: ``ConstantLR`` instance to schedule learning rate. 
""" diff --git a/paddlespeech/s2t/training/trainer.py b/paddlespeech/s2t/training/trainer.py index 9bf1ca4db26c42f6c05e57916ffd96a94917da1d..cac5e5704421f5ada788fe46c4764e8242d92339 100644 --- a/paddlespeech/s2t/training/trainer.py +++ b/paddlespeech/s2t/training/trainer.py @@ -117,8 +117,8 @@ class Trainer(): self.init_parallel() self.checkpoint = Checkpoint( - kbest_n=self.config.training.checkpoint.kbest_n, - latest_n=self.config.training.checkpoint.latest_n) + kbest_n=self.config.checkpoint.kbest_n, + latest_n=self.config.checkpoint.latest_n) # set random seed if needed if args.seed: @@ -129,8 +129,8 @@ class Trainer(): if hasattr(self.args, "benchmark_batch_size") and self.args.benchmark_batch_size: with UpdateConfig(self.config): - self.config.collator.batch_size = self.args.benchmark_batch_size - self.config.training.log_interval = 1 + self.config.batch_size = self.args.benchmark_batch_size + self.config.log_interval = 1 logger.info( f"Benchmark reset batch-size: {self.args.benchmark_batch_size}") @@ -222,7 +222,7 @@ class Trainer(): batch_sampler = self.train_loader.batch_sampler if isinstance(batch_sampler, paddle.io.DistributedBatchSampler): logger.debug( - f"train_loader.batch_sample set epoch: {self.epoch}") + f"train_loader.batch_sample.set_epoch: {self.epoch}") batch_sampler.set_epoch(self.epoch) def before_train(self): @@ -260,7 +260,7 @@ class Trainer(): self.before_train() logger.info(f"Train Total Examples: {len(self.train_loader.dataset)}") - while self.epoch < self.config.training.n_epoch: + while self.epoch < self.config.n_epoch: with Timer("Epoch-Train Time Cost: {}"): self.model.train() try: diff --git a/paddlespeech/s2t/transform/spectrogram.py b/paddlespeech/s2t/transform/spectrogram.py index f35adef0c6256f45438de731e39deadb019a4852..a6346c344ff86c19b0e5d169683be01f7bdedee3 100644 --- a/paddlespeech/s2t/transform/spectrogram.py +++ b/paddlespeech/s2t/transform/spectrogram.py @@ -319,7 +319,7 @@ class LogMelSpectrogramKaldi(): fmin=20, fmax=None, eps=1e-10, - dither=False): + dither=1.0): self.fs = fs self.n_mels = n_mels self.n_fft = n_fft @@ -374,7 +374,7 @@ class LogMelSpectrogramKaldi(): Returns: np.ndarray: (T, D) """ - dither = self.dither if train else False + dither = self.dither if train else 0.0 if x.ndim != 1: raise ValueError("Not support x: [Time, Channel]") diff --git a/paddlespeech/s2t/utils/bleu_score.py b/paddlespeech/s2t/utils/bleu_score.py index ea32fcf954d5333c886bd60e52d744bdde4d2232..a50c000ae914027611af06e1b6d11e0401b13149 100644 --- a/paddlespeech/s2t/utils/bleu_score.py +++ b/paddlespeech/s2t/utils/bleu_score.py @@ -14,7 +14,6 @@ """This module provides functions to calculate bleu score in different level. e.g. wer for word-level, cer for char-level. 
""" -import nltk import numpy as np import sacrebleu @@ -114,6 +113,5 @@ class ErrorCalculator(): seq_true_text = "".join(seq_true).replace(self.space, " ") seqs_hat.append(seq_hat_text) seqs_true.append(seq_true_text) - bleu = nltk.bleu_score.corpus_bleu([[ref] for ref in seqs_true], - seqs_hat) - return bleu * 100 + bleu = sacrebleu.corpus_bleu(seqs_hat, [[ref] for ref in seqs_true]) + return bleu.score * 100 diff --git a/paddlespeech/s2t/utils/dynamic_import.py b/paddlespeech/s2t/utils/dynamic_import.py index 50bd73a6de4219fd10c1679856eda918ac012eac..bd738edf8b8a96759d2fe7466fb9f7f027687c9d 100644 --- a/paddlespeech/s2t/utils/dynamic_import.py +++ b/paddlespeech/s2t/utils/dynamic_import.py @@ -57,7 +57,7 @@ def filter_valid_args(args: Dict[Text, Any], valid_keys: List[Text]): return new_args -def filter_out_tenosr(args: Dict[Text, Any]): +def filter_out_tensor(args: Dict[Text, Any]): return {key: val for key, val in args.items() if not has_tensor(val)} @@ -65,5 +65,5 @@ def instance_class(module_class, args: Dict[Text, Any]): valid_keys = inspect.signature(module_class).parameters.keys() new_args = filter_valid_args(args, valid_keys) logger.info( - f"Instance: {module_class.__name__} {filter_out_tenosr(new_args)}.") + f"Instance: {module_class.__name__} {filter_out_tensor(new_args)}.") return module_class(**new_args) diff --git a/paddlespeech/s2t/utils/log.py b/paddlespeech/s2t/utils/log.py index 1790efdb1d2328c3843342f1195fec1380b3d7f1..4f51b7f05e5c14b4ec7877b32ee9cc016ee5ba13 100644 --- a/paddlespeech/s2t/utils/log.py +++ b/paddlespeech/s2t/utils/log.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. import getpass +import inspect import os import socket import sys @@ -94,15 +95,31 @@ def find_log_dir_and_names(program_name=None, log_dir=None): class Log(): """Default Logger for all.""" logger.remove() - logger.add( - sys.stdout, - level='INFO', - enqueue=True, - filter=lambda record: record['level'].no >= 20) - _, file_prefix, _ = find_log_dir_and_names() - sink_prefix = os.path.join("exp/log", file_prefix) - sink_path = sink_prefix[:-3] + "{time}.log" - logger.add(sink_path, level='DEBUG', enqueue=True, rotation="500 MB") + + _call_from_cli = False + _frame = inspect.currentframe() + while _frame: + if 'paddlespeech/cli/__init__.py' in _frame.f_code.co_filename or 'paddlespeech/t2s' in _frame.f_code.co_filename: + _call_from_cli = True + break + _frame = _frame.f_back + + if _call_from_cli: + logger.add( + sys.stdout, + level='ERROR', + enqueue=True, + filter=lambda record: record['level'].no >= 20) + else: + logger.add( + sys.stdout, + level='INFO', + enqueue=True, + filter=lambda record: record['level'].no >= 20) + _, file_prefix, _ = find_log_dir_and_names() + sink_prefix = os.path.join("exp/log", file_prefix) + sink_path = sink_prefix[:-3] + "{time}.log" + logger.add(sink_path, level='DEBUG', enqueue=True, rotation="500 MB") def __init__(self, name=None): pass diff --git a/paddlespeech/s2t/utils/utility.py b/paddlespeech/s2t/utils/utility.py index 73c7981668c07bae2fcad2190fc8a5bae3c2dc31..dc1be815923d234d13dfabc3ada0e7e3296202cd 100644 --- a/paddlespeech/s2t/utils/utility.py +++ b/paddlespeech/s2t/utils/utility.py @@ -130,7 +130,7 @@ def get_subsample(config): Returns: int: subsample rate. 
""" - input_layer = config["model"]["encoder_conf"]["input_layer"] + input_layer = config["encoder_conf"]["input_layer"] assert input_layer in ["conv2d", "conv2d6", "conv2d8"] if input_layer == "conv2d": return 4 diff --git a/paddlespeech/t2s/datasets/am_batch_fn.py b/paddlespeech/t2s/datasets/am_batch_fn.py index 9470f9234c2e5e8623d4c9b5eb86d8c0edb96392..526871a232d3241806377c16b459cfe42396b4df 100644 --- a/paddlespeech/t2s/datasets/am_batch_fn.py +++ b/paddlespeech/t2s/datasets/am_batch_fn.py @@ -17,7 +17,7 @@ import paddle from paddlespeech.t2s.data.batch import batch_sequences -def speedyspeech_batch_fn(examples): +def speedyspeech_single_spk_batch_fn(examples): # fields = ["phones", "tones", "num_phones", "num_frames", "feats", "durations"] phones = [np.array(item["phones"], dtype=np.int64) for item in examples] tones = [np.array(item["tones"], dtype=np.int64) for item in examples] @@ -55,6 +55,48 @@ def speedyspeech_batch_fn(examples): return batch +def speedyspeech_multi_spk_batch_fn(examples): + # fields = ["phones", "tones", "num_phones", "num_frames", "feats", "durations"] + phones = [np.array(item["phones"], dtype=np.int64) for item in examples] + tones = [np.array(item["tones"], dtype=np.int64) for item in examples] + feats = [np.array(item["feats"], dtype=np.float32) for item in examples] + durations = [ + np.array(item["durations"], dtype=np.int64) for item in examples + ] + num_phones = [ + np.array(item["num_phones"], dtype=np.int64) for item in examples + ] + num_frames = [ + np.array(item["num_frames"], dtype=np.int64) for item in examples + ] + + phones = batch_sequences(phones) + tones = batch_sequences(tones) + feats = batch_sequences(feats) + durations = batch_sequences(durations) + + # convert each batch to paddle.Tensor + phones = paddle.to_tensor(phones) + tones = paddle.to_tensor(tones) + feats = paddle.to_tensor(feats) + durations = paddle.to_tensor(durations) + num_phones = paddle.to_tensor(num_phones) + num_frames = paddle.to_tensor(num_frames) + batch = { + "phones": phones, + "tones": tones, + "num_phones": num_phones, + "num_frames": num_frames, + "feats": feats, + "durations": durations, + } + if "spk_id" in examples[0]: + spk_id = [np.array(item["spk_id"], dtype=np.int64) for item in examples] + spk_id = paddle.to_tensor(spk_id) + batch["spk_id"] = spk_id + return batch + + def fastspeech2_single_spk_batch_fn(examples): # fields = ["text", "text_lengths", "speech", "speech_lengths", "durations", "pitch", "energy"] text = [np.array(item["text"], dtype=np.int64) for item in examples] diff --git a/paddlespeech/t2s/exps/fastspeech2/gen_gta_mel.py b/paddlespeech/t2s/exps/fastspeech2/gen_gta_mel.py index 8a9ef370c0f2916149b62c50d2425e969b49a5cb..4ddd19f72b4bd52aa1f7f64ae22614e5c4efc5d4 100644 --- a/paddlespeech/t2s/exps/fastspeech2/gen_gta_mel.py +++ b/paddlespeech/t2s/exps/fastspeech2/gen_gta_mel.py @@ -21,6 +21,8 @@ import numpy as np import paddle import yaml from yacs.config import CfgNode +from tqdm import tqdm +import os from paddlespeech.t2s.datasets.preprocess_utils import get_phn_dur from paddlespeech.t2s.datasets.preprocess_utils import merge_silence @@ -30,6 +32,8 @@ from paddlespeech.t2s.modules.normalizer import ZScore def evaluate(args, fastspeech2_config): + rootdir = Path(args.rootdir).expanduser() + assert rootdir.is_dir() # construct dataset for evaluation with open(args.phones_dict, "r") as f: @@ -41,9 +45,16 @@ def evaluate(args, fastspeech2_config): for phn, id in phn_id: phone_dict[phn] = int(id) + if args.speaker_dict: + with 
open(args.speaker_dict, 'rt') as f:
+            spk_id_list = [line.strip().split() for line in f.readlines()]
+        spk_num = len(spk_id_list)
+    else:
+        spk_num = None
+
     odim = fastspeech2_config.n_mels
     model = FastSpeech2(
-        idim=vocab_size, odim=odim, **fastspeech2_config["model"])
+        idim=vocab_size, odim=odim, **fastspeech2_config["model"], spk_num=spk_num)
 
     model.set_state_dict(
         paddle.load(args.fastspeech2_checkpoint)["main_params"])
@@ -65,7 +76,34 @@ def evaluate(args, fastspeech2_config):
     sentences, speaker_set = get_phn_dur(args.dur_file)
     merge_silence(sentences)
 
-    for i, utt_id in enumerate(sentences):
+    if args.dataset == "baker":
+        wav_files = sorted(list((rootdir / "Wave").rglob("*.wav")))
+        # split data into 3 sections
+        num_train = 9800
+        num_dev = 100
+        train_wav_files = wav_files[:num_train]
+        dev_wav_files = wav_files[num_train:num_train + num_dev]
+        test_wav_files = wav_files[num_train + num_dev:]
+    elif args.dataset == "aishell3":
+        sub_num_dev = 5
+        wav_dir = rootdir / "train" / "wav"
+        train_wav_files = []
+        dev_wav_files = []
+        test_wav_files = []
+        for speaker in os.listdir(wav_dir):
+            wav_files = sorted(list((wav_dir / speaker).rglob("*.wav")))
+            if len(wav_files) > 100:
+                train_wav_files += wav_files[:-sub_num_dev * 2]
+                dev_wav_files += wav_files[-sub_num_dev * 2:-sub_num_dev]
+                test_wav_files += wav_files[-sub_num_dev:]
+            else:
+                train_wav_files += wav_files
+
+    train_wav_files = [os.path.basename(str(str_path)) for str_path in train_wav_files]
+    dev_wav_files = [os.path.basename(str(str_path)) for str_path in dev_wav_files]
+    test_wav_files = [os.path.basename(str(str_path)) for str_path in test_wav_files]
+
+    for i, utt_id in enumerate(tqdm(sentences)):
         phones = sentences[utt_id][0]
         durations = sentences[utt_id][1]
         speaker = sentences[utt_id][2]
@@ -82,21 +120,30 @@ def evaluate(args, fastspeech2_config):
 
         phone_ids = [phone_dict[phn] for phn in phones]
         phone_ids = paddle.to_tensor(np.array(phone_ids))
+
+        if args.speaker_dict:
+            speaker_id = int([item[1] for item in spk_id_list if speaker == item[0]][0])
+            speaker_id = paddle.to_tensor(speaker_id)
+        else:
+            speaker_id = None
+
         durations = paddle.to_tensor(np.array(durations))
         # the generated mel and the ground-truth one may differ by 1 or 2 frames, but batch_fn will fix that
         # split data into 3 sections
-        if args.dataset == "baker":
-            num_train = 9800
-            num_dev = 100
-            if i in range(0, num_train):
+
+        wav_path = utt_id + ".wav"
+
+        if wav_path in train_wav_files:
             sub_output_dir = output_dir / ("train/raw")
-            elif i in range(num_train, num_train + num_dev):
+        elif wav_path in dev_wav_files:
             sub_output_dir = output_dir / ("dev/raw")
-            else:
+        elif wav_path in test_wav_files:
             sub_output_dir = output_dir / ("test/raw")
+
+        sub_output_dir.mkdir(parents=True, exist_ok=True)
+
         with paddle.no_grad():
-            mel = fastspeech2_inference(phone_ids, durations=durations)
+            mel = fastspeech2_inference(phone_ids, durations=durations, spk_id=speaker_id)
         np.save(sub_output_dir / (utt_id + "_feats.npy"), mel)
@@ -109,6 +156,8 @@ def main():
         default="baker",
         type=str,
         help="name of dataset, should in {baker, ljspeech, vctk} now")
+    parser.add_argument(
+        "--rootdir", default=None, type=str, help="directory to dataset.")
     parser.add_argument(
         "--fastspeech2-config", type=str, help="fastspeech2 config file.")
     parser.add_argument(
@@ -126,13 +175,18 @@ def main():
         type=str,
         default="phone_id_map.txt",
         help="phone vocabulary file.")
+
+    parser.add_argument(
+        "--speaker-dict",
+        type=str,
+        default=None,
+        help="speaker id map file.")
     parser.add_argument(
         "--dur-file", default=None, type=str, help="path to
durations.txt.") parser.add_argument("--output-dir", type=str, help="output dir.") parser.add_argument( "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") - parser.add_argument("--verbose", type=int, default=1, help="verbose.") def str2bool(str): return True if str.lower() == 'true' else False diff --git a/paddlespeech/t2s/exps/fastspeech2/train.py b/paddlespeech/t2s/exps/fastspeech2/train.py index fafded6fca0cef090d9ba1c844cdc526fe5d49d6..1dfa575a1810075c5dd276b3d65551308def2b3c 100644 --- a/paddlespeech/t2s/exps/fastspeech2/train.py +++ b/paddlespeech/t2s/exps/fastspeech2/train.py @@ -174,7 +174,6 @@ def main(): parser.add_argument("--output-dir", type=str, help="output dir.") parser.add_argument( "--ngpu", type=int, default=1, help="if ngpu=0, use cpu.") - parser.add_argument("--verbose", type=int, default=1, help="verbose.") parser.add_argument( "--phones-dict", type=str, default=None, help="phone vocabulary file.") parser.add_argument( diff --git a/paddlespeech/t2s/exps/gan_vocoder/hifigan/train.py b/paddlespeech/t2s/exps/gan_vocoder/hifigan/train.py index f0e7708f329a19fcaa6e47eb2bc52dc1db728797..9ac6cbd34e55b5aa65fa2279480157a7064a42b5 100644 --- a/paddlespeech/t2s/exps/gan_vocoder/hifigan/train.py +++ b/paddlespeech/t2s/exps/gan_vocoder/hifigan/train.py @@ -242,8 +242,7 @@ def train_sp(args, config): def main(): # parse args and config and redirect to train_sp - parser = argparse.ArgumentParser( - description="Train a HiFiGAN model.") + parser = argparse.ArgumentParser(description="Train a HiFiGAN model.") parser.add_argument( "--config", type=str, help="config file to overwrite default config.") parser.add_argument("--train-metadata", type=str, help="training data.") @@ -251,7 +250,6 @@ def main(): parser.add_argument("--output-dir", type=str, help="output dir.") parser.add_argument( "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") - parser.add_argument("--verbose", type=int, default=1, help="verbose.") args = parser.parse_args() diff --git a/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/train.py b/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/train.py index a44d2d3c263b5ca6f78b40d06a87cafa36bc7798..3d0ff7d35cfa1770221dd628ec1efed036645061 100644 --- a/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/train.py +++ b/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/train.py @@ -239,7 +239,6 @@ def main(): parser.add_argument("--output-dir", type=str, help="output dir.") parser.add_argument( "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") - parser.add_argument("--verbose", type=int, default=1, help="verbose.") args = parser.parse_args() diff --git a/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/synthesize_from_wav.py b/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/synthesize_from_wav.py index ca2e3f5501a77a5d191b666b09684ea58daa396c..f5affb50b065139b0840966f9cfce14225b29753 100644 --- a/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/synthesize_from_wav.py +++ b/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/synthesize_from_wav.py @@ -93,7 +93,6 @@ def main(): parser.add_argument("--output-dir", type=str, help="output dir.") parser.add_argument( "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") - parser.add_argument("--verbose", type=int, default=1, help="verbose.") args = parser.parse_args() diff --git a/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/train.py b/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/train.py index 
98b0ed717c89d2d5d7ccf2c910420bd03722beaa..a7881d6bbb2d8dee9e3030124d2f04f97f598570 100644 --- a/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/train.py +++ b/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/train.py @@ -216,7 +216,6 @@ def main(): parser.add_argument("--output-dir", type=str, help="output dir.") parser.add_argument( "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") - parser.add_argument("--verbose", type=int, default=1, help="verbose.") benchmark_group = parser.add_argument_group( 'benchmark', 'arguments related to benchmark.') diff --git a/paddlespeech/t2s/exps/gan_vocoder/style_melgan/train.py b/paddlespeech/t2s/exps/gan_vocoder/style_melgan/train.py index bc746467806da598e1e346358fa24e4cac9871c8..b162260d61d34596450e3309094a289156cbcb67 100644 --- a/paddlespeech/t2s/exps/gan_vocoder/style_melgan/train.py +++ b/paddlespeech/t2s/exps/gan_vocoder/style_melgan/train.py @@ -223,8 +223,7 @@ def train_sp(args, config): def main(): # parse args and config and redirect to train_sp - parser = argparse.ArgumentParser( - description="Train a Multi-Band MelGAN model.") + parser = argparse.ArgumentParser(description="Train a Style MelGAN model.") parser.add_argument( "--config", type=str, help="config file to overwrite default config.") parser.add_argument("--train-metadata", type=str, help="training data.") @@ -232,7 +231,6 @@ def main(): parser.add_argument("--output-dir", type=str, help="output dir.") parser.add_argument( "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") - parser.add_argument("--verbose", type=int, default=1, help="verbose.") args = parser.parse_args() diff --git a/paddlespeech/t2s/exps/gan_vocoder/synthesize.py b/paddlespeech/t2s/exps/gan_vocoder/synthesize.py index 6f4dc92dbc9a74f4735880e5ce297a898e3e1014..c60b9add2eb584029b53b1ae657f0a04edece028 100644 --- a/paddlespeech/t2s/exps/gan_vocoder/synthesize.py +++ b/paddlespeech/t2s/exps/gan_vocoder/synthesize.py @@ -42,7 +42,6 @@ def main(): parser.add_argument("--output-dir", type=str, help="output dir.") parser.add_argument( "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") - parser.add_argument("--verbose", type=int, default=1, help="verbose.") args = parser.parse_args() diff --git a/paddlespeech/t2s/exps/speedyspeech/gen_gta_mel.py b/paddlespeech/t2s/exps/speedyspeech/gen_gta_mel.py new file mode 100644 index 0000000000000000000000000000000000000000..b6440fd6f27611188cfca95fed7c2c7deac59b1c --- /dev/null +++ b/paddlespeech/t2s/exps/speedyspeech/gen_gta_mel.py @@ -0,0 +1,246 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# generate mels using durations.txt +# for mb melgan finetune +# what to do if the length is inconsistent with the original mel?
+import argparse +import os +from pathlib import Path + +import numpy as np +import paddle +import yaml +from tqdm import tqdm +from yacs.config import CfgNode + +from paddlespeech.t2s.datasets.preprocess_utils import get_phn_dur +from paddlespeech.t2s.datasets.preprocess_utils import merge_silence +from paddlespeech.t2s.frontend.zh_frontend import Frontend +from paddlespeech.t2s.models.speedyspeech import SpeedySpeech +from paddlespeech.t2s.models.speedyspeech import SpeedySpeechInference +from paddlespeech.t2s.modules.normalizer import ZScore + + +def evaluate(args, speedyspeech_config): + rootdir = Path(args.rootdir).expanduser() + assert rootdir.is_dir() + + # construct dataset for evaluation + with open(args.phones_dict, "r") as f: + phn_id = [line.strip().split() for line in f.readlines()] + vocab_size = len(phn_id) + print("vocab_size:", vocab_size) + + phone_dict = {} + for phn, id in phn_id: + phone_dict[phn] = int(id) + + with open(args.tones_dict, "r") as f: + tone_id = [line.strip().split() for line in f.readlines()] + tone_size = len(tone_id) + print("tone_size:", tone_size) + + frontend = Frontend( + phone_vocab_path=args.phones_dict, tone_vocab_path=args.tones_dict) + + if args.speaker_dict: + with open(args.speaker_dict, 'rt') as f: + spk_id_list = [line.strip().split() for line in f.readlines()] + spk_num = len(spk_id_list) + else: + spk_num = None + + model = SpeedySpeech( + vocab_size=vocab_size, + tone_size=tone_size, + **speedyspeech_config["model"], + spk_num=spk_num) + + model.set_state_dict( + paddle.load(args.speedyspeech_checkpoint)["main_params"]) + model.eval() + + stat = np.load(args.speedyspeech_stat) + mu, std = stat + mu = paddle.to_tensor(mu) + std = paddle.to_tensor(std) + speedyspeech_normalizer = ZScore(mu, std) + + speedyspeech_inference = SpeedySpeechInference(speedyspeech_normalizer, + model) + speedyspeech_inference.eval() + + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + sentences, speaker_set = get_phn_dur(args.dur_file) + merge_silence(sentences) + + if args.dataset == "baker": + wav_files = sorted(list((rootdir / "Wave").rglob("*.wav"))) + # split data into 3 sections + num_train = 9800 + num_dev = 100 + train_wav_files = wav_files[:num_train] + dev_wav_files = wav_files[num_train:num_train + num_dev] + test_wav_files = wav_files[num_train + num_dev:] + elif args.dataset == "aishell3": + sub_num_dev = 5 + wav_dir = rootdir / "train" / "wav" + train_wav_files = [] + dev_wav_files = [] + test_wav_files = [] + for speaker in os.listdir(wav_dir): + wav_files = sorted(list((wav_dir / speaker).rglob("*.wav"))) + if len(wav_files) > 100: + train_wav_files += wav_files[:-sub_num_dev * 2] + dev_wav_files += wav_files[-sub_num_dev * 2:-sub_num_dev] + test_wav_files += wav_files[-sub_num_dev:] + else: + train_wav_files += wav_files + + train_wav_files = [ + os.path.basename(str(str_path)) for str_path in train_wav_files + ] + dev_wav_files = [ + os.path.basename(str(str_path)) for str_path in dev_wav_files + ] + test_wav_files = [ + os.path.basename(str(str_path)) for str_path in test_wav_files + ] + + for i, utt_id in enumerate(tqdm(sentences)): + phones = sentences[utt_id][0] + durations = sentences[utt_id][1] + speaker = sentences[utt_id][2] + # trim the leading and trailing sil + if args.cut_sil: + if phones[0] == "sil" and len(durations) > 1: + durations = durations[1:] + phones = phones[1:] + if phones[-1] == 'sil' and len(durations) > 1: + durations = durations[:-1] + phones = phones[:-1] + + phones, tones =
frontend._get_phone_tone(phones, get_tone_ids=True) + if tones: + tone_ids = frontend._t2id(tones) + tone_ids = paddle.to_tensor(tone_ids) + if phones: + phone_ids = frontend._p2id(phones) + phone_ids = paddle.to_tensor(phone_ids) + + if args.speaker_dict: + speaker_id = int( + [item[1] for item in spk_id_list if speaker == item[0]][0]) + speaker_id = paddle.to_tensor(speaker_id) + else: + speaker_id = None + + durations = paddle.to_tensor(np.array(durations)) + durations = paddle.unsqueeze(durations, axis=0) + + # the generated mels may differ from the real ones by 1 or 2 frames, but batch_fn will fix that + # split data into 3 sections + + wav_path = utt_id + ".wav" + + if wav_path in train_wav_files: + sub_output_dir = output_dir / ("train/raw") + elif wav_path in dev_wav_files: + sub_output_dir = output_dir / ("dev/raw") + elif wav_path in test_wav_files: + sub_output_dir = output_dir / ("test/raw") + + sub_output_dir.mkdir(parents=True, exist_ok=True) + + with paddle.no_grad(): + mel = speedyspeech_inference( + phone_ids, tone_ids, durations=durations, spk_id=speaker_id) + np.save(sub_output_dir / (utt_id + "_feats.npy"), mel) + + +def main(): + # parse args and config and redirect to train_sp + parser = argparse.ArgumentParser( + description="Synthesize with speedyspeech & parallel wavegan.") + parser.add_argument( + "--dataset", + default="baker", + type=str, + help="name of dataset, should be in {baker, ljspeech, vctk} now") + parser.add_argument( + "--rootdir", default=None, type=str, help="directory of the dataset.") + parser.add_argument( + "--speedyspeech-config", type=str, help="speedyspeech config file.") + parser.add_argument( + "--speedyspeech-checkpoint", + type=str, + help="speedyspeech checkpoint to load.") + parser.add_argument( + "--speedyspeech-stat", + type=str, + help="mean and standard deviation used to normalize spectrogram when training speedyspeech."
+ ) + + parser.add_argument( + "--phones-dict", + type=str, + default="phone_id_map.txt", + help="phone vocabulary file.") + parser.add_argument( + "--tones-dict", + type=str, + default="tone_id_map.txt", + help="tone vocabulary file.") + parser.add_argument( + "--speaker-dict", type=str, default=None, help="speaker id map file.") + + parser.add_argument( + "--dur-file", default=None, type=str, help="path to durations.txt.") + parser.add_argument("--output-dir", type=str, help="output dir.") + parser.add_argument( + "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") + + def str2bool(str): + return True if str.lower() == 'true' else False + + parser.add_argument( + "--cut-sil", + type=str2bool, + default=True, + help="whether cut sil in the edge of audio") + + args = parser.parse_args() + + if args.ngpu == 0: + paddle.set_device("cpu") + elif args.ngpu > 0: + paddle.set_device("gpu") + else: + print("ngpu should >= 0 !") + + with open(args.speedyspeech_config) as f: + speedyspeech_config = CfgNode(yaml.safe_load(f)) + + print("========Args========") + print(yaml.safe_dump(vars(args))) + print("========Config========") + print(speedyspeech_config) + + evaluate(args, speedyspeech_config) + + +if __name__ == "__main__": + main() diff --git a/paddlespeech/t2s/exps/speedyspeech/normalize.py b/paddlespeech/t2s/exps/speedyspeech/normalize.py index 91d15c40b201b70780fceea77f37d50af5383c4a..a427c46928cf92ba298b73313d78fe6b52a7fc14 100644 --- a/paddlespeech/t2s/exps/speedyspeech/normalize.py +++ b/paddlespeech/t2s/exps/speedyspeech/normalize.py @@ -47,7 +47,8 @@ def main(): "--phones-dict", type=str, default=None, help="phone vocabulary file.") parser.add_argument( "--tones-dict", type=str, default=None, help="tone vocabulary file.") - + parser.add_argument( + "--speaker-dict", type=str, default=None, help="speaker id map file.") parser.add_argument( "--verbose", type=int, @@ -121,6 +122,12 @@ def main(): for tone, id in tone_id: vocab_tones[tone] = int(id) + vocab_speaker = {} + with open(args.speaker_dict, 'rt') as f: + spk_id = [line.strip().split() for line in f.readlines()] + for spk, id in spk_id: + vocab_speaker[spk] = int(id) + # process each file output_metadata = [] @@ -135,11 +142,13 @@ def main(): np.save(mel_path, mel.astype(np.float32), allow_pickle=False) phone_ids = [vocab_phones[p] for p in item['phones']] tone_ids = [vocab_tones[p] for p in item['tones']] + spk_id = vocab_speaker[item["speaker"]] if args.use_relative_path: # convert absolute path to relative path: mel_path = mel_path.relative_to(dumpdir) output_metadata.append({ 'utt_id': utt_id, + "spk_id": spk_id, 'phones': phone_ids, 'tones': tone_ids, 'num_phones': item['num_phones'], diff --git a/paddlespeech/t2s/exps/speedyspeech/preprocess.py b/paddlespeech/t2s/exps/speedyspeech/preprocess.py index aa589d5a323dc2bf67e08d49d455e2cbc347ca47..9ff771442e4ef16cd5e9b87df664be7a5306329c 100644 --- a/paddlespeech/t2s/exps/speedyspeech/preprocess.py +++ b/paddlespeech/t2s/exps/speedyspeech/preprocess.py @@ -31,6 +31,7 @@ from paddlespeech.t2s.data.get_feats import LogMelFBank from paddlespeech.t2s.datasets.preprocess_utils import compare_duration_and_mel_length from paddlespeech.t2s.datasets.preprocess_utils import get_phn_dur from paddlespeech.t2s.datasets.preprocess_utils import get_phones_tones +from paddlespeech.t2s.datasets.preprocess_utils import get_spk_id_map from paddlespeech.t2s.datasets.preprocess_utils import merge_silence @@ -101,6 +102,7 @@ def process_sentence(config: Dict[str, Any], "utt_id": utt_id, 
"phones": phones, "tones": tones, + "speaker": speaker, "num_phones": len(phones), "num_frames": num_frames, "durations": durations, @@ -229,6 +231,8 @@ def main(): tone_id_map_path = dumpdir / "tone_id_map.txt" get_phones_tones(sentences, phone_id_map_path, tone_id_map_path, args.dataset) + speaker_id_map_path = dumpdir / "speaker_id_map.txt" + get_spk_id_map(speaker_set, speaker_id_map_path) if args.dataset == "baker": wav_files = sorted(list((rootdir / "Wave").rglob("*.wav"))) diff --git a/paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py b/paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py index 2854d0555ad3041de9a6c4d35b6fd1c673a5042f..cb742c59587fa91f442d4ba5868c7b13a23fe085 100644 --- a/paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py +++ b/paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py @@ -173,7 +173,6 @@ def main(): parser.add_argument("--output-dir", type=str, help="output dir") parser.add_argument( "--inference-dir", type=str, help="dir to save inference models") - parser.add_argument("--verbose", type=int, default=1, help="verbose") parser.add_argument( "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") diff --git a/paddlespeech/t2s/exps/speedyspeech/train.py b/paddlespeech/t2s/exps/speedyspeech/train.py index 001e22aea6d608566f1c6a8c16f84eb6abb68bf4..448cd7bbf356f1e16dba5bc33464bc0910b2b65f 100644 --- a/paddlespeech/t2s/exps/speedyspeech/train.py +++ b/paddlespeech/t2s/exps/speedyspeech/train.py @@ -27,7 +27,8 @@ from paddle.io import DataLoader from paddle.io import DistributedBatchSampler from yacs.config import CfgNode -from paddlespeech.t2s.datasets.am_batch_fn import speedyspeech_batch_fn +from paddlespeech.t2s.datasets.am_batch_fn import speedyspeech_multi_spk_batch_fn +from paddlespeech.t2s.datasets.am_batch_fn import speedyspeech_single_spk_batch_fn from paddlespeech.t2s.datasets.data_table import DataTable from paddlespeech.t2s.models.speedyspeech import SpeedySpeech from paddlespeech.t2s.models.speedyspeech import SpeedySpeechEvaluator @@ -57,6 +58,23 @@ def train_sp(args, config): f"rank: {dist.get_rank()}, pid: {os.getpid()}, parent_pid: {os.getppid()}", ) + fields = [ + "phones", "tones", "num_phones", "num_frames", "feats", "durations" + ] + + spk_num = None + if args.speaker_dict is not None: + print("multiple speaker speedyspeech!") + collate_fn = speedyspeech_multi_spk_batch_fn + with open(args.speaker_dict, 'rt') as f: + spk_id = [line.strip().split() for line in f.readlines()] + spk_num = len(spk_id) + fields += ["spk_id"] + else: + print("single speaker speedyspeech!") + collate_fn = speedyspeech_single_spk_batch_fn + print("spk_num:", spk_num) + # dataloader has been too verbose logging.getLogger("DataLoader").disabled = True @@ -71,9 +89,7 @@ def train_sp(args, config): train_dataset = DataTable( data=train_metadata, - fields=[ - "phones", "tones", "num_phones", "num_frames", "feats", "durations" - ], + fields=fields, converters={ "feats": np.load, }, ) @@ -87,9 +103,7 @@ def train_sp(args, config): dev_dataset = DataTable( data=dev_metadata, - fields=[ - "phones", "tones", "num_phones", "num_frames", "feats", "durations" - ], + fields=fields, converters={ "feats": np.load, }, ) @@ -105,14 +119,14 @@ def train_sp(args, config): train_dataloader = DataLoader( train_dataset, batch_sampler=train_sampler, - collate_fn=speedyspeech_batch_fn, + collate_fn=collate_fn, num_workers=config.num_workers) dev_dataloader = DataLoader( dev_dataset, shuffle=False, drop_last=False, batch_size=config.batch_size, - collate_fn=speedyspeech_batch_fn, 
+ collate_fn=collate_fn, num_workers=config.num_workers) print("dataloaders done!") with open(args.phones_dict, "r") as f: @@ -125,7 +139,10 @@ def train_sp(args, config): print("tone_size:", tone_size) model = SpeedySpeech( - vocab_size=vocab_size, tone_size=tone_size, **config["model"]) + vocab_size=vocab_size, + tone_size=tone_size, + spk_num=spk_num, + **config["model"]) if world_size > 1: model = DataParallel(model) print("model done!") @@ -168,7 +185,6 @@ def main(): parser.add_argument("--output-dir", type=str, help="output dir.") parser.add_argument( "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") - parser.add_argument("--verbose", type=int, default=1, help="verbose.") def str2bool(str): return True if str.lower() == 'true' else False @@ -185,6 +201,12 @@ def main(): parser.add_argument( "--tones-dict", type=str, default=None, help="tone vocabulary file.") + parser.add_argument( + "--speaker-dict", + type=str, + default=None, + help="speaker id map file for multiple speaker model.") + # more args such as max_epoch could be passed in here args, rest = parser.parse_known_args() diff --git a/paddlespeech/t2s/exps/synthesize_e2e.py b/paddlespeech/t2s/exps/synthesize_e2e.py index 9a83ec1bc22caf9d12a64b66510bc89854d8089f..15ed1e4d44d7d6c34031b955c4449ed04315fb60 100644 --- a/paddlespeech/t2s/exps/synthesize_e2e.py +++ b/paddlespeech/t2s/exps/synthesize_e2e.py @@ -196,41 +196,50 @@ def evaluate(args): output_dir = Path(args.output_dir) output_dir.mkdir(parents=True, exist_ok=True) - + merge_sentences = False for utt_id, sentence in sentences: get_tone_ids = False if am_name == 'speedyspeech': get_tone_ids = True if args.lang == 'zh': input_ids = frontend.get_input_ids( - sentence, merge_sentences=True, get_tone_ids=get_tone_ids) + sentence, + merge_sentences=merge_sentences, + get_tone_ids=get_tone_ids) phone_ids = input_ids["phone_ids"] - phone_ids = phone_ids[0] if get_tone_ids: tone_ids = input_ids["tone_ids"] - tone_ids = tone_ids[0] elif args.lang == 'en': - input_ids = frontend.get_input_ids(sentence) + input_ids = frontend.get_input_ids( + sentence, merge_sentences=merge_sentences) phone_ids = input_ids["phone_ids"] else: print("lang should in {'zh', 'en'}!") - with paddle.no_grad(): - # acoustic model - if am_name == 'fastspeech2': - # multi speaker - if am_dataset in {"aishell3", "vctk"}: - spk_id = paddle.to_tensor(args.spk_id) - mel = am_inference(phone_ids, spk_id) + flags = 0 + for i in range(len(phone_ids)): + part_phone_ids = phone_ids[i] + # acoustic model + if am_name == 'fastspeech2': + # multi speaker + if am_dataset in {"aishell3", "vctk"}: + spk_id = paddle.to_tensor(args.spk_id) + mel = am_inference(part_phone_ids, spk_id) + else: + mel = am_inference(part_phone_ids) + elif am_name == 'speedyspeech': + part_tone_ids = tone_ids[i] + mel = am_inference(part_phone_ids, part_tone_ids) + # vocoder + wav = voc_inference(mel) + if flags == 0: + wav_all = wav + flags = 1 else: - mel = am_inference(phone_ids) - elif am_name == 'speedyspeech': - mel = am_inference(phone_ids, tone_ids) - # vocoder - wav = voc_inference(mel) + wav_all = paddle.concat([wav_all, wav]) sf.write( str(output_dir / (utt_id + ".wav")), - wav.numpy(), + wav_all.numpy(), samplerate=am_config.fs) print(f"{utt_id} done!") diff --git a/paddlespeech/t2s/exps/transformer_tts/synthesize.py b/paddlespeech/t2s/exps/transformer_tts/synthesize.py index 666c3b7237f56894d804134e0674df0ee525a92b..7b6b1873fca65d4765df40ac2c8a233f0e33ae3a 100644 --- a/paddlespeech/t2s/exps/transformer_tts/synthesize.py +++
b/paddlespeech/t2s/exps/transformer_tts/synthesize.py @@ -118,7 +118,6 @@ def main(): parser.add_argument("--output-dir", type=str, help="output dir.") parser.add_argument( "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") - parser.add_argument("--verbose", type=int, default=1, help="verbose.") args = parser.parse_args() diff --git a/paddlespeech/t2s/exps/transformer_tts/synthesize_e2e.py b/paddlespeech/t2s/exps/transformer_tts/synthesize_e2e.py index ba197f43cbda55733d6aa07236d1620fcc16dd15..0cd7d224e0b983d1461bfd42ffebd812f79380b4 100644 --- a/paddlespeech/t2s/exps/transformer_tts/synthesize_e2e.py +++ b/paddlespeech/t2s/exps/transformer_tts/synthesize_e2e.py @@ -137,7 +137,6 @@ def main(): parser.add_argument("--output-dir", type=str, help="output dir.") parser.add_argument( "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") - parser.add_argument("--verbose", type=int, default=1, help="verbose.") args = parser.parse_args() diff --git a/paddlespeech/t2s/exps/transformer_tts/train.py b/paddlespeech/t2s/exps/transformer_tts/train.py index 163339f4a99948da4ad647ab693bc68c141d6811..8695c06a9706a12c7b48fd59acdb5b6c16e572a1 100644 --- a/paddlespeech/t2s/exps/transformer_tts/train.py +++ b/paddlespeech/t2s/exps/transformer_tts/train.py @@ -165,7 +165,6 @@ def main(): parser.add_argument("--output-dir", type=str, help="output dir.") parser.add_argument( "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") - parser.add_argument("--verbose", type=int, default=1, help="verbose.") parser.add_argument( "--phones-dict", type=str, default=None, help="phone vocabulary file.") diff --git a/paddlespeech/t2s/frontend/phonectic.py b/paddlespeech/t2s/frontend/phonectic.py index fbc8fd388a1b07cfee0ebafe3b83e43cdfddc614..254138713d123fef0e60ac7110a5ef9cbfe96dcd 100644 --- a/paddlespeech/t2s/frontend/phonectic.py +++ b/paddlespeech/t2s/frontend/phonectic.py @@ -13,7 +13,9 @@ # limitations under the License. from abc import ABC from abc import abstractmethod +from typing import List +import numpy as np import paddle from g2p_en import G2p from g2pM import G2pM @@ -21,6 +23,7 @@ from g2pM import G2pM from paddlespeech.t2s.frontend.normalizer.normalizer import normalize from paddlespeech.t2s.frontend.punctuation import get_punctuations from paddlespeech.t2s.frontend.vocab import Vocab +from paddlespeech.t2s.frontend.zh_normalization.text_normlization import TextNormalizer # discard opencc untill we find an easy solution to install it on windows # from opencc import OpenCC @@ -53,6 +56,7 @@ class English(Phonetics): self.vocab = Vocab(self.phonemes + self.punctuations) self.vocab_phones = {} self.punc = ":,;。?!“”‘’':,;.?!" 
+ self.text_normalizer = TextNormalizer() if phone_vocab_path: with open(phone_vocab_path, 'rt') as f: phn_id = [line.strip().split() for line in f.readlines()] @@ -78,19 +82,42 @@ class English(Phonetics): phonemes = [item for item in phonemes if item in self.vocab.stoi] return phonemes - def get_input_ids(self, sentence: str) -> paddle.Tensor: - result = {} - phones = self.phoneticize(sentence) - # remove start_symbol and end_symbol - phones = phones[1:-1] - phones = [phn for phn in phones if not phn.isspace()] - phones = [ + def _p2id(self, phonemes: List[str]) -> np.array: + # replace unk phone with sp + phonemes = [ phn if (phn in self.vocab_phones and phn not in self.punc) else "sp" - for phn in phones + for phn in phonemes ] - phone_ids = [self.vocab_phones[phn] for phn in phones] - phone_ids = paddle.to_tensor(phone_ids) - result["phone_ids"] = phone_ids + phone_ids = [self.vocab_phones[item] for item in phonemes] + return np.array(phone_ids, np.int64) + + def get_input_ids(self, sentence: str, + merge_sentences: bool=False) -> paddle.Tensor: + result = {} + sentences = self.text_normalizer._split(sentence, lang="en") + phones_list = [] + temp_phone_ids = [] + for sentence in sentences: + phones = self.phoneticize(sentence) + # remove start_symbol and end_symbol + phones = phones[1:-1] + phones = [phn for phn in phones if not phn.isspace()] + phones_list.append(phones) + + if merge_sentences: + merge_list = sum(phones_list, []) + # remove the last 'sp' to avoid noise at the end, + # because in the training data there is no 'sp' at the end + if merge_list[-1] == 'sp': + merge_list = merge_list[:-1] + phones_list = [] + phones_list.append(merge_list) + + for part_phones_list in phones_list: + phone_ids = self._p2id(part_phones_list) + phone_ids = paddle.to_tensor(phone_ids) + temp_phone_ids.append(phone_ids) + result["phone_ids"] = temp_phone_ids return result def numericalize(self, phonemes): diff --git a/paddlespeech/t2s/frontend/tone_sandhi.py b/paddlespeech/t2s/frontend/tone_sandhi.py index 6ba567bb9eaf3a09de1fd4ec0b4318f5c79c2606..5264e0687557c75023eb8f004350869346e7df6c 100644 --- a/paddlespeech/t2s/frontend/tone_sandhi.py +++ b/paddlespeech/t2s/frontend/tone_sandhi.py @@ -65,6 +65,7 @@ class ToneSandhi(): self.must_not_neural_tone_words = { "男子", "女子", "分子", "原子", "量子", "莲子", "石子", "瓜子", "电子" } + self.punc = ":,;。?!“”‘’':,;.?!" # the meaning of jieba pos tag: https://blog.csdn.net/weixin_44174352/article/details/113731041 # e.g. @@ -147,7 +148,9 @@ class ToneSandhi(): finals[i] = finals[i][:-1] + "2" # "一" before non-tone4 should be yi4, e.g.
一天 else: - finals[i] = finals[i][:-1] + "4" + # "一" 后面如果是标点,还读一声 + if word[i + 1] not in self.punc: + finals[i] = finals[i][:-1] + "4" return finals def _split_word(self, word: str) -> List[str]: diff --git a/paddlespeech/t2s/frontend/zh_frontend.py b/paddlespeech/t2s/frontend/zh_frontend.py index 8eb55ff253f5c535908fd08982391e9fbc81973d..a905c412d4d9c901fae1d5b80677a472a24c6071 100644 --- a/paddlespeech/t2s/frontend/zh_frontend.py +++ b/paddlespeech/t2s/frontend/zh_frontend.py @@ -105,6 +105,8 @@ class Frontend(): phones_list = [] for seg in segments: phones = [] + # Replace all English words in the sentence + seg = re.sub('[a-zA-Z]+', '', seg) seg_cut = psg.lcut(seg) initials = [] finals = [] diff --git a/paddlespeech/t2s/frontend/zh_normalization/char_convert.py b/paddlespeech/t2s/frontend/zh_normalization/char_convert.py index 22462a0c595ce2f86338a13782ac0e78338df07e..dcf95d72861348b36e4f21c31350f47e56934fc1 100644 --- a/paddlespeech/t2s/frontend/zh_normalization/char_convert.py +++ b/paddlespeech/t2s/frontend/zh_normalization/char_convert.py @@ -14,7 +14,7 @@ # limitations under the License. """Traditional and simplified Chinese conversion, a simplified character may correspond to multiple traditional characters. """ -simplified_charcters = '制咖片型超声盘鉴定仔点他命书歌粉巾字帐恤手指记忆棒形转弯沟光○〇㐄㐅㐆㐌㐖毒㐜㐡㐤㐰㐺㑇㑳㒳㒸㔾㗂㗎㝵㞎㞙㞞以㢲㢴㤅㥁㥯㨗㫺㬎㮎㮚㮸㲋㲱㲾㳮涧㵪㶸㷖㷭㹢㹴犬㺢狓㺵碗㽮㿝䍃䔢䖟䖸䗈䗥䗪䝓射䥯䦉䯝鲃鱼䲔䳗鹅䵹鼄䶑一对应映射丁不识下儿子做二休世丘之貉并中台原则串为甚谓干净了百事无成八变五十些人得道鸡升天代如并来去个国政策劲幽灵在欧洲游荡接样萝卜坑侧化传价元论醇共再准刀两断切分耕耘收获钱货物向看旧就绪险刻千金动劳永逸匙零夜半卡通回复返影踪反常态口咬气句话同吐快吹周味呼诺呜品红锅哄而散起唱和问三知生熟团漆黑火糟堆场空块面塌糊涂尘染壁厢夔已足多情露水大早到晚夫妻当关万莫开失古恨套所料既往孔见提师要家主审寸阴难买斗牛小撮部阵局展身层巴掌帆风顺席地带过年计于春头载四季期被蛇怕井绳度愿式份弹顷深前律径心意念差愁孤行俱全房厅交遮打技长把抓死拿眼泪鼻涕钥锁折段抿拍即合扫排掬挥拨拥上入击洞掷揽改故辙败文值名斑方面旁族日秋餐隔雅里终父旦时晌会霎间晃暴寒曝更月望垠际朝夕本正经利杯羹东西板枝独秀根筋杆进条龙服务概模次函数又性程总付步脚印趋登毛拔呵氧氮碳决雌雄波未平派谎言流清楚白准溜烟潭有获闻是处降琴鹤甲病发可拾沙目然了直以相眨穿睹瞥瞬矢的解石鸟神教秉虔诚秘种窝蜂穷窍笑置笔苟勾销抹杀煞等奖箍节吃箭仇双雕诗筹箩筐系列纸级士官统丝毫挂维网尽线微吭响股脑胎脉承腔臂力致效资源址器举功投般说讲规贸易叶障着慎满皆输号木电池衣倾钟高低视仁觉醒览遗角银币触溃九鼎蔽抄出驷马追重语破贫洗贯走路安蹴至几蹶振跃役胆汗较辈轮辞赞退六连遍递边针血锤音错门思闪真倒项栽雾类保护川先惊乍体哄鳞爪鸣滴泡邻域党专鼓作齐炒丑烯亥克内酯冬加奴卯肝炎基尺梁街裤镐客宠庭巳汝昌烷玲磊糖肇酉醛啷青县韪良香骨鲷丂七集河市弦喜嘴张舌堵区工业姊妹星架构巧彩扭歪拼凑余热曜武州爷浮屠美乡老阶树荤素碎落能魄鳃鳗珠丄丅丆万俟丈尚摸母娘量管群亚虎必我堂令申件装伏位博侠义界表女墟台戏臭皮匠胜诸葛亮赛顶倍催请运算包立叉戟离疫苗土史志演围揭瓦晒夷姑婆帝村宝烂尖杉碱屉桌山岔岛由纪峡坝库镇废从德后拗汤治旬食明昧曹朋友框栏极权幂曲归依猫民氟硼氯磷铁江侗自旅法司洋浦梅园温暖湾焦班幸用田略番叠皇炮捶硝苯酸腺苷棱草镜穗跳远索锦纲聚氰胺联店胚膲爱色堇紫罗兰芝茶饭菱云虫藏藩乱叛苏亲债凳学座恐恋柱测肌腹衩锥系貂企乌跪叩军车农题迭都甘油屯奏键短阿姨陪姐只顾茅庐槽驾魂鲜鹿页其菜单乘任供势午齿汉组织吊调泻唇坡城报坟外夸将尉建筑岸岗公床扬新剑升杭林栗校楼标款汽社浣海商馆剧院钢华港机械广媒环球融第医科证券综财乐育游涨犹岭疏瘾睑确兵领导缴肢膛船艾瑟尔苍蔡虞效衫覆访诉课谕议轨述野钩限敌鞋颌颔颚饶首龈站例修凡划垂届属崽颏厨拜挫摆放旋削棋榻槛礼沉注滑营狱画确仪聘花葬诏员跌辖周达酒锚闸陷陆雨雪飞威丌于丹久乏予理评产亢卑亦乎舞己悲矩圆词害志但住佞佳便俗信票案幅翁倦伦假偏倚斜亏鬼敲停备伤脾胃仅此像俭匮免宜穴焉戴兼容许冻伯仲负彼昼皂轩轾实刊划颠卫战哥比省非好黄饰别拘束掩奶睬选择摇扰烦苦枚写协厌及格受欢迎约只估侵犯割状告或缺抗拒挽撤救药喻磨灭端倪少逆逾越避靠适吉誉吝玉含延咎歹听啻渊善谋均匀堪忍够太惹妙妥妨孕症孝术室完纳推冠积宣疑辩栗碴称屈挠屑干涉衡待很忙恶忿怎么怠急耻恭息悦惑惜惟想愉愧怍慌愤启懂懈怀材才紧招认扣抵拉舍也罢插揣冒搭撞南墙扩核支攻敢雷攀敬里吗需景智暇曾罪遇朽枉止况竞争辱求愈渝溶济左右袒困补爽特寂寞示弱找谢畏强疾徐痛痒冤符眠睦瞅董何厚云措活疲羞者轻玻璃祥兆禁移稂莠稳佛换答简结果盟绝缕途给谈否羁翼耐肖胫毋宁兴舒若菲莱痕迹窠臼虚衰脸兔撒鹰棺范该详讳抬泰让须眉象众赀账费灰赖奇虑训辍辨菽麦辛近送透逞徒速续逮捕遂遑违逊斧钺艰醉锈随观弃显饱脂肪使丏丐帮丒且慢末丕替桃宗王尊凉爵各图屋脊粮署录坛吾禄职胄袭君厦丗北壑桐疹损逢陵鹬丙寅戌氨腈唑纶辰酮脱氢酶醚丞丢现掉纱帽弄扯炮碗丠両丣坐存激肩臻蒂莲悖序驱丨丩丫挺杈髻鬟细介俄伊犁京尼布订普渡央委监察检查剂圈设警队斯督剩震境航舶革防托播促质版蝾螈锋研艺历残消频谱精密制造陲邮候埔坚压坜凹汇执府究邦俘摄寮彬狼岳肺肿庸英讯诊埋粒胞括控码韩暑枪枢砥澳哇牟寿甸钻探篇签缀缝继耳肯照妇埃悬璧轴柜台辣搁浅邪跑纤阮阳私囊魔丮丰姿采丱烧丳丵丶丷丸参寨朗桂瑞砂衷霞貌凤仆舰因嫌宰峰干络牌持旨祭祷簿编罚宾办丼丿乀乂乃乄仰慕盛旷留考验阔乆乇么丑麽乊湖燃乑乒乓乕乖僻忤戾离谬迕乗危肥劫除隙浪婿乙炔肠酰吡咯盐乚乛乜嘢卿玄宫尾狐龟塔嶷兄弟泉章霄钉耙乞扎哀怜恕讨乢乣乤乥乧乨乩童乪乫乭乳晕汁液瑶浆牙癌突窦罩腐胶猪酪蛋糕菌瘤乴乵乶乷乸乹乺乼乾俸冰嘉哕嚎坤妈尸垒旱枯涸俐渴潮涩煸豆燥爹瘦瘪癣瞪袋脆姜贝隆馏乿亀亁叫咕攘扔搞男砸窜蓬麻亃亄亅却亇迟典今临繁累卵奉婚聪躬巨与迁添裂副宿岁怪恶尕仑愣杆硅硫钛铀锰芑杂异钠砷胂磺琥珀舱棍簧胡茬盗浩盆贩郎腿亍洪亐互欠助勉惠操斥诿系户译亓墓碑刑铃卅渠缤纷斗米旗宪钒灯徽瘟祖拳福谷丰脏腑绑肉腌苓蕴桥铺霸颜闹判喷冈底蛙陉矿亖亘亜罕们娜桑那努哈喀弗烈曼松森杜氏杯奥琛敦戊穆圣裔汇薛孙亟亡佚虏羊牢奋释卷卸契媾感额睫缠谊趾塞挤纽阻还配驰庄亨洛祚亪享津沪畿郊慈菴枇杷膏亭阁锃丽亳亶亹诛初责翻疯偶杰丛稠妖拖寰居吸授慧蜗吞壮魅狗矛盾益渣患忧稀描猿梦暂涯畜祸缘沸搜引擎臣横纭谁混援蒸兽狮税剖亻亼亽亡什献刹邡么仂仃仄仆富怨仈仉毕昔晨壳绍仍仏仒仕宦仗欺恃腰叹叹炬梓讫施仙后琼逝仚仝仞仟悔仡佬偿填泊拓扑簇羔购顿钦佩发棻阃驭养亿儆尤借帧赈凌叙帖李柔刚沃眦睚戒讹取飨读仨仫仮著泳卧躺韶夏裁仳仵唯贤凭钓诞仿似宋佛讽伀硕盼鹅伄儅伈伉俪柯始娃迈戈坦堡帕茨萨庙玛莉莎藤
霍姆伋伍奢胥廷芳豪伎俩侍汛勒希羲雏伐憩整谟闲闲伕伙伴颐伜伝伢叔恒兹恩翰伱伲侣伶俜悧鼬伸懒缩喇叭伹伺伻伽倻辐伾似佃伫布乔妮墨佉卢佌贷劣廉昂档浓矮伞洼缓耗胸谷迷挡率龋宅沫舍疗佐贰佑占优据铧尝呢须鲁晓佗佘余坪寺瓜铳僧蒙芒陀龛哼呕坊奸孽弊揖祟茧缚誓贼佝偻瞀佟你夺赶佡佢佣佤佧贾佪佫佯佰佱洁绩酿肴佴卷佶佷佸佹佺佻佼佽佾具唤窘坏娱怒慨硬习惯聋膨胀蔓骇贵痹侀侁侂侃侄侅鸿燕侇侈糜靡侉侌妾侏儒仓鼠侐侑侔仑侘侚链侜偎傍钴循柳葫芦附価侮骂蔑侯岩截蚀局贴壶嬛宴捷携桶笺酌俣狭膝狄俅俉俊俏俎俑俓俔谚俚俛黎健呈固墒增守康箱湿祐镖镳杠盒靖膜龄俞豹猎噪孚封札筒托衍鸽剪撰稿炼厂禊练缮葺俯瞰撑冲效俳俴俵俶俷俺备俾伥倂倅储卒惶敷猝逃颉蓄崇隐倌倏忽刺蜡烛噍嚼坍扁抽毙葱楣灌灶粪背薮卖赔闭霉腾倓倔幸倘倜傥倝借箸挹浇阅倡狂倢倣値倥偬倨傲倩匡嗣冲柝珍倬倭寇猩倮倶倷倹勤赞偁偃充伪吏嗓寐惺扮拱芫茜藉虢钞偈伟晶偌宕距析滤殿疼瘫注颇偓偕鸭歇滞偝偟偢忘怡旺偨偩逼偫偭偯偰偱偲侦缉蹄偷减惰漏窥窃偸偺迹傀儡傅傈僳骂篱傎奎琳迪叟芭傒傔傕伧悉荒傜傞傢傣芽逼佣婢傮睨寄檄诵谣颂伛担辜弓惨蒿悼疤傺傻屄臆巢泄箧羡盖轧颓傿㑩僄僇佥僊働僎侨僔僖僚僝伪僣僤侥僦猴偾僩僬僭僮僯僰雇僵殖签静僾僿征陇儁侬儃儇侩朴薄儊儋儌儍傧儓俦侪拟尽儜儞儤儦儩汰哉寡渥裕酷儭儱罐儳儵儹傩俨儽兀臬臲鹫允勋勋宙宵帅憝彝谐嫂阋畅沛溢盈饥赫凶悍狠猛顽愚妣斩秦遣鞭耀敏荣槃泽爆碟磁秃缆辉霁卤朵娄孜烽酱勃汀箕裘钳耶蒙蕾彻兑软遭黜兎児韵媳爸兕觥兖兙兛兜售鍪肚兝兞兟兡兢兣樽殓涅睡禀籍赘泌啡肽奸幕涵涝熵疚眷稃衬讧赴焕椒歼植跏没试误猜栖窗肋袖颊兪卦撇胡岐廓轿疸枫茴珑厕秩募勺吨寓斤历亩迫筷厘最淫螺韬兮宽匪筛襄赢轭复兲诈刃堰戎痞蚁饷它冀铸冂冃円冇冉册嫁厉砺竭醮冏牧冑冓冔冕冖冗冘冞冢窄抑诬冥冫烘菇蛰冷凝坨橇淇淋炭饼砖碛窖醋雕雹霜冱冶炉艳嘲峻滩淡漠煖飕饮冼冽凃凄怆梗凅凇净凊凋敝蒙凔凛遵汞脢凞几凢処凰凯凵凶焰凸折刷纹预丧喽奔巡榜殡芙蓉租笼辑鞘萃凼锯镬刁蛮刂娩崩批拆摊掰蘖骤歧颗秒袂赃勿嘱忌磋琢肤刈羽刎讼戮舂桨艇刓刖霹雳刜创犊刡恙墅帜筵致劫劫刨昏默攸尿欲熏润薰圭删刮痧铲刱刲刳刴刵踏磅戳柏槐绣芹苋猬舟铭鹄鹜劫剁剃辫刭锉履铅克剌姻咽哨廊掠桅沿召瞻翅赵卜渺茫郭剒剔剕沥剚愎毅讷才剜剥啄采剞剟剡剣剤䌽剐肾驶黏剰袍剀紊铲剸剺剽剿劁劂札劈啪柴扳啦刘奭姥夼昫涓熙禅禹锡翔雁鹗刽刿弩柄蜻蛉劒劓劖劘劙澜篑赏矶釜晋甜薪逐劦熔纣虐赤囚劬劭労劵效劻劼劾峭艮勅勇励勍勐腊脖庞漫饲荡粥辄勖勗勘骄馁碌泮雇捐竹骑殊阱绩朴恳谨剿勧勩勯勰劢勋勷劝惩慰诫谏勹芡践阑匁庇拯粟扎袱裹饺匆遽匈匉匊匋匍匐茎匏匕妆痰脓蛹斋苑烤蹈塘羌熊阀螳螂疆碚竿纬荷茵邙魏匚匜匝匟扶稷匣匦拢匸匹耦匽匾匿卂叮疮禧轸堤棚迢钧炼卄卆遐卉瓷盲瓶当胱腱裸卋卌卍卐怯污贱鄙龌龊陋卓溪唐梯渔陈枣泥漳浔涧梨芬谯赡辕迦郑単驴弈洽鳌卛占筮卝卞卟吩啉屎翠厄卣卨卪卬卮榫袄玺绶钮蚤惧殆笃耸卲帘帙绕恤卼卽厂厎厓厔厖厗奚厘厍厜厝谅厕厤厥厪腻孢厮厰厳厣厹厺粕垢芜菁厼厾叁悟茸薯叄吵笄悌哺讥坫垄弧芯杠潜婴刍袁诘贪谍煽馈驳収岳缔灾贿骗叚叡吻拦蘑蜜诀燧玩砚筝椎蔺铜逗骊另觅叨唠谒杵姓喊嚷嚣咚咛塑寻恼憎擦只泣渗蝠叱吒咄咤喝籀黛舵舷叵叶铎懿昭穰苴辽叻叼吁堑嫖赌瞧爬众抒吅吆夥卺橡涤抱纵摩郡唁坠扇篮膀袜颈吋忾谘酬哭妓媛暗表缰迩妃羿絮蕃浑拐葵暮隅吔吖啶嗪戚吜啬噬咽吟哦咏吠吧唧嗒咐吪隽咀征燐苞茹钙哧吮吰吱嘎吲哚吴栋娇窟孟箫忠晗淞阖闾趼宇呐睛嘘拂捧疵熄竽笛糠吼吽呀吕韦蒙呃呆笨呇贡呉罄呋喃呎呏呔呠呡痴呣呤呦呧瑛眩扒晬淑姬瑜璇鹃呪呫哔嚅嗫呬呯呰呱呲咧噌钝呴呶呷呸呺呻哱咻啸噜吁坎坷逻呿咁咂咆哮咇咈咋蟹煦珅蔼咍咑咒诅咔哒嚓咾哝哩喱咗咠咡咢咣咥咦咨嗟询咩咪咫啮啮咭咮咱咲咳呛嗽咴啕咸咹咺呙喉咿婉恸悯赋矜绿茗蓝哂抢瞒哆嗦啰噻啾滨彗哋哌哎唷哟哏哐哞哢哤哪里哫啼喘哰哲萎蚌哳咩哽哿呗唅唆唈唉唎唏哗尧棣殇璜睿肃唔睇唕吣唞唣喳唪唬唰喏唲唳唵嘛唶唸唹唻唼唾唿啁啃鹦鹉啅埠栈榷祺铺鞅飙啊啍啎啐啓啕啖啗啜哑祈啢衔啤啥啫啱啲啵啺饥啽噶昆沁喁喂喆裙喈咙喋喌喎喑喒喓喔粗喙幛庆滋鹊喟喣喤喥喦喧骚喨喩梆吃葡萄喭驼挑吓碰枞瓣纯疱藻趟铬喵営喹喺喼喿嗀嗃嗄嗅嗈嗉嗊嗍嗐嗑嗔诟嗕嗖嗙嗛嗜痂癖嗝嗡嗤嗥嗨唢嗬嗯嗰嗲嗵叽嗷嗹嗾嗿嘀嘁嘂嘅惋嘈峪禾荫啀嘌嘏嘐嘒啯啧嘚唛嘞嘟囔嘣嘥嘦嘧嘬嘭这谑严敞馋松哓嘶嗥呒虾嘹嘻啴嘿噀噂噅噇噉噎噏噔噗噘噙噚咝噞噢噤蝉皿噩噫噭嗳噱哙噳嚏涌洒欲巫霏噷噼嚃嚄嚆抖哜尝嚔苏嚚嚜嚞嚟呖嚬嚭嚮嚯亸喾饬按竣苛嚵嘤啭冁呓膪谦囍囒囓囗囘萧酚飘溅谛囝溯眸纥銮鹘囟殉囡団囤囥囧囨囱囫囵囬囮囯囲図囶囷囸囹圄圉拟囻囿圀圂圃圊粹蠹赦圌垦圏滚鲱凿枘圕圛圜圞坯埂壤骸炕祠窑豚绅魠鲮鳖圧握圩圪垯圬圮圯炸岬幔毯祇窨菩溉圳圴圻圾坂坆沾坋坌舛壈昆垫墩椅坒坓坩埚坭坰坱坳坴坵坻坼杨挣涎帘垃垈垌垍垓垔垕垗垚垛垝垣垞垟垤垧垮垵垺垾垿埀畔埄埆埇埈埌殃隍埏埒埕埗埜垭埤埦埧埭埯埰埲埳埴埵埶绋埸培怖桩础辅埼埽堀诃侄庑堃堄摧磐贞韧砌堈堉垩堋堌堍堎垴堙堞堠礁堧堨舆堭堮蜓摘堲堳堽堿塁塄塈煤茔棵塍垲埘塓绸塕鸦沽虱塙冢塝缪塡坞埙塥塩塬塱场螨塼塽塾塿墀墁墈墉墐夯増毁墝墠墦渍钵墫墬堕墰墺墙橱壅壆壊壌壎壒榨蒜壔壕壖圹垆壜壝垅壡壬壭壱売壴壹壻壸寝壿夂夅夆変夊夌漱邑夓腕泄甥御骼夗夘夙衮瑙妊娠醣枭珊莺鹭戗幻魇夤蹀秘擂鸫姚宛闺屿庾挞拇賛蛤裨菠氅漓捞湄蚊霆鲨箐篆篷荆肆舅荔鲆巷惭骰辟邱镕镰阪漂烩鲵鲽鳄鸨胪鹏妒峨谭枰晏玑癸祝秤竺牡籁恢罡蝼蝎赐绒御梭夬夭砣榆怙枕夶夹馅奄崛葩谲奈贺祀赠奌奂奓奕䜣詝奘奜奠奡奣陶奨奁魁奫奬奰娲孩贬隶酥宄狡猾她姹嫣妁毡荼皋膻蝇嫔妄妍嫉媚娆妗趣妚妞妤碍妬娅妯娌妲妳妵妺姁姅姉姗姒姘姙姜姝姞姣姤姧姫姮娥姱姸姺姽婀娀诱慑胁娉婷娑娓娟娣娭娯娵娶娸娼婊婐婕婞婤婥溪孺婧婪婬婹婺婼婽媁媄媊媕媞媟媠媢媬媮妫媲媵媸媺媻媪眯媿嫄嫈袅嫏嫕妪嫘嫚嫜嫠嫡嫦嫩嫪毐嫫嫬嫰妩嫺娴嫽嫿妫嬃嬅嬉耍婵痴艳嬔嬖嬗嫱袅嫒嬢嬷嬦嬬嬭幼嬲嬴婶嬹嬾嬿孀娘孅娈孏曰癫屏孑孓雀孖斟篓谜摺孛矻鸠崮轲祜鸾孥邈毓棠膑孬孭孰孱孳孵泛罔衔孻孪宀㝉冗拙株薇掣抚琪瓿榴谧弥宊濂祁瑕宍宏碁宓邸谳実潢町宥宧宨宬徵崎骏掖阙臊煮禽蚕宸豫寀寁寥寃檐庶寎暄碜寔寖寘寙寛寠苫寤肘洱滥蒗陕核寪弘绰螽宝擅疙瘩晷対檐専尃尅赎绌缭畴衅尌峙醌襟痲碧屁昊槌淘恵瀑牝畑莓缸羚觑蔻脏躁尔尓锐尗尙尜尟尢尥尨尪尬尭尰擒尲尶尴尸尹潽蠖蛾尻扣梢蚴鳍脬蹲屇屌蚵屐屃挪屖屘屙屛屝屡屣峦嶂岩舄屧屦屩屪屃屮戍驻钾崖嵛巅旮旯楂榄榉芋茱萸靛麓屴屹屺屼岀岊岌岍阜岑彭巩岒岝岢岚岣岧岨岫岱岵岷峁峇峋峒峓峞峠嵋峨峰峱岘峹峿崀崁崆祯崋崌崃岖昆崒崔嵬巍萤颢崚崞崟崠峥巆崤崦崧殂岽崱崳崴崶崿嵂嵇嵊泗嵌嵎嵒嵓岁嵙嵞嵡嵩嵫嵯嵴嵼嵾嵝崭崭晴嶋嶌嶒嶓嵚崂嶙嶝嶞峤嶡嶢峄嶨嶭嶮嶰嶲岙嵘巂巃巇巉岿巌巓巘巛滇芎巟巠弋回巣巤炊擘蜥蟒蛊觋巰蜀彦淖杏茂甫楞巻巽帼巿帛斐鲫蕊帑帔帗帚琉汶帟帡帣帨裙帯帰帷帹暆帏幄帮幋幌幏帻幙帮幞幠幡幢幦幨幩幪帱幭幯幰遥蹉跎馀庚鉴幵幷稚邃庀庁広庄庈庉笠庋跋庖牺庠庤庥鲸庬庱庳庴庵馨衢庹庿廃厩廆廋廌廎廏廐廑廒荫廖廛厮搏锣廞弛袤廥廧廨廪廱绵踵髓廸迫瓯邺廻廼廾廿躔弁皱弇弌弍弎弐弑吊诡憾荐弝弢弣弤弨弭弮弰弪霖繇焘斌旭溥骞弶弸弼弾彀彄别累纠强彔彖彘彟彟陌彤贻彧绘虹彪炳雕蔚鸥彰瘅彲彳彴仿彷徉徨彸彽踩敛旆徂徇徊渭畲铉裼従筌徘徙徜徕膳苏萌渐徬徭醺徯徳徴潘徻徼忀瘁胖燎怦悸颤扉犀澎湃砰恍惚绞隘忉惮挨饿忐忑忒忖応忝忞耿忡忪忭忮忱忸怩忻悠懑怏遏怔怗怚怛怞怼黍讶怫怭懦怱怲恍怵惕怸怹恁恂恇恉恌恏恒恓恔恘恚恛恝恞恟恠恣恧眄恪恫恬澹恰恿悀悁悃悄悆悊悐悒晦悚悛悜悝悤您悩悪悮悰悱凄恻德悴怅惘闷悻悾惄愫钟蒐惆惇惌惎惏惓惔惙惛耄惝疟浊恿惦德恽惴蠢惸拈愀愃愆愈愊愍愐愑愒愓愔愕恪氓蠢騃昵惬赧悫愬愮愯恺愼慁恿慅慆慇霭慉慊愠慝慥怄怂慬慱悭慴慵慷戚焚憀灼郁憃惫憋憍眺捏轼愦憔憖憙憧憬憨憪憭怃憯憷憸憹憺懃懅懆邀懊懋怿懔懐懞懠懤懥恹懫懮懰懱毖懵遁梁雍忏懽戁戄戆戉戋戕戛戝戛戠戡戢戣戤戥戦戬戭戯轰戱披菊牖戸戹戺戻卯戽锹扂楔扃扆扈扊杖牵绢铐镯赉扐搂搅烊盹瞌跟趸镲靶鼾払扗玫腮扛扞扠扡扢盔押扤扦扱罾揄绥鞍郤窾扻扼扽抃抆抈抉抌抏瞎抔缳缢擞抜拗択抨摔歉蹿牾抶抻搐泵菸拃拄拊髀抛拌脯拎拏拑擢秧沓曳挛迂拚拝拠拡拫拭拮踢拴拶拷攒拽掇芥橐簪摹疔挈瓢骥捺蹻挌挍挎挐拣挓挖掘浚挙揍聩挲挶挟挿捂捃捄捅捆捉捋胳膊揎捌捍捎躯蛛捗捘捙捜捥捩扪捭据捱捻捼捽掀掂抡臀膘掊掎掏掐笙掔掗掞棉芍掤搪阐掫掮掯揉掱掲掽掾揃揅揆搓揌诨揕揗揘揜揝揞揠揥揩揪揫橥遒麈揰揲揵揶揸背揺搆搉搊搋搌搎搔搕撼橹捣搘搠搡搢搣搤搥搦搧搨搬楦裢讪赸掏搰搲搳搴揾搷搽搾搿摀摁摂摃摎掴摒摓跤摙摛掼摞摠摦喉羯摭摮挚摰摲抠摴抟摷掺摽撂撃撅稻撊撋挦锏泼撕撙撚㧑挢撢掸撦撅撩撬撱朔揿蚍蜉挝捡擀掳闯擉缶觚擐擕擖擗擡擣擤澡腚擧擨擩擫擭摈
拧撷擸撸擽擿攃摅撵攉攥攐攓撄搀撺每攩攫辔澄攮攰攲攴轶攷砭讦攽碘敁敃敇敉叙敎筏敔敕敖闰诲敜煌敧敪敳敹敺敻敿斁衽斄牒绉诌斉斎斓鹑谰驳鳢斒筲斛斝斞斠斡斢斨斫斮晾沂潟颖绛邵斲斸釳於琅斾斿旀旗旃旄涡旌旎旐旒旓旖旛旝旟旡旣浴旰獭魃旴时旻旼旽昀昃昄昇昉晰躲澈熹皎皓矾昑昕昜昝昞昡昤晖笋昦昨是昱昳昴昶昺昻晁蹇隧蔬髦晄晅晒晛晜晞晟晡晢晤晥曦晩萘莹顗晿暁暋暌暍暐暔暕煅旸暝暠暡曚暦暨暪朦胧昵暲殄冯暵暸暹暻暾曀晔昙曈曌曏曐暧曘曙曛叠昽曩骆曱甴肱曷牍禺锟曽沧耽朁朅朆杪栓夸竟粘绦朊膺朏朐朓朕朘朙瞄觐溘饔飧朠朢朣栅椆淀虱朩朮朰朱炆璋钰炽鹮朳槿朵朾朿杅杇杌陧欣钊湛漼楷瀍煜玟缨翱肇舜贽适逵杓杕杗杙荀蘅杝杞脩珓筊杰榔狍閦颦缅莞杲杳眇杴杶杸杻杼枋枌枒枓衾葄翘纾逋枙狸桠枟槁枲枳枴枵枷枸橼枹枻柁柂柃柅柈柊柎某柑橘柒柘柙柚柜柞栎柟柢柣柤柩柬柮柰柲橙柶柷柸柺査柿栃栄栒栔栘栝栟柏栩栫栭栱栲栳栴檀栵栻桀骜桁镁桄桉桋桎梏椹葚桓桔桕桜桟桫椤桭杯桯桲桴桷桹湘溟梃梊梍梐潼栀枧梜梠梡梣梧梩梱梲梳梴梵梹棁棃樱棐棑棕榈簑绷蓑枨棘棜棨棩棪棫棬棯棰棱棳棸棹椁棼碗椄苕椈椊椋椌椐椑椓椗検椤椪椰椳椴椵椷椸椽椿楀匾楅篪楋楍楎楗楘楙楛楝楟楠楢楥桢楩楪楫楬楮楯楰梅楸楹楻楽榀榃榊榎槺榕榖榘榛狉莽搒笞榠榡榤榥榦榧杩榭榰榱梿霰榼榾桤槊闩槎槑槔槖様槜槢槥椠槪槭椮槱槲槻槼槾樆樊樏樑樕樗樘樛樟樠樧樨権樲樴樵猢狲桦樻罍樾樿橁橄橆桡笥龠橕橚橛辆椭橤橧竖膈跨橾橿檩檃檇柽檍檎檑檖檗桧槚檠樯檨檫檬梼槟檴檵柠棹櫆櫌栉櫜椟櫡槠栌枥榇栊櫹棂茄櫽欀欂欃欐欑栾欙棂溴欨欬欱欵欶欷歔欸欹欻欼欿歁歃歆艎歈歊莳蝶歓歕歘歙歛歜欤歠蹦诠镶蹒跚升陟歩歮歯歰歳歴璞歺瞑歾殁夭殈殍殑殗殜殙殛殒殢殣殥殪殚僵殰殳荃殷殸殹蛟殻肴谤殴毈毉喂毎毑蕈毗毘毚茛邓毧毬毳毷毹毽毾毵牦氄氆靴氉氊氇氍氐聊氕氖気氘氙氚氛氜氝氡汹焊痉氤氲氥氦铝锌氪烃氩铵痤汪浒漉痘盂碾菖蒲蕹蛭螅氵冰氹氺氽烫氾氿渚汆汊汋汍汎汏汐汔汕褟汙汚汜蓠沼秽蔑汧汨汩汭汲汳汴堤汾沄沅沆瀣沇沈葆浸沦湎溺痼疴沌沍沏沐沔沕沘浜畹砾沚沢沬沭沮沰沱灢沴沷籽沺烹濡洄泂肛泅泆涌肓泐泑泒泓泔泖泙泚泜泝泠漩馍涛粼泞藓鳅泩泫泭泯铢泱泲洇洊泾琵琶荽蓟箔洌洎洏洑潄濯洙洚洟洢洣洧洨洩痢滔洫洮洳洴洵洸洹洺洼洿淌蜚浄浉浙赣渫浠浡浤浥淼瀚浬浭翩萍浯浰蜃淀苔蛞蝓蜇螵蛸煲鲤浃浼浽溦涂涊涐涑涒涔滂莅涘涙涪涫涬涮涴涶涷涿淄淅淆淊凄黯淓淙涟淜淝淟淠淢淤渌淦淩猥藿亵淬淮淯淰淳诣涞纺淸淹炖癯绮渇済渉渋渓渕涣渟渢滓渤澥渧渨渮渰渲渶渼湅湉湋湍湑湓湔黔湜湝浈湟湢湣湩湫湮麟湱湲湴涅満沩溍溎溏溛舐漭溠溤溧驯溮溱溲溳溵溷溻溼溽溾滁滃滉滊荥滏稽滕滘汇滝滫滮羼耷卤滹浐煎漈漊漎绎漕漖漘漙沤漜漪漾漥漦漯漰溆漶漷濞潀颍潎潏潕潗潚潝潞潠潦祉疡潲潵滗潸潺潾涠澁澂澃澉澌澍澐澒澔澙渑澣澦澧澨澫澬浍澰澴澶澼熏郁濆濇濈濉濊貊濔疣濜濠濩觞浚濮盥潍濲泺瀁滢渎渖瀌浏瀒瀔濒泸瀛潇潆瀡潴泷濑瀬弥潋瀳瀵瀹瀺瀼沣滠灉灋灒漓灖灏灞灠滦灥灨滟灪蜴灮烬獴灴灸灺炁炅鱿炗炘炙炤炫疽烙钎炯炰炱炲炴炷毁炻烀烋瘴鲳烓烔焙烜烝烳饪烺焃焄耆焌焐焓焗焜焞焠焢焮焯焱焼煁煃煆煇煊熠煍熬煐炜煕暖熏硷霾煚煝煟煠茕矸煨琐炀萁煳煺煻熀熅熇熉罴荧穹炝熘熛熜稔谙烁熤熨熯熰眶蚂颎熳熸熿燀烨燂燄盏燊燋燏燔隼燖焖燠燡灿燨燮燹燻燽燿爇爊爓爚爝爟爨蟾爯爰为爻丬爿牀牁牂牄牋窗牏牓窗釉牚腩蒡虻牠虽蛎牣牤牮牯牲牳牴牷牸牼绊牿靬犂犄犆犇犉犍犎犒荦犗犛犟犠犨犩犪犮犰狳犴犵犺狁甩狃狆狎狒獾狘狙黠狨狩狫狴狷狺狻豕狈蜘猁猇猈猊猋猓猖獗猗猘狰狞犸猞猟獕猭猱猲猳猷猸猹猺玃獀獃獉獍獏獐獒毙獙獚獜獝獞獠獢獣獧鼇蹊狯猃獬豸狝獯鬻獳犷猕猡玁菟玅玆玈珉糁禛郅玍玎玓瓅玔玕玖玗玘玞玠玡玢玤玥玦珏瑰玭玳瑁玶玷玹玼珂珇珈瑚珌馐馔珔珖珙珛珞珡珣珥珧珩珪佩珶珷珺珽琀琁陨玡琇琖琚琠琤琦琨琫琬琭琮琯琰琱琲琅琴珐珲瑀瑂瑄瑉玮瑑瑔瑗瑢瑭瑱瑲瑳瑽瑾瑿璀璨璁璅璆璈琏璊璐璘璚璝璟璠璡璥瑷璩璪璫璯璲玙璸璺璿瓀璎瓖瓘瓒瓛脐瓞瓠瓤瓧瓩瓮瓰瓱瓴瓸瓻瓼甀甁甃甄甇甋甍甎甏甑甒甓甔瓮甖甗饴蔗甙诧钜粱盎锈团甡褥産甪甬甭甮宁铠甹甽甾甿畀畁畇畈畊畋畎畓畚畛畟鄂畤畦畧荻畯畳畵畷畸畽畾疃叠疋疍疎箪疐疒疕疘疝疢疥疧疳疶疿痁痄痊痌痍痏痐痒痔痗瘢痚痠痡痣痦痩痭痯痱痳痵痻痿瘀痖瘃瘈瘉瘊瘌瘏瘐痪瘕瘖瘙瘚瘛疭瘜瘝瘗瘠瘥瘨瘭瘆瘯瘰疬瘳疠瘵瘸瘺瘘瘼癃痨痫癈癎癐癔癙癜癠疖症癞蟆癪瘿痈発踔绀蔫酵皙砬砒翎翳蔹钨镴皑鹎驹暨粤褶皀皁荚皃镈皈皌皋皒朱皕皖皘皜皝皞皤皦皨皪皫皭糙绽皴皲皻皽盅盋碗盍盚盝踞盦盩秋千盬盭眦睁瞤盯盱眙裰盵盻睐眂眅眈眊県眑眕眚眛眞眢眣眭眳眴眵眹瞓眽郛睃睅睆睊睍睎困睒睖睙睟睠睢睥睪睾睯睽睾眯瞈瞋瞍逛瞏瞕瞖眍䁖瞟瞠瞢瞫瞭瞳瞵瞷瞹瞽阇瞿眬矉矍铄矔矗矙瞩矞矟矠矣矧矬矫矰矱硪碇磙罅舫阡、矼矽礓砃砅砆砉砍砑砕砝砟砠砢砦砧砩砫砮砳艏砵砹砼硇硌硍硎硏硐硒硜硖砗磲茚钡硭硻硾碃碉碏碣碓碔碞碡碪碫碬砀碯碲砜碻礴磈磉磎硙磔磕磖磛磟磠磡磤磥蹭磪磬磴磵磹磻硗礀硚礅礌礐礚礜礞礤礧礮砻礲礵礽礿祂祄祅祆禳祊祍祏祓祔祕祗祘祛祧祫祲祻祼饵脔锢禂禇禋祦禔祎隋禖禘禚禜禝禠祃禢禤禥禨禫祢禴禸秆秈秊闱飒秋秏秕笈蘵赁秠秣秪秫秬秭秷秸稊稌稍稑稗稙稛稞稬秸稲稹稼颡稿穂穄穇穈穉穋稣贮穏穜穟秾穑穣穤穧穨穭穮穵穸窿阒窀窂窅窆窈窕窊窋窌窒窗窔窞窣窬黩蹙窑窳窴窵窭窸窗竁竃竈竑竜并竦竖篦篾笆鲛竾笉笊笎笏笐靥笓笤箓笪笫笭笮笰笱笲笳笵笸笻筀筅筇筈筎筑筘筠筤筥筦笕筒筭箸筰筱筳筴宴筸箂个箊箎箑箒箘箙箛箜篌箝箠箬镞箯箴箾篁筼筜篘篙篚篛篜篝篟篠篡篢篥篧篨篭篰篲筚篴篶篹篼箦簁簃簆簉簋簌簏簜簟簠簥簦簨簬簰簸簻籊藤籒籓籔签籚篯箨籣籥籧笾簖籫籯芾麴籵籸籹籼粁秕粋粑粔粝粛粞粢粧粨粲粳稗粻粽辟粿糅糆糈糌糍糒糔萼糗蛆蹋糢糨糬粽糯糱籴粜糸糺紃蹼鲣霉纡纨绔纫闽襻紑纰纮锭鸢鹞纴紞紟扎紩紬绂绁纻紽紾绐絁絃絅経絍绗絏缡褵絓絖絘絜绚絣螯絪絫聒絰絵绝絺絻絿綀绡綅绠绨绣綌綍綎捆綖綘継続缎绻綦綪线綮綯绾罟蝽綷縩绺绫緁绲緅緆缁绯緌緎総緑绱緖缃缄缂绵缗緤褓缌纂緪緰缑缈缏缇縁縃縄萦缙缒縏缣縕缞縚缜缟缛縠縡縢縦绦縯縰骋缧縳纤缦絷缥縻衙縿繄缫繈繊繋繐缯繖繘繙繠缋繣繨缰缲繸繻缱纁纆纇缬缵纩纑纕缵纙纚纛缾罃罆坛罋罂罎罏罖罘罛罝罠罣罥罦罨罫罭锾罳罶罹罻罽罿羂羃羇芈蕉51鸵羑羖羌羜羝羢羣羟羧羭羮羰羱羵羶羸藜鲐翀翃翅翊翌翏翕翛翟翡翣翥翦跹翪翫翚翮翯翱翽翾翿板饕鸹锨耋耇耎耏专耒耜耔耞耡耤耨耩耪耧耰鬓耵聍聃聆聎聝聡聦聱聴聂聼阈聿肄肏肐肕腋肙肜肟肧胛肫肬肭肰肴肵肸肼胊胍胏胑胔胗胙胝胠铨胤胦胩胬胭胯胰胲胴胹胻胼胾脇脘脝脞脡脣脤脥脧脰脲脳腆腊腌臜腍腒腓胨腜腠脶腥腧腬腯踝蹬镣腴腶蠕诽膂腽嗉膇膋膔腘膗膙膟黐膣膦膫膰膴膵膷脍臃臄臇臈臌臐臑臓膘臖臙臛臝臞臧蓐诩臽臾臿舀舁鳑鲏舋舎舔舗馆舝舠舡舢舨舭舲舳舴舸舺艁艄艅艉艋艑艕艖艗艘艚艜艟艣舣艨艩舻艬艭荏艴艳艸艹艻艿芃芄芊萰陂藭芏芔芘芚蕙芟芣芤茉芧芨芩芪芮芰鲢芴芷芸荛豢芼芿苄苒苘苙苜蓿苠苡苣荬苤苎苪镑苶苹苺苻苾茀茁范蠡萣茆茇茈茌茍茖茞茠茢茥茦菰茭茯茳藨茷藘茼荁荄荅荇荈菅蜢鸮荍荑荘豆荵荸荠莆莒莔莕莘莙莚莛莜莝莦莨菪莩莪莭莰莿菀菆菉菎菏菐菑菓菔芲菘菝菡菢菣菥蓂菧菫毂蓥菶菷菹醢菺菻菼菾萅萆苌萋萏萐萑萜萩萱萴莴扁萻葇葍葎葑荭葖葙葠葥苇葧葭药葳葴葶葸葹葽蒄蒎莼茏薹莅蒟蒻蒢蒦蒨蒭藁蒯蒱鉾蒴蒹蒺蒽荪蓁蓆蓇蓊蓌蓍蓏蓓蓖蓧蓪蓫荜跣藕苁蓰蓱莼蓷蓺蓼蔀蔂蔃蔆蔇蔉蔊蔋蔌蔎蔕蔘蔙蒌蔟锷蒋雯茑蔯蔳麻蔵蔸蔾荨蒇蕋蕍荞蕐蕑芸莸蕖蕗蕝蕞蕠蕡蒉蕣蕤蕨蕳蓣蕸蕺蕻薀薁薃薅薆荟薉芗薏薐蔷薖薘剃谔钗薜薠薢薤薧薨薫薬薳薶薷薸薽薾薿藄藇藋荩藐藙藚藟藦藳藴苈藷藾蘀蘁蕲苹蘗蘘蘝蘤蘧蘩蘸蘼虀虆虍蟠虒虓虖虡虣虥虩虬虰蛵蛇虷鳟虺虼蚆蚈蚋蚓蚔蚖蚘蚜蚡蚣蚧蚨蚩蚪蚯蚰蜒蚱蚳蚶蚹蚺蚻蚿蛀蛁蛄蛅蝮蛌蛍蛐蟮蛑蛓蛔蛘蛚蛜蛡蛣蜊蛩蛱蜕螫蜅蚬蜈蝣蜋蜍蜎蜑蠊蜛饯蜞蜣蜨蜩蜮蜱蜷蜺蜾蜿蝀蝃蝋蝌蝍蝎蝏蝗蝘蝙蝝鲼蝡蝤蝥猿蝰虻蝲蝴蝻螃蠏蛳螉螋螒螓螗螘螙螚蟥螟螣螥螬螭䗖螾螀蟀蟅蝈蟊蟋蟑蟓蟛蟜蟟蟢虮蟨蟪蟭蛲蟳蛏蟷蟺蟿蠁蠂蠃虿蠋蛴蠓蚝蠗蠙蠚蠛蠜蠧蟏蠩蜂蠮蠰蠲蠵蠸蠼蠽衁衄衄衇衈衉衋衎衒同衖胡衞裳钩衭衲衵衹衺衿袈裟袗袚袟袢袪袮袲袴袷袺袼褙袽裀裉袅裋夹裍裎裒裛裯裱裲裴裾褀褂褉褊裈褎褐褒褓褔褕袆褚褡褢褦褧褪褫袅褯褰褱裆褛褽褾襁褒襆裥襉襋襌襏襚襛襜裣襞襡襢褴襦襫襬襭襮襕襶襼襽襾覂覃覅霸覉覊覌覗觇覚覜觍觎覧覩觊觏覰観觌觔觕觖觜觽觝觡酲觩觫觭觱觳觯觷觼觾觿言赅讣訇訏訑訒诂讬訧訬訳訹证訾詀詅诋毁詈詊讵詑诒诐詗诎察詨诜詶詸詹詻诙诖誂誃诔锄诓誋诳诶悖誙诮诰誧説読誯谇訚谄谆諆諌诤诹诼諕谂谀諝谝諟喧谥諴諵谌谖誊謆謇歌謍謏謑谡谥謡謦謪谪讴謷謼谩哗譅譆譈譊讹譒撰谮鑫譞噪譩谵譬譱譲谴譸譹谫讅讆詟䜩雠讐谗谶讙谠讟谽豁豉豇岂豊豋豌豏豔豞豖豗豜豝豣豦豨豭豱豳豵豶豷豺豻貅貆狸猊貔貘䝙貜貤餍贳餸贶贲赂賏赊赇赒賝赓赕賨赍斗賮賵
賸赚赙赜赟贉赆赑贕赝赬赭赱赳迄趁趂趄趐趑趒趔趡趦趫趮趯趱趴趵趷趹趺趿跁跂跅跆踬跄跐跕跖跗跙跛跦跧跩跫跬跮跱跲跴跺跼跽踅踆踈踉踊踒踖踘踜踟躇蹰踠踡踣踤踥踦踧跷踫踮逾踱踊踶踹踺踼踽躞蹁蹂躏蹎蹐蹓蹔跸蹚蹜蹝迹蹠蹡蹢跶蹧蹩蹪蹯鞠蹽躃躄躅踌跻躐踯跞躘躙躗躝躠蹑躜躧躩躭躰躬躶軃軆辊軏轫軘軜軝腭転軥軨軭軱轱辘軷轵轺軽軿輀輂辇辂辁輈挽輗辄辎辋輠輤輬輭輮辏輴輵輶輹輼辗辒轇轏轑轒辚轕轖轗轘轙轝轞轹轳罪辣辞辵辶辺込辿迅迋迍麿迓迣迤逦迥迨迮迸迺迻迿逄逅逌逍逑逓迳逖逡逭逯逴逶逹遄遅侦遘遛遝遢遨遫遯遰遴绕遹遻邂邅邉邋邎邕邗邘邛邠邢邧邨邯郸邰邲邳邴邶邷邽邾邿郃郄郇郈郔郕郗郙郚郜郝郞郏郠郢郪郫郯郰郲郳郴郷郹郾郿鄀鄄郓鄇鄈鄋鄍鄎鄏鄐鄑邹邬鄕郧鄗鄘鄚鄜鄞鄠鄢鄣鄤鄦鄩鄫鄬鄮鄯鄱郐鄷鄹邝鄻鄾鄿酃酅酆酇郦酊酋酎酏酐酣酔酕醄酖酗酞酡酢酤酩酴酹酺醁醅醆醊醍醐醑醓醖醝酝醡醤醨醪醭醯醰酦醲醴醵醸醹醼醽醾釂酾酽釆釈鲈镏阊钆钇钌钯钋鼢鼹钐钏釪釬釭釱钍釸钕钫鈃钭鈆鈇钚鈊鈌钤钣鈒鈤钬钪鈬铌铈钶铛钹铍钸钿鉄鉆铊铇鉌铋鉏铂钷铆钵鉥钲鉨钼钽鉱鉲鉶铰铒鉼铪銍銎铣銕镂铫铦铑铷銤铱铟銧铥铕铯銭銰焊銶锑锉汞鋂锒鋆鋈鋊铤鋍铗鋐鋑鋕鋘鋙锊锓锔锇铓鋭铖锆锂铽鋳鋹鋺鉴镚钎錀锞锖锫锩錍铔锕錔锱铮锛錞锬锜錤錩錬録铼錼锝钔锴鍉镀鍏鍐铡鍚锻锽锸锲锘鍫鍭鍱鍴锶鍹锗针锺锿镅鎉鎋鎌鎍鎏鎒鎓鎗镉鎚鎞镃鎤铩锼鎭鎯镒镍鎴镓鎸鎹镎镟鏊镆镠镝鏖铿锵鏚镗镘镛鏠鏦錾镤鏸镪鏻鏽鏾铙鐄鐇鐏铹镦镡鐗馗镫镢镨鐡锎镄鐩镌鐬鐱镭鐶鐻鐽镱鑀鑅镔鑐鑕鑚鑛鑢鑤镥鑪镧鑯鑱鑴鑵镊镢钃镻闫闬闶闳閒闵閗閟阂関合閤哄阆閲阉閺阎阏阍阌暗闉阕阗闑闒闿闘闚阚闟闠闤闼阞阢阤阨阬阯阹阼阽陁陑陔陛陜陡陥陬骘陴険陼陾阴隃隈隒隗隞隠隣隤隩隮隰颧隳隷隹雂雈雉雊雎雑雒雗雘雚雝雟雩雰雱驿霂霅霈霊沾霒霓霙霝霢霣霤霨霩霪霫霮靁叇叆靑靓靣腼靪靮靰靳靷靸靺靼靿鞀鞃鞄鞍鞗鞙鞚鞝鞞鞡鞣鞨鞫鞬鞮鞶鞹鞾鞑韅鞯驮韍韎韔韖韘韝韫韡韣韭韭韱韹韺頀刮頄顸顼頍颀颃颁頖頞頠頫頬颅頯頲颕頼悴顋顑颙颛颜顕顚顜颟顣颥颞飐飑台飓颸飏飖颽颾颿飀飂飚飌翻飡飣饲飥饨饫飮飧飶餀餂饸饹餇餈饽哺馂餖餗餚馄馃餟餠餤餧餩餪餫糊餮糇餲饧馎糕饩馈馊馌馒饇馑馓膳饎饐饘饟馕馘馥馝馡馣骝骡馵馹駃駄駅駆駉駋驽駓驵駗骀驸駜骂骈駪駬骃駴骎駹駽駾騂騄骓騆騉騋骒骐麟騑騒験騕骛騠騢騣騤騧骧騵驺骟騺蓦骖骠骢驆驈骅驌骁驎骣驒驔驖驙驦驩驫骺鲠骫骭肮骱骴骶骷髅骾髁髂髄髆膀髇髑髌髋髙髝髞髟髡髣髧髪髫髭髯髲髳髹髺髽髾鬁鬃鬅鬈鬋鬎鬏鬐鬑鬒鬖鬗鬘鬙鬠鬣斗鬫鬬阄鬯鬰鬲鬵鬷魆魈魊魋魍魉魑魖鳔魛魟魣魦魨魬鲂魵魸鮀鲅鮆鲧鲇鲍鲋鮓鲒鲕鮟鱇鮠鮦鮨鲔鲑鮶鮸鮿鲧鯄鯆鲩鯈鲻鯕鲭鲞鯙鯠鲲鯥鲰鲶鳀鯸鳊鲗䲠鹣鳇鰋鳄鳆鰕鰛鰜鲥鰤鳏鰦鳎鳐鳁鳓鰶鲦鲡鰼鰽鱀鱄鳙鱆鳕鱎鱐鳝鳝鳜鲟鲎鱠鳣鱨鲚鱮鱲鱵鱻鲅鳦凫鳯鳲鳷鳻鴂鴃鴄鸩鴈鴎鸰鴔鴗鸳鸯鸲鹆鸱鴠鴢鸪鴥鸸鹋鴳鸻鴷鴽鵀鵁鸺鹁鵖鵙鹈鹕鹅鵟鵩鹌鵫鵵鵷鵻鹍鶂鶊鶏鶒鹙鶗鶡鶤鶦鶬鶱鹟鶵鶸鶹鹡鶿鹚鷁鷃鷄鷇䴘䴘鷊鷏鹧鷕鹥鸷鷞鷟鸶鹪鹩鷩鷫鷭鹇鹇鸴鷾䴙鸂鸇䴙鸏鸑鸒鸓鸬鹳鸜鹂鹸咸鹾麀麂麃麄麇麋麌麐麑麒麚麛麝麤麸面麫麮麯麰麺麾黁黈黉黢黒黓黕黙黝黟黥黦黧黮黰黱黪黶黹黻黼黾鼋鼂鼃鼅鼈鼍鼏鼐鼒冬鼖鼙鼚鼛鼡鼩鼱鼪鼫鼯鼷鼽齁齆齇齈齉齌赍齑龀齕齗龅齚龇齞龃龉龆齢出齧齩齮齯齰齱齵齾厐龑龒龚龖龘龝龡龢龤' +simplified_charcters = '制咖片型超声盘鉴定仔点他命书歌粉巾字帐恤手指记忆棒形转弯沟光○〇㐄㐅㐆㐌㐖毒㐜㐡㐤㐰㐺㑇㑳㒳㒸㔾㗂㗎㝵㞎㞙㞞以㢲㢴㤅㥁㥯㨗㫺㬎㮎㮚㮸㲋㲱㲾㳮涧㵪㶸㷖㷭㹢㹴犬㺢狓㺵碗㽮㿝䍃䔢䖟䖸䗈䗥䗪䝓射䥯䦉䯝鲃鱼䲔䳗鹅䵹鼄䶑一对应映射丁不识下儿子做二休世丘之貉并中台原则串为甚谓干净了百事无成八变五十些人得道鸡升天代如并来去个国政策劲幽灵在欧洲游荡接样萝卜坑侧化传价元论醇共再准刀两断切分耕耘收获钱货物向看旧就绪险刻千金动劳永逸匙零夜半卡通回复返影踪反常态口咬气句话同吐快吹周味呼诺呜品红锅哄而散起唱和问三知生熟团漆黑火糟堆场空块面塌糊涂尘染壁厢夔已足多情露水大早到晚夫妻当关万莫开失古恨套所料既往孔见提师要家主审寸阴难买斗牛小撮部阵局展身层巴掌帆风顺席地带过年计于春头载四季期被蛇怕井绳度愿式份弹顷深前律径心意念差愁孤行俱全房厅交遮打技长把抓死拿眼泪鼻涕钥锁折段抿拍即合扫排掬挥拨拥上入击洞掷揽改故辙败文值名斑方面旁族日秋餐隔雅里终父旦时晌会霎间晃暴寒曝更月望垠际朝夕本正经利杯羹东西板枝独秀根筋杆进条龙服务概模次函数又性程总付步脚印趋登毛拔呵氧氮碳决雌雄波未平派谎言流清楚白准溜烟潭有获闻是处降琴鹤甲病发可拾沙目然了直以相眨穿睹瞥瞬矢的解石鸟神教秉虔诚秘种窝蜂穷窍笑置笔苟勾销抹杀煞等奖箍节吃箭仇双雕诗筹箩筐系列纸级士官统丝毫挂维网尽线微吭响股脑胎脉承腔臂力致效资源址器举功投般说讲规贸易叶障着慎满皆输号木电池衣倾钟高低视仁觉醒览遗角银币触溃九鼎蔽抄出驷马追重语破贫洗贯走路安蹴至几蹶振跃役胆汗较辈轮辞赞退六连遍递边针血锤音错门思闪真倒项栽雾类保护川先惊乍体哄鳞爪鸣滴泡邻域党专鼓作齐炒丑烯亥克内酯冬加奴卯肝炎基尺梁街裤镐客宠庭巳汝昌烷玲磊糖肇酉醛啷青县韪良香骨鲷丂七集河市弦喜嘴张舌堵区工业姊妹星架构巧彩扭歪拼凑余热曜武州爷浮屠美乡老阶树荤素碎落能魄鳃鳗珠丄丅丆万俟丈尚摸母娘量管群亚虎必我堂令申件装伏位博侠义界表女墟台戏臭皮匠胜诸葛亮赛顶倍催请运算包立叉戟离疫苗土史志演围揭瓦晒夷姑婆帝村宝烂尖杉碱屉桌山岔岛由纪峡坝库镇废从德后拗汤治旬食明昧曹朋友框栏极权幂曲归依猫民氟硼氯磷铁江侗自旅法司洋浦梅园温暖湾焦班幸用田略番叠皇炮捶硝苯酸腺苷棱草镜穗跳远索锦纲聚氰胺联店胚膲爱色堇紫罗兰芝茶饭菱云虫藏藩乱叛苏亲债凳学座恐恋柱测肌腹衩锥系貂企乌跪叩军车农题迭都甘油屯奏键短阿姨陪姐只顾茅庐槽驾魂鲜鹿页其菜单乘任供势午齿汉组织吊调泻唇坡城报坟外夸将尉建筑岸岗公床扬新剑升杭林栗校楼标款汽社浣海商馆剧院钢华港机械广媒环球融第医科证券综财乐育游涨犹岭疏瘾睑确兵领导缴肢膛船艾瑟尔苍蔡虞效衫覆访诉课谕议轨述野钩限敌鞋颌颔颚饶首龈站例修凡划垂届属崽颏厨拜挫摆放旋削棋榻槛礼沉注滑营狱画确仪聘花葬诏员跌辖周达酒锚闸陷陆雨雪飞威丌于丹久乏予理评产亢卑亦乎舞己悲矩圆词害志但住佞佳便俗信票案幅翁倦伦假偏倚斜亏鬼敲停备伤脾胃仅此像俭匮免宜穴焉戴兼容许冻伯仲负彼昼皂轩轾实刊划颠卫战哥比省非好黄饰别拘束掩奶睬选择摇扰烦苦枚写协厌及格受欢迎约只估侵犯割状告或缺抗拒挽撤救药喻磨灭端倪少逆逾越避靠适吉誉吝玉含延咎歹听啻渊善谋均匀堪忍够太惹妙妥妨孕症孝术室完纳推冠积宣疑辩栗碴称屈挠屑干涉衡待很忙恶忿怎么怠急耻恭息悦惑惜惟想愉愧怍慌愤启懂懈怀材才紧招认扣抵拉舍也罢插揣冒搭撞南墙扩核支攻敢雷攀敬里吗需景智暇曾罪遇朽枉止况竞争辱求愈渝溶济左右袒困补爽特寂寞示弱找谢畏强疾徐痛痒冤符眠睦瞅董何厚云措活疲羞者轻玻璃祥兆禁移稂莠稳佛换答简结果盟绝缕途给谈否羁翼耐肖胫毋宁兴舒若菲莱痕迹窠臼虚衰脸兔撒鹰棺范该详讳抬泰让须眉象众赀账费灰赖奇虑训辍辨菽麦辛近送透逞徒速续逮捕遂遑违逊斧钺艰醉锈随观弃显饱脂肪使丏丐帮丒且慢末丕替桃宗王尊凉爵各图屋脊粮署录坛吾禄职胄袭君厦丗北壑桐疹损逢陵鹬丙寅戌氨腈唑纶辰酮脱氢酶醚丞丢现掉纱帽弄扯炮碗丠両丣坐存激肩臻蒂莲悖序驱丨丩丫挺杈髻鬟细介俄伊犁京尼布订普渡央委监察检查剂圈设警队斯督剩震境航舶革防托播促质版蝾螈锋研艺历残消频谱精密制造陲邮候埔坚压坜凹汇执府究邦俘摄寮彬狼岳肺肿庸英讯诊埋粒胞括控码韩暑枪枢砥澳哇牟寿甸钻探篇签缀缝继耳肯照妇埃悬璧轴柜台辣搁浅邪跑纤阮阳私囊魔丮丰姿采丱烧丳丵丶丷丸参寨朗桂瑞砂衷霞貌凤仆舰因嫌宰峰干络牌持旨祭祷簿编罚宾办丼丿乀乂乃乄仰慕盛旷留考验阔乆乇么丑麽乊湖燃乑乒乓乕乖僻忤戾离谬迕乗危肥劫除隙浪婿乙炔肠酰吡咯盐乚乛乜嘢卿玄宫尾狐龟塔嶷兄弟泉章霄钉耙乞扎哀怜恕讨乢乣乤乥乧乨乩童乪乫乭乳晕汁液瑶浆牙癌突窦罩腐胶猪酪蛋糕菌瘤乴乵乶乷乸乹乺乼乾俸冰嘉哕嚎坤妈尸垒旱枯涸俐渴潮涩煸豆燥爹瘦瘪癣瞪袋脆姜贝隆馏乿亀亁叫咕攘扔搞男砸窜蓬麻亃亄亅却亇迟典今临繁累卵奉婚聪躬巨与迁添裂副宿岁怪恶尕仑愣杆硅硫钛铀锰芑杂异钠砷胂磺琥珀舱棍簧胡茬盗浩盆贩郎腿亍洪亐互欠助勉惠操斥诿系户译亓墓碑刑铃卅渠缤纷斗米旗宪钒灯徽瘟祖拳福谷丰脏腑绑肉腌苓蕴桥铺霸颜闹判喷冈底蛙陉矿亖亘亜罕们娜桑那努哈喀弗烈曼松森杜氏杯奥琛敦戊穆圣裔汇薛孙亟亡佚虏羊牢奋释卷卸契媾感额睫缠谊趾塞挤纽阻还配驰庄亨洛祚亪享津沪畿郊慈菴枇杷膏亭阁锃丽亳亶亹诛初责翻疯偶杰丛稠妖拖寰居吸授慧蜗吞壮魅狗矛盾益渣患忧稀描猿梦暂涯畜祸缘沸搜引擎臣横纭谁混援蒸兽狮税剖亻亼亽亡什献刹邡么仂仃仄仆富怨仈仉毕昔晨壳绍仍仏仒仕宦仗欺恃腰
叹叹炬梓讫施仙后琼逝仚仝仞仟悔仡佬偿填泊拓扑簇羔购顿钦佩发棻阃驭养亿儆尤借帧赈凌叙帖李柔刚沃眦睚戒讹取飨读仨仫仮著泳卧躺韶夏裁仳仵唯贤凭钓诞仿似宋佛讽伀硕盼鹅伄儅伈伉俪柯始娃迈戈坦堡帕茨萨庙玛莉莎藤霍姆伋伍奢胥廷芳豪伎俩侍汛勒希羲雏伐憩整谟闲闲伕伙伴颐伜伝伢叔恒兹恩翰伱伲侣伶俜悧鼬伸懒缩喇叭伹伺伻伽倻辐伾似佃伫布乔妮墨佉卢佌贷劣廉昂档浓矮伞洼缓耗胸谷迷挡率龋宅沫舍疗佐贰佑占优据铧尝呢须鲁晓佗佘余坪寺瓜铳僧蒙芒陀龛哼呕坊奸孽弊揖祟茧缚誓贼佝偻瞀佟你夺赶佡佢佣佤佧贾佪佫佯佰佱洁绩酿肴佴卷佶佷佸佹佺佻佼佽佾具唤窘坏娱怒慨硬习惯聋膨胀蔓骇贵痹侀侁侂侃侄侅鸿燕侇侈糜靡侉侌妾侏儒仓鼠侐侑侔仑侘侚链侜偎傍钴循柳葫芦附価侮骂蔑侯岩截蚀局贴壶嬛宴捷携桶笺酌俣狭膝狄俅俉俊俏俎俑俓俔谚俚俛黎健呈固墒增守康箱湿祐镖镳杠盒靖膜龄俞豹猎噪孚封札筒托衍鸽剪撰稿炼厂禊练缮葺俯瞰撑冲效俳俴俵俶俷俺备俾伥倂倅储卒惶敷猝逃颉蓄崇隐倌倏忽刺蜡烛噍嚼坍扁抽毙葱楣灌灶粪背薮卖赔闭霉腾倓倔幸倘倜傥倝借箸挹浇阅倡狂倢倣値倥偬倨傲倩匡嗣冲柝珍倬倭寇猩倮倶倷倹勤赞偁偃充伪吏嗓寐惺扮拱芫茜藉虢钞偈伟晶偌宕距析滤殿疼瘫注颇偓偕鸭歇滞偝偟偢忘怡旺偨偩逼偫偭偯偰偱偲侦缉蹄偷减惰漏窥窃偸偺迹傀儡傅傈僳骂篱傎奎琳迪叟芭傒傔傕伧悉荒傜傞傢傣芽逼佣婢傮睨寄檄诵谣颂伛担辜弓惨蒿悼疤傺傻屄臆巢泄箧羡盖轧颓傿㑩僄僇佥僊働僎侨僔僖僚僝伪僣僤侥僦猴偾僩僬僭僮僯僰雇僵殖签静僾僿征陇儁侬儃儇侩朴薄儊儋儌儍傧儓俦侪拟尽儜儞儤儦儩汰哉寡渥裕酷儭儱罐儳儵儹傩俨儽兀臬臲鹫允勋勋宙宵帅憝彝谐嫂阋畅沛溢盈饥赫凶悍狠猛顽愚妣斩秦遣鞭耀敏荣槃泽爆碟磁秃缆辉霁卤朵娄孜烽酱勃汀箕裘钳耶蒙蕾彻兑软遭黜兎児韵媳爸兕觥兖兙兛兜售鍪肚兝兞兟兡兢兣樽殓涅睡禀籍赘泌啡肽奸幕涵涝熵疚眷稃衬讧赴焕椒歼植跏没试误猜栖窗肋袖颊兪卦撇胡岐廓轿疸枫茴珑厕秩募勺吨寓斤历亩迫筷厘最淫螺韬兮宽匪筛襄赢轭复兲诈刃堰戎痞蚁饷它冀铸冂冃円冇冉册嫁厉砺竭醮冏牧冑冓冔冕冖冗冘冞冢窄抑诬冥冫烘菇蛰冷凝坨橇淇淋炭饼砖碛窖醋雕雹霜冱冶炉艳嘲峻滩淡漠煖飕饮冼冽凃凄怆梗凅凇净凊凋敝蒙凔凛遵汞脢凞几凢処凰凯凵凶焰凸折刷纹预丧喽奔巡榜殡芙蓉租笼辑鞘萃凼锯镬刁蛮刂娩崩批拆摊掰蘖骤歧颗秒袂赃勿嘱忌磋琢肤刈羽刎讼戮舂桨艇刓刖霹雳刜创犊刡恙墅帜筵致劫劫刨昏默攸尿欲熏润薰圭删刮痧铲刱刲刳刴刵踏磅戳柏槐绣芹苋猬舟铭鹄鹜劫剁剃辫刭锉履铅克剌姻咽哨廊掠桅沿召瞻翅赵卜渺茫郭剒剔剕沥剚愎毅讷才剜剥啄采剞剟剡剣剤䌽剐肾驶黏剰袍剀紊铲剸剺剽剿劁劂札劈啪柴扳啦刘奭姥夼昫涓熙禅禹锡翔雁鹗刽刿弩柄蜻蛉劒劓劖劘劙澜篑赏矶釜晋甜薪逐劦熔纣虐赤囚劬劭労劵效劻劼劾峭艮勅勇励勍勐腊脖庞漫饲荡粥辄勖勗勘骄馁碌泮雇捐竹骑殊阱绩朴恳谨剿勧勩勯勰劢勋勷劝惩慰诫谏勹芡践阑匁庇拯粟扎袱裹饺匆遽匈匉匊匋匍匐茎匏匕妆痰脓蛹斋苑烤蹈塘羌熊阀螳螂疆碚竿纬荷茵邙魏匚匜匝匟扶稷匣匦拢匸匹耦匽匾匿卂叮疮禧轸堤棚迢钧炼卄卆遐卉瓷盲瓶当胱腱裸卋卌卍卐怯污贱鄙龌龊陋卓溪唐梯渔陈枣泥漳浔涧梨芬谯赡辕迦郑単驴弈洽鳌卛占筮卝卞卟吩啉屎翠厄卣卨卪卬卮榫袄玺绶钮蚤惧殆笃耸卲帘帙绕恤卼卽厂厎厓厔厖厗奚厘厍厜厝谅厕厤厥厪腻孢厮厰厳厣厹厺粕垢芜菁厼厾叁悟茸薯叄吵笄悌哺讥坫垄弧芯杠潜婴刍袁诘贪谍煽馈驳収岳缔灾贿骗叚叡吻拦蘑蜜诀燧玩砚筝椎蔺铜逗骊另觅叨唠谒杵姓喊嚷嚣咚咛塑寻恼憎擦只泣渗蝠叱吒咄咤喝籀黛舵舷叵叶铎懿昭穰苴辽叻叼吁堑嫖赌瞧爬众抒吅吆夥卺橡涤抱纵摩郡唁坠扇篮膀袜颈吋忾谘酬哭妓媛暗表缰迩妃羿絮蕃浑拐葵暮隅吔吖啶嗪戚吜啬噬咽吟哦咏吠吧唧嗒咐吪隽咀征燐苞茹钙哧吮吰吱嘎吲哚吴栋娇窟孟箫忠晗淞阖闾趼宇呐睛嘘拂捧疵熄竽笛糠吼吽呀吕韦蒙呃呆笨呇贡呉罄呋喃呎呏呔呠呡痴呣呤呦呧瑛眩扒晬淑姬瑜璇鹃呪呫哔嚅嗫呬呯呰呱呲咧噌钝呴呶呷呸呺呻哱咻啸噜吁坎坷逻呿咁咂咆哮咇咈咋蟹煦珅蔼咍咑咒诅咔哒嚓咾哝哩喱咗咠咡咢咣咥咦咨嗟询咩咪咫啮啮咭咮咱咲咳呛嗽咴啕咸咹咺呙喉咿婉恸悯赋矜绿茗蓝哂抢瞒哆嗦啰噻啾滨彗哋哌哎唷哟哏哐哞哢哤哪里哫啼喘哰哲萎蚌哳咩哽哿呗唅唆唈唉唎唏哗尧棣殇璜睿肃唔睇唕吣唞唣喳唪唬唰喏唲唳唵嘛唶唸唹唻唼唾唿啁啃鹦鹉啅埠栈榷祺铺鞅飙啊啍啎啐啓啕啖啗啜哑祈啢衔啤啥啫啱啲啵啺饥啽噶昆沁喁喂喆裙喈咙喋喌喎喑喒喓喔粗喙幛庆滋鹊喟喣喤喥喦喧骚喨喩梆吃葡萄喭驼挑吓碰枞瓣纯疱藻趟铬喵営喹喺喼喿嗀嗃嗄嗅嗈嗉嗊嗍嗐嗑嗔诟嗕嗖嗙嗛嗜痂癖嗝嗡嗤嗥嗨唢嗬嗯嗰嗲嗵叽嗷嗹嗾嗿嘀嘁嘂嘅惋嘈峪禾荫啀嘌嘏嘐嘒啯啧嘚唛嘞嘟囔嘣嘥嘦嘧嘬嘭这谑严敞馋松哓嘶嗥呒虾嘹嘻啴嘿噀噂噅噇噉噎噏噔噗噘噙噚咝噞噢噤蝉皿噩噫噭嗳噱哙噳嚏涌洒欲巫霏噷噼嚃嚄嚆抖哜尝嚔苏嚚嚜嚞嚟呖嚬嚭嚮嚯亸喾饬按竣苛嚵嘤啭冁呓膪谦囍囒囓囗囘萧酚飘溅谛囝溯眸纥銮鹘囟殉囡団囤囥囧囨囱囫囵囬囮囯囲図囶囷囸囹圄圉拟囻囿圀圂圃圊粹蠹赦圌垦圏滚鲱凿枘圕圛圜圞坯埂壤骸炕祠窑豚绅魠鲮鳖圧握圩圪垯圬圮圯炸岬幔毯祇窨菩溉圳圴圻圾坂坆沾坋坌舛壈昆垫墩椅坒坓坩埚坭坰坱坳坴坵坻坼杨挣涎帘垃垈垌垍垓垔垕垗垚垛垝垣垞垟垤垧垮垵垺垾垿埀畔埄埆埇埈埌殃隍埏埒埕埗埜垭埤埦埧埭埯埰埲埳埴埵埶绋埸培怖桩础辅埼埽堀诃侄庑堃堄摧磐贞韧砌堈堉垩堋堌堍堎垴堙堞堠礁堧堨舆堭堮蜓摘堲堳堽堿塁塄塈煤茔棵塍垲埘塓绸塕鸦沽虱塙冢塝缪塡坞埙塥塩塬塱场螨塼塽塾塿墀墁墈墉墐夯増毁墝墠墦渍钵墫墬堕墰墺墙橱壅壆壊壌壎壒榨蒜壔壕壖圹垆壜壝垅壡壬壭壱売壴壹壻壸寝壿夂夅夆変夊夌漱邑夓腕泄甥御骼夗夘夙衮瑙妊娠醣枭珊莺鹭戗幻魇夤蹀秘擂鸫姚宛闺屿庾挞拇賛蛤裨菠氅漓捞湄蚊霆鲨箐篆篷荆肆舅荔鲆巷惭骰辟邱镕镰阪漂烩鲵鲽鳄鸨胪鹏妒峨谭枰晏玑癸祝秤竺牡籁恢罡蝼蝎赐绒御梭夬夭砣榆怙枕夶夹馅奄崛葩谲奈贺祀赠奌奂奓奕䜣詝奘奜奠奡奣陶奨奁魁奫奬奰娲孩贬隶酥宄狡猾她姹嫣妁毡荼皋膻蝇嫔妄妍嫉媚娆妗趣妚妞妤碍妬娅妯娌妲妳妵妺姁姅姉姗姒姘姙姜姝姞姣姤姧姫姮娥姱姸姺姽婀娀诱慑胁娉婷娑娓娟娣娭娯娵娶娸娼婊婐婕婞婤婥溪孺婧婪婬婹婺婼婽媁媄媊媕媞媟媠媢媬媮妫媲媵媸媺媻媪眯媿嫄嫈袅嫏嫕妪嫘嫚嫜嫠嫡嫦嫩嫪毐嫫嫬嫰妩嫺娴嫽嫿妫嬃嬅嬉耍婵痴艳嬔嬖嬗嫱袅嫒嬢嬷嬦嬬嬭幼嬲嬴婶嬹嬾嬿孀娘孅娈孏曰癫屏孑孓雀孖斟篓谜摺孛矻鸠崮轲祜鸾孥邈毓棠膑孬孭孰孱孳孵泛罔衔孻孪宀宁冗拙株薇掣抚琪瓿榴谧弥宊濂祁瑕宍宏碁宓邸谳実潢町宥宧宨宬徵崎骏掖阙臊煮禽蚕宸豫寀寁寥寃檐庶寎暄碜寔寖寘寙寛寠苫寤肘洱滥蒗陕核寪弘绰螽宝擅疙瘩晷対檐専尃尅赎绌缭畴衅尌峙醌襟痲碧屁昊槌淘恵瀑牝畑莓缸羚觑蔻脏躁尔尓锐尗尙尜尟尢尥尨尪尬尭尰擒尲尶尴尸尹潽蠖蛾尻扣梢蚴鳍脬蹲屇屌蚵屐屃挪屖屘屙屛屝屡屣峦嶂岩舄屧屦屩屪屃屮戍驻钾崖嵛巅旮旯楂榄榉芋茱萸靛麓屴屹屺屼岀岊岌岍阜岑彭巩岒岝岢岚岣岧岨岫岱岵岷峁峇峋峒峓峞峠嵋峨峰峱岘峹峿崀崁崆祯崋崌崃岖昆崒崔嵬巍萤颢崚崞崟崠峥巆崤崦崧殂岽崱崳崴崶崿嵂嵇嵊泗嵌嵎嵒嵓岁嵙嵞嵡嵩嵫嵯嵴嵼嵾嵝崭崭晴嶋嶌嶒嶓嵚崂嶙嶝嶞峤嶡嶢峄嶨嶭嶮嶰嶲岙嵘巂巃巇巉岿巌巓巘巛滇芎巟巠弋回巣巤炊擘蜥蟒蛊觋巰蜀彦淖杏茂甫楞巻巽帼巿帛斐鲫蕊帑帔帗帚琉汶帟帡帣帨裙帯帰帷帹暆帏幄帮幋幌幏帻幙帮幞幠幡幢幦幨幩幪帱幭幯幰遥蹉跎馀庚鉴幵幷稚邃庀庁広庄庈庉笠庋跋庖牺庠庤庥鲸庬庱庳庴庵馨衢庹庿廃厩廆廋廌廎廏廐廑廒荫廖廛厮搏锣廞弛袤廥廧廨廪廱绵踵髓廸迫瓯邺廻廼廾廿躔弁皱弇弌弍弎弐弑吊诡憾荐弝弢弣弤弨弭弮弰弪霖繇焘斌旭溥骞弶弸弼弾彀彄别累纠强彔彖彘彟彟陌彤贻彧绘虹彪炳雕蔚鸥彰瘅彲彳彴仿彷徉徨彸彽踩敛旆徂徇徊渭畲铉裼従筌徘徙徜徕膳苏萌渐徬徭醺徯徳徴潘徻徼忀瘁胖燎怦悸颤扉犀澎湃砰恍惚绞隘忉惮挨饿忐忑忒忖応忝忞耿忡忪忭忮忱忸怩忻悠懑怏遏怔怗怚怛怞怼黍讶怫怭懦怱怲恍怵惕怸怹恁恂恇恉恌恏恒恓恔恘恚恛恝恞恟恠恣恧眄恪恫恬澹恰恿悀悁悃悄悆悊悐悒晦悚悛悜悝悤您悩悪悮悰悱凄恻德悴怅惘闷悻悾惄愫钟蒐惆惇惌惎惏惓惔惙惛耄惝疟浊恿惦德恽惴蠢惸拈愀愃愆愈愊愍愐愑愒愓愔愕恪氓蠢騃昵惬赧悫愬愮愯恺愼慁恿慅慆慇霭慉慊愠慝慥怄怂慬慱悭慴慵慷戚焚憀灼郁憃惫憋憍眺捏轼愦憔憖憙憧憬憨憪憭怃憯憷憸憹憺懃懅懆邀懊懋怿懔懐懞懠懤懥恹懫懮懰懱毖懵遁梁雍忏懽戁戄戆戉戋戕戛戝戛戠戡戢戣戤戥戦戬戭戯轰戱披菊牖戸戹戺戻卯戽锹扂楔扃扆扈扊杖牵绢铐镯赉扐搂搅烊盹瞌跟趸镲靶鼾払扗玫腮扛扞扠扡扢盔押扤扦扱罾揄绥鞍郤窾扻扼扽抃抆抈抉抌抏瞎抔缳缢擞抜拗択抨摔歉蹿牾抶抻搐泵菸拃拄拊髀抛拌脯拎拏拑擢秧沓曳挛迂拚拝拠拡拫拭拮踢拴拶拷攒拽掇芥橐簪摹疔挈瓢骥捺蹻挌挍挎挐拣挓挖掘浚挙揍聩挲挶挟挿捂捃捄捅捆捉捋胳膊揎捌捍捎躯蛛捗捘捙捜捥捩扪捭据捱捻捼捽掀掂抡臀膘掊掎掏掐笙掔掗掞棉芍掤搪阐掫掮掯揉掱掲掽掾揃揅揆搓揌诨揕揗揘揜揝揞揠揥揩揪揫橥遒麈揰揲揵揶揸背揺搆搉搊搋搌搎搔搕撼橹捣搘搠
搡搢搣搤搥搦搧搨搬楦裢讪赸掏搰搲搳搴揾搷搽搾搿摀摁摂摃摎掴摒摓跤摙摛掼摞摠摦喉羯摭摮挚摰摲抠摴抟摷掺摽撂撃撅稻撊撋挦锏泼撕撙撚㧑挢撢掸撦撅撩撬撱朔揿蚍蜉挝捡擀掳闯擉缶觚擐擕擖擗擡擣擤澡腚擧擨擩擫擭摈拧撷擸撸擽擿攃摅撵攉攥攐攓撄搀撺每攩攫辔澄攮攰攲攴轶攷砭讦攽碘敁敃敇敉叙敎筏敔敕敖闰诲敜煌敧敪敳敹敺敻敿斁衽斄牒绉诌斉斎斓鹑谰驳鳢斒筲斛斝斞斠斡斢斨斫斮晾沂潟颖绛邵斲斸釳於琅斾斿旀旗旃旄涡旌旎旐旒旓旖旛旝旟旡旣浴旰獭魃旴时旻旼旽昀昃昄昇昉晰躲澈熹皎皓矾昑昕昜昝昞昡昤晖笋昦昨是昱昳昴昶昺昻晁蹇隧蔬髦晄晅晒晛晜晞晟晡晢晤晥曦晩萘莹顗晿暁暋暌暍暐暔暕煅旸暝暠暡曚暦暨暪朦胧昵暲殄冯暵暸暹暻暾曀晔昙曈曌曏曐暧曘曙曛叠昽曩骆曱甴肱曷牍禺锟曽沧耽朁朅朆杪栓夸竟粘绦朊膺朏朐朓朕朘朙瞄觐溘饔飧朠朢朣栅椆淀虱朩朮朰朱炆璋钰炽鹮朳槿朵朾朿杅杇杌陧欣钊湛漼楷瀍煜玟缨翱肇舜贽适逵杓杕杗杙荀蘅杝杞脩珓筊杰榔狍閦颦缅莞杲杳眇杴杶杸杻杼枋枌枒枓衾葄翘纾逋枙狸桠枟槁枲枳枴枵枷枸橼枹枻柁柂柃柅柈柊柎某柑橘柒柘柙柚柜柞栎柟柢柣柤柩柬柮柰柲橙柶柷柸柺査柿栃栄栒栔栘栝栟柏栩栫栭栱栲栳栴檀栵栻桀骜桁镁桄桉桋桎梏椹葚桓桔桕桜桟桫椤桭杯桯桲桴桷桹湘溟梃梊梍梐潼栀枧梜梠梡梣梧梩梱梲梳梴梵梹棁棃樱棐棑棕榈簑绷蓑枨棘棜棨棩棪棫棬棯棰棱棳棸棹椁棼碗椄苕椈椊椋椌椐椑椓椗検椤椪椰椳椴椵椷椸椽椿楀匾楅篪楋楍楎楗楘楙楛楝楟楠楢楥桢楩楪楫楬楮楯楰梅楸楹楻楽榀榃榊榎槺榕榖榘榛狉莽搒笞榠榡榤榥榦榧杩榭榰榱梿霰榼榾桤槊闩槎槑槔槖様槜槢槥椠槪槭椮槱槲槻槼槾樆樊樏樑樕樗樘樛樟樠樧樨権樲樴樵猢狲桦樻罍樾樿橁橄橆桡笥龠橕橚橛辆椭橤橧竖膈跨橾橿檩檃檇柽檍檎檑檖檗桧槚檠樯檨檫檬梼槟檴檵柠棹櫆櫌栉櫜椟櫡槠栌枥榇栊櫹棂茄櫽欀欂欃欐欑栾欙棂溴欨欬欱欵欶欷歔欸欹欻欼欿歁歃歆艎歈歊莳蝶歓歕歘歙歛歜欤歠蹦诠镶蹒跚升陟歩歮歯歰歳歴璞歺瞑歾殁夭殈殍殑殗殜殙殛殒殢殣殥殪殚僵殰殳荃殷殸殹蛟殻肴谤殴毈毉喂毎毑蕈毗毘毚茛邓毧毬毳毷毹毽毾毵牦氄氆靴氉氊氇氍氐聊氕氖気氘氙氚氛氜氝氡汹焊痉氤氲氥氦铝锌氪烃氩铵痤汪浒漉痘盂碾菖蒲蕹蛭螅氵冰氹氺氽烫氾氿渚汆汊汋汍汎汏汐汔汕褟汙汚汜蓠沼秽蔑汧汨汩汭汲汳汴堤汾沄沅沆瀣沇沈葆浸沦湎溺痼疴沌沍沏沐沔沕沘浜畹砾沚沢沬沭沮沰沱灢沴沷籽沺烹濡洄泂肛泅泆涌肓泐泑泒泓泔泖泙泚泜泝泠漩馍涛粼泞藓鳅泩泫泭泯铢泱泲洇洊泾琵琶荽蓟箔洌洎洏洑潄濯洙洚洟洢洣洧洨洩痢滔洫洮洳洴洵洸洹洺洼洿淌蜚浄浉浙赣渫浠浡浤浥淼瀚浬浭翩萍浯浰蜃淀苔蛞蝓蜇螵蛸煲鲤浃浼浽溦涂涊涐涑涒涔滂莅涘涙涪涫涬涮涴涶涷涿淄淅淆淊凄黯淓淙涟淜淝淟淠淢淤渌淦淩猥藿亵淬淮淯淰淳诣涞纺淸淹炖癯绮渇済渉渋渓渕涣渟渢滓渤澥渧渨渮渰渲渶渼湅湉湋湍湑湓湔黔湜湝浈湟湢湣湩湫湮麟湱湲湴涅満沩溍溎溏溛舐漭溠溤溧驯溮溱溲溳溵溷溻溼溽溾滁滃滉滊荥滏稽滕滘汇滝滫滮羼耷卤滹浐煎漈漊漎绎漕漖漘漙沤漜漪漾漥漦漯漰溆漶漷濞潀颍潎潏潕潗潚潝潞潠潦祉疡潲潵滗潸潺潾涠澁澂澃澉澌澍澐澒澔澙渑澣澦澧澨澫澬浍澰澴澶澼熏郁濆濇濈濉濊貊濔疣濜濠濩觞浚濮盥潍濲泺瀁滢渎渖瀌浏瀒瀔濒泸瀛潇潆瀡潴泷濑瀬弥潋瀳瀵瀹瀺瀼沣滠灉灋灒漓灖灏灞灠滦灥灨滟灪蜴灮烬獴灴灸灺炁炅鱿炗炘炙炤炫疽烙钎炯炰炱炲炴炷毁炻烀烋瘴鲳烓烔焙烜烝烳饪烺焃焄耆焌焐焓焗焜焞焠焢焮焯焱焼煁煃煆煇煊熠煍熬煐炜煕暖熏硷霾煚煝煟煠茕矸煨琐炀萁煳煺煻熀熅熇熉罴荧穹炝熘熛熜稔谙烁熤熨熯熰眶蚂颎熳熸熿燀烨燂燄盏燊燋燏燔隼燖焖燠燡灿燨燮燹燻燽燿爇爊爓爚爝爟爨蟾爯爰为爻丬爿牀牁牂牄牋窗牏牓窗釉牚腩蒡虻牠虽蛎牣牤牮牯牲牳牴牷牸牼绊牿靬犂犄犆犇犉犍犎犒荦犗犛犟犠犨犩犪犮犰狳犴犵犺狁甩狃狆狎狒獾狘狙黠狨狩狫狴狷狺狻豕狈蜘猁猇猈猊猋猓猖獗猗猘狰狞犸猞猟獕猭猱猲猳猷猸猹猺玃獀獃獉獍獏獐獒毙獙獚獜獝獞獠獢獣獧鼇蹊狯猃獬豸狝獯鬻獳犷猕猡玁菟玅玆玈珉糁禛郅玍玎玓瓅玔玕玖玗玘玞玠玡玢玤玥玦珏瑰玭玳瑁玶玷玹玼珂珇珈瑚珌馐馔珔珖珙珛珞珡珣珥珧珩珪佩珶珷珺珽琀琁陨玡琇琖琚琠琤琦琨琫琬琭琮琯琰琱琲琅琴珐珲瑀瑂瑄瑉玮瑑瑔瑗瑢瑭瑱瑲瑳瑽瑾瑿璀璨璁璅璆璈琏璊璐璘璚璝璟璠璡璥瑷璩璪璫璯璲玙璸璺璿瓀璎瓖瓘瓒瓛脐瓞瓠瓤瓧瓩瓮瓰瓱瓴瓸瓻瓼甀甁甃甄甇甋甍甎甏甑甒甓甔瓮甖甗饴蔗甙诧钜粱盎锈团甡褥産甪甬甭甮宁铠甹甽甾甿畀畁畇畈畊畋畎畓畚畛畟鄂畤畦畧荻畯畳畵畷畸畽畾疃叠疋疍疎箪疐疒疕疘疝疢疥疧疳疶疿痁痄痊痌痍痏痐痒痔痗瘢痚痠痡痣痦痩痭痯痱痳痵痻痿瘀痖瘃瘈瘉瘊瘌瘏瘐痪瘕瘖瘙瘚瘛疭瘜瘝瘗瘠瘥瘨瘭瘆瘯瘰疬瘳疠瘵瘸瘺瘘瘼癃痨痫癈癎癐癔癙癜癠疖症癞蟆癪瘿痈発踔绀蔫酵皙砬砒翎翳蔹钨镴皑鹎驹暨粤褶皀皁荚皃镈皈皌皋皒朱皕皖皘皜皝皞皤皦皨皪皫皭糙绽皴皲皻皽盅盋碗盍盚盝踞盦盩秋千盬盭眦睁瞤盯盱眙裰盵盻睐眂眅眈眊県眑眕眚眛眞眢眣眭眳眴眵眹瞓眽郛睃睅睆睊睍睎困睒睖睙睟睠睢睥睪睾睯睽睾眯瞈瞋瞍逛瞏瞕瞖眍䁖瞟瞠瞢瞫瞭瞳瞵瞷瞹瞽阇瞿眬矉矍铄矔矗矙瞩矞矟矠矣矧矬矫矰矱硪碇磙罅舫阡、矼矽礓砃砅砆砉砍砑砕砝砟砠砢砦砧砩砫砮砳艏砵砹砼硇硌硍硎硏硐硒硜硖砗磲茚钡硭硻硾碃碉碏碣碓碔碞碡碪碫碬砀碯碲砜碻礴磈磉磎硙磔磕磖磛磟磠磡磤磥蹭磪磬磴磵磹磻硗礀硚礅礌礐礚礜礞礤礧礮砻礲礵礽礿祂祄祅祆禳祊祍祏祓祔祕祗祘祛祧祫祲祻祼饵脔锢禂禇禋祦禔祎隋禖禘禚禜禝禠祃禢禤禥禨禫祢禴禸秆秈秊闱飒秋秏秕笈蘵赁秠秣秪秫秬秭秷秸稊稌稍稑稗稙稛稞稬秸稲稹稼颡稿穂穄穇穈穉穋稣贮穏穜穟秾穑穣穤穧穨穭穮穵穸窿阒窀窂窅窆窈窕窊窋窌窒窗窔窞窣窬黩蹙窑窳窴窵窭窸窗竁竃竈竑竜并竦竖篦篾笆鲛竾笉笊笎笏笐靥笓笤箓笪笫笭笮笰笱笲笳笵笸笻筀筅筇筈筎筑筘筠筤筥筦笕筒筭箸筰筱筳筴宴筸箂个箊箎箑箒箘箙箛箜篌箝箠箬镞箯箴箾篁筼筜篘篙篚篛篜篝篟篠篡篢篥篧篨篭篰篲筚篴篶篹篼箦簁簃簆簉簋簌簏簜簟簠簥簦簨簬簰簸簻籊藤籒籓籔签籚篯箨籣籥籧笾簖籫籯芾麴籵籸籹籼粁秕粋粑粔粝粛粞粢粧粨粲粳稗粻粽辟粿糅糆糈糌糍糒糔萼糗蛆蹋糢糨糬粽糯糱籴粜糸糺紃蹼鲣霉纡纨绔纫闽襻紑纰纮锭鸢鹞纴紞紟扎紩紬绂绁纻紽紾绐絁絃絅経絍绗絏缡褵絓絖絘絜绚絣螯絪絫聒絰絵绝絺絻絿綀绡綅绠绨绣綌綍綎捆綖綘継続缎绻綦綪线綮綯绾罟蝽綷縩绺绫緁绲緅緆缁绯緌緎総緑绱緖缃缄缂绵缗緤褓缌纂緪緰缑缈缏缇縁縃縄萦缙缒縏缣縕缞縚缜缟缛縠縡縢縦绦縯縰骋缧縳纤缦絷缥縻衙縿繄缫繈繊繋繐缯繖繘繙繠缋繣繨缰缲繸繻缱纁纆纇缬缵纩纑纕缵纙纚纛缾罃罆坛罋罂罎罏罖罘罛罝罠罣罥罦罨罫罭锾罳罶罹罻罽罿羂羃羇芈蕉51鸵羑羖羌羜羝羢羣羟羧羭羮羰羱羵羶羸藜鲐翀翃翅翊翌翏翕翛翟翡翣翥翦跹翪翫翚翮翯翱翽翾翿板饕鸹锨耋耇耎耏专耒耜耔耞耡耤耨耩耪耧耰鬓耵聍聃聆聎聝聡聦聱聴聂聼阈聿肄肏肐肕腋肙肜肟肧胛肫肬肭肰肴肵肸肼胊胍胏胑胔胗胙胝胠铨胤胦胩胬胭胯胰胲胴胹胻胼胾脇脘脝脞脡脣脤脥脧脰脲脳腆腊腌臜腍腒腓胨腜腠脶腥腧腬腯踝蹬镣腴腶蠕诽膂腽嗉膇膋膔腘膗膙膟黐膣膦膫膰膴膵膷脍臃臄臇臈臌臐臑臓膘臖臙臛臝臞臧蓐诩臽臾臿舀舁鳑鲏舋舎舔舗馆舝舠舡舢舨舭舲舳舴舸舺艁艄艅艉艋艑艕艖艗艘艚艜艟艣舣艨艩舻艬艭荏艴艳艸艹艻艿芃芄芊萰陂藭芏芔芘芚蕙芟芣芤茉芧芨芩芪芮芰鲢芴芷芸荛豢芼芿苄苒苘苙苜蓿苠苡苣荬苤苎苪镑苶苹苺苻苾茀茁范蠡萣茆茇茈茌茍茖茞茠茢茥茦菰茭茯茳藨茷藘茼荁荄荅荇荈菅蜢鸮荍荑荘豆荵荸荠莆莒莔莕莘莙莚莛莜莝莦莨菪莩莪莭莰莿菀菆菉菎菏菐菑菓菔芲菘菝菡菢菣菥蓂菧菫毂蓥菶菷菹醢菺菻菼菾萅萆苌萋萏萐萑萜萩萱萴莴扁萻葇葍葎葑荭葖葙葠葥苇葧葭药葳葴葶葸葹葽蒄蒎莼茏薹莅蒟蒻蒢蒦蒨蒭藁蒯蒱鉾蒴蒹蒺蒽荪蓁蓆蓇蓊蓌蓍蓏蓓蓖蓧蓪蓫荜跣藕苁蓰蓱莼蓷蓺蓼蔀蔂蔃蔆蔇蔉蔊蔋蔌蔎蔕蔘蔙蒌蔟锷蒋雯茑蔯蔳麻蔵蔸蔾荨蒇蕋蕍荞蕐蕑芸莸蕖蕗蕝蕞蕠蕡蒉蕣蕤蕨蕳蓣蕸蕺蕻薀薁薃薅薆荟薉芗薏薐蔷薖薘剃谔钗薜薠薢薤薧薨薫薬薳薶薷薸薽薾薿藄藇藋荩藐藙藚藟藦藳藴苈藷藾蘀蘁蕲苹蘗蘘蘝蘤蘧蘩蘸蘼虀虆虍蟠虒虓虖虡虣虥虩虬虰蛵蛇虷鳟虺虼蚆蚈蚋蚓蚔蚖蚘蚜蚡蚣蚧蚨蚩蚪蚯蚰蜒蚱蚳蚶蚹蚺蚻蚿蛀蛁蛄蛅蝮蛌蛍蛐蟮蛑蛓蛔蛘蛚蛜蛡蛣蜊蛩蛱蜕螫蜅蚬蜈蝣蜋蜍蜎蜑蠊蜛饯蜞蜣蜨蜩蜮蜱蜷蜺蜾蜿蝀蝃蝋蝌蝍蝎蝏蝗蝘蝙蝝鲼蝡蝤蝥猿蝰虻蝲蝴蝻螃蠏蛳螉螋螒螓螗螘螙螚蟥螟螣螥螬螭䗖螾螀蟀蟅蝈蟊蟋蟑蟓蟛蟜蟟蟢虮蟨蟪蟭蛲蟳蛏蟷蟺蟿蠁蠂蠃虿蠋蛴蠓蚝蠗蠙蠚蠛蠜蠧蟏蠩蜂蠮蠰蠲蠵蠸蠼蠽衁衄衄衇衈衉衋衎衒同衖胡衞裳钩衭衲衵衹衺衿袈裟袗袚袟袢袪袮袲袴袷袺袼褙袽裀裉袅裋夹裍裎裒裛裯裱裲裴裾褀褂褉褊裈褎褐褒褓褔褕袆褚褡褢褦褧褪褫袅褯褰褱裆褛褽褾襁褒襆裥襉襋襌襏襚襛襜裣襞襡襢褴襦襫襬襭襮襕襶襼襽襾覂覃覅霸覉覊覌覗觇覚覜觍觎覧覩觊觏覰観觌觔觕觖觜觽觝觡酲觩觫觭觱觳觯觷觼觾觿言赅讣訇訏訑訒诂讬訧訬訳訹证訾詀詅诋毁詈詊讵詑诒诐詗诎察詨诜詶詸詹詻诙诖誂誃诔锄诓誋诳诶悖誙诮诰誧説読誯谇訚谄谆諆諌诤诹诼諕谂谀諝谝諟喧谥諴諵谌谖誊謆謇
歌謍謏謑谡谥謡謦謪谪讴謷謼谩哗譅譆譈譊讹譒撰谮鑫譞噪譩谵譬譱譲谴譸譹谫讅讆詟䜩雠讐谗谶讙谠讟谽豁豉豇岂豊豋豌豏豔豞豖豗豜豝豣豦豨豭豱豳豵豶豷豺豻貅貆狸猊貔貘䝙貜貤餍贳餸贶贲赂賏赊赇赒賝赓赕賨赍斗賮賵賸赚赙赜赟贉赆赑贕赝赬赭赱赳迄趁趂趄趐趑趒趔趡趦趫趮趯趱趴趵趷趹趺趿跁跂跅跆踬跄跐跕跖跗跙跛跦跧跩跫跬跮跱跲跴跺跼跽踅踆踈踉踊踒踖踘踜踟躇蹰踠踡踣踤踥踦踧跷踫踮逾踱踊踶踹踺踼踽躞蹁蹂躏蹎蹐蹓蹔跸蹚蹜蹝迹蹠蹡蹢跶蹧蹩蹪蹯鞠蹽躃躄躅踌跻躐踯跞躘躙躗躝躠蹑躜躧躩躭躰躬躶軃軆辊軏轫軘軜軝腭転軥軨軭軱轱辘軷轵轺軽軿輀輂辇辂辁輈挽輗辄辎辋輠輤輬輭輮辏輴輵輶輹輼辗辒轇轏轑轒辚轕轖轗轘轙轝轞轹轳罪辣辞辵辶辺込辿迅迋迍麿迓迣迤逦迥迨迮迸迺迻迿逄逅逌逍逑逓迳逖逡逭逯逴逶逹遄遅侦遘遛遝遢遨遫遯遰遴绕遹遻邂邅邉邋邎邕邗邘邛邠邢邧邨邯郸邰邲邳邴邶邷邽邾邿郃郄郇郈郔郕郗郙郚郜郝郞郏郠郢郪郫郯郰郲郳郴郷郹郾郿鄀鄄郓鄇鄈鄋鄍鄎鄏鄐鄑邹邬鄕郧鄗鄘鄚鄜鄞鄠鄢鄣鄤鄦鄩鄫鄬鄮鄯鄱郐鄷鄹邝鄻鄾鄿酃酅酆酇郦酊酋酎酏酐酣酔酕醄酖酗酞酡酢酤酩酴酹酺醁醅醆醊醍醐醑醓醖醝酝醡醤醨醪醭醯醰酦醲醴醵醸醹醼醽醾釂酾酽釆釈鲈镏阊钆钇钌钯钋鼢鼹钐钏釪釬釭釱钍釸钕钫鈃钭鈆鈇钚鈊鈌钤钣鈒鈤钬钪鈬铌铈钶铛钹铍钸钿鉄鉆铊铇鉌铋鉏铂钷铆钵鉥钲鉨钼钽鉱鉲鉶铰铒鉼铪銍銎铣銕镂铫铦铑铷銤铱铟銧铥铕铯銭銰焊銶锑锉汞鋂锒鋆鋈鋊铤鋍铗鋐鋑鋕鋘鋙锊锓锔锇铓鋭铖锆锂铽鋳鋹鋺鉴镚钎錀锞锖锫锩錍铔锕錔锱铮锛錞锬锜錤錩錬録铼錼锝钔锴鍉镀鍏鍐铡鍚锻锽锸锲锘鍫鍭鍱鍴锶鍹锗针锺锿镅鎉鎋鎌鎍鎏鎒鎓鎗镉鎚鎞镃鎤铩锼鎭鎯镒镍鎴镓鎸鎹镎镟鏊镆镠镝鏖铿锵鏚镗镘镛鏠鏦錾镤鏸镪鏻鏽鏾铙鐄鐇鐏铹镦镡鐗馗镫镢镨鐡锎镄鐩镌鐬鐱镭鐶鐻鐽镱鑀鑅镔鑐鑕鑚鑛鑢鑤镥鑪镧鑯鑱鑴鑵镊镢钃镻闫闬闶闳閒闵閗閟阂関合閤哄阆閲阉閺阎阏阍阌暗闉阕阗闑闒闿闘闚阚闟闠闤闼阞阢阤阨阬阯阹阼阽陁陑陔陛陜陡陥陬骘陴険陼陾阴隃隈隒隗隞隠隣隤隩隮隰颧隳隷隹雂雈雉雊雎雑雒雗雘雚雝雟雩雰雱驿霂霅霈霊沾霒霓霙霝霢霣霤霨霩霪霫霮靁叇叆靑靓靣腼靪靮靰靳靷靸靺靼靿鞀鞃鞄鞍鞗鞙鞚鞝鞞鞡鞣鞨鞫鞬鞮鞶鞹鞾鞑韅鞯驮韍韎韔韖韘韝韫韡韣韭韭韱韹韺頀刮頄顸顼頍颀颃颁頖頞頠頫頬颅頯頲颕頼悴顋顑颙颛颜顕顚顜颟顣颥颞飐飑台飓颸飏飖颽颾颿飀飂飚飌翻飡飣饲飥饨饫飮飧飶餀餂饸饹餇餈饽哺馂餖餗餚馄馃餟餠餤餧餩餪餫糊餮糇餲饧馎糕饩馈馊馌馒饇馑馓膳饎饐饘饟馕馘馥馝馡馣骝骡馵馹駃駄駅駆駉駋驽駓驵駗骀驸駜骂骈駪駬骃駴骎駹駽駾騂騄骓騆騉騋骒骐麟騑騒験騕骛騠騢騣騤騧骧騵驺骟騺蓦骖骠骢驆驈骅驌骁驎骣驒驔驖驙驦驩驫骺鲠骫骭肮骱骴骶骷髅骾髁髂髄髆膀髇髑髌髋髙髝髞髟髡髣髧髪髫髭髯髲髳髹髺髽髾鬁鬃鬅鬈鬋鬎鬏鬐鬑鬒鬖鬗鬘鬙鬠鬣斗鬫鬬阄鬯鬰鬲鬵鬷魆魈魊魋魍魉魑魖鳔魛魟魣魦魨魬鲂魵魸鮀鲅鮆鲧鲇鲍鲋鮓鲒鲕鮟鱇鮠鮦鮨鲔鲑鮶鮸鮿鲧鯄鯆鲩鯈鲻鯕鲭鲞鯙鯠鲲鯥鲰鲶鳀鯸鳊鲗䲠鹣鳇鰋鳄鳆鰕鰛鰜鲥鰤鳏鰦鳎鳐鳁鳓鰶鲦鲡鰼鰽鱀鱄鳙鱆鳕鱎鱐鳝鳝鳜鲟鲎鱠鳣鱨鲚鱮鱲鱵鱻鲅鳦凫鳯鳲鳷鳻鴂鴃鴄鸩鴈鴎鸰鴔鴗鸳鸯鸲鹆鸱鴠鴢鸪鴥鸸鹋鴳鸻鴷鴽鵀鵁鸺鹁鵖鵙鹈鹕鹅鵟鵩鹌鵫鵵鵷鵻鹍鶂鶊鶏鶒鹙鶗鶡鶤鶦鶬鶱鹟鶵鶸鶹鹡鶿鹚鷁鷃鷄鷇䴘䴘鷊鷏鹧鷕鹥鸷鷞鷟鸶鹪鹩鷩鷫鷭鹇鹇鸴鷾䴙鸂鸇䴙鸏鸑鸒鸓鸬鹳鸜鹂鹸咸鹾麀麂麃麄麇麋麌麐麑麒麚麛麝麤麸面麫麮麯麰麺麾黁黈黉黢黒黓黕黙黝黟黥黦黧黮黰黱黪黶黹黻黼黾鼋鼂鼃鼅鼈鼍鼏鼐鼒冬鼖鼙鼚鼛鼡鼩鼱鼪鼫鼯鼷鼽齁齆齇齈齉齌赍齑龀齕齗龅齚龇齞龃龉龆齢出齧齩齮齯齰齱齵齾厐龑龒龚龖龘龝龡龢龤' traditional_characters = '制咖片型超聲盤鑒定仔點他命書歌粉巾字帳恤手指記憶棒形轉彎溝光○〇㐄㐅㐆㐌㐖毒㐜㐡㐤㐰㐺㑇㑳㒳㒸㔾㗂㗎㝵㞎㞙㞞㠯㢲㢴㤅㥁㥯㨗㫺㬎㮎㮚㮸㲋㲱㲾㳮㵎㵪㶸㷖㷭㹢㹴犬㺢狓㺵㼝㽮㿝䍃䔢䖟䖸䗈䗥䗪䝓䠶䥯䦉䯝䰾魚䲔䳗䳘䵹鼄䶑一對應映射丁不識下兒子做二休世丘之貉並中台原則串為甚謂乾淨了百事無成八變五十些人得道雞升天代如併來去個國政策勁幽靈在歐洲遊蕩接樣蘿蔔坑側化傳價元論醇共再准刀兩斷切分耕耘收穫錢貨物向看舊就緒險刻千金動勞永逸匙零夜半卡通回復返影蹤反常態口咬氣句話同吐快吹周味呼諾嗚品紅鍋哄而散起唱和問三知生熟團漆黑火糟堆場空塊麵塌糊塗塵染壁廂夔已足多情露水大早到晚夫妻當關萬莫開失古恨套所料既往孔見提師要家主審寸陰難買鬥牛小撮部陣局展身層巴掌帆風順席地帶過年計於春頭載四季期被蛇怕井繩度願式份彈頃深前律徑心意念差愁孤行俱全房廳交遮打技長把抓死拿眼淚鼻涕鑰鎖折段抿拍即合掃排掬揮撥擁上入擊洞擲攬改故轍敗文值名斑方面旁族日秋餐隔雅里終父旦時晌會霎間晃暴寒曝更月望垠際朝夕本正經利杯羹東西板枝獨秀根筋桿進條龍服務概模次函數又性程總付步腳印趨登毛拔呵氧氮碳決雌雄波未平派謊言流清楚白準溜煙潭有獲聞是處降琴鶴甲病發可拾沙目然瞭直以相眨穿睹瞥瞬矢的解石鳥神教秉虔誠秘種窩蜂窮竅笑置筆苟勾銷抹殺煞等獎箍節吃箭仇雙鵰詩籌籮筐系列紙級士官統絲毫掛維網盡線微吭響股腦胎脈承腔臂力致效資源址器舉功投般說講規貿易葉障著慎滿皆輸號木電池衣傾鐘高低視仁覺醒覽遺角銀幣觸潰九鼎蔽抄出駟馬追重語破貧洗貫走路安蹴至幾蹶振躍役膽汗較輩輪辭贊退六連遍遞邊針血錘音錯門思閃真倒項栽霧類保護川先驚乍體鬨鱗爪鳴滴泡鄰域黨專鼓作齊炒丑烯亥克內酯冬加奴卯肝炎基尺梁街褲鎬客寵庭巳汝昌烷玲磊糖肇酉醛啷青縣韙良香骨鯛丂七集河市弦喜嘴張舌堵區工業姊妹星架構巧彩扭歪拼湊餘熱曜武州爺浮屠美鄉老階樹葷素碎落能魄鰓鰻珠丄丅丆万俟丈尚摸母娘量管群亞虎必我堂令申件裝伏位博俠義界表女墟臺戲臭皮匠勝諸葛亮賽頂倍催請運算包立叉戟離疫苗土史志演圍揭瓦曬夷姑婆帝村寶爛尖杉鹼屜桌山岔島由紀峽壩庫鎮廢從德後拗湯治旬食明昧曹朋友框欄極權冪曲歸依貓民氟硼氯磷鐵江侗自旅法司洋浦梅園溫暖灣焦班幸用田略番疊皇炮捶硝苯酸腺苷稜草鏡穗跳遠索錦綱聚氰胺聯店胚膲愛色堇紫羅蘭芝茶飯菱雲蟲藏藩亂叛蘇親債凳學座恐戀柱測肌腹衩錐係貂企烏跪叩軍車農題迭都甘油屯奏鍵短阿姨陪姐隻顧茅廬槽駕魂鮮鹿頁其菜單乘任供勢午齒漢組織吊調瀉唇坡城報墳外夸將尉建築岸崗公床揚新劍昇杭林栗校樓標款汽社浣海商館劇院鋼華港機械廣媒環球融第醫科證券綜財樂育游漲猶嶺疏癮瞼確兵領導繳肢膛船艾瑟爾蒼蔡虞傚衫覆訪訴課諭議軌述野鉤限敵鞋頜頷顎饒首齦站例修凡劃垂屆屬崽頦廚拜挫擺放旋削棋榻檻禮沉注滑營獄畫确儀聘花葬詔員跌轄週達酒錨閘陷陸雨雪飛威丌于丹久乏予理評產亢卑亦乎舞己悲矩圓詞害誌但住佞佳便俗信票案幅翁倦倫假偏倚斜虧鬼敲停備傷脾胃僅此像儉匱免宜穴焉戴兼容許凍伯仲負彼晝皂軒輊實刊划顛衛戰哥比省非好黃飾別拘束掩奶睬選擇搖擾煩苦枚寫協厭及格受歡迎約只估侵犯割狀告或缺抗拒挽撤救藥喻磨滅端倪少逆逾越避靠適吉譽吝玉含延咎歹聽啻淵善謀均勻堪忍夠太惹妙妥妨孕症孝術室完納推冠積宣疑辯慄碴稱屈撓屑干涉衡待很忙惡忿怎麼怠急恥恭息悅惑惜惟想愉愧怍慌憤啟懂懈懷材才緊招認扣抵拉捨也罷插揣冒搭撞南牆擴核支攻敢雷攀敬裡嗎需景智暇曾罪遇朽枉止況競爭辱求癒渝溶濟左右袒困補爽特寂寞示弱找謝畏強疾徐痛癢冤符眠睦瞅董何厚云措活疲羞者輕玻璃祥兆禁移稂莠穩佛換答簡結果盟絕縷途給談否羈翼耐肖脛毋寧興舒若菲萊痕跡窠臼虛衰臉兔撒鷹棺範該詳諱抬泰讓鬚眉象眾貲賬費灰賴奇慮訓輟辨菽麥辛近送透逞徒速續逮捕遂遑違遜斧鉞艱醉鏽隨觀棄顯飽脂肪使丏丐幫丒且慢末丕替桃宗王尊涼爵各圖屋脊糧署錄壇吾祿職胄襲君廈丗北壑桐疹損逢陵鷸丙寅戌氨腈唑綸辰酮脫氫酶醚丞丟現掉紗帽弄扯砲碗丠両丣坐存激肩臻蒂蓮悖序驅丨丩丫挺杈髻鬟細介俄伊犁京尼布訂普渡央委監察檢查劑圈設警隊斯督剩震境航舶革防托播促質版蠑螈鋒研藝歷殘消頻譜精密製造陲郵候埔堅壓壢凹匯執府究邦俘攝寮彬狼嶽肺腫庸英訊診埋粒胞括控碼韓暑槍樞砥澳哇牟壽甸鑽探篇簽綴縫繼耳肯照婦埃懸璧軸櫃檯辣擱淺邪跑纖阮陽私囊魔丮丰姿采丱燒丳丵丶丷丸參寨朗桂瑞砂衷霞貌鳳僕艦因嫌宰峰幹絡牌持旨祭禱簿編罰賓辦丼丿乀乂乃乄仰慕盛曠留考驗闊乆乇么醜麼乊湖燃乑乒乓乕乖僻忤戾离謬迕乗危肥劫除隙浪婿乙炔腸酰吡咯鹽乚乛乜嘢卿玄宮尾狐龜塔嶷兄弟泉章霄釘耙乞扎哀憐恕討乢乣乤乥乧乨乩童乪乫乭乳暈汁液瑤漿牙癌突竇罩腐膠豬酪蛋糕菌瘤乴乵乶乷乸乹乺乼乾俸冰嘉噦嚎坤媽屍壘旱枯涸俐渴潮澀煸豆燥爹瘦癟癬瞪袋脆薑貝隆餾乿亀亁叫咕攘扔搞男砸竄蓬麻亃亄亅卻亇遲典今臨繁累卵奉婚聰躬巨與遷添裂副宿歲怪噁尕崙愣杆硅硫鈦鈾錳芑雜異鈉砷胂磺琥珀艙棍簧胡茬盜浩盆販郎腿亍洪亐互欠助勉惠操斥諉繫戶譯亓墓碑刑鈴卅渠繽紛斗米旗憲釩燈徽瘟祖拳福穀豐臟腑綁肉醃苓蘊橋鋪霸顏鬧判噴岡底蛙陘礦亖亙亜罕們娜桑那努哈喀弗烈曼松森杜氏盃奧琛敦戊穆聖裔彙薛孫亟亡佚虜羊牢奮釋卷卸契媾感額睫纏誼趾塞擠紐阻還配馳莊亨洛祚
亪享津滬畿郊慈菴枇杷膏亭閣鋥麗亳亶亹誅初責翻瘋偶傑叢稠妖拖寰居吸授慧蝸吞壯魅狗矛盾益渣患憂稀描猿夢暫涯畜禍緣沸搜引擎臣橫紜誰混援蒸獸獅稅剖亻亼亽亾什獻剎邡麽仂仃仄仆富怨仈仉畢昔晨殼紹仍仏仒仕宦仗欺恃腰嘆歎炬梓訖施仙后瓊逝仚仝仞仟悔仡佬償填泊拓撲簇羔購頓欽佩髮棻閫馭養億儆尤藉幀賑凌敘帖李柔剛沃眥睚戒訛取饗讀仨仫仮著泳臥躺韶夏裁仳仵唯賢憑釣誕仿似宋彿諷伀碩盼鵝伄儅伈伉儷柯始娃邁戈坦堡帕茨薩廟瑪莉莎藤霍姆伋伍奢胥廷芳豪伎倆侍汛勒希羲雛伐憩整謨閑閒伕伙伴頤伜伝伢叔恆茲恩翰伱伲侶伶俜悧鼬伸懶縮喇叭伹伺伻伽倻輻伾佀佃佇佈喬妮墨佉盧佌貸劣廉昂檔濃矮傘窪緩耗胸谷迷擋率齲宅沫舍療佐貳佑佔優據鏵嘗呢須魯曉佗佘余坪寺瓜銃僧蒙芒陀龕哼嘔坊姦孽弊揖祟繭縛誓賊佝僂瞀佟你奪趕佡佢佣佤佧賈佪佫佯佰佱潔績釀餚佴捲佶佷佸佹佺佻佼佽佾具喚窘壞娛怒慨硬習慣聾膨脹蔓駭貴痺侀侁侂侃侄侅鴻燕侇侈糜靡侉侌妾侏儒倉鼠侐侑侔侖侘侚鏈侜偎傍鈷循柳葫蘆附価侮罵蔑侯岩截蝕侷貼壺嬛宴捷攜桶箋酌俁狹膝狄俅俉俊俏俎俑俓俔諺俚俛黎健呈固墒增守康箱濕祐鏢鑣槓盒靖膜齡俞豹獵噪孚封札筒託衍鴿剪撰稿煉廠禊練繕葺俯瞰撐衝俲俳俴俵俶俷俺俻俾倀倂倅儲卒惶敷猝逃頡蓄崇隱倌倏忽刺蠟燭噍嚼坍扁抽斃蔥楣灌灶糞背藪賣賠閉霉騰倓倔倖倘倜儻倝借箸挹澆閱倡狂倢倣値倥傯倨傲倩匡嗣沖柝珍倬倭寇猩倮倶倷倹勤讚偁偃充偽吏嗓寐惺扮拱芫茜藉虢鈔偈偉晶偌宕距析濾殿疼癱註頗偓偕鴨歇滯偝偟偢忘怡旺偨偩偪偫偭偯偰偱偲偵緝蹄偷減惰漏窺竊偸偺迹傀儡傅傈僳傌籬傎奎琳迪叟芭傒傔傕傖悉荒傜傞傢傣芽逼傭婢傮睨寄檄誦謠頌傴擔辜弓慘蒿悼疤傺傻屄臆巢洩篋羨蓋軋頹傿儸僄僇僉僊働僎僑僔僖僚僝僞僣僤僥僦猴僨僩僬僭僮僯僰僱僵殖籤靜僾僿征隴儁儂儃儇儈朴薄儊儋儌儍儐儓儔儕儗儘儜儞儤儦儩汰哉寡渥裕酷儭儱罐儳儵儹儺儼儽兀臬臲鷲允勛勳宙宵帥憝彞諧嫂鬩暢沛溢盈飢赫兇悍狠猛頑愚妣斬秦遣鞭耀敏榮槃澤爆碟磁禿纜輝霽鹵朵婁孜烽醬勃汀箕裘鉗耶懞蕾徹兌軟遭黜兎児韻媳爸兕觥兗兙兛兜售鍪肚兝兞兟兡兢兣樽殮涅睡稟籍贅泌啡肽奸幕涵澇熵疚眷稃襯訌赴煥椒殲植跏沒試誤猜棲窗肋袖頰兪卦撇鬍岐廓轎疸楓茴瓏廁秩募勺噸寓斤曆畝迫筷釐最淫螺韜兮寬匪篩襄贏軛複兲詐刃堰戎痞蟻餉它冀鑄冂冃円冇冉冊嫁厲礪竭醮冏牧冑冓冔冕冖冗冘冞冢窄抑誣冥冫烘菇蟄冷凝坨橇淇淋炭餅磚磧窖醋雕雹霜冱冶爐艷嘲峻灘淡漠煖颼飲冼冽凃凄愴梗凅凇凈凊凋敝濛凔凜遵汞脢凞几凢処凰凱凵凶焰凸摺刷紋預喪嘍奔巡榜殯芙蓉租籠輯鞘萃凼鋸鑊刁蠻刂娩崩批拆攤掰櫱驟歧顆秒袂贓勿囑忌磋琢膚刈羽刎訟戮舂槳艇刓刖霹靂刜創犢刡恙墅幟筵緻刦刧刨昏默攸尿慾薰潤薰圭刪刮痧鏟刱刲刳刴刵踏磅戳柏槐繡芹莧蝟舟銘鵠鶩刼剁剃辮剄剉履鉛剋剌姻咽哨廊掠桅沿召瞻翅趙卜渺茫郭剒剔剕瀝剚愎毅訥纔剜剝啄採剞剟剡剣剤綵剮腎駛黏剰袍剴紊剷剸剺剽剿劁劂劄劈啪柴扳啦劉奭姥夼昫涓熙禪禹錫翔雁鶚劊劌弩柄蜻蛉劒劓劖劘劙瀾簣賞磯釜晉甜薪逐劦熔紂虐赤囚劬劭労劵効劻劼劾峭艮勅勇勵勍勐臘脖龐漫飼盪粥輒勖勗勘驕餒碌泮雇捐竹騎殊阱勣樸懇謹勦勧勩勯勰勱勲勷勸懲慰誡諫勹芡踐闌匁庇拯粟紮袱裹餃匆遽匈匉匊匋匍匐莖匏匕妝痰膿蛹齋苑烤蹈塘羌熊閥螳螂疆碚竿緯荷茵邙魏匚匜匝匟扶稷匣匭攏匸匹耦匽匾匿卂叮瘡禧軫堤棚迢鈞鍊卄卆遐卉瓷盲瓶噹胱腱裸卋卌卍卐怯污賤鄙齷齪陋卓溪唐梯漁陳棗泥漳潯澗梨芬譙贍轅迦鄭単驢弈洽鰲卛占筮卝卞卟吩啉屎翠厄卣卨卪卬卮榫襖璽綬鈕蚤懼殆篤聳卲帘帙繞卹卼卽厂厎厓厔厖厗奚厘厙厜厝諒厠厤厥厪膩孢厮厰厳厴厹厺粕垢蕪菁厼厾叁悟茸薯叄吵笄悌哺譏坫壟弧芯杠潛嬰芻袁詰貪諜煽饋駁収岳締災賄騙叚叡吻攔蘑蜜訣燧玩硯箏椎藺銅逗驪另覓叨嘮謁杵姓喊嚷囂咚嚀塑尋惱憎擦祇泣滲蝠叱吒咄咤喝籀黛舵舷叵叶鐸懿昭穰苴遼叻叼吁塹嫖賭瞧爬衆抒吅吆夥巹橡滌抱縱摩郡唁墜扇籃膀襪頸吋愾諮酬哭妓媛暗錶韁邇妃羿絮蕃渾拐葵暮隅吔吖啶嗪戚吜嗇噬嚥吟哦詠吠吧唧嗒咐吪雋咀徵燐苞茹鈣哧吮吰吱嘎吲哚吳棟嬌窟孟簫忠晗淞闔閭趼宇吶睛噓拂捧疵熄竽笛糠吼吽呀呂韋矇呃呆笨呇貢呉罄呋喃呎呏呔呠呡癡呣呤呦呧瑛眩扒晬淑姬瑜璇鵑呪呫嗶嚅囁呬呯呰呱呲咧噌鈍呴呶呷呸呺呻哱咻嘯嚕籲坎坷邏呿咁咂咆哮咇咈咋蟹煦珅藹咍咑咒詛咔噠嚓咾噥哩喱咗咠咡咢咣咥咦咨嗟詢咩咪咫嚙齧咭咮咱咲咳嗆嗽咴咷咸咹咺咼喉咿婉慟憫賦矜綠茗藍哂搶瞞哆嗦囉噻啾濱彗哋哌哎唷喲哏哐哞哢哤哪裏哫啼喘哰哲萎蚌哳哶哽哿唄唅唆唈唉唎唏嘩堯棣殤璜睿肅唔睇唕唚唞唣喳唪唬唰喏唲唳唵嘛唶唸唹唻唼唾唿啁啃鸚鵡啅埠棧榷祺舖鞅飆啊啍啎啐啓啕啖啗啜啞祈啢啣啤啥啫啱啲啵啺饑啽噶崑沁喁喂喆裙喈嚨喋喌喎喑喒喓喔粗喙幛慶滋鵲喟喣喤喥喦喧騷喨喩梆喫葡萄喭駝挑嚇碰樅瓣純皰藻趟鉻喵営喹喺喼喿嗀嗃嗄嗅嗈嗉嗊嗍嗐嗑嗔詬嗕嗖嗙嗛嗜痂癖嗝嗡嗤嗥嗨嗩嗬嗯嗰嗲嗵嘰嗷嗹嗾嗿嘀嘁嘂嘅惋嘈峪禾蔭嘊嘌嘏嘐嘒嘓嘖嘚嘜嘞嘟囔嘣嘥嘦嘧嘬嘭這謔嚴敞饞鬆嘵嘶嘷嘸蝦嘹嘻嘽嘿噀噂噅噇噉噎噏噔噗噘噙噚噝噞噢噤蟬皿噩噫噭噯噱噲噳嚏涌灑欲巫霏噷噼嚃嚄嚆抖嚌嚐嚔囌嚚嚜嚞嚟嚦嚬嚭嚮嚯嚲嚳飭按竣苛嚵嚶囀囅囈膪謙囍囒囓囗囘蕭酚飄濺諦囝溯眸紇鑾鶻囟殉囡団囤囥囧囨囪囫圇囬囮囯囲図囶囷囸囹圄圉擬囻囿圀圂圃圊粹蠹赦圌墾圏滾鯡鑿枘圕圛圜圞坯埂壤骸炕祠窯豚紳魠鯪鱉圧握圩圪垯圬圮圯炸岬幔毯祇窨菩溉圳圴圻圾坂坆沾坋坌舛壈昆墊墩椅坒坓坩堝坭坰坱坳坴坵坻坼楊掙涎簾垃垈垌垍垓垔垕垗垚垛垝垣垞垟垤垧垮垵垺垾垿埀畔埄埆埇埈埌殃隍埏埒埕埗埜埡埤埦埧埭埯埰埲埳埴埵埶紼埸培怖樁礎輔埼埽堀訶姪廡堃堄摧磐貞韌砌堈堉堊堋堌堍堎堖堙堞堠礁堧堨輿堭堮蜓摘堲堳堽堿塁塄塈煤塋棵塍塏塒塓綢塕鴉沽虱塙塚塝繆塡塢塤塥塩塬塱塲蟎塼塽塾塿墀墁墈墉墐夯増毀墝墠墦漬缽墫墬墮墰墺墻櫥壅壆壊壌壎壒榨蒜壔壕壖壙壚壜壝壠壡壬壭壱売壴壹壻壼寢壿夂夅夆変夊夌漱邑夓腕泄甥禦骼夗夘夙袞瑙妊娠醣梟珊鶯鷺戧幻魘夤蹀祕擂鶇姚宛閨嶼庾撻拇賛蛤裨菠氅漓撈湄蚊霆鯊箐篆篷荊肆舅荔鮃巷慚骰辟邱鎔鐮阪漂燴鯢鰈鱷鴇臚鵬妒峨譚枰晏璣癸祝秤竺牡籟恢罡螻蠍賜絨御梭夬夭砣榆怙枕夶夾餡奄崛葩譎奈賀祀贈奌奐奓奕訢詝奘奜奠奡奣陶奨奩魁奫奬奰媧孩貶隸酥宄狡猾她奼嫣妁氈荼皋膻蠅嬪妄妍嫉媚嬈妗趣妚妞妤礙妬婭妯娌妲妳妵妺姁姅姉姍姒姘姙姜姝姞姣姤姧姫姮娥姱姸姺姽婀娀誘懾脅娉婷娑娓娟娣娭娯娵娶娸娼婊婐婕婞婤婥谿孺婧婪婬婹婺婼婽媁媄媊媕媞媟媠媢媬媮媯媲媵媸媺媻媼眯媿嫄嫈嫋嫏嫕嫗嫘嫚嫜嫠嫡嫦嫩嫪毐嫫嫬嫰嫵嫺嫻嫽嫿嬀嬃嬅嬉耍嬋痴豔嬔嬖嬗嬙嬝嬡嬢嬤嬦嬬嬭幼嬲嬴嬸嬹嬾嬿孀孃孅孌孏曰癲屏孑孓雀孖斟簍謎摺孛矻鳩崮軻祜鸞孥邈毓棠臏孬孭孰孱孳孵泛罔銜孻孿宀宁宂拙株薇掣撫琪瓿榴謐彌宊濂祁瑕宍宏碁宓邸讞実潢町宥宧宨宬徵崎駿掖闕臊煮禽蠶宸豫寀寁寥寃簷庶寎暄磣寔寖寘寙寛寠苫寤肘洱濫蒗陝覈寪弘綽螽寳擅疙瘩晷対檐専尃尅贖絀繚疇釁尌峙醌襟痲碧屁昊槌淘恵瀑牝畑莓缸羚覷蔻髒躁尒尓銳尗尙尜尟尢尥尨尪尬尭尰擒尲尶尷尸尹潽蠖蛾尻釦梢蚴鰭脬蹲屇屌蚵屐屓挪屖屘屙屛屝屢屣巒嶂巖舄屧屨屩屪屭屮戍駐鉀崖嵛巔旮旯楂欖櫸芋茱萸靛麓屴屹屺屼岀岊岌岍阜岑彭鞏岒岝岢嵐岣岧岨岫岱岵岷峁峇峋峒峓峞峠嵋峩峯峱峴峹峿崀崁崆禎崋崌崍嶇崐崒崔嵬巍螢顥崚崞崟崠崢巆崤崦崧殂崬崱崳崴崶崿嵂嵇嵊泗嵌嵎嵒嵓嵗嵙嵞嵡嵩嵫嵯嵴嵼嵾嶁嶃嶄晴嶋嶌嶒嶓嶔嶗嶙嶝嶞嶠嶡嶢嶧嶨嶭嶮嶰嶲嶴嶸巂巃巇巉巋巌巓巘巛滇芎巟巠弋迴巣巤炊擘蜥蟒蠱覡巰蜀彥淖杏茂甫楞巻巽幗巿帛斐鯽蕊帑帔帗帚琉汶帟帡帣帨帬帯帰帷帹暆幃幄幇幋幌幏幘幙幚幞幠幡幢幦幨幩幪幬幭幯幰遙蹉跎餘庚鑑幵幷稚邃庀庁広庄庈庉笠庋跋庖犧庠庤庥鯨庬庱庳庴庵馨衢庹庿廃廄廆廋廌廎廏廐廑廒廕廖廛廝搏鑼廞弛袤廥廧廨廩廱綿踵髓廸廹甌鄴廻廼廾廿躔弁皺弇弌弍弎弐弒弔詭憾薦弝弢弣弤弨弭弮弰弳霖繇燾斌旭溥騫弶弸弼弾彀彄彆纍糾彊彔彖彘彟彠陌彤貽彧繪虹彪炳彫蔚鷗彰癉彲彳彴彷彷徉徨彸彽踩斂旆徂徇徊渭畬鉉裼従筌徘徙徜徠膳甦萌漸徬徭醺徯徳徴潘徻徼忀瘁胖燎怦悸顫扉犀澎湃砰恍惚絞隘忉憚挨餓忐忑忒忖応忝忞耿忡忪忭忮忱忸怩忻悠懣怏遏怔怗怚怛怞懟黍訝怫怭懦怱怲怳怵惕怸怹恁恂恇恉恌恏恒恓恔恘恚恛恝恞恟恠恣恧眄恪恫恬澹恰恿悀悁悃悄悆悊悐悒晦悚悛悜悝悤您悩悪悮悰悱悽惻悳悴悵惘悶悻悾惄愫鍾蒐惆惇惌惎惏惓惔惙惛耄惝瘧濁惥惦惪惲惴惷惸拈愀愃愆愈愊愍愐愑愒愓愔愕愙氓蠢騃昵愜赧愨愬愮愯愷愼慁慂慅慆慇靄慉慊慍慝慥慪慫慬慱慳慴慵慷慼焚憀灼鬱憃憊憋憍眺捏軾憒憔憖憙憧憬憨憪憭憮憯憷憸憹憺懃懅懆邀懊懋懌懍懐懞懠懤懥懨懫懮懰懱毖懵遁樑雍懺懽戁戄戇戉戔戕戛戝戞戠戡戢戣戤戥戦戩戭戯轟戱披菊牖戸戹戺戻戼戽鍬扂楔扃扆扈扊杖牽絹銬鐲賚扐摟攪烊盹瞌跟躉鑔靶鼾払扗玫腮扛扞扠扡扢盔押扤扦扱罾揄綏鞍郤窾扻扼扽抃抆抈抉抌抏瞎抔繯縊擻抜抝択抨摔歉躥牾抶抻搐泵菸拃拄拊髀拋拌脯拎拏拑擢秧沓曳攣迂拚拝拠拡拫拭拮踢拴拶拷攢拽掇芥橐簪摹疔挈瓢驥捺蹻挌挍挎挐揀挓挖掘浚挙
揍聵挲挶挾挿捂捃捄捅捆捉捋胳膊揎捌捍捎軀蛛捗捘捙捜捥捩捫捭据捱捻捼捽掀掂掄臀膘掊掎掏掐笙掔掗掞棉芍掤搪闡掫掮掯揉掱掲掽掾揃揅揆搓揌諢揕揗揘揜揝揞揠揥揩揪揫櫫遒麈揰揲揵揶揸揹揺搆搉搊搋搌搎搔搕撼櫓搗搘搠搡搢搣搤搥搦搧搨搬楦褳訕赸搯搰搲搳搴搵搷搽搾搿摀摁摂摃摎摑摒摓跤摙摛摜摞摠摦睺羯摭摮摯摰摲摳摴摶摷摻摽撂撃撅稻撊撋撏鐧潑撕撙撚撝撟撢撣撦撧撩撬撱朔撳蚍蜉撾撿擀擄闖擉缶觚擐擕擖擗擡擣擤澡腚擧擨擩擫擭擯擰擷擸擼擽擿攃攄攆攉攥攐攓攖攙攛每攩攫轡澄攮攰攲攴軼攷砭訐攽碘敁敃敇敉敍敎筏敔敕敖閏誨敜煌敧敪敱敹敺敻敿斁衽斄牒縐謅斉斎斕鶉讕駮鱧斒筲斛斝斞斠斡斢斨斫斮晾沂潟穎絳邵斲斸釳於琅斾斿旀旂旃旄渦旌旎旐旒旓旖旛旝旟旡旣浴旰獺魃旴旹旻旼旽昀昃昄昇昉晰躲澈熹皎皓礬昑昕昜昝昞昡昤暉筍昦昨昰昱昳昴昶昺昻晁蹇隧蔬髦晄晅晒晛晜晞晟晡晢晤晥曦晩萘瑩顗晿暁暋暌暍暐暔暕煅暘暝暠暡曚暦暨暪朦朧暱暲殄馮暵暸暹暻暾曀曄曇曈曌曏曐曖曘曙曛曡曨曩駱曱甴肱曷牘禺錕曽滄耽朁朅朆杪栓誇竟粘絛朊膺朏朐朓朕朘朙瞄覲溘饔飧朠朢朣柵椆澱蝨朩朮朰朱炆璋鈺熾鹮朳槿朶朾朿杅杇杌隉欣釗湛漼楷瀍煜玟纓翱肈舜贄适逵杓杕杗杙荀蘅杝杞脩珓筊杰榔狍閦顰緬莞杲杳眇杴杶杸杻杼枋枌枒枓衾葄翹紓逋枙狸椏枟槁枲枳枴枵枷枸櫞枹枻柁柂柃柅柈柊柎某柑橘柒柘柙柚柜柞櫟柟柢柣柤柩柬柮柰柲橙柶柷柸柺査柿栃栄栒栔栘栝栟栢栩栫栭栱栲栳栴檀栵栻桀驁桁鎂桄桉桋桎梏椹葚桓桔桕桜桟桫欏桭桮桯桲桴桷桹湘溟梃梊梍梐潼梔梘梜梠梡梣梧梩梱梲梳梴梵梹棁棃櫻棐棑棕櫚簑繃蓑棖棘棜棨棩棪棫棬棯棰棱棳棸棹槨棼椀椄苕椈椊椋椌椐椑椓椗検椤椪椰椳椴椵椷椸椽椿楀楄楅篪楋楍楎楗楘楙楛楝楟楠楢楥楨楩楪楫楬楮楯楰楳楸楹楻楽榀榃榊榎槺榕榖榘榛狉莽榜笞榠榡榤榥榦榧榪榭榰榱槤霰榼榾榿槊閂槎槑槔槖様槜槢槥槧槪槭槮槱槲槻槼槾樆樊樏樑樕樗樘樛樟樠樧樨権樲樴樵猢猻樺樻罍樾樿橁橄橆橈笥龠橕橚橛輛橢橤橧豎膈跨橾橿檁檃檇檉檍檎檑檖檗檜檟檠檣檨檫檬檮檳檴檵檸櫂櫆櫌櫛櫜櫝櫡櫧櫨櫪櫬櫳櫹櫺茄櫽欀欂欃欐欑欒欙欞溴欨欬欱欵欶欷歔欸欹欻欼欿歁歃歆艎歈歊蒔蝶歓歕歘歙歛歜歟歠蹦詮鑲蹣跚陞陟歩歮歯歰歳歴璞歺瞑歾歿殀殈殍殑殗殜殙殛殞殢殣殥殪殫殭殰殳荃殷殸殹蛟殻殽謗毆毈毉餵毎毑蕈毗毘毚茛鄧毧毬毳毷毹毽毾毿氂氄氆靴氉氊氌氍氐聊氕氖気氘氙氚氛氜氝氡洶焊痙氤氳氥氦鋁鋅氪烴氬銨痤汪滸漉痘盂碾菖蒲蕹蛭螅氵氷氹氺氽燙氾氿渚汆汊汋汍汎汏汐汔汕褟汙汚汜蘺沼穢衊汧汨汩汭汲汳汴隄汾沄沅沆瀣沇沈葆浸淪湎溺痼痾沌沍沏沐沔沕沘浜畹礫沚沢沬沭沮沰沱灢沴沷籽沺烹濡洄泂肛泅泆湧肓泐泑泒泓泔泖泙泚泜泝泠漩饃濤粼濘蘚鰍泩泫泭泯銖泱泲洇洊涇琵琶荽薊箔洌洎洏洑潄濯洙洚洟洢洣洧洨洩痢滔洫洮洳洴洵洸洹洺洼洿淌蜚浄浉浙贛渫浠浡浤浥淼瀚浬浭翩萍浯浰蜃淀苔蛞蝓蜇螵蛸煲鯉浹浼浽溦涂涊涐涑涒涔滂涖涘涙涪涫涬涮涴涶涷涿淄淅淆淊淒黯淓淙漣淜淝淟淠淢淤淥淦淩猥藿褻淬淮淯淰淳詣淶紡淸淹燉癯綺渇済渉渋渓渕渙渟渢滓渤澥渧渨渮渰渲渶渼湅湉湋湍湑湓湔黔湜湝湞湟湢湣湩湫湮麟湱湲湴湼満溈溍溎溏溛舐漭溠溤溧馴溮溱溲溳溵溷溻溼溽溾滁滃滉滊滎滏稽滕滘滙滝滫滮羼耷滷滹滻煎漈漊漎繹漕漖漘漙漚漜漪漾漥漦漯漰漵漶漷濞潀潁潎潏潕潗潚潝潞潠潦祉瘍潲潵潷潸潺潾潿澁澂澃澉澌澍澐澒澔澙澠澣澦澧澨澫澬澮澰澴澶澼熏郁濆濇濈濉濊貊濔疣濜濠濩觴濬濮盥濰濲濼瀁瀅瀆瀋瀌瀏瀒瀔瀕瀘瀛瀟瀠瀡瀦瀧瀨瀬瀰瀲瀳瀵瀹瀺瀼灃灄灉灋灒灕灖灝灞灠灤灥灨灩灪蜴灮燼獴灴灸灺炁炅魷炗炘炙炤炫疽烙釺炯炰炱炲炴炷燬炻烀烋瘴鯧烓烔焙烜烝烳飪烺焃焄耆焌焐焓焗焜焞焠焢焮焯焱焼煁煃煆煇煊熠煍熬煐煒煕煗燻礆霾煚煝煟煠煢矸煨瑣煬萁煳煺煻熀熅熇熉羆熒穹熗熘熛熜稔諳爍熤熨熯熰眶螞熲熳熸熿燀燁燂燄盞燊燋燏燔隼燖燜燠燡燦燨燮燹燻燽燿爇爊爓爚爝爟爨蟾爯爰爲爻爿爿牀牁牂牄牋牎牏牓牕釉牚腩蒡虻牠雖蠣牣牤牮牯牲牳牴牷牸牼絆牿靬犂犄犆犇犉犍犎犒犖犗犛犟犠犨犩犪犮犰狳犴犵犺狁甩狃狆狎狒獾狘狙黠狨狩狫狴狷狺狻豕狽蜘猁猇猈猊猋猓猖獗猗猘猙獰獁猞猟獕猭猱猲猳猷猸猹猺玃獀獃獉獍獏獐獒獘獙獚獜獝獞獠獢獣獧鼇蹊獪獫獬豸獮獯鬻獳獷獼玀玁菟玅玆玈珉糝禛郅玍玎玓瓅玔玕玖玗玘玞玠玡玢玤玥玦玨瑰玭玳瑁玶玷玹玼珂珇珈瑚珌饈饌珔珖珙珛珞珡珣珥珧珩珪珮珶珷珺珽琀琁隕琊琇琖琚琠琤琦琨琫琬琭琮琯琰琱琲瑯琹琺琿瑀瑂瑄瑉瑋瑑瑔瑗瑢瑭瑱瑲瑳瑽瑾瑿璀璨璁璅璆璈璉璊璐璘璚璝璟璠璡璥璦璩璪璫璯璲璵璸璺璿瓀瓔瓖瓘瓚瓛臍瓞瓠瓤瓧瓩瓮瓰瓱瓴瓸瓻瓼甀甁甃甄甇甋甍甎甏甑甒甓甔甕甖甗飴蔗甙詫鉅粱盎銹糰甡褥産甪甬甭甮甯鎧甹甽甾甿畀畁畇畈畊畋畎畓畚畛畟鄂畤畦畧荻畯畳畵畷畸畽畾疃疉疋疍疎簞疐疒疕疘疝疢疥疧疳疶疿痁痄痊痌痍痏痐痒痔痗瘢痚痠痡痣痦痩痭痯痱痳痵痻痿瘀瘂瘃瘈瘉瘊瘌瘏瘐瘓瘕瘖瘙瘚瘛瘲瘜瘝瘞瘠瘥瘨瘭瘮瘯瘰癧瘳癘瘵瘸瘺瘻瘼癃癆癇癈癎癐癔癙癜癠癤癥癩蟆癪癭癰発踔紺蔫酵皙砬砒翎翳蘞鎢鑞皚鵯駒鱀粵褶皀皁莢皃鎛皈皌皐皒硃皕皖皘皜皝皞皤皦皨皪皫皭糙綻皴皸皻皽盅盋盌盍盚盝踞盦盩鞦韆盬盭眦睜瞤盯盱眙裰盵盻睞眂眅眈眊県眑眕眚眛眞眢眣眭眳眴眵眹瞓眽郛睃睅睆睊睍睎睏睒睖睙睟睠睢睥睪睪睯睽睾瞇瞈瞋瞍逛瞏瞕瞖瞘瞜瞟瞠瞢瞫瞭瞳瞵瞷瞹瞽闍瞿矓矉矍鑠矔矗矙矚矞矟矠矣矧矬矯矰矱硪碇磙罅舫阡、矼矽礓砃砅砆砉砍砑砕砝砟砠砢砦砧砩砫砮砳艏砵砹砼硇硌硍硎硏硐硒硜硤硨磲茚鋇硭硻硾碃碉碏碣碓碔碞碡碪碫碬碭碯碲碸碻礡磈磉磎磑磔磕磖磛磟磠磡磤磥蹭磪磬磴磵磹磻磽礀礄礅礌礐礚礜礞礤礧礮礱礲礵礽礿祂祄祅祆禳祊祍祏祓祔祕祗祘祛祧祫祲祻祼餌臠錮禂禇禋禑禔禕隋禖禘禚禜禝禠禡禢禤禥禨禫禰禴禸稈秈秊闈颯秌秏秕笈蘵賃秠秣秪秫秬秭秷秸稊稌稍稑稗稙稛稞稬稭稲稹稼顙稾穂穄穇穈穉穋穌貯穏穜穟穠穡穣穤穧穨穭穮穵穸窿闃窀窂窅窆窈窕窊窋窌窒窓窔窞窣窬黷蹙窰窳窴窵窶窸窻竁竃竈竑竜竝竦竪篦篾笆鮫竾笉笊笎笏笐靨笓笤籙笪笫笭笮笰笱笲笳笵笸笻筀筅筇筈筎筑筘筠筤筥筦筧筩筭筯筰筱筳筴讌筸箂箇箊箎箑箒箘箙箛箜篌箝箠箬鏃箯箴箾篁篔簹篘篙篚篛篜篝篟篠篡篢篥篧篨篭篰篲篳篴篶篹篼簀簁簃簆簉簋簌簏簜簟簠簥簦簨簬簰簸簻籊籐籒籓籔籖籚籛籜籣籥籧籩籪籫籯芾麴籵籸籹籼粁粃粋粑粔糲粛粞粢粧粨粲粳粺粻粽闢粿糅糆糈糌糍糒糔萼糗蛆蹋糢糨糬糭糯糱糴糶糸糺紃蹼鰹黴紆紈絝紉閩襻紑紕紘錠鳶鷂紝紞紟紥紩紬紱紲紵紽紾紿絁絃絅経絍絎絏縭褵絓絖絘絜絢絣螯絪絫聒絰絵絶絺絻絿綀綃綅綆綈綉綌綍綎綑綖綘継続緞綣綦綪綫綮綯綰罟蝽綷縩綹綾緁緄緅緆緇緋緌緎総緑緔緖緗緘緙緜緡緤緥緦纂緪緰緱緲緶緹縁縃縄縈縉縋縏縑縕縗縚縝縞縟縠縡縢縦縧縯縰騁縲縳縴縵縶縹縻衙縿繄繅繈繊繋繐繒繖繘繙繠繢繣繨繮繰繸繻繾纁纆纇纈纉纊纑纕纘纙纚纛缾罃罆罈罋罌罎罏罖罘罛罝罠罣罥罦罨罫罭鍰罳罶罹罻罽罿羂羃羇羋蕉51鴕羑羖羗羜羝羢羣羥羧羭羮羰羱羵羶羸藜鮐翀翃翄翊翌翏翕翛翟翡翣翥翦躚翪翫翬翮翯翺翽翾翿闆饕鴰鍁耋耇耎耏耑耒耜耔耞耡耤耨耩耪耬耰鬢耵聹聃聆聎聝聡聦聱聴聶聼閾聿肄肏肐肕腋肙肜肟肧胛肫肬肭肰肴肵肸肼胊胍胏胑胔胗胙胝胠銓胤胦胩胬胭胯胰胲胴胹胻胼胾脇脘脝脞脡脣脤脥脧脰脲脳腆腊腌臢腍腒腓腖腜腠腡腥腧腬腯踝蹬鐐腴腶蠕誹膂膃膆膇膋膔膕膗膙膟黐膣膦膫膰膴膵膷膾臃臄臇臈臌臐臑臓臕臖臙臛臝臞臧蓐詡臽臾臿舀舁鰟鮍舋舎舔舗舘舝舠舡舢舨舭舲舳舴舸舺艁艄艅艉艋艑艕艖艗艘艚艜艟艣艤艨艩艫艬艭荏艴艶艸艹艻艿芃芄芊萰陂藭芏芔芘芚蕙芟芣芤茉芧芨芩芪芮芰鰱芴芷芸蕘豢芼芿苄苒苘苙苜蓿苠苡苣蕒苤苧苪鎊苶苹苺苻苾茀茁范蠡萣茆茇茈茌茍茖茞茠茢茥茦菰茭茯茳藨茷藘茼荁荄荅荇荈菅蜢鴞荍荑荘荳荵荸薺莆莒莔莕莘莙莚莛莜莝莦莨菪莩莪莭莰莿菀菆菉菎菏菐菑菓菔菕菘菝菡菢菣菥蓂菧菫轂鎣菶菷菹醢菺菻菼菾萅萆萇萋萏萐萑萜萩萱萴萵萹萻葇葍葎葑葒葖葙葠葥葦葧葭葯葳葴葶葸葹葽蒄蒎蒓蘢薹蒞蒟蒻蒢蒦蒨蒭藁蒯蒱鉾蒴蒹蒺蒽蓀蓁蓆蓇蓊蓌蓍蓏蓓蓖蓧蓪蓫蓽跣藕蓯蓰蓱蓴蓷蓺蓼蔀蔂蔃蔆蔇蔉蔊蔋蔌蔎蔕蔘蔙蔞蔟鍔蔣雯蔦蔯蔳蔴蔵蔸蔾蕁蕆蕋蕍蕎蕐蕑蕓蕕蕖蕗蕝蕞蕠蕡蕢蕣蕤蕨蕳蕷蕸蕺蕻薀薁薃薅薆薈薉薌薏薐薔薖薘薙諤釵薜薠薢薤薧薨薫薬薳薶薷薸薽薾薿藄藇藋藎藐藙藚藟藦藳藴藶藷藾蘀蘁蘄蘋蘗蘘蘝蘤蘧蘩蘸蘼虀虆虍蟠虒虓虖虡虣虥虩虯虰蛵虵虷鱒虺虼蚆蚈蚋蚓蚔蚖蚘蚜蚡蚣蚧蚨蚩蚪蚯蚰蜒蚱蚳蚶蚹蚺蚻蚿蛀蛁蛄蛅蝮蛌蛍蛐蟮蛑蛓蛔蛘蛚蛜蛡蛣蜊蛩蛺蛻螫蜅蜆蜈蝣蜋蜍蜎蜑蠊蜛餞蜞蜣蜨蜩蜮蜱蜷蜺蜾蜿蝀蝃蝋蝌蝍蝎蝏蝗蝘蝙蝝鱝蝡蝤蝥蝯蝰蝱蝲蝴蝻螃蠏螄螉螋螒螓螗螘螙螚蟥螟螣螥螬螭螮螾螿蟀蟅蟈蟊蟋蟑蟓蟛蟜蟟蟢蟣蟨蟪蟭蟯蟳蟶蟷蟺蟿蠁蠂蠃蠆蠋蠐蠓蠔蠗蠙蠚蠛蠜蠧蠨蠩蠭蠮蠰蠲蠵蠸蠼蠽衁衂衄衇衈衉衋衎衒衕衖衚衞裳鈎衭衲衵衹衺衿袈裟袗袚袟袢袪袮袲袴袷袺袼褙袽裀裉裊裋裌裍裎裒裛裯裱裲裴裾褀褂褉褊褌褎褐褒褓褔褕褘褚褡褢褦褧褪褫褭褯褰褱襠褸褽褾襁襃襆襇襉襋襌襏襚襛襜襝襞襡襢襤襦襫襬襭襮襴襶襼襽襾覂覃覅覇覉覊覌覗覘覚覜覥覦覧
覩覬覯覰観覿觔觕觖觜觽觝觡酲觩觫觭觱觳觶觷觼觾觿言賅訃訇訏訑訒詁託訧訬訳訹証訾詀詅詆譭詈詊詎詑詒詖詗詘詧詨詵詶詸詹詻詼詿誂誃誄鋤誆誋誑誒誖誙誚誥誧説読誯誶誾諂諄諆諌諍諏諑諕諗諛諝諞諟諠諡諴諵諶諼謄謆謇謌謍謏謑謖謚謡謦謪謫謳謷謼謾譁譅譆譈譊譌譒譔譖鑫譞譟譩譫譬譱譲譴譸譹譾讅讆讋讌讎讐讒讖讙讜讟谽豁豉豇豈豊豋豌豏豔豞豖豗豜豝豣豦豨豭豱豳豵豶豷豺豻貅貆貍貎貔貘貙貜貤饜貰餸貺賁賂賏賒賕賙賝賡賧賨賫鬭賮賵賸賺賻賾贇贉贐贔贕贗赬赭赱赳迄趁趂趄趐趑趒趔趡趦趫趮趯趲趴趵趷趹趺趿跁跂跅跆躓蹌跐跕跖跗跙跛跦跧跩跫跬跮跱跲跴跺跼跽踅踆踈踉踊踒踖踘踜踟躇躕踠踡踣踤踥踦踧蹺踫踮踰踱踴踶踹踺踼踽躞蹁蹂躪蹎蹐蹓蹔蹕蹚蹜蹝蹟蹠蹡蹢躂蹧蹩蹪蹯鞠蹽躃躄躅躊躋躐躑躒躘躙躛躝躠躡躦躧躩躭躰躳躶軃軆輥軏軔軘軜軝齶転軥軨軭軱軲轆軷軹軺軽軿輀輂輦輅輇輈輓輗輙輜輞輠輤輬輭輮輳輴輵輶輹輼輾轀轇轏轑轒轔轕轖轗轘轙轝轞轢轤辠辢辤辵辶辺込辿迅迋迍麿迓迣迤邐迥迨迮迸迺迻迿逄逅逌逍逑逓逕逖逡逭逯逴逶逹遄遅遉遘遛遝遢遨遫遯遰遴遶遹遻邂邅邉邋邎邕邗邘邛邠邢邧邨邯鄲邰邲邳邴邶邷邽邾邿郃郄郇郈郔郕郗郙郚郜郝郞郟郠郢郪郫郯郰郲郳郴郷郹郾郿鄀鄄鄆鄇鄈鄋鄍鄎鄏鄐鄑鄒鄔鄕鄖鄗鄘鄚鄜鄞鄠鄢鄣鄤鄦鄩鄫鄬鄮鄯鄱鄶鄷鄹鄺鄻鄾鄿酃酅酆酇酈酊酋酎酏酐酣酔酕醄酖酗酞酡酢酤酩酴酹酺醁醅醆醊醍醐醑醓醖醝醞醡醤醨醪醭醯醰醱醲醴醵醸醹醼醽醾釂釃釅釆釈鱸鎦閶釓釔釕鈀釙鼢鼴釤釧釪釬釭釱釷釸釹鈁鈃鈄鈆鈇鈈鈊鈌鈐鈑鈒鈤鈥鈧鈬鈮鈰鈳鐺鈸鈹鈽鈿鉄鉆鉈鉋鉌鉍鉏鉑鉕鉚鉢鉥鉦鉨鉬鉭鉱鉲鉶鉸鉺鉼鉿銍銎銑銕鏤銚銛銠銣銤銥銦銧銩銪銫銭銰銲銶銻銼銾鋂鋃鋆鋈鋊鋌鋍鋏鋐鋑鋕鋘鋙鋝鋟鋦鋨鋩鋭鋮鋯鋰鋱鋳鋹鋺鋻鏰鐱錀錁錆錇錈錍錏錒錔錙錚錛錞錟錡錤錩錬録錸錼鍀鍆鍇鍉鍍鍏鍐鍘鍚鍛鍠鍤鍥鍩鍫鍭鍱鍴鍶鍹鍺鍼鍾鎄鎇鎉鎋鎌鎍鎏鎒鎓鎗鎘鎚鎞鎡鎤鎩鎪鎭鎯鎰鎳鎴鎵鎸鎹鎿鏇鏊鏌鏐鏑鏖鏗鏘鏚鏜鏝鏞鏠鏦鏨鏷鏸鏹鏻鏽鏾鐃鐄鐇鐏鐒鐓鐔鐗馗鐙鐝鐠鐡鐦鐨鐩鐫鐬鐱鐳鐶鐻鐽鐿鑀鑅鑌鑐鑕鑚鑛鑢鑤鑥鑪鑭鑯鑱鑴鑵鑷钁钃镻閆閈閌閎閒閔閗閟閡関閤閤閧閬閲閹閺閻閼閽閿闇闉闋闐闑闒闓闘闚闞闟闠闤闥阞阢阤阨阬阯阹阼阽陁陑陔陛陜陡陥陬騭陴険陼陾隂隃隈隒隗隞隠隣隤隩隮隰顴隳隷隹雂雈雉雊雎雑雒雗雘雚雝雟雩雰雱驛霂霅霈霊霑霒霓霙霝霢霣霤霨霩霪霫霮靁靆靉靑靚靣靦靪靮靰靳靷靸靺靼靿鞀鞃鞄鞌鞗鞙鞚鞝鞞鞡鞣鞨鞫鞬鞮鞶鞹鞾韃韅韉馱韍韎韔韖韘韝韞韡韣韭韮韱韹韺頀颳頄頇頊頍頎頏頒頖頞頠頫頬顱頯頲頴頼顇顋顑顒顓顔顕顚顜顢顣顬顳颭颮颱颶颸颺颻颽颾颿飀飂飈飌飜飡飣飤飥飩飫飮飱飶餀餂餄餎餇餈餑餔餕餖餗餚餛餜餟餠餤餧餩餪餫餬餮餱餲餳餺餻餼餽餿饁饅饇饉饊饍饎饐饘饟饢馘馥馝馡馣騮騾馵馹駃駄駅駆駉駋駑駓駔駗駘駙駜駡駢駪駬駰駴駸駹駽駾騂騄騅騆騉騋騍騏驎騑騒験騕騖騠騢騣騤騧驤騵騶騸騺驀驂驃驄驆驈驊驌驍驎驏驒驔驖驙驦驩驫骺鯁骫骭骯骱骴骶骷髏骾髁髂髄髆髈髐髑髕髖髙髝髞髟髡髣髧髪髫髭髯髲髳髹髺髽髾鬁鬃鬅鬈鬋鬎鬏鬐鬑鬒鬖鬗鬘鬙鬠鬣鬪鬫鬬鬮鬯鬰鬲鬵鬷魆魈魊魋魍魎魑魖鰾魛魟魣魦魨魬魴魵魸鮀鮁鮆鮌鮎鮑鮒鮓鮚鮞鮟鱇鮠鮦鮨鮪鮭鮶鮸鮿鯀鯄鯆鯇鯈鯔鯕鯖鯗鯙鯠鯤鯥鯫鯰鯷鯸鯿鰂鰆鶼鰉鰋鰐鰒鰕鰛鰜鰣鰤鰥鰦鰨鰩鰮鰳鰶鰷鱺鰼鰽鱀鱄鱅鱆鱈鱎鱐鱓鱔鱖鱘鱟鱠鱣鱨鱭鱮鱲鱵鱻鲅鳦鳧鳯鳲鳷鳻鴂鴃鴄鴆鴈鴎鴒鴔鴗鴛鴦鴝鵒鴟鴠鴢鴣鴥鴯鶓鴳鴴鴷鴽鵀鵁鵂鵓鵖鵙鵜鶘鵞鵟鵩鵪鵫鵵鵷鵻鵾鶂鶊鶏鶒鶖鶗鶡鶤鶦鶬鶱鶲鶵鶸鶹鶺鶿鷀鷁鷃鷄鷇鷈鷉鷊鷏鷓鷕鷖鷙鷞鷟鷥鷦鷯鷩鷫鷭鷳鷴鷽鷾鷿鸂鸇鸊鸏鸑鸒鸓鸕鸛鸜鸝鹸鹹鹺麀麂麃麄麇麋麌麐麑麒麚麛麝麤麩麪麫麮麯麰麺麾黁黈黌黢黒黓黕黙黝黟黥黦黧黮黰黱黲黶黹黻黼黽黿鼂鼃鼅鼈鼉鼏鼐鼒鼕鼖鼙鼚鼛鼡鼩鼱鼪鼫鼯鼷鼽齁齆齇齈齉齌齎齏齔齕齗齙齚齜齞齟齬齠齢齣齧齩齮齯齰齱齵齾龎龑龒龔龖龘龝龡龢龤' diff --git a/paddlespeech/t2s/frontend/zh_normalization/text_normlization.py b/paddlespeech/t2s/frontend/zh_normalization/text_normlization.py index c68caeeb78e675a69bea501ac33c0c951c5673fa..9794a70075bf79cbb978c83a4e9fa0bad3e76e1b 100644 --- a/paddlespeech/t2s/frontend/zh_normalization/text_normlization.py +++ b/paddlespeech/t2s/frontend/zh_normalization/text_normlization.py @@ -51,9 +51,9 @@ from .quantifier import replace_temperature class TextNormalizer(): def __init__(self): - self.SENTENCE_SPLITOR = re.compile(r'([:,;。?!,;?!][”’]?)') + self.SENTENCE_SPLITOR = re.compile(r'([:、,;。?!,;?!][”’]?)') - def _split(self, text: str) -> List[str]: + def _split(self, text: str, lang="zh") -> List[str]: """Split long text into sentences with sentence-splitting punctuations. Parameters ---------- @@ -65,7 +65,8 @@ class TextNormalizer(): Sentences. """ # Only for pure Chinese here - text = text.replace(" ", "") + if lang == "zh": + text = text.replace(" ", "") text = self.SENTENCE_SPLITOR.sub(r'\1\n', text) text = text.strip() sentences = [sentence.strip() for sentence in re.split(r'\n+', text)] diff --git a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py index cdec03abc15ef89a96bf2ae7df4932bb94f91437..405ad957d478621c15ddc3185bd31d459006c140 100644 --- a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py +++ b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py @@ -907,7 +907,9 @@ class StyleFastSpeech2Inference(FastSpeech2Inference): energy: Union[paddle.Tensor, np.ndarray]=None, energy_scale: Union[int, float]=None, energy_bias: Union[int, float]=None, - robot: bool=False): + robot: bool=False, + spk_emb=None, + spk_id=None): """ Parameters ---------- @@ -939,7 +941,12 @@ class StyleFastSpeech2Inference(FastSpeech2Inference): Output sequence of features (L, odim). 
""" normalized_mel, d_outs, p_outs, e_outs = self.acoustic_model.inference( - text, durations=None, pitch=None, energy=None) + text, + durations=None, + pitch=None, + energy=None, + spk_emb=spk_emb, + spk_id=spk_id) # priority: groundtruth > scale/bias > previous output # set durations if isinstance(durations, np.ndarray): @@ -991,7 +998,9 @@ class StyleFastSpeech2Inference(FastSpeech2Inference): durations=durations, pitch=pitch, energy=energy, - use_teacher_forcing=True) + use_teacher_forcing=True, + spk_emb=spk_emb, + spk_id=spk_id) logmel = self.normalizer.inverse(normalized_mel) return logmel diff --git a/paddlespeech/t2s/models/melgan/style_melgan.py b/paddlespeech/t2s/models/melgan/style_melgan.py index 0854c0a98125e6fd239c4a088734015c9f5537bd..bd451e1fd5fd928c298d7e410cda4fdcc500213d 100644 --- a/paddlespeech/t2s/models/melgan/style_melgan.py +++ b/paddlespeech/t2s/models/melgan/style_melgan.py @@ -188,7 +188,8 @@ class StyleMelGANGenerator(nn.Layer): try: if layer: nn.utils.remove_weight_norm(layer) - except ValueError: + # add AttributeError to bypass https://github.com/PaddlePaddle/Paddle/issues/38532 temporarily + except (ValueError, AttributeError): pass self.apply(_remove_weight_norm) diff --git a/paddlespeech/t2s/models/speedyspeech/speedyspeech.py b/paddlespeech/t2s/models/speedyspeech/speedyspeech.py index ece5c279f2f82e76b873d2f0b80b78173fce098f..cc9e20662caaa1fff2a0b09995995be74e7fd180 100644 --- a/paddlespeech/t2s/models/speedyspeech/speedyspeech.py +++ b/paddlespeech/t2s/models/speedyspeech/speedyspeech.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import numpy as np import paddle from paddle import nn @@ -23,18 +22,18 @@ def expand(encodings: paddle.Tensor, durations: paddle.Tensor) -> paddle.Tensor: encodings: (B, T, C) durations: (B, T) """ - batch_size, t_enc = durations.shape - durations = durations.numpy() - slens = np.sum(durations, -1) - t_dec = np.max(slens) - M = np.zeros([batch_size, t_dec, t_enc]) + batch_size, t_enc = paddle.shape(durations) + slens = paddle.sum(durations, -1) + t_dec = paddle.max(slens) + M = paddle.zeros([batch_size, t_dec, t_enc]) for i in range(batch_size): k = 0 for j in range(t_enc): d = durations[i, j] - M[i, k:k + d, j] = 1 + # If the d == 0, slice action is meaningless and not supported + if d >= 1: + M[0, k:k + d, j] = 1 k += d - M = paddle.to_tensor(M, dtype=encodings.dtype) encodings = paddle.matmul(M, encodings) return encodings @@ -95,8 +94,13 @@ class TextEmbedding(nn.Layer): class SpeedySpeechEncoder(nn.Layer): - def __init__(self, vocab_size, tone_size, hidden_size, kernel_size, - dilations): + def __init__(self, + vocab_size, + tone_size, + hidden_size, + kernel_size, + dilations, + spk_num=None): super().__init__() self.embedding = TextEmbedding( vocab_size, @@ -104,6 +108,15 @@ class SpeedySpeechEncoder(nn.Layer): tone_size, padding_idx=0, tone_padding_idx=0) + + if spk_num: + self.spk_emb = nn.Embedding( + num_embeddings=spk_num, + embedding_dim=hidden_size, + padding_idx=0) + else: + self.spk_emb = None + self.prenet = nn.Sequential( nn.Linear(hidden_size, hidden_size), nn.ReLU(), ) @@ -118,8 +131,10 @@ class SpeedySpeechEncoder(nn.Layer): nn.BatchNorm1D(hidden_size, data_format="NLC"), nn.Linear(hidden_size, hidden_size), ) - def forward(self, text, tones): + def forward(self, text, tones, spk_id=None): embedding = self.embedding(text, tones) + if self.spk_emb: + 
embedding += self.spk_emb(spk_id).unsqueeze(1) embedding = self.prenet(embedding) x = self.res_blocks(embedding) x = embedding + self.postnet1(x) @@ -160,22 +175,22 @@ class SpeedySpeechDecoder(nn.Layer): class SpeedySpeech(nn.Layer): - def __init__( - self, - vocab_size, - encoder_hidden_size, - encoder_kernel_size, - encoder_dilations, - duration_predictor_hidden_size, - decoder_hidden_size, - decoder_output_size, - decoder_kernel_size, - decoder_dilations, - tone_size=None, ): + def __init__(self, + vocab_size, + encoder_hidden_size, + encoder_kernel_size, + encoder_dilations, + duration_predictor_hidden_size, + decoder_hidden_size, + decoder_output_size, + decoder_kernel_size, + decoder_dilations, + tone_size=None, + spk_num=None): super().__init__() encoder = SpeedySpeechEncoder(vocab_size, tone_size, encoder_hidden_size, encoder_kernel_size, - encoder_dilations) + encoder_dilations, spk_num) duration_predictor = DurationPredictor(duration_predictor_hidden_size) decoder = SpeedySpeechDecoder(decoder_hidden_size, decoder_output_size, decoder_kernel_size, decoder_dilations) @@ -184,13 +199,15 @@ class SpeedySpeech(nn.Layer): self.duration_predictor = duration_predictor self.decoder = decoder - def forward(self, text, tones, durations): + def forward(self, text, tones, durations, spk_id: paddle.Tensor=None): # input of embedding must be int64 text = paddle.cast(text, 'int64') tones = paddle.cast(tones, 'int64') + if spk_id is not None: + spk_id = paddle.cast(spk_id, 'int64') durations = paddle.cast(durations, 'int64') - encodings = self.encoder(text, tones) - # (B, T) + encodings = self.encoder(text, tones, spk_id) + pred_durations = self.duration_predictor(encodings.detach()) # expand encodings @@ -204,7 +221,7 @@ class SpeedySpeech(nn.Layer): decoded = self.decoder(encodings) return decoded, pred_durations - def inference(self, text, tones=None): + def inference(self, text, tones=None, durations=None, spk_id=None): # text: [T] # tones: [T] # input of embedding must be int64 @@ -214,25 +231,16 @@ class SpeedySpeech(nn.Layer): tones = paddle.cast(tones, 'int64') tones = tones.unsqueeze(0) - encodings = self.encoder(text, tones) - pred_durations = self.duration_predictor(encodings) # (1, T) - durations_to_expand = paddle.round(pred_durations.exp()) - durations_to_expand = (durations_to_expand).astype(paddle.int64) + encodings = self.encoder(text, tones, spk_id) - slens = paddle.sum(durations_to_expand, -1) # [1] - t_dec = slens[0] # [1] - t_enc = paddle.shape(pred_durations)[-1] - M = paddle.zeros([1, t_dec, t_enc]) - - k = paddle.full([1], 0, dtype=paddle.int64) - for j in range(t_enc): - d = durations_to_expand[0, j] - # If the d == 0, slice action is meaningless and not supported - if d >= 1: - M[0, k:k + d, j] = 1 - k += d - - encodings = paddle.matmul(M, encodings) + if durations is None: + # (1, T) + pred_durations = self.duration_predictor(encodings) + durations_to_expand = paddle.round(pred_durations.exp()) + durations_to_expand = durations_to_expand.astype(paddle.int64) + else: + durations_to_expand = durations + encodings = expand(encodings, durations_to_expand) shape = paddle.shape(encodings) t_dec, feature_size = shape[1], shape[2] @@ -247,7 +255,8 @@ class SpeedySpeechInference(nn.Layer): self.normalizer = normalizer self.acoustic_model = speedyspeech_model - def forward(self, phones, tones): - normalized_mel = self.acoustic_model.inference(phones, tones) + def forward(self, phones, tones, durations=None, spk_id=None): + normalized_mel = self.acoustic_model.inference( + 
phones, tones, durations=durations, spk_id=spk_id) logmel = self.normalizer.inverse(normalized_mel) return logmel diff --git a/paddlespeech/t2s/models/speedyspeech/speedyspeech_updater.py b/paddlespeech/t2s/models/speedyspeech/speedyspeech_updater.py index 6f9937a51845ff7c3962ba14efc1a53f0d15b94d..ee45cdc85dc6cd0c078f6699468aaa442c79a38d 100644 --- a/paddlespeech/t2s/models/speedyspeech/speedyspeech_updater.py +++ b/paddlespeech/t2s/models/speedyspeech/speedyspeech_updater.py @@ -50,10 +50,14 @@ class SpeedySpeechUpdater(StandardUpdater): self.msg = "Rank: {}, ".format(dist.get_rank()) losses_dict = {} +        # spk_id is not None only for the multi-speaker SpeedySpeech +        spk_id = batch["spk_id"] if "spk_id" in batch else None + decoded, predicted_durations = self.model( text=batch["phones"], tones=batch["tones"], -            durations=batch["durations"]) +            durations=batch["durations"], +            spk_id=spk_id) target_mel = batch["feats"] spec_mask = F.sequence_mask( @@ -112,10 +116,13 @@ class SpeedySpeechEvaluator(StandardEvaluator): self.msg = "Evaluate: " losses_dict = {} +        spk_id = batch["spk_id"] if "spk_id" in batch else None + decoded, predicted_durations = self.model( text=batch["phones"], tones=batch["tones"], -            durations=batch["durations"]) +            durations=batch["durations"], +            spk_id=spk_id) target_mel = batch["feats"] spec_mask = F.sequence_mask( diff --git a/paddlespeech/t2s/modules/predictor/duration_predictor.py b/paddlespeech/t2s/modules/predictor/duration_predictor.py index 6d7adf236d285db283b0a3558c1ad086c59a3108..6b7c6a6bef206388c1e6ec7853ea6378e7b2434d 100644 --- a/paddlespeech/t2s/modules/predictor/duration_predictor.py +++ b/paddlespeech/t2s/modules/predictor/duration_predictor.py @@ -115,8 +115,8 @@ class DurationPredictor(nn.Layer): Returns ---------- -        Tensor -            Batch of predicted durations in log domain (B, Tmax). +        Tensor +            Batch of predicted durations in log domain (B, Tmax). """ return self._forward(xs, x_masks, False) diff --git a/paddlespeech/t2s/modules/predictor/length_regulator.py b/paddlespeech/t2s/modules/predictor/length_regulator.py index bf595b24e89b713b19cc52e300c22ccd4363e469..f1ecfb7c1b5ade79947f9d6abc97fcbdbb496a6d 100644 --- a/paddlespeech/t2s/modules/predictor/length_regulator.py +++ b/paddlespeech/t2s/modules/predictor/length_regulator.py @@ -70,8 +70,8 @@ class LengthRegulator(nn.Layer): ---------- xs : Tensor Batch of sequences of char or phoneme embeddings (B, Tmax, D). -        ds : Tensor(int64) -            Batch of durations of each frame (B, T). +        ds : Tensor(int64) +            Batch of durations of each frame (B, T). alpha : float, optional Alpha value to control speed of speech.
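Both the rewritten `expand` in speedyspeech.py and the `LengthRegulator` touched above rely on the same trick: per-phone durations are turned into a dense alignment matrix M of shape (B, T_dec, T_enc), and `paddle.matmul(M, encodings)` repeats each phone encoding for its number of output frames, which is why a zero duration must be skipped (an empty slice assignment is not supported). A minimal standalone sketch of the idea, using toy shapes that are not part of this patch:

import paddle

# one utterance, two phones lasting 2 and 3 frames respectively
encodings = paddle.rand([1, 2, 4])                     # (B, T_enc, C)
durations = paddle.to_tensor([[2, 3]], dtype='int64')  # (B, T_enc)

t_dec = int(paddle.sum(durations, -1).max())           # 5 decoder frames in total
M = paddle.zeros([1, t_dec, 2])                        # (B, T_dec, T_enc)
k = 0
for j in range(2):
    d = int(durations[0, j])
    if d >= 1:  # skip zero durations: an empty slice assignment is unsupported
        M[0, k:k + d, j] = 1
    k += d

expanded = paddle.matmul(M, encodings)                 # (1, 5, 4): phone j repeated d_j times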
diff --git a/paddlespeech/t2s/modules/tade_res_block.py b/paddlespeech/t2s/modules/tade_res_block.py index 19b07639f863237febbd3b06877fd42591427e49..1ca4e6d817375401b677402fa1aa8cca16e4211d 100644 --- a/paddlespeech/t2s/modules/tade_res_block.py +++ b/paddlespeech/t2s/modules/tade_res_block.py @@ -33,7 +33,11 @@ class TADELayer(nn.Layer): """Initilize TADE layer.""" super().__init__() self.norm = nn.InstanceNorm1D( - in_channels, momentum=0.1, data_format="NCL") + in_channels, + momentum=0.1, + data_format="NCL", + weight_attr=False, + bias_attr=False) self.aux_conv = nn.Sequential( nn.Conv1D( aux_channels, diff --git a/paddlespeech/text/training/__init__.py b/paddlespeech/text/exps/__init__.py similarity index 89% rename from paddlespeech/text/training/__init__.py rename to paddlespeech/text/exps/__init__.py index 185a92b8d94d3426d616c0624f0f2ee04339349e..abf198b97e6e818e1fbe59006f98492640bcee54 100644 --- a/paddlespeech/text/training/__init__.py +++ b/paddlespeech/text/exps/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/paddlespeech/text/utils/__init__.py b/paddlespeech/text/exps/ernie_linear/__init__.py similarity index 89% rename from paddlespeech/text/utils/__init__.py rename to paddlespeech/text/exps/ernie_linear/__init__.py index 185a92b8d94d3426d616c0624f0f2ee04339349e..abf198b97e6e818e1fbe59006f98492640bcee54 100644 --- a/paddlespeech/text/utils/__init__.py +++ b/paddlespeech/text/exps/ernie_linear/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/paddlespeech/text/exps/ernie_linear/punc_restore.py b/paddlespeech/text/exps/ernie_linear/punc_restore.py new file mode 100644 index 0000000000000000000000000000000000000000..2cb4d07199d790ba600834c836d383f6b6f19238 --- /dev/null +++ b/paddlespeech/text/exps/ernie_linear/punc_restore.py @@ -0,0 +1,110 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
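+# Overview of this script: the input text is lower-cased and stripped of any
+# existing punctuation, split into characters and tokenized with ErnieTokenizer;
+# ErnieLinear then predicts one punctuation label per token (label 0 means
+# "no punctuation"), and the predicted marks are re-inserted after their tokens.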
+import argparse +import re + +import paddle +import yaml +from paddlenlp.transformers import ErnieTokenizer +from yacs.config import CfgNode + +from paddlespeech.text.models.ernie_linear import ErnieLinear + +DefinedClassifier = { +    'ErnieLinear': ErnieLinear, +} + +tokenizer = ErnieTokenizer.from_pretrained('ernie-1.0') + + +def _clean_text(text, punc_list): +    text = text.lower() +    text = re.sub('[^A-Za-z0-9\u4e00-\u9fa5]', '', text) +    text = re.sub(f'[{"".join(punc_list[1:])}]', '', text) +    return text + + +def preprocess(text, punc_list): +    clean_text = _clean_text(text, punc_list) +    assert len(clean_text) > 0, f'Invalid input string: {text}' +    tokenized_input = tokenizer( +        list(clean_text), return_length=True, is_split_into_words=True) +    _inputs = dict() +    _inputs['input_ids'] = tokenized_input['input_ids'] +    _inputs['seg_ids'] = tokenized_input['token_type_ids'] +    _inputs['seq_len'] = tokenized_input['seq_len'] +    return _inputs + + +def test(args): +    with open(args.config) as f: +        config = CfgNode(yaml.safe_load(f)) +    print("========Args========") +    print(yaml.safe_dump(vars(args))) +    print("========Config========") +    print(config) + +    punc_list = [] +    with open(config["data_params"]["punc_path"], 'r') as f: +        for line in f: +            punc_list.append(line.strip()) + +    model = DefinedClassifier[config["model_type"]](**config["model"]) +    state_dict = paddle.load(args.checkpoint) +    model.set_state_dict(state_dict["main_params"]) +    model.eval() +    _inputs = preprocess(args.text, punc_list) +    seq_len = _inputs['seq_len'] +    input_ids = paddle.to_tensor(_inputs['input_ids']).unsqueeze(0) +    seg_ids = paddle.to_tensor(_inputs['seg_ids']).unsqueeze(0) +    logits, _ = model(input_ids, seg_ids) +    preds = paddle.argmax(logits, axis=-1).squeeze(0) +    tokens = tokenizer.convert_ids_to_tokens( +        _inputs['input_ids'][1:seq_len - 1]) +    labels = preds[1:seq_len - 1].tolist() +    assert len(tokens) == len(labels) +    # prepend label 0, which stands for "no punctuation" +    punc_list = [0] + punc_list +    text = '' +    for t, l in zip(tokens, labels): +        text += t +        if l != 0:  # a non-zero label selects the punctuation mark to append +            text += punc_list[l] +    print("Punctuation Restoration Result:", text) +    return text + + +def main(): +    # parse args and config, then run punctuation restoration +    parser = argparse.ArgumentParser(description="Run Punctuation Restoration.") +    parser.add_argument("--config", type=str, help="ErnieLinear config file.") +    parser.add_argument("--checkpoint", type=str, help="snapshot to load.") +    parser.add_argument("--text", type=str, help="raw text to be restored.") +    parser.add_argument( +        "--ngpu", type=int, default=1, help="if ngpu=0, use cpu.") + +    args = parser.parse_args() + +    if args.ngpu == 0: +        paddle.set_device("cpu") +    elif args.ngpu > 0: +        paddle.set_device("gpu") +    else: +        print("ngpu should be >= 0 !") + +    test(args) + + +if __name__ == "__main__": +    main() diff --git a/paddlespeech/text/exps/ernie_linear/test.py b/paddlespeech/text/exps/ernie_linear/test.py index 3cd507fbbe4fc6528a4cf1c1f6deb1ac33eed124..4302a1a3bfa80e8e8417fc7d5ec2786eda8417df 100644 --- a/paddlespeech/text/exps/ernie_linear/test.py +++ b/paddlespeech/text/exps/ernie_linear/test.py @@ -11,36 +11,110 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License.
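+# Evaluation for the ErnieLinear punctuation model: load a checkpoint, run the
+# test set through the model, and print sklearn's classification_report plus a
+# precision/recall/F1 table restricted to labels [1, 2, 3] (COMMA, PERIOD,
+# QUESTION); label 0, "no punctuation", is excluded from the macro averages.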
-"""Evaluation for model.""" +import argparse + +import numpy as np +import paddle +import pandas as pd import yaml +from paddle import nn +from paddle.io import DataLoader +from sklearn.metrics import classification_report +from sklearn.metrics import precision_recall_fscore_support +from yacs.config import CfgNode -from paddlespeech.s2t.utils.utility import print_arguments -from paddlespeech.text.training.trainer import Tester -from paddlespeech.text.utils.default_parser import default_argument_parser +from paddlespeech.text.models.ernie_linear import ErnieLinear +from paddlespeech.text.models.ernie_linear import PuncDataset +from paddlespeech.text.models.ernie_linear import PuncDatasetFromErnieTokenizer +DefinedClassifier = { + 'ErnieLinear': ErnieLinear, +} -def main_sp(config, args): - exp = Tester(config, args) - exp.setup() - exp.run_test() +DefinedLoss = { + "ce": nn.CrossEntropyLoss, +} +DefinedDataset = { + 'Punc': PuncDataset, + 'Ernie': PuncDatasetFromErnieTokenizer, +} -def main(config, args): - main_sp(config, args) +def evaluation(y_pred, y_test): + precision, recall, f1, _ = precision_recall_fscore_support( + y_test, y_pred, average=None, labels=[1, 2, 3]) + overall = precision_recall_fscore_support( + y_test, y_pred, average='macro', labels=[1, 2, 3]) + result = pd.DataFrame( + np.array([precision, recall, f1]), + columns=list(['O', 'COMMA', 'PERIOD', 'QUESTION'])[1:], + index=['Precision', 'Recall', 'F1']) + result['OVERALL'] = overall[:3] + return result + + +def test(args): + with open(args.config) as f: + config = CfgNode(yaml.safe_load(f)) + print("========Args========") + print(yaml.safe_dump(vars(args))) + print("========Config========") + print(config) + + test_dataset = DefinedDataset[config["dataset_type"]]( + train_path=config["test_path"], **config["data_params"]) + test_loader = DataLoader( + test_dataset, + batch_size=config.batch_size, + shuffle=False, + drop_last=False) + model = DefinedClassifier[config["model_type"]](**config["model"]) + state_dict = paddle.load(args.checkpoint) + model.set_state_dict(state_dict["main_params"]) + model.eval() + + punc_list = [] + for i in range(len(test_loader.dataset.id2punc)): + punc_list.append(test_loader.dataset.id2punc[i]) + + test_total_label = [] + test_total_predict = [] + + for i, batch in enumerate(test_loader): + input, label = batch + label = paddle.reshape(label, shape=[-1]) + y, logit = model(input) + pred = paddle.argmax(logit, axis=1) + test_total_label.extend(label.numpy().tolist()) + test_total_predict.extend(pred.numpy().tolist()) + t = classification_report( + test_total_label, test_total_predict, target_names=punc_list) + print(t) + t2 = evaluation(test_total_label, test_total_predict) + print('=========================================================') + print(t2) + + +def main(): + # parse args and config and redirect to train_sp + parser = argparse.ArgumentParser(description="Test a ErnieLinear model.") + parser.add_argument("--config", type=str, help="ErnieLinear config file.") + parser.add_argument("--checkpoint", type=str, help="snapshot to load.") + parser.add_argument( + "--ngpu", type=int, default=1, help="if ngpu=0, use cpu.") -if __name__ == "__main__": - parser = default_argument_parser() args = parser.parse_args() - print_arguments(args, globals()) - # https://yaml.org/type/float.html - with open(args.config, "r") as f: - config = yaml.load(f, Loader=yaml.FullLoader) + if args.ngpu == 0: + paddle.set_device("cpu") + elif args.ngpu > 0: + paddle.set_device("gpu") + else: + print("ngpu 
should >= 0 !") + + test(args) - print(config) - if args.dump_config: - with open(args.dump_config, 'w') as f: - print(config, file=f) - main(config, args) +if __name__ == "__main__": + main() diff --git a/paddlespeech/text/exps/ernie_linear/train.py b/paddlespeech/text/exps/ernie_linear/train.py index 09071438139bec882caaec6e212ceff57ba1095c..0d730d6666ae3a9ec8290e5c73ebcd45a0842302 100644 --- a/paddlespeech/text/exps/ernie_linear/train.py +++ b/paddlespeech/text/exps/ernie_linear/train.py @@ -11,40 +11,163 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Trainer for punctuation_restoration task.""" +import argparse +import logging +import os +import shutil +from pathlib import Path + +import paddle import yaml +from paddle import DataParallel from paddle import distributed as dist +from paddle import nn +from paddle.io import DataLoader +from paddle.optimizer import Adam +from paddle.optimizer.lr import ExponentialDecay +from yacs.config import CfgNode -from paddlespeech.s2t.utils.utility import print_arguments -from paddlespeech.text.training.trainer import Trainer -from paddlespeech.text.utils.default_parser import default_argument_parser +from paddlespeech.t2s.training.extensions.snapshot import Snapshot +from paddlespeech.t2s.training.extensions.visualizer import VisualDL +from paddlespeech.t2s.training.seeding import seed_everything +from paddlespeech.t2s.training.trainer import Trainer +from paddlespeech.text.models.ernie_linear import ErnieLinear +from paddlespeech.text.models.ernie_linear import ErnieLinearEvaluator +from paddlespeech.text.models.ernie_linear import ErnieLinearUpdater +from paddlespeech.text.models.ernie_linear import PuncDataset +from paddlespeech.text.models.ernie_linear import PuncDatasetFromErnieTokenizer +DefinedClassifier = { + 'ErnieLinear': ErnieLinear, +} -def main_sp(config, args): - exp = Trainer(config, args) - exp.setup() - exp.run() +DefinedLoss = { + "ce": nn.CrossEntropyLoss, +} +DefinedDataset = { + 'Punc': PuncDataset, + 'Ernie': PuncDatasetFromErnieTokenizer, +} -def main(config, args): - if args.ngpu > 1: - dist.spawn(main_sp, args=(config, args), nprocs=args.ngpu) + +def train_sp(args, config): + # decides device type and whether to run in parallel + # setup running environment correctly + if (not paddle.is_compiled_with_cuda()) or args.ngpu == 0: + paddle.set_device("cpu") else: - main_sp(config, args) + paddle.set_device("gpu") + world_size = paddle.distributed.get_world_size() + if world_size > 1: + paddle.distributed.init_parallel_env() + # set the random seed, it is a must for multiprocess training + seed_everything(config.seed) + + print( + f"rank: {dist.get_rank()}, pid: {os.getpid()}, parent_pid: {os.getppid()}", + ) + # dataloader has been too verbose + logging.getLogger("DataLoader").disabled = True + train_dataset = DefinedDataset[config["dataset_type"]]( + train_path=config["train_path"], **config["data_params"]) + dev_dataset = DefinedDataset[config["dataset_type"]]( + train_path=config["dev_path"], **config["data_params"]) + train_dataloader = DataLoader( + train_dataset, + shuffle=True, + num_workers=config.num_workers, + batch_size=config.batch_size) + + dev_dataloader = DataLoader( + dev_dataset, + batch_size=config.batch_size, + shuffle=False, + drop_last=False, + num_workers=config.num_workers) + + print("dataloaders done!") + + model = 
DefinedClassifier[config["model_type"]](**config["model"]) + + if world_size > 1: + model = DataParallel(model) + print("model done!") + + criterion = DefinedLoss[config["loss_type"]]( + **config["loss"]) if "loss_type" in config else DefinedLoss["ce"]() + + print("criterions done!") + + lr_schedule = ExponentialDecay(**config["scheduler_params"]) + optimizer = Adam( + learning_rate=lr_schedule, + parameters=model.parameters(), + weight_decay=paddle.regularizer.L2Decay( + config["optimizer_params"]["weight_decay"])) + + print("optimizer done!") + + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + if dist.get_rank() == 0: + config_name = args.config.split("/")[-1] + # copy conf to output_dir + shutil.copyfile(args.config, output_dir / config_name) + + updater = ErnieLinearUpdater( + model=model, + criterion=criterion, + scheduler=lr_schedule, + optimizer=optimizer, + dataloader=train_dataloader, + output_dir=output_dir) + + trainer = Trainer(updater, (config.max_epoch, 'epoch'), output_dir) + + evaluator = ErnieLinearEvaluator( + model=model, + criterion=criterion, + dataloader=dev_dataloader, + output_dir=output_dir) + + if dist.get_rank() == 0: + trainer.extend(evaluator, trigger=(1, "epoch")) + trainer.extend(VisualDL(output_dir), trigger=(1, "iteration")) + trainer.extend( + Snapshot(max_size=config.num_snapshots), trigger=(1, 'epoch')) + # print(trainer.extensions) + trainer.run() + + +def main(): + # parse args and config and redirect to train_sp + parser = argparse.ArgumentParser(description="Train a ErnieLinear model.") + parser.add_argument("--config", type=str, help="ErnieLinear config file.") + parser.add_argument("--output-dir", type=str, help="output dir.") + parser.add_argument( + "--ngpu", type=int, default=1, help="if ngpu=0, use cpu.") -if __name__ == "__main__": - parser = default_argument_parser() args = parser.parse_args() - print_arguments(args, globals()) - # https://yaml.org/type/float.html - with open(args.config, "r") as f: - config = yaml.load(f, Loader=yaml.FullLoader) + with open(args.config) as f: + config = CfgNode(yaml.safe_load(f)) + print("========Args========") + print(yaml.safe_dump(vars(args))) + print("========Config========") print(config) - if args.dump_config: - with open(args.dump_config, 'w') as f: - print(config, file=f) + print( + f"master see the word size: {dist.get_world_size()}, from pid: {os.getpid()}" + ) + + # dispatch + if args.ngpu > 1: + dist.spawn(train_sp, (args, config), nprocs=args.ngpu) + else: + train_sp(args, config) + - main(config, args) +if __name__ == "__main__": + main() diff --git a/paddlespeech/text/models/ernie_linear/__init__.py b/paddlespeech/text/models/ernie_linear/__init__.py index 93453ce7473645aba18e516300a15d93bdc614ba..0a10a6eb2d3119b89a7e88d98254571735b29cfa 100644 --- a/paddlespeech/text/models/ernie_linear/__init__.py +++ b/paddlespeech/text/models/ernie_linear/__init__.py @@ -11,4 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from .model import ErnieLinear +from .dataset import * +from .ernie_linear import * +from .ernie_linear_updater import * diff --git a/paddlespeech/text/models/ernie_linear/dataset.py b/paddlespeech/text/models/ernie_linear/dataset.py index 086e91bb890e82c49f9a00636c742bd44358203c..64c8d0bdfd34d32d016b539f147bdfa145b604f0 100644 --- a/paddlespeech/text/models/ernie_linear/dataset.py +++ b/paddlespeech/text/models/ernie_linear/dataset.py @@ -99,10 +99,8 @@ class PuncDatasetFromErnieTokenizer(Dataset): self.tokenizer = ErnieTokenizer.from_pretrained(pretrained_token) self.paddingID = self.tokenizer.pad_token_id self.seq_len = seq_len - self.punc2id = self.load_vocab(punc_path, extra_word_list=[" "]) self.id2punc = {k: v for (v, k) in self.punc2id.items()} - tmp_seqs = open(train_path, encoding='utf-8').readlines() self.txt_seqs = [i for seq in tmp_seqs for i in seq.split()] self.preprocess(self.txt_seqs) @@ -125,6 +123,7 @@ class PuncDatasetFromErnieTokenizer(Dataset): input_data = [] label = [] count = 0 + print("Preprocessing in PuncDatasetFromErnieTokenizer...") for i in range(len(txt_seqs) - 1): word = txt_seqs[i] punc = txt_seqs[i + 1] diff --git a/paddlespeech/text/models/ernie_linear/model.py b/paddlespeech/text/models/ernie_linear/ernie_linear.py similarity index 100% rename from paddlespeech/text/models/ernie_linear/model.py rename to paddlespeech/text/models/ernie_linear/ernie_linear.py diff --git a/paddlespeech/text/models/ernie_linear/ernie_linear_updater.py b/paddlespeech/text/models/ernie_linear/ernie_linear_updater.py new file mode 100644 index 0000000000000000000000000000000000000000..8b3d7410e04afff43ea16a821473fe05eb4f195d --- /dev/null +++ b/paddlespeech/text/models/ernie_linear/ernie_linear_updater.py @@ -0,0 +1,123 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
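+# Training/evaluation steps for ErnieLinear: each step flattens the per-token
+# punctuation labels, applies the configured criterion (cross-entropy by
+# default) to the model output, and reports both the loss and a macro-averaged
+# F1 score computed with sklearn.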
+import logging + +import paddle +from paddle import distributed as dist +from paddle.io import DataLoader +from paddle.nn import Layer +from paddle.optimizer import Optimizer +from paddle.optimizer.lr import LRScheduler +from sklearn.metrics import f1_score + +from paddlespeech.t2s.training.extensions.evaluator import StandardEvaluator +from paddlespeech.t2s.training.reporter import report +from paddlespeech.t2s.training.updaters.standard_updater import StandardUpdater +logging.basicConfig( + format='%(asctime)s [%(levelname)s] [%(filename)s:%(lineno)d] %(message)s', + datefmt='[%Y-%m-%d %H:%M:%S]') +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + + +class ErnieLinearUpdater(StandardUpdater): + def __init__(self, + model: Layer, + criterion: Layer, + scheduler: LRScheduler, + optimizer: Optimizer, + dataloader: DataLoader, + output_dir=None): + super().__init__(model, optimizer, dataloader, init_state=None) + self.model = model + self.dataloader = dataloader + + self.criterion = criterion + self.scheduler = scheduler + self.optimizer = optimizer + + log_file = output_dir / 'worker_{}.log'.format(dist.get_rank()) + self.filehandler = logging.FileHandler(str(log_file)) + logger.addHandler(self.filehandler) + self.logger = logger + self.msg = "" + + def update_core(self, batch): + self.msg = "Rank: {}, ".format(dist.get_rank()) + losses_dict = {} + + input, label = batch + label = paddle.reshape(label, shape=[-1]) + y, logit = self.model(input) + pred = paddle.argmax(logit, axis=1) + + loss = self.criterion(y, label) + + self.optimizer.clear_grad() + loss.backward() + + self.optimizer.step() + self.scheduler.step() + + F1_score = f1_score( + label.numpy().tolist(), pred.numpy().tolist(), average="macro") + + report("train/loss", float(loss)) + losses_dict["loss"] = float(loss) + report("train/F1_score", float(F1_score)) + losses_dict["F1_score"] = float(F1_score) + + self.msg += ', '.join('{}: {:>.6f}'.format(k, v) + for k, v in losses_dict.items()) + + +class ErnieLinearEvaluator(StandardEvaluator): + def __init__(self, + model: Layer, + criterion: Layer, + dataloader: DataLoader, + output_dir=None): + super().__init__(model, dataloader) + self.model = model + self.criterion = criterion + self.dataloader = dataloader + + log_file = output_dir / 'worker_{}.log'.format(dist.get_rank()) + self.filehandler = logging.FileHandler(str(log_file)) + logger.addHandler(self.filehandler) + self.logger = logger + self.msg = "" + + def evaluate_core(self, batch): + self.msg = "Evaluate: " + losses_dict = {} + + input, label = batch + label = paddle.reshape(label, shape=[-1]) + y, logit = self.model(input) + pred = paddle.argmax(logit, axis=1) + + loss = self.criterion(y, label) + + F1_score = f1_score( + label.numpy().tolist(), pred.numpy().tolist(), average="macro") + + report("eval/loss", float(loss)) + losses_dict["loss"] = float(loss) + report("eval/F1_score", float(F1_score)) + losses_dict["F1_score"] = float(F1_score) + + self.msg += ', '.join('{}: {:>.6f}'.format(k, v) + for k, v in losses_dict.items()) + self.logger.info(self.msg) diff --git a/paddlespeech/text/training/trainer.py b/paddlespeech/text/training/trainer.py deleted file mode 100644 index b5e6a563cc3d8f15ffad92f6fa3ff961104c4d54..0000000000000000000000000000000000000000 --- a/paddlespeech/text/training/trainer.py +++ /dev/null @@ -1,524 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import logging -import time -from collections import defaultdict -from pathlib import Path - -import numpy as np -import paddle -import paddle.nn as nn -import pandas as pd -from paddle import distributed as dist -from paddle.io import DataLoader -from sklearn.metrics import classification_report -from sklearn.metrics import f1_score -from sklearn.metrics import precision_recall_fscore_support - -from ...s2t.utils import layer_tools -from ...s2t.utils import mp_tools -from ...s2t.utils.checkpoint import Checkpoint -from ...text.models import ErnieLinear -from ...text.models.ernie_linear.dataset import PuncDataset -from ...text.models.ernie_linear.dataset import PuncDatasetFromErnieTokenizer - -__all__ = ["Trainer", "Tester"] - -DefinedClassifier = { - 'ErnieLinear': ErnieLinear, -} - -DefinedLoss = { - "ce": nn.CrossEntropyLoss, -} - -DefinedDataset = { - 'Punc': PuncDataset, - 'Ernie': PuncDatasetFromErnieTokenizer, -} - - -class Trainer(): - def __init__(self, config, args): - self.config = config - self.args = args - self.optimizer = None - self.output_dir = None - self.log_dir = None - self.checkpoint_dir = None - self.iteration = 0 - self.epoch = 0 - - def setup(self): - """Setup the experiment. - """ - self.setup_log_dir() - self.setup_logger() - if self.args.ngpu > 0: - paddle.set_device('gpu') - else: - paddle.set_device('cpu') - if self.parallel: - self.init_parallel() - - self.setup_output_dir() - self.dump_config() - self.setup_checkpointer() - - self.setup_model() - - self.setup_dataloader() - - self.iteration = 0 - self.epoch = 1 - - @property - def parallel(self): - """A flag indicating whether the experiment should run with - multiprocessing. - """ - return self.args.ngpu > 1 - - def init_parallel(self): - """Init environment for multiprocess training. - """ - dist.init_parallel_env() - - @mp_tools.rank_zero_only - def save(self, tag=None, infos: dict=None): - """Save checkpoint (model parameters and optimizer states). - - Args: - tag (int or str, optional): None for step, else using tag, e.g epoch. Defaults to None. - infos (dict, optional): meta data to save. Defaults to None. - """ - - infos = infos if infos else dict() - infos.update({ - "step": self.iteration, - "epoch": self.epoch, - "lr": self.optimizer.get_lr() - }) - self.checkpointer.save_parameters(self.checkpoint_dir, self.iteration - if tag is None else tag, self.model, - self.optimizer, infos) - - def resume_or_scratch(self): - """Resume from latest checkpoint at checkpoints in the output - directory or load a specified checkpoint. - - If ``args.checkpoint_path`` is not None, load the checkpoint, else - resume training. 
- """ - scratch = None - infos = self.checkpointer.load_parameters( - self.model, - self.optimizer, - checkpoint_dir=self.checkpoint_dir, - checkpoint_path=self.args.checkpoint_path) - if infos: - # restore from ckpt - self.iteration = infos["step"] - self.epoch = infos["epoch"] - scratch = False - else: - self.iteration = 0 - self.epoch = 0 - scratch = True - - return scratch - - def new_epoch(self): - """Reset the train loader seed and increment `epoch`. - """ - self.epoch += 1 - if self.parallel: - self.train_loader.batch_sampler.set_epoch(self.epoch) - - def train(self): - """The training process control by epoch.""" - from_scratch = self.resume_or_scratch() - - if from_scratch: - # save init model, i.e. 0 epoch - self.save(tag="init") - - self.lr_scheduler.step(self.iteration) - if self.parallel: - self.train_loader.batch_sampler.set_epoch(self.epoch) - - self.logger.info( - f"Train Total Examples: {len(self.train_loader.dataset)}") - self.punc_list = [] - for i in range(len(self.train_loader.dataset.id2punc)): - self.punc_list.append(self.train_loader.dataset.id2punc[i]) - while self.epoch < self.config["training"]["n_epoch"]: - self.model.train() - self.total_label_train = [] - self.total_predict_train = [] - try: - data_start_time = time.time() - for batch_index, batch in enumerate(self.train_loader): - dataload_time = time.time() - data_start_time - msg = "Train: Rank: {}, ".format(dist.get_rank()) - msg += "epoch: {}, ".format(self.epoch) - msg += "step: {}, ".format(self.iteration) - msg += "batch : {}/{}, ".format(batch_index + 1, - len(self.train_loader)) - msg += "lr: {:>.8f}, ".format(self.lr_scheduler()) - msg += "data time: {:>.3f}s, ".format(dataload_time) - self.train_batch(batch_index, batch, msg) - data_start_time = time.time() - # t = classification_report( - # self.total_label_train, - # self.total_predict_train, - # target_names=self.punc_list) - # self.logger.info(t) - except Exception as e: - self.logger.error(e) - raise e - - total_loss, F1_score = self.valid() - self.logger.info("Epoch {} Val info val_loss {}, F1_score {}". - format(self.epoch, total_loss, F1_score)) - - self.save( - tag=self.epoch, infos={"val_loss": total_loss, - "F1": F1_score}) - # step lr every epoch - self.lr_scheduler.step() - self.new_epoch() - - def run(self): - """The routine of the experiment after setup. This method is intended - to be used by the user. - """ - try: - self.train() - except KeyboardInterrupt: - self.logger.info("Training was aborted by keybord interrupt.") - self.save() - exit(-1) - finally: - self.destory() - self.logger.info("Training Done.") - - def setup_output_dir(self): - """Create a directory used for output. - """ - # output dir - output_dir = Path(self.args.output_dir).expanduser() - output_dir.mkdir(parents=True, exist_ok=True) - - self.output_dir = output_dir - - def setup_log_dir(self): - """Create a directory used for logging. - """ - # log dir - log_dir = Path(self.args.log_dir).expanduser() - log_dir.mkdir(parents=True, exist_ok=True) - - self.log_dir = log_dir - - def setup_checkpointer(self): - """Create a directory used to save checkpoints into. - - It is "checkpoints" inside the output directory. 
- """ - # checkpoint dir - self.checkpointer = Checkpoint(self.config["checkpoint"]["kbest_n"], - self.config["checkpoint"]["latest_n"]) - - checkpoint_dir = self.output_dir / "checkpoints" - checkpoint_dir.mkdir(exist_ok=True) - - self.checkpoint_dir = checkpoint_dir - - def setup_logger(self): - LOG_FORMAT = "%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s" - format_str = logging.Formatter( - '%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s' - ) - logging.basicConfig( - filename=self.config["training"]["log_path"], - level=logging.INFO, - format=LOG_FORMAT) - self.logger = logging.getLogger(__name__) - - self.logger.setLevel(logging.INFO) - sh = logging.StreamHandler() - sh.setFormatter(format_str) - self.logger.addHandler(sh) - - self.logger.info('info') - - @mp_tools.rank_zero_only - def destory(self): - pass - - @mp_tools.rank_zero_only - def dump_config(self): - """Save the configuration used for this experiment. - - It is saved in to ``config.yaml`` in the output directory at the - beginning of the experiment. - """ - with open(self.output_dir / "config.yaml", "wt") as f: - print(self.config, file=f) - - def train_batch(self, batch_index, batch_data, msg): - start = time.time() - - input, label = batch_data - label = paddle.reshape(label, shape=[-1]) - y, logit = self.model(input) - pred = paddle.argmax(logit, axis=1) - self.total_label_train.extend(label.numpy().tolist()) - self.total_predict_train.extend(pred.numpy().tolist()) - loss = self.crit(y, label) - - loss.backward() - layer_tools.print_grads(self.model, print_func=None) - self.optimizer.step() - self.optimizer.clear_grad() - iteration_time = time.time() - start - - losses_np = { - "train_loss": float(loss), - } - msg += "train time: {:>.3f}s, ".format(iteration_time) - msg += "batch size: {}, ".format(self.config["data"]["batch_size"]) - msg += ", ".join("{}: {:>.6f}".format(k, v) - for k, v in losses_np.items()) - self.logger.info(msg) - self.iteration += 1 - - @paddle.no_grad() - def valid(self): - self.logger.info( - f"Valid Total Examples: {len(self.valid_loader.dataset)}") - self.model.eval() - valid_losses = defaultdict(list) - num_seen_utts = 1 - total_loss = 0.0 - valid_total_label = [] - valid_total_predict = [] - for i, batch in enumerate(self.valid_loader): - input, label = batch - label = paddle.reshape(label, shape=[-1]) - y, logit = self.model(input) - pred = paddle.argmax(logit, axis=1) - valid_total_label.extend(label.numpy().tolist()) - valid_total_predict.extend(pred.numpy().tolist()) - loss = self.crit(y, label) - - if paddle.isfinite(loss): - num_utts = batch[1].shape[0] - num_seen_utts += num_utts - total_loss += float(loss) * num_utts - valid_losses["val_loss"].append(float(loss)) - - if (i + 1) % self.config["training"]["log_interval"] == 0: - valid_dump = {k: np.mean(v) for k, v in valid_losses.items()} - valid_dump["val_history_loss"] = total_loss / num_seen_utts - - # logging - msg = f"Valid: Rank: {dist.get_rank()}, " - msg += "epoch: {}, ".format(self.epoch) - msg += "step: {}, ".format(self.iteration) - msg += "batch : {}/{}, ".format(i + 1, len(self.valid_loader)) - msg += ", ".join("{}: {:>.6f}".format(k, v) - for k, v in valid_dump.items()) - self.logger.info(msg) - - self.logger.info("Rank {} Val info val_loss {}".format( - dist.get_rank(), total_loss / num_seen_utts)) - F1_score = f1_score( - valid_total_label, valid_total_predict, average="macro") - return total_loss / num_seen_utts, F1_score - - def setup_model(self): - config = 
self.config - - model = DefinedClassifier[self.config["model_type"]]( - **self.config["model_params"]) - self.crit = DefinedLoss[self.config["loss_type"]](**self.config[ - "loss"]) if "loss_type" in self.config else DefinedLoss["ce"]() - - if self.parallel: - model = paddle.DataParallel(model) - - # self.logger.info(f"{model}") - # layer_tools.print_params(model, self.logger.info) - - lr_scheduler = paddle.optimizer.lr.ExponentialDecay( - learning_rate=config["training"]["lr"], - gamma=config["training"]["lr_decay"], - verbose=True) - optimizer = paddle.optimizer.Adam( - learning_rate=lr_scheduler, - parameters=model.parameters(), - weight_decay=paddle.regularizer.L2Decay( - config["training"]["weight_decay"])) - - self.model = model - self.optimizer = optimizer - self.lr_scheduler = lr_scheduler - self.logger.info("Setup model/criterion/optimizer/lr_scheduler!") - - def setup_dataloader(self): - config = self.config["data"].copy() - train_dataset = DefinedDataset[config["dataset_type"]]( - train_path=config["train_path"], **config["data_params"]) - dev_dataset = DefinedDataset[config["dataset_type"]]( - train_path=config["dev_path"], **config["data_params"]) - - self.train_loader = DataLoader( - train_dataset, - num_workers=config["num_workers"], - batch_size=config["batch_size"]) - self.valid_loader = DataLoader( - dev_dataset, - batch_size=config["batch_size"], - shuffle=False, - drop_last=False, - num_workers=config["num_workers"]) - self.logger.info("Setup train/valid Dataloader!") - - -class Tester(Trainer): - def __init__(self, config, args): - super().__init__(config, args) - - @mp_tools.rank_zero_only - @paddle.no_grad() - def test(self): - self.logger.info( - f"Test Total Examples: {len(self.test_loader.dataset)}") - self.punc_list = [] - for i in range(len(self.test_loader.dataset.id2punc)): - self.punc_list.append(self.test_loader.dataset.id2punc[i]) - self.model.eval() - test_total_label = [] - test_total_predict = [] - with open(self.args.result_file, 'w') as fout: - for i, batch in enumerate(self.test_loader): - input, label = batch - label = paddle.reshape(label, shape=[-1]) - y, logit = self.model(input) - pred = paddle.argmax(logit, axis=1) - test_total_label.extend(label.numpy().tolist()) - test_total_predict.extend(pred.numpy().tolist()) - - # logging - msg = "Test: " - msg += "epoch: {}, ".format(self.epoch) - msg += "step: {}, ".format(self.iteration) - self.logger.info(msg) - t = classification_report( - test_total_label, test_total_predict, target_names=self.punc_list) - print(t) - t2 = self.evaluation(test_total_label, test_total_predict) - print(t2) - - def evaluation(self, y_pred, y_test): - precision, recall, f1, _ = precision_recall_fscore_support( - y_test, y_pred, average=None, labels=[1, 2, 3]) - overall = precision_recall_fscore_support( - y_test, y_pred, average='macro', labels=[1, 2, 3]) - result = pd.DataFrame( - np.array([precision, recall, f1]), - columns=list(['O', 'COMMA', 'PERIOD', 'QUESTION'])[1:], - index=['Precision', 'Recall', 'F1']) - result['OVERALL'] = overall[:3] - return result - - def run_test(self): - self.resume_or_scratch() - try: - self.test() - except KeyboardInterrupt: - self.logger.info("Testing was aborted by keybord interrupt.") - exit(-1) - - def setup(self): - """Setup the experiment. 
- """ - if self.args.ngpu > 0: - paddle.set_device('gpu') - else: - paddle.set_device('cpu') - self.setup_logger() - self.setup_output_dir() - self.setup_checkpointer() - - self.setup_dataloader() - self.setup_model() - - self.iteration = 0 - self.epoch = 0 - - def setup_model(self): - config = self.config - model = DefinedClassifier[self.config["model_type"]]( - **self.config["model_params"]) - - self.model = model - self.logger.info("Setup model!") - - def setup_dataloader(self): - config = self.config["data"].copy() - - test_dataset = DefinedDataset[config["dataset_type"]]( - train_path=config["test_path"], **config["data_params"]) - - self.test_loader = DataLoader( - test_dataset, - batch_size=config["batch_size"], - shuffle=False, - drop_last=False) - self.logger.info("Setup test Dataloader!") - - def setup_output_dir(self): - """Create a directory used for output. - """ - # output dir - if self.args.output_dir: - output_dir = Path(self.args.output_dir).expanduser() - output_dir.mkdir(parents=True, exist_ok=True) - else: - output_dir = Path( - self.args.checkpoint_path).expanduser().parent.parent - output_dir.mkdir(parents=True, exist_ok=True) - - self.output_dir = output_dir - - def setup_logger(self): - LOG_FORMAT = "%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s" - format_str = logging.Formatter( - '%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s' - ) - logging.basicConfig( - filename=self.config["testing"]["log_path"], - level=logging.INFO, - format=LOG_FORMAT) - self.logger = logging.getLogger(__name__) - - self.logger.setLevel(logging.INFO) - sh = logging.StreamHandler() - sh.setFormatter(format_str) - self.logger.addHandler(sh) - - self.logger.info('info') diff --git a/paddlespeech/text/utils/default_parser.py b/paddlespeech/text/utils/default_parser.py deleted file mode 100644 index 469157a69da6198932173dcbcbc38de845c9e97a..0000000000000000000000000000000000000000 --- a/paddlespeech/text/utils/default_parser.py +++ /dev/null @@ -1,73 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import argparse - - -def default_argument_parser(): - r"""A simple yet genral argument parser for experiments with t2s. - - This is used in examples with t2s. And it is intended to be used by - other experiments with t2s. It requires a minimal set of command line - arguments to start a training script. - - The ``--config`` and ``--opts`` are used for overwrite the deault - configuration. - - The ``--data`` and ``--output`` specifies the data path and output path. - Resuming training from existing progress at the output directory is the - intended default behavior. - - The ``--checkpoint_path`` specifies the checkpoint to load from. - - The ``--ngpu`` specifies how to run the training. 
- - - See Also - -------- - paddlespeech.t2s.training.experiment - Returns - ------- - argparse.ArgumentParser - the parser - """ - parser = argparse.ArgumentParser() - - # yapf: disable - # data and output - parser.add_argument("--config", metavar="FILE", help="path of the config file to overwrite to default config with.") - parser.add_argument("--dump-config", metavar="FILE", help="dump config to yaml file.") - # parser.add_argument("--data", metavar="DATA_DIR", help="path to the datatset.") - parser.add_argument("--output_dir", metavar="OUTPUT_DIR", help="path to save checkpoint.") - parser.add_argument("--log_dir", metavar="LOG_DIR", help="path to save logs.") - - # load from saved checkpoint - parser.add_argument("--checkpoint_path", type=str, help="path of the checkpoint to load") - - # save jit model to - parser.add_argument("--export_path", type=str, help="path of the jit model to save") - - # save asr result to - parser.add_argument("--result_file", type=str, help="path of save the asr result") - - # running - parser.add_argument("--ngpu", type=int, default=1, help="number of parallel processes to use. if ngpu=0, using cpu.") - - # overwrite extra config and default config - # parser.add_argument("--opts", nargs=argparse.REMAINDER, - # help="options to overwrite --config file and the default config, passing in KEY VALUE pairs") - parser.add_argument("--opts", type=str, default=[], nargs='+', - help="options to overwrite --config file and the default config, passing in KEY VALUE pairs") - # yapd: enable - - return parser diff --git a/paddlespeech/vector/models/ecapa_tdnn.py b/paddlespeech/vector/models/ecapa_tdnn.py new file mode 100644 index 0000000000000000000000000000000000000000..e493b8004e2255135a070483f01ca709698fc81c --- /dev/null +++ b/paddlespeech/vector/models/ecapa_tdnn.py @@ -0,0 +1,409 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
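+# ECAPA-TDNN speaker-embedding backbone: an initial TDNN block, a stack of
+# SE-Res2Net blocks with residual connections, multi-layer feature aggregation,
+# attentive statistics pooling (attention-weighted mean and std over time), and
+# a final 1x1 convolution that projects to a fixed-size embedding (lin_neurons).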
+import math + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + + +def length_to_mask(length, max_len=None, dtype=None): +    assert len(length.shape) == 1 + +    if max_len is None: +        max_len = length.max().astype( +            'int').item()  # using arange to generate mask +    mask = paddle.arange( +        max_len, dtype=length.dtype).expand( +            (len(length), max_len)) < length.unsqueeze(1) + +    if dtype is None: +        dtype = length.dtype + +    mask = paddle.to_tensor(mask, dtype=dtype) +    return mask + + +class Conv1d(nn.Layer): +    def __init__( +            self, +            in_channels, +            out_channels, +            kernel_size, +            stride=1, +            padding="same", +            dilation=1, +            groups=1, +            bias=True, +            padding_mode="reflect", ): +        super().__init__() + +        self.kernel_size = kernel_size +        self.stride = stride +        self.dilation = dilation +        self.padding = padding +        self.padding_mode = padding_mode + +        self.conv = nn.Conv1D( +            in_channels, +            out_channels, +            self.kernel_size, +            stride=self.stride, +            padding=0, +            dilation=self.dilation, +            groups=groups, +            bias_attr=bias, ) + +    def forward(self, x): +        if self.padding == "same": +            x = self._manage_padding(x, self.kernel_size, self.dilation, +                                     self.stride) +        else: +            raise ValueError(f"Padding must be 'same'. Got {self.padding}") + +        return self.conv(x) + +    def _manage_padding(self, x, kernel_size: int, dilation: int, stride: int): +        L_in = x.shape[-1]  # Detecting input shape +        padding = self._get_padding_elem(L_in, stride, kernel_size, +                                         dilation)  # Time padding +        x = F.pad( +            x, padding, mode=self.padding_mode, +            data_format="NCL")  # Applying padding +        return x + +    def _get_padding_elem(self, +                          L_in: int, +                          stride: int, +                          kernel_size: int, +                          dilation: int): +        if stride > 1: +            n_steps = math.ceil(((L_in - kernel_size * dilation) / stride) + 1) +            L_out = stride * (n_steps - 1) + kernel_size * dilation +            padding = [kernel_size // 2, kernel_size // 2] +        else: +            L_out = (L_in - dilation * (kernel_size - 1) - 1) // stride + 1 + +            padding = [(L_in - L_out) // 2, (L_in - L_out) // 2] + +        return padding + + +class BatchNorm1d(nn.Layer): +    def __init__( +            self, +            input_size, +            eps=1e-05, +            momentum=0.9, +            weight_attr=None, +            bias_attr=None, +            data_format='NCL', +            use_global_stats=None, ): +        super().__init__() + +        self.norm = nn.BatchNorm1D( +            input_size, +            epsilon=eps, +            momentum=momentum, +            weight_attr=weight_attr, +            bias_attr=bias_attr, +            data_format=data_format, +            use_global_stats=use_global_stats, ) + +    def forward(self, x): +        x_n = self.norm(x) +        return x_n + + +class TDNNBlock(nn.Layer): +    def __init__( +            self, +            in_channels, +            out_channels, +            kernel_size, +            dilation, +            activation=nn.ReLU, ): +        super().__init__() +        self.conv = Conv1d( +            in_channels=in_channels, +            out_channels=out_channels, +            kernel_size=kernel_size, +            dilation=dilation, ) +        self.activation = activation() +        self.norm = BatchNorm1d(input_size=out_channels) + +    def forward(self, x): +        return self.norm(self.activation(self.conv(x))) + + +class Res2NetBlock(nn.Layer): +    def __init__(self, in_channels, out_channels, scale=8, dilation=1): +        super().__init__() +        assert in_channels % scale == 0 +        assert out_channels % scale == 0 + +        in_channel = in_channels // scale +        hidden_channel = out_channels // scale + +        self.blocks = nn.LayerList([ +            TDNNBlock( +                in_channel, hidden_channel, kernel_size=3, dilation=dilation) +            for i in range(scale - 1) +        ]) +        self.scale = scale + +    def forward(self, x): +        y = [] +        for i, x_i in enumerate(paddle.chunk(x, self.scale, axis=1)): +            if i == 0: +                y_i = x_i +            elif i == 1: +                y_i = self.blocks[i - 1](x_i) +
+
+
+class SEBlock(nn.Layer):
+    def __init__(self, in_channels, se_channels, out_channels):
+        super().__init__()
+
+        self.conv1 = Conv1d(
+            in_channels=in_channels, out_channels=se_channels, kernel_size=1)
+        self.relu = paddle.nn.ReLU()
+        self.conv2 = Conv1d(
+            in_channels=se_channels, out_channels=out_channels, kernel_size=1)
+        self.sigmoid = paddle.nn.Sigmoid()
+
+    def forward(self, x, lengths=None):
+        L = x.shape[-1]
+        if lengths is not None:
+            mask = length_to_mask(lengths * L, max_len=L)
+            mask = mask.unsqueeze(1)
+            total = mask.sum(axis=2, keepdim=True)
+            s = (x * mask).sum(axis=2, keepdim=True) / total
+        else:
+            s = x.mean(axis=2, keepdim=True)
+
+        s = self.relu(self.conv1(s))
+        s = self.sigmoid(self.conv2(s))
+
+        return s * x
+
+
+class AttentiveStatisticsPooling(nn.Layer):
+    def __init__(self, channels, attention_channels=128, global_context=True):
+        super().__init__()
+
+        self.eps = 1e-12
+        self.global_context = global_context
+        if global_context:
+            self.tdnn = TDNNBlock(channels * 3, attention_channels, 1, 1)
+        else:
+            self.tdnn = TDNNBlock(channels, attention_channels, 1, 1)
+        self.tanh = nn.Tanh()
+        self.conv = Conv1d(
+            in_channels=attention_channels,
+            out_channels=channels,
+            kernel_size=1)
+
+    def forward(self, x, lengths=None):
+        C, L = x.shape[1], x.shape[2]  # x has shape (N, C, L)
+
+        def _compute_statistics(x, m, axis=2, eps=self.eps):
+            mean = (m * x).sum(axis)
+            std = paddle.sqrt(
+                (m * (x - mean.unsqueeze(axis)).pow(2)).sum(axis).clip(eps))
+            return mean, std
+
+        if lengths is None:
+            lengths = paddle.ones([x.shape[0]])
+
+        # Make binary mask of shape [N, 1, L]
+        mask = length_to_mask(lengths * L, max_len=L)
+        mask = mask.unsqueeze(1)
+
+        # Expand the temporal context of the pooling layer by allowing the
+        # self-attention to look at global properties of the utterance.
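+        # With global_context=True, the masked utterance-level mean and std
+        # are tiled along the time axis and concatenated with x, which is why
+        # __init__ builds the attention TDNNBlock with `channels * 3` inputs.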
+        if self.global_context:
+            total = mask.sum(axis=2, keepdim=True).astype('float32')
+            mean, std = _compute_statistics(x, mask / total)
+            mean = mean.unsqueeze(2).tile((1, 1, L))
+            std = std.unsqueeze(2).tile((1, 1, L))
+            attn = paddle.concat([x, mean, std], axis=1)
+        else:
+            attn = x
+
+        # Apply layers
+        attn = self.conv(self.tanh(self.tdnn(attn)))
+
+        # Filter out zero-paddings
+        attn = paddle.where(
+            mask.tile((1, C, 1)) == 0,
+            paddle.ones_like(attn) * float("-inf"), attn)
+
+        attn = F.softmax(attn, axis=2)
+        mean, std = _compute_statistics(x, attn)
+
+        # Concatenate the weighted mean and std along the channel axis
+        pooled_stats = paddle.concat((mean, std), axis=1)
+        pooled_stats = pooled_stats.unsqueeze(2)
+
+        return pooled_stats
+
+
+class SERes2NetBlock(nn.Layer):
+    def __init__(
+            self,
+            in_channels,
+            out_channels,
+            res2net_scale=8,
+            se_channels=128,
+            kernel_size=1,
+            dilation=1,
+            activation=nn.ReLU, ):
+        super().__init__()
+        self.out_channels = out_channels
+        self.tdnn1 = TDNNBlock(
+            in_channels,
+            out_channels,
+            kernel_size=1,
+            dilation=1,
+            activation=activation, )
+        self.res2net_block = Res2NetBlock(out_channels, out_channels,
+                                          res2net_scale, dilation)
+        self.tdnn2 = TDNNBlock(
+            out_channels,
+            out_channels,
+            kernel_size=1,
+            dilation=1,
+            activation=activation, )
+        self.se_block = SEBlock(out_channels, se_channels, out_channels)
+
+        self.shortcut = None
+        if in_channels != out_channels:
+            self.shortcut = Conv1d(
+                in_channels=in_channels,
+                out_channels=out_channels,
+                kernel_size=1, )
+
+    def forward(self, x, lengths=None):
+        residual = x
+        if self.shortcut:
+            residual = self.shortcut(x)
+
+        x = self.tdnn1(x)
+        x = self.res2net_block(x)
+        x = self.tdnn2(x)
+        x = self.se_block(x, lengths)
+
+        return x + residual
+
+
+class EcapaTdnn(nn.Layer):
+    def __init__(
+            self,
+            input_size,
+            lin_neurons=192,
+            activation=nn.ReLU,
+            channels=[512, 512, 512, 512, 1536],
+            kernel_sizes=[5, 3, 3, 3, 1],
+            dilations=[1, 2, 3, 4, 1],
+            attention_channels=128,
+            res2net_scale=8,
+            se_channels=128,
+            global_context=True, ):
+
+        super().__init__()
+        assert len(channels) == len(kernel_sizes)
+        assert len(channels) == len(dilations)
+        self.channels = channels
+        self.blocks = nn.LayerList()
+        self.emb_size = lin_neurons
+
+        # The initial TDNN layer
+        self.blocks.append(
+            TDNNBlock(
+                input_size,
+                channels[0],
+                kernel_sizes[0],
+                dilations[0],
+                activation, ))
+
+        # SE-Res2Net layers
+        for i in range(1, len(channels) - 1):
+            self.blocks.append(
+                SERes2NetBlock(
+                    channels[i - 1],
+                    channels[i],
+                    res2net_scale=res2net_scale,
+                    se_channels=se_channels,
+                    kernel_size=kernel_sizes[i],
+                    dilation=dilations[i],
+                    activation=activation, ))
+
+        # Multi-layer feature aggregation
+        self.mfa = TDNNBlock(
+            channels[-1],
+            channels[-1],
+            kernel_sizes[-1],
+            dilations[-1],
+            activation, )
+
+        # Attentive Statistical Pooling
+        self.asp = AttentiveStatisticsPooling(
+            channels[-1],
+            attention_channels=attention_channels,
+            global_context=global_context, )
+        self.asp_bn = BatchNorm1d(input_size=channels[-1] * 2)
+
+        # Final linear transformation
+        self.fc = Conv1d(
+            in_channels=channels[-1] * 2,
+            out_channels=self.emb_size,
+            kernel_size=1, )
+
+    def forward(self, x, lengths=None):
+        """
+        Compute embeddings.
+
+        Args:
+            x (paddle.Tensor): Input log-fbanks with shape (N, n_mels, T).
+            lengths (paddle.Tensor, optional): Relative lengths of the utterances
+                in the batch, i.e. the valid fraction of each padded sequence,
+                with shape (N). Defaults to None.
+
+        Returns:
+            paddle.Tensor: Output embeddings with shape (N, self.emb_size, 1)
+        """
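+        # Shape sketch with the default hyper-parameters, assuming an 80-dim
+        # log-fbank input: x (N, 80, T) -> initial TDNN (N, 512, T) -> three
+        # SE-Res2Net blocks, each (N, 512, T) -> concat of the SE-Res2Net
+        # outputs (N, 1536, T) -> mfa (N, 1536, T) -> asp + asp_bn (N, 3072, 1)
+        # -> fc (N, 192, 1).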
+        xl = []
+        for layer in self.blocks:
+            # Not every block accepts `lengths` (e.g. the initial TDNNBlock
+            # does not), so fall back to calling the layer without it.
+            try:
+                x = layer(x, lengths=lengths)
+            except TypeError:
+                x = layer(x)
+            xl.append(x)
+
+        # Multi-layer feature aggregation
+        x = paddle.concat(xl[1:], axis=1)
+        x = self.mfa(x)
+
+        # Attentive Statistical Pooling
+        x = self.asp(x, lengths=lengths)
+        x = self.asp_bn(x)
+
+        # Final linear transformation
+        x = self.fc(x)
+
+        return x
diff --git a/requirements.txt b/requirements.txt
index 42cb33f67b598fef6ab5fdc8ecbb0deec322ceeb..760821662d0251a3ba8c8f0c8e50bf1d4795a774 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -15,6 +15,7 @@ matplotlib
 nara_wpe
 nltk
 paddleaudio
+paddlenlp
 paddlespeech_ctcdecoders
 paddlespeech_feat
 pandas
@@ -42,5 +43,6 @@ typeguard
 unidecode
 visualdl
 webrtcvad
-yacs
+yacs~=0.1.8
 yq
+zhon
diff --git a/setup.py b/setup.py
index 73f392c6cbd412ec171115c15c7613fb5540e135..a6b18f9793163f65f8531be330586f9e26c778e1 100644
--- a/setup.py
+++ b/setup.py
@@ -41,9 +41,9 @@ requirements = {
         "loguru",
         "matplotlib",
         "nara_wpe",
-        "nltk",
         "pandas",
         "paddleaudio",
+        "paddlenlp",
         "paddlespeech_feat",
         "praatio==5.0.0",
         "pypinyin",
@@ -60,7 +60,7 @@ requirements = {
         "typeguard",
         "visualdl",
         "webrtcvad",
-        "yacs",
+        "yacs~=0.1.8",
     ],
     "develop": [
         "ConfigArgParse",
@@ -77,6 +77,7 @@ requirements = {
         "unidecode",
         "yq",
         "pre-commit",
+        "zhon",
     ]
 }
@@ -126,7 +127,7 @@ def _post_install(install_lib_dir):
     print("tools install.")
 
     # ctcdecoder
-    ctcdecoder_dir = HERE / 'paddlespeech/s2t/decoders/ctcdecoder/swig'
+    ctcdecoder_dir = HERE / 'third_party/ctc_decoders'
     with pushd(ctcdecoder_dir):
         check_call("bash -e setup.sh")
         print("ctcdecoder install.")
@@ -171,7 +172,7 @@ class UploadCommand(Command):
 setup_info = dict(
     # Metadata
     name='paddlespeech',
-    version='0.1.0',
+    version='0.1.1',
     author='PaddlePaddle Speech and Language Team',
     author_email='paddlesl@baidu.com',
     url='https://github.com/PaddlePaddle/PaddleSpeech',
diff --git a/setup_audio.py b/setup_audio.py
index 24c9bb9b98691bd7c9d8f0efe7e2cabdd4664399..5f01406561f1f9611d7913561c107faae2610082 100644
--- a/setup_audio.py
+++ b/setup_audio.py
@@ -13,8 +13,10 @@
 # limitations under the License.
 import setuptools
 
+import paddleaudio
+
 # set the version here
-version = '0.1.0a'
+version = paddleaudio.__version__
 
 setuptools.setup(
     name="paddleaudio",
diff --git a/tests/benchmark/conformer/run.sh b/tests/benchmark/conformer/run.sh
index fcd0c2359690373e73acf077e3b3edeeeb6e470f..c9d640ed2580e6b86a60ad99c9f55dad329e2a12 100644
--- a/tests/benchmark/conformer/run.sh
+++ b/tests/benchmark/conformer/run.sh
@@ -22,6 +22,7 @@ sed -i "s/ accum_grad: 2/ accum_grad: 1/g" conf/benchmark/conformer.yaml
 fp_item_list=(fp32)
 bs_item=(16)
 config_path=conf/benchmark/conformer.yaml
+decode_config_path=conf/tuning/decode.yaml
 seed=0
 output=exp/conformer
 profiler_options=None
@@ -34,13 +35,13 @@ for fp_item in ${fp_item_list[@]}; do
           echo "index is speed, 8gpus, run_mode is multi_process, begin, conformer"
           run_mode=mp
           ngpu=8
-          CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash ${CUR_DIR}/run_benchmark.sh ${run_mode} ${config_path} ${output} ${seed} ${ngpu} ${profiler_options} ${bs_item} ${fp_item} ${model_item} | tee ${log_path}/${log_name}_speed_8gpus8p 2>&1
+          CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash ${CUR_DIR}/run_benchmark.sh ${run_mode} ${config_path} ${decode_config_path} ${output} ${seed} ${ngpu} ${profiler_options} ${bs_item} ${fp_item} ${model_item} | tee ${log_path}/${log_name}_speed_8gpus8p 2>&1
           sleep 60
           log_name=speech_${model_item}_bs${bs_item}_${fp_item} # e.g. clas_MobileNetv1_mp_bs32_fp32_8
           echo "index is speed, 1gpus, begin, ${log_name}"
          run_mode=sp
          ngpu=1
-          CUDA_VISIBLE_DEVICES=0 bash ${CUR_DIR}/run_benchmark.sh ${run_mode} ${config_path} ${output} ${seed} ${ngpu} ${profiler_options} ${bs_item} ${fp_item} ${model_item} | tee ${log_path}/${log_name}_speed_1gpus 2>&1 # (5min)
+          CUDA_VISIBLE_DEVICES=0 bash ${CUR_DIR}/run_benchmark.sh ${run_mode} ${config_path} ${decode_config_path} ${output} ${seed} ${ngpu} ${profiler_options} ${bs_item} ${fp_item} ${model_item} | tee ${log_path}/${log_name}_speed_1gpus 2>&1 # (5min)
          sleep 60
          done
 done
diff --git a/tests/benchmark/conformer/run_benchmark.sh b/tests/benchmark/conformer/run_benchmark.sh
index 5b83b15ce5fda954ee272e5040e419f74b2a97a6..16cd410e22643fc720e18e592cf43dcf5383abed 100644
--- a/tests/benchmark/conformer/run_benchmark.sh
+++ b/tests/benchmark/conformer/run_benchmark.sh
@@ -5,13 +5,14 @@
 function _set_params(){
     run_mode=${1:-"sp"}          # sp: single-GPU | mp: multi-GPU
     config_path=${2:-"conf/conformer.yaml"}
-    output=${3:-"exp/conformer"}
-    seed=${4:-"0"}
-    ngpu=${5:-"1"}
-    profiler_options=${6:-"None"}
-    batch_size=${7:-"32"}
-    fp_item=${8:-"fp32"}
-    model_item=${9:-"conformer"}
+    decode_config_path=${3:-"conf/tuning/decode.yaml"}
+    output=${4:-"exp/conformer"}
+    seed=${5:-"0"}
+    ngpu=${6:-"1"}
+    profiler_options=${7:-"None"}
+    batch_size=${8:-"32"}
+    fp_item=${9:-"fp32"}
+    model_item=${10:-"conformer"}
     benchmark_max_step=0
     run_log_path=${TRAIN_LOG_DIR:-$(pwd)}  # TRAIN_LOG_DIR is set by QA later
 # parameters needed for log parsing
@@ -35,6 +36,7 @@ function _train(){
     echo "Train on ${num_gpu_devices} GPUs"
     echo "current CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES, gpus=$num_gpu_devices, batch_size=$batch_size"
     train_cmd="--config=${config_path} \
+               --decode_cfg=${decode_config_path} \
                --output=${output} \
                --seed=${seed} \
                --ngpu=${ngpu} \
@@ -69,6 +71,6 @@ function _train(){
 source ${BENCHMARK_ROOT}/scripts/run_model.sh # run_model.sh parses performance data from benchmark-style logs with the analysis.py script; for joint debugging it can be downloaded from the benchmark repo: https://github.com/PaddlePaddle/benchmark/blob/master/scripts/run_model.sh; comment this line out if you only want to produce the training log without joint debugging, and re-enable it before submitting
 _set_params $@
-# _train       # uncomment to produce only the training log, without parsing
+#_train       # uncomment to produce only the training log, without parsing
 _run          # _run is defined in run_model.sh and calls _train; comment this line out if you only want to produce the training log without joint debugging, and re-enable it before submitting
diff --git a/tests/chains/ds2/ds2_params_lite_train_infer.txt b/tests/chains/ds2/ds2_params_lite_train_infer.txt
index b11872bd006d8264cbe8f652f719ed4f25d78a62..cad8efa3c52cce058f10b4876e6ab9db2fccc15f 100644
--- a/tests/chains/ds2/ds2_params_lite_train_infer.txt
+++ b/tests/chains/ds2/ds2_params_lite_train_infer.txt
@@ -21,13 +21,13 @@ null:null
 null:null
 ##
 ===========================eval_params===========================
-eval: ../../../paddlespeech/s2t/exps/deepspeech2/bin/test.py --ngpu 1 --config conf/deepspeech2.yaml --checkpoint_path exp/deepspeech_tiny/checkpoints/9 --result_file tests/9.rsl --model_type offline
+eval: ../../../paddlespeech/s2t/exps/deepspeech2/bin/test.py --ngpu 1 --config conf/deepspeech2.yaml --decode_cfg conf/tuning/decode.yaml --checkpoint_path exp/deepspeech_tiny/checkpoints/4 --result_file tests/4.rsl --model_type offline
 null:null
 ##
 ===========================infer_params===========================
 null:null
 null:null
-norm_export: ../../../paddlespeech/s2t/exps/deepspeech2/bin/export.py --ngpu 1 --config conf/deepspeech2.yaml --model_type offline --checkpoint_path exp/deepspeech_tiny/checkpoints/9 --export_path exp/deepspeech_tiny/checkpoints/9.jit
+norm_export: ../../../paddlespeech/s2t/exps/deepspeech2/bin/export.py --ngpu 1 --config conf/deepspeech2.yaml --model_type offline --checkpoint_path exp/deepspeech_tiny/checkpoints/4 --export_path exp/deepspeech_tiny/checkpoints/4.jit
 quant_export:null
 fpgm_export:null
 distill_export:null
diff --git a/tests/chains/ds2/ds2_params_whole_train_infer.txt b/tests/chains/ds2/ds2_params_whole_train_infer.txt
index 875e3ccf9bb87fffd321452808c48ab692a379e2..5c6195069cc5f4c206425cd31dffd7009c9d8d65 100644
--- a/tests/chains/ds2/ds2_params_whole_train_infer.txt
+++ b/tests/chains/ds2/ds2_params_whole_train_infer.txt
@@ -21,7 +21,7 @@ null:null
 null:null
 ##
 ===========================eval_params===========================
-eval: ../../../paddlespeech/s2t/exps/deepspeech2/bin/test.py --ngpu 1 --config conf/deepspeech2.yaml --result_file tests/49.rsl --checkpoint_path exp/deepspeech_whole/checkpoints/49 --model_type offline
+eval: ../../../paddlespeech/s2t/exps/deepspeech2/bin/test.py --ngpu 1 --config conf/deepspeech2.yaml --decode_cfg conf/tuning/decode.yaml --result_file tests/49.rsl --checkpoint_path exp/deepspeech_whole/checkpoints/49 --model_type offline
 null:null
 ##
 ===========================infer_params===========================
diff --git a/tests/chains/ds2/lite_train_infer.sh b/tests/chains/ds2/lite_train_infer.sh
index 76b22a38c4dfb44d718b85e28f4739a3d59993ab..1dce1b2910219faede4b710f1ff433ca3d447bb0 100644
--- a/tests/chains/ds2/lite_train_infer.sh
+++ b/tests/chains/ds2/lite_train_infer.sh
@@ -1,5 +1,5 @@
 bash prepare.sh ds2_params_lite_train_infer.txt lite_train_infer
-cd ../../examples/tiny/s0
+cd ../../../examples/tiny/asr0
 source path.sh
-bash ../../../tests/chains/test.sh ../../../tests/chains/ds2_params_lite_train_infer.txt lite_train_infer
+bash ../../../tests/chains/ds2/test.sh ../../../tests/chains/ds2/ds2_params_lite_train_infer.txt lite_train_infer
 cd ../../../tests/chains
diff --git a/tests/chains/ds2/prepare.sh b/tests/chains/ds2/prepare.sh
index 73a3028362cba8829516ce1d0b86eecc3cdf0f5b..4913ce42e2deb7aaef5b0264a508bc83bc43a5da 100644
--- a/tests/chains/ds2/prepare.sh
+++ b/tests/chains/ds2/prepare.sh
@@ -34,7 +34,7 @@ MODE=$2
 if [ ${MODE} = "lite_train_infer" ];then    # pretrain lite train data
     curPath=$(readlink -f "$(dirname "$0")")
-    cd ${curPath}/../../examples/tiny/s0
+    cd ${curPath}/../../../examples/tiny/asr0
     source path.sh
     # download audio data
     bash ./local/data.sh || exit -1
@@ -47,7 +47,7 @@ if [ ${MODE} = "lite_train_infer" ];then
 
 elif [ ${MODE} = "whole_train_infer" ];then
     curPath=$(readlink -f "$(dirname "$0")")
-    cd ${curPath}/../../examples/aishell/s0
+    cd ${curPath}/../../../examples/aishell/asr0
     source path.sh
     # download audio data
     bash ./local/data.sh || exit -1
@@ -59,7 +59,7 @@ elif [ ${MODE} = "whole_train_infer" ];then
     cd ${curPath}
 elif [ ${MODE} = "whole_infer" ];then
     curPath=$(readlink -f "$(dirname "$0")")
-    cd ${curPath}/../../examples/aishell/s0
+    cd ${curPath}/../../../examples/aishell/asr0
     source path.sh
     # download audio data
     bash ./local/data.sh || exit -1
@@ -71,7 +71,7 @@ elif [ ${MODE} = "whole_infer" ];then
     cd ${curPath}
 else
     curPath=$(readlink -f "$(dirname "$0")")
-    cd ${curPath}/../../examples/aishell/s0
+    cd ${curPath}/../../../examples/aishell/asr0
     source path.sh
     # download audio data
     bash ./local/data.sh || exit -1
diff --git a/tests/chains/ds2/test.sh b/tests/chains/ds2/test.sh
index c930782051e0b75f9be0d7a95674b2e16da4ba40..26917c672b8aba788dad4301038531c0af7585b1 100644
--- a/tests/chains/ds2/test.sh
+++ b/tests/chains/ds2/test.sh
@@ -324,6 +324,7 @@ else
             gsu=${gpu//,/ }
             nump=`echo $gsu | wc -w`
             cmd="${python} ${run_train} --ngpu=$nump"
+            export CUDA_VISIBLE_DEVICES=${gpu}
         else    # train with multi-machine
             cmd="${python} -m paddle.distributed.launch --ips=${ips} --gpus=${gpu} ${run_train} ${set_save_model} ${set_pretrain} ${set_epoch} ${set_autocast} ${set_batchsize} ${set_train_params1}"
         fi
diff --git a/tests/unit/cli/test_cli.sh b/tests/unit/cli/test_cli.sh
new file mode 100755
index 0000000000000000000000000000000000000000..845c5d6a2983b499246754f07ee227d753003515
--- /dev/null
+++ b/tests/unit/cli/test_cli.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+set -e
+# Audio classification
+wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/cat.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/dog.wav
+paddlespeech cls --input ./cat.wav --topk 10
+
+# Punctuation restoration
+paddlespeech text --input 今天的天气真不错啊你下午有空吗我想约你一起去吃饭
+
+# Speech recognition
+wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav
+paddlespeech asr --input ./zh.wav
+paddlespeech asr --model transformer_librispeech --lang en --input ./en.wav
+
+# Text to speech
+paddlespeech tts --input "你好,欢迎使用百度飞桨深度学习框架!"
+paddlespeech tts --am speedyspeech_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!"
+paddlespeech tts --voc mb_melgan_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!"
+paddlespeech tts --voc style_melgan_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!"
+paddlespeech tts --voc hifigan_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!"
+paddlespeech tts --am fastspeech2_aishell3 --voc pwgan_aishell3 --input "你好,欢迎使用百度飞桨深度学习框架!" --spk_id 0
+paddlespeech tts --am fastspeech2_ljspeech --voc pwgan_ljspeech --lang en --input "hello world"
+paddlespeech tts --am fastspeech2_vctk --voc pwgan_vctk --input "hello, boys" --lang en --spk_id 0
+
+# Speech translation (only supported on Linux)
+paddlespeech st --input ./en.wav
diff --git a/paddlespeech/s2t/decoders/ctcdecoder/swig/.gitignore b/third_party/ctc_decoders/.gitignore
similarity index 100%
rename from paddlespeech/s2t/decoders/ctcdecoder/swig/.gitignore
rename to third_party/ctc_decoders/.gitignore
diff --git a/third_party/ctc_decoders/COPYING.APACHE2.0 b/third_party/ctc_decoders/COPYING.APACHE2.0
new file mode 100644
index 0000000000000000000000000000000000000000..261eeb9e9f8b2b4b0d119366dda99c6fd7d35c64
--- /dev/null
+++ b/third_party/ctc_decoders/COPYING.APACHE2.0
@@ -0,0 +1,201 @@
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner.
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. 
The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/third_party/ctc_decoders/COPYING.LESSER.3 b/third_party/ctc_decoders/COPYING.LESSER.3 new file mode 100644 index 0000000000000000000000000000000000000000..cca7fc278f5c81ce23a2687208f0d63a6ea44009 --- /dev/null +++ b/third_party/ctc_decoders/COPYING.LESSER.3 @@ -0,0 +1,165 @@ + GNU LESSER GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + + This version of the GNU Lesser General Public License incorporates +the terms and conditions of version 3 of the GNU General Public +License, supplemented by the additional permissions listed below. + + 0. Additional Definitions. + + As used herein, "this License" refers to version 3 of the GNU Lesser +General Public License, and the "GNU GPL" refers to version 3 of the GNU +General Public License. + + "The Library" refers to a covered work governed by this License, +other than an Application or a Combined Work as defined below. + + An "Application" is any work that makes use of an interface provided +by the Library, but which is not otherwise based on the Library. +Defining a subclass of a class defined by the Library is deemed a mode +of using an interface provided by the Library. + + A "Combined Work" is a work produced by combining or linking an +Application with the Library. The particular version of the Library +with which the Combined Work was made is also called the "Linked +Version". + + The "Minimal Corresponding Source" for a Combined Work means the +Corresponding Source for the Combined Work, excluding any source code +for portions of the Combined Work that, considered in isolation, are +based on the Application, and not on the Linked Version. + + The "Corresponding Application Code" for a Combined Work means the +object code and/or source code for the Application, including any data +and utility programs needed for reproducing the Combined Work from the +Application, but excluding the System Libraries of the Combined Work. + + 1. Exception to Section 3 of the GNU GPL. + + You may convey a covered work under sections 3 and 4 of this License +without being bound by section 3 of the GNU GPL. + + 2. Conveying Modified Versions. 
+ + If you modify a copy of the Library, and, in your modifications, a +facility refers to a function or data to be supplied by an Application +that uses the facility (other than as an argument passed when the +facility is invoked), then you may convey a copy of the modified +version: + + a) under this License, provided that you make a good faith effort to + ensure that, in the event an Application does not supply the + function or data, the facility still operates, and performs + whatever part of its purpose remains meaningful, or + + b) under the GNU GPL, with none of the additional permissions of + this License applicable to that copy. + + 3. Object Code Incorporating Material from Library Header Files. + + The object code form of an Application may incorporate material from +a header file that is part of the Library. You may convey such object +code under terms of your choice, provided that, if the incorporated +material is not limited to numerical parameters, data structure +layouts and accessors, or small macros, inline functions and templates +(ten or fewer lines in length), you do both of the following: + + a) Give prominent notice with each copy of the object code that the + Library is used in it and that the Library and its use are + covered by this License. + + b) Accompany the object code with a copy of the GNU GPL and this license + document. + + 4. Combined Works. + + You may convey a Combined Work under terms of your choice that, +taken together, effectively do not restrict modification of the +portions of the Library contained in the Combined Work and reverse +engineering for debugging such modifications, if you also do each of +the following: + + a) Give prominent notice with each copy of the Combined Work that + the Library is used in it and that the Library and its use are + covered by this License. + + b) Accompany the Combined Work with a copy of the GNU GPL and this license + document. + + c) For a Combined Work that displays copyright notices during + execution, include the copyright notice for the Library among + these notices, as well as a reference directing the user to the + copies of the GNU GPL and this license document. + + d) Do one of the following: + + 0) Convey the Minimal Corresponding Source under the terms of this + License, and the Corresponding Application Code in a form + suitable for, and under terms that permit, the user to + recombine or relink the Application with a modified version of + the Linked Version to produce a modified Combined Work, in the + manner specified by section 6 of the GNU GPL for conveying + Corresponding Source. + + 1) Use a suitable shared library mechanism for linking with the + Library. A suitable mechanism is one that (a) uses at run time + a copy of the Library already present on the user's computer + system, and (b) will operate properly with a modified version + of the Library that is interface-compatible with the Linked + Version. + + e) Provide Installation Information, but only if you would otherwise + be required to provide such information under section 6 of the + GNU GPL, and only to the extent that such information is + necessary to install and execute a modified version of the + Combined Work produced by recombining or relinking the + Application with a modified version of the Linked Version. (If + you use option 4d0, the Installation Information must accompany + the Minimal Corresponding Source and Corresponding Application + Code. 
If you use option 4d1, you must provide the Installation
+   Information in the manner specified by section 6 of the GNU GPL
+   for conveying Corresponding Source.)
+
+  5. Combined Libraries.
+
+  You may place library facilities that are a work based on the
+Library side by side in a single library together with other library
+facilities that are not Applications and are not covered by this
+License, and convey such a combined library under terms of your
+choice, if you do both of the following:
+
+   a) Accompany the combined library with a copy of the same work based
+   on the Library, uncombined with any other library facilities,
+   conveyed under the terms of this License.
+
+   b) Give prominent notice with the combined library that part of it
+   is a work based on the Library, and explaining where to find the
+   accompanying uncombined form of the same work.
+
+  6. Revised Versions of the GNU Lesser General Public License.
+
+  The Free Software Foundation may publish revised and/or new versions
+of the GNU Lesser General Public License from time to time. Such new
+versions will be similar in spirit to the present version, but may
+differ in detail to address new problems or concerns.
+
+  Each version is given a distinguishing version number. If the
+Library as you received it specifies that a certain numbered version
+of the GNU Lesser General Public License "or any later version"
+applies to it, you have the option of following the terms and
+conditions either of that published version or of any later version
+published by the Free Software Foundation. If the Library as you
+received it does not specify a version number of the GNU Lesser
+General Public License, you may choose any version of the GNU Lesser
+General Public License ever published by the Free Software Foundation.
+
+  If the Library as you received it specifies that a proxy can decide
+whether future versions of the GNU Lesser General Public License shall
+apply, that proxy's public statement of acceptance of any version is
+permanent authorization for you to choose that version for the
+Library.
diff --git a/third_party/ctc_decoders/LICENSE b/third_party/ctc_decoders/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..eeef74b3099776f6afa02dd3fe0ca7f304138248
--- /dev/null
+++ b/third_party/ctc_decoders/LICENSE
@@ -0,0 +1,8 @@
+Most of the code here is licensed under the Apache License 2.0.
+There are exceptions that have their own licenses, listed below.
+
+scorer.h and scorer.cpp are under the LGPL license.
+The two files include header files from the KenLM project.
+
+For the rest:
+The default license of paddlespeech-ctcdecoders is Apache License 2.0.
diff --git a/paddlespeech/s2t/decoders/ctcdecoder/swig/__init__.py b/third_party/ctc_decoders/__init__.py
similarity index 100%
rename from paddlespeech/s2t/decoders/ctcdecoder/swig/__init__.py
rename to third_party/ctc_decoders/__init__.py
diff --git a/paddlespeech/s2t/decoders/ctcdecoder/swig/ctc_beam_search_decoder.cpp b/third_party/ctc_decoders/ctc_beam_search_decoder.cpp
similarity index 98%
rename from paddlespeech/s2t/decoders/ctcdecoder/swig/ctc_beam_search_decoder.cpp
rename to third_party/ctc_decoders/ctc_beam_search_decoder.cpp
index 8469a194dfe461192b2280a43c7b2c6e3bf64cdd..db742fbbe7a4250a3a8264460914fdac4b11beae 100644
--- a/paddlespeech/s2t/decoders/ctcdecoder/swig/ctc_beam_search_decoder.cpp
+++ b/third_party/ctc_decoders/ctc_beam_search_decoder.cpp
@@ -1,6 +1,6 @@
 // Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
// -// Licensed under the Apache License, Version 2.0 (the "License"); +// Licensed under the Apache License, Version 2.0 (the "COPYING.APACHE2.0"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // @@ -183,7 +183,7 @@ std::vector> ctc_beam_search_decoder( std::sort( prefixes.begin(), prefixes.begin() + num_prefixes, prefix_compare); - // compute aproximate ctc score as the return score, without affecting the + // compute approximate ctc score as the return score, without affecting the // return order of decoding result. To delete when decoder gets stable. for (size_t i = 0; i < beam_size && i < prefixes.size(); ++i) { double approx_ctc = prefixes[i]->score; diff --git a/paddlespeech/s2t/decoders/ctcdecoder/swig/ctc_beam_search_decoder.h b/third_party/ctc_decoders/ctc_beam_search_decoder.h similarity index 97% rename from paddlespeech/s2t/decoders/ctcdecoder/swig/ctc_beam_search_decoder.h rename to third_party/ctc_decoders/ctc_beam_search_decoder.h index eaba9da8c8a05ae10391d81f30c368f125524aac..584226574b3ae06502315ff8134b002642d52fe2 100644 --- a/paddlespeech/s2t/decoders/ctcdecoder/swig/ctc_beam_search_decoder.h +++ b/third_party/ctc_decoders/ctc_beam_search_decoder.h @@ -1,6 +1,6 @@ // Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. // -// Licensed under the Apache License, Version 2.0 (the "License"); +// Licensed under the Apache License, Version 2.0 (the "COPYING.APACHE2.0"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // diff --git a/paddlespeech/s2t/decoders/ctcdecoder/swig/ctc_greedy_decoder.cpp b/third_party/ctc_decoders/ctc_greedy_decoder.cpp similarity index 96% rename from paddlespeech/s2t/decoders/ctcdecoder/swig/ctc_greedy_decoder.cpp rename to third_party/ctc_decoders/ctc_greedy_decoder.cpp index 53a04fba0fac515e0877a714f7e5b48b4e76c807..a178c67341c3f8882e142c4e82b1a3e75035d65d 100644 --- a/paddlespeech/s2t/decoders/ctcdecoder/swig/ctc_greedy_decoder.cpp +++ b/third_party/ctc_decoders/ctc_greedy_decoder.cpp @@ -1,6 +1,6 @@ // Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. // -// Licensed under the Apache License, Version 2.0 (the "License"); +// Licensed under the Apache License, Version 2.0 (the "COPYING.APACHE2.0"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // diff --git a/paddlespeech/s2t/decoders/ctcdecoder/swig/ctc_greedy_decoder.h b/third_party/ctc_decoders/ctc_greedy_decoder.h similarity index 93% rename from paddlespeech/s2t/decoders/ctcdecoder/swig/ctc_greedy_decoder.h rename to third_party/ctc_decoders/ctc_greedy_decoder.h index dd1b333153510766ed64875200825d145a374186..4d60beaf1702375f74355b11dfd0feef32c5ec0d 100644 --- a/paddlespeech/s2t/decoders/ctcdecoder/swig/ctc_greedy_decoder.h +++ b/third_party/ctc_decoders/ctc_greedy_decoder.h @@ -1,6 +1,6 @@ // Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. // -// Licensed under the Apache License, Version 2.0 (the "License"); +// Licensed under the Apache License, Version 2.0 (the "COPYING.APACHE2.0"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // diff --git a/paddlespeech/s2t/decoders/ctcdecoder/swig/decoder_utils.cpp b/third_party/ctc_decoders/decoder_utils.cpp similarity index 98% rename from paddlespeech/s2t/decoders/ctcdecoder/swig/decoder_utils.cpp rename to third_party/ctc_decoders/decoder_utils.cpp index 5d69ad0326e97bbb84140309dd3abeabfea831ef..c7ef65428e11d0a719a8a9904be094e58f4c0a0a 100644 --- a/paddlespeech/s2t/decoders/ctcdecoder/swig/decoder_utils.cpp +++ b/third_party/ctc_decoders/decoder_utils.cpp @@ -1,6 +1,6 @@ // Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. // -// Licensed under the Apache License, Version 2.0 (the "License"); +// Licensed under the Apache License, Version 2.0 (the "COPYING.APACHE2.0"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // @@ -26,7 +26,7 @@ std::vector> get_pruned_log_probs( for (size_t i = 0; i < prob_step.size(); ++i) { prob_idx.push_back(std::pair(i, prob_step[i])); } - // pruning of vacobulary + // pruning of vocabulary size_t cutoff_len = prob_step.size(); if (cutoff_prob < 1.0 || cutoff_top_n < cutoff_len) { std::sort(prob_idx.begin(), diff --git a/paddlespeech/s2t/decoders/ctcdecoder/swig/decoder_utils.h b/third_party/ctc_decoders/decoder_utils.h similarity index 98% rename from paddlespeech/s2t/decoders/ctcdecoder/swig/decoder_utils.h rename to third_party/ctc_decoders/decoder_utils.h index 1d75d03dbef5e4a9a4c5514e8dab38a0c803aa34..0987415529a506f65eea98ac8dc2477334561cbf 100644 --- a/paddlespeech/s2t/decoders/ctcdecoder/swig/decoder_utils.h +++ b/third_party/ctc_decoders/decoder_utils.h @@ -1,6 +1,6 @@ // Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. // -// Licensed under the Apache License, Version 2.0 (the "License"); +// Licensed under the Apache License, Version 2.0 (the "COPYING.APACHE2.0"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // diff --git a/paddlespeech/s2t/decoders/ctcdecoder/swig/decoders.i b/third_party/ctc_decoders/decoders.i similarity index 100% rename from paddlespeech/s2t/decoders/ctcdecoder/swig/decoders.i rename to third_party/ctc_decoders/decoders.i diff --git a/paddlespeech/s2t/decoders/ctcdecoder/swig/path_trie.cpp b/third_party/ctc_decoders/path_trie.cpp similarity index 98% rename from paddlespeech/s2t/decoders/ctcdecoder/swig/path_trie.cpp rename to third_party/ctc_decoders/path_trie.cpp index f52d1157352f8013cebdb1932135d5b3422332e4..a5e7dd3da9479bcca2b5b3fae36f93b2d36eea31 100644 --- a/paddlespeech/s2t/decoders/ctcdecoder/swig/path_trie.cpp +++ b/third_party/ctc_decoders/path_trie.cpp @@ -1,6 +1,6 @@ // Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. // -// Licensed under the Apache License, Version 2.0 (the "License"); +// Licensed under the Apache License, Version 2.0 (the "COPYING.APACHE2.0"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // diff --git a/paddlespeech/s2t/decoders/ctcdecoder/swig/path_trie.h b/third_party/ctc_decoders/path_trie.h similarity index 96% rename from paddlespeech/s2t/decoders/ctcdecoder/swig/path_trie.h rename to third_party/ctc_decoders/path_trie.h index 717d4b00435affc412e6910ef728358f9907bf0b..5193e0a47e6a9ee41d2ed1f9f5ca6f379d2de110 100644 --- a/paddlespeech/s2t/decoders/ctcdecoder/swig/path_trie.h +++ b/third_party/ctc_decoders/path_trie.h @@ -1,6 +1,6 @@ // Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
// -// Licensed under the Apache License, Version 2.0 (the "License"); +// Licensed under the Apache License, Version 2.0 (the "COPYING.APACHE2.0"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // diff --git a/paddlespeech/s2t/decoders/ctcdecoder/swig/scorer.cpp b/third_party/ctc_decoders/scorer.cpp similarity index 90% rename from paddlespeech/s2t/decoders/ctcdecoder/swig/scorer.cpp rename to third_party/ctc_decoders/scorer.cpp index 7bd6542dfef1c003327161c6fb886e094cef0c2c..977112d17f7868bd9b14510a183d43026a504f2e 100644 --- a/paddlespeech/s2t/decoders/ctcdecoder/swig/scorer.cpp +++ b/third_party/ctc_decoders/scorer.cpp @@ -1,16 +1,4 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. +// Licensed under GNU Lesser General Public License v3 (LGPLv3) (LGPL-3) (the "COPYING.LESSER.3"); #include "scorer.h" @@ -20,8 +8,6 @@ #include "lm/config.hh" #include "lm/model.hh" #include "lm/state.hh" -#include "util/string_piece.hh" -#include "util/tokenize_piece.hh" #include "decoder_utils.h" @@ -223,7 +209,7 @@ void Scorer::fill_dictionary(bool add_space) { * This gets rid of "epsilon" transitions in the FST. * These are transitions that don't require a string input to be taken. - * Getting rid of them is necessary to make the FST determinisitc, but + * Getting rid of them is necessary to make the FST deterministic, but * can greatly increase the size of the FST */ fst::RmEpsilon(&dictionary); diff --git a/paddlespeech/s2t/decoders/ctcdecoder/swig/scorer.h b/third_party/ctc_decoders/scorer.h similarity index 82% rename from paddlespeech/s2t/decoders/ctcdecoder/swig/scorer.h rename to third_party/ctc_decoders/scorer.h index 3f3001e77c222af41ee60b57d6f0bc38e81cb9ef..5739339df7c9afe3f9620cc3a70c0e76c6b5c963 100644 --- a/paddlespeech/s2t/decoders/ctcdecoder/swig/scorer.h +++ b/third_party/ctc_decoders/scorer.h @@ -1,16 +1,4 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
+// Licensed under GNU Lesser General Public License v3 (LGPLv3) (LGPL-3) (the "COPYING.LESSER.3"); #ifndef SCORER_H_ #define SCORER_H_ @@ -23,7 +11,6 @@ #include "lm/enumerate_vocab.hh" #include "lm/virtual_interface.hh" #include "lm/word_index.hh" -#include "util/string_piece.hh" #include "path_trie.h" diff --git a/paddlespeech/s2t/decoders/ctcdecoder/swig/setup.py b/third_party/ctc_decoders/setup.py similarity index 97% rename from paddlespeech/s2t/decoders/ctcdecoder/swig/setup.py rename to third_party/ctc_decoders/setup.py index 8a2086d6bc061cadf53a788bf90936f4d5f4c72a..6484b87c543c7e0cf80b841a8eadec485b7f28a9 100644 --- a/paddlespeech/s2t/decoders/ctcdecoder/swig/setup.py +++ b/third_party/ctc_decoders/setup.py @@ -127,11 +127,11 @@ decoders_module = [ setup( name='paddlespeech_ctcdecoders', - version='0.1.0', + version='0.1.1', description="CTC decoders in paddlespeech", author="PaddlePaddle Speech and Language Team", author_email="paddlesl@baidu.com", url="https://github.com/PaddlePaddle/PaddleSpeech", - license='Apache 2.0', + license='Apache 2.0, GNU Lesser General Public License v3 (LGPLv3) (LGPL-3)', ext_modules=decoders_module, py_modules=['swig_decoders']) diff --git a/paddlespeech/s2t/decoders/ctcdecoder/swig/setup.sh b/third_party/ctc_decoders/setup.sh similarity index 100% rename from paddlespeech/s2t/decoders/ctcdecoder/swig/setup.sh rename to third_party/ctc_decoders/setup.sh diff --git a/tools/extras/install_kaldi.sh b/tools/extras/install_kaldi.sh index b93e7ecf6372c9f84391289ace2c4a1b3590b45e..f8cd961fc346ea8b085c5797ec88eadac93b83dd 100755 --- a/tools/extras/install_kaldi.sh +++ b/tools/extras/install_kaldi.sh @@ -34,7 +34,7 @@ make -j4 pushd ../src OPENBLAS_DIR=${KALDI_DIR}/../OpenBLAS mkdir -p ${OPENBLAS_DIR}/install -if [ $SHARED == true ]; +if [ $SHARED == true ]; then ./configure --shared --use-cuda=no --static-math --mathlib=OPENBLAS --openblas-root=${OPENBLAS_DIR}/install else ./configure --static --use-cuda=no --static-math --mathlib=OPENBLAS --openblas-root=${OPENBLAS_DIR}/install diff --git a/tools/release_note.py b/tools/release_note.py index 07a0576972cf5bde45bc9c2cc0c43d11b8611ba6..2016c1a905597a1d88f8f60f877ed3c957f52948 100755 --- a/tools/release_note.py +++ b/tools/release_note.py @@ -14,191 +14,180 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== - """ Create release notes with the issues from a milestone. python3 release_notes.py -c didi delta v.xxxxx """ - -import sys -import json import argparse -import urllib.request import collections +import json +import sys +import urllib.request github_url = 'https://api.github.com/repos' if __name__ == '__main__': - # usage: - # 1. close milestone on github - # 2. 
python3 tools/release_notes.py -c didi delta v0.3.3 - - # Parse arguments - parser = argparse.ArgumentParser( - description='Create a draft release with the issues from a milestone.', - ) - - parser.add_argument( - 'user', - metavar='user', - type=str, - default='paddlepaddle', - help='github user: paddlepaddle' - ) - - parser.add_argument( - 'repository', - metavar='repository', - type=str, - default='paddlespeech', - help='github repository: paddlespeech' - ) - - parser.add_argument( - 'milestone', - metavar='milestone', - type=str, - help='name of used milestone: v0.3.3' - ) - - parser.add_argument( - '-c', '--closed', - help='Fetch closed milestones/issues', - action='store_true' - ) - - parser.print_help() - args = parser.parse_args() - - # Fetch milestone infos - url = "%s/%s/%s/milestones" % ( - github_url, - args.user, - args.repository - ) - - headers = { - 'Origin': 'https://github.com', - 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) ' - 'AppleWebKit/537.11 (KHTML, like Gecko) ' - 'Chrome/23.0.1271.64 Safari/537.11', - 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', - 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3', - 'Accept-Encoding': 'none', - 'Accept-Language': 'en-US,en;q=0.8', - 'Connection': 'keep-alive'} - - if args.closed: - url += "?state=closed" - - req = urllib.request.Request(url, headers=headers) - github_request = urllib.request.urlopen(req) - if not github_request: - parser.error('Cannot read milestone list.') - - decoder = json.JSONDecoder() - milestones = decoder.decode(github_request.read().decode('utf-8')) - github_request.close() - - print('parse milestones', file=sys.stderr) - milestone_id = None - for milestone in milestones: - if milestone['title'] == args.milestone: - milestone_id = milestone['number'] - if not milestone_id: - parser.error('Cannot find milestone') - - - # Get milestone related issue info - url = '%s/%s/%s/issues?milestone=%d' % ( - github_url, - args.user, - args.repository, - milestone_id - ) - if args.closed: - url += "&state=closed" - - req = urllib.request.Request(url, headers=headers) - github_request = urllib.request.urlopen(req) - if not github_request: - parser.error('Cannot read issue list.') - - issues = decoder.decode(github_request.read().decode('utf-8')) - github_request.close() - - #print('parse issues', file=sys.stderr) - #final_data = [] - #labels = [] - #thanks_to = [] - #for issue in issues: - - # for label in issue['labels']: - # labels.append(label['name']) - - # thanks_to.append('@%s' % (issue['user']['login'])) - # final_data.append(' * **[%s]** - %s #%d by **@%s**\n' % ( - # label['name'], - # issue['title'], - # issue['number'], - # issue['user']['login'] - # )) - - #dic = collections.defaultdict(set) - #for l_release in list(set(labels)): - - # for f_data in final_data: - # if '[%s]' % l_release in f_data: - # dic[l_release].add(f_data) - - #with open(f"release_note_issues_{args.milestone}.md", 'w') as f: - # for key, value in dic.items(): - # print('# %s\n%s' % (key, ''.join(value)), file=f) - # print('# %s\n%s' % ('Acknowledgements', 'Special thanks to %s ' % (' '.join(list(set(thanks_to))))), file=f) - - - # Get milestone related PR info - url = '%s/%s/%s/pulls?milestone=%d' % ( - github_url, - args.user, - args.repository, - milestone_id - ) - if args.closed: - url += "&state=closed" - - req = urllib.request.Request(url, headers=headers) - github_request = urllib.request.urlopen(req) - if not github_request: - parser.error('Cannot read issue list.') - - issues = 
decoder.decode(github_request.read().decode('utf-8')) - github_request.close() - - print('parse pulls', file=sys.stderr) - final_data = [] - labels = [] - thanks_to = [] - for issue in issues: - - for label in issue['labels']: - labels.append(label['name']) - - thanks_to.append('@%s' % (issue['user']['login'])) - final_data.append(' * **[%s]** - %s #%d by **@%s**\n' % ( - label['name'], - issue['title'], - issue['number'], - issue['user']['login'] - )) - - dic = collections.defaultdict(set) - for l_release in list(set(labels)): - - for f_data in final_data: - if '[%s]' % l_release in f_data: - dic[l_release].add(f_data) - - with open(f"release_note_pulls_{args.milestone}.md", 'w') as f: - for key, value in dic.items(): - print('# %s\n%s' % (key, ''.join(value)), file=f) - print('# %s\n%s' % ('Acknowledgements', 'Special thanks to %s ' % (' '.join(list(set(thanks_to))))), file=f) + # usage: + # 1. close milestone on github + # 2. python3 tools/release_notes.py -c didi delta v0.3.3 + + # Parse arguments + parser = argparse.ArgumentParser( + description='Create a draft release with the issues from a milestone.', + ) + + parser.add_argument( + 'user', + metavar='user', + type=str, + default='paddlepaddle', + help='github user: paddlepaddle') + + parser.add_argument( + 'repository', + metavar='repository', + type=str, + default='paddlespeech', + help='github repository: paddlespeech') + + parser.add_argument( + 'milestone', + metavar='milestone', + type=str, + help='name of used milestone: v0.3.3') + + parser.add_argument( + '-c', + '--closed', + help='Fetch closed milestones/issues', + action='store_true') + + parser.print_help() + args = parser.parse_args() + + # Fetch milestone infos + url = "%s/%s/%s/milestones" % (github_url, args.user, args.repository) + + headers = { + 'Origin': + 'https://github.com', + 'User-Agent': + 'Mozilla/5.0 (X11; Linux x86_64) ' + 'AppleWebKit/537.11 (KHTML, like Gecko) ' + 'Chrome/23.0.1271.64 Safari/537.11', + 'Accept': + 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', + 'Accept-Charset': + 'ISO-8859-1,utf-8;q=0.7,*;q=0.3', + 'Accept-Encoding': + 'none', + 'Accept-Language': + 'en-US,en;q=0.8', + 'Connection': + 'keep-alive' + } + + if args.closed: + url += "?state=closed" + + req = urllib.request.Request(url, headers=headers) + github_request = urllib.request.urlopen(req) + if not github_request: + parser.error('Cannot read milestone list.') + + decoder = json.JSONDecoder() + milestones = decoder.decode(github_request.read().decode('utf-8')) + github_request.close() + + print('parse milestones', file=sys.stderr) + milestone_id = None + for milestone in milestones: + if milestone['title'] == args.milestone: + milestone_id = milestone['number'] + if not milestone_id: + parser.error('Cannot find milestone') + + # Get milestone related issue info + url = '%s/%s/%s/issues?milestone=%d' % (github_url, args.user, + args.repository, milestone_id) + if args.closed: + url += "&state=closed" + + req = urllib.request.Request(url, headers=headers) + github_request = urllib.request.urlopen(req) + if not github_request: + parser.error('Cannot read issue list.') + + issues = decoder.decode(github_request.read().decode('utf-8')) + github_request.close() + + #print('parse issues', file=sys.stderr) + #final_data = [] + #labels = [] + #thanks_to = [] + #for issue in issues: + + # for label in issue['labels']: + # labels.append(label['name']) + + # thanks_to.append('@%s' % (issue['user']['login'])) + # final_data.append(' * **[%s]** - %s #%d by **@%s**\n' % ( + 
diff --git a/utils/generate_infer_yaml.py b/utils/generate_infer_yaml.py
new file mode 100644
index 0000000000000000000000000000000000000000..a2eb28c76076c8a7e1d3afacbf09fcf494ccd17a
--- /dev/null
+++ b/utils/generate_infer_yaml.py
@@ -0,0 +1,183 @@
+#!/usr/bin/env python3
+# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+
+'''
+    Merge training configs into a single inference config.
+    The single inference config is for the CLI, which takes only one config file to run inference.
+    The training configs include the model config, preprocess config, decode config, vocab file and cmvn file.
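+
+    A hypothetical example invocation (the paths are illustrative and depend
+    on the recipe's directory layout):
+        python3 utils/generate_infer_yaml.py \
+            --cfg_pth conf/conformer.yaml \
+            --pre_pth conf/preprocess.yaml \
+            --dcd_pth conf/tuning/decode.yaml \
+            --vb_pth data/lang_char/vocab.txt \
+            --cmvn_pth data/mean_std.json \
+            --save_pth conf/conformer_infer.yaml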
+'''
+
+import argparse
+import json
+import math
+import os
+from contextlib import redirect_stdout
+
+import yaml
+from yacs.config import CfgNode
+
+from paddlespeech.s2t.frontend.utility import load_dict
+
+
+def save(save_path, config):
+    with open(save_path, 'w') as fp:
+        with redirect_stdout(fp):
+            print(config.dump())
+
+
+def load(save_path):
+    config = CfgNode(new_allowed=True)
+    config.merge_from_file(save_path)
+    return config
+
+
+def load_json(json_path):
+    with open(json_path) as f:
+        json_content = json.load(f)
+    return json_content
+
+
+def remove_config_part(config, key_list):
+    if len(key_list) == 0:
+        return
+    for i in range(len(key_list) - 1):
+        config = config[key_list[i]]
+    config.pop(key_list[-1])
+
+
+def load_cmvn_from_json(cmvn_stats):
+    means = cmvn_stats['mean_stat']
+    variance = cmvn_stats['var_stat']
+    count = cmvn_stats['frame_num']
+    for i in range(len(means)):
+        means[i] /= count
+        variance[i] = variance[i] / count - means[i] * means[i]
+        if variance[i] < 1.0e-20:
+            variance[i] = 1.0e-20
+        variance[i] = 1.0 / math.sqrt(variance[i])
+    cmvn_stats = {"mean": means, "istd": variance}
+    return cmvn_stats
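+
+
+# Quick sanity check of load_cmvn_from_json with hypothetical stats:
+#   load_cmvn_from_json({"mean_stat": [4.0], "var_stat": [10.0], "frame_num": 2})
+# gives mean 4/2 = 2.0, variance 10/2 - 2.0**2 = 1.0 and istd 1/sqrt(1.0) = 1.0,
+# i.e. it returns {"mean": [2.0], "istd": [1.0]}.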
+
+
+def merge_configs(
+        conf_path="conf/conformer.yaml",
+        preprocess_path="conf/preprocess.yaml",
+        decode_path="conf/tuning/decode.yaml",
+        vocab_path="data/vocab.txt",
+        cmvn_path="data/mean_std.json",
+        save_path="conf/conformer_infer.yaml", ):
+
+    # Load the configs
+    config = load(conf_path)
+    decode_config = load(decode_path)
+    vocab_list = load_dict(vocab_path)
+
+    # If the kaldi feature is used, do not load the cmvn file
+    if cmvn_path.split(".")[-1] == 'json':
+        cmvn_stats = load_json(cmvn_path)
+        if os.path.exists(preprocess_path):
+            preprocess_config = load(preprocess_path)
+            for idx, process in enumerate(preprocess_config["process"]):
+                if process['type'] == "cmvn_json":
+                    preprocess_config["process"][idx][
+                        "cmvn_path"] = cmvn_stats
+                    break
+
+            config.preprocess_config = preprocess_config
+        else:
+            cmvn_stats = load_cmvn_from_json(cmvn_stats)
+            config.mean_std_filepath = [{"cmvn_stats": cmvn_stats}]
+            config.augmentation_config = ''
+    # otherwise the cmvn file ends with .ark
+    else:
+        config.cmvn_path = cmvn_path
+
+    # Update the config
+    config.vocab_filepath = vocab_list
+    config.input_dim = config.feat_dim
+    config.output_dim = len(config.vocab_filepath)
+    config.decode = decode_config
+
+    # Remove the training-only parts of the config
+    if os.path.exists(preprocess_path):
+        remove_train_list = [
+            "train_manifest",
+            "dev_manifest",
+            "test_manifest",
+            "n_epoch",
+            "accum_grad",
+            "global_grad_clip",
+            "optim",
+            "optim_conf",
+            "scheduler",
+            "scheduler_conf",
+            "log_interval",
+            "checkpoint",
+            "shuffle_method",
+            "weight_decay",
+            "ctc_grad_norm_type",
+            "minibatches",
+            "subsampling_factor",
+            "batch_bins",
+            "batch_count",
+            "batch_frames_in",
+            "batch_frames_inout",
+            "batch_frames_out",
+            "sortagrad",
+            "feat_dim",
+            "stride_ms",
+            "window_ms",
+            "batch_size",
+            "maxlen_in",
+            "maxlen_out",
+        ]
+    else:
+        remove_train_list = [
+            "train_manifest",
+            "dev_manifest",
+            "test_manifest",
+            "n_epoch",
+            "accum_grad",
+            "global_grad_clip",
+            "log_interval",
+            "checkpoint",
+            "lr",
+            "lr_decay",
+            "batch_size",
+            "shuffle_method",
+            "weight_decay",
+            "sortagrad",
+            "num_workers",
+        ]
+
+    for item in remove_train_list:
+        try:
+            remove_config_part(config, [item])
+        except Exception:
+            print(item + " cannot be removed")
+
+    # Save the config
+    save(save_path, config)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(prog='Config merge', add_help=True)
+    parser.add_argument(
+        '--cfg_pth',
+        type=str,
+        default='conf/transformer.yaml',
+        help='path of the model config file')
+    parser.add_argument(
+        '--pre_pth',
+        type=str,
+        default="conf/preprocess.yaml",
+        help='path of the preprocess config file')
+    parser.add_argument(
+        '--dcd_pth',
+        type=str,
+        default="conf/tuning/decode.yaml",
+        help='path of the decode config file')
+    parser.add_argument(
+        '--vb_pth',
+        type=str,
+        default="data/lang_char/vocab.txt",
+        help='path of the vocab file')
+    parser.add_argument(
+        '--cmvn_pth',
+        type=str,
+        default="data/mean_std.json",
+        help='path of the cmvn file')
+    parser.add_argument(
+        '--save_pth',
+        type=str,
+        default="conf/transformer_infer.yaml",
+        help='path to save the merged inference config')
+    parser_args = parser.parse_args()
+
+    merge_configs(
+        conf_path=parser_args.cfg_pth,
+        decode_path=parser_args.dcd_pth,
+        preprocess_path=parser_args.pre_pth,
+        vocab_path=parser_args.vb_pth,
+        cmvn_path=parser_args.cmvn_pth,
+        save_path=parser_args.save_pth, )
diff --git a/examples/csmsc/voc3/local/link_wav.py b/utils/link_wav.py
similarity index 77%
rename from examples/csmsc/voc3/local/link_wav.py
rename to utils/link_wav.py
index c81e0d4b83320665b98720d09a940e9de6dc63cd..8fe2156b273a0c9c56a96f9d6d22e00d5f160d98 100644
--- a/examples/csmsc/voc3/local/link_wav.py
+++ b/utils/link_wav.py
@@ -18,6 +18,7 @@ from pathlib import Path
 
 import jsonlines
 import numpy as np
+from tqdm import tqdm
 
 
 def main():
@@ -52,15 +53,24 @@ def main():
         output_dir = dump_dir / sub
         output_dir.mkdir(parents=True, exist_ok=True)
         results = []
-        for name in os.listdir(output_dir / "raw"):
-            # 003918_feats.npy
-            utt_id = name.split("_")[0]
+        files = os.listdir(output_dir / "raw")
+        for name in tqdm(files):
+            utt_id = name.split("_feats.npy")[0]
             mel_path = output_dir / ("raw/" + name)
             gen_mel = np.load(mel_path)
             wave_name = utt_id + "_wave.npy"
-            wav = np.load(old_dump_dir / sub / ("raw/" + wave_name))
-            os.symlink(old_dump_dir / sub / ("raw/" + wave_name),
-                       output_dir / ("raw/" + wave_name))
+            try:
+                wav = np.load(old_dump_dir / sub / ("raw/" + wave_name))
+                os.symlink(old_dump_dir / sub / ("raw/" + wave_name),
+                           output_dir / ("raw/" + wave_name))
+            except FileNotFoundError:
+                print("delete " + name +
+                      " because it cannot be found in the dump folder")
+                os.remove(output_dir / "raw" / name)
+                continue
+            except FileExistsError:
+                print("file " + name + " exists, skip.")
+                continue
             num_sample = wav.shape[0]
             num_frames = gen_mel.shape[0]
             wav_path = output_dir / ("raw/" + wave_name)
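
# A quick illustration of the two utt_id parses in the hunk above, using a
# hypothetical filename: for name = "p225_001_feats.npy", name.split("_")[0]
# returns "p225", truncating utterance ids that contain underscores, whereas
# name.split("_feats.npy")[0] returns the full id "p225_001".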