diff --git a/.gitignore b/.gitignore index cc8fff8770b97a3f31eb49270ad32ac25af30fad..778824f5e8a3c655cea60c81f259625da45dd40f 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,7 @@ *.pyc .vscode *log +*.wav *.pdmodel *.pdiparams* *.zip @@ -30,5 +31,8 @@ tools/OpenBLAS/ tools/Miniconda3-latest-Linux-x86_64.sh tools/activate_python.sh tools/miniconda.sh +tools/CRF++-0.58/ + +speechx/fc_patch/ *output/ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 60f0b92f6025d78908cf5043161c6b21771aaa95..7fb01708a3de083c368031e7353fd35e2455788a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -50,12 +50,13 @@ repos: entry: bash .pre-commit-hooks/clang-format.hook -i language: system files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|cuh|proto)$ + exclude: (?=speechx/speechx/kaldi).*(\.cpp|\.cc|\.h|\.py)$ - id: copyright_checker name: copyright_checker entry: python .pre-commit-hooks/copyright-check.hook language: system files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|py)$ - exclude: (?=third_party|pypinyin).*(\.cpp|\.h|\.py)$ + exclude: (?=third_party|pypinyin|speechx/speechx/kaldi).*(\.cpp|\.cc|\.h|\.py)$ - repo: https://github.com/asottile/reorder_python_imports rev: v2.4.0 hooks: diff --git a/README.md b/README.md index 46730797ba6c99549ddc178a4babd38875217768..46f492e998028485e0b1322d550eeb52eb49d0d7 100644 --- a/README.md +++ b/README.md @@ -148,6 +148,12 @@ For more synthesized audios, please refer to [PaddleSpeech Text-to-Speech sample - [PaddleSpeech Demo Video](https://paddlespeech.readthedocs.io/en/latest/demo_video.html) +- **[VTuberTalk](https://github.com/jerryuhoo/VTuberTalk): Use PaddleSpeech TTS and ASR to clone voice from videos.** + +
+ +
+ ### 🔥 Hot Activities - 2021.12.21~12.24 @@ -196,16 +202,18 @@ Developers can have a try of our models with [PaddleSpeech Command Line](./paddl ```shell paddlespeech cls --input input.wav ``` + **Automatic Speech Recognition** ```shell paddlespeech asr --lang zh --input input_16k.wav ``` -**Speech Translation** (English to Chinese) +**Speech Translation** (English to Chinese) (not support for Mac and Windows now) ```shell paddlespeech st --input input_16k.wav ``` + **Text-to-Speech** ```shell paddlespeech tts --input "你好,欢迎使用飞桨深度学习框架!" --output output.wav @@ -218,7 +226,16 @@ paddlespeech tts --input "你好,欢迎使用飞桨深度学习框架!" --ou paddlespeech text --task punc --input 今天的天气真不错啊你下午有空吗我想约你一起去吃饭 ``` - +**Batch Process** +``` +echo -e "1 欢迎光临。\n2 谢谢惠顾。" | paddlespeech tts +``` + +**Shell Pipeline** +- ASR + Punctuation Restoration +``` +paddlespeech asr --input ./zh.wav | paddlespeech text --task punc +``` For more command lines, please see: [demos](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos) @@ -561,6 +578,9 @@ You are warmly welcome to submit questions in [discussions](https://github.com/P - Many thanks to [JiehangXie](https://github.com/JiehangXie)/[PaddleBoBo](https://github.com/JiehangXie/PaddleBoBo) for developing Virtual Uploader(VUP)/Virtual YouTuber(VTuber) with PaddleSpeech TTS function. - Many thanks to [745165806](https://github.com/745165806)/[PaddleSpeechTask](https://github.com/745165806/PaddleSpeechTask) for contributing Punctuation Restoration model. - Many thanks to [kslz](https://github.com/745165806) for supplementary Chinese documents. +- Many thanks to [awmmmm](https://github.com/awmmmm) for contributing fastspeech2 aishell3 conformer pretrained model. +- Many thanks to [phecda-xu](https://github.com/phecda-xu)/[PaddleDubbing](https://github.com/phecda-xu/PaddleDubbing) for developing a dubbing tool with GUI based on PaddleSpeech TTS model. +- Many thanks to [jerryuhoo](https://github.com/jerryuhoo)/[VTuberTalk](https://github.com/jerryuhoo/VTuberTalk) for developing a GUI tool based on PaddleSpeech TTS and code for making datasets from videos based on PaddleSpeech ASR. Besides, PaddleSpeech depends on a lot of open source repositories. See [references](./docs/source/reference.md) for more information. diff --git a/README_cn.md b/README_cn.md index 9782240a6de620d92e753740843ebe8e5b1ae8e8..e8494737299f6abdb079e4522f771705e8aac074 100644 --- a/README_cn.md +++ b/README_cn.md @@ -150,6 +150,12 @@ from https://github.com/18F/open-source-guide/blob/18f-pages/pages/making-readme - [PaddleSpeech 示例视频](https://paddlespeech.readthedocs.io/en/latest/demo_video.html) +- **[VTuberTalk](https://github.com/jerryuhoo/VTuberTalk): 使用 PaddleSpeech 的语音合成和语音识别从视频中克隆人声。** + +
+ +
+ ### 🔥 热门活动 - 2021.12.21~12.24 @@ -216,6 +222,17 @@ paddlespeech tts --input "你好,欢迎使用百度飞桨深度学习框架! paddlespeech text --task punc --input 今天的天气真不错啊你下午有空吗我想约你一起去吃饭 ``` +**批处理** +``` +echo -e "1 欢迎光临。\n2 谢谢惠顾。" | paddlespeech tts +``` + +**Shell管道** +ASR + Punc: +``` +paddlespeech asr --input ./zh.wav | paddlespeech text --task punc +``` + 更多命令行命令请参考 [demos](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos) > Note: 如果需要训练或者微调,请查看[语音识别](./docs/source/asr/quick_start.md), [语音合成](./docs/source/tts/quick_start.md)。 @@ -556,6 +573,10 @@ year={2021} - 非常感谢 [JiehangXie](https://github.com/JiehangXie)/[PaddleBoBo](https://github.com/JiehangXie/PaddleBoBo) 采用 PaddleSpeech 语音合成功能实现 Virtual Uploader(VUP)/Virtual YouTuber(VTuber) 虚拟主播。 - 非常感谢 [745165806](https://github.com/745165806)/[PaddleSpeechTask](https://github.com/745165806/PaddleSpeechTask) 贡献标点重建相关模型。 - 非常感谢 [kslz](https://github.com/kslz) 补充中文文档。 +- 非常感谢 [awmmmm](https://github.com/awmmmm) 提供 fastspeech2 aishell3 conformer 预训练模型。 +- 非常感谢 [phecda-xu](https://github.com/phecda-xu)/[PaddleDubbing](https://github.com/phecda-xu/PaddleDubbing) 基于 PaddleSpeech 的 TTS 模型搭建带 GUI 操作界面的配音工具。 +- 非常感谢 [jerryuhoo](https://github.com/jerryuhoo)/[VTuberTalk](https://github.com/jerryuhoo/VTuberTalk) 基于 PaddleSpeech 的 TTS GUI 界面和基于 ASR 制作数据集的相关代码。 + 此外,PaddleSpeech 依赖于许多开源存储库。有关更多信息,请参阅 [references](./docs/source/reference.md)。 diff --git a/dataset/voxceleb/voxceleb1.py b/dataset/voxceleb/voxceleb1.py index ce744751679d6e6ae756c8119cd4388adbebe404..e50c91bc169541612cc94575b85ba3794f7dbd05 100644 --- a/dataset/voxceleb/voxceleb1.py +++ b/dataset/voxceleb/voxceleb1.py @@ -80,6 +80,7 @@ parser.add_argument( args = parser.parse_args() + def create_manifest(data_dir, manifest_path_prefix): print("Creating manifest %s ..." % manifest_path_prefix) json_lines = [] @@ -128,6 +129,7 @@ def create_manifest(data_dir, manifest_path_prefix): print(f"{total_text / total_sec} text/sec", file=f) print(f"{total_sec / total_num} sec/utt", file=f) + def prepare_dataset(base_url, data_list, target_dir, manifest_path, target_data): if not os.path.exists(target_dir): @@ -164,6 +166,7 @@ def prepare_dataset(base_url, data_list, target_dir, manifest_path, # create the manifest file create_manifest(data_dir=target_dir, manifest_path_prefix=manifest_path) + def main(): if args.target_dir.startswith('~'): args.target_dir = os.path.expanduser(args.target_dir) @@ -184,5 +187,6 @@ def main(): print("Manifest prepare done!") + if __name__ == '__main__': main() diff --git a/demos/speech_recognition/.gitignore b/demos/speech_recognition/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..d8dd7532abcc65af52e9db03c516274e3d674dc1 --- /dev/null +++ b/demos/speech_recognition/.gitignore @@ -0,0 +1 @@ +*.wav diff --git a/demos/speech_recognition/README.md b/demos/speech_recognition/README.md index c49afa35c2d8027011c333eb110eb22b1d08924d..5d964fceac73f60632b2b31a750941e958b59966 100644 --- a/demos/speech_recognition/README.md +++ b/demos/speech_recognition/README.md @@ -27,6 +27,8 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee paddlespeech asr --input ./zh.wav # English paddlespeech asr --model transformer_librispeech --lang en --input ./en.wav + # Chinese ASR + Punctuation Restoration + paddlespeech asr --input ./zh.wav | paddlespeech text --task punc ``` (It doesn't matter if package `paddlespeech-ctcdecoders` is not found, this package is optional.) 
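The README lines added above demonstrate the ASR plus punctuation-restoration flow through a shell pipe. For reference, here is a minimal Python sketch of the same two-stage flow; it is not part of this patch, and the executor keyword arguments are assumptions taken from the documented CLI demos rather than a verified snippet.

```python
# Minimal sketch (not part of this patch) of the shell pipeline
#   paddlespeech asr --input ./zh.wav | paddlespeech text --task punc
# expressed through the Python API. The keyword arguments below follow
# the CLI demos and should be treated as assumptions.
from paddlespeech.cli.asr.infer import ASRExecutor
from paddlespeech.cli.text.infer import TextExecutor

asr = ASRExecutor()
punc = TextExecutor()

transcript = asr(audio_file='./zh.wav', lang='zh')  # raw transcript, no punctuation
restored = punc(text=transcript, task='punc')       # restore punctuation marks
print(restored)
```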
diff --git a/demos/speech_recognition/README_cn.md b/demos/speech_recognition/README_cn.md
index c2e38c91bc6b6374e8ab93f720b5c59330f3e05c..ba1f1d65c5ca9dec435cc1e998117238077407be 100644
--- a/demos/speech_recognition/README_cn.md
+++ b/demos/speech_recognition/README_cn.md
@@ -25,6 +25,8 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee
 paddlespeech asr --input ./zh.wav
 # 英文
 paddlespeech asr --model transformer_librispeech --lang en --input ./en.wav
+# 中文 + 标点恢复
+paddlespeech asr --input ./zh.wav | paddlespeech text --task punc
 ```
 (如果显示 `paddlespeech-ctcdecoders` 这个 python 包没有找到的 Error,没有关系,这个包是非必须的。)
diff --git a/demos/speech_recognition/run.sh b/demos/speech_recognition/run.sh
index 5efc8b81f97f818753059c6fa19e718f7f3f05ae..06466928611f51bfec65529cad5d04966bf2607a 100755
--- a/demos/speech_recognition/run.sh
+++ b/demos/speech_recognition/run.sh
@@ -1,4 +1,10 @@
 #!/bin/bash
 wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav
+
+# asr
 paddlespeech asr --input ./zh.wav
+
+
+# asr + punc
+paddlespeech asr --input ./zh.wav | paddlespeech text --task punc
\ No newline at end of file
diff --git a/demos/speech_server/README.md b/demos/speech_server/README.md
index 39007f6caacf8fa7924a2f0d74bfc734277f6a61..ac5cc4b00ac81d7c1b05ad94a4e3ceff428ff8ce 100644
--- a/demos/speech_server/README.md
+++ b/demos/speech_server/README.md
@@ -15,6 +15,17 @@ You can choose one way from easy, medium and hard to install paddlespeech.
 ### 2. Prepare config File
 The configuration file contains the service-related configuration and the model configurations for the speech tasks included in the service. They are all under the `conf` folder.
+**Note: The configuration of `engine_backend` in `application.yaml` represents all speech tasks included in the started service.**
+If the service you want to start should contain only a certain speech task, comment out the speech tasks that are not needed. For example, if you only want to use the speech recognition (ASR) service, you can comment out the speech synthesis (TTS) service, as in the following example:
+```bash
+engine_backend:
+  asr: 'conf/asr/asr.yaml'
+  #tts: 'conf/tts/tts.yaml'
+```
+
+**Note: The configuration file of `engine_backend` in `application.yaml` needs to match the configuration type of `engine_type`.**
+When the configuration file of `engine_backend` is `XXX.yaml`, the configuration type of `engine_type` needs to be set to `python`; when the configuration file of `engine_backend` is `XXX_pd.yaml`, the configuration type of `engine_type` needs to be set to `inference`.
+
 The input of the ASR client demo should be a WAV file (`.wav`), and the sample rate must be the same as the model's.
 Here are sample files for this ASR client demo that can be downloaded:
@@ -76,6 +87,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee
 ### 4. ASR Client Usage
+**Note:** The response time will be slightly longer when using the client for the first time.
 - Command Line (Recommended)
   ```
   paddlespeech_client asr --server_ip 127.0.0.1 --port 8090 --input ./zh.wav
   ```
@@ -122,6 +134,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee
   ```
 ### 5.
TTS Client Usage +**Note:** The response time will be slightly longer when using the client for the first time - Command Line (Recommended) ```bash paddlespeech_client tts --server_ip 127.0.0.1 --port 8090 --input "您好,欢迎使用百度飞桨语音合成服务。" --output output.wav @@ -147,8 +160,6 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee [2022-02-23 15:20:37,875] [ INFO] - Save synthesized audio successfully on output.wav. [2022-02-23 15:20:37,875] [ INFO] - Audio duration: 3.612500 s. [2022-02-23 15:20:37,875] [ INFO] - Response time: 0.348050 s. - [2022-02-23 15:20:37,875] [ INFO] - RTF: 0.096346 - ``` @@ -174,51 +185,13 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee Save synthesized audio successfully on ./output.wav. Audio duration: 3.612500 s. Response time: 0.388317 s. - RTF: 0.107493 ``` -## Pretrained Models +## Models supported by the service ### ASR model -Here is a list of [ASR pretrained models](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/demos/speech_recognition/README.md#4pretrained-models) released by PaddleSpeech, both command line and python interfaces are available: - -| Model | Language | Sample Rate -| :--- | :---: | :---: | -| conformer_wenetspeech| zh| 16000 -| transformer_librispeech| en| 16000 +Get all models supported by the ASR service via `paddlespeech_server stats --task asr`, where static models can be used for paddle inference inference. ### TTS model -Here is a list of [TTS pretrained models](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/demos/text_to_speech/README.md#4-pretrained-models) released by PaddleSpeech, both command line and python interfaces are available: - -- Acoustic model - | Model | Language - | :--- | :---: | - | speedyspeech_csmsc| zh - | fastspeech2_csmsc| zh - | fastspeech2_aishell3| zh - | fastspeech2_ljspeech| en - | fastspeech2_vctk| en - -- Vocoder - | Model | Language - | :--- | :---: | - | pwgan_csmsc| zh - | pwgan_aishell3| zh - | pwgan_ljspeech| en - | pwgan_vctk| en - | mb_melgan_csmsc| zh - -Here is a list of **TTS pretrained static models** released by PaddleSpeech, both command line and python interfaces are available: -- Acoustic model - | Model | Language - | :--- | :---: | - | speedyspeech_csmsc| zh - | fastspeech2_csmsc| zh - -- Vocoder - | Model | Language - | :--- | :---: | - | pwgan_csmsc| zh - | mb_melgan_csmsc| zh - | hifigan_csmsc| zh +Get all models supported by the TTS service via `paddlespeech_server stats --task tts`, where static models can be used for paddle inference inference. diff --git a/demos/speech_server/README_cn.md b/demos/speech_server/README_cn.md index f56660705800f9d2061b222ec6cd412c7319b759..f202a30cd3ee3891231f81cd789bc89712baf2ec 100644 --- a/demos/speech_server/README_cn.md +++ b/demos/speech_server/README_cn.md @@ -14,6 +14,15 @@ ### 2. 准备配置文件 配置文件包含服务相关的配置文件和服务中包含的语音任务相关的模型配置。 它们都在 `conf` 文件夹下。 +**注意:`application.yaml` 中 `engine_backend` 的配置表示启动的服务中包含的所有语音任务。** +如果你想启动的服务中只包含某项语音任务,那么你需要注释掉不需要包含的语音任务。例如你只想使用语音识别(ASR)服务,那么你可以将语音合成(TTS)服务注释掉,如下示例: +```bash +engine_backend: + asr: 'conf/asr/asr.yaml' + #tts: 'conf/tts/tts.yaml' +``` +**注意:`application.yaml` 中 `engine_backend` 的配置文件需要和 `engine_type` 的配置类型匹配。** +当`engine_backend` 的配置文件为`XXX.yaml`时,需要设置`engine_type`的配置类型为`python`;当`engine_backend` 的配置文件为`XXX_pd.yaml`时,需要设置`engine_type`的配置类型为`inference`; 这个 ASR client 的输入应该是一个 WAV 文件(`.wav`),并且采样率必须与模型的采样率相同。 @@ -75,6 +84,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee ``` ### 4. 
ASR客户端使用方法 +**注意:**初次使用客户端时响应时间会略长 - 命令行 (推荐使用) ``` paddlespeech_client asr --server_ip 127.0.0.1 --port 8090 --input ./zh.wav @@ -123,6 +133,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee ``` ### 5. TTS客户端使用方法 +**注意:**初次使用客户端时响应时间会略长 ```bash paddlespeech_client tts --server_ip 127.0.0.1 --port 8090 --input "您好,欢迎使用百度飞桨语音合成服务。" --output output.wav ``` @@ -148,7 +159,6 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee [2022-02-23 15:20:37,875] [ INFO] - Save synthesized audio successfully on output.wav. [2022-02-23 15:20:37,875] [ INFO] - Audio duration: 3.612500 s. [2022-02-23 15:20:37,875] [ INFO] - Response time: 0.348050 s. - [2022-02-23 15:20:37,875] [ INFO] - RTF: 0.096346 ``` - Python API @@ -173,50 +183,12 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee Save synthesized audio successfully on ./output.wav. Audio duration: 3.612500 s. Response time: 0.388317 s. - RTF: 0.107493 ``` -## Pretrained Models -### ASR model -下面是PaddleSpeech发布的[ASR预训练模型](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/demos/speech_recognition/README.md#4pretrained-models)列表,命令行和python接口均可用: - -| Model | Language | Sample Rate -| :--- | :---: | :---: | -| conformer_wenetspeech| zh| 16000 -| transformer_librispeech| en| 16000 - -### TTS model -下面是PaddleSpeech发布的 [TTS预训练模型](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/demos/text_to_speech/README.md#4-pretrained-models) 列表,命令行和python接口均可用: - -- Acoustic model - | Model | Language - | :--- | :---: | - | speedyspeech_csmsc| zh - | fastspeech2_csmsc| zh - | fastspeech2_aishell3| zh - | fastspeech2_ljspeech| en - | fastspeech2_vctk| en - -- Vocoder - | Model | Language - | :--- | :---: | - | pwgan_csmsc| zh - | pwgan_aishell3| zh - | pwgan_ljspeech| en - | pwgan_vctk| en - | mb_melgan_csmsc| zh - -下面是PaddleSpeech发布的 **TTS预训练静态模型** 列表,命令行和python接口均可用: -- Acoustic model - | Model | Language - | :--- | :---: | - | speedyspeech_csmsc| zh - | fastspeech2_csmsc| zh - -- Vocoder - | Model | Language - | :--- | :---: | - | pwgan_csmsc| zh - | mb_melgan_csmsc| zh - | hifigan_csmsc| zh +## 服务支持的模型 +### ASR支持的模型 +通过 `paddlespeech_server stats --task asr` 获取ASR服务支持的所有模型,其中静态模型可用于 paddle inference 推理。 + +### TTS支持的模型 +通过 `paddlespeech_server stats --task tts` 获取TTS服务支持的所有模型,其中静态模型可用于 paddle inference 推理。 diff --git a/demos/speech_server/conf/application.yaml b/demos/speech_server/conf/application.yaml index c8d71f2f6ad816e9848096e84c637c0069757594..6dcae74a944fdf129a35b991b23be5c724d5df16 100644 --- a/demos/speech_server/conf/application.yaml +++ b/demos/speech_server/conf/application.yaml @@ -3,15 +3,25 @@ ################################################################## # SERVER SETTING # ################################################################## -host: '0.0.0.0' +host: '127.0.0.1' port: 8090 ################################################################## # CONFIG FILE # ################################################################## -# add engine type (Options: asr, tts) and config file here. - +# add engine backend type (Options: asr, tts) and config file here. +# Adding a speech task to engine_backend means starting the service. engine_backend: asr: 'conf/asr/asr.yaml' tts: 'conf/tts/tts.yaml' +# The engine_type of speech task needs to keep the same type as the config file of speech task. 
+# E.g: The engine_type of asr is 'python', the engine_backend of asr is 'XX/asr.yaml' +# E.g: The engine_type of asr is 'inference', the engine_backend of asr is 'XX/asr_pd.yaml' +# +# add engine type (Options: python, inference) +engine_type: + asr: 'python' + tts: 'python' + + diff --git a/demos/speech_server/conf/asr/asr.yaml b/demos/speech_server/conf/asr/asr.yaml index 4c3b0a67e30273681fe765fc2e827f86a21ac380..a6743b77513e504f2bcd374ea8235d8e39a7c98c 100644 --- a/demos/speech_server/conf/asr/asr.yaml +++ b/demos/speech_server/conf/asr/asr.yaml @@ -1,7 +1,8 @@ model: 'conformer_wenetspeech' lang: 'zh' sample_rate: 16000 -cfg_path: -ckpt_path: +cfg_path: # [optional] +ckpt_path: # [optional] decode_method: 'attention_rescoring' -force_yes: False +force_yes: True +device: # set 'gpu:id' or 'cpu' diff --git a/demos/speech_server/conf/asr/asr_pd.yaml b/demos/speech_server/conf/asr/asr_pd.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4c415ac791edeab2d9832e8db2e9a66411aaed06 --- /dev/null +++ b/demos/speech_server/conf/asr/asr_pd.yaml @@ -0,0 +1,26 @@ +# This is the parameter configuration file for ASR server. +# These are the static models that support paddle inference. + +################################################################## +# ACOUSTIC MODEL SETTING # +# am choices=['deepspeech2offline_aishell'] TODO +################################################################## +model_type: 'deepspeech2offline_aishell' +am_model: # the pdmodel file of am static model [optional] +am_params: # the pdiparams file of am static model [optional] +lang: 'zh' +sample_rate: 16000 +cfg_path: +decode_method: +force_yes: True + +am_predictor_conf: + device: # set 'gpu:id' or 'cpu' + switch_ir_optim: True + glog_info: False # True -> print glog + summary: True # False -> do not show predictor config + + +################################################################## +# OTHERS # +################################################################## diff --git a/demos/speech_server/conf/tts/tts.yaml b/demos/speech_server/conf/tts/tts.yaml index cb4829c881efdc4802530f68abbc13f24bac4a61..19207f0b03579a906c80ba6eff356792974eeefd 100644 --- a/demos/speech_server/conf/tts/tts.yaml +++ b/demos/speech_server/conf/tts/tts.yaml @@ -29,4 +29,4 @@ voc_stat: # OTHERS # ################################################################## lang: 'zh' -device: 'gpu:2' +device: # set 'gpu:id' or 'cpu' diff --git a/demos/speech_server/conf/tts/tts_pd.yaml b/demos/speech_server/conf/tts/tts_pd.yaml index c268c6a336bb21be7879980cb3cb3c59611d64cd..e27b9665bbe1ee8b5d5c39fd3e5f87d841dd64de 100644 --- a/demos/speech_server/conf/tts/tts_pd.yaml +++ b/demos/speech_server/conf/tts/tts_pd.yaml @@ -6,8 +6,8 @@ # am choices=['speedyspeech_csmsc', 'fastspeech2_csmsc'] ################################################################## am: 'fastspeech2_csmsc' -am_model: # the pdmodel file of am static model -am_params: # the pdiparams file of am static model +am_model: # the pdmodel file of your am static model (XX.pdmodel) +am_params: # the pdiparams file of your am static model (XX.pdipparams) am_sample_rate: 24000 phones_dict: tones_dict: @@ -15,9 +15,10 @@ speaker_dict: spk_id: 0 am_predictor_conf: - use_gpu: True - enable_mkldnn: True + device: # set 'gpu:id' or 'cpu' switch_ir_optim: True + glog_info: False # True -> print glog + summary: True # False -> do not show predictor config ################################################################## @@ -25,17 +26,17 @@ am_predictor_conf: # voc 
choices=['pwgan_csmsc', 'mb_melgan_csmsc','hifigan_csmsc'] ################################################################## voc: 'pwgan_csmsc' -voc_model: # the pdmodel file of vocoder static model -voc_params: # the pdiparams file of vocoder static model +voc_model: # the pdmodel file of your vocoder static model (XX.pdmodel) +voc_params: # the pdiparams file of your vocoder static model (XX.pdipparams) voc_sample_rate: 24000 voc_predictor_conf: - use_gpu: True - enable_mkldnn: True + device: # set 'gpu:id' or 'cpu' switch_ir_optim: True + glog_info: False # True -> print glog + summary: True # False -> do not show predictor config ################################################################## # OTHERS # ################################################################## lang: 'zh' -device: paddle.get_device() diff --git a/demos/text_to_speech/README.md b/demos/text_to_speech/README.md index 9d3c4ac539a1afcd62a03c4f98b2dfe4cb622aae..2df72a82dec88ddc55505c9575721aee2de09536 100644 --- a/demos/text_to_speech/README.md +++ b/demos/text_to_speech/README.md @@ -17,11 +17,14 @@ The input of this demo should be a text of the specific language that can be pas ### 3. Usage - Command Line (Recommended) - Chinese - The default acoustic model is `Fastspeech2`, and the default vocoder is `Parallel WaveGAN`. ```bash paddlespeech tts --input "你好,欢迎使用百度飞桨深度学习框架!" ``` + - Batch Process + ```bash + echo -e "1 欢迎光临。\n2 谢谢惠顾。" | paddlespeech tts + ``` - Chinese, use `SpeedySpeech` as the acoustic model ```bash paddlespeech tts --am speedyspeech_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!" diff --git a/demos/text_to_speech/README_cn.md b/demos/text_to_speech/README_cn.md index f075efdafc1a236b4517764568b31499159c151b..7e02b962483b4b0959fa9b9fe0c082bb0a6fdc3e 100644 --- a/demos/text_to_speech/README_cn.md +++ b/demos/text_to_speech/README_cn.md @@ -24,6 +24,10 @@ ```bash paddlespeech tts --input "你好,欢迎使用百度飞桨深度学习框架!" ``` + - 批处理 + ```bash + echo -e "1 欢迎光临。\n2 谢谢惠顾。" | paddlespeech tts + ``` - 中文,使用 `SpeedySpeech` 作为声学模型 ```bash paddlespeech tts --am speedyspeech_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!" diff --git a/demos/text_to_speech/run.sh b/demos/text_to_speech/run.sh index c2487aeed38ed5b0e3bc7e5c256eff0139bcca2b..b1340241bf833129de9ae5581ada4a542253f96c 100755 --- a/demos/text_to_speech/run.sh +++ b/demos/text_to_speech/run.sh @@ -1,3 +1,7 @@ #!/bin/bash +# single process paddlespeech tts --input 今天的天气不错啊 + +# Batch process +echo -e "1 欢迎光临。\n2 谢谢惠顾。" | paddlespeech tts \ No newline at end of file diff --git a/docs/topic/ctc/ctc_loss_compare.ipynb b/docs/topic/ctc/ctc_loss_compare.ipynb index 95b2af5085b79a76ba7898089327c1b278921dac..c313710c262ea6bc2888da83e19081d80996c73f 100644 --- a/docs/topic/ctc/ctc_loss_compare.ipynb +++ b/docs/topic/ctc/ctc_loss_compare.ipynb @@ -30,12 +30,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Cloning into 'warp-ctc'...\n", - "remote: Enumerating objects: 829, done.\u001b[K\n", - "remote: Total 829 (delta 0), reused 0 (delta 0), pack-reused 829\u001b[K\n", - "Receiving objects: 100% (829/829), 388.85 KiB | 140.00 KiB/s, done.\n", - "Resolving deltas: 100% (419/419), done.\n", - "Checking connectivity... 
done.\n" + "fatal: destination path 'warp-ctc' already exists and is not an empty directory.\r\n" ] } ], @@ -99,30 +94,6 @@ "name": "stdout", "output_type": "stream", "text": [ - "-- The C compiler identification is GNU 5.4.0\n", - "-- The CXX compiler identification is GNU 5.4.0\n", - "-- Check for working C compiler: /usr/bin/cc\n", - "-- Check for working C compiler: /usr/bin/cc -- works\n", - "-- Detecting C compiler ABI info\n", - "-- Detecting C compiler ABI info - done\n", - "-- Detecting C compile features\n", - "-- Detecting C compile features - done\n", - "-- Check for working CXX compiler: /usr/bin/c++\n", - "-- Check for working CXX compiler: /usr/bin/c++ -- works\n", - "-- Detecting CXX compiler ABI info\n", - "-- Detecting CXX compiler ABI info - done\n", - "-- Detecting CXX compile features\n", - "-- Detecting CXX compile features - done\n", - "-- Looking for pthread.h\n", - "-- Looking for pthread.h - found\n", - "-- Performing Test CMAKE_HAVE_LIBC_PTHREAD\n", - "-- Performing Test CMAKE_HAVE_LIBC_PTHREAD - Failed\n", - "-- Looking for pthread_create in pthreads\n", - "-- Looking for pthread_create in pthreads - not found\n", - "-- Looking for pthread_create in pthread\n", - "-- Looking for pthread_create in pthread - found\n", - "-- Found Threads: TRUE \n", - "-- Found CUDA: /usr/local/cuda (found suitable version \"10.2\", minimum required is \"6.5\") \n", "-- cuda found TRUE\n", "-- Building shared library with GPU support\n", "-- Configuring done\n", @@ -145,20 +116,11 @@ "name": "stdout", "output_type": "stream", "text": [ - "[ 11%] \u001b[34m\u001b[1mBuilding NVCC (Device) object CMakeFiles/warpctc.dir/src/warpctc_generated_reduce.cu.o\u001b[0m\n", - "[ 22%] \u001b[34m\u001b[1mBuilding NVCC (Device) object CMakeFiles/warpctc.dir/src/warpctc_generated_ctc_entrypoint.cu.o\u001b[0m\n", - "\u001b[35m\u001b[1mScanning dependencies of target warpctc\u001b[0m\n", - "[ 33%] \u001b[32m\u001b[1mLinking CXX shared library libwarpctc.so\u001b[0m\n", + "[ 11%] \u001b[32m\u001b[1mLinking CXX shared library libwarpctc.so\u001b[0m\n", "[ 33%] Built target warpctc\n", - "[ 44%] \u001b[34m\u001b[1mBuilding NVCC (Device) object CMakeFiles/test_gpu.dir/tests/test_gpu_generated_test_gpu.cu.o\u001b[0m\n", - "\u001b[35m\u001b[1mScanning dependencies of target test_cpu\u001b[0m\n", - "[ 55%] \u001b[32mBuilding CXX object CMakeFiles/test_cpu.dir/tests/test_cpu.cpp.o\u001b[0m\n", - "[ 66%] \u001b[32mBuilding CXX object CMakeFiles/test_cpu.dir/tests/random.cpp.o\u001b[0m\n", - "[ 77%] \u001b[32m\u001b[1mLinking CXX executable test_cpu\u001b[0m\n", + "[ 44%] \u001b[32m\u001b[1mLinking CXX executable test_cpu\u001b[0m\n", + "[ 55%] \u001b[32m\u001b[1mLinking CXX executable test_gpu\u001b[0m\n", "[ 77%] Built target test_cpu\n", - "\u001b[35m\u001b[1mScanning dependencies of target test_gpu\u001b[0m\n", - "[ 88%] \u001b[32mBuilding CXX object CMakeFiles/test_gpu.dir/tests/random.cpp.o\u001b[0m\n", - "[100%] \u001b[32m\u001b[1mLinking CXX executable test_gpu\u001b[0m\n", "[100%] Built target test_gpu\n" ] } @@ -169,7 +131,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "id": "31761a31", "metadata": {}, "outputs": [ @@ -187,7 +149,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 10, "id": "f53316f6", "metadata": {}, "outputs": [ @@ -205,7 +167,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 11, "id": "084f1e49", "metadata": {}, "outputs": [ @@ -216,29 +178,20 @@ "running install\n", "running bdist_egg\n", 
"running egg_info\n", - "creating warpctc_pytorch.egg-info\n", "writing warpctc_pytorch.egg-info/PKG-INFO\n", "writing dependency_links to warpctc_pytorch.egg-info/dependency_links.txt\n", "writing top-level names to warpctc_pytorch.egg-info/top_level.txt\n", "writing manifest file 'warpctc_pytorch.egg-info/SOURCES.txt'\n", - "writing manifest file 'warpctc_pytorch.egg-info/SOURCES.txt'\n", "installing library code to build/bdist.linux-x86_64/egg\n", "running install_lib\n", "running build_py\n", - "creating build\n", - "creating build/lib.linux-x86_64-3.9\n", - "creating build/lib.linux-x86_64-3.9/warpctc_pytorch\n", - "copying warpctc_pytorch/__init__.py -> build/lib.linux-x86_64-3.9/warpctc_pytorch\n", "running build_ext\n", "building 'warpctc_pytorch._warp_ctc' extension\n", - "creating /workspace/zhanghui/DeepSpeech-2.x/docs/topic/ctc/warp-ctc/pytorch_binding/build/temp.linux-x86_64-3.9\n", - "creating /workspace/zhanghui/DeepSpeech-2.x/docs/topic/ctc/warp-ctc/pytorch_binding/build/temp.linux-x86_64-3.9/src\n", "Emitting ninja build file /workspace/zhanghui/DeepSpeech-2.x/docs/topic/ctc/warp-ctc/pytorch_binding/build/temp.linux-x86_64-3.9/build.ninja...\n", "Compiling objects...\n", "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\n", - "[1/1] c++ -MMD -MF /workspace/zhanghui/DeepSpeech-2.x/docs/topic/ctc/warp-ctc/pytorch_binding/build/temp.linux-x86_64-3.9/src/binding.o.d -pthread -B /workspace/zhanghui/DeepSpeech-2.x/tools/venv/compiler_compat -Wl,--sysroot=/ -Wno-unused-result -Wsign-compare -DNDEBUG -O2 -Wall -fPIC -O2 -isystem /workspace/zhanghui/DeepSpeech-2.x/tools/venv/include -fPIC -O2 -isystem /workspace/zhanghui/DeepSpeech-2.x/tools/venv/include -fPIC -I/workspace/zhanghui/DeepSpeech-2.x/docs/topic/ctc/warp-ctc/include -I/workspace/zhanghui/DeepSpeech-2.x/tools/venv/lib/python3.9/site-packages/torch/include -I/workspace/zhanghui/DeepSpeech-2.x/tools/venv/lib/python3.9/site-packages/torch/include/torch/csrc/api/include -I/workspace/zhanghui/DeepSpeech-2.x/tools/venv/lib/python3.9/site-packages/torch/include/TH -I/workspace/zhanghui/DeepSpeech-2.x/tools/venv/lib/python3.9/site-packages/torch/include/THC -I/usr/local/cuda/include -I/workspace/zhanghui/DeepSpeech-2.x/tools/venv/include/python3.9 -c -c /workspace/zhanghui/DeepSpeech-2.x/docs/topic/ctc/warp-ctc/pytorch_binding/src/binding.cpp -o /workspace/zhanghui/DeepSpeech-2.x/docs/topic/ctc/warp-ctc/pytorch_binding/build/temp.linux-x86_64-3.9/src/binding.o -std=c++14 -fPIC -DWARPCTC_ENABLE_GPU -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE=\"_gcc\"' '-DPYBIND11_STDLIB=\"_libstdcpp\"' '-DPYBIND11_BUILD_ABI=\"_cxxabi1011\"' -DTORCH_EXTENSION_NAME=_warp_ctc -D_GLIBCXX_USE_CXX11_ABI=0\n", + "ninja: no work to do.\n", "g++ -pthread -B /workspace/zhanghui/DeepSpeech-2.x/tools/venv/compiler_compat -Wl,--sysroot=/ -shared -Wl,-rpath,/workspace/zhanghui/DeepSpeech-2.x/tools/venv/lib -Wl,-rpath-link,/workspace/zhanghui/DeepSpeech-2.x/tools/venv/lib -L/workspace/zhanghui/DeepSpeech-2.x/tools/venv/lib -Wl,-rpath,/workspace/zhanghui/DeepSpeech-2.x/tools/venv/lib -Wl,-rpath-link,/workspace/zhanghui/DeepSpeech-2.x/tools/venv/lib -L/workspace/zhanghui/DeepSpeech-2.x/tools/venv/lib /workspace/zhanghui/DeepSpeech-2.x/docs/topic/ctc/warp-ctc/pytorch_binding/build/temp.linux-x86_64-3.9/src/binding.o -L/workspace/zhanghui/DeepSpeech-2.x/docs/topic/ctc/warp-ctc/build -L/workspace/zhanghui/DeepSpeech-2.x/tools/venv/lib/python3.9/site-packages/torch/lib 
-L/usr/local/cuda/lib64 -lwarpctc -lc10 -ltorch -ltorch_cpu -ltorch_python -lcudart -lc10_cuda -ltorch_cuda -o build/lib.linux-x86_64-3.9/warpctc_pytorch/_warp_ctc.cpython-39-x86_64-linux-gnu.so -Wl,-rpath,/workspace/zhanghui/DeepSpeech-2.x/docs/topic/ctc/warp-ctc/build\n", - "creating build/bdist.linux-x86_64\n", "creating build/bdist.linux-x86_64/egg\n", "creating build/bdist.linux-x86_64/egg/warpctc_pytorch\n", "copying build/lib.linux-x86_64-3.9/warpctc_pytorch/__init__.py -> build/bdist.linux-x86_64/egg/warpctc_pytorch\n", @@ -254,7 +207,6 @@ "writing build/bdist.linux-x86_64/egg/EGG-INFO/native_libs.txt\n", "zip_safe flag not set; analyzing archive contents...\n", "warpctc_pytorch.__pycache__._warp_ctc.cpython-39: module references __file__\n", - "creating dist\n", "creating 'dist/warpctc_pytorch-0.1-py3.9-linux-x86_64.egg' and adding 'build/bdist.linux-x86_64/egg' to it\n", "removing 'build/bdist.linux-x86_64/egg' (and everything under it)\n", "Processing warpctc_pytorch-0.1-py3.9-linux-x86_64.egg\n", @@ -275,7 +227,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 12, "id": "ee4ca9e3", "metadata": {}, "outputs": [ @@ -293,7 +245,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 13, "id": "59255ed8", "metadata": {}, "outputs": [ @@ -311,21 +263,14 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 22, "id": "1dae09b9", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "grep: warning: GREP_OPTIONS is deprecated; please use an alias or script\n" - ] - } - ], + "outputs": [], "source": [ "import torch\n", "import torch.nn as nn\n", + "import torch.nn.functional as F\n", "import warpctc_pytorch as wp\n", "import paddle.nn as pn\n", "import paddle" @@ -333,7 +278,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 15, "id": "83d0762e", "metadata": {}, "outputs": [ @@ -343,7 +288,7 @@ "'1.10.0+cu102'" ] }, - "execution_count": 16, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -354,17 +299,17 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 16, "id": "62501e2c", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "'2.2.0'" + "'2.2.1'" ] }, - "execution_count": 17, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -375,7 +320,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 17, "id": "9e8e0f40", "metadata": {}, "outputs": [ @@ -392,6 +337,7 @@ } ], "source": [ + "# warpctc_pytorch CTCLoss\n", "probs = torch.FloatTensor([[\n", " [0.1, 0.6, 0.1, 0.1, 0.1], [0.1, 0.1, 0.6, 0.1, 0.1]\n", " ]]).transpose(0, 1).contiguous()\n", @@ -412,7 +358,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 18, "id": "2cd46569", "metadata": {}, "outputs": [ @@ -428,6 +374,7 @@ } ], "source": [ + "# pytorch CTCLoss\n", "probs = torch.FloatTensor([[\n", " [0.1, 0.6, 0.1, 0.1, 0.1], [0.1, 0.1, 0.6, 0.1, 0.1]\n", " ]]).transpose(0, 1).contiguous()\n", @@ -449,7 +396,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 27, "id": "85c3461a", "metadata": {}, "outputs": [ @@ -467,6 +414,7 @@ } ], "source": [ + "# Paddle CTCLoss\n", "paddle.set_device('cpu')\n", "probs = paddle.to_tensor([[\n", " [0.1, 0.6, 0.1, 0.1, 0.1], [0.1, 0.1, 0.6, 0.1, 0.1],\n", @@ -490,7 +438,55 @@ { "cell_type": "code", "execution_count": null, - "id": "d390cd91", + "id": "8cdf76c2", + "metadata": {}, + "outputs": [], + "source": 
[] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "2c305eaf", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "torch.Size([2, 1, 5])\n", + "2.4628584384918213\n", + "[[[ 0.17703117 -0.7081247 0.17703117 0.17703117 0.17703117]]\n", + "\n", + " [[ 0.17703117 0.17703117 -0.7081247 0.17703117 0.17703117]]]\n" + ] + } + ], + "source": [ + "# warpctc_pytorch CTCLoss, log_softmax idempotent\n", + "probs = torch.FloatTensor([[\n", + " [0.1, 0.6, 0.1, 0.1, 0.1], [0.1, 0.1, 0.6, 0.1, 0.1]\n", + " ]]).transpose(0, 1).contiguous()\n", + "print(probs.size())\n", + "labels = torch.IntTensor([1, 2])\n", + "label_sizes = torch.IntTensor([2])\n", + "probs_sizes = torch.IntTensor([2])\n", + "probs.requires_grad_(True)\n", + "bs = probs.size(1)\n", + "\n", + "ctc_loss = wp.CTCLoss(size_average=False, length_average=False)\n", + "\n", + "log_probs = torch.log_softmax(probs, axis=-1)\n", + "cost = ctc_loss(log_probs, labels, probs_sizes, label_sizes)\n", + "cost = cost.sum() / bs\n", + "print(cost.item())\n", + "cost.backward()\n", + "print(probs.grad.numpy())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "443336f0", "metadata": {}, "outputs": [], "source": [] diff --git a/docs/topic/ctc/ctc_loss_speed_compare.ipynb b/docs/topic/ctc/ctc_loss_speed_compare.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..eb7a030c7e6be03e43016d8a47aa049ea3e40eee --- /dev/null +++ b/docs/topic/ctc/ctc_loss_speed_compare.ipynb @@ -0,0 +1,369 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "a1e738e0", + "metadata": {}, + "source": [ + "## 获取测试的 logit 数据" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "29d3368b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "hlens.npy\n", + "logits.npy\n", + "ys_lens.npy\n", + "ys_pad.npy\n" + ] + } + ], + "source": [ + "!mkdir -p ./test_data\n", + "!test -f ./test_data/ctc_loss_compare_data.tgz || wget -P ./test_data https://paddlespeech.bj.bcebos.com/datasets/unit_test/asr/ctc_loss_compare_data.tgz\n", + "!tar xzvf test_data/ctc_loss_compare_data.tgz -C ./test_data\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "240caf1d", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import numpy as np\n", + "import time\n", + "\n", + "data_dir=\"./test_data\"\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "91bad949", + "metadata": {}, + "outputs": [], + "source": [ + "logits_np = np.load(os.path.join(data_dir, \"logits.npy\"))\n", + "ys_pad_np = np.load(os.path.join(data_dir, \"ys_pad.npy\"))\n", + "hlens_np = np.load(os.path.join(data_dir, \"hlens.npy\"))\n", + "ys_lens_np = np.load(os.path.join(data_dir, \"ys_lens.npy\"))" + ] + }, + { + "cell_type": "markdown", + "id": "4cef2f15", + "metadata": {}, + "source": [ + "## 使用 torch 的 ctc loss" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "90612004", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'1.10.1+cu102'" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import torch\n", + "torch.__version__" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "00799f97", + "metadata": {}, + "outputs": [], + "source": [ + "def torch_ctc_loss(use_cpu):\n", + " if use_cpu:\n", + " device = torch.device(\"cpu\")\n", + " else:\n", + " device = torch.device(\"cuda\")\n", + "\n", + " 
reduction_type = \"sum\" \n", + "\n", + " ctc_loss = torch.nn.CTCLoss(reduction=reduction_type)\n", + "\n", + " ys_hat = torch.tensor(logits_np, device = device)\n", + " ys_pad = torch.tensor(ys_pad_np, device = device)\n", + " hlens = torch.tensor(hlens_np, device = device)\n", + " ys_lens = torch.tensor(ys_lens_np, device = device)\n", + "\n", + " ys_hat = ys_hat.transpose(0, 1)\n", + " \n", + " # 开始计算时间\n", + " start_time = time.time()\n", + " ys_hat = ys_hat.log_softmax(2)\n", + " loss = ctc_loss(ys_hat, ys_pad, hlens, ys_lens)\n", + " end_time = time.time()\n", + " \n", + " loss = loss / ys_hat.size(1)\n", + " return end_time - start_time, loss.item()" + ] + }, + { + "cell_type": "markdown", + "id": "ba47b5a4", + "metadata": {}, + "source": [ + "## 使用 paddle 的 ctc loss" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "6882a06e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'2.2.2'" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import paddle\n", + "paddle.__version__" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "3cfa3b7c", + "metadata": {}, + "outputs": [], + "source": [ + "def paddle_ctc_loss(use_cpu): \n", + " import paddle.nn as pn\n", + " if use_cpu:\n", + " device = \"cpu\"\n", + " else:\n", + " device = \"gpu\"\n", + "\n", + " paddle.set_device(device)\n", + "\n", + " logits = paddle.to_tensor(logits_np)\n", + " ys_pad = paddle.to_tensor(ys_pad_np,dtype='int32')\n", + " hlens = paddle.to_tensor(hlens_np, dtype='int64')\n", + " ys_lens = paddle.to_tensor(ys_lens_np, dtype='int64')\n", + "\n", + " logits = logits.transpose([1,0,2])\n", + "\n", + " ctc_loss = pn.CTCLoss(reduction='sum')\n", + " # 开始计算时间\n", + " start_time = time.time()\n", + " pn_loss = ctc_loss(logits, ys_pad, hlens, ys_lens)\n", + " end_time = time.time()\n", + " \n", + " pn_loss = pn_loss / logits.shape[1]\n", + " return end_time - start_time, pn_loss.item()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "40413ef9", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU, iteration 10\n", + "torch_ctc_loss 159.17137145996094\n", + "paddle_ctc_loss 159.16574096679688\n", + "paddle average time 1.718252992630005\n", + "torch average time 0.17536230087280275\n", + "paddle time / torch time (cpu) 9.798303193320452\n", + "\n", + "GPU, iteration 10\n", + "torch_ctc_loss 159.172119140625\n", + "paddle_ctc_loss 159.17205810546875\n", + "paddle average time 0.018606925010681154\n", + "torch average time 0.0026710033416748047\n", + "paddle time / torch time (gpu) 6.966267963938231\n" + ] + } + ], + "source": [ + "# 使用 CPU\n", + "\n", + "iteration = 10\n", + "use_cpu = True\n", + "torch_total_time = 0\n", + "paddle_total_time = 0\n", + "for _ in range(iteration):\n", + " cost_time, torch_loss = torch_ctc_loss(use_cpu)\n", + " torch_total_time += cost_time\n", + "for _ in range(iteration):\n", + " cost_time, paddle_loss = paddle_ctc_loss(use_cpu)\n", + " paddle_total_time += cost_time\n", + "print (\"CPU, iteration\", iteration)\n", + "print (\"torch_ctc_loss\", torch_loss)\n", + "print (\"paddle_ctc_loss\", paddle_loss)\n", + "print (\"paddle average time\", paddle_total_time / iteration)\n", + "print (\"torch average time\", torch_total_time / iteration)\n", + "print (\"paddle time / torch time (cpu)\" , paddle_total_time/ torch_total_time)\n", + "\n", + "print (\"\")\n", + "\n", + "# 使用 GPU\n", + "\n", + "use_cpu = 
False\n", + "torch_total_time = 0\n", + "paddle_total_time = 0\n", + "for _ in range(iteration):\n", + " cost_time, torch_loss = torch_ctc_loss(use_cpu)\n", + " torch_total_time += cost_time\n", + "for _ in range(iteration):\n", + " cost_time, paddle_loss = paddle_ctc_loss(use_cpu)\n", + " paddle_total_time += cost_time\n", + "print (\"GPU, iteration\", iteration)\n", + "print (\"torch_ctc_loss\", torch_loss)\n", + "print (\"paddle_ctc_loss\", paddle_loss)\n", + "print (\"paddle average time\", paddle_total_time / iteration)\n", + "print (\"torch average time\", torch_total_time / iteration)\n", + "print (\"paddle time / torch time (gpu)\" , paddle_total_time/ torch_total_time)" + ] + }, + { + "cell_type": "markdown", + "id": "7cdf8697", + "metadata": {}, + "source": [ + "## 其他: 使用 PaddleSpeech 中的 ctcloss 查一下loss值" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "73fad81d", + "metadata": {}, + "outputs": [], + "source": [ + "logits_np = np.load(os.path.join(data_dir, \"logits.npy\"))\n", + "ys_pad_np = np.load(os.path.join(data_dir, \"ys_pad.npy\"))\n", + "hlens_np = np.load(os.path.join(data_dir, \"hlens.npy\"))\n", + "ys_lens_np = np.load(os.path.join(data_dir, \"ys_lens.npy\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "2b41e45d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2022-02-25 11:34:34.143 | INFO | paddlespeech.s2t.modules.loss:__init__:41 - CTCLoss Loss reduction: sum, div-bs: True\n", + "2022-02-25 11:34:34.143 | INFO | paddlespeech.s2t.modules.loss:__init__:42 - CTCLoss Grad Norm Type: instance\n", + "2022-02-25 11:34:34.144 | INFO | paddlespeech.s2t.modules.loss:__init__:73 - CTCLoss() kwargs:{'norm_by_times': True}, not support: {'norm_by_batchsize': False, 'norm_by_total_logits_len': False}\n", + "loss 159.17205810546875\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/root/miniconda3/lib/python3.7/site-packages/paddle/fluid/dygraph/math_op_patch.py:253: UserWarning: The dtype of left and right variables are not the same, left dtype is paddle.float32, but right dtype is paddle.int32, the right dtype will convert to paddle.float32\n", + " format(lhs_dtype, rhs_dtype, lhs_dtype))\n" + ] + } + ], + "source": [ + "use_cpu = False\n", + "\n", + "from paddlespeech.s2t.modules.loss import CTCLoss\n", + "\n", + "if use_cpu:\n", + " device = \"cpu\"\n", + "else:\n", + " device = \"gpu\"\n", + "\n", + "paddle.set_device(device)\n", + "\n", + "blank_id=0\n", + "reduction_type='sum'\n", + "batch_average= True\n", + "grad_norm_type='instance'\n", + "\n", + "criterion = CTCLoss(\n", + " blank=blank_id,\n", + " reduction=reduction_type,\n", + " batch_average=batch_average,\n", + " grad_norm_type=grad_norm_type)\n", + "\n", + "logits = paddle.to_tensor(logits_np)\n", + "ys_pad = paddle.to_tensor(ys_pad_np,dtype='int32')\n", + "hlens = paddle.to_tensor(hlens_np, dtype='int64')\n", + "ys_lens = paddle.to_tensor(ys_lens_np, dtype='int64')\n", + "\n", + "pn_ctc_loss = criterion(logits, ys_pad, hlens, ys_lens)\n", + "print(\"loss\", pn_ctc_loss.item())\n", + " " + ] + }, + { + "cell_type": "markdown", + "id": "de525d38", + "metadata": {}, + "source": [ + "## 结论\n", + "在 CPU 环境下: torch 的 CTC loss 的计算速度是 paddle 的 9.8 倍 \n", + "在 GPU 环境下: torch 的 CTC loss 的计算速度是 paddle 的 6.87 倍\n", + "\n", + "## 其他结论\n", + "torch 的 ctc loss 在 CPU 和 GPU 下 都没有完全对齐。其中CPU的前向对齐精度大约为 1e-2。 GPU 的前向对齐精度大约为 1e-4 。" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 
(ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/examples/aishell3/tts3/README.md b/examples/aishell3/tts3/README.md
index 281ad836b0144e6bb14e4b8278bfaceb026b65b4..d02ad1b6373c26f0cd0ffa4d58c3bd4af57f9e72 100644
--- a/examples/aishell3/tts3/README.md
+++ b/examples/aishell3/tts3/README.md
@@ -225,7 +225,9 @@ optional arguments:
 9. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu.
 
 ## Pretrained Model
-Pretrained FastSpeech2 model with no silence in the edge of audios. [fastspeech2_nosil_aishell3_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_aishell3_ckpt_0.4.zip)
+Pretrained FastSpeech2 model with no silence at the edge of audios:
+- [fastspeech2_nosil_aishell3_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_aishell3_ckpt_0.4.zip)
+- [fastspeech2_conformer_aishell3_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_conformer_aishell3_ckpt_0.2.0.zip) (Thanks to [@awmmmm](https://github.com/awmmmm) for the contribution)
 
 FastSpeech2 checkpoint contains files listed below.
diff --git a/examples/aishell3/tts3/conf/conformer.yaml b/examples/aishell3/tts3/conf/conformer.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..ea73593d77a3117e8b46baab9785bd576a66a093
--- /dev/null
+++ b/examples/aishell3/tts3/conf/conformer.yaml
@@ -0,0 +1,110 @@
+###########################################################
+#                FEATURE EXTRACTION SETTING               #
+###########################################################
+
+fs: 24000          # sr
+n_fft: 2048        # FFT size (samples).
+n_shift: 300       # Hop size (samples). 12.5ms
+win_length: 1200   # Window length (samples). 50ms
+                   # If set to null, it will be the same as fft_size.
+window: "hann"     # Window function.
+
+# Only used for feats_type != raw
+
+fmin: 80           # Minimum frequency of Mel basis.
+fmax: 7600         # Maximum frequency of Mel basis.
+n_mels: 80         # The number of mel basis.
+
+# Only used for the model using pitch features (e.g. FastSpeech2)
+f0min: 80          # Minimum f0 for pitch extraction.
+f0max: 400         # Maximum f0 for pitch extraction.
+ + +########################################################### +# DATA SETTING # +########################################################### +batch_size: 32 +num_workers: 4 + + +########################################################### +# MODEL SETTING # +########################################################### +model: + adim: 384 # attention dimension + aheads: 2 # number of attention heads + elayers: 4 # number of encoder layers + eunits: 1536 # number of encoder ff units + dlayers: 4 # number of decoder layers + dunits: 1536 # number of decoder ff units + positionwise_layer_type: conv1d # type of position-wise layer + positionwise_conv_kernel_size: 3 # kernel size of position wise conv layer + duration_predictor_layers: 2 # number of layers of duration predictor + duration_predictor_chans: 256 # number of channels of duration predictor + duration_predictor_kernel_size: 3 # filter size of duration predictor + postnet_layers: 5 # number of layers of postnset + postnet_filts: 5 # filter size of conv layers in postnet + postnet_chans: 256 # number of channels of conv layers in postnet + encoder_normalize_before: True # whether to perform layer normalization before the input + decoder_normalize_before: True # whether to perform layer normalization before the input + reduction_factor: 1 # reduction factor + encoder_type: conformer # encoder type + decoder_type: conformer # decoder type + conformer_pos_enc_layer_type: rel_pos # conformer positional encoding type + conformer_self_attn_layer_type: rel_selfattn # conformer self-attention type + conformer_activation_type: swish # conformer activation type + use_macaron_style_in_conformer: true # whether to use macaron style in conformer + use_cnn_in_conformer: true # whether to use CNN in conformer + conformer_enc_kernel_size: 7 # kernel size in CNN module of conformer-based encoder + conformer_dec_kernel_size: 31 # kernel size in CNN module of conformer-based decoder + init_type: xavier_uniform # initialization type + transformer_enc_dropout_rate: 0.2 # dropout rate for transformer encoder layer + transformer_enc_positional_dropout_rate: 0.2 # dropout rate for transformer encoder positional encoding + transformer_enc_attn_dropout_rate: 0.2 # dropout rate for transformer encoder attention layer + transformer_dec_dropout_rate: 0.2 # dropout rate for transformer decoder layer + transformer_dec_positional_dropout_rate: 0.2 # dropout rate for transformer decoder positional encoding + transformer_dec_attn_dropout_rate: 0.2 # dropout rate for transformer decoder attention layer + pitch_predictor_layers: 5 # number of conv layers in pitch predictor + pitch_predictor_chans: 256 # number of channels of conv layers in pitch predictor + pitch_predictor_kernel_size: 5 # kernel size of conv leyers in pitch predictor + pitch_predictor_dropout: 0.5 # dropout rate in pitch predictor + pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch + pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch + stop_gradient_from_pitch_predictor: true # whether to stop the gradient from pitch predictor to encoder + energy_predictor_layers: 2 # number of conv layers in energy predictor + energy_predictor_chans: 256 # number of channels of conv layers in energy predictor + energy_predictor_kernel_size: 3 # kernel size of conv leyers in energy predictor + energy_predictor_dropout: 0.5 # dropout rate in energy predictor + energy_embed_kernel_size: 1 # kernel size of conv embedding layer for energy + energy_embed_dropout: 0.0 # 
dropout rate after conv embedding layer for energy + stop_gradient_from_energy_predictor: false # whether to stop the gradient from energy predictor to encoder + spk_embed_dim: 256 # speaker embedding dimension + spk_embed_integration_type: concat # speaker embedding integration type + + +########################################################### +# UPDATER SETTING # +########################################################### +updater: + use_masking: True # whether to apply masking for padded part in loss calculation + + + +########################################################### +# OPTIMIZER SETTING # +########################################################### +optimizer: + optim: adam # optimizer type + learning_rate: 0.001 # learning rate + +########################################################### +# TRAINING SETTING # +########################################################### +max_epoch: 1000 +num_snapshots: 5 + + +########################################################### +# OTHER SETTING # +########################################################### +seed: 10086 diff --git a/examples/ami/sd0/local/ami_prepare.py b/examples/ami/sd0/local/ami_prepare.py index b7bb8e67eda82bc5f33e520e60c2b90aa1c22509..d03810a777aab5d5dcd85d25ea34a1ad59db3f6f 100644 --- a/examples/ami/sd0/local/ami_prepare.py +++ b/examples/ami/sd0/local/ami_prepare.py @@ -22,19 +22,17 @@ Authors * qingenz123@126.com (Qingen ZHAO) 2022 """ - -import os -import logging import argparse -import xml.etree.ElementTree as et import glob import json -from ami_splits import get_AMI_split +import logging +import os +import xml.etree.ElementTree as et from distutils.util import strtobool -from dataio import ( - load_pkl, - save_pkl, ) +from ami_splits import get_AMI_split +from dataio import load_pkl +from dataio import save_pkl logger = logging.getLogger(__name__) SAMPLERATE = 16000 diff --git a/examples/csmsc/tts0/local/synthesize.sh b/examples/csmsc/tts0/local/synthesize.sh index 4be06dd80558945d70ddaa65e48a0341bfe4372b..5b8ed15e5271e5d4e6baaaa02c7e7b44046e6b72 100755 --- a/examples/csmsc/tts0/local/synthesize.sh +++ b/examples/csmsc/tts0/local/synthesize.sh @@ -3,18 +3,98 @@ config_path=$1 train_output_path=$2 ckpt_name=$3 +stage=0 +stop_stage=0 -FLAGS_allocator_strategy=naive_best_fit \ -FLAGS_fraction_of_gpu_memory_to_use=0.01 \ -python3 ${BIN_DIR}/../synthesize.py \ - --am=tacotron2_csmsc \ - --am_config=${config_path} \ - --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ - --am_stat=dump/train/speech_stats.npy \ - --voc=pwgan_csmsc \ - --voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \ - --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \ - --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \ - --test_metadata=dump/test/norm/metadata.jsonl \ - --output_dir=${train_output_path}/test \ - --phones_dict=dump/phone_id_map.txt +# pwgan +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + FLAGS_allocator_strategy=naive_best_fit \ + FLAGS_fraction_of_gpu_memory_to_use=0.01 \ + python3 ${BIN_DIR}/../synthesize.py \ + --am=tacotron2_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/speech_stats.npy \ + --voc=pwgan_csmsc \ + --voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \ + --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \ + --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \ + --test_metadata=dump/test/norm/metadata.jsonl \ + --output_dir=${train_output_path}/test \ + --phones_dict=dump/phone_id_map.txt +fi + +# for more GAN 
Vocoders +# multi band melgan +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + FLAGS_allocator_strategy=naive_best_fit \ + FLAGS_fraction_of_gpu_memory_to_use=0.01 \ + python3 ${BIN_DIR}/../synthesize.py \ + --am=tacotron2_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/speech_stats.npy \ + --voc=mb_melgan_csmsc \ + --voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \ + --voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\ + --voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \ + --test_metadata=dump/test/norm/metadata.jsonl \ + --output_dir=${train_output_path}/test \ + --phones_dict=dump/phone_id_map.txt +fi + +# style melgan +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + FLAGS_allocator_strategy=naive_best_fit \ + FLAGS_fraction_of_gpu_memory_to_use=0.01 \ + python3 ${BIN_DIR}/../synthesize.py \ + --am=tacotron2_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/speech_stats.npy \ + --voc=style_melgan_csmsc \ + --voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \ + --voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \ + --voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \ + --test_metadata=dump/test/norm/metadata.jsonl \ + --output_dir=${train_output_path}/test \ + --phones_dict=dump/phone_id_map.txt +fi + +# hifigan +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + echo "in hifigan syn" + FLAGS_allocator_strategy=naive_best_fit \ + FLAGS_fraction_of_gpu_memory_to_use=0.01 \ + python3 ${BIN_DIR}/../synthesize.py \ + --am=tacotron2_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/speech_stats.npy \ + --voc=hifigan_csmsc \ + --voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \ + --voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \ + --voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \ + --test_metadata=dump/test/norm/metadata.jsonl \ + --output_dir=${train_output_path}/test \ + --phones_dict=dump/phone_id_map.txt +fi + +# wavernn +if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then + echo "in wavernn syn" + FLAGS_allocator_strategy=naive_best_fit \ + FLAGS_fraction_of_gpu_memory_to_use=0.01 \ + python3 ${BIN_DIR}/../synthesize.py \ + --am=tacotron2_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/speech_stats.npy \ + --voc=wavernn_csmsc \ + --voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \ + --voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \ + --voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \ + --test_metadata=dump/test/norm/metadata.jsonl \ + --output_dir=${train_output_path}/test \ + --phones_dict=dump/phone_id_map.txt +fi diff --git a/examples/csmsc/tts0/local/synthesize_e2e.sh b/examples/csmsc/tts0/local/synthesize_e2e.sh index 79bb9f833e8db89baddbadaccce985a49f9629eb..f7675873386b3a28f720e26d18a36fdcc092c75c 100755 --- a/examples/csmsc/tts0/local/synthesize_e2e.sh +++ b/examples/csmsc/tts0/local/synthesize_e2e.sh @@ -8,6 +8,7 @@ stage=0 stop_stage=0 # TODO: tacotron2 动转静的结果没有静态图的响亮, 可能还是 decode 的时候某个函数动静不对齐 +# pwgan if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then FLAGS_allocator_strategy=naive_best_fit \ FLAGS_fraction_of_gpu_memory_to_use=0.01 \ @@ -39,14 +40,14 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ 
--am_stat=dump/train/speech_stats.npy \ --voc=mb_melgan_csmsc \ - --voc_config=mb_melgan_baker_finetune_ckpt_0.5/finetune.yaml \ - --voc_ckpt=mb_melgan_baker_finetune_ckpt_0.5/snapshot_iter_2000000.pdz\ - --voc_stat=mb_melgan_baker_finetune_ckpt_0.5/feats_stats.npy \ + --voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \ + --voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\ + --voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \ --lang=zh \ --text=${BIN_DIR}/../sentences.txt \ --output_dir=${train_output_path}/test_e2e \ - --inference_dir=${train_output_path}/inference \ - --phones_dict=dump/phone_id_map.txt + --phones_dict=dump/phone_id_map.txt \ + --inference_dir=${train_output_path}/inference fi # the pretrained models haven't release now @@ -88,8 +89,8 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then --lang=zh \ --text=${BIN_DIR}/../sentences.txt \ --output_dir=${train_output_path}/test_e2e \ - --inference_dir=${train_output_path}/inference \ - --phones_dict=dump/phone_id_map.txt + --phones_dict=dump/phone_id_map.txt \ + --inference_dir=${train_output_path}/inference fi # wavernn @@ -111,4 +112,4 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then --output_dir=${train_output_path}/test_e2e \ --phones_dict=dump/phone_id_map.txt \ --inference_dir=${train_output_path}/inference -fi \ No newline at end of file +fi diff --git a/examples/csmsc/tts2/local/synthesize.sh b/examples/csmsc/tts2/local/synthesize.sh index cedc9717d774666f94f43a8b91c8cafd6e2ad6c3..37b2981831e0ec8bd4d89b106d41a6e9fc5bbe47 100755 --- a/examples/csmsc/tts2/local/synthesize.sh +++ b/examples/csmsc/tts2/local/synthesize.sh @@ -1,20 +1,105 @@ #!/bin/bash + config_path=$1 train_output_path=$2 ckpt_name=$3 +stage=0 +stop_stage=0 + +# pwgan +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + FLAGS_allocator_strategy=naive_best_fit \ + FLAGS_fraction_of_gpu_memory_to_use=0.01 \ + python3 ${BIN_DIR}/../synthesize.py \ + --am=speedyspeech_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/speech_stats.npy \ + --voc=pwgan_csmsc \ + --voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \ + --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \ + --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \ + --test_metadata=dump/test/norm/metadata.jsonl \ + --output_dir=${train_output_path}/test \ + --phones_dict=dump/phone_id_map.txt \ + --tones_dict=dump/tone_id_map.txt +fi + +# for more GAN Vocoders +# multi band melgan +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + FLAGS_allocator_strategy=naive_best_fit \ + FLAGS_fraction_of_gpu_memory_to_use=0.01 \ + python3 ${BIN_DIR}/../synthesize.py \ + --am=speedyspeech_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/speech_stats.npy \ + --voc=mb_melgan_csmsc \ + --voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \ + --voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\ + --voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \ + --test_metadata=dump/test/norm/metadata.jsonl \ + --output_dir=${train_output_path}/test \ + --phones_dict=dump/phone_id_map.txt \ + --tones_dict=dump/tone_id_map.txt +fi + +# style melgan +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + FLAGS_allocator_strategy=naive_best_fit \ + FLAGS_fraction_of_gpu_memory_to_use=0.01 \ + python3 ${BIN_DIR}/../synthesize.py \ + --am=speedyspeech_csmsc \ + --am_config=${config_path} \ + 
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/speech_stats.npy \ + --voc=style_melgan_csmsc \ + --voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \ + --voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \ + --voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \ + --test_metadata=dump/test/norm/metadata.jsonl \ + --output_dir=${train_output_path}/test \ + --phones_dict=dump/phone_id_map.txt \ + --tones_dict=dump/tone_id_map.txt +fi + +# hifigan +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + echo "in hifigan syn" + FLAGS_allocator_strategy=naive_best_fit \ + FLAGS_fraction_of_gpu_memory_to_use=0.01 \ + python3 ${BIN_DIR}/../synthesize.py \ + --am=speedyspeech_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/speech_stats.npy \ + --voc=hifigan_csmsc \ + --voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \ + --voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \ + --voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \ + --test_metadata=dump/test/norm/metadata.jsonl \ + --output_dir=${train_output_path}/test \ + --phones_dict=dump/phone_id_map.txt \ + --tones_dict=dump/tone_id_map.txt +fi -FLAGS_allocator_strategy=naive_best_fit \ -FLAGS_fraction_of_gpu_memory_to_use=0.01 \ -python3 ${BIN_DIR}/../synthesize.py \ - --am=speedyspeech_csmsc \ - --am_config=${config_path} \ - --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ - --am_stat=dump/train/feats_stats.npy \ - --voc=pwgan_csmsc \ - --voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \ - --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \ - --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \ - --test_metadata=dump/test/norm/metadata.jsonl \ - --output_dir=${train_output_path}/test \ - --phones_dict=dump/phone_id_map.txt \ - --tones_dict=dump/tone_id_map.txt \ No newline at end of file +# wavernn +if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then + echo "in wavernn syn" + FLAGS_allocator_strategy=naive_best_fit \ + FLAGS_fraction_of_gpu_memory_to_use=0.01 \ + python3 ${BIN_DIR}/../synthesize.py \ + --am=speedyspeech_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/speech_stats.npy \ + --voc=wavernn_csmsc \ + --voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \ + --voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \ + --voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \ + --test_metadata=dump/test/norm/metadata.jsonl \ + --output_dir=${train_output_path}/test \ + --tones_dict=dump/tone_id_map.txt \ + --phones_dict=dump/phone_id_map.txt +fi diff --git a/examples/csmsc/tts2/local/synthesize_e2e.sh b/examples/csmsc/tts2/local/synthesize_e2e.sh index 35fcf251850b3c62bacc6e51e3df5af0ffab5cce..553b45543faea1c0eca1b4fcaaa89cffe447334f 100755 --- a/examples/csmsc/tts2/local/synthesize_e2e.sh +++ b/examples/csmsc/tts2/local/synthesize_e2e.sh @@ -7,6 +7,7 @@ ckpt_name=$3 stage=0 stop_stage=0 +# pwgan if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then FLAGS_allocator_strategy=naive_best_fit \ FLAGS_fraction_of_gpu_memory_to_use=0.01 \ @@ -22,9 +23,9 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then --lang=zh \ --text=${BIN_DIR}/../sentences.txt \ --output_dir=${train_output_path}/test_e2e \ - --inference_dir=${train_output_path}/inference \ --phones_dict=dump/phone_id_map.txt \ - --tones_dict=dump/tone_id_map.txt + --tones_dict=dump/tone_id_map.txt \ + --inference_dir=${train_output_path}/inference fi 
# for more GAN Vocoders @@ -44,9 +45,9 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then --lang=zh \ --text=${BIN_DIR}/../sentences.txt \ --output_dir=${train_output_path}/test_e2e \ - --inference_dir=${train_output_path}/inference \ --phones_dict=dump/phone_id_map.txt \ - --tones_dict=dump/tone_id_map.txt + --tones_dict=dump/tone_id_map.txt \ + --inference_dir=${train_output_path}/inference fi # the pretrained models haven't release now @@ -88,12 +89,11 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then --lang=zh \ --text=${BIN_DIR}/../sentences.txt \ --output_dir=${train_output_path}/test_e2e \ - --inference_dir=${train_output_path}/inference \ --phones_dict=dump/phone_id_map.txt \ - --tones_dict=dump/tone_id_map.txt + --tones_dict=dump/tone_id_map.txt \ + --inference_dir=${train_output_path}/inference fi - # wavernn if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then echo "in wavernn syn_e2e" diff --git a/examples/csmsc/tts3/local/synthesize.sh b/examples/csmsc/tts3/local/synthesize.sh index 1976742660c42a46e4dd8ceef61e629286c08b18..043bb52f40ab11b192c946bf935a170b1d7377e6 100755 --- a/examples/csmsc/tts3/local/synthesize.sh +++ b/examples/csmsc/tts3/local/synthesize.sh @@ -3,18 +3,98 @@ config_path=$1 train_output_path=$2 ckpt_name=$3 +stage=0 +stop_stage=0 -FLAGS_allocator_strategy=naive_best_fit \ -FLAGS_fraction_of_gpu_memory_to_use=0.01 \ -python3 ${BIN_DIR}/../synthesize.py \ - --am=fastspeech2_csmsc \ - --am_config=${config_path} \ - --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ - --am_stat=dump/train/speech_stats.npy \ - --voc=pwgan_csmsc \ - --voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \ - --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \ - --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \ - --test_metadata=dump/test/norm/metadata.jsonl \ - --output_dir=${train_output_path}/test \ - --phones_dict=dump/phone_id_map.txt +# pwgan +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + FLAGS_allocator_strategy=naive_best_fit \ + FLAGS_fraction_of_gpu_memory_to_use=0.01 \ + python3 ${BIN_DIR}/../synthesize.py \ + --am=fastspeech2_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/speech_stats.npy \ + --voc=pwgan_csmsc \ + --voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \ + --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \ + --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \ + --test_metadata=dump/test/norm/metadata.jsonl \ + --output_dir=${train_output_path}/test \ + --phones_dict=dump/phone_id_map.txt +fi + +# for more GAN Vocoders +# multi band melgan +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + FLAGS_allocator_strategy=naive_best_fit \ + FLAGS_fraction_of_gpu_memory_to_use=0.01 \ + python3 ${BIN_DIR}/../synthesize.py \ + --am=fastspeech2_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/speech_stats.npy \ + --voc=mb_melgan_csmsc \ + --voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \ + --voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\ + --voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \ + --test_metadata=dump/test/norm/metadata.jsonl \ + --output_dir=${train_output_path}/test \ + --phones_dict=dump/phone_id_map.txt +fi + +# style melgan +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + FLAGS_allocator_strategy=naive_best_fit \ + FLAGS_fraction_of_gpu_memory_to_use=0.01 \ + python3 ${BIN_DIR}/../synthesize.py \ + --am=fastspeech2_csmsc \ + 
--am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/speech_stats.npy \ + --voc=style_melgan_csmsc \ + --voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \ + --voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \ + --voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \ + --test_metadata=dump/test/norm/metadata.jsonl \ + --output_dir=${train_output_path}/test \ + --phones_dict=dump/phone_id_map.txt +fi + +# hifigan +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + echo "in hifigan syn" + FLAGS_allocator_strategy=naive_best_fit \ + FLAGS_fraction_of_gpu_memory_to_use=0.01 \ + python3 ${BIN_DIR}/../synthesize.py \ + --am=fastspeech2_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/speech_stats.npy \ + --voc=hifigan_csmsc \ + --voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \ + --voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \ + --voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \ + --test_metadata=dump/test/norm/metadata.jsonl \ + --output_dir=${train_output_path}/test \ + --phones_dict=dump/phone_id_map.txt +fi + +# wavernn +if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then + echo "in wavernn syn" + FLAGS_allocator_strategy=naive_best_fit \ + FLAGS_fraction_of_gpu_memory_to_use=0.01 \ + python3 ${BIN_DIR}/../synthesize.py \ + --am=fastspeech2_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/speech_stats.npy \ + --voc=wavernn_csmsc \ + --voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \ + --voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \ + --voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \ + --test_metadata=dump/test/norm/metadata.jsonl \ + --output_dir=${train_output_path}/test \ + --phones_dict=dump/phone_id_map.txt +fi diff --git a/examples/csmsc/tts3/local/synthesize_e2e.sh b/examples/csmsc/tts3/local/synthesize_e2e.sh index 44356e4be6f8a67e25ce987eb7873926cfdd2f95..512e062b7792a7c363640f48bb0a665f1be96ab6 100755 --- a/examples/csmsc/tts3/local/synthesize_e2e.sh +++ b/examples/csmsc/tts3/local/synthesize_e2e.sh @@ -7,6 +7,7 @@ ckpt_name=$3 stage=0 stop_stage=0 +# pwgan if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then FLAGS_allocator_strategy=naive_best_fit \ FLAGS_fraction_of_gpu_memory_to_use=0.01 \ @@ -22,8 +23,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then --lang=zh \ --text=${BIN_DIR}/../sentences.txt \ --output_dir=${train_output_path}/test_e2e \ - --inference_dir=${train_output_path}/inference \ - --phones_dict=dump/phone_id_map.txt + --phones_dict=dump/phone_id_map.txt \ + --inference_dir=${train_output_path}/inference fi # for more GAN Vocoders @@ -43,8 +44,8 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then --lang=zh \ --text=${BIN_DIR}/../sentences.txt \ --output_dir=${train_output_path}/test_e2e \ - --inference_dir=${train_output_path}/inference \ - --phones_dict=dump/phone_id_map.txt + --phones_dict=dump/phone_id_map.txt \ + --inference_dir=${train_output_path}/inference fi # the pretrained models haven't release now @@ -86,8 +87,8 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then --lang=zh \ --text=${BIN_DIR}/../sentences.txt \ --output_dir=${train_output_path}/test_e2e \ - --inference_dir=${train_output_path}/inference \ - --phones_dict=dump/phone_id_map.txt + --phones_dict=dump/phone_id_map.txt \ + --inference_dir=${train_output_path}/inference fi diff --git 
a/examples/other/g2p/README.md b/examples/other/g2p/README.md index c0f55bd42130a34a32ed21e34b5d5e297fff2f7c..141f7f7412891b44be81fc5e026c175c3fe83bb1 100644 --- a/examples/other/g2p/README.md +++ b/examples/other/g2p/README.md @@ -10,7 +10,7 @@ Run the command below to get the results of the test. ```bash ./run.sh ``` -The `avg WER` of g2p is: 0.027124048652822204 +The `avg WER` of g2p is: 0.026014352515701198 ```text ,--------------------------------------------------------------------. | | # Snt # Wrd | Corr Sub Del Ins Err S.Err | diff --git a/examples/voxceleb/sv0/local/make_voxceleb_kaldi_trial.py b/examples/voxceleb/sv0/local/make_voxceleb_kaldi_trial.py index c92ede1ab5113d568265a1d261e1709213ef00d2..4e9639dc7d707df2111df09cae6b3bb5b4245571 100644 --- a/examples/voxceleb/sv0/local/make_voxceleb_kaldi_trial.py +++ b/examples/voxceleb/sv0/local/make_voxceleb_kaldi_trial.py @@ -12,28 +12,30 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - """ Make VoxCeleb1 trial of kaldi format this script creat the test trial from kaldi trial voxceleb1_test_v2.txt or official trial veri_test2.txt to kaldi trial format """ - import argparse import codecs import os parser = argparse.ArgumentParser(description=__doc__) -parser.add_argument("--voxceleb_trial", - default="voxceleb1_test_v2", - type=str, - help="VoxCeleb trial file. Default we use the kaldi trial voxceleb1_test_v2.txt") -parser.add_argument("--trial", - default="data/test/trial", - type=str, - help="Kaldi format trial file") +parser.add_argument( + "--voxceleb_trial", + default="voxceleb1_test_v2", + type=str, + help="VoxCeleb trial file. By default we use the kaldi trial voxceleb1_test_v2.txt" +) +parser.add_argument( + "--trial", + default="data/test/trial", + type=str, + help="Kaldi format trial file") args = parser.parse_args() + def main(voxceleb_trial, trial): """ VoxCeleb provide several trial file, which format is different with kaldi format. @@ -58,7 +60,9 @@ def main(voxceleb_trial, trial): """ print("Start convert the voxceleb trial to kaldi format") if not os.path.exists(voxceleb_trial): - raise RuntimeError("{} does not exist. Pleas input the correct file path".format(voxceleb_trial)) + raise RuntimeError( + "{} does not exist. Please input the correct file path".format( + voxceleb_trial)) trial_dirname = os.path.dirname(trial) if not os.path.exists(trial_dirname): @@ -66,9 +70,9 @@ def main(voxceleb_trial, trial): with codecs.open(voxceleb_trial, 'r', encoding='utf-8') as f, \ codecs.open(trial, 'w', encoding='utf-8') as w: - for line in f: + for line in f: target_or_nontarget, path1, path2 = line.strip().split() - + utt_id1 = "-".join(path1.split("/")) utt_id2 = "-".join(path2.split("/")) target = "nontarget" @@ -77,5 +81,6 @@ def main(voxceleb_trial, trial): w.write("{} {} {}\n".format(utt_id1, utt_id2, target)) print("Convert the voxceleb trial to kaldi format successfully") + if __name__ == "__main__": main(args.voxceleb_trial, args.trial) diff --git a/paddlespeech/__init__.py b/paddlespeech/__init__.py index 42537b15945d48513063d80abadf20ca1736cb50..185a92b8d94d3426d616c0624f0f2ee04339349e 100644 --- a/paddlespeech/__init__.py +++ b/paddlespeech/__init__.py @@ -11,14 +11,3 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License.
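To make the trial conversion in make_voxceleb_kaldi_trial.py above concrete, here is a minimal sketch of what a single trial line goes through (the utterance paths are made-up examples in the VoxCeleb layout, and decoding the leading flag with `int(...) == 1` is an assumption about the trial format):

```python
# Hypothetical VoxCeleb trial line: "<1|0> <path1> <path2>"
line = "1 id10270/x6uYqmx31kE/00001.wav id10270/8jEAjG6SegY/00008.wav"

target_or_nontarget, path1, path2 = line.strip().split()
# kaldi-style utterance ids: replace "/" with "-"
utt_id1 = "-".join(path1.split("/"))
utt_id2 = "-".join(path2.split("/"))
target = "target" if int(target_or_nontarget) == 1 else "nontarget"
print("{} {} {}".format(utt_id1, utt_id2, target))
# -> id10270-x6uYqmx31kE-00001.wav id10270-8jEAjG6SegY-00008.wav target
```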
- - - - - - - - - - - diff --git a/paddlespeech/cli/__init__.py b/paddlespeech/cli/__init__.py index cecf76fee5b6e8e73c3e7d588698f0cb890461cf..b526a3849b0ed5deddd519e7a0573a592c743d2f 100644 --- a/paddlespeech/cli/__init__.py +++ b/paddlespeech/cli/__init__.py @@ -18,6 +18,7 @@ from .base_commands import BaseCommand from .base_commands import HelpCommand from .cls import CLSExecutor from .st import STExecutor +from .stats import StatsExecutor from .text import TextExecutor from .tts import TTSExecutor diff --git a/paddlespeech/cli/asr/infer.py b/paddlespeech/cli/asr/infer.py index 7f648b4c3a28ed567f17bea099e7d5cc254ba53a..1fb4be43486fbe896b97d6d6a3ac766c53f208e1 100644 --- a/paddlespeech/cli/asr/infer.py +++ b/paddlespeech/cli/asr/infer.py @@ -413,7 +413,8 @@ class ASRExecutor(BaseExecutor): def _check(self, audio_file: str, sample_rate: int, force_yes: bool): self.sample_rate = sample_rate if self.sample_rate != 16000 and self.sample_rate != 8000: - logger.error("invalid sample rate, please input --sr 8000 or --sr 16000") + logger.error( + "invalid sample rate, please input --sr 8000 or --sr 16000") return False if isinstance(audio_file, (str, os.PathLike)): diff --git a/paddlespeech/cli/stats/__init__.py b/paddlespeech/cli/stats/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..9fe6c4abaf10de2f24f751ddd62f456768a82475 --- /dev/null +++ b/paddlespeech/cli/stats/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from .infer import StatsExecutor diff --git a/paddlespeech/cli/stats/infer.py b/paddlespeech/cli/stats/infer.py new file mode 100644 index 0000000000000000000000000000000000000000..4ef50449c37e08c1a3c5f9b8894a5b4141e1c33f --- /dev/null +++ b/paddlespeech/cli/stats/infer.py @@ -0,0 +1,193 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
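The new paddlespeech.cli.stats module below renders each task's pretrained-model registry as a table by splitting the registry keys on "-". As a minimal standalone sketch of that PrettyTable pattern (the two registry keys here are illustrative, following the 'Model-Language-Sample Rate' naming the ASR task uses):

```python
from prettytable import PrettyTable

# Illustrative registry subset; keys follow "Model-Language-Sample Rate".
pretrained_models = {
    "conformer_wenetspeech-zh-16k": {},
    "transformer_librispeech-en-16k": {},
}

table = PrettyTable("Model-Language-Sample Rate".split("-"))
for key in pretrained_models:
    table.add_row(key.split("-"))  # one cell per "-"-separated field
print(table)
```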
+import argparse +from typing import List + +from prettytable import PrettyTable + +from ..log import logger +from ..utils import cli_register +from ..utils import stats_wrapper + +__all__ = ['StatsExecutor'] + +model_name_format = { + 'asr': 'Model-Language-Sample Rate', + 'cls': 'Model-Sample Rate', + 'st': 'Model-Source language-Target language', + 'text': 'Model-Task-Language', + 'tts': 'Model-Language' +} + + +@cli_register( + name='paddlespeech.stats', + description='Get the list of models supported by each speech task.') +class StatsExecutor(): + def __init__(self): + super(StatsExecutor, self).__init__() + + self.parser = argparse.ArgumentParser( + prog='paddlespeech.stats', add_help=True) + self.parser.add_argument( + '--task', + type=str, + default='asr', + choices=['asr', 'cls', 'st', 'text', 'tts'], + help='Choose speech task.', + required=True) + self.task_choices = ['asr', 'cls', 'st', 'text', 'tts'] + + def show_support_models(self, pretrained_models: dict): + fields = model_name_format[self.task].split("-") + table = PrettyTable(fields) + for key in pretrained_models: + table.add_row(key.split("-")) + print(table) + + def execute(self, argv: List[str]) -> bool: + """ + Command line entry. + """ + parser_args = self.parser.parse_args(argv) + self.task = parser_args.task + if self.task not in self.task_choices: + logger.error( + "Please input correct speech task, choices = ['asr', 'cls', 'st', 'text', 'tts']" + ) + return False + + elif self.task == 'asr': + try: + from ..asr.infer import pretrained_models + logger.info( + "Here is the list of ASR pretrained models released by PaddleSpeech that can be used by command line and python API" + ) + self.show_support_models(pretrained_models) + return True + except BaseException: + logger.error("Failed to get the list of ASR pretrained models.") + return False + + elif self.task == 'cls': + try: + from ..cls.infer import pretrained_models + logger.info( + "Here is the list of CLS pretrained models released by PaddleSpeech that can be used by command line and python API" + ) + self.show_support_models(pretrained_models) + return True + except BaseException: + logger.error("Failed to get the list of CLS pretrained models.") + return False + + elif self.task == 'st': + try: + from ..st.infer import pretrained_models + logger.info( + "Here is the list of ST pretrained models released by PaddleSpeech that can be used by command line and python API" + ) + self.show_support_models(pretrained_models) + return True + except BaseException: + logger.error("Failed to get the list of ST pretrained models.") + return False + + elif self.task == 'text': + try: + from ..text.infer import pretrained_models + logger.info( + "Here is the list of TEXT pretrained models released by PaddleSpeech that can be used by command line and python API" + ) + self.show_support_models(pretrained_models) + return True + except BaseException: + logger.error( + "Failed to get the list of TEXT pretrained models.") + return False + + elif self.task == 'tts': + try: + from ..tts.infer import pretrained_models + logger.info( + "Here is the list of TTS pretrained models released by PaddleSpeech that can be used by command line and python API" + ) + self.show_support_models(pretrained_models) + return True + except BaseException: + logger.error("Failed to get the list of TTS pretrained models.") + return False + + @stats_wrapper + def __call__( + self, + task: str=None, ): + """ + Python API to call an executor.
+ """ + self.task = task + if self.task not in self.task_choices: + print( + "Please input correct speech task, choices = ['asr', 'cls', 'st', 'text', 'tts']" + ) + + elif self.task == 'asr': + try: + from ..asr.infer import pretrained_models + print( + "Here is the list of ASR pretrained models released by PaddleSpeech that can be used by command line and python API" + ) + self.show_support_models(pretrained_models) + except BaseException: + print("Failed to get the list of ASR pretrained models.") + + elif self.task == 'cls': + try: + from ..cls.infer import pretrained_models + print( + "Here is the list of CLS pretrained models released by PaddleSpeech that can be used by command line and python API" + ) + self.show_support_models(pretrained_models) + except BaseException: + print("Failed to get the list of CLS pretrained models.") + + elif self.task == 'st': + try: + from ..st.infer import pretrained_models + print( + "Here is the list of ST pretrained models released by PaddleSpeech that can be used by command line and python API" + ) + self.show_support_models(pretrained_models) + except BaseException: + print("Failed to get the list of ST pretrained models.") + + elif self.task == 'text': + try: + from ..text.infer import pretrained_models + print( + "Here is the list of TEXT pretrained models released by PaddleSpeech that can be used by command line and python API" + ) + self.show_support_models(pretrained_models) + except BaseException: + print("Failed to get the list of TEXT pretrained models.") + + elif self.task == 'tts': + try: + from ..tts.infer import pretrained_models + print( + "Here is the list of TTS pretrained models released by PaddleSpeech that can be used by command line and python API" + ) + self.show_support_models(pretrained_models) + except BaseException: + print("Failed to get the list of TTS pretrained models.") diff --git a/paddlespeech/cli/tts/infer.py b/paddlespeech/cli/tts/infer.py index ba15d652415d33abfe3ae3b2252675cd22b54aba..8423dfa8d1cbf7fc651ff5e538d0ec0993ca2e9f 100644 --- a/paddlespeech/cli/tts/infer.py +++ b/paddlespeech/cli/tts/infer.py @@ -13,6 +13,7 @@ # limitations under the License. 
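The tts/infer.py change below instruments synthesis with per-stage timers: one for the frontend, plus accumulators for the acoustic model and vocoder across sentence parts. A minimal sketch of that accumulate-per-part pattern (run_am and run_voc are stand-ins for the real self.am_inference and self.voc_inference calls):

```python
import time

def run_am(text):  # stand-in for the acoustic model call
    return "mel({})".format(text)

def run_voc(mel):  # stand-in for the vocoder call
    return "wav({})".format(mel)

am_time = 0.0
voc_time = 0.0
for part in ["sentence one", "sentence two"]:
    am_st = time.time()
    mel = run_am(part)
    am_time += time.time() - am_st  # accumulated over all parts

    voc_st = time.time()
    wav = run_voc(mel)
    voc_time += time.time() - voc_st
print("AM: {:.6f}s, Vocoder: {:.6f}s".format(am_time, voc_time))
```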
import argparse import os +import time from collections import OrderedDict from typing import Any from typing import List @@ -621,6 +622,7 @@ class TTSExecutor(BaseExecutor): am_dataset = am[am.rindex('_') + 1:] get_tone_ids = False merge_sentences = False + frontend_st = time.time() if am_name == 'speedyspeech': get_tone_ids = True if lang == 'zh': @@ -637,9 +639,13 @@ class TTSExecutor(BaseExecutor): phone_ids = input_ids["phone_ids"] else: print("lang should in {'zh', 'en'}!") + self.frontend_time = time.time() - frontend_st + self.am_time = 0 + self.voc_time = 0 flags = 0 for i in range(len(phone_ids)): + am_st = time.time() part_phone_ids = phone_ids[i] # am if am_name == 'speedyspeech': @@ -653,13 +659,16 @@ class TTSExecutor(BaseExecutor): part_phone_ids, spk_id=paddle.to_tensor(spk_id)) else: mel = self.am_inference(part_phone_ids) + self.am_time += (time.time() - am_st) # voc + voc_st = time.time() wav = self.voc_inference(mel) if flags == 0: wav_all = wav flags = 1 else: wav_all = paddle.concat([wav_all, wav]) + self.voc_time += (time.time() - voc_st) self._outputs['wav'] = wav_all def postprocess(self, output: str='output.wav') -> Union[str, os.PathLike]: diff --git a/paddlespeech/s2t/io/sampler.py b/paddlespeech/s2t/io/sampler.py index 89752bb9fdb98faecc0ccc5b8f59ea1f09efc8b6..ac55af1236f11d175e9e7717220980cf95c7d79b 100644 --- a/paddlespeech/s2t/io/sampler.py +++ b/paddlespeech/s2t/io/sampler.py @@ -51,7 +51,7 @@ def _batch_shuffle(indices, batch_size, epoch, clipped=False): """ rng = np.random.RandomState(epoch) shift_len = rng.randint(0, batch_size - 1) - batch_indices = list(zip(*[iter(indices[shift_len:])] * batch_size)) + batch_indices = list(zip(* [iter(indices[shift_len:])] * batch_size)) rng.shuffle(batch_indices) batch_indices = [item for batch in batch_indices for item in batch] assert clipped is False diff --git a/paddlespeech/s2t/io/utility.py b/paddlespeech/s2t/io/utility.py index ce5e772307754a4dc2a8bb2c3b000d62c64cbc83..c08b5535a6cccb7ddf8ba7df53f6c7703e6bb96e 100644 --- a/paddlespeech/s2t/io/utility.py +++ b/paddlespeech/s2t/io/utility.py @@ -11,8 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import List from io import BytesIO +from typing import List import numpy as np diff --git a/paddlespeech/s2t/models/u2_st/u2_st.py b/paddlespeech/s2t/models/u2_st/u2_st.py index f7b05714ef6e9961a1bff79027015889815d5811..999723e5100309976c1b89cbf256ac106d8829e6 100644 --- a/paddlespeech/s2t/models/u2_st/u2_st.py +++ b/paddlespeech/s2t/models/u2_st/u2_st.py @@ -33,8 +33,6 @@ from paddlespeech.s2t.modules.decoder import TransformerDecoder from paddlespeech.s2t.modules.encoder import ConformerEncoder from paddlespeech.s2t.modules.encoder import TransformerEncoder from paddlespeech.s2t.modules.loss import LabelSmoothingLoss -from paddlespeech.s2t.modules.mask import mask_finished_preds -from paddlespeech.s2t.modules.mask import mask_finished_scores from paddlespeech.s2t.modules.mask import subsequent_mask from paddlespeech.s2t.utils import checkpoint from paddlespeech.s2t.utils import layer_tools @@ -291,7 +289,7 @@ class U2STBaseModel(nn.Layer): device = speech.place # Let's assume B = batch_size and N = beam_size - # 1. Encoder and init hypothesis + # 1. 
Encoder and init hypothesis encoder_out, encoder_mask = self._forward_encoder( speech, speech_lengths, decoding_chunk_size, num_decoding_left_chunks, diff --git a/paddlespeech/server/bin/__init__.py b/paddlespeech/server/bin/__init__.py index bd75747f79948ea42229b8c164174dbe4240d4b1..025aab098f2b6d56ced56d499ce619feb190ab2d 100644 --- a/paddlespeech/server/bin/__init__.py +++ b/paddlespeech/server/bin/__init__.py @@ -14,3 +14,4 @@ from .paddlespeech_client import ASRClientExecutor from .paddlespeech_client import TTSClientExecutor from .paddlespeech_server import ServerExecutor +from .paddlespeech_server import ServerStatsExecutor diff --git a/paddlespeech/server/bin/main.py b/paddlespeech/server/bin/main.py index dda0bbd7f1bc8dcccf16c67fc04eb606a2bfdcd8..360d295ef583a4d490a76392ff9a362c40ee4656 100644 --- a/paddlespeech/server/bin/main.py +++ b/paddlespeech/server/bin/main.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. import argparse + import uvicorn -import yaml from fastapi import FastAPI from paddlespeech.server.engine.engine_pool import init_engine_pool diff --git a/paddlespeech/server/bin/paddlespeech_client.py b/paddlespeech/server/bin/paddlespeech_client.py index 3730d607066ed2650929502a27de674308903701..ee6ab7ad764b873a899d0503550a2ad51cd7eadf 100644 --- a/paddlespeech/server/bin/paddlespeech_client.py +++ b/paddlespeech/server/bin/paddlespeech_client.py @@ -48,8 +48,9 @@ class TTSClientExecutor(BaseExecutor): self.parser.add_argument( '--input', type=str, - default="你好,欢迎使用语音合成服务", - help='A sentence to be synthesized.') + default=None, + help='Text to be synthesized.', + required=True) self.parser.add_argument( '--spk_id', type=int, default=0, help='Speaker id') self.parser.add_argument( @@ -120,10 +121,9 @@ class TTSClientExecutor(BaseExecutor): (args.output)) logger.info("Audio duration: %f s." % (duration)) logger.info("Response time: %f s." % (time_consume)) - logger.info("RTF: %f " % (time_consume / duration)) return True - except: + except BaseException: logger.error("Failed to synthesized audio.") return False @@ -163,7 +163,7 @@ class TTSClientExecutor(BaseExecutor): print("Audio duration: %f s." % (duration)) print("Response time: %f s." % (time_consume)) print("RTF: %f " % (time_consume / duration)) - except: + except BaseException: print("Failed to synthesized audio.") @@ -181,8 +181,9 @@ class ASRClientExecutor(BaseExecutor): self.parser.add_argument( '--input', type=str, - default="./paddlespeech/server/tests/16_audio.wav", - help='Audio file to be recognized') + default=None, + help='Audio file to be recognized', + required=True) self.parser.add_argument( '--sample_rate', type=int, default=16000, help='audio sample rate') self.parser.add_argument( @@ -209,7 +210,7 @@ class ASRClientExecutor(BaseExecutor): logger.info(r.json()) logger.info("time cost %f s." % (time_end - time_start)) return True - except: + except BaseException: logger.error("Failed to speech recognition.") return False @@ -240,5 +241,5 @@ class ASRClientExecutor(BaseExecutor): time_end = time.time() print(r.json()) print("time cost %f s." 
% (time_end - time_start)) - except: - print("Failed to speech recognition.") \ No newline at end of file + except BaseException: + print("Failed to speech recognition.") diff --git a/paddlespeech/server/bin/paddlespeech_server.py b/paddlespeech/server/bin/paddlespeech_server.py index 7c88d8a0ed3820dd421f7015247a8d6a7faa0e04..21fc5c65e965a87c483046d66e45036d1b091b5d 100644 --- a/paddlespeech/server/bin/paddlespeech_server.py +++ b/paddlespeech/server/bin/paddlespeech_server.py @@ -16,15 +16,17 @@ from typing import List import uvicorn from fastapi import FastAPI +from prettytable import PrettyTable from ..executor import BaseExecutor from ..util import cli_server_register from ..util import stats_wrapper -from paddlespeech.server.engine.engine_factory import EngineFactory +from paddlespeech.cli.log import logger +from paddlespeech.server.engine.engine_pool import init_engine_pool from paddlespeech.server.restful.api import setup_router from paddlespeech.server.utils.config import get_config -__all__ = ['ServerExecutor'] +__all__ = ['ServerExecutor', 'ServerStatsExecutor'] app = FastAPI( title="PaddleSpeech Serving API", description="Api", version="0.0.1") @@ -41,7 +43,8 @@ class ServerExecutor(BaseExecutor): "--config_file", action="store", help="yaml file of the app", - default="./conf/application.yaml") + default=None, + required=True) self.parser.add_argument( "--log_file", @@ -51,8 +54,10 @@ class ServerExecutor(BaseExecutor): def init(self, config) -> bool: """system initialization + Args: config (CfgNode): config object + Returns: bool: """ @@ -61,13 +66,8 @@ class ServerExecutor(BaseExecutor): api_router = setup_router(api_list) app.include_router(api_router) - # init engine - engine_pool = [] - for engine in config.engine_backend: - engine_pool.append(EngineFactory.get_engine(engine_name=engine)) - if not engine_pool[-1].init( - config_file=config.engine_backend[engine]): - return False + if not init_engine_pool(config): + return False return True @@ -88,3 +88,139 @@ class ServerExecutor(BaseExecutor): config = get_config(config_file) if self.init(config): uvicorn.run(app, host=config.host, port=config.port, debug=True) + + +@cli_server_register( + name='paddlespeech_server.stats', + description='Get the models supported by each speech task in the service.') +class ServerStatsExecutor(): + def __init__(self): + super(ServerStatsExecutor, self).__init__() + + self.parser = argparse.ArgumentParser( + prog='paddlespeech_server.stats', add_help=True) + self.parser.add_argument( + '--task', + type=str, + default=None, + choices=['asr', 'tts'], + help='Choose speech task.', + required=True) + self.task_choices = ['asr', 'tts'] + self.model_name_format = { + 'asr': 'Model-Language-Sample Rate', + 'tts': 'Model-Language' + } + + def show_support_models(self, pretrained_models: dict): + fields = self.model_name_format[self.task].split("-") + table = PrettyTable(fields) + for key in pretrained_models: + table.add_row(key.split("-")) + print(table) + + def execute(self, argv: List[str]) -> bool: + """ + Command line entry. + """ + parser_args = self.parser.parse_args(argv) + self.task = parser_args.task + if self.task not in self.task_choices: + logger.error( + "Please input correct speech task, choices = ['asr', 'tts']") + return False + + elif self.task == 'asr': + try: + from paddlespeech.cli.asr.infer import pretrained_models + logger.info( + "Here is the table of ASR pretrained models supported in the service." 
+ ) + self.show_support_models(pretrained_models) + + # show ASR static pretrained model + from paddlespeech.server.engine.asr.paddleinference.asr_engine import pretrained_models + logger.info( + "Here is the table of ASR static pretrained models supported in the service." + ) + self.show_support_models(pretrained_models) + + return True + except BaseException: + logger.error( + "Failed to get the table of ASR pretrained models supported in the service." + ) + return False + + elif self.task == 'tts': + try: + from paddlespeech.cli.tts.infer import pretrained_models + logger.info( + "Here is the table of TTS pretrained models supported in the service." + ) + self.show_support_models(pretrained_models) + + # show TTS static pretrained model + from paddlespeech.server.engine.tts.paddleinference.tts_engine import pretrained_models + logger.info( + "Here is the table of TTS static pretrained models supported in the service." + ) + self.show_support_models(pretrained_models) + + return True + except BaseException: + logger.error( + "Failed to get the table of TTS pretrained models supported in the service." + ) + return False + + @stats_wrapper + def __call__( + self, + task: str=None, ): + """ + Python API to call an executor. + """ + self.task = task + if self.task not in self.task_choices: + print("Please input correct speech task, choices = ['asr', 'tts']") + + elif self.task == 'asr': + try: + from paddlespeech.cli.asr.infer import pretrained_models + print( + "Here is the table of ASR pretrained models supported in the service." + ) + self.show_support_models(pretrained_models) + + # show ASR static pretrained model + from paddlespeech.server.engine.asr.paddleinference.asr_engine import pretrained_models + print( + "Here is the table of ASR static pretrained models supported in the service." + ) + self.show_support_models(pretrained_models) + + except BaseException: + print( + "Failed to get the table of ASR pretrained models supported in the service." + ) + + elif self.task == 'tts': + try: + from paddlespeech.cli.tts.infer import pretrained_models + print( + "Here is the table of TTS pretrained models supported in the service." + ) + self.show_support_models(pretrained_models) + + # show TTS static pretrained model + from paddlespeech.server.engine.tts.paddleinference.tts_engine import pretrained_models + print( + "Here is the table of TTS static pretrained models supported in the service." + ) + self.show_support_models(pretrained_models) + + except BaseException: + print( + "Failed to get the table of TTS pretrained models supported in the service." + ) diff --git a/paddlespeech/server/conf/application.yaml b/paddlespeech/server/conf/application.yaml index 154ef9af41a4fb77318d7a5bab2cc6278a662b95..6dcae74a944fdf129a35b991b23be5c724d5df16 100644 --- a/paddlespeech/server/conf/application.yaml +++ b/paddlespeech/server/conf/application.yaml @@ -3,18 +3,25 @@ ################################################################## # SERVER SETTING # ################################################################## -host: '0.0.0.0' +host: '127.0.0.1' port: 8090 ################################################################## # CONFIG FILE # ################################################################## +# add engine backend type (Options: asr, tts) and config file here. +# Adding a speech task to engine_backend means starting the service. 
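The engine_backend and engine_type mappings below are consumed together at startup: for every task key in engine_backend, the matching engine_type entry decides which engine implementation loads that task's config file. A minimal sketch of the pairing logic, mirroring the init_engine_pool change later in this diff (the Config class and get_engine stub are stand-ins for the real yacs config and EngineFactory):

```python
class Config:  # stand-in for the yacs config parsed from application.yaml
    engine_backend = {"asr": "conf/asr/asr.yaml", "tts": "conf/tts/tts.yaml"}
    engine_type = {"asr": "python", "tts": "python"}

def get_engine(engine_name, engine_type):  # stand-in for EngineFactory.get_engine
    return "{}-{}-engine".format(engine_name, engine_type)

config = Config()
engine_pool = {}
for engine in config.engine_backend:
    # each engine_backend key needs a matching engine_type entry
    engine_pool[engine] = get_engine(engine, config.engine_type[engine])
    print(engine, "->", engine_pool[engine], "config:", config.engine_backend[engine])
```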
+engine_backend: + asr: 'conf/asr/asr.yaml' + tts: 'conf/tts/tts.yaml' + +# The engine_type of a speech task must match the type of the config file for that speech task. +# E.g.: the engine_type of asr is 'python', so the engine_backend of asr is 'XX/asr.yaml' +# E.g.: the engine_type of asr is 'inference', so the engine_backend of asr is 'XX/asr_pd.yaml' +# # add engine type (Options: python, inference) engine_type: - asr: 'inference' - # tts: 'inference' + asr: 'python' + tts: 'python' + -# add engine backend type (Options: asr, tts) and config file here. -engine_backend: - asr: 'conf/asr/asr_pd.yaml' - #tts: 'conf/tts/tts_pd.yaml' diff --git a/paddlespeech/server/conf/asr/asr.yaml b/paddlespeech/server/conf/asr/asr.yaml index 50e55a3ca0534d0534aa719ed426e49b35bc7675..a6743b77513e504f2bcd374ea8235d8e39a7c98c 100644 --- a/paddlespeech/server/conf/asr/asr.yaml +++ b/paddlespeech/server/conf/asr/asr.yaml @@ -5,3 +5,4 @@ cfg_path: # [optional] ckpt_path: # [optional] decode_method: 'attention_rescoring' force_yes: True +device: # set 'gpu:id' or 'cpu' diff --git a/paddlespeech/server/conf/asr/asr_pd.yaml b/paddlespeech/server/conf/asr/asr_pd.yaml index 43a63f1bd8031af387b171d176b31a4ac6368413..4c415ac791edeab2d9832e8db2e9a66411aaed06 100644 --- a/paddlespeech/server/conf/asr/asr_pd.yaml +++ b/paddlespeech/server/conf/asr/asr_pd.yaml @@ -15,9 +15,10 @@ decode_method: force_yes: True am_predictor_conf: - use_gpu: True - enable_mkldnn: True + device: # set 'gpu:id' or 'cpu' switch_ir_optim: True + glog_info: False # True -> print glog + summary: True # False -> do not show predictor config ################################################################## diff --git a/paddlespeech/server/conf/tts/tts.yaml b/paddlespeech/server/conf/tts/tts.yaml index d0e128eaee0c14783d23867563ee0275fbceef1b..19207f0b03579a906c80ba6eff356792974eeefd 100644 --- a/paddlespeech/server/conf/tts/tts.yaml +++ b/paddlespeech/server/conf/tts/tts.yaml @@ -29,4 +29,4 @@ voc_stat: # OTHERS # ################################################################## lang: 'zh' -device: paddle.get_device() \ No newline at end of file +device: # set 'gpu:id' or 'cpu' diff --git a/paddlespeech/server/conf/tts/tts_pd.yaml b/paddlespeech/server/conf/tts/tts_pd.yaml index c268c6a336bb21be7879980cb3cb3c59611d64cd..e27b9665bbe1ee8b5d5c39fd3e5f87d841dd64de 100644 --- a/paddlespeech/server/conf/tts/tts_pd.yaml +++ b/paddlespeech/server/conf/tts/tts_pd.yaml @@ -6,8 +6,8 @@ # am choices=['speedyspeech_csmsc', 'fastspeech2_csmsc'] ################################################################## am: 'fastspeech2_csmsc' -am_model: # the pdmodel file of am static model -am_params: # the pdiparams file of am static model +am_model: # the pdmodel file of your am static model (XX.pdmodel) +am_params: # the pdiparams file of your am static model (XX.pdiparams) am_sample_rate: 24000 phones_dict: tones_dict: @@ -15,9 +15,10 @@ speaker_dict: spk_id: 0 am_predictor_conf: - use_gpu: True - enable_mkldnn: True + device: # set 'gpu:id' or 'cpu' switch_ir_optim: True + glog_info: False # True -> print glog + summary: True # False -> do not show predictor config ################################################################## @@ -25,17 +26,17 @@ am_predictor_conf: # voc choices=['pwgan_csmsc', 'mb_melgan_csmsc','hifigan_csmsc'] ################################################################## voc: 'pwgan_csmsc' -voc_model: # the pdmodel file of vocoder static model -voc_params: # the pdiparams file of vocoder static model +voc_model: # the pdmodel file of your vocoder static model (XX.pdmodel) +voc_params: # the pdiparams file of your vocoder static model (XX.pdiparams) voc_sample_rate: 24000 voc_predictor_conf: - use_gpu: True - enable_mkldnn: True + device: # set 'gpu:id' or 'cpu' switch_ir_optim: True + glog_info: False # True -> print glog + summary: True # False -> do not show predictor config ################################################################## # OTHERS # ################################################################## lang: 'zh' -device: paddle.get_device() diff --git a/paddlespeech/server/engine/asr/paddleinference/asr_engine.py b/paddlespeech/server/engine/asr/paddleinference/asr_engine.py index 6d072322918bbf8f4ba7dc3d11d00e1209e21f95..cb973e924efb5bcd7de440f97a27c0d29fda29c0 100644 --- a/paddlespeech/server/engine/asr/paddleinference/asr_engine.py +++ b/paddlespeech/server/engine/asr/paddleinference/asr_engine.py @@ -13,31 +13,25 @@ # limitations under the License. import io import os -from typing import List +import time from typing import Optional -from typing import Union -import librosa import paddle -import soundfile from yacs.config import CfgNode -from paddlespeech.cli.utils import MODEL_HOME -from paddlespeech.s2t.modules.ctc import CTCDecoder from paddlespeech.cli.asr.infer import ASRExecutor from paddlespeech.cli.log import logger +from paddlespeech.cli.utils import MODEL_HOME from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer -from paddlespeech.s2t.transform.transformation import Transformation -from paddlespeech.s2t.utils.dynamic_import import dynamic_import +from paddlespeech.s2t.modules.ctc import CTCDecoder from paddlespeech.s2t.utils.utility import UpdateConfig +from paddlespeech.server.engine.base_engine import BaseEngine from paddlespeech.server.utils.config import get_config from paddlespeech.server.utils.paddle_predictor import init_predictor from paddlespeech.server.utils.paddle_predictor import run_model -from paddlespeech.server.engine.base_engine import BaseEngine __all__ = ['ASREngine'] - pretrained_models = { "deepspeech2offline_aishell-zh-16k": { 'url': @@ -143,7 +137,6 @@ class ASRServerExecutor(ASRExecutor): batch_average=True, # sum / batch_size grad_norm_type=self.config.get('ctc_grad_norm_type', None)) - @paddle.no_grad() def infer(self, model_type: str): """ @@ -161,9 +154,8 @@ class ASRServerExecutor(ASRExecutor): cfg.beam_size, cfg.cutoff_prob, cfg.cutoff_top_n, cfg.num_proc_bsearch) - output_data = run_model( - self.am_predictor, - [audio.numpy(), audio_len.numpy()]) + output_data = run_model(self.am_predictor, + [audio.numpy(), audio_len.numpy()]) probs = output_data[0] eouts_len = output_data[1] @@ -206,16 +198,15 @@ class ASREngine(BaseEngine): self.executor = ASRServerExecutor() self.config = get_config(config_file) - paddle.set_device(paddle.get_device()) self.executor._init_from_path( - model_type=self.config.model_type, - am_model=self.config.am_model, - am_params=self.config.am_params, - lang=self.config.lang, - sample_rate=self.config.sample_rate, - cfg_path=self.config.cfg_path, - decode_method=self.config.decode_method, - am_predictor_conf=self.config.am_predictor_conf) + model_type=self.config.model_type, + am_model=self.config.am_model, + am_params=self.config.am_params, + lang=self.config.lang, + sample_rate=self.config.sample_rate, + cfg_path=self.config.cfg_path, + decode_method=self.config.decode_method, + am_predictor_conf=self.config.am_predictor_conf) logger.info("Initialize ASR server engine successfully.") return True @@ -230,14 
+221,20 @@ class ASREngine(BaseEngine): io.BytesIO(audio_data), self.config.sample_rate, self.config.force_yes): logger.info("start running asr engine") - self.executor.preprocess(self.config.model_type, io.BytesIO(audio_data)) + self.executor.preprocess(self.config.model_type, + io.BytesIO(audio_data)) + st = time.time() self.executor.infer(self.config.model_type) + infer_time = time.time() - st self.output = self.executor.postprocess() # Retrieve result of asr. logger.info("end inferring asr engine") else: logger.info("file check failed!") self.output = None + logger.info("inference time: {}".format(infer_time)) + logger.info("asr engine type: paddle inference") + def postprocess(self): """postprocess """ diff --git a/paddlespeech/server/engine/asr/python/asr_engine.py b/paddlespeech/server/engine/asr/python/asr_engine.py index fd67b0291fd7a2adc09370f2155a859967eb292e..1e2c5cc270dab1f82caa9c0810411211c8cdbe2e 100644 --- a/paddlespeech/server/engine/asr/python/asr_engine.py +++ b/paddlespeech/server/engine/asr/python/asr_engine.py @@ -12,21 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. import io -import os -from typing import List -from typing import Optional -from typing import Union +import time -import librosa import paddle -import soundfile from paddlespeech.cli.asr.infer import ASRExecutor from paddlespeech.cli.log import logger -from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer -from paddlespeech.s2t.transform.transformation import Transformation -from paddlespeech.s2t.utils.dynamic_import import dynamic_import -from paddlespeech.s2t.utils.utility import UpdateConfig from paddlespeech.server.engine.base_engine import BaseEngine from paddlespeech.server.utils.config import get_config @@ -63,13 +54,24 @@ class ASREngine(BaseEngine): self.executor = ASRServerExecutor() self.config = get_config(config_file) - paddle.set_device(paddle.get_device()) + try: + if self.config.device: + self.device = self.config.device + else: + self.device = paddle.get_device() + paddle.set_device(self.device) + except BaseException: + logger.error( + "Set device failed, please check if device is already used and the parameter 'device' in the yaml file" + ) + self.executor._init_from_path( self.config.model, self.config.lang, self.config.sample_rate, self.config.cfg_path, self.config.decode_method, self.config.ckpt_path) - logger.info("Initialize ASR server engine successfully.") + logger.info("Initialize ASR server engine successfully on device: %s." % + (self.device)) return True def run(self, audio_data): @@ -83,12 +85,17 @@ class ASREngine(BaseEngine): self.config.force_yes): logger.info("start run asr engine") self.executor.preprocess(self.config.model, io.BytesIO(audio_data)) + st = time.time() self.executor.infer(self.config.model) + infer_time = time.time() - st self.output = self.executor.postprocess() # Retrieve result of asr. 
else: logger.info("file check failed!") self.output = None + logger.info("inference time: {}".format(infer_time)) + logger.info("asr engine type: python") + def postprocess(self): """postprocess """ diff --git a/paddlespeech/server/engine/base_engine.py b/paddlespeech/server/engine/base_engine.py index 0cc20209479ea7e033943b799a7e161ac21e3b35..0f020d1c783e194f96af84de9326eba25595435c 100644 --- a/paddlespeech/server/engine/base_engine.py +++ b/paddlespeech/server/engine/base_engine.py @@ -12,8 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. import os -from typing import Any -from typing import List from typing import Union from pattern_singleton import Singleton diff --git a/paddlespeech/server/engine/engine_factory.py b/paddlespeech/server/engine/engine_factory.py index 05f13568106f6646f342fe94885934203036c26c..546541edfcfbfd619275646446dbd4e086536c4f 100644 --- a/paddlespeech/server/engine/engine_factory.py +++ b/paddlespeech/server/engine/engine_factory.py @@ -13,7 +13,6 @@ # limitations under the License. from typing import Text - __all__ = ['EngineFactory'] diff --git a/paddlespeech/server/engine/engine_pool.py b/paddlespeech/server/engine/engine_pool.py index 0198bd80aa9e2e32e4c44bf6761b77a78c09abec..f6a4d2aab2c894149efae75afacf6a275a5dd6b0 100644 --- a/paddlespeech/server/engine/engine_pool.py +++ b/paddlespeech/server/engine/engine_pool.py @@ -29,8 +29,10 @@ def init_engine_pool(config) -> bool: """ global ENGINE_POOL for engine in config.engine_backend: - ENGINE_POOL[engine] = EngineFactory.get_engine(engine_name=engine, engine_type=config.engine_type[engine]) - if not ENGINE_POOL[engine].init(config_file=config.engine_backend[engine]): + ENGINE_POOL[engine] = EngineFactory.get_engine( + engine_name=engine, engine_type=config.engine_type[engine]) + if not ENGINE_POOL[engine].init( + config_file=config.engine_backend[engine]): return False return True diff --git a/paddlespeech/server/engine/tts/paddleinference/tts_engine.py b/paddlespeech/server/engine/tts/paddleinference/tts_engine.py index 7679b02f03b2b5bf6f52482ae3a926f1081f3d65..5955c1a216a304629c4896a0f9462d39d9121715 100644 --- a/paddlespeech/server/engine/tts/paddleinference/tts_engine.py +++ b/paddlespeech/server/engine/tts/paddleinference/tts_engine.py @@ -14,6 +14,7 @@ import base64 import io import os +import time from typing import Optional import librosa @@ -179,7 +180,7 @@ class TTSServerExecutor(TTSExecutor): self.phones_dict = os.path.abspath(phones_dict) self.am_sample_rate = am_sample_rate self.am_res_path = os.path.dirname(os.path.abspath(self.am_model)) - print("self.phones_dict:", self.phones_dict) + logger.info("self.phones_dict: {}".format(self.phones_dict)) # for speedyspeech self.tones_dict = None @@ -224,21 +225,21 @@ class TTSServerExecutor(TTSExecutor): with open(self.phones_dict, "r") as f: phn_id = [line.strip().split() for line in f.readlines()] vocab_size = len(phn_id) - print("vocab_size:", vocab_size) + logger.info("vocab_size: {}".format(vocab_size)) tone_size = None if self.tones_dict: with open(self.tones_dict, "r") as f: tone_id = [line.strip().split() for line in f.readlines()] tone_size = len(tone_id) - print("tone_size:", tone_size) + logger.info("tone_size: {}".format(tone_size)) spk_num = None if self.speaker_dict: with open(self.speaker_dict, 'rt') as f: spk_id = [line.strip().split() for line in f.readlines()] spk_num = len(spk_id) - print("spk_num:", spk_num) + logger.info("spk_num: {}".format(spk_num)) # frontend if lang 
== 'zh': @@ -248,21 +249,29 @@ class TTSServerExecutor(TTSExecutor): elif lang == 'en': self.frontend = English(phone_vocab_path=self.phones_dict) - print("frontend done!") - - # am predictor - self.am_predictor_conf = am_predictor_conf - self.am_predictor = init_predictor( - model_file=self.am_model, - params_file=self.am_params, - predictor_conf=self.am_predictor_conf) - - # voc predictor - self.voc_predictor_conf = voc_predictor_conf - self.voc_predictor = init_predictor( - model_file=self.voc_model, - params_file=self.voc_params, - predictor_conf=self.voc_predictor_conf) + logger.info("frontend done!") + + try: + # am predictor + self.am_predictor_conf = am_predictor_conf + self.am_predictor = init_predictor( + model_file=self.am_model, + params_file=self.am_params, + predictor_conf=self.am_predictor_conf) + logger.info("Create AM predictor successfully.") + except BaseException: + logger.error("Failed to create AM predictor.") + + try: + # voc predictor + self.voc_predictor_conf = voc_predictor_conf + self.voc_predictor = init_predictor( + model_file=self.voc_model, + params_file=self.voc_params, + predictor_conf=self.voc_predictor_conf) + logger.info("Create Vocoder predictor successfully.") + except BaseException: + logger.error("Failed to create Vocoder predictor.") @paddle.no_grad() def infer(self, @@ -277,6 +286,7 @@ class TTSServerExecutor(TTSExecutor): am_dataset = am[am.rindex('_') + 1:] get_tone_ids = False merge_sentences = False + frontend_st = time.time() if am_name == 'speedyspeech': get_tone_ids = True if lang == 'zh': @@ -292,10 +302,14 @@ class TTSServerExecutor(TTSExecutor): text, merge_sentences=merge_sentences) phone_ids = input_ids["phone_ids"] else: - print("lang should in {'zh', 'en'}!") + logger.error("lang should in {'zh', 'en'}!") + self.frontend_time = time.time() - frontend_st + self.am_time = 0 + self.voc_time = 0 flags = 0 for i in range(len(phone_ids)): + am_st = time.time() part_phone_ids = phone_ids[i] # am if am_name == 'speedyspeech': @@ -314,7 +328,10 @@ class TTSServerExecutor(TTSExecutor): am_result = run_model(self.am_predictor, [part_phone_ids.numpy()]) mel = am_result[0] + self.am_time += (time.time() - am_st) + # voc + voc_st = time.time() voc_result = run_model(self.voc_predictor, [mel]) wav = voc_result[0] wav = paddle.to_tensor(wav) @@ -324,6 +341,7 @@ class TTSServerExecutor(TTSExecutor): flags = 1 else: wav_all = paddle.concat([wav_all, wav]) + self.voc_time += (time.time() - voc_st) self._outputs['wav'] = wav_all @@ -344,7 +362,6 @@ class TTSEngine(BaseEngine): try: self.config = get_config(config_file) - self.executor._init_from_path( am=self.config.am, am_model=self.config.am_model, @@ -361,8 +378,8 @@ class TTSEngine(BaseEngine): am_predictor_conf=self.config.am_predictor_conf, voc_predictor_conf=self.config.voc_predictor_conf, ) - except: - logger.info("Initialize TTS server engine Failed.") + except BaseException: + logger.error("Initialize TTS server engine Failed.") return False logger.info("Initialize TTS server engine successfully.") @@ -371,7 +388,7 @@ class TTSEngine(BaseEngine): def postprocess(self, wav, original_fs: int, - target_fs: int=16000, + target_fs: int=0, volume: float=1.0, speed: float=1.0, audio_path: str=None): @@ -396,36 +413,50 @@ class TTSEngine(BaseEngine): if target_fs == 0 or target_fs > original_fs: target_fs = original_fs wav_tar_fs = wav + logger.info( + "The sample rate of synthesized audio is the same as model, which is {}Hz". 
+ format(original_fs)) else: wav_tar_fs = librosa.resample( np.squeeze(wav), original_fs, target_fs) - + logger.info( + "The sample rate of model is {}Hz and the target sample rate is {}Hz. Converting the sample rate of the synthesized audio successfully.". + format(original_fs, target_fs)) # transform volume wav_vol = wav_tar_fs * volume + logger.info("Transform the volume of the audio successfully.") # transform speed try: # windows not support soxbindings wav_speed = change_speed(wav_vol, speed, target_fs) - except: + logger.info("Transform the speed of the audio successfully.") + except ServerBaseException: raise ServerBaseException( ErrorCode.SERVER_INTERNAL_ERR, - "Transform speed failed. Can not install soxbindings on your system. \ + "Failed to transform speed. Can not install soxbindings on your system. \ You need to set speed value 1.0.") + except BaseException: + logger.error("Failed to transform speed.") # wav to base64 buf = io.BytesIO() wavfile.write(buf, target_fs, wav_speed) base64_bytes = base64.b64encode(buf.read()) wav_base64 = base64_bytes.decode('utf-8') + logger.info("Audio to string successfully.") # save audio - if audio_path is not None and audio_path.endswith(".wav"): - sf.write(audio_path, wav_speed, target_fs) - elif audio_path is not None and audio_path.endswith(".pcm"): - wav_norm = wav_speed * (32767 / max(0.001, - np.max(np.abs(wav_speed)))) - with open(audio_path, "wb") as f: - f.write(wav_norm.astype(np.int16)) + if audio_path is not None: + if audio_path.endswith(".wav"): + sf.write(audio_path, wav_speed, target_fs) + elif audio_path.endswith(".pcm"): + wav_norm = wav_speed * (32767 / max(0.001, + np.max(np.abs(wav_speed)))) + with open(audio_path, "wb") as f: + f.write(wav_norm.astype(np.int16)) + logger.info("Save audio to {} successfully.".format(audio_path)) + else: + logger.info("There is no need to save audio.") return target_fs, wav_base64 @@ -461,13 +492,20 @@ class TTSEngine(BaseEngine): lang = self.config.lang try: + infer_st = time.time() self.executor.infer( text=sentence, lang=lang, am=self.config.am, spk_id=spk_id) - except: + infer_et = time.time() + infer_time = infer_et - infer_st + + except ServerBaseException: raise ServerBaseException(ErrorCode.SERVER_INTERNAL_ERR, "tts infer failed.") + except BaseException: + logger.error("tts infer failed.") try: + postprocess_st = time.time() target_sample_rate, wav_base64 = self.postprocess( wav=self.executor._outputs['wav'].numpy(), original_fs=self.executor.am_sample_rate, @@ -475,8 +513,34 @@ class TTSEngine(BaseEngine): volume=volume, speed=speed, audio_path=save_path) - except: + postprocess_et = time.time() + postprocess_time = postprocess_et - postprocess_st + duration = len(self.executor._outputs['wav'] + .numpy()) / self.executor.am_sample_rate + rtf = infer_time / duration + + except ServerBaseException: raise ServerBaseException(ErrorCode.SERVER_INTERNAL_ERR, "tts postprocess failed.") + except BaseException: + logger.error("tts postprocess failed.") + + logger.info("AM model: {}".format(self.config.am)) + logger.info("Vocoder model: {}".format(self.config.voc)) + logger.info("Language: {}".format(lang)) + logger.info("tts engine type: paddle inference") + + logger.info("audio duration: {}".format(duration)) + logger.info( + "frontend inference time: {}".format(self.executor.frontend_time)) + logger.info("AM inference time: {}".format(self.executor.am_time)) + logger.info("Vocoder inference time: {}".format(self.executor.voc_time)) + logger.info("total inference time: 
{}".format(infer_time)) + logger.info( + "postprocess (change speed, volume, target sample rate) time: {}". + format(postprocess_time)) + logger.info("total generate audio time: {}".format(infer_time + + postprocess_time)) + logger.info("RTF: {}".format(rtf)) return lang, target_sample_rate, wav_base64 diff --git a/paddlespeech/server/engine/tts/python/tts_engine.py b/paddlespeech/server/engine/tts/python/tts_engine.py index e11cfb1d1671ae26816a8974c1d55bf0d39e3c06..7dd576699d02c2ecef8b0993a0273f9826c08a6b 100644 --- a/paddlespeech/server/engine/tts/python/tts_engine.py +++ b/paddlespeech/server/engine/tts/python/tts_engine.py @@ -13,6 +13,7 @@ # limitations under the License. import base64 import io +import time import librosa import numpy as np @@ -54,8 +55,20 @@ class TTSEngine(BaseEngine): try: self.config = get_config(config_file) - paddle.set_device(self.config.device) + if self.config.device: + self.device = self.config.device + else: + self.device = paddle.get_device() + paddle.set_device(self.device) + except BaseException: + logger.error( + "Set device failed, please check if device is already used and the parameter 'device' in the yaml file" + ) + logger.error("Initialize TTS server engine Failed on device: %s." % + (self.device)) + return False + try: self.executor._init_from_path( am=self.config.am, am_config=self.config.am_config, @@ -69,17 +82,20 @@ class TTSEngine(BaseEngine): voc_ckpt=self.config.voc_ckpt, voc_stat=self.config.voc_stat, lang=self.config.lang) - except: - logger.info("Initialize TTS server engine Failed.") + except BaseException: + logger.error("Failed to get model related files.") + logger.error("Initialize TTS server engine Failed on device: %s." % + (self.device)) return False - logger.info("Initialize TTS server engine successfully.") + logger.info("Initialize TTS server engine successfully on device: %s." % + (self.device)) return True def postprocess(self, wav, original_fs: int, - target_fs: int=16000, + target_fs: int=0, volume: float=1.0, speed: float=1.0, audio_path: str=None): @@ -104,35 +120,50 @@ class TTSEngine(BaseEngine): if target_fs == 0 or target_fs > original_fs: target_fs = original_fs wav_tar_fs = wav + logger.info( + "The sample rate of synthesized audio is the same as model, which is {}Hz". + format(original_fs)) else: wav_tar_fs = librosa.resample( np.squeeze(wav), original_fs, target_fs) - + logger.info( + "The sample rate of model is {}Hz and the target sample rate is {}Hz. Converting the sample rate of the synthesized audio successfully.". + format(original_fs, target_fs)) # transform volume wav_vol = wav_tar_fs * volume + logger.info("Transform the volume of the audio successfully.") # transform speed try: # windows not support soxbindings wav_speed = change_speed(wav_vol, speed, target_fs) - except: + logger.info("Transform the speed of the audio successfully.") + except ServerBaseException: raise ServerBaseException( ErrorCode.SERVER_INTERNAL_ERR, - "Can not install soxbindings on your system.") + "Failed to transform speed. Can not install soxbindings on your system. 
+                You need to set speed value 1.0.")
+        except BaseException:
+            logger.error("Failed to transform speed.")

         # wav to base64
         buf = io.BytesIO()
         wavfile.write(buf, target_fs, wav_speed)
         base64_bytes = base64.b64encode(buf.read())
         wav_base64 = base64_bytes.decode('utf-8')
+        logger.info("Audio to string successfully.")

         # save audio
-        if audio_path is not None and audio_path.endswith(".wav"):
-            sf.write(audio_path, wav_speed, target_fs)
-        elif audio_path is not None and audio_path.endswith(".pcm"):
-            wav_norm = wav_speed * (32767 / max(0.001,
-                                                np.max(np.abs(wav_speed))))
-            with open(audio_path, "wb") as f:
-                f.write(wav_norm.astype(np.int16))
+        if audio_path is not None:
+            if audio_path.endswith(".wav"):
+                sf.write(audio_path, wav_speed, target_fs)
+            elif audio_path.endswith(".pcm"):
+                wav_norm = wav_speed * (32767 / max(0.001,
+                                                    np.max(np.abs(wav_speed))))
+                with open(audio_path, "wb") as f:
+                    f.write(wav_norm.astype(np.int16))
+            logger.info("Save audio to {} successfully.".format(audio_path))
+        else:
+            logger.info("There is no need to save audio.")

         return target_fs, wav_base64

@@ -168,13 +199,23 @@ class TTSEngine(BaseEngine):
         lang = self.config.lang

         try:
+            infer_st = time.time()
             self.executor.infer(
                 text=sentence, lang=lang, am=self.config.am, spk_id=spk_id)
-        except:
+            infer_et = time.time()
+            infer_time = infer_et - infer_st
+            duration = len(self.executor._outputs['wav']
+                           .numpy()) / self.executor.am_config.fs
+            rtf = infer_time / duration
+
+        except ServerBaseException:
             raise ServerBaseException(ErrorCode.SERVER_INTERNAL_ERR,
                                       "tts infer failed.")
+        except BaseException:
+            logger.error("tts infer failed.")

         try:
+            postprocess_st = time.time()
             target_sample_rate, wav_base64 = self.postprocess(
                 wav=self.executor._outputs['wav'].numpy(),
                 original_fs=self.executor.am_config.fs,
@@ -182,8 +223,32 @@ class TTSEngine(BaseEngine):
                 volume=volume,
                 speed=speed,
                 audio_path=save_path)
-        except:
+            postprocess_et = time.time()
+            postprocess_time = postprocess_et - postprocess_st
+
+        except ServerBaseException:
             raise ServerBaseException(ErrorCode.SERVER_INTERNAL_ERR,
                                       "tts postprocess failed.")
+        except BaseException:
+            logger.error("tts postprocess failed.")
+
+        logger.info("AM model: {}".format(self.config.am))
+        logger.info("Vocoder model: {}".format(self.config.voc))
+        logger.info("Language: {}".format(lang))
+        logger.info("tts engine type: python")
+
+        logger.info("audio duration: {}".format(duration))
+        logger.info(
+            "frontend inference time: {}".format(self.executor.frontend_time))
+        logger.info("AM inference time: {}".format(self.executor.am_time))
+        logger.info("Vocoder inference time: {}".format(self.executor.voc_time))
+        logger.info("total inference time: {}".format(infer_time))
+        logger.info(
+            "postprocess (change speed, volume, target sample rate) time: {}".
+            format(postprocess_time))
+        logger.info("total generate audio time: {}".format(infer_time +
+                                                           postprocess_time))
+        logger.info("RTF: {}".format(rtf))
+        logger.info("device: {}".format(self.device))

         return lang, target_sample_rate, wav_base64
diff --git a/paddlespeech/server/restful/asr_api.py b/paddlespeech/server/restful/asr_api.py
index 4806c0421da0384c20297670869538b4ff17a169..cf46735dcc84dc92c8bfcfa71b426604ed7c1843 100644
--- a/paddlespeech/server/restful/asr_api.py
+++ b/paddlespeech/server/restful/asr_api.py
@@ -14,6 +14,7 @@
 import base64
 import traceback
 from typing import Union
+
 from fastapi import APIRouter

 from paddlespeech.server.engine.engine_pool import get_engine_pool
@@ -83,7 +84,7 @@ def asr(request_body: ASRRequest):

     except ServerBaseException as e:
         response = failed_response(e.error_code, e.msg)
-    except:
+    except BaseException:
         response = failed_response(ErrorCode.SERVER_UNKOWN_ERR)
         traceback.print_exc()

diff --git a/paddlespeech/server/restful/request.py b/paddlespeech/server/restful/request.py
index 2be5f0e546dee6c1c042820ac1a3838a446e23ea..28908801977d346e56a24ba075263a33f37e7d34 100644
--- a/paddlespeech/server/restful/request.py
+++ b/paddlespeech/server/restful/request.py
@@ -11,7 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from typing import List
 from typing import Optional

 from pydantic import BaseModel
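The timing hunks in the two TTS engines above all follow the same bookkeeping: wall-clock each stage with `time.time()`, derive the audio duration from the sample count and sample rate, and report RTF (real-time factor, inference time divided by audio duration; below 1.0 means faster than real time). A minimal sketch of that pattern, with `synthesize` as a hypothetical stand-in for `executor.infer`:

```python
import time

import numpy as np


def profile_tts(synthesize, text: str, sample_rate: int):
    """Time one synthesis call and compute its real-time factor (RTF).

    `synthesize` is assumed to return a 1-D float waveform.
    """
    start = time.time()
    wav = np.asarray(synthesize(text))
    infer_time = time.time() - start
    duration = wav.size / sample_rate  # seconds of audio produced
    rtf = infer_time / duration       # < 1.0 means faster than real time
    return wav, infer_time, duration, rtf
```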
diff --git a/paddlespeech/server/restful/response.py b/paddlespeech/server/restful/response.py
index ab5e395ba6914482e320d13abf2744e2fef71ec0..4e18ee0d790248313b6f14f068ac3f37a33aeba6 100644
--- a/paddlespeech/server/restful/response.py
+++ b/paddlespeech/server/restful/response.py
@@ -11,9 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from typing import List
-from typing import Optional
-
 from pydantic import BaseModel

 __all__ = ['ASRResponse', 'TTSResponse']
diff --git a/paddlespeech/server/restful/tts_api.py b/paddlespeech/server/restful/tts_api.py
index d5fa1d42c4db0e822ab2d545ad69225ebb382222..0af0f6d07901d91887b401d5a2dfb411aa9d80b9 100644
--- a/paddlespeech/server/restful/tts_api.py
+++ b/paddlespeech/server/restful/tts_api.py
@@ -16,7 +16,8 @@ from typing import Union

 from fastapi import APIRouter

-from paddlespeech.server.engine.tts.paddleinference.tts_engine import TTSEngine
+from paddlespeech.cli.log import logger
+from paddlespeech.server.engine.engine_pool import get_engine_pool
 from paddlespeech.server.restful.request import TTSRequest
 from paddlespeech.server.restful.response import ErrorResponse
 from paddlespeech.server.restful.response import TTSResponse
@@ -60,28 +61,45 @@ def tts(request_body: TTSRequest):
     Returns:
         json: [description]
     """
-    # json to dict
-    item_dict = request_body.dict()
-    sentence = item_dict['text']
-    spk_id = item_dict['spk_id']
-    speed = item_dict['speed']
-    volume = item_dict['volume']
-    sample_rate = item_dict['sample_rate']
-    save_path = item_dict['save_path']
-
-    # Check parameters
-    if speed <=0 or speed > 3 or volume <=0 or volume > 3 or \
-        sample_rate not in [0, 16000, 8000] or \
-        (save_path is not None and not save_path.endswith("pcm") and not save_path.endswith("wav")):
-        return failed_response(ErrorCode.SERVER_PARAM_ERR)
+    logger.info("request: {}".format(request_body))
+
+    # get params
+    text = request_body.text
+    spk_id = request_body.spk_id
+    speed = request_body.speed
+    volume = request_body.volume
+    sample_rate = request_body.sample_rate
+    save_path = request_body.save_path

-    # single
-    tts_engine = TTSEngine()
+    # Check parameters
+    if speed <= 0 or speed > 3:
+        return failed_response(
+            ErrorCode.SERVER_PARAM_ERR,
+            "invalid speed value, the value should be between 0 and 3.")
+    if volume <= 0 or volume > 3:
+        return failed_response(
+            ErrorCode.SERVER_PARAM_ERR,
+            "invalid volume value, the value should be between 0 and 3.")
+    if sample_rate not in [0, 16000, 8000]:
+        return failed_response(
+            ErrorCode.SERVER_PARAM_ERR,
+            "invalid sample_rate value, the choice of value is 0, 8000, 16000.")
+    if save_path is not None and not save_path.endswith(
+            "pcm") and not save_path.endswith("wav"):
+        return failed_response(
+            ErrorCode.SERVER_PARAM_ERR,
+            "invalid save_path, saved audio formats support pcm and wav")

     # run
     try:
+        # get single engine from engine pool
+        engine_pool = get_engine_pool()
+        tts_engine = engine_pool['tts']
+        logger.info("Get tts engine successfully.")
+
         lang, target_sample_rate, wav_base64 = tts_engine.run(
-            sentence, spk_id, speed, volume, sample_rate, save_path)
+            text, spk_id, speed, volume, sample_rate, save_path)

         response = {
             "success": True,
@@ -101,7 +119,7 @@ def tts(request_body: TTSRequest):
         }
     except ServerBaseException as e:
         response = failed_response(e.error_code, e.msg)
-    except:
+    except BaseException:
         response = failed_response(ErrorCode.SERVER_UNKOWN_ERR)
         traceback.print_exc()

diff --git a/paddlespeech/server/tests/asr/http_client.py b/paddlespeech/server/tests/asr/http_client.py
index 14adb5741989790140fa509bb4e6eeca1b48546f..49f2adf7c28954af1fc2efc42b81169989ad471e 100644
--- a/paddlespeech/server/tests/asr/http_client.py
+++ b/paddlespeech/server/tests/asr/http_client.py
@@ -10,11 +10,11 @@
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import requests
+import base64
 import json
 import time
-import base64
-import io
+
+import requests


 def readwav2base64(wav_file):
@@ -34,23 +34,23 @@ def main():
     url = "http://127.0.0.1:8090/paddlespeech/asr"

     # start Timestamp
-    time_start=time.time()
+    time_start = time.time()
     test_audio_dir = "./16_audio.wav"
     audio = readwav2base64(test_audio_dir)
     data = {
-            "audio": audio,
-            "audio_format": "wav",
-            "sample_rate": 16000,
-            "lang": "zh_cn",
-    }
+        "audio": audio,
+        "audio_format": "wav",
+        "sample_rate": 16000,
+        "lang": "zh_cn",
+    }

     r = requests.post(url=url, data=json.dumps(data))

     # ending Timestamp
-    time_end=time.time()
-    print('time cost',time_end - time_start, 's')
+    time_end = time.time()
+    print('time cost', time_end - time_start, 's')

     print(r.json())
diff --git a/paddlespeech/server/tests/tts/test_client.py b/paddlespeech/server/tests/tts/test_client.py
index 65f4ccfece121f5ab472fe3a2e9e2f34244136b9..e42c9bcfa1cf586333ca333251f63e9b50a1b62f 100644
--- a/paddlespeech/server/tests/tts/test_client.py
+++ b/paddlespeech/server/tests/tts/test_client.py
@@ -25,6 +25,7 @@ import soundfile

 from paddlespeech.server.utils.audio_process import wav2pcm

+
 # Request and response
 def tts_client(args):
     """ Request and response
@@ -99,5 +100,5 @@ if __name__ == "__main__":
         print("Inference time: %f" % (time_consume))
         print("The duration of synthesized audio: %f" % (duration))
         print("The RTF is: %f" % (rtf))
-    except:
+    except BaseException:
         print("Failed to synthesized audio.")
diff --git a/paddlespeech/server/util.py b/paddlespeech/server/util.py
index 48c4b8cbd09a51460c1150f4a45a43ed9244a4c9..1f1b0be1bd82f112bfa7c6162fde42c236739243 100644
--- a/paddlespeech/server/util.py
+++ b/paddlespeech/server/util.py
@@ -219,7 +219,7 @@ class ConfigCache:
             try:
                 cfg = yaml.load(file, Loader=yaml.FullLoader)
                 self._data.update(cfg)
-            except:
+            except BaseException:
                 self.flush()

     @property
diff --git a/paddlespeech/server/utils/paddle_predictor.py b/paddlespeech/server/utils/paddle_predictor.py
index f910161b88896e054439c855da3efcdad10b21ae..4035d48d8c9928aa9c537ec3be25eb606a68960b 100644
--- a/paddlespeech/server/utils/paddle_predictor.py
+++ b/paddlespeech/server/utils/paddle_predictor.py
@@ -15,6 +15,7 @@ import os
 from typing import List
 from typing import Optional

+import paddle
 from paddle.inference import Config
 from paddle.inference import create_predictor

@@ -40,14 +41,30 @@ def init_predictor(model_dir: Optional[os.PathLike]=None,
     else:
         config = Config(model_file, params_file)

-    config.enable_memory_optim()
-    if predictor_conf["use_gpu"]:
-        config.enable_use_gpu(1000, 0)
-    if predictor_conf["enable_mkldnn"]:
-        config.enable_mkldnn()
+    # set device
+    if predictor_conf["device"]:
+        device = predictor_conf["device"]
+    else:
+        device = paddle.get_device()
+    if "gpu" in device:
+        gpu_id = device.split(":")[-1]
+        config.enable_use_gpu(1000, int(gpu_id))
+
+    # IR optim
     if predictor_conf["switch_ir_optim"]:
         config.switch_ir_optim()

+    # glog
+    if not predictor_conf["glog_info"]:
+        config.disable_glog_info()
+
+    # config summary
+    if predictor_conf["summary"]:
+        print(config.summary())
+
+    # memory optim
+    config.enable_memory_optim()
+
     predictor = create_predictor(config)
     return predictor
diff --git a/paddlespeech/t2s/datasets/dataset.py b/paddlespeech/t2s/datasets/dataset.py
index f81c2877ca8214833ba71188db8659ea3f701758..2d6c03cb19c585a0736e1da61266d31e88b90dc8 100644
--- a/paddlespeech/t2s/datasets/dataset.py
+++ b/paddlespeech/t2s/datasets/dataset.py
@@ -258,4 +258,4 @@ class ChainDataset(Dataset):
             return dataset[i]
         i -= len(dataset)

-    raise IndexError("dataset index out of range")
\ No newline at end of file
+    raise IndexError("dataset index out of range")
diff --git a/paddlespeech/t2s/exps/synthesize.py b/paddlespeech/t2s/exps/synthesize.py
index 1c42a87c9ff7574326b25e2c3e6cf0edcb5bef4e..81da14f2eaed69743c095ac82455b4629cd13b9c 100644
--- a/paddlespeech/t2s/exps/synthesize.py
+++ b/paddlespeech/t2s/exps/synthesize.py
@@ -20,6 +20,7 @@ import numpy as np
 import paddle
 import soundfile as sf
 import yaml
+from timer import timer
 from yacs.config import CfgNode

 from paddlespeech.s2t.utils.dynamic_import import dynamic_import
@@ -50,6 +51,18 @@ model_alias = {
     "paddlespeech.t2s.models.melgan:MelGANGenerator",
     "mb_melgan_inference":
     "paddlespeech.t2s.models.melgan:MelGANInference",
+    "style_melgan":
+    "paddlespeech.t2s.models.melgan:StyleMelGANGenerator",
+    "style_melgan_inference":
+    "paddlespeech.t2s.models.melgan:StyleMelGANInference",
+    "hifigan":
+    "paddlespeech.t2s.models.hifigan:HiFiGANGenerator",
+    "hifigan_inference":
+    "paddlespeech.t2s.models.hifigan:HiFiGANInference",
+    "wavernn":
+    "paddlespeech.t2s.models.wavernn:WaveRNN",
+    "wavernn_inference":
+    "paddlespeech.t2s.models.wavernn:WaveRNNInference",
 }


@@ -146,10 +159,15 @@ def evaluate(args):
     voc_name = args.voc[:args.voc.rindex('_')]
     voc_class = dynamic_import(voc_name, model_alias)
     voc_inference_class = dynamic_import(voc_name + '_inference', model_alias)
-    voc = voc_class(**voc_config["generator_params"])
-    voc.set_state_dict(paddle.load(args.voc_ckpt)["generator_params"])
-    voc.remove_weight_norm()
-    voc.eval()
+    if voc_name != 'wavernn':
+        voc = voc_class(**voc_config["generator_params"])
+        voc.set_state_dict(paddle.load(args.voc_ckpt)["generator_params"])
+        voc.remove_weight_norm()
+        voc.eval()
+    else:
+        voc = voc_class(**voc_config["model"])
+        voc.set_state_dict(paddle.load(args.voc_ckpt)["main_params"])
+        voc.eval()
     voc_mu, voc_std = np.load(args.voc_stat)
     voc_mu = paddle.to_tensor(voc_mu)
     voc_std = paddle.to_tensor(voc_std)
@@ -162,38 +180,51 @@ def evaluate(args):
     output_dir = Path(args.output_dir)
     output_dir.mkdir(parents=True, exist_ok=True)

+    N = 0
+    T = 0
+
     for datum in test_dataset:
         utt_id = datum["utt_id"]
-        with paddle.no_grad():
-            # acoustic model
-            if am_name == 'fastspeech2':
-                phone_ids = paddle.to_tensor(datum["text"])
-                spk_emb = None
-                spk_id = None
-                # multi speaker
-                if args.voice_cloning and "spk_emb" in datum:
-                    spk_emb = paddle.to_tensor(np.load(datum["spk_emb"]))
-                elif "spk_id" in datum:
-                    spk_id = paddle.to_tensor(datum["spk_id"])
-                mel = am_inference(phone_ids, spk_id=spk_id, spk_emb=spk_emb)
-            elif am_name == 'speedyspeech':
-                phone_ids = paddle.to_tensor(datum["phones"])
-                tone_ids = paddle.to_tensor(datum["tones"])
-                mel = am_inference(phone_ids, tone_ids)
-            elif am_name == 'tacotron2':
-                phone_ids = paddle.to_tensor(datum["text"])
-                spk_emb = None
-                # multi speaker
-                if args.voice_cloning and "spk_emb" in datum:
-                    spk_emb = paddle.to_tensor(np.load(datum["spk_emb"]))
-                mel = am_inference(phone_ids, spk_emb=spk_emb)
+        with timer() as t:
+            with paddle.no_grad():
+                # acoustic model
+                if am_name == 'fastspeech2':
+                    phone_ids = paddle.to_tensor(datum["text"])
+                    spk_emb = None
+                    spk_id = None
+                    # multi speaker
+                    if args.voice_cloning and "spk_emb" in datum:
+                        spk_emb = paddle.to_tensor(np.load(datum["spk_emb"]))
+                    elif "spk_id" in datum:
+                        spk_id = paddle.to_tensor(datum["spk_id"])
+                    mel = am_inference(
+                        phone_ids, spk_id=spk_id, spk_emb=spk_emb)
+                elif am_name == 'speedyspeech':
+                    phone_ids = paddle.to_tensor(datum["phones"])
+                    tone_ids = paddle.to_tensor(datum["tones"])
+                    mel = am_inference(phone_ids, tone_ids)
+                elif am_name == 'tacotron2':
+                    phone_ids = paddle.to_tensor(datum["text"])
+                    spk_emb = None
+                    # multi speaker
+                    if args.voice_cloning and "spk_emb" in datum:
+                        spk_emb = paddle.to_tensor(np.load(datum["spk_emb"]))
+                    mel = am_inference(phone_ids, spk_emb=spk_emb)

             # vocoder
             wav = voc_inference(mel)
+
+        wav = wav.numpy()
+        N += wav.size
+        T += t.elapse
+        speed = wav.size / t.elapse
+        rtf = am_config.fs / speed
+        print(
+            f"{utt_id}, mel: {mel.shape}, wave: {wav.size}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}."
+        )
         sf.write(
-            str(output_dir / (utt_id + ".wav")),
-            wav.numpy(),
-            samplerate=am_config.fs)
+            str(output_dir / (utt_id + ".wav")), wav, samplerate=am_config.fs)
         print(f"{utt_id} done!")
+    print(f"generation speed: {N / T}Hz, RTF: {am_config.fs / (N / T) }")


 def main():
@@ -246,7 +277,8 @@ def main():
         default='pwgan_csmsc',
         choices=[
             'pwgan_csmsc', 'pwgan_ljspeech', 'pwgan_aishell3', 'pwgan_vctk',
-            'mb_melgan_csmsc'
+            'mb_melgan_csmsc', 'wavernn_csmsc', 'hifigan_csmsc',
+            'style_melgan_csmsc'
         ],
         help='Choose vocoder type of tts task.')
diff --git a/paddlespeech/t2s/exps/synthesize_e2e.py b/paddlespeech/t2s/exps/synthesize_e2e.py
index 75c631b847a1459ec56fbb32f97c88ce6ee8fce9..94180f8531af480b5e2a68d3cd72dc5e9070f3ce 100644
--- a/paddlespeech/t2s/exps/synthesize_e2e.py
+++ b/paddlespeech/t2s/exps/synthesize_e2e.py
@@ -21,6 +21,7 @@ import soundfile as sf
 import yaml
 from paddle import jit
 from paddle.static import InputSpec
+from timer import timer
 from yacs.config import CfgNode

 from paddlespeech.s2t.utils.dynamic_import import dynamic_import
@@ -194,10 +195,10 @@ def evaluate(args):
             am_inference = jit.to_static(
                 am_inference,
                 input_spec=[
-                    InputSpec([-1], dtype=paddle.int64),  # text
-                    InputSpec([-1], dtype=paddle.int64),  # tone
-                    None,  # duration
-                    InputSpec([-1], dtype=paddle.int64)  # spk_id
+                    InputSpec([-1], dtype=paddle.int64),  # text
+                    InputSpec([-1], dtype=paddle.int64),  # tone
+                    InputSpec([1], dtype=paddle.int64),  # spk_id
+                    None  # duration
                 ])
         else:
             am_inference = jit.to_static(
@@ -233,59 +234,68 @@ def evaluate(args):
     # but still not stopping in the end (NOTE by yuantian01 Feb 9 2022)
     if am_name == 'tacotron2':
         merge_sentences = True
-
+    N = 0
+    T = 0
     for utt_id, sentence in sentences:
-        get_tone_ids = False
-        if am_name == 'speedyspeech':
-            get_tone_ids = True
-        if args.lang == 'zh':
-            input_ids = frontend.get_input_ids(
-                sentence,
-                merge_sentences=merge_sentences,
-                get_tone_ids=get_tone_ids)
-            phone_ids = input_ids["phone_ids"]
-            if get_tone_ids:
-                tone_ids = input_ids["tone_ids"]
-        elif args.lang == 'en':
-            input_ids = frontend.get_input_ids(
-                sentence, merge_sentences=merge_sentences)
-            phone_ids = input_ids["phone_ids"]
-        else:
-            print("lang should in {'zh', 'en'}!")
-        with paddle.no_grad():
-            flags = 0
-            for i in range(len(phone_ids)):
-                part_phone_ids = phone_ids[i]
-                # acoustic model
-                if am_name == 'fastspeech2':
-                    # multi speaker
-                    if am_dataset in {"aishell3", "vctk"}:
-                        spk_id = paddle.to_tensor(args.spk_id)
-                        mel = am_inference(part_phone_ids, spk_id)
-                    else:
+        with timer() as t:
+            get_tone_ids = False
+            if am_name == 'speedyspeech':
+                get_tone_ids = True
+            if args.lang == 'zh':
+                input_ids = frontend.get_input_ids(
+                    sentence,
+                    merge_sentences=merge_sentences,
+                    get_tone_ids=get_tone_ids)
+                phone_ids = input_ids["phone_ids"]
+                if get_tone_ids:
+                    tone_ids = input_ids["tone_ids"]
+            elif args.lang == 'en':
+                input_ids = frontend.get_input_ids(
+                    sentence, merge_sentences=merge_sentences)
+                phone_ids = input_ids["phone_ids"]
+            else:
+                print("lang should be in {'zh', 'en'}!")
+            with paddle.no_grad():
+                flags = 0
+                for i in range(len(phone_ids)):
+                    part_phone_ids = phone_ids[i]
+                    # acoustic model
+                    if am_name == 'fastspeech2':
+                        # multi speaker
+                        if am_dataset in {"aishell3", "vctk"}:
+                            spk_id = paddle.to_tensor(args.spk_id)
+                            mel = am_inference(part_phone_ids, spk_id)
+                        else:
+                            mel = am_inference(part_phone_ids)
+                    elif am_name == 'speedyspeech':
+                        part_tone_ids = tone_ids[i]
+                        if am_dataset in {"aishell3", "vctk"}:
+                            spk_id = paddle.to_tensor(args.spk_id)
+                            mel = am_inference(part_phone_ids, part_tone_ids,
+                                               spk_id)
+                        else:
+                            mel = am_inference(part_phone_ids, part_tone_ids)
+                    elif am_name == 'tacotron2':
                         mel = am_inference(part_phone_ids)
-                elif am_name == 'speedyspeech':
-                    part_tone_ids = tone_ids[i]
-                    if am_dataset in {"aishell3", "vctk"}:
-                        spk_id = paddle.to_tensor(args.spk_id)
-                        mel = am_inference(part_phone_ids, part_tone_ids,
-                                           spk_id)
+                    # vocoder
+                    wav = voc_inference(mel)
+                    if flags == 0:
+                        wav_all = wav
+                        flags = 1
                     else:
-                        mel = am_inference(part_phone_ids, part_tone_ids)
-                elif am_name == 'tacotron2':
-                    mel = am_inference(part_phone_ids)
-                # vocoder
-                wav = voc_inference(mel)
-                if flags == 0:
-                    wav_all = wav
-                    flags = 1
-                else:
-                    wav_all = paddle.concat([wav_all, wav])
+                        wav_all = paddle.concat([wav_all, wav])
+        wav = wav_all.numpy()
+        N += wav.size
+        T += t.elapse
+        speed = wav.size / t.elapse
+        rtf = am_config.fs / speed
+        print(
+            f"{utt_id}, mel: {mel.shape}, wave: {wav.shape}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}."
+        )
         sf.write(
-            str(output_dir / (utt_id + ".wav")),
-            wav_all.numpy(),
-            samplerate=am_config.fs)
+            str(output_dir / (utt_id + ".wav")), wav, samplerate=am_config.fs)
         print(f"{utt_id} done!")
+    print(f"generation speed: {N / T}Hz, RTF: {am_config.fs / (N / T) }")


 def main():
diff --git a/paddlespeech/t2s/exps/wavernn/synthesize.py b/paddlespeech/t2s/exps/wavernn/synthesize.py
index 4357b2825da5a97176841674df98b7af470e0d2f..d23e9cb7ed99104d20d73af6541cab462a4c5e11 100644
--- a/paddlespeech/t2s/exps/wavernn/synthesize.py
+++ b/paddlespeech/t2s/exps/wavernn/synthesize.py
@@ -91,7 +91,7 @@ def main():
         target=config.inference.target,
         overlap=config.inference.overlap,
         mu_law=config.mu_law,
-        gen_display=True)
+        gen_display=False)
     wav = wav.numpy()
     N += wav.size
     T += t.elapse
diff --git a/paddlespeech/t2s/frontend/tone_sandhi.py b/paddlespeech/t2s/frontend/tone_sandhi.py
index 5264e0687557c75023eb8f004350869346e7df6c..07f7fa2b8f8615af73fd656b0abd381e551179f9 100644
--- a/paddlespeech/t2s/frontend/tone_sandhi.py
+++ b/paddlespeech/t2s/frontend/tone_sandhi.py
@@ -63,7 +63,7 @@ class ToneSandhi():
             '扫把', '惦记'
         }
         self.must_not_neural_tone_words = {
-            "男子", "女子", "分子", "原子", "量子", "莲子", "石子", "瓜子", "电子"
+            "男子", "女子", "分子", "原子", "量子", "莲子", "石子", "瓜子", "电子", "人人", "虎虎"
         }
         self.punc = ":,;。?!“”‘’':,;.?!"
@@ -77,7 +77,9 @@
         # reduplication words for n. and v. e.g. 奶奶, 试试, 旺旺
         for j, item in enumerate(word):
-            if j - 1 >= 0 and item == word[j - 1] and pos[0] in {"n", "v", "a"}:
+            if j - 1 >= 0 and item == word[j - 1] and pos[0] in {
+                    "n", "v", "a"
+            } and word not in self.must_not_neural_tone_words:
                 finals[j] = finals[j][:-1] + "5"
         ge_idx = word.find("个")
         if len(word) >= 1 and word[-1] in "吧呢哈啊呐噻嘛吖嗨呐哦哒额滴哩哟喽啰耶喔诶":
diff --git a/paddlespeech/t2s/frontend/zh_frontend.py b/paddlespeech/t2s/frontend/zh_frontend.py
index a905c412d4d9c901fae1d5b80677a472a24c6071..bb8ed5b4919ecfb67d3f54aade65b0d31e1d1a00 100644
--- a/paddlespeech/t2s/frontend/zh_frontend.py
+++ b/paddlespeech/t2s/frontend/zh_frontend.py
@@ -20,7 +20,10 @@ import numpy as np
 import paddle
 from g2pM import G2pM
 from pypinyin import lazy_pinyin
+from pypinyin import load_phrases_dict
+from pypinyin import load_single_dict
 from pypinyin import Style
+from pypinyin_dict.phrase_pinyin_data import large_pinyin

 from paddlespeech.t2s.frontend.generate_lexicon import generate_lexicon
 from paddlespeech.t2s.frontend.tone_sandhi import ToneSandhi
@@ -41,6 +44,8 @@ class Frontend():
             self.g2pM_model = G2pM()
             self.pinyin2phone = generate_lexicon(
                 with_tone=True, with_erhua=False)
+        else:
+            self.__init__pypinyin()
         self.must_erhua = {"小院儿", "胡同儿", "范儿", "老汉儿", "撒欢儿", "寻老礼儿", "妥妥儿"}
         self.not_erhua = {
             "虐儿", "为儿", "护儿", "瞒儿", "救儿", "替儿", "有儿", "一儿", "我儿", "俺儿", "妻儿",
@@ -62,6 +67,23 @@ class Frontend():
             for tone, id in tone_id:
                 self.vocab_tones[tone] = int(id)

+    def __init__pypinyin(self):
+        large_pinyin.load()
+
+        load_phrases_dict({u'开户行': [[u'kai1'], [u'hu4'], [u'hang2']]})
+        load_phrases_dict({u'发卡行': [[u'fa4'], [u'ka3'], [u'hang2']]})
+        load_phrases_dict({u'放款行': [[u'fang4'], [u'kuan3'], [u'hang2']]})
+        load_phrases_dict({u'茧行': [[u'jian3'], [u'hang2']]})
+        load_phrases_dict({u'行号': [[u'hang2'], [u'hao4']]})
+        load_phrases_dict({u'各地': [[u'ge4'], [u'di4']]})
+        load_phrases_dict({u'借还款': [[u'jie4'], [u'huan2'], [u'kuan3']]})
+        load_phrases_dict({u'时间为': [[u'shi2'], [u'jian1'], [u'wei2']]})
+        load_phrases_dict({u'为准': [[u'wei2'], [u'zhun3']]})
+        load_phrases_dict({u'色差': [[u'se4'], [u'cha1']]})
+
+        # adjust the preferred pinyin order for the character "地"
+        load_single_dict({ord(u'地'): u'de,di4'})
+
     def _get_initials_finals(self, word: str) -> List[List[str]]:
         initials = []
         finals = []
diff --git a/paddlespeech/t2s/frontend/zh_normalization/chronology.py b/paddlespeech/t2s/frontend/zh_normalization/chronology.py
index bfa7d2b1969ddb26c72c1846e2cd7a9a0d29bfee..ea51891353ad8c6fe942edcdf7efb22ec60526ce 100644
--- a/paddlespeech/t2s/frontend/zh_normalization/chronology.py
+++ b/paddlespeech/t2s/frontend/zh_normalization/chronology.py
@@ -63,7 +63,10 @@ def replace_time(match) -> str:

     result = f"{num2str(hour)}点"
     if minute.lstrip('0'):
-        result += f"{_time_num2str(minute)}分"
+        if int(minute) == 30:
+            result += "半"
+        else:
+            result += f"{_time_num2str(minute)}分"
     if second and second.lstrip('0'):
         result += f"{_time_num2str(second)}秒"
@@ -71,7 +74,10 @@ def replace_time(match) -> str:
         result += "至"
         result += f"{num2str(hour_2)}点"
         if minute_2.lstrip('0'):
-            result += f"{_time_num2str(minute_2)}分"
+            if int(minute_2) == 30:
+                result += "半"
+            else:
+                result += f"{_time_num2str(minute_2)}分"
         if second_2 and second_2.lstrip('0'):
             result += f"{_time_num2str(second_2)}秒"
diff --git a/paddlespeech/t2s/frontend/zh_normalization/num.py b/paddlespeech/t2s/frontend/zh_normalization/num.py
index 27a2f84651759e50d75c97adb7dcfd2225d9beb7..a83b42a47b70b30452d5908e58d6e7a5b1c2f93c 100644
--- a/paddlespeech/t2s/frontend/zh_normalization/num.py
+++ b/paddlespeech/t2s/frontend/zh_normalization/num.py
@@ -28,7 +28,7 @@ UNITS = OrderedDict({
     8: '亿',
 })

-COM_QUANTIFIERS = '(朵|匹|张|座|回|场|尾|条|个|首|阙|阵|网|炮|顶|丘|棵|只|支|袭|辆|挑|担|颗|壳|窠|曲|墙|群|腔|砣|座|客|贯|扎|捆|刀|令|打|手|罗|坡|山|岭|江|溪|钟|队|单|双|对|出|口|头|脚|板|跳|枝|件|贴|针|线|管|名|位|身|堂|课|本|页|家|户|层|丝|毫|厘|分|钱|两|斤|担|铢|石|钧|锱|忽|(千|毫|微)克|毫|厘|(公)分|分|寸|尺|丈|里|寻|常|铺|程|(千|分|厘|毫|微)米|米|撮|勺|合|升|斗|石|盘|碗|碟|叠|桶|笼|盆|盒|杯|钟|斛|锅|簋|篮|盘|桶|罐|瓶|壶|卮|盏|箩|箱|煲|啖|袋|钵|年|月|日|季|刻|时|周|天|秒|分|旬|纪|岁|世|更|夜|春|夏|秋|冬|代|伏|辈|丸|泡|粒|颗|幢|堆|条|根|支|道|面|片|张|颗|块|元|(亿|千万|百万|万|千|百)|(亿|千万|百万|万|千|百|美|)元|(亿|千万|百万|万|千|百|)块|角|毛|分)'
+COM_QUANTIFIERS = '(所|朵|匹|张|座|回|场|尾|条|个|首|阙|阵|网|炮|顶|丘|棵|只|支|袭|辆|挑|担|颗|壳|窠|曲|墙|群|腔|砣|座|客|贯|扎|捆|刀|令|打|手|罗|坡|山|岭|江|溪|钟|队|单|双|对|出|口|头|脚|板|跳|枝|件|贴|针|线|管|名|位|身|堂|课|本|页|家|户|层|丝|毫|厘|分|钱|两|斤|担|铢|石|钧|锱|忽|(千|毫|微)克|毫|厘|(公)分|分|寸|尺|丈|里|寻|常|铺|程|(千|分|厘|毫|微)米|米|撮|勺|合|升|斗|石|盘|碗|碟|叠|桶|笼|盆|盒|杯|钟|斛|锅|簋|篮|盘|桶|罐|瓶|壶|卮|盏|箩|箱|煲|啖|袋|钵|年|月|日|季|刻|时|周|天|秒|分|小时|旬|纪|岁|世|更|夜|春|夏|秋|冬|代|伏|辈|丸|泡|粒|颗|幢|堆|条|根|支|道|面|片|张|颗|块|元|(亿|千万|百万|万|千|百)|(亿|千万|百万|万|千|百|美|)元|(亿|千万|百万|万|千|百|)块|角|毛|分)'

 # 分数表达式
 RE_FRAC = re.compile(r'(-?)(\d+)/(\d+)')
@@ -110,7 +110,7 @@ def replace_default_num(match):
 # 纯小数
 RE_DECIMAL_NUM = re.compile(r'(-?)((\d+)(\.\d+))' r'|(\.(\d+))')
 # 正整数 + 量词
-RE_POSITIVE_QUANTIFIERS = re.compile(r"(\d+)([多余几])?" + COM_QUANTIFIERS)
+RE_POSITIVE_QUANTIFIERS = re.compile(r"(\d+)([多余几\+])?" + COM_QUANTIFIERS)
 RE_NUMBER = re.compile(r'(-?)((\d+)(\.\d+)?)' r'|(\.(\d+))')
@@ -123,6 +123,8 @@ def replace_positive_quantifier(match) -> str:
     """
     number = match.group(1)
     match_2 = match.group(2)
+    if match_2 == "+":
+        match_2 = "多"
     match_2: str = match_2 if match_2 else ""
     quantifiers: str = match.group(3)
     number: str = num2str(number)
@@ -151,6 +153,7 @@ def replace_number(match) -> str:

 # 范围表达式
 # match.group(1) and match.group(8) are copy from RE_NUMBER
+
 RE_RANGE = re.compile(
     r'((-?)((\d+)(\.\d+)?)|(\.(\d+)))[-~]((-?)((\d+)(\.\d+)?)|(\.(\d+)))')
diff --git a/paddlespeech/t2s/frontend/zh_normalization/text_normlization.py b/paddlespeech/t2s/frontend/zh_normalization/text_normlization.py
index f9d1b8cb859ab5f449a4bc573c6133a101096fa1..bc663c70d77da24c9ef9b21fea64a5b1fc6cf2e9 100644
--- a/paddlespeech/t2s/frontend/zh_normalization/text_normlization.py
+++ b/paddlespeech/t2s/frontend/zh_normalization/text_normlization.py
@@ -63,11 +63,19 @@ class TextNormalizer():
         # Only for pure Chinese here
         if lang == "zh":
             text = text.replace(" ", "")
+            # filter out special characters
+            text = re.sub(r'[《》【】<=>{}()()#&@“”^_|…\\]', '', text)
         text = self.SENTENCE_SPLITOR.sub(r'\1\n', text)
         text = text.strip()
         sentences = [sentence.strip() for sentence in re.split(r'\n+', text)]
         return sentences

+    def _post_replace(self, sentence: str) -> str:
+        sentence = sentence.replace('/', '每')
+        sentence = sentence.replace('~', '至')
+
+        return sentence
+
     def normalize_sentence(self, sentence: str) -> str:
         # basic character conversions
         sentence = tranditional_to_simplified(sentence)
@@ -97,6 +105,7 @@ class TextNormalizer():
                                  sentence)
         sentence = RE_DEFAULT_NUM.sub(replace_default_num, sentence)
         sentence = RE_NUMBER.sub(replace_number, sentence)
+        sentence = self._post_replace(sentence)

         return sentence
diff --git a/paddlespeech/t2s/models/melgan/melgan.py b/paddlespeech/t2s/models/melgan/melgan.py
index 6a13965913320f173ab66fdf40bdde0e5fa7d98f..22d8fd9e764c5c7f3c71ca1e2d17acc641a029cd 100644
--- a/paddlespeech/t2s/models/melgan/melgan.py
+++ b/paddlespeech/t2s/models/melgan/melgan.py
@@ -66,7 +66,7 @@ class MelGANGenerator(nn.Layer):
             nonlinear_activation_params (Dict[str, Any], optional): Parameters passed to the linear activation in the upsample network, by default {}
             pad (str): Padding function module name before dilated convolution layer.
-            pad_params (dict): Hyperparameters for padding function. 
+            pad_params (dict): Hyperparameters for padding function.
             use_final_nonlinear_activation (nn.Layer): Activation function for the final layer.
             use_weight_norm (bool): Whether to use weight norm.
                 If set to true, it will be applied to all of the conv layers.
diff --git a/paddlespeech/t2s/models/speedyspeech/speedyspeech.py b/paddlespeech/t2s/models/speedyspeech/speedyspeech.py
index 42e8f743225373f1e543ffdbdf75802756de6bd4..44ccfc60ff3508eaf06b450a08e53418c82bcc12 100644
--- a/paddlespeech/t2s/models/speedyspeech/speedyspeech.py
+++ b/paddlespeech/t2s/models/speedyspeech/speedyspeech.py
@@ -247,7 +247,7 @@ class SpeedySpeechInference(nn.Layer):
         self.normalizer = normalizer
         self.acoustic_model = speedyspeech_model

-    def forward(self, phones, tones, durations=None, spk_id=None):
+    def forward(self, phones, tones, spk_id=None, durations=None):
         normalized_mel = self.acoustic_model.inference(
             phones, tones, durations=durations, spk_id=spk_id)
         logmel = self.normalizer.inverse(normalized_mel)
diff --git a/paddlespeech/t2s/models/wavernn/wavernn.py b/paddlespeech/t2s/models/wavernn/wavernn.py
index 1320ffa34c50774625113ac9f9600ca55c6572cb..9590704328b757b27a6626e8e3a5c675da41c8e0 100644
--- a/paddlespeech/t2s/models/wavernn/wavernn.py
+++ b/paddlespeech/t2s/models/wavernn/wavernn.py
@@ -509,16 +509,20 @@ class WaveRNN(nn.Layer):
         total_len = num_folds * (target + overlap) + overlap

         # Need some silence for the run warmup
-        slience_len = overlap // 2
+        slience_len = 0
+        linear_len = slience_len
         fade_len = overlap - slience_len
         slience = paddle.zeros([slience_len], dtype=paddle.float32)
-        linear = paddle.ones([fade_len], dtype=paddle.float32)
+        linear = paddle.ones([linear_len], dtype=paddle.float32)

         # Equal power crossfade
         # fade_in increase from 0 to 1, fade_out reduces from 1 to 0
-        t = paddle.linspace(-1, 1, fade_len, dtype=paddle.float32)
-        fade_in = paddle.sqrt(0.5 * (1 + t))
-        fade_out = paddle.sqrt(0.5 * (1 - t))
+        sigmoid_scale = 2.3
+        t = paddle.linspace(
+            -sigmoid_scale, sigmoid_scale, fade_len, dtype=paddle.float32)
+        # a sigmoid curve should give a smoother crossfade
+        fade_in = paddle.nn.functional.sigmoid(t)
+        fade_out = 1 - paddle.nn.functional.sigmoid(t)

         # Concat the silence to the fades
         fade_out = paddle.concat([linear, fade_out])
         fade_in = paddle.concat([slience, fade_in])
diff --git a/paddlespeech/t2s/modules/transformer/repeat.py b/paddlespeech/t2s/modules/transformer/repeat.py
index 2073a78b9330201dba15b42badf77cee0caceab1..1e946adf7e469fd6c05c2a8c8d9e6f16f638524e 100644
--- a/paddlespeech/t2s/modules/transformer/repeat.py
+++ b/paddlespeech/t2s/modules/transformer/repeat.py
@@ -36,4 +36,4 @@ def repeat(N, fn):

     Returns:
         MultiSequential: Repeated model instance.
""" - return MultiSequential(*[fn(n) for n in range(N)]) + return MultiSequential(* [fn(n) for n in range(N)]) diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 760821662d0251a3ba8c8f0c8e50bf1d4795a774..0000000000000000000000000000000000000000 --- a/requirements.txt +++ /dev/null @@ -1,48 +0,0 @@ -ConfigArgParse -coverage -editdistance -g2p_en -g2pM -gpustat -h5py -inflect -jieba -jsonlines -kaldiio -librosa -loguru -matplotlib -nara_wpe -nltk -paddleaudio -paddlenlp -paddlespeech_ctcdecoders -paddlespeech_feat -pandas -phkit -Pillow -praatio==5.0.0 -pre-commit -pybind11 -pypi-kenlm -pypinyin -python-dateutil -pyworld -resampy==0.2.2 -sacrebleu -scipy -sentencepiece~=0.1.96 -snakeviz -soundfile~=0.10 -sox -soxbindings -textgrid -timer -tqdm -typeguard -unidecode -visualdl -webrtcvad -yacs~=0.1.8 -yq -zhon diff --git a/setup.py b/setup.py index 9bb11d0dd5829b9f551fca46867e752d71b03c43..f86758bab25d9b5283126054834777f4a3e7f478 100644 --- a/setup.py +++ b/setup.py @@ -27,46 +27,54 @@ from setuptools.command.install import install HERE = Path(os.path.abspath(os.path.dirname(__file__))) -VERSION = '0.1.1' +VERSION = '0.1.2' + +base = [ + "editdistance", + "g2p_en", + "g2pM", + "h5py", + "inflect", + "jieba", + "jsonlines", + "kaldiio", + "librosa==0.8.1", + "loguru", + "matplotlib", + "nara_wpe", + "pandas", + "paddleaudio", + "paddlenlp", + "paddlespeech_feat", + "praatio==5.0.0", + "pypinyin", + "pypinyin-dict", + "python-dateutil", + "pyworld", + "resampy==0.2.2", + "sacrebleu", + "scipy", + "sentencepiece~=0.1.96", + "soundfile~=0.10", + "textgrid", + "timer", + "tqdm", + "typeguard", + "visualdl", + "webrtcvad", + "yacs~=0.1.8", + "prettytable", +] + +server = [ + "fastapi", + "uvicorn", + "pattern_singleton", +] requirements = { - "install": [ - "editdistance", - "g2p_en", - "g2pM", - "h5py", - "inflect", - "jieba", - "jsonlines", - "kaldiio", - "librosa", - "loguru", - "matplotlib", - "nara_wpe", - "pandas", - "paddleaudio", - "paddlenlp", - "paddlespeech_feat", - "praatio==5.0.0", - "pypinyin", - "python-dateutil", - "pyworld", - "resampy==0.2.2", - "sacrebleu", - "scipy", - "sentencepiece~=0.1.96", - "soundfile~=0.10", - "textgrid", - "timer", - "tqdm", - "typeguard", - "visualdl", - "webrtcvad", - "yacs~=0.1.8", - # fastapi server - "fastapi", - "uvicorn", - ], + "install": + base + server, "develop": [ "ConfigArgParse", "coverage", diff --git a/tests/test_tipc/configs/conformer/train_benchmark.txt b/tests/test_tipc/configs/conformer/train_infer_python.txt similarity index 91% rename from tests/test_tipc/configs/conformer/train_benchmark.txt rename to tests/test_tipc/configs/conformer/train_infer_python.txt index 3833f144c6f9642ca3720caf0a0ddbaeaae5bd5d..33b1debdc59a8bfb22c1787064940020815dd9df 100644 --- a/tests/test_tipc/configs/conformer/train_benchmark.txt +++ b/tests/test_tipc/configs/conformer/train_infer_python.txt @@ -54,4 +54,4 @@ batch_size:16|30 fp_items:fp32 iteration:50 --profiler-options:"batch_range=[10,35];state=GPU;tracer_option=Default;profile_path=model.profile" -flags:FLAGS_eager_delete_tensor_gb=0.0;FLAGS_fraction_of_gpu_memory_to_use=0.98;FLAGS_conv_workspace_size_limit=4096 +flags:null diff --git a/tests/test_tipc/configs/pwgan/train_benchmark.txt b/tests/test_tipc/configs/pwgan/train_infer_python.txt similarity index 91% rename from tests/test_tipc/configs/pwgan/train_benchmark.txt rename to tests/test_tipc/configs/pwgan/train_infer_python.txt index e936da3c2bc1ebc3e289e3d47b323c147d885562..c64984dcfc0439c6fc458d34d55adafa4dcbcdad 
--- a/tests/test_tipc/configs/pwgan/train_benchmark.txt
+++ b/tests/test_tipc/configs/pwgan/train_infer_python.txt
@@ -54,4 +54,4 @@ batch_size:6|16
 fp_items:fp32
 iteration:50
 --profiler_options:"batch_range=[10,35];state=GPU;tracer_option=Default;profile_path=model.profile"
-flags:FLAGS_eager_delete_tensor_gb=0.0;FLAGS_fraction_of_gpu_memory_to_use=0.98;FLAGS_conv_workspace_size_limit=4096
+flags:null
diff --git a/tests/test_tipc/prepare.sh b/tests/test_tipc/prepare.sh
index 0280e5d411d156ebd99452db9100db1fddce82fe..b46b203223fca9cc9de88b34d868b09c99446e0c 100644
--- a/tests/test_tipc/prepare.sh
+++ b/tests/test_tipc/prepare.sh
@@ -26,15 +26,19 @@ if [ ${MODE} = "benchmark_train" ];then
     curPath=$(readlink -f "$(dirname "$0")")
     echo "curPath:"${curPath}
     cd ${curPath}/../..
-    pip install .
+    apt-get install libsndfile1
+    pip install pytest-runner kaldiio setuptools_scm -i https://pypi.tuna.tsinghua.edu.cn/simple
+    pip install . -i https://pypi.tuna.tsinghua.edu.cn/simple
     cd -
     if [ ${model_name} == "conformer" ]; then
         # set the URL for aishell_tiny dataset
-        URL='None'
+        URL=${conformer_data_URL:-"None"}
         echo "URL:"${URL}
         if [ ${URL} == 'None' ];then
             echo "please contact author to get the URL.\n"
             exit
+        else
+            wget -P ${curPath}/../../dataset/aishell/ ${URL}
         fi
         sed -i "s#^URL_ROOT_TAG#URL_ROOT = '${URL}'#g" ${curPath}/conformer/scripts/aishell_tiny.py
         cp ${curPath}/conformer/scripts/aishell_tiny.py ${curPath}/../../dataset/aishell/
@@ -42,6 +46,7 @@ if [ ${MODE} = "benchmark_train" ];then
         source path.sh
         # download audio data
         sed -i "s#aishell.py#aishell_tiny.py#g" ./local/data.sh
+        sed -i "s#python3#python#g" ./local/data.sh
         bash ./local/data.sh || exit -1
         if [ $? -ne 0 ]; then
             exit 1
@@ -56,7 +61,6 @@ if [ ${MODE} = "benchmark_train" ];then
         sed -i "s#conf/#test_tipc/conformer/benchmark_train/conf/#g" ${curPath}/conformer/benchmark_train/conf/conformer.yaml
         sed -i "s#data/#test_tipc/conformer/benchmark_train/data/#g" ${curPath}/conformer/benchmark_train/conf/tuning/decode.yaml
         sed -i "s#data/#test_tipc/conformer/benchmark_train/data/#g" ${curPath}/conformer/benchmark_train/conf/preprocess.yaml
-
     fi

     if [ ${model_name} == "pwgan" ]; then
@@ -73,4 +77,4 @@ if [ ${MODE} = "benchmark_train" ];then
         python ../paddlespeech/t2s/exps/gan_vocoder/normalize.py --metadata=dump/test/raw/metadata.jsonl --dumpdir=dump/test/norm --stats=dump/train/feats_stats.npy
     fi

-fi
\ No newline at end of file
+fi
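The unit test added below drives the exported static model through Paddle Inference's handle-based API. Reduced to a single input and output, the request/response pattern it relies on is roughly the following sketch; `export_prefix` is assumed to point at any `paddle.jit.save`d model, not necessarily the DeepSpeech2 one:

```python
import numpy as np
from paddle import inference


def run_static_model(export_prefix: str, x: np.ndarray) -> np.ndarray:
    """Feed one array through a jit-saved model and fetch its first output."""
    config = inference.Config(export_prefix + ".pdmodel",
                              export_prefix + ".pdiparams")
    config.enable_memory_optim()
    predictor = inference.create_predictor(config)

    # copy the input into the predictor-side handle
    input_name = predictor.get_input_names()[0]
    input_handle = predictor.get_input_handle(input_name)
    input_handle.reshape(x.shape)
    input_handle.copy_from_cpu(x)

    predictor.run()

    # copy the first output back to host memory
    output_name = predictor.get_output_names()[0]
    return predictor.get_output_handle(output_name).copy_to_cpu()
```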
diff --git a/tests/unit/asr/deepspeech2_online_model_test.py b/tests/unit/asr/deepspeech2_online_model_test.py
index f623c5acd5066795cfa1cae43c622254a5ac88e0..f23c49263ec033280dc9b1ed0ad1b74b68d714c1 100644
--- a/tests/unit/asr/deepspeech2_online_model_test.py
+++ b/tests/unit/asr/deepspeech2_online_model_test.py
@@ -11,11 +11,15 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import os
+import pickle
 import unittest

 import numpy as np
 import paddle
+from paddle import inference

+from paddlespeech.s2t.models.ds2_online import DeepSpeech2InferModelOnline
 from paddlespeech.s2t.models.ds2_online import DeepSpeech2ModelOnline

@@ -182,5 +186,77 @@ class TestDeepSpeech2ModelOnline(unittest.TestCase):
             paddle.allclose(final_state_c_box, final_state_c_box_chk), True)


+class TestDeepSpeech2StaticModelOnline(unittest.TestCase):
+    def setUp(self):
+        export_prefix = "exp/deepspeech2_online/checkpoints/test_export"
+        if not os.path.exists(os.path.dirname(export_prefix)):
+            os.makedirs(os.path.dirname(export_prefix), mode=0o755)
+        infer_model = DeepSpeech2InferModelOnline(
+            feat_size=161,
+            dict_size=4233,
+            num_conv_layers=2,
+            num_rnn_layers=5,
+            rnn_size=1024,
+            num_fc_layers=0,
+            fc_layers_size_list=[-1],
+            use_gru=False)
+        static_model = infer_model.export()
+        paddle.jit.save(static_model, export_prefix)
+
+        with open("test_data/static_ds2online_inputs.pickle", "rb") as f:
+            self.data_dict = pickle.load(f)
+
+        self.setup_model(export_prefix)
+
+    def setup_model(self, export_prefix):
+        deepspeech_config = inference.Config(export_prefix + ".pdmodel",
+                                             export_prefix + ".pdiparams")
+        if ('CUDA_VISIBLE_DEVICES' in os.environ.keys() and
+                os.environ['CUDA_VISIBLE_DEVICES'].strip() != ''):
+            deepspeech_config.enable_use_gpu(100, 0)
+            deepspeech_config.enable_memory_optim()
+        deepspeech_predictor = inference.create_predictor(deepspeech_config)
+        self.predictor = deepspeech_predictor
+
+    def test_unit(self):
+        input_names = self.predictor.get_input_names()
+        audio_handle = self.predictor.get_input_handle(input_names[0])
+        audio_len_handle = self.predictor.get_input_handle(input_names[1])
+        h_box_handle = self.predictor.get_input_handle(input_names[2])
+        c_box_handle = self.predictor.get_input_handle(input_names[3])
+
+        x_chunk = self.data_dict["audio_chunk"]
+        x_chunk_lens = self.data_dict["audio_chunk_lens"]
+        chunk_state_h_box = self.data_dict["chunk_state_h_box"]
+        chunk_state_c_box = self.data_dict["chunk_state_c_bos"]
+
+        audio_handle.reshape(x_chunk.shape)
+        audio_handle.copy_from_cpu(x_chunk)
+
+        audio_len_handle.reshape(x_chunk_lens.shape)
+        audio_len_handle.copy_from_cpu(x_chunk_lens)
+
+        h_box_handle.reshape(chunk_state_h_box.shape)
+        h_box_handle.copy_from_cpu(chunk_state_h_box)
+
+        c_box_handle.reshape(chunk_state_c_box.shape)
+        c_box_handle.copy_from_cpu(chunk_state_c_box)
+
+        output_names = self.predictor.get_output_names()
+        output_handle = self.predictor.get_output_handle(output_names[0])
+        output_lens_handle = self.predictor.get_output_handle(output_names[1])
+        output_state_h_handle = self.predictor.get_output_handle(
+            output_names[2])
+        output_state_c_handle = self.predictor.get_output_handle(
+            output_names[3])
+        self.predictor.run()
+
+        output_chunk_probs = output_handle.copy_to_cpu()
+        output_chunk_lens = output_lens_handle.copy_to_cpu()
+        chunk_state_h_box = output_state_h_handle.copy_to_cpu()
+        chunk_state_c_box = output_state_c_handle.copy_to_cpu()
+        return True
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/unit/asr/deepspeech2_online_model_test.sh b/tests/unit/asr/deepspeech2_online_model_test.sh
new file mode 100644
index 0000000000000000000000000000000000000000..629238fd04716a5844156898c8697e9b0e158c9f
--- /dev/null
+++ b/tests/unit/asr/deepspeech2_online_model_test.sh
@@ -0,0 +1,3 @@
+mkdir -p ./test_data
+wget -P ./test_data https://paddlespeech.bj.bcebos.com/datasets/unit_test/asr/static_ds2online_inputs.pickle
+python deepspeech2_online_model_test.py
diff --git a/utils/DER.py b/utils/DER.py
index 5b62094dfbe730c00f5201157032c9de1ee0f5db..d6ab695d8f498dd9aafebe6b43b645cc5de709e3 100755
--- a/utils/DER.py
+++ b/utils/DER.py
@@ -23,10 +23,11 @@ Credits
 This code is adapted from https://github.com/nryant/dscore
 """
 import argparse
-from distutils.util import strtobool
 import os
 import re
 import subprocess
+from distutils.util import strtobool
+
 import numpy as np

 FILE_IDS = re.compile(r"(?<=Speaker Diarization for).+(?=\*\*\*)")
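The `init_predictor` rewrite in paddlespeech/server/utils/paddle_predictor.py earlier in this patch boils down to one idea: prefer the configured device, otherwise fall back to `paddle.get_device()`, then apply the IR, glog, and memory options before creating the predictor. A condensed sketch of that logic, assuming the same `predictor_conf` keys the patch reads (the dict itself is illustrative):

```python
import paddle
from paddle.inference import Config
from paddle.inference import create_predictor


def build_predictor(model_file: str, params_file: str, predictor_conf: dict):
    config = Config(model_file, params_file)

    # fall back to whatever device paddle reports ("cpu" or "gpu:N")
    device = predictor_conf.get("device") or paddle.get_device()
    if "gpu" in device:
        gpu_id = int(device.split(":")[-1])
        config.enable_use_gpu(1000, gpu_id)  # 1000 MB initial GPU memory pool

    if predictor_conf.get("switch_ir_optim"):
        config.switch_ir_optim()
    if not predictor_conf.get("glog_info"):
        config.disable_glog_info()

    config.enable_memory_optim()
    return create_predictor(config)
```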