diff --git a/.gitignore b/.gitignore index cc8fff8770b97a3f31eb49270ad32ac25af30fad..778824f5e8a3c655cea60c81f259625da45dd40f 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,7 @@ *.pyc .vscode *log +*.wav *.pdmodel *.pdiparams* *.zip @@ -30,5 +31,8 @@ tools/OpenBLAS/ tools/Miniconda3-latest-Linux-x86_64.sh tools/activate_python.sh tools/miniconda.sh +tools/CRF++-0.58/ + +speechx/fc_patch/ *output/ diff --git a/README.md b/README.md index 66178662ad84881f363b4a164b3c5497ff29cab1..837d24783d8b10952d3821580ec7bd0219d8dad6 100644 --- a/README.md +++ b/README.md @@ -196,16 +196,18 @@ Developers can have a try of our models with [PaddleSpeech Command Line](./paddl ```shell paddlespeech cls --input input.wav ``` + **Automatic Speech Recognition** ```shell paddlespeech asr --lang zh --input input_16k.wav ``` -**Speech Translation** (English to Chinese) +**Speech Translation** (English to Chinese) (not supported on Mac and Windows now) ```shell paddlespeech st --input input_16k.wav ``` + **Text-to-Speech** ```shell paddlespeech tts --input "你好,欢迎使用飞桨深度学习框架!" --output output.wav @@ -218,7 +220,16 @@ paddlespeech tts --input "你好,欢迎使用飞桨深度学习框架!" --ou paddlespeech text --task punc --input 今天的天气真不错啊你下午有空吗我想约你一起去吃饭 ``` - +**Batch Process** +``` +echo -e "1 欢迎光临。\n2 谢谢惠顾。" | paddlespeech tts +``` + +**Shell Pipeline** +- ASR + Punctuation Restoration +``` +paddlespeech asr --input ./zh.wav | paddlespeech text --task punc +``` For more command lines, please see: [demos](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos)
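The Batch Process and Shell Pipeline entries added to README.md above chain CLI processes; the same flow can be scripted with the Python executors behind those commands. A minimal sketch, assuming the executor call signatures exposed under `paddlespeech.cli` (the audio path is the `zh.wav` sample used throughout these docs):

```python
# ASR followed by punctuation restoration, mirroring the shell pipeline:
#   paddlespeech asr --input ./zh.wav | paddlespeech text --task punc
from paddlespeech.cli.asr import ASRExecutor
from paddlespeech.cli.text import TextExecutor

asr = ASRExecutor()
punc = TextExecutor()

transcript = asr(audio_file="./zh.wav", lang="zh")  # raw transcript, no punctuation
print(punc(text=transcript, task="punc"))           # transcript with punctuation restored
```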
diff --git a/README_cn.md b/README_cn.md index 1196eec11ed181d875fa1050433b7a8c1ee41753..5c00637d3c86cb42f12edb1be8e6a0565988ec12 100644 --- a/README_cn.md +++ b/README_cn.md @@ -216,6 +216,17 @@ paddlespeech tts --input "你好,欢迎使用百度飞桨深度学习框架!" paddlespeech text --task punc --input 今天的天气真不错啊你下午有空吗我想约你一起去吃饭 ``` +**批处理** +``` +echo -e "1 欢迎光临。\n2 谢谢惠顾。" | paddlespeech tts +``` + +**Shell 管道** +ASR + 标点恢复: +``` +paddlespeech asr --input ./zh.wav | paddlespeech text --task punc +``` + 更多命令行命令请参考 [demos](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos) > Note: 如果需要训练或者微调,请查看[语音识别](./docs/source/asr/quick_start.md), [语音合成](./docs/source/tts/quick_start.md)。 @@ -558,6 +569,7 @@ year={2021} - 非常感谢 [kslz](https://github.com/kslz) 补充中文文档。 - 非常感谢 [awmmmm](https://github.com/awmmmm) 提供 fastspeech2 aishell3 conformer 预训练模型。 - 非常感谢 [phecda-xu](https://github.com/phecda-xu)/[PaddleDubbing](https://github.com/phecda-xu/PaddleDubbing) 基于 PaddleSpeech 的 TTS 模型搭建带 GUI 操作界面的配音工具。 + 此外,PaddleSpeech 依赖于许多开源存储库。有关更多信息,请参阅 [references](./docs/source/reference.md)。 diff --git a/demos/speech_recognition/.gitignore b/demos/speech_recognition/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..d8dd7532abcc65af52e9db03c516274e3d674dc1 --- /dev/null +++ b/demos/speech_recognition/.gitignore @@ -0,0 +1 @@ +*.wav diff --git a/demos/speech_recognition/README.md b/demos/speech_recognition/README.md index c49afa35c2d8027011c333eb110eb22b1d08924d..5d964fceac73f60632b2b31a750941e958b59966 100644 --- a/demos/speech_recognition/README.md +++ b/demos/speech_recognition/README.md @@ -27,6 +27,8 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee paddlespeech asr --input ./zh.wav # English paddlespeech asr --model transformer_librispeech --lang en --input ./en.wav + # Chinese ASR + Punctuation Restoration + paddlespeech asr --input ./zh.wav | paddlespeech text --task punc ``` (It doesn't matter if package `paddlespeech-ctcdecoders` is not found, this package is optional.) diff --git a/demos/speech_recognition/README_cn.md b/demos/speech_recognition/README_cn.md index c2e38c91bc6b6374e8ab93f720b5c59330f3e05c..ba1f1d65c5ca9dec435cc1e998117238077407be 100644 --- a/demos/speech_recognition/README_cn.md +++ b/demos/speech_recognition/README_cn.md @@ -25,6 +25,8 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee paddlespeech asr --input ./zh.wav # 英文 paddlespeech asr --model transformer_librispeech --lang en --input ./en.wav + # 中文 + 标点恢复 + paddlespeech asr --input ./zh.wav | paddlespeech text --task punc ``` (如果显示 `paddlespeech-ctcdecoders` 这个 python 包没有找到的 Error,没有关系,这个包是非必须的。) diff --git a/demos/speech_recognition/run.sh b/demos/speech_recognition/run.sh index 5efc8b81f97f818753059c6fa19e718f7f3f05ae..06466928611f51bfec65529cad5d04966bf2607a 100755 --- a/demos/speech_recognition/run.sh +++ b/demos/speech_recognition/run.sh @@ -1,4 +1,10 @@ #!/bin/bash wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav + +# asr paddlespeech asr --input ./zh.wav + + +# asr + punc +paddlespeech asr --input ./zh.wav | paddlespeech text --task punc \ No newline at end of file diff --git a/demos/text_to_speech/README.md b/demos/text_to_speech/README.md index 9d3c4ac539a1afcd62a03c4f98b2dfe4cb622aae..2df72a82dec88ddc55505c9575721aee2de09536 100644 --- a/demos/text_to_speech/README.md +++ b/demos/text_to_speech/README.md @@ -17,11 +17,14 @@ The input of this demo should be a text of the specific language that can be pas ### 3. Usage - Command Line (Recommended) - Chinese - The default acoustic model is `Fastspeech2`, and the default vocoder is `Parallel WaveGAN`. ```bash paddlespeech tts --input "你好,欢迎使用百度飞桨深度学习框架!" 
``` + - Batch Process + ```bash + echo -e "1 欢迎光临。\n2 谢谢惠顾。" | paddlespeech tts + ``` - Chinese, use `SpeedySpeech` as the acoustic model ```bash paddlespeech tts --am speedyspeech_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!" diff --git a/demos/text_to_speech/README_cn.md b/demos/text_to_speech/README_cn.md index f075efdafc1a236b4517764568b31499159c151b..7e02b962483b4b0959fa9b9fe0c082bb0a6fdc3e 100644 --- a/demos/text_to_speech/README_cn.md +++ b/demos/text_to_speech/README_cn.md @@ -24,6 +24,10 @@ ```bash paddlespeech tts --input "你好,欢迎使用百度飞桨深度学习框架!" ``` + - 批处理 + ```bash + echo -e "1 欢迎光临。\n2 谢谢惠顾。" | paddlespeech tts + ``` - 中文,使用 `SpeedySpeech` 作为声学模型 ```bash paddlespeech tts --am speedyspeech_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!" diff --git a/demos/text_to_speech/run.sh b/demos/text_to_speech/run.sh index c2487aeed38ed5b0e3bc7e5c256eff0139bcca2b..b1340241bf833129de9ae5581ada4a542253f96c 100755 --- a/demos/text_to_speech/run.sh +++ b/demos/text_to_speech/run.sh @@ -1,3 +1,7 @@ #!/bin/bash +# single process paddlespeech tts --input 今天的天气不错啊 + +# Batch process +echo -e "1 欢迎光临。\n2 谢谢惠顾。" | paddlespeech tts \ No newline at end of file diff --git a/docs/topic/ctc/ctc_loss_speed_compare.ipynb b/docs/topic/ctc/ctc_loss_speed_compare.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..eb7a030c7e6be03e43016d8a47aa049ea3e40eee --- /dev/null +++ b/docs/topic/ctc/ctc_loss_speed_compare.ipynb @@ -0,0 +1,369 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "a1e738e0", + "metadata": {}, + "source": [ + "## 获取测试的 logit 数据" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "29d3368b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "hlens.npy\n", + "logits.npy\n", + "ys_lens.npy\n", + "ys_pad.npy\n" + ] + } + ], + "source": [ + "!mkdir -p ./test_data\n", + "!test -f ./test_data/ctc_loss_compare_data.tgz || wget -P ./test_data https://paddlespeech.bj.bcebos.com/datasets/unit_test/asr/ctc_loss_compare_data.tgz\n", + "!tar xzvf test_data/ctc_loss_compare_data.tgz -C ./test_data\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "240caf1d", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import numpy as np\n", + "import time\n", + "\n", + "data_dir=\"./test_data\"\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "91bad949", + "metadata": {}, + "outputs": [], + "source": [ + "logits_np = np.load(os.path.join(data_dir, \"logits.npy\"))\n", + "ys_pad_np = np.load(os.path.join(data_dir, \"ys_pad.npy\"))\n", + "hlens_np = np.load(os.path.join(data_dir, \"hlens.npy\"))\n", + "ys_lens_np = np.load(os.path.join(data_dir, \"ys_lens.npy\"))" + ] + }, + { + "cell_type": "markdown", + "id": "4cef2f15", + "metadata": {}, + "source": [ + "## 使用 torch 的 ctc loss" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "90612004", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'1.10.1+cu102'" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import torch\n", + "torch.__version__" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "00799f97", + "metadata": {}, + "outputs": [], + "source": [ + "def torch_ctc_loss(use_cpu):\n", + " if use_cpu:\n", + " device = torch.device(\"cpu\")\n", + " else:\n", + " device = torch.device(\"cuda\")\n", + "\n", + " reduction_type = \"sum\" \n", + "\n", + " ctc_loss = torch.nn.CTCLoss(reduction=reduction_type)\n", + "\n", + " 
ys_hat = torch.tensor(logits_np, device = device)\n", + " ys_pad = torch.tensor(ys_pad_np, device = device)\n", + " hlens = torch.tensor(hlens_np, device = device)\n", + " ys_lens = torch.tensor(ys_lens_np, device = device)\n", + "\n", + " ys_hat = ys_hat.transpose(0, 1)\n", + " \n", + " # 开始计算时间\n", + " start_time = time.time()\n", + " ys_hat = ys_hat.log_softmax(2)\n", + " loss = ctc_loss(ys_hat, ys_pad, hlens, ys_lens)\n", + " end_time = time.time()\n", + " \n", + " loss = loss / ys_hat.size(1)\n", + " return end_time - start_time, loss.item()" + ] + }, + { + "cell_type": "markdown", + "id": "ba47b5a4", + "metadata": {}, + "source": [ + "## 使用 paddle 的 ctc loss" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "6882a06e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'2.2.2'" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import paddle\n", + "paddle.__version__" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "3cfa3b7c", + "metadata": {}, + "outputs": [], + "source": [ + "def paddle_ctc_loss(use_cpu): \n", + " import paddle.nn as pn\n", + " if use_cpu:\n", + " device = \"cpu\"\n", + " else:\n", + " device = \"gpu\"\n", + "\n", + " paddle.set_device(device)\n", + "\n", + " logits = paddle.to_tensor(logits_np)\n", + " ys_pad = paddle.to_tensor(ys_pad_np,dtype='int32')\n", + " hlens = paddle.to_tensor(hlens_np, dtype='int64')\n", + " ys_lens = paddle.to_tensor(ys_lens_np, dtype='int64')\n", + "\n", + " logits = logits.transpose([1,0,2])\n", + "\n", + " ctc_loss = pn.CTCLoss(reduction='sum')\n", + " # 开始计算时间\n", + " start_time = time.time()\n", + " pn_loss = ctc_loss(logits, ys_pad, hlens, ys_lens)\n", + " end_time = time.time()\n", + " \n", + " pn_loss = pn_loss / logits.shape[1]\n", + " return end_time - start_time, pn_loss.item()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "40413ef9", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU, iteration 10\n", + "torch_ctc_loss 159.17137145996094\n", + "paddle_ctc_loss 159.16574096679688\n", + "paddle average time 1.718252992630005\n", + "torch average time 0.17536230087280275\n", + "paddle time / torch time (cpu) 9.798303193320452\n", + "\n", + "GPU, iteration 10\n", + "torch_ctc_loss 159.172119140625\n", + "paddle_ctc_loss 159.17205810546875\n", + "paddle average time 0.018606925010681154\n", + "torch average time 0.0026710033416748047\n", + "paddle time / torch time (gpu) 6.966267963938231\n" + ] + } + ], + "source": [ + "# 使用 CPU\n", + "\n", + "iteration = 10\n", + "use_cpu = True\n", + "torch_total_time = 0\n", + "paddle_total_time = 0\n", + "for _ in range(iteration):\n", + " cost_time, torch_loss = torch_ctc_loss(use_cpu)\n", + " torch_total_time += cost_time\n", + "for _ in range(iteration):\n", + " cost_time, paddle_loss = paddle_ctc_loss(use_cpu)\n", + " paddle_total_time += cost_time\n", + "print (\"CPU, iteration\", iteration)\n", + "print (\"torch_ctc_loss\", torch_loss)\n", + "print (\"paddle_ctc_loss\", paddle_loss)\n", + "print (\"paddle average time\", paddle_total_time / iteration)\n", + "print (\"torch average time\", torch_total_time / iteration)\n", + "print (\"paddle time / torch time (cpu)\" , paddle_total_time/ torch_total_time)\n", + "\n", + "print (\"\")\n", + "\n", + "# 使用 GPU\n", + "\n", + "use_cpu = False\n", + "torch_total_time = 0\n", + "paddle_total_time = 0\n", + "for _ in range(iteration):\n", + " cost_time, 
torch_loss = torch_ctc_loss(use_cpu)\n", + "        torch_total_time += cost_time\n", + "for _ in range(iteration):\n", + "    cost_time, paddle_loss = paddle_ctc_loss(use_cpu)\n", + "    paddle_total_time += cost_time\n", + "print (\"GPU, iteration\", iteration)\n", + "print (\"torch_ctc_loss\", torch_loss)\n", + "print (\"paddle_ctc_loss\", paddle_loss)\n", + "print (\"paddle average time\", paddle_total_time / iteration)\n", + "print (\"torch average time\", torch_total_time / iteration)\n", + "print (\"paddle time / torch time (gpu)\" , paddle_total_time/ torch_total_time)" + ] + }, + { + "cell_type": "markdown", + "id": "7cdf8697", + "metadata": {}, + "source": [ + "## 其他: 使用 PaddleSpeech 中的 ctcloss 查一下 loss 值" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "73fad81d", + "metadata": {}, + "outputs": [], + "source": [ + "logits_np = np.load(os.path.join(data_dir, \"logits.npy\"))\n", + "ys_pad_np = np.load(os.path.join(data_dir, \"ys_pad.npy\"))\n", + "hlens_np = np.load(os.path.join(data_dir, \"hlens.npy\"))\n", + "ys_lens_np = np.load(os.path.join(data_dir, \"ys_lens.npy\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "2b41e45d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2022-02-25 11:34:34.143 | INFO     | paddlespeech.s2t.modules.loss:__init__:41 - CTCLoss Loss reduction: sum, div-bs: True\n", + "2022-02-25 11:34:34.143 | INFO     | paddlespeech.s2t.modules.loss:__init__:42 - CTCLoss Grad Norm Type: instance\n", + "2022-02-25 11:34:34.144 | INFO     | paddlespeech.s2t.modules.loss:__init__:73 - CTCLoss() kwargs:{'norm_by_times': True}, not support: {'norm_by_batchsize': False, 'norm_by_total_logits_len': False}\n", + "loss 159.17205810546875\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/root/miniconda3/lib/python3.7/site-packages/paddle/fluid/dygraph/math_op_patch.py:253: UserWarning: The dtype of left and right variables are not the same, left dtype is paddle.float32, but right dtype is paddle.int32, the right dtype will convert to paddle.float32\n", + "  format(lhs_dtype, rhs_dtype, lhs_dtype))\n" + ] + } + ], + "source": [ + "use_cpu = False\n", + "\n", + "from paddlespeech.s2t.modules.loss import CTCLoss\n", + "\n", + "if use_cpu:\n", + "    device = \"cpu\"\n", + "else:\n", + "    device = \"gpu\"\n", + "\n", + "paddle.set_device(device)\n", + "\n", + "blank_id=0\n", + "reduction_type='sum'\n", + "batch_average= True\n", + "grad_norm_type='instance'\n", + "\n", + "criterion = CTCLoss(\n", + "        blank=blank_id,\n", + "        reduction=reduction_type,\n", + "        batch_average=batch_average,\n", + "        grad_norm_type=grad_norm_type)\n", + "\n", + "logits = paddle.to_tensor(logits_np)\n", + "ys_pad = paddle.to_tensor(ys_pad_np,dtype='int32')\n", + "hlens = paddle.to_tensor(hlens_np, dtype='int64')\n", + "ys_lens = paddle.to_tensor(ys_lens_np, dtype='int64')\n", + "\n", + "pn_ctc_loss = criterion(logits, ys_pad, hlens, ys_lens)\n", + "print(\"loss\", pn_ctc_loss.item())\n", + "   " + ] + }, + { + "cell_type": "markdown", + "id": "de525d38", + "metadata": {}, + "source": [ + "## 结论\n", + "在 CPU 环境下: torch 的 CTC loss 的计算速度是 paddle 的 9.8 倍 \n", + "在 GPU 环境下: torch 的 CTC loss 的计算速度是 paddle 的 6.97 倍\n", + "\n", + "## 其他结论\n", + "torch 与 paddle 的 ctc loss 在 CPU 和 GPU 下都没有完全对齐。其中 CPU 的前向对齐精度大约为 1e-2,GPU 的前向对齐精度大约为 1e-4。" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}
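One caveat worth noting about the notebook above: the GPU runs bracket the kernel launches with `time.time()` but never synchronize the device, so asynchronous execution can make the measured spans smaller than the actual compute time. A sketch of a sync-bracketed variant, assuming CUDA builds where `torch.cuda.synchronize` and `paddle.device.cuda.synchronize` are available:

```python
import time

def timed(fn, sync):
    sync()                        # drain kernels queued before the measurement
    start = time.time()
    result = fn()
    sync()                        # wait for the measured kernels to finish
    return time.time() - start, result

# e.g. for the paddle path:
#   cost, loss = timed(lambda: ctc_loss(logits, ys_pad, hlens, ys_lens),
#                      paddle.device.cuda.synchronize)
# and with torch.cuda.synchronize for the torch path
```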
"ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/aishell3/tts3/README.md b/examples/aishell3/tts3/README.md index 281ad836b0144e6bb14e4b8278bfaceb026b65b4..d02ad1b6373c26f0cd0ffa4d58c3bd4af57f9e72 100644 --- a/examples/aishell3/tts3/README.md +++ b/examples/aishell3/tts3/README.md @@ -225,7 +225,9 @@ optional arguments: 9. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ## Pretrained Model -Pretrained FastSpeech2 model with no silence in the edge of audios. [fastspeech2_nosil_aishell3_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_aishell3_ckpt_0.4.zip) +Pretrained FastSpeech2 model with no silence in the edge of audios: +- [fastspeech2_nosil_aishell3_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_aishell3_ckpt_0.4.zip) +- [fastspeech2_conformer_aishell3_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_conformer_aishell3_ckpt_0.2.0.zip) (Thanks for [@awmmmm](https://github.com/awmmmm)'s contribution) FastSpeech2 checkpoint contains files listed below. diff --git a/examples/aishell3/tts3/conf/conformer.yaml b/examples/aishell3/tts3/conf/conformer.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ea73593d77a3117e8b46baab9785bd576a66a093 --- /dev/null +++ b/examples/aishell3/tts3/conf/conformer.yaml @@ -0,0 +1,110 @@ +########################################################### +# FEATURE EXTRACTION SETTING # +########################################################### + +fs: 24000 # sr +n_fft: 2048 # FFT size (samples). +n_shift: 300 # Hop size (samples). 12.5ms +win_length: 1200 # Window length (samples). 50ms + # If set to null, it will be the same as fft_size. +window: "hann" # Window function. + +# Only used for feats_type != raw + +fmin: 80 # Minimum frequency of Mel basis. +fmax: 7600 # Maximum frequency of Mel basis. +n_mels: 80 # The number of mel basis. + +# Only used for the model using pitch features (e.g. FastSpeech2) +f0min: 80 # Maximum f0 for pitch extraction. +f0max: 400 # Minimum f0 for pitch extraction. 
+ + +########################################################### +# DATA SETTING # +########################################################### +batch_size: 32 +num_workers: 4 + + +########################################################### +# MODEL SETTING # +########################################################### +model: + adim: 384 # attention dimension + aheads: 2 # number of attention heads + elayers: 4 # number of encoder layers + eunits: 1536 # number of encoder ff units + dlayers: 4 # number of decoder layers + dunits: 1536 # number of decoder ff units + positionwise_layer_type: conv1d # type of position-wise layer + positionwise_conv_kernel_size: 3 # kernel size of position wise conv layer + duration_predictor_layers: 2 # number of layers of duration predictor + duration_predictor_chans: 256 # number of channels of duration predictor + duration_predictor_kernel_size: 3 # filter size of duration predictor + postnet_layers: 5 # number of layers of postnset + postnet_filts: 5 # filter size of conv layers in postnet + postnet_chans: 256 # number of channels of conv layers in postnet + encoder_normalize_before: True # whether to perform layer normalization before the input + decoder_normalize_before: True # whether to perform layer normalization before the input + reduction_factor: 1 # reduction factor + encoder_type: conformer # encoder type + decoder_type: conformer # decoder type + conformer_pos_enc_layer_type: rel_pos # conformer positional encoding type + conformer_self_attn_layer_type: rel_selfattn # conformer self-attention type + conformer_activation_type: swish # conformer activation type + use_macaron_style_in_conformer: true # whether to use macaron style in conformer + use_cnn_in_conformer: true # whether to use CNN in conformer + conformer_enc_kernel_size: 7 # kernel size in CNN module of conformer-based encoder + conformer_dec_kernel_size: 31 # kernel size in CNN module of conformer-based decoder + init_type: xavier_uniform # initialization type + transformer_enc_dropout_rate: 0.2 # dropout rate for transformer encoder layer + transformer_enc_positional_dropout_rate: 0.2 # dropout rate for transformer encoder positional encoding + transformer_enc_attn_dropout_rate: 0.2 # dropout rate for transformer encoder attention layer + transformer_dec_dropout_rate: 0.2 # dropout rate for transformer decoder layer + transformer_dec_positional_dropout_rate: 0.2 # dropout rate for transformer decoder positional encoding + transformer_dec_attn_dropout_rate: 0.2 # dropout rate for transformer decoder attention layer + pitch_predictor_layers: 5 # number of conv layers in pitch predictor + pitch_predictor_chans: 256 # number of channels of conv layers in pitch predictor + pitch_predictor_kernel_size: 5 # kernel size of conv leyers in pitch predictor + pitch_predictor_dropout: 0.5 # dropout rate in pitch predictor + pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch + pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch + stop_gradient_from_pitch_predictor: true # whether to stop the gradient from pitch predictor to encoder + energy_predictor_layers: 2 # number of conv layers in energy predictor + energy_predictor_chans: 256 # number of channels of conv layers in energy predictor + energy_predictor_kernel_size: 3 # kernel size of conv leyers in energy predictor + energy_predictor_dropout: 0.5 # dropout rate in energy predictor + energy_embed_kernel_size: 1 # kernel size of conv embedding layer for energy + energy_embed_dropout: 0.0 # 
dropout rate after conv embedding layer for energy + stop_gradient_from_energy_predictor: false # whether to stop the gradient from energy predictor to encoder + spk_embed_dim: 256 # speaker embedding dimension + spk_embed_integration_type: concat # speaker embedding integration type + + +########################################################### +# UPDATER SETTING # +########################################################### +updater: + use_masking: True # whether to apply masking for padded part in loss calculation + + + +########################################################### +# OPTIMIZER SETTING # +########################################################### +optimizer: + optim: adam # optimizer type + learning_rate: 0.001 # learning rate + +########################################################### +# TRAINING SETTING # +########################################################### +max_epoch: 1000 +num_snapshots: 5 + + +########################################################### +# OTHER SETTING # +########################################################### +seed: 10086 diff --git a/examples/other/g2p/README.md b/examples/other/g2p/README.md index c0f55bd42130a34a32ed21e34b5d5e297fff2f7c..141f7f7412891b44be81fc5e026c175c3fe83bb1 100644 --- a/examples/other/g2p/README.md +++ b/examples/other/g2p/README.md @@ -10,7 +10,7 @@ Run the command below to get the results of the test. ```bash ./run.sh ``` -The `avg WER` of g2p is: 0.027124048652822204 +The `avg WER` of g2p is: 0.026014352515701198 ```text ,--------------------------------------------------------------------. | | # Snt # Wrd | Corr Sub Del Ins Err S.Err | diff --git a/paddlespeech/cli/__init__.py b/paddlespeech/cli/__init__.py index cecf76fee5b6e8e73c3e7d588698f0cb890461cf..12ff9919a29f453a11853571eb3dad836f824556 100644 --- a/paddlespeech/cli/__init__.py +++ b/paddlespeech/cli/__init__.py @@ -20,5 +20,6 @@ from .cls import CLSExecutor from .st import STExecutor from .text import TextExecutor from .tts import TTSExecutor +from .stats import StatsExecutor _locale._getdefaultlocale = (lambda *args: ['en_US', 'utf8']) diff --git a/paddlespeech/cli/stats/__init__.py b/paddlespeech/cli/stats/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..9fe6c4abaf10de2f24f751ddd62f456768a82475 --- /dev/null +++ b/paddlespeech/cli/stats/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from .infer import StatsExecutor diff --git a/paddlespeech/cli/stats/infer.py b/paddlespeech/cli/stats/infer.py new file mode 100644 index 0000000000000000000000000000000000000000..4ef50449c37e08c1a3c5f9b8894a5b4141e1c33f --- /dev/null +++ b/paddlespeech/cli/stats/infer.py @@ -0,0 +1,193 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse +from typing import List + +from prettytable import PrettyTable + +from ..log import logger +from ..utils import cli_register +from ..utils import stats_wrapper + +__all__ = ['StatsExecutor'] + +model_name_format = { + 'asr': 'Model-Language-Sample Rate', + 'cls': 'Model-Sample Rate', + 'st': 'Model-Source language-Target language', + 'text': 'Model-Task-Language', + 'tts': 'Model-Language' +} + + +@cli_register( + name='paddlespeech.stats', + description='Get speech tasks support models list.') +class StatsExecutor(): + def __init__(self): + super(StatsExecutor, self).__init__() + + self.parser = argparse.ArgumentParser( + prog='paddlespeech.stats', add_help=True) + self.parser.add_argument( + '--task', + type=str, + default='asr', + choices=['asr', 'cls', 'st', 'text', 'tts'], + help='Choose speech task.', + required=True) + self.task_choices = ['asr', 'cls', 'st', 'text', 'tts'] + + def show_support_models(self, pretrained_models: dict): + fields = model_name_format[self.task].split("-") + table = PrettyTable(fields) + for key in pretrained_models: + table.add_row(key.split("-")) + print(table) + + def execute(self, argv: List[str]) -> bool: + """ + Command line entry. + """ + parser_args = self.parser.parse_args(argv) + self.task = parser_args.task + if self.task not in self.task_choices: + logger.error( + "Please input correct speech task, choices = ['asr', 'cls', 'st', 'text', 'tts']" + ) + return False + + elif self.task == 'asr': + try: + from ..asr.infer import pretrained_models + logger.info( + "Here is the list of ASR pretrained models released by PaddleSpeech that can be used by command line and python API" + ) + self.show_support_models(pretrained_models) + return True + except BaseException: + logger.error("Failed to get the list of ASR pretrained models.") + return False + + elif self.task == 'cls': + try: + from ..cls.infer import pretrained_models + logger.info( + "Here is the list of CLS pretrained models released by PaddleSpeech that can be used by command line and python API" + ) + self.show_support_models(pretrained_models) + return True + except BaseException: + logger.error("Failed to get the list of CLS pretrained models.") + return False + + elif self.task == 'st': + try: + from ..st.infer import pretrained_models + logger.info( + "Here is the list of ST pretrained models released by PaddleSpeech that can be used by command line and python API" + ) + self.show_support_models(pretrained_models) + return True + except BaseException: + logger.error("Failed to get the list of ST pretrained models.") + return False + + elif self.task == 'text': + try: + from ..text.infer import pretrained_models + logger.info( + "Here is the list of TEXT pretrained models released by PaddleSpeech that can be used by command line and python API" + ) + self.show_support_models(pretrained_models) + return True + except BaseException: + logger.error( + "Failed to get the list of TEXT pretrained models.") + return False + + elif self.task == 'tts': + try: + from ..tts.infer import pretrained_models + logger.info( + "Here is the list of TTS pretrained models released 
by PaddleSpeech that can be used by command line and python API" + ) + self.show_support_models(pretrained_models) + return True + except BaseException: + logger.error("Failed to get the list of TTS pretrained models.") + return False + + @stats_wrapper + def __call__( + self, + task: str=None, ): + """ + Python API to call an executor. + """ + self.task = task + if self.task not in self.task_choices: + print( + "Please input correct speech task, choices = ['asr', 'cls', 'st', 'text', 'tts']" + ) + + elif self.task == 'asr': + try: + from ..asr.infer import pretrained_models + print( + "Here is the list of ASR pretrained models released by PaddleSpeech that can be used by command line and python API" + ) + self.show_support_models(pretrained_models) + except BaseException: + print("Failed to get the list of ASR pretrained models.") + + elif self.task == 'cls': + try: + from ..cls.infer import pretrained_models + print( + "Here is the list of CLS pretrained models released by PaddleSpeech that can be used by command line and python API" + ) + self.show_support_models(pretrained_models) + except BaseException: + print("Failed to get the list of CLS pretrained models.") + + elif self.task == 'st': + try: + from ..st.infer import pretrained_models + print( + "Here is the list of ST pretrained models released by PaddleSpeech that can be used by command line and python API" + ) + self.show_support_models(pretrained_models) + except BaseException: + print("Failed to get the list of ST pretrained models.") + + elif self.task == 'text': + try: + from ..text.infer import pretrained_models + print( + "Here is the list of TEXT pretrained models released by PaddleSpeech that can be used by command line and python API" + ) + self.show_support_models(pretrained_models) + except BaseException: + print("Failed to get the list of TEXT pretrained models.") + + elif self.task == 'tts': + try: + from ..tts.infer import pretrained_models + print( + "Here is the list of TTS pretrained models released by PaddleSpeech that can be used by command line and python API" + ) + self.show_support_models(pretrained_models) + except BaseException: + print("Failed to get the list of TTS pretrained models.") diff --git a/paddlespeech/s2t/io/sampler.py b/paddlespeech/s2t/io/sampler.py index 89752bb9fdb98faecc0ccc5b8f59ea1f09efc8b6..ac55af1236f11d175e9e7717220980cf95c7d79b 100644 --- a/paddlespeech/s2t/io/sampler.py +++ b/paddlespeech/s2t/io/sampler.py @@ -51,7 +51,7 @@ def _batch_shuffle(indices, batch_size, epoch, clipped=False): """ rng = np.random.RandomState(epoch) shift_len = rng.randint(0, batch_size - 1) - batch_indices = list(zip(*[iter(indices[shift_len:])] * batch_size)) + batch_indices = list(zip(* [iter(indices[shift_len:])] * batch_size)) rng.shuffle(batch_indices) batch_indices = [item for batch in batch_indices for item in batch] assert clipped is False diff --git a/paddlespeech/s2t/models/u2_st/u2_st.py b/paddlespeech/s2t/models/u2_st/u2_st.py index f7b05714ef6e9961a1bff79027015889815d5811..999723e5100309976c1b89cbf256ac106d8829e6 100644 --- a/paddlespeech/s2t/models/u2_st/u2_st.py +++ b/paddlespeech/s2t/models/u2_st/u2_st.py @@ -33,8 +33,6 @@ from paddlespeech.s2t.modules.decoder import TransformerDecoder from paddlespeech.s2t.modules.encoder import ConformerEncoder from paddlespeech.s2t.modules.encoder import TransformerEncoder from paddlespeech.s2t.modules.loss import LabelSmoothingLoss -from paddlespeech.s2t.modules.mask import mask_finished_preds -from paddlespeech.s2t.modules.mask import 
mask_finished_scores from paddlespeech.s2t.modules.mask import subsequent_mask from paddlespeech.s2t.utils import checkpoint from paddlespeech.s2t.utils import layer_tools @@ -291,7 +289,7 @@ class U2STBaseModel(nn.Layer): device = speech.place # Let's assume B = batch_size and N = beam_size - # 1. Encoder and init hypothesis + # 1. Encoder and init hypothesis encoder_out, encoder_mask = self._forward_encoder( speech, speech_lengths, decoding_chunk_size, num_decoding_left_chunks, diff --git a/paddlespeech/server/bin/__init__.py b/paddlespeech/server/bin/__init__.py index bd75747f79948ea42229b8c164174dbe4240d4b1..025aab098f2b6d56ced56d499ce619feb190ab2d 100644 --- a/paddlespeech/server/bin/__init__.py +++ b/paddlespeech/server/bin/__init__.py @@ -14,3 +14,4 @@ from .paddlespeech_client import ASRClientExecutor from .paddlespeech_client import TTSClientExecutor from .paddlespeech_server import ServerExecutor +from .paddlespeech_server import ServerStatsExecutor diff --git a/paddlespeech/server/bin/paddlespeech_server.py b/paddlespeech/server/bin/paddlespeech_server.py index aff77d54436eac55fda46c8e2ed218cc115a0085..21fc5c65e965a87c483046d66e45036d1b091b5d 100644 --- a/paddlespeech/server/bin/paddlespeech_server.py +++ b/paddlespeech/server/bin/paddlespeech_server.py @@ -16,15 +16,17 @@ from typing import List import uvicorn from fastapi import FastAPI +from prettytable import PrettyTable from ..executor import BaseExecutor from ..util import cli_server_register from ..util import stats_wrapper +from paddlespeech.cli.log import logger from paddlespeech.server.engine.engine_pool import init_engine_pool from paddlespeech.server.restful.api import setup_router from paddlespeech.server.utils.config import get_config -__all__ = ['ServerExecutor'] +__all__ = ['ServerExecutor', 'ServerStatsExecutor'] app = FastAPI( title="PaddleSpeech Serving API", description="Api", version="0.0.1") @@ -86,3 +88,139 @@ class ServerExecutor(BaseExecutor): config = get_config(config_file) if self.init(config): uvicorn.run(app, host=config.host, port=config.port, debug=True) + + +@cli_server_register( + name='paddlespeech_server.stats', + description='Get the models supported by each speech task in the service.') +class ServerStatsExecutor(): + def __init__(self): + super(ServerStatsExecutor, self).__init__() + + self.parser = argparse.ArgumentParser( + prog='paddlespeech_server.stats', add_help=True) + self.parser.add_argument( + '--task', + type=str, + default=None, + choices=['asr', 'tts'], + help='Choose speech task.', + required=True) + self.task_choices = ['asr', 'tts'] + self.model_name_format = { + 'asr': 'Model-Language-Sample Rate', + 'tts': 'Model-Language' + } + + def show_support_models(self, pretrained_models: dict): + fields = self.model_name_format[self.task].split("-") + table = PrettyTable(fields) + for key in pretrained_models: + table.add_row(key.split("-")) + print(table) + + def execute(self, argv: List[str]) -> bool: + """ + Command line entry. + """ + parser_args = self.parser.parse_args(argv) + self.task = parser_args.task + if self.task not in self.task_choices: + logger.error( + "Please input correct speech task, choices = ['asr', 'tts']") + return False + + elif self.task == 'asr': + try: + from paddlespeech.cli.asr.infer import pretrained_models + logger.info( + "Here is the table of ASR pretrained models supported in the service." 
+ ) + self.show_support_models(pretrained_models) + + # show ASR static pretrained model + from paddlespeech.server.engine.asr.paddleinference.asr_engine import pretrained_models + logger.info( + "Here is the table of ASR static pretrained models supported in the service." + ) + self.show_support_models(pretrained_models) + + return True + except BaseException: + logger.error( + "Failed to get the table of ASR pretrained models supported in the service." + ) + return False + + elif self.task == 'tts': + try: + from paddlespeech.cli.tts.infer import pretrained_models + logger.info( + "Here is the table of TTS pretrained models supported in the service." + ) + self.show_support_models(pretrained_models) + + # show TTS static pretrained model + from paddlespeech.server.engine.tts.paddleinference.tts_engine import pretrained_models + logger.info( + "Here is the table of TTS static pretrained models supported in the service." + ) + self.show_support_models(pretrained_models) + + return True + except BaseException: + logger.error( + "Failed to get the table of TTS pretrained models supported in the service." + ) + return False + + @stats_wrapper + def __call__( + self, + task: str=None, ): + """ + Python API to call an executor. + """ + self.task = task + if self.task not in self.task_choices: + print("Please input correct speech task, choices = ['asr', 'tts']") + + elif self.task == 'asr': + try: + from paddlespeech.cli.asr.infer import pretrained_models + print( + "Here is the table of ASR pretrained models supported in the service." + ) + self.show_support_models(pretrained_models) + + # show ASR static pretrained model + from paddlespeech.server.engine.asr.paddleinference.asr_engine import pretrained_models + print( + "Here is the table of ASR static pretrained models supported in the service." + ) + self.show_support_models(pretrained_models) + + except BaseException: + print( + "Failed to get the table of ASR pretrained models supported in the service." + ) + + elif self.task == 'tts': + try: + from paddlespeech.cli.tts.infer import pretrained_models + print( + "Here is the table of TTS pretrained models supported in the service." + ) + self.show_support_models(pretrained_models) + + # show TTS static pretrained model + from paddlespeech.server.engine.tts.paddleinference.tts_engine import pretrained_models + print( + "Here is the table of TTS static pretrained models supported in the service." + ) + self.show_support_models(pretrained_models) + + except BaseException: + print( + "Failed to get the table of TTS pretrained models supported in the service." + ) diff --git a/paddlespeech/t2s/frontend/tone_sandhi.py b/paddlespeech/t2s/frontend/tone_sandhi.py index 5264e0687557c75023eb8f004350869346e7df6c..07f7fa2b8f8615af73fd656b0abd381e551179f9 100644 --- a/paddlespeech/t2s/frontend/tone_sandhi.py +++ b/paddlespeech/t2s/frontend/tone_sandhi.py @@ -63,7 +63,7 @@ class ToneSandhi(): '扫把', '惦记' } self.must_not_neural_tone_words = { - "男子", "女子", "分子", "原子", "量子", "莲子", "石子", "瓜子", "电子" + "男子", "女子", "分子", "原子", "量子", "莲子", "石子", "瓜子", "电子", "人人", "虎虎" } self.punc = ":,;。?!“”‘’':,;.?!" @@ -77,7 +77,9 @@ class ToneSandhi(): # reduplication words for n. and v. e.g. 
奶奶, 试试, 旺旺 for j, item in enumerate(word): - if j - 1 >= 0 and item == word[j - 1] and pos[0] in {"n", "v", "a"}: + if j - 1 >= 0 and item == word[j - 1] and pos[0] in { + "n", "v", "a" + } and word not in self.must_not_neural_tone_words: finals[j] = finals[j][:-1] + "5" ge_idx = word.find("个") if len(word) >= 1 and word[-1] in "吧呢哈啊呐噻嘛吖嗨呐哦哒额滴哩哟喽啰耶喔诶": diff --git a/paddlespeech/t2s/frontend/zh_frontend.py b/paddlespeech/t2s/frontend/zh_frontend.py index a905c412d4d9c901fae1d5b80677a472a24c6071..bb8ed5b4919ecfb67d3f54aade65b0d31e1d1a00 100644 --- a/paddlespeech/t2s/frontend/zh_frontend.py +++ b/paddlespeech/t2s/frontend/zh_frontend.py @@ -20,7 +20,10 @@ import numpy as np import paddle from g2pM import G2pM from pypinyin import lazy_pinyin +from pypinyin import load_phrases_dict +from pypinyin import load_single_dict from pypinyin import Style +from pypinyin_dict.phrase_pinyin_data import large_pinyin from paddlespeech.t2s.frontend.generate_lexicon import generate_lexicon from paddlespeech.t2s.frontend.tone_sandhi import ToneSandhi @@ -41,6 +44,8 @@ class Frontend(): self.g2pM_model = G2pM() self.pinyin2phone = generate_lexicon( with_tone=True, with_erhua=False) + else: + self.__init__pypinyin() self.must_erhua = {"小院儿", "胡同儿", "范儿", "老汉儿", "撒欢儿", "寻老礼儿", "妥妥儿"} self.not_erhua = { "虐儿", "为儿", "护儿", "瞒儿", "救儿", "替儿", "有儿", "一儿", "我儿", "俺儿", "妻儿", @@ -62,6 +67,23 @@ class Frontend(): for tone, id in tone_id: self.vocab_tones[tone] = int(id) + def __init__pypinyin(self): + large_pinyin.load() + + load_phrases_dict({u'开户行': [[u'ka1i'], [u'hu4'], [u'hang2']]}) + load_phrases_dict({u'发卡行': [[u'fa4'], [u'ka3'], [u'hang2']]}) + load_phrases_dict({u'放款行': [[u'fa4ng'], [u'kua3n'], [u'hang2']]}) + load_phrases_dict({u'茧行': [[u'jia3n'], [u'hang2']]}) + load_phrases_dict({u'行号': [[u'hang2'], [u'ha4o']]}) + load_phrases_dict({u'各地': [[u'ge4'], [u'di4']]}) + load_phrases_dict({u'借还款': [[u'jie4'], [u'hua2n'], [u'kua3n']]}) + load_phrases_dict({u'时间为': [[u'shi2'], [u'jia1n'], [u'we2i']]}) + load_phrases_dict({u'为准': [[u'we2i'], [u'zhu3n']]}) + load_phrases_dict({u'色差': [[u'se4'], [u'cha1']]}) + + # 调整字的拼音顺序 + load_single_dict({ord(u'地'): u'de,di4'}) + def _get_initials_finals(self, word: str) -> List[List[str]]: initials = [] finals = [] diff --git a/paddlespeech/t2s/frontend/zh_normalization/chronology.py b/paddlespeech/t2s/frontend/zh_normalization/chronology.py index bfa7d2b1969ddb26c72c1846e2cd7a9a0d29bfee..ea51891353ad8c6fe942edcdf7efb22ec60526ce 100644 --- a/paddlespeech/t2s/frontend/zh_normalization/chronology.py +++ b/paddlespeech/t2s/frontend/zh_normalization/chronology.py @@ -63,7 +63,10 @@ def replace_time(match) -> str: result = f"{num2str(hour)}点" if minute.lstrip('0'): - result += f"{_time_num2str(minute)}分" + if int(minute) == 30: + result += f"半" + else: + result += f"{_time_num2str(minute)}分" if second and second.lstrip('0'): result += f"{_time_num2str(second)}秒" @@ -71,7 +74,10 @@ result += "至" result += f"{num2str(hour_2)}点" if minute_2.lstrip('0'): - result += f"{_time_num2str(minute_2)}分" + if int(minute_2) == 30: + result += f"半" + else: + result += f"{_time_num2str(minute_2)}分" if second_2 and second_2.lstrip('0'): result += f"{_time_num2str(second_2)}秒" diff --git a/paddlespeech/t2s/frontend/zh_normalization/num.py b/paddlespeech/t2s/frontend/zh_normalization/num.py index 27a2f84651759e50d75c97adb7dcfd2225d9beb7..a83b42a47b70b30452d5908e58d6e7a5b1c2f93c 100644 --- a/paddlespeech/t2s/frontend/zh_normalization/num.py +++ 
b/paddlespeech/t2s/frontend/zh_normalization/num.py @@ -28,7 +28,7 @@ UNITS = OrderedDict({ 8: '亿', }) -COM_QUANTIFIERS = '(朵|匹|张|座|回|场|尾|条|个|首|阙|阵|网|炮|顶|丘|棵|只|支|袭|辆|挑|担|颗|壳|窠|曲|墙|群|腔|砣|座|客|贯|扎|捆|刀|令|打|手|罗|坡|山|岭|江|溪|钟|队|单|双|对|出|口|头|脚|板|跳|枝|件|贴|针|线|管|名|位|身|堂|课|本|页|家|户|层|丝|毫|厘|分|钱|两|斤|担|铢|石|钧|锱|忽|(千|毫|微)克|毫|厘|(公)分|分|寸|尺|丈|里|寻|常|铺|程|(千|分|厘|毫|微)米|米|撮|勺|合|升|斗|石|盘|碗|碟|叠|桶|笼|盆|盒|杯|钟|斛|锅|簋|篮|盘|桶|罐|瓶|壶|卮|盏|箩|箱|煲|啖|袋|钵|年|月|日|季|刻|时|周|天|秒|分|旬|纪|岁|世|更|夜|春|夏|秋|冬|代|伏|辈|丸|泡|粒|颗|幢|堆|条|根|支|道|面|片|张|颗|块|元|(亿|千万|百万|万|千|百)|(亿|千万|百万|万|千|百|美|)元|(亿|千万|百万|万|千|百|)块|角|毛|分)' +COM_QUANTIFIERS = '(所|朵|匹|张|座|回|场|尾|条|个|首|阙|阵|网|炮|顶|丘|棵|只|支|袭|辆|挑|担|颗|壳|窠|曲|墙|群|腔|砣|座|客|贯|扎|捆|刀|令|打|手|罗|坡|山|岭|江|溪|钟|队|单|双|对|出|口|头|脚|板|跳|枝|件|贴|针|线|管|名|位|身|堂|课|本|页|家|户|层|丝|毫|厘|分|钱|两|斤|担|铢|石|钧|锱|忽|(千|毫|微)克|毫|厘|(公)分|分|寸|尺|丈|里|寻|常|铺|程|(千|分|厘|毫|微)米|米|撮|勺|合|升|斗|石|盘|碗|碟|叠|桶|笼|盆|盒|杯|钟|斛|锅|簋|篮|盘|桶|罐|瓶|壶|卮|盏|箩|箱|煲|啖|袋|钵|年|月|日|季|刻|时|周|天|秒|分|小时|旬|纪|岁|世|更|夜|春|夏|秋|冬|代|伏|辈|丸|泡|粒|颗|幢|堆|条|根|支|道|面|片|张|颗|块|元|(亿|千万|百万|万|千|百)|(亿|千万|百万|万|千|百|美|)元|(亿|千万|百万|万|千|百|)块|角|毛|分)' # 分数表达式 RE_FRAC = re.compile(r'(-?)(\d+)/(\d+)') @@ -110,7 +110,7 @@ def replace_default_num(match): # 纯小数 RE_DECIMAL_NUM = re.compile(r'(-?)((\d+)(\.\d+))' r'|(\.(\d+))') # 正整数 + 量词 -RE_POSITIVE_QUANTIFIERS = re.compile(r"(\d+)([多余几])?" + COM_QUANTIFIERS) +RE_POSITIVE_QUANTIFIERS = re.compile(r"(\d+)([多余几\+])?" + COM_QUANTIFIERS) RE_NUMBER = re.compile(r'(-?)((\d+)(\.\d+)?)' r'|(\.(\d+))') @@ -123,6 +123,8 @@ def replace_positive_quantifier(match) -> str: """ number = match.group(1) match_2 = match.group(2) + if match_2 == "+": + match_2 = "多" match_2: str = match_2 if match_2 else "" quantifiers: str = match.group(3) number: str = num2str(number) @@ -151,6 +153,7 @@ def replace_number(match) -> str: # 范围表达式 # match.group(1) and match.group(8) are copy from RE_NUMBER + RE_RANGE = re.compile( r'((-?)((\d+)(\.\d+)?)|(\.(\d+)))[-~]((-?)((\d+)(\.\d+)?)|(\.(\d+)))') diff --git a/paddlespeech/t2s/frontend/zh_normalization/text_normlization.py b/paddlespeech/t2s/frontend/zh_normalization/text_normlization.py index f9d1b8cb859ab5f449a4bc573c6133a101096fa1..bc663c70d77da24c9ef9b21fea64a5b1fc6cf2e9 100644 --- a/paddlespeech/t2s/frontend/zh_normalization/text_normlization.py +++ b/paddlespeech/t2s/frontend/zh_normalization/text_normlization.py @@ -63,11 +63,19 @@ class TextNormalizer(): # Only for pure Chinese here if lang == "zh": text = text.replace(" ", "") + # 过滤掉特殊字符 + text = re.sub(r'[《》【】<=>{}()()#&@“”^_|…\\]', '', text) text = self.SENTENCE_SPLITOR.sub(r'\1\n', text) text = text.strip() sentences = [sentence.strip() for sentence in re.split(r'\n+', text)] return sentences + def _post_replace(self, sentence: str) -> str: + sentence = sentence.replace('/', '每') + sentence = sentence.replace('~', '至') + + return sentence + def normalize_sentence(self, sentence: str) -> str: # basic character conversions sentence = tranditional_to_simplified(sentence) @@ -97,6 +105,7 @@ class TextNormalizer(): sentence) sentence = RE_DEFAULT_NUM.sub(replace_default_num, sentence) sentence = RE_NUMBER.sub(replace_number, sentence) + sentence = self._post_replace(sentence) return sentence diff --git a/paddlespeech/t2s/modules/transformer/repeat.py b/paddlespeech/t2s/modules/transformer/repeat.py index 2073a78b9330201dba15b42badf77cee0caceab1..1e946adf7e469fd6c05c2a8c8d9e6f16f638524e 100644 --- a/paddlespeech/t2s/modules/transformer/repeat.py +++ b/paddlespeech/t2s/modules/transformer/repeat.py @@ -36,4 +36,4 @@ def repeat(N, fn): Returns: MultiSequential: Repeated model instance. 
""" - return MultiSequential(*[fn(n) for n in range(N)]) + return MultiSequential(* [fn(n) for n in range(N)]) diff --git a/setup.py b/setup.py index 3f3632b37f2c2d0642eddb727ca0739b79fe3e41..f86758bab25d9b5283126054834777f4a3e7f478 100644 --- a/setup.py +++ b/setup.py @@ -48,6 +48,7 @@ base = [ "paddlespeech_feat", "praatio==5.0.0", "pypinyin", + "pypinyin-dict", "python-dateutil", "pyworld", "resampy==0.2.2", @@ -62,6 +63,7 @@ base = [ "visualdl", "webrtcvad", "yacs~=0.1.8", + "prettytable", ] server = [ diff --git a/tests/test_tipc/configs/conformer/train_benchmark.txt b/tests/test_tipc/configs/conformer/train_infer_python.txt similarity index 91% rename from tests/test_tipc/configs/conformer/train_benchmark.txt rename to tests/test_tipc/configs/conformer/train_infer_python.txt index 3833f144c6f9642ca3720caf0a0ddbaeaae5bd5d..33b1debdc59a8bfb22c1787064940020815dd9df 100644 --- a/tests/test_tipc/configs/conformer/train_benchmark.txt +++ b/tests/test_tipc/configs/conformer/train_infer_python.txt @@ -54,4 +54,4 @@ batch_size:16|30 fp_items:fp32 iteration:50 --profiler-options:"batch_range=[10,35];state=GPU;tracer_option=Default;profile_path=model.profile" -flags:FLAGS_eager_delete_tensor_gb=0.0;FLAGS_fraction_of_gpu_memory_to_use=0.98;FLAGS_conv_workspace_size_limit=4096 +flags:null diff --git a/tests/test_tipc/configs/pwgan/train_benchmark.txt b/tests/test_tipc/configs/pwgan/train_infer_python.txt similarity index 91% rename from tests/test_tipc/configs/pwgan/train_benchmark.txt rename to tests/test_tipc/configs/pwgan/train_infer_python.txt index e936da3c2bc1ebc3e289e3d47b323c147d885562..c64984dcfc0439c6fc458d34d55adafa4dcbcdad 100644 --- a/tests/test_tipc/configs/pwgan/train_benchmark.txt +++ b/tests/test_tipc/configs/pwgan/train_infer_python.txt @@ -54,4 +54,4 @@ batch_size:6|16 fp_items:fp32 iteration:50 --profiler_options:"batch_range=[10,35];state=GPU;tracer_option=Default;profile_path=model.profile" -flags:FLAGS_eager_delete_tensor_gb=0.0;FLAGS_fraction_of_gpu_memory_to_use=0.98;FLAGS_conv_workspace_size_limit=4096 +flags:null diff --git a/tests/test_tipc/prepare.sh b/tests/test_tipc/prepare.sh index 0280e5d411d156ebd99452db9100db1fddce82fe..b46b203223fca9cc9de88b34d868b09c99446e0c 100644 --- a/tests/test_tipc/prepare.sh +++ b/tests/test_tipc/prepare.sh @@ -26,15 +26,19 @@ if [ ${MODE} = "benchmark_train" ];then curPath=$(readlink -f "$(dirname "$0")") echo "curPath:"${curPath} cd ${curPath}/../.. - pip install . + apt-get install libsndfile1 + pip install pytest-runner kaldiio setuptools_scm -i https://pypi.tuna.tsinghua.edu.cn/simple + pip install . -i https://pypi.tuna.tsinghua.edu.cn/simple cd - if [ ${model_name} == "conformer" ]; then # set the URL for aishell_tiny dataset - URL='None' + URL=${conformer_data_URL:-"None"} echo "URL:"${URL} if [ ${URL} == 'None' ];then echo "please contact author to get the URL.\n" exit + else + wget -P ${curPath}/../../dataset/aishell/ ${URL} fi sed -i "s#^URL_ROOT_TAG#URL_ROOT = '${URL}'#g" ${curPath}/conformer/scripts/aishell_tiny.py cp ${curPath}/conformer/scripts/aishell_tiny.py ${curPath}/../../dataset/aishell/ @@ -42,6 +46,7 @@ if [ ${MODE} = "benchmark_train" ];then source path.sh # download audio data sed -i "s#aishell.py#aishell_tiny.py#g" ./local/data.sh + sed -i "s#python3#python#g" ./local/data.sh bash ./local/data.sh || exit -1 if [ $? 
-ne 0 ]; then exit 1 @@ -56,7 +61,6 @@ if [ ${MODE} = "benchmark_train" ];then sed -i "s#conf/#test_tipc/conformer/benchmark_train/conf/#g" ${curPath}/conformer/benchmark_train/conf/conformer.yaml sed -i "s#data/#test_tipc/conformer/benchmark_train/data/#g" ${curPath}/conformer/benchmark_train/conf/tuning/decode.yaml sed -i "s#data/#test_tipc/conformer/benchmark_train/data/#g" ${curPath}/conformer/benchmark_train/conf/preprocess.yaml - fi if [ ${model_name} == "pwgan" ]; then @@ -73,4 +77,4 @@ if [ ${MODE} = "benchmark_train" ];then python ../paddlespeech/t2s/exps/gan_vocoder/normalize.py --metadata=dump/test/raw/metadata.jsonl --dumpdir=dump/test/norm --stats=dump/train/feats_stats.npy fi -fi \ No newline at end of file +fi diff --git a/tests/unit/asr/deepspeech2_online_model_test.py b/tests/unit/asr/deepspeech2_online_model_test.py index f623c5acd5066795cfa1cae43c622254a5ac88e0..f23c49263ec033280dc9b1ed0ad1b74b68d714c1 100644 --- a/tests/unit/asr/deepspeech2_online_model_test.py +++ b/tests/unit/asr/deepspeech2_online_model_test.py @@ -11,11 +11,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import os +import pickle import unittest import numpy as np import paddle +from paddle import inference +from paddlespeech.s2t.models.ds2_online import DeepSpeech2InferModelOnline from paddlespeech.s2t.models.ds2_online import DeepSpeech2ModelOnline @@ -182,5 +186,77 @@ class TestDeepSpeech2ModelOnline(unittest.TestCase): paddle.allclose(final_state_c_box, final_state_c_box_chk), True) +class TestDeepSpeech2StaticModelOnline(unittest.TestCase): + def setUp(self): + export_prefix = "exp/deepspeech2_online/checkpoints/test_export" + if not os.path.exists(os.path.dirname(export_prefix)): + os.makedirs(os.path.dirname(export_prefix), mode=0o755) + infer_model = DeepSpeech2InferModelOnline( + feat_size=161, + dict_size=4233, + num_conv_layers=2, + num_rnn_layers=5, + rnn_size=1024, + num_fc_layers=0, + fc_layers_size_list=[-1], + use_gru=False) + static_model = infer_model.export() + paddle.jit.save(static_model, export_prefix) + + with open("test_data/static_ds2online_inputs.pickle", "rb") as f: + self.data_dict = pickle.load(f) + + self.setup_model(export_prefix) + + def setup_model(self, export_prefix): + deepspeech_config = inference.Config(export_prefix + ".pdmodel", + export_prefix + ".pdiparams") + if ('CUDA_VISIBLE_DEVICES' in os.environ.keys() and + os.environ['CUDA_VISIBLE_DEVICES'].strip() != ''): + deepspeech_config.enable_use_gpu(100, 0) + deepspeech_config.enable_memory_optim() + deepspeech_predictor = inference.create_predictor(deepspeech_config) + self.predictor = deepspeech_predictor + + def test_unit(self): + input_names = self.predictor.get_input_names() + audio_handle = self.predictor.get_input_handle(input_names[0]) + audio_len_handle = self.predictor.get_input_handle(input_names[1]) + h_box_handle = self.predictor.get_input_handle(input_names[2]) + c_box_handle = self.predictor.get_input_handle(input_names[3]) + + x_chunk = self.data_dict["audio_chunk"] + x_chunk_lens = self.data_dict["audio_chunk_lens"] + chunk_state_h_box = self.data_dict["chunk_state_h_box"] + chunk_state_c_box = self.data_dict["chunk_state_c_box"] + + audio_handle.reshape(x_chunk.shape) + audio_handle.copy_from_cpu(x_chunk) + + audio_len_handle.reshape(x_chunk_lens.shape) + audio_len_handle.copy_from_cpu(x_chunk_lens) + + h_box_handle.reshape(chunk_state_h_box.shape) + 
h_box_handle.copy_from_cpu(chunk_state_h_box) + + c_box_handle.reshape(chunk_state_c_box.shape) + c_box_handle.copy_from_cpu(chunk_state_c_box) + + output_names = self.predictor.get_output_names() + output_handle = self.predictor.get_output_handle(output_names[0]) + output_lens_handle = self.predictor.get_output_handle(output_names[1]) + output_state_h_handle = self.predictor.get_output_handle( + output_names[2]) + output_state_c_handle = self.predictor.get_output_handle( + output_names[3]) + self.predictor.run() + + output_chunk_probs = output_handle.copy_to_cpu() + output_chunk_lens = output_lens_handle.copy_to_cpu() + chunk_state_h_box = output_state_h_handle.copy_to_cpu() + chunk_state_c_box = output_state_c_handle.copy_to_cpu() + return True + + if __name__ == '__main__': unittest.main() diff --git a/tests/unit/asr/deepspeech2_online_model_test.sh b/tests/unit/asr/deepspeech2_online_model_test.sh new file mode 100644 index 0000000000000000000000000000000000000000..629238fd04716a5844156898c8697e9b0e158c9f --- /dev/null +++ b/tests/unit/asr/deepspeech2_online_model_test.sh @@ -0,0 +1,3 @@ +mkdir -p ./test_data +wget -P ./test_data https://paddlespeech.bj.bcebos.com/datasets/unit_test/asr/static_ds2online_inputs.pickle +python deepspeech2_online_model_test.py
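Finally, a quick smoke test for the new stats entry points introduced in this diff. The Python call follows `StatsExecutor.__call__` above; the shell forms assume that `cli_register`/`cli_server_register` map the registered dotted names to `paddlespeech stats` and `paddlespeech_server stats`:

```python
# Print the released pretrained-model tables per task (PrettyTable output).
from paddlespeech.cli.stats import StatsExecutor

stats = StatsExecutor()
stats(task="asr")   # columns: Model-Language-Sample Rate
stats(task="tts")   # columns: Model-Language

# Assumed shell equivalents of the registered commands:
#   paddlespeech stats --task cls
#   paddlespeech_server stats --task tts
```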