diff --git a/demos/streaming_tts_server/conf/tts_online_application.yaml b/demos/streaming_tts_server/conf/tts_online_application.yaml index 67d4641a0f75c08d57798ff43cca99be25d35298..714f4a68969b2ec196c483692c4f712baeaad3a3 100644 --- a/demos/streaming_tts_server/conf/tts_online_application.yaml +++ b/demos/streaming_tts_server/conf/tts_online_application.yaml @@ -43,12 +43,12 @@ tts_online: device: 'cpu' # set 'gpu:id' or 'cpu' # am_block and am_pad are only used by the fastspeech2_cnndecoder_onnx model for streaming am inference; # when am_pad is set to 12, streaming synthetic audio is identical to non-streaming synthetic audio - am_block: 42 + am_block: 72 am_pad: 12 # voc_pad and voc_block are used by the voc model for streaming voc inference; # when the voc model is mb_melgan_csmsc and voc_pad is set to 14, streaming synthetic audio is identical to non-streaming synthetic audio; with the minimum pad of 7, streaming synthetic audio still sounds normal # when the voc model is hifigan_csmsc and voc_pad is set to 20, streaming synthetic audio is identical to non-streaming synthetic audio; with voc_pad set to 14, streaming synthetic audio still sounds normal - voc_block: 14 + voc_block: 36 voc_pad: 14 @@ -91,12 +91,12 @@ tts_online-onnx: lang: 'zh' # am_block and am_pad are only used by the fastspeech2_cnndecoder_onnx model for streaming am inference; # when am_pad is set to 12, streaming synthetic audio is identical to non-streaming synthetic audio - am_block: 42 + am_block: 72 am_pad: 12 # voc_pad and voc_block are used by the voc model for streaming voc inference; # when the voc model is mb_melgan_csmsc_onnx and voc_pad is set to 14, streaming synthetic audio is identical to non-streaming synthetic audio; with the minimum pad of 7, streaming synthetic audio still sounds normal # when the voc model is hifigan_csmsc_onnx and voc_pad is set to 20, streaming synthetic audio is identical to non-streaming synthetic audio; with voc_pad set to 14, streaming synthetic audio still sounds normal - voc_block: 14 + voc_block: 36 voc_pad: 14 # voc_upsample should be the same as n_shift in the voc config.
voc_upsample: 300 diff --git a/paddlespeech/server/bin/paddlespeech_client.py b/paddlespeech/server/bin/paddlespeech_client.py index 8677279b77b77ab22d565cc8bce1a392b6adadd1..19bdc10b1ac03f5c197f5801a71c413dfc77b688 100644 --- a/paddlespeech/server/bin/paddlespeech_client.py +++ b/paddlespeech/server/bin/paddlespeech_client.py @@ -31,6 +31,7 @@ from ..util import stats_wrapper from paddlespeech.cli.log import logger from paddlespeech.server.utils.audio_handler import ASRWsAudioHandler from paddlespeech.server.utils.audio_process import wav2pcm +from paddlespeech.server.utils.util import compute_delay from paddlespeech.server.utils.util import wav2base64 __all__ = [ @@ -221,7 +222,7 @@ class TTSOnlineClientExecutor(BaseExecutor): play = args.play try: - res = self( + self( input=input_, server_ip=server_ip, port=port, @@ -257,17 +258,42 @@ class TTSOnlineClientExecutor(BaseExecutor): logger.info("tts http client start") from paddlespeech.server.utils.audio_handler import TTSHttpHandler handler = TTSHttpHandler(server_ip, port, play) - handler.run(input, spk_id, speed, volume, sample_rate, output) + first_response, final_response, duration, save_audio_success, receive_time_list, chunk_duration_list = handler.run( + input, spk_id, speed, volume, sample_rate, output) + delay_time_list = compute_delay(receive_time_list, + chunk_duration_list) elif protocol == "websocket": from paddlespeech.server.utils.audio_handler import TTSWsHandler logger.info("tts websocket client start") handler = TTSWsHandler(server_ip, port, play) loop = asyncio.get_event_loop() - loop.run_until_complete(handler.run(input, output)) + first_response, final_response, duration, save_audio_success, receive_time_list, chunk_duration_list = loop.run_until_complete( + handler.run(input, output)) + delay_time_list = compute_delay(receive_time_list, + chunk_duration_list) else: logger.error("Please set a correct protocol: http or websocket") + return False + + logger.info(f"sentence: {input}") + logger.info(f"duration: {duration} s") + logger.info(f"first response: {first_response} s") + logger.info(f"final response: {final_response} s") + logger.info(f"RTF: {final_response/duration}") + if output is not None: + if save_audio_success: + logger.info(f"Audio successfully saved in {output}") + else: + logger.error("Audio save failed.") + + if delay_time_list != []: + logger.info( + f"Delay situation: total number of packets: {len(receive_time_list)}, the number of delayed packets: {len(delay_time_list)}, minimum delay time: {min(delay_time_list)} s, maximum delay time: {max(delay_time_list)} s, average delay time: {sum(delay_time_list)/len(delay_time_list)} s, delay rate: {len(delay_time_list)/len(receive_time_list)}" + ) + else: + logger.info("The sentence has no delay in streaming synthesis.") @cli_client_register( diff --git a/paddlespeech/server/conf/application.yaml b/paddlespeech/server/conf/application.yaml index b6a9942ed7bfa4248004a2c27d9a32bbb3ac0386..31a37ef04e2dc910314bad88c1e81fdbff07bb4b 100644 --- a/paddlespeech/server/conf/application.yaml +++ b/paddlespeech/server/conf/application.yaml @@ -1,4 +1,4 @@ -# This is the parameter configuration file for PaddleSpeech Serving. +# This is the parameter configuration file for PaddleSpeech Offline Serving.
################################################################################# # SERVER SETTING # @@ -7,9 +7,7 @@ host: 127.0.0.1 port: 8090 # The task format in the engine_list is: <speech task>_<engine type> -# task choices = ['asr_python', 'asr_inference', 'tts_python', 'tts_inference'] -# protocol = ['websocket', 'http'] (only one can be selected). -# http only support offline engine type. +# task choices = ['asr_python', 'asr_inference', 'tts_python', 'tts_inference', 'cls_python', 'cls_inference'] protocol: 'http' engine_list: ['asr_python', 'tts_python', 'cls_python', 'text_python', 'vector_python'] @@ -50,24 +48,6 @@ asr_inference: summary: True # False -> do not show predictor config -################### speech task: asr; engine_type: online ####################### -asr_online: - model_type: 'deepspeech2online_aishell' - am_model: # the pdmodel file of am static model [optional] - am_params: # the pdiparams file of am static model [optional] - lang: 'zh' - sample_rate: 16000 - cfg_path: - decode_method: - force_yes: True - - am_predictor_conf: - device: # set 'gpu:id' or 'cpu' - switch_ir_optim: True - glog_info: False # True -> print glog - summary: True # False -> do not show predictor config - - ################################### TTS ######################################### ################### speech task: tts; engine_type: python ####################### tts_python: diff --git a/paddlespeech/server/conf/tts_online_application.yaml b/paddlespeech/server/conf/tts_online_application.yaml index 67d4641a0f75c08d57798ff43cca99be25d35298..714f4a68969b2ec196c483692c4f712baeaad3a3 100644 --- a/paddlespeech/server/conf/tts_online_application.yaml +++ b/paddlespeech/server/conf/tts_online_application.yaml @@ -43,12 +43,12 @@ tts_online: device: 'cpu' # set 'gpu:id' or 'cpu' # am_block and am_pad are only used by the fastspeech2_cnndecoder_onnx model for streaming am inference; # when am_pad is set to 12, streaming synthetic audio is identical to non-streaming synthetic audio - am_block: 42 + am_block: 72 am_pad: 12 # voc_pad and voc_block are used by the voc model for streaming voc inference; # when the voc model is mb_melgan_csmsc and voc_pad is set to 14, streaming synthetic audio is identical to non-streaming synthetic audio; with the minimum pad of 7, streaming synthetic audio still sounds normal # when the voc model is hifigan_csmsc and voc_pad is set to 20, streaming synthetic audio is identical to non-streaming synthetic audio; with voc_pad set to 14, streaming synthetic audio still sounds normal - voc_block: 14 + voc_block: 36 voc_pad: 14 @@ -91,12 +91,12 @@ tts_online-onnx: lang: 'zh' # am_block and am_pad are only used by the fastspeech2_cnndecoder_onnx model for streaming am inference; # when am_pad is set to 12, streaming synthetic audio is identical to non-streaming synthetic audio - am_block: 42 + am_block: 72 am_pad: 12 # voc_pad and voc_block are used by the voc model for streaming voc inference; # when the voc model is mb_melgan_csmsc_onnx and voc_pad is set to 14, streaming synthetic audio is identical to non-streaming synthetic audio; with the minimum pad of 7, streaming synthetic audio still sounds normal # when the voc model is hifigan_csmsc_onnx and voc_pad is set to 20, streaming synthetic audio is identical to non-streaming synthetic audio; with voc_pad set to 14, streaming synthetic audio still sounds normal - voc_block: 14 + voc_block: 36 voc_pad: 14 # voc_upsample should be the same as n_shift in the voc config.
voc_upsample: 300 diff --git a/paddlespeech/server/engine/asr/online/asr_engine.py b/paddlespeech/server/engine/asr/online/asr_engine.py index 99d34a3050eff54d2185ff6d0dda0ffb9ae27dbe..ad1e6fa390a1b290a052a5eb976fd34149b2a494 100644 --- a/paddlespeech/server/engine/asr/online/asr_engine.py +++ b/paddlespeech/server/engine/asr/online/asr_engine.py @@ -20,10 +20,9 @@ import paddle from numpy import float32 from yacs.config import CfgNode +from .pretrained_models import pretrained_models from paddlespeech.cli.asr.infer import ASRExecutor -from paddlespeech.cli.asr.infer import model_alias from paddlespeech.cli.log import logger -from paddlespeech.cli.utils import download_and_decompress from paddlespeech.cli.utils import MODEL_HOME from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer from paddlespeech.s2t.frontend.speech import SpeechSegment @@ -40,45 +39,6 @@ from paddlespeech.server.utils.paddle_predictor import init_predictor __all__ = ['ASREngine'] -pretrained_models = { - "deepspeech2online_aishell-zh-16k": { - 'url': - 'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_fbank161_ckpt_0.2.1.model.tar.gz', - 'md5': - '98b87b171b7240b7cae6e07d8d0bc9be', - 'cfg_path': - 'model.yaml', - 'ckpt_path': - 'exp/deepspeech2_online/checkpoints/avg_1', - 'model': - 'exp/deepspeech2_online/checkpoints/avg_1.jit.pdmodel', - 'params': - 'exp/deepspeech2_online/checkpoints/avg_1.jit.pdiparams', - 'lm_url': - 'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm', - 'lm_md5': - '29e02312deb2e59b3c8686c7966d4fe3' - }, - "conformer_online_multicn-zh-16k": { - 'url': - 'https://paddlespeech.bj.bcebos.com/s2t/multi_cn/asr1/asr1_chunk_conformer_multi_cn_ckpt_0.2.3.model.tar.gz', - 'md5': - '0ac93d390552336f2a906aec9e33c5fa', - 'cfg_path': - 'model.yaml', - 'ckpt_path': - 'exp/chunk_conformer/checkpoints/multi_cn', - 'model': - 'exp/chunk_conformer/checkpoints/multi_cn.pdparams', - 'params': - 'exp/chunk_conformer/checkpoints/multi_cn.pdparams', - 'lm_url': - 'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm', - 'lm_md5': - '29e02312deb2e59b3c8686c7966d4fe3' - }, -} - # ASR server connection process class class PaddleASRConnectionHanddler: @@ -625,24 +585,7 @@ class PaddleASRConnectionHanddler: class ASRServerExecutor(ASRExecutor): def __init__(self): super().__init__() - pass - - def _get_pretrained_path(self, tag: str) -> os.PathLike: - """ - Download and returns pretrained resources path of current task. 
- """ - support_models = list(pretrained_models.keys()) - assert tag in pretrained_models, 'The model "{}" you want to use has not been supported, please choose other models.\nThe support models includes:\n\t\t{}\n'.format( - tag, '\n\t\t'.join(support_models)) - - res_path = os.path.join(MODEL_HOME, tag) - decompressed_path = download_and_decompress(pretrained_models[tag], - res_path) - decompressed_path = os.path.abspath(decompressed_path) - logger.info( - 'Use pretrained model stored in: {}'.format(decompressed_path)) - - return decompressed_path + self.pretrained_models = pretrained_models def _init_from_path(self, model_type: str='deepspeech2online_aishell', @@ -658,20 +601,20 @@ class ASRServerExecutor(ASRExecutor): """ self.model_type = model_type self.sample_rate = sample_rate + sample_rate_str = '16k' if sample_rate == 16000 else '8k' + tag = model_type + '-' + lang + '-' + sample_rate_str if cfg_path is None or am_model is None or am_params is None: - sample_rate_str = '16k' if sample_rate == 16000 else '8k' - tag = model_type + '-' + lang + '-' + sample_rate_str logger.info(f"Load the pretrained model, tag = {tag}") res_path = self._get_pretrained_path(tag) # wenetspeech_zh self.res_path = res_path - self.cfg_path = os.path.join(res_path, - pretrained_models[tag]['cfg_path']) + self.cfg_path = os.path.join( + res_path, self.pretrained_models[tag]['cfg_path']) self.am_model = os.path.join(res_path, - pretrained_models[tag]['model']) + self.pretrained_models[tag]['model']) self.am_params = os.path.join(res_path, - pretrained_models[tag]['params']) + self.pretrained_models[tag]['params']) logger.info(res_path) else: self.cfg_path = os.path.abspath(cfg_path) @@ -699,8 +642,8 @@ class ASRServerExecutor(ASRExecutor): self.text_feature = TextFeaturizer( unit_type=self.config.unit_type, vocab=self.vocab) - lm_url = pretrained_models[tag]['lm_url'] - lm_md5 = pretrained_models[tag]['lm_md5'] + lm_url = self.pretrained_models[tag]['lm_url'] + lm_md5 = self.pretrained_models[tag]['lm_md5'] logger.info(f"Start to load language model {lm_url}") self.download_lm( lm_url, @@ -773,7 +716,7 @@ class ASRServerExecutor(ASRExecutor): model_name = model_type[:model_type.rindex( '_')] # model_type: {model_name}_{dataset} logger.info(f"model name: {model_name}") - model_class = dynamic_import(model_name, model_alias) + model_class = dynamic_import(model_name, self.model_alias) model_conf = self.config model = model_class.from_config(model_conf) self.model = model diff --git a/paddlespeech/server/engine/asr/online/pretrained_models.py b/paddlespeech/server/engine/asr/online/pretrained_models.py new file mode 100644 index 0000000000000000000000000000000000000000..005977b46b309d42f2b0bdf981fdfdfef22fdcb2 --- /dev/null +++ b/paddlespeech/server/engine/asr/online/pretrained_models.py @@ -0,0 +1,52 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +pretrained_models = { + "deepspeech2online_aishell-zh-16k": { + 'url': + 'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_fbank161_ckpt_0.2.1.model.tar.gz', + 'md5': + '98b87b171b7240b7cae6e07d8d0bc9be', + 'cfg_path': + 'model.yaml', + 'ckpt_path': + 'exp/deepspeech2_online/checkpoints/avg_1', + 'model': + 'exp/deepspeech2_online/checkpoints/avg_1.jit.pdmodel', + 'params': + 'exp/deepspeech2_online/checkpoints/avg_1.jit.pdiparams', + 'lm_url': + 'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm', + 'lm_md5': + '29e02312deb2e59b3c8686c7966d4fe3' + }, + "conformer_online_multicn-zh-16k": { + 'url': + 'https://paddlespeech.bj.bcebos.com/s2t/multi_cn/asr1/asr1_chunk_conformer_multi_cn_ckpt_0.2.3.model.tar.gz', + 'md5': + '0ac93d390552336f2a906aec9e33c5fa', + 'cfg_path': + 'model.yaml', + 'ckpt_path': + 'exp/chunk_conformer/checkpoints/multi_cn', + 'model': + 'exp/chunk_conformer/checkpoints/multi_cn.pdparams', + 'params': + 'exp/chunk_conformer/checkpoints/multi_cn.pdparams', + 'lm_url': + 'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm', + 'lm_md5': + '29e02312deb2e59b3c8686c7966d4fe3' + }, +} diff --git a/paddlespeech/server/engine/asr/paddleinference/asr_engine.py b/paddlespeech/server/engine/asr/paddleinference/asr_engine.py index 1925bf1d623613d073bb028133a348842b591127..e275f1088f648df62947ded43f297cbb8d2c70c2 100644 --- a/paddlespeech/server/engine/asr/paddleinference/asr_engine.py +++ b/paddlespeech/server/engine/asr/paddleinference/asr_engine.py @@ -19,6 +19,7 @@ from typing import Optional import paddle from yacs.config import CfgNode +from .pretrained_models import pretrained_models from paddlespeech.cli.asr.infer import ASRExecutor from paddlespeech.cli.log import logger from paddlespeech.cli.utils import MODEL_HOME @@ -31,32 +32,11 @@ from paddlespeech.server.utils.paddle_predictor import run_model __all__ = ['ASREngine'] -pretrained_models = { - "deepspeech2offline_aishell-zh-16k": { - 'url': - 'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_aishell_ckpt_0.1.1.model.tar.gz', - 'md5': - '932c3593d62fe5c741b59b31318aa314', - 'cfg_path': - 'model.yaml', - 'ckpt_path': - 'exp/deepspeech2/checkpoints/avg_1', - 'model': - 'exp/deepspeech2/checkpoints/avg_1.jit.pdmodel', - 'params': - 'exp/deepspeech2/checkpoints/avg_1.jit.pdiparams', - 'lm_url': - 'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm', - 'lm_md5': - '29e02312deb2e59b3c8686c7966d4fe3' - }, -} - class ASRServerExecutor(ASRExecutor): def __init__(self): super().__init__() - pass + self.pretrained_models = pretrained_models def _init_from_path(self, model_type: str='wenetspeech', @@ -71,18 +51,18 @@ class ASRServerExecutor(ASRExecutor): Init model and other resources from a specific path. 
""" + sample_rate_str = '16k' if sample_rate == 16000 else '8k' + tag = model_type + '-' + lang + '-' + sample_rate_str if cfg_path is None or am_model is None or am_params is None: - sample_rate_str = '16k' if sample_rate == 16000 else '8k' - tag = model_type + '-' + lang + '-' + sample_rate_str res_path = self._get_pretrained_path(tag) # wenetspeech_zh self.res_path = res_path - self.cfg_path = os.path.join(res_path, - pretrained_models[tag]['cfg_path']) + self.cfg_path = os.path.join( + res_path, self.pretrained_models[tag]['cfg_path']) self.am_model = os.path.join(res_path, - pretrained_models[tag]['model']) + self.pretrained_models[tag]['model']) self.am_params = os.path.join(res_path, - pretrained_models[tag]['params']) + self.pretrained_models[tag]['params']) logger.info(res_path) logger.info(self.cfg_path) logger.info(self.am_model) @@ -109,8 +89,8 @@ class ASRServerExecutor(ASRExecutor): self.text_feature = TextFeaturizer( unit_type=self.config.unit_type, vocab=self.vocab) - lm_url = pretrained_models[tag]['lm_url'] - lm_md5 = pretrained_models[tag]['lm_md5'] + lm_url = self.pretrained_models[tag]['lm_url'] + lm_md5 = self.pretrained_models[tag]['lm_md5'] self.download_lm( lm_url, os.path.dirname(self.config.decode.lang_model_path), lm_md5) diff --git a/paddlespeech/server/engine/asr/paddleinference/pretrained_models.py b/paddlespeech/server/engine/asr/paddleinference/pretrained_models.py new file mode 100644 index 0000000000000000000000000000000000000000..c4c23e38cfb0b126e91090053054bcc50dc733e1 --- /dev/null +++ b/paddlespeech/server/engine/asr/paddleinference/pretrained_models.py @@ -0,0 +1,34 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +pretrained_models = { + "deepspeech2offline_aishell-zh-16k": { + 'url': + 'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_aishell_ckpt_0.1.1.model.tar.gz', + 'md5': + '932c3593d62fe5c741b59b31318aa314', + 'cfg_path': + 'model.yaml', + 'ckpt_path': + 'exp/deepspeech2/checkpoints/avg_1', + 'model': + 'exp/deepspeech2/checkpoints/avg_1.jit.pdmodel', + 'params': + 'exp/deepspeech2/checkpoints/avg_1.jit.pdiparams', + 'lm_url': + 'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm', + 'lm_md5': + '29e02312deb2e59b3c8686c7966d4fe3' + }, +} diff --git a/paddlespeech/server/engine/cls/paddleinference/cls_engine.py b/paddlespeech/server/engine/cls/paddleinference/cls_engine.py index 3982effd902c9d79b7b7684a7bd0268d0e8c1049..0906c2412d36f2d27393731da18e994772c2addd 100644 --- a/paddlespeech/server/engine/cls/paddleinference/cls_engine.py +++ b/paddlespeech/server/engine/cls/paddleinference/cls_engine.py @@ -20,83 +20,20 @@ import numpy as np import paddle import yaml +from .pretrained_models import pretrained_models from paddlespeech.cli.cls.infer import CLSExecutor from paddlespeech.cli.log import logger -from paddlespeech.cli.utils import download_and_decompress -from paddlespeech.cli.utils import MODEL_HOME from paddlespeech.server.engine.base_engine import BaseEngine from paddlespeech.server.utils.paddle_predictor import init_predictor from paddlespeech.server.utils.paddle_predictor import run_model __all__ = ['CLSEngine'] -pretrained_models = { - "panns_cnn6-32k": { - 'url': - 'https://paddlespeech.bj.bcebos.com/cls/inference_model/panns_cnn6_static.tar.gz', - 'md5': - 'da087c31046d23281d8ec5188c1967da', - 'cfg_path': - 'panns.yaml', - 'model_path': - 'inference.pdmodel', - 'params_path': - 'inference.pdiparams', - 'label_file': - 'audioset_labels.txt', - }, - "panns_cnn10-32k": { - 'url': - 'https://paddlespeech.bj.bcebos.com/cls/inference_model/panns_cnn10_static.tar.gz', - 'md5': - '5460cc6eafbfaf0f261cc75b90284ae1', - 'cfg_path': - 'panns.yaml', - 'model_path': - 'inference.pdmodel', - 'params_path': - 'inference.pdiparams', - 'label_file': - 'audioset_labels.txt', - }, - "panns_cnn14-32k": { - 'url': - 'https://paddlespeech.bj.bcebos.com/cls/inference_model/panns_cnn14_static.tar.gz', - 'md5': - 'ccc80b194821274da79466862b2ab00f', - 'cfg_path': - 'panns.yaml', - 'model_path': - 'inference.pdmodel', - 'params_path': - 'inference.pdiparams', - 'label_file': - 'audioset_labels.txt', - }, -} - class CLSServerExecutor(CLSExecutor): def __init__(self): super().__init__() - pass - - def _get_pretrained_path(self, tag: str) -> os.PathLike: - """ - Download and returns pretrained resources path of current task. 
- """ - support_models = list(pretrained_models.keys()) - assert tag in pretrained_models, 'The model "{}" you want to use has not been supported, please choose other models.\nThe support models includes:\n\t\t{}\n'.format( - tag, '\n\t\t'.join(support_models)) - - res_path = os.path.join(MODEL_HOME, tag) - decompressed_path = download_and_decompress(pretrained_models[tag], - res_path) - decompressed_path = os.path.abspath(decompressed_path) - logger.info( - 'Use pretrained model stored in: {}'.format(decompressed_path)) - - return decompressed_path + self.pretrained_models = pretrained_models def _init_from_path( self, @@ -113,14 +50,14 @@ class CLSServerExecutor(CLSExecutor): if cfg_path is None or model_path is None or params_path is None or label_file is None: tag = model_type + '-' + '32k' self.res_path = self._get_pretrained_path(tag) - self.cfg_path = os.path.join(self.res_path, - pretrained_models[tag]['cfg_path']) - self.model_path = os.path.join(self.res_path, - pretrained_models[tag]['model_path']) + self.cfg_path = os.path.join( + self.res_path, self.pretrained_models[tag]['cfg_path']) + self.model_path = os.path.join( + self.res_path, self.pretrained_models[tag]['model_path']) self.params_path = os.path.join( - self.res_path, pretrained_models[tag]['params_path']) - self.label_file = os.path.join(self.res_path, - pretrained_models[tag]['label_file']) + self.res_path, self.pretrained_models[tag]['params_path']) + self.label_file = os.path.join( + self.res_path, self.pretrained_models[tag]['label_file']) else: self.cfg_path = os.path.abspath(cfg_path) self.model_path = os.path.abspath(model_path) diff --git a/paddlespeech/server/engine/cls/paddleinference/pretrained_models.py b/paddlespeech/server/engine/cls/paddleinference/pretrained_models.py new file mode 100644 index 0000000000000000000000000000000000000000..e4914874600c2198e434d267c775dea66f3f252a --- /dev/null +++ b/paddlespeech/server/engine/cls/paddleinference/pretrained_models.py @@ -0,0 +1,58 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +pretrained_models = { + "panns_cnn6-32k": { + 'url': + 'https://paddlespeech.bj.bcebos.com/cls/inference_model/panns_cnn6_static.tar.gz', + 'md5': + 'da087c31046d23281d8ec5188c1967da', + 'cfg_path': + 'panns.yaml', + 'model_path': + 'inference.pdmodel', + 'params_path': + 'inference.pdiparams', + 'label_file': + 'audioset_labels.txt', + }, + "panns_cnn10-32k": { + 'url': + 'https://paddlespeech.bj.bcebos.com/cls/inference_model/panns_cnn10_static.tar.gz', + 'md5': + '5460cc6eafbfaf0f261cc75b90284ae1', + 'cfg_path': + 'panns.yaml', + 'model_path': + 'inference.pdmodel', + 'params_path': + 'inference.pdiparams', + 'label_file': + 'audioset_labels.txt', + }, + "panns_cnn14-32k": { + 'url': + 'https://paddlespeech.bj.bcebos.com/cls/inference_model/panns_cnn14_static.tar.gz', + 'md5': + 'ccc80b194821274da79466862b2ab00f', + 'cfg_path': + 'panns.yaml', + 'model_path': + 'inference.pdmodel', + 'params_path': + 'inference.pdiparams', + 'label_file': + 'audioset_labels.txt', + }, +} diff --git a/paddlespeech/server/engine/tts/online/onnx/pretrained_models.py b/paddlespeech/server/engine/tts/online/onnx/pretrained_models.py new file mode 100644 index 0000000000000000000000000000000000000000..789f5be7d7ca16965459fec6df7e40f7713ee104 --- /dev/null +++ b/paddlespeech/server/engine/tts/online/onnx/pretrained_models.py @@ -0,0 +1,69 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
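One detail worth noticing in the ONNX registry below: 'ckpt' is a list, because the streaming fastspeech2_cnndecoder model ships as three separate ONNX graphs (encoder, decoder, postnet) that the engine loads individually, while the plain fastspeech2 model and the vocoders use a single file. A sketch of the indexing, mirroring the _init_from_path changes later in this diff:

    am_tag = 'fastspeech2_cnndecoder_csmsc_onnx' + '-' + 'zh'
    ckpt_files = pretrained_models[am_tag]['ckpt']
    # Three graphs for the streaming acoustic model:
    am_encoder_infer, am_decoder, am_postnet = ckpt_files
    # A single graph for the vocoder:
    voc_ckpt = pretrained_models['mb_melgan_csmsc_onnx' + '-' + 'zh']['ckpt']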
+# support online model +pretrained_models = { + # fastspeech2 + "fastspeech2_csmsc_onnx-zh": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_csmsc_onnx_0.2.0.zip', + 'md5': + 'fd3ad38d83273ad51f0ea4f4abf3ab4e', + 'ckpt': ['fastspeech2_csmsc.onnx'], + 'phones_dict': + 'phone_id_map.txt', + 'sample_rate': + 24000, + }, + "fastspeech2_cnndecoder_csmsc_onnx-zh": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_streaming_onnx_1.0.0.zip', + 'md5': + '5f70e1a6bcd29d72d54e7931aa86f266', + 'ckpt': [ + 'fastspeech2_csmsc_am_encoder_infer.onnx', + 'fastspeech2_csmsc_am_decoder.onnx', + 'fastspeech2_csmsc_am_postnet.onnx', + ], + 'speech_stats': + 'speech_stats.npy', + 'phones_dict': + 'phone_id_map.txt', + 'sample_rate': + 24000, + }, + + # mb_melgan + "mb_melgan_csmsc_onnx-zh": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_onnx_0.2.0.zip', + 'md5': + '5b83ec746e8414bc29032d954ffd07ec', + 'ckpt': + 'mb_melgan_csmsc.onnx', + 'sample_rate': + 24000, + }, + + # hifigan + "hifigan_csmsc_onnx-zh": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_onnx_0.2.0.zip', + 'md5': + '1a7dc0385875889e46952e50c0994a6b', + 'ckpt': + 'hifigan_csmsc.onnx', + 'sample_rate': + 24000, + }, +} diff --git a/paddlespeech/server/engine/tts/online/onnx/tts_engine.py b/paddlespeech/server/engine/tts/online/onnx/tts_engine.py index 22c1c960700d74d003914f59595679cb8dbad9f1..792442065074af9168f84b1ce695bb484b01e388 100644 --- a/paddlespeech/server/engine/tts/online/onnx/tts_engine.py +++ b/paddlespeech/server/engine/tts/online/onnx/tts_engine.py @@ -20,10 +20,9 @@ from typing import Optional import numpy as np import paddle +from .pretrained_models import pretrained_models from paddlespeech.cli.log import logger from paddlespeech.cli.tts.infer import TTSExecutor -from paddlespeech.cli.utils import download_and_decompress -from paddlespeech.cli.utils import MODEL_HOME from paddlespeech.server.engine.base_engine import BaseEngine from paddlespeech.server.utils.audio_process import float2pcm from paddlespeech.server.utils.onnx_infer import get_sess @@ -34,83 +33,6 @@ from paddlespeech.t2s.frontend.zh_frontend import Frontend __all__ = ['TTSEngine'] -# support online model -pretrained_models = { - # fastspeech2 - "fastspeech2_csmsc_onnx-zh": { - 'url': - 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_csmsc_onnx_0.2.0.zip', - 'md5': - 'fd3ad38d83273ad51f0ea4f4abf3ab4e', - 'ckpt': ['fastspeech2_csmsc.onnx'], - 'phones_dict': - 'phone_id_map.txt', - 'sample_rate': - 24000, - }, - "fastspeech2_cnndecoder_csmsc_onnx-zh": { - 'url': - 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_streaming_onnx_1.0.0.zip', - 'md5': - '5f70e1a6bcd29d72d54e7931aa86f266', - 'ckpt': [ - 'fastspeech2_csmsc_am_encoder_infer.onnx', - 'fastspeech2_csmsc_am_decoder.onnx', - 'fastspeech2_csmsc_am_postnet.onnx', - ], - 'speech_stats': - 'speech_stats.npy', - 'phones_dict': - 'phone_id_map.txt', - 'sample_rate': - 24000, - }, - - # mb_melgan - "mb_melgan_csmsc_onnx-zh": { - 'url': - 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_onnx_0.2.0.zip', - 'md5': - '5b83ec746e8414bc29032d954ffd07ec', - 'ckpt': - 'mb_melgan_csmsc.onnx', - 'sample_rate': - 24000, - }, - - # hifigan - "hifigan_csmsc_onnx-zh": { - 'url': - 
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_onnx_0.2.0.zip', - 'md5': - '1a7dc0385875889e46952e50c0994a6b', - 'ckpt': - 'hifigan_csmsc.onnx', - 'sample_rate': - 24000, - }, -} - -model_alias = { - # acoustic model - "fastspeech2": - "paddlespeech.t2s.models.fastspeech2:FastSpeech2", - "fastspeech2_inference": - "paddlespeech.t2s.models.fastspeech2:FastSpeech2Inference", - - # voc - "mb_melgan": - "paddlespeech.t2s.models.melgan:MelGANGenerator", - "mb_melgan_inference": - "paddlespeech.t2s.models.melgan:MelGANInference", - "hifigan": - "paddlespeech.t2s.models.hifigan:HiFiGANGenerator", - "hifigan_inference": - "paddlespeech.t2s.models.hifigan:HiFiGANInference", -} - -__all__ = ['TTSEngine'] - class TTSServerExecutor(TTSExecutor): def __init__(self, am_block, am_pad, voc_block, voc_pad, voc_upsample): @@ -122,23 +44,6 @@ class TTSServerExecutor(TTSExecutor): self.voc_upsample = voc_upsample self.pretrained_models = pretrained_models - self.model_alias = model_alias - - def _get_pretrained_path(self, tag: str) -> os.PathLike: - """ - #Download and returns pretrained resources path of current task. - """ - support_models = list(pretrained_models.keys()) - assert tag in pretrained_models, 'The model "{}" you want to use has not been supported, please choose other models.\nThe support models includes:\n\t\t{}\n'.format( - tag, '\n\t\t'.join(support_models)) - - res_path = os.path.join(MODEL_HOME, tag) - decompressed_path = download_and_decompress(pretrained_models[tag], - res_path) - decompressed_path = os.path.abspath(decompressed_path) - logger.info( - 'Use pretrained model stored in: {}'.format(decompressed_path)) - return decompressed_path def _init_from_path( self, @@ -173,10 +78,10 @@ class TTSServerExecutor(TTSExecutor): am_res_path = self._get_pretrained_path(am_tag) self.am_res_path = am_res_path self.am_ckpt = os.path.join( - am_res_path, pretrained_models[am_tag]['ckpt'][0]) + am_res_path, self.pretrained_models[am_tag]['ckpt'][0]) # must have phones_dict in acoustic self.phones_dict = os.path.join( - am_res_path, pretrained_models[am_tag]['phones_dict']) + am_res_path, self.pretrained_models[am_tag]['phones_dict']) else: self.am_ckpt = os.path.abspath(am_ckpt[0]) @@ -192,16 +97,16 @@ class TTSServerExecutor(TTSExecutor): am_res_path = self._get_pretrained_path(am_tag) self.am_res_path = am_res_path self.am_encoder_infer = os.path.join( - am_res_path, pretrained_models[am_tag]['ckpt'][0]) + am_res_path, self.pretrained_models[am_tag]['ckpt'][0]) self.am_decoder = os.path.join( - am_res_path, pretrained_models[am_tag]['ckpt'][1]) + am_res_path, self.pretrained_models[am_tag]['ckpt'][1]) self.am_postnet = os.path.join( - am_res_path, pretrained_models[am_tag]['ckpt'][2]) + am_res_path, self.pretrained_models[am_tag]['ckpt'][2]) # must have phones_dict in acoustic self.phones_dict = os.path.join( - am_res_path, pretrained_models[am_tag]['phones_dict']) + am_res_path, self.pretrained_models[am_tag]['phones_dict']) self.am_stat = os.path.join( - am_res_path, pretrained_models[am_tag]['speech_stats']) + am_res_path, self.pretrained_models[am_tag]['speech_stats']) else: self.am_encoder_infer = os.path.abspath(am_ckpt[0]) @@ -229,8 +134,8 @@ class TTSServerExecutor(TTSExecutor): if voc_ckpt is None: voc_res_path = self._get_pretrained_path(voc_tag) self.voc_res_path = voc_res_path - self.voc_ckpt = os.path.join(voc_res_path, - pretrained_models[voc_tag]['ckpt']) + self.voc_ckpt = os.path.join( + voc_res_path, self.pretrained_models[voc_tag]['ckpt']) 
else: self.voc_ckpt = os.path.abspath(voc_ckpt) self.voc_res_path = os.path.dirname(os.path.abspath(self.voc_ckpt)) @@ -283,7 +188,6 @@ class TTSServerExecutor(TTSExecutor): """ Model inference and result stored in self.output. """ - #import pdb;pdb.set_trace() am_block = self.am_block am_pad = self.am_pad @@ -453,10 +357,21 @@ class TTSEngine(BaseEngine): self.config.am_block, self.config.am_pad, self.config.voc_block, self.config.voc_pad, self.config.voc_upsample) - if "cpu" in self.config.am_sess_conf.device or "cpu" in self.config.voc_sess_conf.device: - paddle.set_device("cpu") - else: - paddle.set_device(self.config.am_sess_conf.device) + try: + if self.config.am_sess_conf.device is not None: + self.device = self.config.am_sess_conf.device + elif self.config.voc_sess_conf.device is not None: + self.device = self.config.voc_sess_conf.device + else: + self.device = paddle.get_device() + paddle.set_device(self.device) + except BaseException as e: + logger.error( + "Set device failed, please check if device is already used and the parameter 'device' in the yaml file" + ) + logger.error("Initialize TTS server engine Failed on device: %s." % + (self.device)) + return False try: self.executor._init_from_path( @@ -480,16 +395,17 @@ class TTSEngine(BaseEngine): (self.config.voc_sess_conf.device)) return False - logger.info("Initialize TTS server engine successfully on device: %s." % - (self.config.voc_sess_conf.device)) - # warm up try: self.warm_up() + logger.info("Warm up successfully.") except Exception as e: logger.error("Failed to warm up on tts engine.") return False + logger.info("Initialize TTS server engine successfully on device: %s." % + (self.config.voc_sess_conf.device)) + return True def warm_up(self): @@ -499,9 +415,7 @@ class TTSEngine(BaseEngine): sentence = "您好,欢迎使用语音合成服务。" if self.config.lang == 'en': sentence = "Hello and welcome to the speech synthesis service." - logger.info( - "*******************************warm up ********************************" - ) + logger.info("Start to warm up.") for i in range(3): for wav in self.executor.infer( text=sentence, @@ -512,9 +426,6 @@ class TTSEngine(BaseEngine): f"The first response time of the {i} warm up: {self.executor.first_response_time} s" ) break - logger.info( - "**********************************************************************" - ) def preprocess(self, text_bese64: str=None, text_bytes: bytes=None): # Convert byte to text diff --git a/paddlespeech/server/engine/tts/online/python/pretrained_models.py b/paddlespeech/server/engine/tts/online/python/pretrained_models.py new file mode 100644 index 0000000000000000000000000000000000000000..bf6aded51168c2c21172ec8101413b4cb0e05154 --- /dev/null +++ b/paddlespeech/server/engine/tts/online/python/pretrained_models.py @@ -0,0 +1,73 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
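The am_block/am_pad and voc_block/voc_pad values raised at the top of this diff control how the streaming engines slice their input: every chunk is block frames wide, plus pad frames of overlapping context on each side that are computed and then discarded. A minimal illustration of that windowing, assuming frame-indexed data (this is a sketch, not the repo's chunking utility):

    def iter_chunks(n_frames: int, block: int, pad: int):
        """Yield (start, end) windows of block frames plus pad context per side."""
        for begin in range(0, n_frames, block):
            yield max(0, begin - pad), min(n_frames, begin + block + pad)

    # With voc_block=36 and voc_pad=14, a 100-frame mel is covered by:
    # (0, 50), (22, 86), (58, 100)
    print(list(iter_chunks(100, 36, 14)))

Larger blocks mean fewer, bigger inference calls (better RTF, higher first-chunk latency), while pad trades extra computation for chunk seams that match the non-streaming output.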
+# support online model +pretrained_models = { + # fastspeech2 + "fastspeech2_csmsc-zh": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip', + 'md5': + '637d28a5e53aa60275612ba4393d5f22', + 'config': + 'default.yaml', + 'ckpt': + 'snapshot_iter_76000.pdz', + 'speech_stats': + 'speech_stats.npy', + 'phones_dict': + 'phone_id_map.txt', + }, + "fastspeech2_cnndecoder_csmsc-zh": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_ckpt_1.0.0.zip', + 'md5': + '6eb28e22ace73e0ebe7845f86478f89f', + 'config': + 'cnndecoder.yaml', + 'ckpt': + 'snapshot_iter_153000.pdz', + 'speech_stats': + 'speech_stats.npy', + 'phones_dict': + 'phone_id_map.txt', + }, + + # mb_melgan + "mb_melgan_csmsc-zh": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_ckpt_0.1.1.zip', + 'md5': + 'ee5f0604e20091f0d495b6ec4618b90d', + 'config': + 'default.yaml', + 'ckpt': + 'snapshot_iter_1000000.pdz', + 'speech_stats': + 'feats_stats.npy', + }, + + # hifigan + "hifigan_csmsc-zh": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_ckpt_0.1.1.zip', + 'md5': + 'dd40a3d88dfcf64513fba2f0f961ada6', + 'config': + 'default.yaml', + 'ckpt': + 'snapshot_iter_2500000.pdz', + 'speech_stats': + 'feats_stats.npy', + }, +} diff --git a/paddlespeech/server/engine/tts/online/python/tts_engine.py b/paddlespeech/server/engine/tts/online/python/tts_engine.py index 1f51586bc19149db4d4aac1142470cf824bbd197..1fca5283745325a21b8299c1fdbc661100af7aaf 100644 --- a/paddlespeech/server/engine/tts/online/python/tts_engine.py +++ b/paddlespeech/server/engine/tts/online/python/tts_engine.py @@ -22,10 +22,9 @@ import paddle import yaml from yacs.config import CfgNode +from .pretrained_models import pretrained_models from paddlespeech.cli.log import logger from paddlespeech.cli.tts.infer import TTSExecutor -from paddlespeech.cli.utils import download_and_decompress -from paddlespeech.cli.utils import MODEL_HOME from paddlespeech.s2t.utils.dynamic_import import dynamic_import from paddlespeech.server.engine.base_engine import BaseEngine from paddlespeech.server.utils.audio_process import float2pcm @@ -37,87 +36,6 @@ from paddlespeech.t2s.modules.normalizer import ZScore __all__ = ['TTSEngine'] -# support online model -pretrained_models = { - # fastspeech2 - "fastspeech2_csmsc-zh": { - 'url': - 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip', - 'md5': - '637d28a5e53aa60275612ba4393d5f22', - 'config': - 'default.yaml', - 'ckpt': - 'snapshot_iter_76000.pdz', - 'speech_stats': - 'speech_stats.npy', - 'phones_dict': - 'phone_id_map.txt', - }, - "fastspeech2_cnndecoder_csmsc-zh": { - 'url': - 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_ckpt_1.0.0.zip', - 'md5': - '6eb28e22ace73e0ebe7845f86478f89f', - 'config': - 'cnndecoder.yaml', - 'ckpt': - 'snapshot_iter_153000.pdz', - 'speech_stats': - 'speech_stats.npy', - 'phones_dict': - 'phone_id_map.txt', - }, - - # mb_melgan - "mb_melgan_csmsc-zh": { - 'url': - 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_ckpt_0.1.1.zip', - 'md5': - 'ee5f0604e20091f0d495b6ec4618b90d', - 'config': - 'default.yaml', - 'ckpt': - 'snapshot_iter_1000000.pdz', - 'speech_stats': - 'feats_stats.npy', - }, - - # hifigan - "hifigan_csmsc-zh": { - 'url': - 
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_ckpt_0.1.1.zip', - 'md5': - 'dd40a3d88dfcf64513fba2f0f961ada6', - 'config': - 'default.yaml', - 'ckpt': - 'snapshot_iter_2500000.pdz', - 'speech_stats': - 'feats_stats.npy', - }, -} - -model_alias = { - # acoustic model - "fastspeech2": - "paddlespeech.t2s.models.fastspeech2:FastSpeech2", - "fastspeech2_inference": - "paddlespeech.t2s.models.fastspeech2:FastSpeech2Inference", - - # voc - "mb_melgan": - "paddlespeech.t2s.models.melgan:MelGANGenerator", - "mb_melgan_inference": - "paddlespeech.t2s.models.melgan:MelGANInference", - "hifigan": - "paddlespeech.t2s.models.hifigan:HiFiGANGenerator", - "hifigan_inference": - "paddlespeech.t2s.models.hifigan:HiFiGANInference", -} - -__all__ = ['TTSEngine'] - class TTSServerExecutor(TTSExecutor): def __init__(self, am_block, am_pad, voc_block, voc_pad): @@ -126,6 +44,7 @@ class TTSServerExecutor(TTSExecutor): self.am_pad = am_pad self.voc_block = voc_block self.voc_pad = voc_pad + self.pretrained_models = pretrained_models def get_model_info(self, field: str, @@ -146,7 +65,7 @@ class TTSServerExecutor(TTSExecutor): [Tensor]: standard deviation """ - model_class = dynamic_import(model_name, model_alias) + model_class = dynamic_import(model_name, self.model_alias) if field == "am": odim = self.am_config.n_mels @@ -169,22 +88,6 @@ class TTSServerExecutor(TTSExecutor): return model, model_mu, model_std - def _get_pretrained_path(self, tag: str) -> os.PathLike: - """ - Download and returns pretrained resources path of current task. - """ - support_models = list(pretrained_models.keys()) - assert tag in pretrained_models, 'The model "{}" you want to use has not been supported, please choose other models.\nThe support models includes:\n\t\t{}\n'.format( - tag, '\n\t\t'.join(support_models)) - - res_path = os.path.join(MODEL_HOME, tag) - decompressed_path = download_and_decompress(pretrained_models[tag], - res_path) - decompressed_path = os.path.abspath(decompressed_path) - logger.info( - 'Use pretrained model stored in: {}'.format(decompressed_path)) - return decompressed_path - def _init_from_path( self, am: str='fastspeech2_csmsc', @@ -210,15 +113,15 @@ class TTSServerExecutor(TTSExecutor): if am_ckpt is None or am_config is None or am_stat is None or phones_dict is None: am_res_path = self._get_pretrained_path(am_tag) self.am_res_path = am_res_path - self.am_config = os.path.join(am_res_path, - pretrained_models[am_tag]['config']) + self.am_config = os.path.join( + am_res_path, self.pretrained_models[am_tag]['config']) self.am_ckpt = os.path.join(am_res_path, - pretrained_models[am_tag]['ckpt']) + self.pretrained_models[am_tag]['ckpt']) self.am_stat = os.path.join( - am_res_path, pretrained_models[am_tag]['speech_stats']) + am_res_path, self.pretrained_models[am_tag]['speech_stats']) # must have phones_dict in acoustic self.phones_dict = os.path.join( - am_res_path, pretrained_models[am_tag]['phones_dict']) + am_res_path, self.pretrained_models[am_tag]['phones_dict']) print("self.phones_dict:", self.phones_dict) logger.info(am_res_path) logger.info(self.am_config) @@ -239,12 +142,12 @@ class TTSServerExecutor(TTSExecutor): if voc_ckpt is None or voc_config is None or voc_stat is None: voc_res_path = self._get_pretrained_path(voc_tag) self.voc_res_path = voc_res_path - self.voc_config = os.path.join(voc_res_path, - pretrained_models[voc_tag]['config']) - self.voc_ckpt = os.path.join(voc_res_path, - pretrained_models[voc_tag]['ckpt']) + self.voc_config = os.path.join( + 
voc_res_path, self.pretrained_models[voc_tag]['config']) + self.voc_ckpt = os.path.join( + voc_res_path, self.pretrained_models[voc_tag]['ckpt']) self.voc_stat = os.path.join( - voc_res_path, pretrained_models[voc_tag]['speech_stats']) + voc_res_path, self.pretrained_models[voc_tag]['speech_stats']) logger.info(voc_res_path) logger.info(self.voc_config) logger.info(self.voc_ckpt) @@ -286,7 +189,7 @@ class TTSServerExecutor(TTSExecutor): self.am_ckpt, self.am_stat) am_normalizer = ZScore(am_mu, am_std) am_inference_class = dynamic_import(self.am_name + '_inference', - model_alias) + self.model_alias) self.am_inference = am_inference_class(am_normalizer, am) self.am_inference.eval() print("acoustic model done!") @@ -297,7 +200,7 @@ class TTSServerExecutor(TTSExecutor): self.voc_ckpt, self.voc_stat) voc_normalizer = ZScore(voc_mu, voc_std) voc_inference_class = dynamic_import(self.voc_name + '_inference', - model_alias) + self.model_alias) self.voc_inference = voc_inference_class(voc_normalizer, voc) self.voc_inference.eval() print("voc done!") @@ -477,7 +380,7 @@ class TTSEngine(BaseEngine): ), "Please set correct voc_block and voc_pad, they should be more than 0." try: - if self.config.device: + if self.config.device is not None: self.device = self.config.device else: self.device = paddle.get_device() @@ -513,16 +416,16 @@ class TTSEngine(BaseEngine): (self.device)) return False - logger.info("Initialize TTS server engine successfully on device: %s." % - (self.device)) - # warm up try: self.warm_up() + logger.info("Warm up successfully.") except Exception as e: logger.error("Failed to warm up on tts engine.") return False + logger.info("Initialize TTS server engine successfully on device: %s." % + (self.device)) return True def warm_up(self): @@ -532,9 +435,7 @@ class TTSEngine(BaseEngine): sentence = "您好,欢迎使用语音合成服务。" if self.config.lang == 'en': sentence = "Hello and welcome to the speech synthesis service." - logger.info( - "*******************************warm up ********************************" - ) + logger.info("Start to warm up.") for i in range(3): for wav in self.executor.infer( text=sentence, @@ -545,9 +446,6 @@ class TTSEngine(BaseEngine): f"The first response time of the {i} warm up: {self.executor.first_response_time} s" ) break - logger.info( - "**********************************************************************" - ) def preprocess(self, text_bese64: str=None, text_bytes: bytes=None): # Convert byte to text diff --git a/paddlespeech/server/engine/tts/paddleinference/pretrained_models.py b/paddlespeech/server/engine/tts/paddleinference/pretrained_models.py new file mode 100644 index 0000000000000000000000000000000000000000..9618a7a697765f532a172c551b6be733a68a1bec --- /dev/null +++ b/paddlespeech/server/engine/tts/paddleinference/pretrained_models.py @@ -0,0 +1,87 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
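Each TTS engine's init now resolves its device through the same fallback chain before touching the executor: an explicit device from the engine config if present, otherwise whatever paddle reports as the current default. Condensed into a sketch (the ONNX engine reads am_sess_conf/voc_sess_conf, the paddleinference engine below reads am_predictor_conf/voc_predictor_conf; the helper name here is hypothetical):

    import paddle


    def resolve_device(am_conf_device, voc_conf_device):
        # Prefer the am-side setting, then the voc side, then paddle's default.
        if am_conf_device is not None:
            device = am_conf_device
        elif voc_conf_device is not None:
            device = voc_conf_device
        else:
            device = paddle.get_device()
        paddle.set_device(device)  # may raise if the device is busy or invalid
        return device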
+# Static model applied on paddle inference +pretrained_models = { + # speedyspeech + "speedyspeech_csmsc-zh": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_nosil_baker_static_0.5.zip', + 'md5': + 'f10cbdedf47dc7a9668d2264494e1823', + 'model': + 'speedyspeech_csmsc.pdmodel', + 'params': + 'speedyspeech_csmsc.pdiparams', + 'phones_dict': + 'phone_id_map.txt', + 'tones_dict': + 'tone_id_map.txt', + 'sample_rate': + 24000, + }, + # fastspeech2 + "fastspeech2_csmsc-zh": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_static_0.4.zip', + 'md5': + '9788cd9745e14c7a5d12d32670b2a5a7', + 'model': + 'fastspeech2_csmsc.pdmodel', + 'params': + 'fastspeech2_csmsc.pdiparams', + 'phones_dict': + 'phone_id_map.txt', + 'sample_rate': + 24000, + }, + # pwgan + "pwgan_csmsc-zh": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_static_0.4.zip', + 'md5': + 'e3504aed9c5a290be12d1347836d2742', + 'model': + 'pwgan_csmsc.pdmodel', + 'params': + 'pwgan_csmsc.pdiparams', + 'sample_rate': + 24000, + }, + # mb_melgan + "mb_melgan_csmsc-zh": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_static_0.1.1.zip', + 'md5': + 'ac6eee94ba483421d750433f4c3b8d36', + 'model': + 'mb_melgan_csmsc.pdmodel', + 'params': + 'mb_melgan_csmsc.pdiparams', + 'sample_rate': + 24000, + }, + # hifigan + "hifigan_csmsc-zh": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_static_0.1.1.zip', + 'md5': + '7edd8c436b3a5546b3a7cb8cff9d5a0c', + 'model': + 'hifigan_csmsc.pdmodel', + 'params': + 'hifigan_csmsc.pdiparams', + 'sample_rate': + 24000, + }, +} diff --git a/paddlespeech/server/engine/tts/paddleinference/tts_engine.py b/paddlespeech/server/engine/tts/paddleinference/tts_engine.py index db8813ba901a93fa935ce003b8a7abdeec245485..f1ce8b76e2eacd378ccb8657486716ffb5ad4036 100644 --- a/paddlespeech/server/engine/tts/paddleinference/tts_engine.py +++ b/paddlespeech/server/engine/tts/paddleinference/tts_engine.py @@ -23,10 +23,9 @@ import paddle import soundfile as sf from scipy.io import wavfile +from .pretrained_models import pretrained_models from paddlespeech.cli.log import logger from paddlespeech.cli.tts.infer import TTSExecutor -from paddlespeech.cli.utils import download_and_decompress -from paddlespeech.cli.utils import MODEL_HOME from paddlespeech.server.engine.base_engine import BaseEngine from paddlespeech.server.utils.audio_process import change_speed from paddlespeech.server.utils.errors import ErrorCode @@ -38,101 +37,11 @@ from paddlespeech.t2s.frontend.zh_frontend import Frontend __all__ = ['TTSEngine'] -# Static model applied on paddle inference -pretrained_models = { - # speedyspeech - "speedyspeech_csmsc-zh": { - 'url': - 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_nosil_baker_static_0.5.zip', - 'md5': - 'f10cbdedf47dc7a9668d2264494e1823', - 'model': - 'speedyspeech_csmsc.pdmodel', - 'params': - 'speedyspeech_csmsc.pdiparams', - 'phones_dict': - 'phone_id_map.txt', - 'tones_dict': - 'tone_id_map.txt', - 'sample_rate': - 24000, - }, - # fastspeech2 - "fastspeech2_csmsc-zh": { - 'url': - 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_static_0.4.zip', - 'md5': - '9788cd9745e14c7a5d12d32670b2a5a7', - 'model': - 'fastspeech2_csmsc.pdmodel', - 'params': - 'fastspeech2_csmsc.pdiparams', - 
'phones_dict': - 'phone_id_map.txt', - 'sample_rate': - 24000, - }, - # pwgan - "pwgan_csmsc-zh": { - 'url': - 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_static_0.4.zip', - 'md5': - 'e3504aed9c5a290be12d1347836d2742', - 'model': - 'pwgan_csmsc.pdmodel', - 'params': - 'pwgan_csmsc.pdiparams', - 'sample_rate': - 24000, - }, - # mb_melgan - "mb_melgan_csmsc-zh": { - 'url': - 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_static_0.1.1.zip', - 'md5': - 'ac6eee94ba483421d750433f4c3b8d36', - 'model': - 'mb_melgan_csmsc.pdmodel', - 'params': - 'mb_melgan_csmsc.pdiparams', - 'sample_rate': - 24000, - }, - # hifigan - "hifigan_csmsc-zh": { - 'url': - 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_static_0.1.1.zip', - 'md5': - '7edd8c436b3a5546b3a7cb8cff9d5a0c', - 'model': - 'hifigan_csmsc.pdmodel', - 'params': - 'hifigan_csmsc.pdiparams', - 'sample_rate': - 24000, - }, -} - class TTSServerExecutor(TTSExecutor): def __init__(self): super().__init__() - pass - - def _get_pretrained_path(self, tag: str) -> os.PathLike: - """ - Download and returns pretrained resources path of current task. - """ - assert tag in pretrained_models, 'Can not find pretrained resources of {}.'.format( - tag) - - res_path = os.path.join(MODEL_HOME, tag) - decompressed_path = download_and_decompress(pretrained_models[tag], - res_path) - decompressed_path = os.path.abspath(decompressed_path) - logger.info( - 'Use pretrained model stored in: {}'.format(decompressed_path)) - return decompressed_path + self.pretrained_models = pretrained_models def _init_from_path( self, @@ -161,14 +70,14 @@ class TTSServerExecutor(TTSExecutor): if am_model is None or am_params is None or phones_dict is None: am_res_path = self._get_pretrained_path(am_tag) self.am_res_path = am_res_path - self.am_model = os.path.join(am_res_path, - pretrained_models[am_tag]['model']) - self.am_params = os.path.join(am_res_path, - pretrained_models[am_tag]['params']) + self.am_model = os.path.join( + am_res_path, self.pretrained_models[am_tag]['model']) + self.am_params = os.path.join( + am_res_path, self.pretrained_models[am_tag]['params']) # must have phones_dict in acoustic self.phones_dict = os.path.join( - am_res_path, pretrained_models[am_tag]['phones_dict']) - self.am_sample_rate = pretrained_models[am_tag]['sample_rate'] + am_res_path, self.pretrained_models[am_tag]['phones_dict']) + self.am_sample_rate = self.pretrained_models[am_tag]['sample_rate'] logger.info(am_res_path) logger.info(self.am_model) @@ -183,17 +92,17 @@ class TTSServerExecutor(TTSExecutor): # for speedyspeech self.tones_dict = None - if 'tones_dict' in pretrained_models[am_tag]: + if 'tones_dict' in self.pretrained_models[am_tag]: self.tones_dict = os.path.join( - am_res_path, pretrained_models[am_tag]['tones_dict']) + am_res_path, self.pretrained_models[am_tag]['tones_dict']) if tones_dict: self.tones_dict = tones_dict # for multi speaker fastspeech2 self.speaker_dict = None - if 'speaker_dict' in pretrained_models[am_tag]: + if 'speaker_dict' in self.pretrained_models[am_tag]: self.speaker_dict = os.path.join( - am_res_path, pretrained_models[am_tag]['speaker_dict']) + am_res_path, self.pretrained_models[am_tag]['speaker_dict']) if speaker_dict: self.speaker_dict = speaker_dict @@ -202,11 +111,12 @@ class TTSServerExecutor(TTSExecutor): if voc_model is None or voc_params is None: voc_res_path = self._get_pretrained_path(voc_tag) self.voc_res_path = voc_res_path - self.voc_model 
= os.path.join(voc_res_path, - pretrained_models[voc_tag]['model']) - self.voc_params = os.path.join(voc_res_path, - pretrained_models[voc_tag]['params']) - self.voc_sample_rate = pretrained_models[voc_tag]['sample_rate'] + self.voc_model = os.path.join( + voc_res_path, self.pretrained_models[voc_tag]['model']) + self.voc_params = os.path.join( + voc_res_path, self.pretrained_models[voc_tag]['params']) + self.voc_sample_rate = self.pretrained_models[voc_tag][ + 'sample_rate'] logger.info(voc_res_path) logger.info(self.voc_model) logger.info(self.voc_params) @@ -352,8 +262,24 @@ class TTSEngine(BaseEngine): def init(self, config: dict) -> bool: self.executor = TTSServerExecutor() - self.config = config + + try: + if self.config.am_predictor_conf.device is not None: + self.device = self.config.am_predictor_conf.device + elif self.config.voc_predictor_conf.device is not None: + self.device = self.config.voc_predictor_conf.device + else: + self.device = paddle.get_device() + paddle.set_device(self.device) + except BaseException as e: + logger.error( + "Set device failed, please check if device is already used and the parameter 'device' in the yaml file" + ) + logger.error("Initialize TTS server engine Failed on device: %s." % + (self.device)) + return False + self.executor._init_from_path( am=self.config.am, am_model=self.config.am_model, @@ -370,9 +296,35 @@ class TTSEngine(BaseEngine): am_predictor_conf=self.config.am_predictor_conf, voc_predictor_conf=self.config.voc_predictor_conf, ) + # warm up + try: + self.warm_up() + logger.info("Warm up successfully.") + except Exception as e: + logger.error("Failed to warm up on tts engine.") + return False + logger.info("Initialize TTS server engine successfully.") return True + def warm_up(self): + """warm up + """ + if self.config.lang == 'zh': + sentence = "您好,欢迎使用语音合成服务。" + if self.config.lang == 'en': + sentence = "Hello and welcome to the speech synthesis service." + logger.info("Start to warm up.") + for i in range(3): + st = time.time() + self.executor.infer( + text=sentence, + lang=self.config.lang, + am=self.config.am, + spk_id=0, ) + logger.info( + f"The response time of the {i} warm up: {time.time() - st} s") + def postprocess(self, wav, original_fs: int, diff --git a/paddlespeech/server/engine/tts/python/tts_engine.py b/paddlespeech/server/engine/tts/python/tts_engine.py index f153f60b966682fea72418643b29adc38ffa1f07..d0002baa4f46c949e8258a7bea527a18b781b657 100644 --- a/paddlespeech/server/engine/tts/python/tts_engine.py +++ b/paddlespeech/server/engine/tts/python/tts_engine.py @@ -51,15 +51,15 @@ class TTSEngine(BaseEngine): def init(self, config: dict) -> bool: self.executor = TTSServerExecutor() + self.config = config try: - self.config = config - if self.config.device: + if self.config.device is not None: self.device = self.config.device else: self.device = paddle.get_device() paddle.set_device(self.device) - except BaseException: + except BaseException as e: logger.error( "Set device failed, please check if device is already used and the parameter 'device' in the yaml file" ) @@ -87,10 +87,36 @@ class TTSEngine(BaseEngine): (self.device)) return False + # warm up + try: + self.warm_up() + logger.info("Warm up successfully.") + except Exception as e: + logger.error("Failed to warm up on tts engine.") + return False + logger.info("Initialize TTS server engine successfully on device: %s." 
diff --git a/paddlespeech/server/engine/tts/python/tts_engine.py b/paddlespeech/server/engine/tts/python/tts_engine.py
index f153f60b966682fea72418643b29adc38ffa1f07..d0002baa4f46c949e8258a7bea527a18b781b657 100644
--- a/paddlespeech/server/engine/tts/python/tts_engine.py
+++ b/paddlespeech/server/engine/tts/python/tts_engine.py
@@ -51,15 +51,15 @@ class TTSEngine(BaseEngine):

     def init(self, config: dict) -> bool:
         self.executor = TTSServerExecutor()
+        self.config = config

         try:
-            self.config = config
-            if self.config.device:
+            if self.config.device is not None:
                 self.device = self.config.device
             else:
                 self.device = paddle.get_device()
             paddle.set_device(self.device)
-        except BaseException:
+        except BaseException as e:
             logger.error(
                 "Set device failed, please check if device is already used and the parameter 'device' in the yaml file"
             )
@@ -87,10 +87,36 @@
                          (self.device))
             return False

+        # warm up
+        try:
+            self.warm_up()
+            logger.info("Warm up successfully.")
+        except Exception as e:
+            logger.error("Failed to warm up on tts engine.")
+            return False
+
         logger.info("Initialize TTS server engine successfully on device: %s."
                     % (self.device))
         return True

+    def warm_up(self):
+        """warm up
+        """
+        if self.config.lang == 'zh':
+            sentence = "您好,欢迎使用语音合成服务。"
+        elif self.config.lang == 'en':
+            sentence = "Hello and welcome to the speech synthesis service."
+        logger.info("Start to warm up.")
+        for i in range(3):
+            st = time.time()
+            self.executor.infer(
+                text=sentence,
+                lang=self.config.lang,
+                am=self.config.am,
+                spk_id=0, )
+            logger.info(
+                f"The response time of the {i} warm up: {time.time() - st} s")
+
     def postprocess(self,
                     wav,
                     original_fs: int,
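
Both engines now resolve the device the same way: take the yaml `device` field if set, otherwise fall back to whatever Paddle reports, then pin it with `paddle.set_device`. A minimal sketch of that fallback (the `resolve_device` name is ours, not the engine's):

    import paddle

    def resolve_device(configured=None) -> str:
        """Pick the configured device ('cpu' or 'gpu:0'), else let Paddle decide."""
        device = configured if configured is not None else paddle.get_device()
        paddle.set_device(device)  # raises if the device is unavailable
        return device
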
diff --git a/paddlespeech/server/restful/tts_api.py b/paddlespeech/server/restful/tts_api.py
index d1268428a0b53d41fdf9abb8fd7dbff4d485decc..15d618d9324fcda2616d571a4d074ea0876f0fb5 100644
--- a/paddlespeech/server/restful/tts_api.py
+++ b/paddlespeech/server/restful/tts_api.py
@@ -128,7 +128,7 @@ def tts(request_body: TTSRequest):
     return response


-@router.post("/paddlespeech/streaming/tts")
+@router.post("/paddlespeech/tts/streaming")
 async def stream_tts(request_body: TTSRequest):
     text = request_body.text

diff --git a/paddlespeech/server/tests/tts/online/http_client.py b/paddlespeech/server/tests/tts/online/http_client.py
index 756f7b5be204cbd3ae3dd125c0f04a78b9879421..47b781ed9030e55a33a6a8383f83eb1ba61b617d 100644
--- a/paddlespeech/server/tests/tts/online/http_client.py
+++ b/paddlespeech/server/tests/tts/online/http_client.py
@@ -14,6 +14,7 @@
 import argparse

 from paddlespeech.server.utils.audio_handler import TTSHttpHandler
+from paddlespeech.server.utils.util import compute_delay

 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
@@ -43,5 +44,25 @@ if __name__ == "__main__":

     print("tts http client start")
     handler = TTSHttpHandler(args.server, args.port, args.play)
-    handler.run(args.text, args.spk_id, args.speed, args.volume,
-                args.sample_rate, args.output)
+    first_response, final_response, duration, save_audio_success, receive_time_list, chunk_duration_list = handler.run(
+        args.text, args.spk_id, args.speed, args.volume, args.sample_rate,
+        args.output)
+    delay_time_list = compute_delay(receive_time_list, chunk_duration_list)
+
+    print(f"sentence: {args.text}")
+    print(f"duration: {duration} s")
+    print(f"first response: {first_response} s")
+    print(f"final response: {final_response} s")
+    print(f"RTF: {final_response/duration}")
+    if args.output is not None:
+        if save_audio_success:
+            print(f"Audio successfully saved in {args.output}")
+        else:
+            print("Audio save failed.")
+
+    if delay_time_list != []:
+        print(
+            f"Delay situation: total number of packages: {len(receive_time_list)}, the number of delayed packets: {len(delay_time_list)}, minimum delay time: {min(delay_time_list)} s, maximum delay time: {max(delay_time_list)} s, average delay time: {sum(delay_time_list)/len(delay_time_list)} s, delay rate: {len(delay_time_list)/len(receive_time_list)}"
+        )
+    else:
+        print("The sentence has no delay in streaming synthesis.")
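
The metrics printed by the client relate as follows: `first response` is the time from request to the first audio packet, `final response` the time to the last packet, and RTF divides the latter by the audio `duration`. With illustrative numbers (not measurements):

    # A 5.0 s utterance whose last packet arrives 1.5 s after the request:
    first_response = 0.2   # s, latency until audio starts flowing
    final_response = 1.5   # s, latency until synthesis is complete
    duration = 5.0         # s, length of the synthesized audio

    rtf = final_response / duration
    print(rtf)  # 0.3 -- RTF < 1.0 means synthesis runs faster than real time
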
diff --git a/paddlespeech/server/tests/tts/online/ws_client.py b/paddlespeech/server/tests/tts/online/ws_client.py
index 821d82a9a6e254987cb4774a6a7e50f8b076b3d2..0b1794c8aaef4dc3af3cea3f80b9166548a7a39c 100644
--- a/paddlespeech/server/tests/tts/online/ws_client.py
+++ b/paddlespeech/server/tests/tts/online/ws_client.py
@@ -15,6 +15,7 @@ import argparse
 import asyncio

 from paddlespeech.server.utils.audio_handler import TTSWsHandler
+from paddlespeech.server.utils.util import compute_delay

 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
@@ -35,4 +36,24 @@ if __name__ == "__main__":
     print("tts websocket client start")
     handler = TTSWsHandler(args.server, args.port, args.play)
     loop = asyncio.get_event_loop()
-    loop.run_until_complete(handler.run(args.text, args.output))
+    first_response, final_response, duration, save_audio_success, receive_time_list, chunk_duration_list = loop.run_until_complete(
+        handler.run(args.text, args.output))
+    delay_time_list = compute_delay(receive_time_list, chunk_duration_list)
+
+    print(f"sentence: {args.text}")
+    print(f"duration: {duration} s")
+    print(f"first response: {first_response} s")
+    print(f"final response: {final_response} s")
+    print(f"RTF: {final_response/duration}")
+    if args.output is not None:
+        if save_audio_success:
+            print(f"Audio successfully saved in {args.output}")
+        else:
+            print("Audio save failed.")
+
+    if delay_time_list != []:
+        print(
+            f"Delay situation: total number of packages: {len(receive_time_list)}, the number of delayed packets: {len(delay_time_list)}, minimum delay time: {min(delay_time_list)} s, maximum delay time: {max(delay_time_list)} s, average delay time: {sum(delay_time_list)/len(delay_time_list)} s, delay rate: {len(delay_time_list)/len(receive_time_list)}"
+        )
+    else:
+        print("The sentence has no delay in streaming synthesis.")
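
Both handlers convert each received packet to playable seconds with `len(audio) / 2.0 / 24000`: the streamed payload is 16-bit (2 bytes per sample) mono PCM at the 24 kHz rate of the csmsc models. The same arithmetic as a standalone helper:

    SAMPLE_RATE = 24000   # Hz, fixed by the csmsc acoustic/vocoder models
    BYTES_PER_SAMPLE = 2  # 16-bit mono PCM

    def chunk_seconds(audio: bytes) -> float:
        """Duration of one raw PCM chunk, as appended to chunk_duration_list."""
        return len(audio) / BYTES_PER_SAMPLE / SAMPLE_RATE

    print(chunk_seconds(b"\x00" * 48000))  # 1.0 -- 48000 bytes is 1 s of audio
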
diff --git a/paddlespeech/server/utils/audio_handler.py b/paddlespeech/server/utils/audio_handler.py
index a088929f2ec60c8cbee79feb06fb6d914c5b2476..75f4a10bebe3db1759bfe948a471087b7abe3ec2 100644
--- a/paddlespeech/server/utils/audio_handler.py
+++ b/paddlespeech/server/utils/audio_handler.py
@@ -259,7 +259,8 @@
         """
         self.server = server
         self.port = port
-        self.url = "ws://" + self.server + ":" + str(self.port) + "/ws/tts"
+        self.url = "ws://" + self.server + ":" + str(
+            self.port) + "/paddlespeech/tts/streaming"
         self.play = play
         if self.play:
             import pyaudio
@@ -295,6 +296,8 @@ class TTSWsHandler:
             output (str): save audio path
         """
         all_bytes = b''
+        receive_time_list = []
+        chunk_duration_list = []

         # 1. Send websocket handshake protocal
         async with websockets.connect(self.url) as ws:
@@ -309,14 +312,15 @@
             # 3. Process the received response
             message = await ws.recv()
-            logger.info(f"句子:{text}")
-            logger.info(f"首包响应:{time.time() - st} s")
+            first_response = time.time() - st
             message = json.loads(message)
             status = message["status"]

             while (status == 1):
+                receive_time_list.append(time.time())
                 audio = message["audio"]
                 audio = base64.b64decode(audio)  # bytes
+                chunk_duration_list.append(len(audio) / 2.0 / 24000)
                 all_bytes += audio
                 if self.play:
                     self.mutex.acquire()
@@ -334,15 +338,11 @@
             if status == 2:
                 final_response = time.time() - st
                 duration = len(all_bytes) / 2.0 / 24000
-                logger.info(f"尾包响应:{final_response} s")
-                logger.info(f"音频时长:{duration} s")
-                logger.info(f"RTF: {final_response / duration}")

                 if output is not None:
-                    if save_audio(all_bytes, output):
-                        logger.info(f"音频保存至:{output}")
-                    else:
-                        logger.error("save audio error")
+                    save_audio_success = save_audio(all_bytes, output)
+                else:
+                    save_audio_success = False
             else:
                 logger.error("infer error")

@@ -352,6 +352,8 @@
             self.stream.close()
             self.p.terminate()

+        return first_response, final_response, duration, save_audio_success, receive_time_list, chunk_duration_list
+

 class TTSHttpHandler:
     def __init__(self, server="127.0.0.1", port=8092, play: bool=False):
@@ -365,7 +367,7 @@
         self.server = server
         self.port = port
         self.url = "http://" + str(self.server) + ":" + str(
-            self.port) + "/paddlespeech/streaming/tts"
+            self.port) + "/paddlespeech/tts/streaming"
         self.play = play

         if self.play:
@@ -423,13 +425,16 @@
         all_bytes = b''
         first_flag = 1
+        receive_time_list = []
+        chunk_duration_list = []

         # 2. Send request
         st = time.time()
         html = requests.post(self.url, json.dumps(params), stream=True)

         # 3. Process the received response
-        for chunk in html.iter_content(chunk_size=1024):
+        for chunk in html.iter_content(chunk_size=None):
+            receive_time_list.append(time.time())
             audio = base64.b64decode(chunk)  # bytes
             if first_flag:
                 first_response = time.time() - st
@@ -443,21 +448,15 @@
                     self.t.start()
                     self.start_play = False
             all_bytes += audio
+            chunk_duration_list.append(len(audio) / 2.0 / 24000)

         final_response = time.time() - st
         duration = len(all_bytes) / 2.0 / 24000

-        logger.info(f"句子:{text}")
-        logger.info(f"首包响应:{first_response} s")
-        logger.info(f"尾包响应:{final_response} s")
-        logger.info(f"音频时长:{duration} s")
-        logger.info(f"RTF: {final_response / duration}")
-
         if output is not None:
-            if save_audio(all_bytes, output):
-                logger.info(f"音频保存至:{output}")
-            else:
-                logger.error("save audio error")
+            save_audio_success = save_audio(all_bytes, output)
+        else:
+            save_audio_success = False

         if self.play:
             self.t.join()
@@ -465,6 +464,8 @@
             self.stream.close()
             self.p.terminate()

+        return first_response, final_response, duration, save_audio_success, receive_time_list, chunk_duration_list
+

 class VectorHttpHandler:
     def __init__(self, server_ip=None, port=None):
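
The switch from `iter_content(chunk_size=1024)` to `chunk_size=None` matters here: with `stream=True`, `chunk_size=None` yields data in the chunks in which it arrives, so each iteration sees one complete base64 payload as flushed by the server, whereas a fixed 1024-byte window can split a payload, corrupting `b64decode` and the per-packet timing. A minimal sketch of the consuming side (URL and request body are placeholders, not the full parameter set):

    import base64
    import requests

    url = "http://127.0.0.1:8092/paddlespeech/tts/streaming"
    resp = requests.post(url, json={"text": "您好"}, stream=True)

    pcm = b""
    for chunk in resp.iter_content(chunk_size=None):
        # Each chunk is one server-sent unit, so it decodes cleanly.
        pcm += base64.b64decode(chunk)
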
diff --git a/paddlespeech/server/utils/util.py b/paddlespeech/server/utils/util.py
index 72ee0060e246d437052b916362b2b55b1946fc65..061b213c78360d523d1cc3cc180f93cfaac387ab 100644
--- a/paddlespeech/server/utils/util.py
+++ b/paddlespeech/server/utils/util.py
@@ -75,3 +75,74 @@ def get_chunks(data, block_size, pad_size, step):
     else:
         print("Please set correct type to get chunks, am or voc")
     return chunks
+
+
+def compute_delay(receive_time_list, chunk_duration_list):
+    """compute delay
+    Args:
+        receive_time_list (list): Time to receive each packet
+        chunk_duration_list (list): The audio duration corresponding to each packet
+    Returns:
+        [list]: Delay time list
+    """
+    assert (len(receive_time_list) == len(chunk_duration_list))
+    delay_time_list = []
+    play_time = receive_time_list[0] + chunk_duration_list[0]
+    for i in range(1, len(receive_time_list)):
+        receive_time = receive_time_list[i]
+        delay_time = receive_time - play_time
+        # delayed: the packet arrived after the previous audio finished playing
+        if delay_time > 0:
+            play_time = play_time + delay_time + chunk_duration_list[i]
+            delay_time_list.append(delay_time)
+        # not delayed
+        else:
+            play_time = play_time + chunk_duration_list[i]
+
+    return delay_time_list
+
+
+def count_engine(logfile: str="./nohup.out"):
+    """Compute engine-side inference statistics from the server log.
+    Args:
+        logfile (str, optional): server log. Defaults to "./nohup.out".
+    """
+    first_response_list = []
+    final_response_list = []
+    duration_list = []
+
+    with open(logfile, "r") as f:
+        for line in f.readlines():
+            if "- first response time:" in line:
+                first_response = float(line.split(" ")[-2])
+                first_response_list.append(first_response)
+            elif "- final response time:" in line:
+                final_response = float(line.split(" ")[-2])
+                final_response_list.append(final_response)
+            elif "- The durations of audio is:" in line:
+                duration = float(line.split(" ")[-2])
+                duration_list.append(duration)
+
+    assert (len(first_response_list) == len(final_response_list) and
+            len(final_response_list) == len(duration_list))
+
+    avg_first_response = sum(first_response_list) / len(first_response_list)
+    avg_final_response = sum(final_response_list) / len(final_response_list)
+    avg_duration = sum(duration_list) / len(duration_list)
+    RTF = sum(final_response_list) / sum(duration_list)
+
+    print(
+        "************************* engine result ***************************************"
+    )
+    print(
+        f"test num: {len(duration_list)}, avg first response: {avg_first_response} s, avg final response: {avg_final_response} s, avg duration: {avg_duration}, RTF: {RTF}"
+    )
+    print(
+        f"min duration: {min(duration_list)} s, max duration: {max(duration_list)} s"
+    )
+    print(
+        f"max first response: {max(first_response_list)} s, min first response: {min(first_response_list)} s"
+    )
+    print(
+        f"max final response: {max(final_response_list)} s, min final response: {min(final_response_list)} s"
+    )
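
`compute_delay` models gapless playback: the play head starts when the first packet ends, each later packet must arrive before the play head reaches it, and a late packet both counts as a delay and pushes the play head back. A worked run with illustrative numbers:

    from paddlespeech.server.utils.util import compute_delay

    # Four packets, each carrying 0.45 s of audio.
    receive_time_list = [0.20, 0.40, 1.30, 1.60]    # arrival times (s)
    chunk_duration_list = [0.45, 0.45, 0.45, 0.45]  # playable audio (s)

    # Play head after packet 0: 0.20 + 0.45 = 0.65 s.
    # Packet 1 arrives at 0.40 < 0.65 -> no delay, play head moves to 1.10 s.
    # Packet 2 arrives at 1.30 > 1.10 -> 0.20 s stall, play head moves to 1.75 s.
    # Packet 3 arrives at 1.60 < 1.75 -> no delay.
    print(compute_delay(receive_time_list, chunk_duration_list))
    # [0.2] -- one stall of about 0.2 s
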
diff --git a/paddlespeech/server/ws/tts_socket.py b/paddlespeech/server/ws/tts_socket.py
index 699ee412bb43a2b8f39d164e96360afd88cda689..482aeb79b0dc36aa028f3fa9be44205926dcd8a9 100644
--- a/paddlespeech/server/ws/tts_socket.py
+++ b/paddlespeech/server/ws/tts_socket.py
@@ -24,7 +24,7 @@ from paddlespeech.server.engine.engine_pool import get_engine_pool
 router = APIRouter()


-@router.websocket('/ws/tts')
+@router.websocket('/paddlespeech/tts/streaming')
 async def websocket_endpoint(websocket: WebSocket):
     await websocket.accept()

diff --git a/tests/unit/server/offline/change_yaml.py b/tests/unit/server/offline/change_yaml.py
index cdeaebdbcf35b8c4345d1bdaca5ccfa6ebffbc1b..d51a6259178c981f30d8864c64682d257e2eb1cd 100644
--- a/tests/unit/server/offline/change_yaml.py
+++ b/tests/unit/server/offline/change_yaml.py
@@ -19,7 +19,7 @@ def change_device(yamlfile: str, engine: str, device: str):
     if device == 'cpu':
         set_device = 'cpu'
     elif device == 'gpu':
-        set_device = 'gpu:0'
+        set_device = 'gpu:3'
     else:
         print("Please set correct device: cpu or gpu.")

diff --git a/tests/unit/server/offline/conf/application.yaml b/tests/unit/server/offline/conf/application.yaml
index 2b1a05998083e08377d63ee02bc77323a7c4dce5..762f4af6e952fad3c671b452899584ffcfe81aeb 100644
--- a/tests/unit/server/offline/conf/application.yaml
+++ b/tests/unit/server/offline/conf/application.yaml
@@ -1,4 +1,4 @@
-# This is the parameter configuration file for PaddleSpeech Serving.
+# This is the parameter configuration file for PaddleSpeech Offline Serving.

 #################################################################################
 #                             SERVER SETTING                                    #
@@ -7,8 +7,8 @@ host: 127.0.0.1
 port: 8090

 # The task format in the engine_list is: <speech task>_<engine type>
-# task choices = ['asr_python', 'asr_inference', 'tts_python', 'tts_inference']
-
+# task choices = ['asr_python', 'asr_inference', 'tts_python', 'tts_inference', 'cls_python', 'cls_inference']
+protocol: 'http'
 engine_list: ['asr_python', 'tts_python', 'cls_python']

diff --git a/tests/unit/server/offline/test_server_client.sh b/tests/unit/server/offline/test_server_client.sh
index e7ae7604d177f4fa0f654a10e64e5eb9ba45669e..875008a75f5ac903f3d024ba1250fd653b4c7ebd 100644
--- a/tests/unit/server/offline/test_server_client.sh
+++ b/tests/unit/server/offline/test_server_client.sh
@@ -21,6 +21,8 @@ StartService(){
 }

 ClientTest(){
+    echo "server_ip: $server_ip"
+    echo "port: $port"
     # Client test
     # test asr client
     paddlespeech_client asr --server_ip $server_ip --port $port --input ./zh.wav
@@ -39,6 +41,7 @@ ClientTest(){
     ((test_times+=1))
     paddlespeech_client cls --server_ip $server_ip --port $port --input ./zh.wav
     ((test_times+=1))
+
 }

 GetTestResult() {
@@ -58,6 +61,7 @@ rm -rf log/server.log.wf
 rm -rf log/server.log
 rm -rf log/test_result.log

+cp ../../../../demos/speech_server/conf/application.yaml ./conf/
 config_file=./conf/application.yaml
 server_ip=$(cat $config_file | grep "host" | awk -F " " '{print $2}')
 port=$(cat $config_file | grep "port" | awk '/port:/ {print $2}')
@@ -191,5 +195,4 @@ echo "***************** Here are all the test results ********************"
 cat ./log/test_result.log

 # Restoring conf is the same as demos/speech_server
-rm -rf ./conf
-cp ../../../demos/speech_server/conf/ ./ -rf
\ No newline at end of file
+cp ../../../../demos/speech_server/conf/application.yaml ./conf/
- """ - first_response_list = [] - final_response_list = [] - duration_list = [] + output = str(args.output_dir + "/" + utt_id + ".wav") + if args.protocol == "http": + print("tts http client start") + from paddlespeech.server.utils.audio_handler import TTSHttpHandler + handler = TTSHttpHandler(args.server_ip, args.port, args.play) + first_response, final_response, duration, save_audio_success, receive_time_list, chunk_duration_list = handler.run( + text, args.spk_id, args.speed, args.volume, args.sample_rate, + output) + + elif args.protocol == "websocket": + from paddlespeech.server.utils.audio_handler import TTSWsHandler + print("tts websocket client start") + handler = TTSWsHandler(args.server_ip, args.port, args.play) + loop = asyncio.get_event_loop() + first_response, final_response, duration, save_audio_success, receive_time_list, chunk_duration_list = loop.run_until_complete( + handler.run(text, output)) - with open(logfile, "r") as f: - for line in f.readlines(): - if "- first response time:" in line: - first_response = float(line.splie(" ")[-2]) - first_response_list.append(first_response) - elif "- final response time:" in line: - final_response = float(line.splie(" ")[-2]) - final_response_list.append(final_response) - elif "- The durations of audio is:" in line: - duration = float(line.splie(" ")[-2]) - duration_list.append(duration) + else: + print("Please set correct protocol, http or websocket") - assert (len(first_response_list) == len(final_response_list) and - len(final_response_list) == len(duration_list)) - - avg_first_response = sum(first_response_list) / len(first_response_list) - avg_final_response = sum(final_response_list) / len(final_response_list) - avg_duration = sum(duration_list) / len(duration_list) - RTF = sum(final_response_list) / sum(duration_list) - - print( - "************************* engine result ***************************************" - ) - print( - f"test num: {len(duration_list)}, avg first response: {avg_first_response} s, avg final response: {avg_final_response} s, avg duration: {avg_duration}, RTF: {RTF}" - ) - print( - f"min duration: {min(duration_list)} s, max duration: {max(duration_list)} s" - ) - print( - f"max first response: {max(first_response_list)} s, min first response: {min(first_response_list)} s" - ) - print( - f"max final response: {max(final_response_list)} s, min final response: {min(final_response_list)} s" - ) + return first_response, final_response, duration, save_audio_success, receive_time_list, chunk_duration_list if __name__ == "__main__": @@ -142,10 +60,18 @@ if __name__ == "__main__": default=0, help='Sampling rate, the default is the same as the model') parser.add_argument( - "--server", type=str, help="server ip", default="127.0.0.1") + "--server_ip", type=str, help="server ip", default="127.0.0.1") parser.add_argument("--port", type=int, help="server port", default=8092) + parser.add_argument( + "--protocol", + type=str, + choices=['http', 'websocket'], + help="server protocol", + default="http") parser.add_argument( "--output_dir", type=str, default="./output", help="output dir") + parser.add_argument( + "--play", type=bool, help="whether to play audio", default=False) args = parser.parse_args() @@ -155,13 +81,35 @@ if __name__ == "__main__": first_response_list = [] final_response_list = [] duration_list = [] + all_delay_list = [] + packet_count = 0.0 sentences = get_sentences(text_file=args.text, lang="zh") for utt_id, sentence in sentences: - first_response, final_response, duration = test(args, sentence, 
diff --git a/tests/unit/server/online/tts/test_server/test_http_client.py b/tests/unit/server/online/tts/test_server/test_http_client.py
index 96372ab37c141825d7d59d79f876ab6dccd22b9e..7fdb4e00c45df002138732314c884e204ea567fc 100644
--- a/tests/unit/server/online/tts/test_server/test_http_client.py
+++ b/tests/unit/server/online/tts/test_server/test_http_client.py
@@ -12,117 +12,35 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import argparse
-import base64
-import json
+import asyncio
 import os
-import time
-
-import requests
-
-from paddlespeech.server.utils.audio_process import pcm2wav
+from paddlespeech.server.utils.util import compute_delay
 from paddlespeech.t2s.exps.syn_utils import get_sentences


-def save_audio(buffer, audio_path) -> bool:
-    if audio_path.endswith("pcm"):
-        with open(audio_path, "wb") as f:
-            f.write(buffer)
-    elif audio_path.endswith("wav"):
-        with open("./tmp.pcm", "wb") as f:
-            f.write(buffer)
-        pcm2wav("./tmp.pcm", audio_path, channels=1, bits=16, sample_rate=24000)
-        os.system("rm ./tmp.pcm")
-    else:
-        print("Only supports saved audio format is pcm or wav")
-        return False
-
-    return True
-
-
 def test(args, text, utt_id):
-    params = {
-        "text": text,
-        "spk_id": args.spk_id,
-        "speed": args.speed,
-        "volume": args.volume,
-        "sample_rate": args.sample_rate,
-        "save_path": ''
-    }
-
-    buffer = b''
-    flag = 1
-    url = "http://" + str(args.server) + ":" + str(
-        args.port) + "/paddlespeech/streaming/tts"
-    st = time.time()
-    html = requests.post(url, json.dumps(params), stream=True)
-    for chunk in html.iter_content(chunk_size=1024):
-        chunk = base64.b64decode(chunk)  # bytes
-        if flag:
-            first_response = time.time() - st
-            print(f"首包响应:{first_response} s")
-            flag = 0
-        buffer += chunk
-
-    final_response = time.time() - st
-    duration = len(buffer) / 2.0 / 24000
-
-    print(f"sentence: {text}")
-    print(f"尾包响应:{final_response} s")
-    print(f"音频时长:{duration} s")
-    print(f"RTF: {final_response / duration}")
-
-    save_path = str(args.output_dir + "/" + utt_id + ".wav")
-    save_audio(buffer, save_path)
-    print("音频保存至:", save_path)
-
-    return first_response, final_response, duration
-
-
-def count_engine(logfile: str="./nohup.out"):
-    """For inference on the statistical engine side
-
-    Args:
-        logfile (str, optional): server log. Defaults to "./nohup.out".
-    """
-    first_response_list = []
-    final_response_list = []
-    duration_list = []
+    output = str(args.output_dir + "/" + utt_id + ".wav")
+    if args.protocol == "http":
+        print("tts http client start")
+        from paddlespeech.server.utils.audio_handler import TTSHttpHandler
+        handler = TTSHttpHandler(args.server_ip, args.port, args.play)
+        first_response, final_response, duration, save_audio_success, receive_time_list, chunk_duration_list = handler.run(
+            text, args.spk_id, args.speed, args.volume, args.sample_rate,
+            output)
+
+    elif args.protocol == "websocket":
+        from paddlespeech.server.utils.audio_handler import TTSWsHandler
+        print("tts websocket client start")
+        handler = TTSWsHandler(args.server_ip, args.port, args.play)
+        loop = asyncio.get_event_loop()
+        first_response, final_response, duration, save_audio_success, receive_time_list, chunk_duration_list = loop.run_until_complete(
+            handler.run(text, output))

-    with open(logfile, "r") as f:
-        for line in f.readlines():
-            if "- first response time:" in line:
-                first_response = float(line.splie(" ")[-2])
-                first_response_list.append(first_response)
-            elif "- final response time:" in line:
-                final_response = float(line.splie(" ")[-2])
-                final_response_list.append(final_response)
-            elif "- The durations of audio is:" in line:
-                duration = float(line.splie(" ")[-2])
-                duration_list.append(duration)
+    else:
+        raise ValueError("Please set correct protocol, http or websocket")

-    assert (len(first_response_list) == len(final_response_list) and
-            len(final_response_list) == len(duration_list))
-
-    avg_first_response = sum(first_response_list) / len(first_response_list)
-    avg_final_response = sum(final_response_list) / len(final_response_list)
-    avg_duration = sum(duration_list) / len(duration_list)
-    RTF = sum(final_response_list) / sum(duration_list)
-
-    print(
-        "************************* engine result ***************************************"
-    )
-    print(
-        f"test num: {len(duration_list)}, avg first response: {avg_first_response} s, avg final response: {avg_final_response} s, avg duration: {avg_duration}, RTF: {RTF}"
-    )
-    print(
-        f"min duration: {min(duration_list)} s, max duration: {max(duration_list)} s"
-    )
-    print(
-        f"max first response: {max(first_response_list)} s, min first response: {min(first_response_list)} s"
-    )
-    print(
-        f"max final response: {max(final_response_list)} s, min final response: {min(final_response_list)} s"
-    )
+    return first_response, final_response, duration, save_audio_success, receive_time_list, chunk_duration_list


 if __name__ == "__main__":
@@ -142,10 +60,18 @@ if __name__ == "__main__":
         default=0,
         help='Sampling rate, the default is the same as the model')
     parser.add_argument(
-        "--server", type=str, help="server ip", default="127.0.0.1")
+        "--server_ip", type=str, help="server ip", default="127.0.0.1")
     parser.add_argument("--port", type=int, help="server port", default=8092)
+    parser.add_argument(
+        "--protocol",
+        type=str,
+        choices=['http', 'websocket'],
+        help="server protocol",
+        default="http")
     parser.add_argument(
         "--output_dir", type=str, default="./output", help="output dir")
+    parser.add_argument(
+        "--play", type=bool, help="whether to play audio", default=False)

     args = parser.parse_args()

@@ -155,13 +81,35 @@ if __name__ == "__main__":
     first_response_list = []
     final_response_list = []
     duration_list = []
+    all_delay_list = []
+    packet_count = 0.0

     sentences = get_sentences(text_file=args.text, lang="zh")
     for utt_id, sentence in sentences:
-        first_response, final_response, duration = test(args, sentence,
-                                                        utt_id)
+        first_response, final_response, duration, save_audio_success, receive_time_list, chunk_duration_list = test(
+            args, sentence, utt_id)
+        delay_time_list = compute_delay(receive_time_list, chunk_duration_list)
         first_response_list.append(first_response)
         final_response_list.append(final_response)
         duration_list.append(duration)
+        packet_count += len(receive_time_list)
+
+        print(f"sentence: {sentence}")
+        print(f"first response time: {first_response} s")
+        print(f"final response time: {final_response} s")
+        print(f"audio duration: {duration} s")
+        print(f"RTF of this sentence: {final_response/duration}")
+
+        if delay_time_list != []:
+            for t in delay_time_list:
+                all_delay_list.append(t)
+            print(
+                f"Streaming delay of this sentence: total number of packets: {len(receive_time_list)}, the number of delayed packets: {len(delay_time_list)}, minimum delay time: {min(delay_time_list)} s, maximum delay time: {max(delay_time_list)} s, average delay time: {sum(delay_time_list)/len(delay_time_list)} s, delay rate: {len(delay_time_list)/len(receive_time_list)}"
+            )
+        else:
+            print("This sentence has no delay in streaming synthesis.")

     assert (len(first_response_list) == len(final_response_list) and
             len(final_response_list) == len(duration_list))
@@ -170,19 +118,35 @@ if __name__ == "__main__":
     avg_final_response = sum(final_response_list) / len(final_response_list)
     avg_duration = sum(duration_list) / len(duration_list)
     RTF = sum(final_response_list) / sum(duration_list)
+    if all_delay_list != []:
+        delay_count = len(all_delay_list)
+        avg_delay = sum(all_delay_list) / len(all_delay_list)
+        delay_ratio = len(all_delay_list) / packet_count
+        min_delay = min(all_delay_list)
+        max_delay = max(all_delay_list)
+    else:
+        delay_count = 0.0
+        avg_delay = 0.0
+        delay_ratio = 0.0
+        min_delay = 0.0
+        max_delay = 0.0

     print(
         "************************* server/client result ***************************************"
     )
     print(
-        f"test num: {len(duration_list)}, avg first response: {avg_first_response} s, avg final response: {avg_final_response} s, avg duration: {avg_duration}, RTF: {RTF}"
+        f"test num: {len(duration_list)}, avg first response: {avg_first_response} s, avg final response: {avg_final_response} s, avg duration: {avg_duration}, RTF: {RTF}."
+    )
+    print(
+        f"test num: {len(duration_list)}, packet count: {packet_count}, delay count: {delay_count}, avg delay time: {avg_delay} s, delay ratio: {delay_ratio}"
     )
     print(
         f"min duration: {min(duration_list)} s, max duration: {max(duration_list)} s"
     )
     print(
-        f"max first response: {max(first_response_list)} s, min first response: {min(first_response_list)} s"
+        f"min first response: {min(first_response_list)} s, max first response: {max(first_response_list)} s."
     )
     print(
-        f"max final response: {max(final_response_list)} s, min final response: {min(final_response_list)} s"
+        f"min final response: {min(final_response_list)} s, max final response: {max(final_response_list)} s."
     )
+    print(f"min delay: {min_delay} s, max delay: {max_delay} s")