diff --git a/demos/speech_server/conf/application.yaml b/demos/speech_server/conf/application.yaml index c8d71f2f6ad816e9848096e84c637c0069757594..fd4f5f37567486b05b026568dc09a2973491b12e 100644 --- a/demos/speech_server/conf/application.yaml +++ b/demos/speech_server/conf/application.yaml @@ -9,9 +9,17 @@ port: 8090 ################################################################## # CONFIG FILE # ################################################################## -# add engine type (Options: asr, tts) and config file here. +# The engine_type of speech task needs to keep the same type as the config file of speech task. +# E.g: The engine_type of asr is 'python', the engine_backend of asr is 'XX/asr.yaml' +# E.g: The engine_type of asr is 'inference', the engine_backend of asr is 'XX/asr_pd.yaml' +# +# add engine type (Options: python, inference) +engine_type: + asr: 'inference' + tts: 'inference' +# add engine backend type (Options: asr, tts) and config file here. +# Adding a speech task to engine_backend means starting the service. 
engine_backend: - asr: 'conf/asr/asr.yaml' - tts: 'conf/tts/tts.yaml' - + asr: 'conf/asr/asr_pd.yaml' + tts: 'conf/tts/tts_pd.yaml' diff --git a/demos/speech_server/conf/asr/asr.yaml b/demos/speech_server/conf/asr/asr.yaml index 4c3b0a67e30273681fe765fc2e827f86a21ac380..1a805142a9a1a85b2dfd67a22e216c236bcc9664 100644 --- a/demos/speech_server/conf/asr/asr.yaml +++ b/demos/speech_server/conf/asr/asr.yaml @@ -1,7 +1,8 @@ model: 'conformer_wenetspeech' lang: 'zh' sample_rate: 16000 -cfg_path: -ckpt_path: +cfg_path: # [optional] +ckpt_path: # [optional] decode_method: 'attention_rescoring' -force_yes: False +force_yes: True +device: 'cpu' # set 'gpu:id' or 'cpu' diff --git a/demos/speech_server/conf/asr/asr_pd.yaml b/demos/speech_server/conf/asr/asr_pd.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6cddb4503fc253ba98585d5e0a9d8a079a26aeaf --- /dev/null +++ b/demos/speech_server/conf/asr/asr_pd.yaml @@ -0,0 +1,25 @@ +# This is the parameter configuration file for ASR server. +# These are the static models that support paddle inference. 
+ +################################################################## +# ACOUSTIC MODEL SETTING # +# am choices=['deepspeech2offline_aishell'] TODO +################################################################## +model_type: 'deepspeech2offline_aishell' +am_model: # the pdmodel file of am static model [optional] +am_params: # the pdiparams file of am static model [optional] +lang: 'zh' +sample_rate: 16000 +cfg_path: +decode_method: +force_yes: True + +am_predictor_conf: + device: 'cpu' # set 'gpu:id' or 'cpu' + enable_mkldnn: True + switch_ir_optim: True + + +################################################################## +# OTHERS # +################################################################## diff --git a/demos/speech_server/conf/tts/tts.yaml b/demos/speech_server/conf/tts/tts.yaml index cb4829c881efdc4802530f68abbc13f24bac4a61..19e8874e31c04d99cef2cfb66ab1f86f6605d12e 100644 --- a/demos/speech_server/conf/tts/tts.yaml +++ b/demos/speech_server/conf/tts/tts.yaml @@ -29,4 +29,4 @@ voc_stat: # OTHERS # ################################################################## lang: 'zh' -device: 'gpu:2' +device: 'cpu' # set 'gpu:id' or 'cpu' diff --git a/demos/speech_server/conf/tts/tts_pd.yaml b/demos/speech_server/conf/tts/tts_pd.yaml index c268c6a336bb21be7879980cb3cb3c59611d64cd..97df526132a8f12210db91c49fb51258ab976c35 100644 --- a/demos/speech_server/conf/tts/tts_pd.yaml +++ b/demos/speech_server/conf/tts/tts_pd.yaml @@ -6,8 +6,8 @@ # am choices=['speedyspeech_csmsc', 'fastspeech2_csmsc'] ################################################################## am: 'fastspeech2_csmsc' -am_model: # the pdmodel file of am static model -am_params: # the pdiparams file of am static model +am_model: # the pdmodel file of your am static model (XX.pdmodel) +am_params: # the pdiparams file of your am static model (XX.pdiparams) am_sample_rate: 24000 phones_dict: tones_dict: @@ -15,9 +15,9 @@ speaker_dict: spk_id: 0 am_predictor_conf: - use_gpu: True - enable_mkldnn: 
True - switch_ir_optim: True + device: 'cpu' # set 'gpu:id' or 'cpu' + enable_mkldnn: False + switch_ir_optim: False ################################################################## @@ -25,17 +25,16 @@ am_predictor_conf: # voc choices=['pwgan_csmsc', 'mb_melgan_csmsc','hifigan_csmsc'] ################################################################## voc: 'pwgan_csmsc' -voc_model: # the pdmodel file of vocoder static model -voc_params: # the pdiparams file of vocoder static model +voc_model: # the pdmodel file of your vocoder static model (XX.pdmodel) +voc_params: # the pdiparams file of your vocoder static model (XX.pdiparams) voc_sample_rate: 24000 voc_predictor_conf: - use_gpu: True - enable_mkldnn: True - switch_ir_optim: True + device: 'cpu' # set 'gpu:id' or 'cpu' + enable_mkldnn: False + switch_ir_optim: False ################################################################## # OTHERS # ################################################################## lang: 'zh' -device: paddle.get_device() diff --git a/paddlespeech/server/bin/main.py b/paddlespeech/server/bin/main.py index dda0bbd7f1bc8dcccf16c67fc04eb606a2bfdcd8..360d295ef583a4d490a76392ff9a362c40ee4656 100644 --- a/paddlespeech/server/bin/main.py +++ b/paddlespeech/server/bin/main.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import argparse + import uvicorn -import yaml from fastapi import FastAPI from paddlespeech.server.engine.engine_pool import init_engine_pool diff --git a/paddlespeech/server/bin/paddlespeech_client.py b/paddlespeech/server/bin/paddlespeech_client.py index 3730d607066ed2650929502a27de674308903701..853d272fb4a40ebe10890b6717e433aacb768ea0 100644 --- a/paddlespeech/server/bin/paddlespeech_client.py +++ b/paddlespeech/server/bin/paddlespeech_client.py @@ -48,8 +48,9 @@ class TTSClientExecutor(BaseExecutor): self.parser.add_argument( '--input', type=str, - default="你好,欢迎使用语音合成服务", - help='A sentence to be synthesized.') + default=None, + help='Text to be synthesized.', + required=True) self.parser.add_argument( '--spk_id', type=int, default=0, help='Speaker id') self.parser.add_argument( @@ -123,7 +124,7 @@ class TTSClientExecutor(BaseExecutor): logger.info("RTF: %f " % (time_consume / duration)) return True - except: + except BaseException: logger.error("Failed to synthesized audio.") return False @@ -163,7 +164,7 @@ class TTSClientExecutor(BaseExecutor): print("Audio duration: %f s." % (duration)) print("Response time: %f s." % (time_consume)) print("RTF: %f " % (time_consume / duration)) - except: + except BaseException: print("Failed to synthesized audio.") @@ -181,8 +182,9 @@ class ASRClientExecutor(BaseExecutor): self.parser.add_argument( '--input', type=str, - default="./paddlespeech/server/tests/16_audio.wav", - help='Audio file to be recognized') + default=None, + help='Audio file to be recognized', + required=True) self.parser.add_argument( '--sample_rate', type=int, default=16000, help='audio sample rate') self.parser.add_argument( @@ -209,7 +211,7 @@ class ASRClientExecutor(BaseExecutor): logger.info(r.json()) logger.info("time cost %f s." 
% (time_end - time_start)) return True - except: + except BaseException: logger.error("Failed to speech recognition.") return False @@ -240,5 +242,5 @@ class ASRClientExecutor(BaseExecutor): time_end = time.time() print(r.json()) print("time cost %f s." % (time_end - time_start)) - except: - print("Failed to speech recognition.") \ No newline at end of file + except BaseException: + print("Failed to speech recognition.") diff --git a/paddlespeech/server/bin/paddlespeech_server.py b/paddlespeech/server/bin/paddlespeech_server.py index 7c88d8a0ed3820dd421f7015247a8d6a7faa0e04..aff77d54436eac55fda46c8e2ed218cc115a0085 100644 --- a/paddlespeech/server/bin/paddlespeech_server.py +++ b/paddlespeech/server/bin/paddlespeech_server.py @@ -20,7 +20,7 @@ from fastapi import FastAPI from ..executor import BaseExecutor from ..util import cli_server_register from ..util import stats_wrapper -from paddlespeech.server.engine.engine_factory import EngineFactory +from paddlespeech.server.engine.engine_pool import init_engine_pool from paddlespeech.server.restful.api import setup_router from paddlespeech.server.utils.config import get_config @@ -41,7 +41,8 @@ class ServerExecutor(BaseExecutor): "--config_file", action="store", help="yaml file of the app", - default="./conf/application.yaml") + default=None, + required=True) self.parser.add_argument( "--log_file", @@ -51,8 +52,10 @@ class ServerExecutor(BaseExecutor): def init(self, config) -> bool: """system initialization + Args: config (CfgNode): config object + Returns: bool: """ @@ -61,13 +64,8 @@ class ServerExecutor(BaseExecutor): api_router = setup_router(api_list) app.include_router(api_router) - # init engine - engine_pool = [] - for engine in config.engine_backend: - engine_pool.append(EngineFactory.get_engine(engine_name=engine)) - if not engine_pool[-1].init( - config_file=config.engine_backend[engine]): - return False + if not init_engine_pool(config): + return False return True diff --git 
a/paddlespeech/server/conf/application.yaml b/paddlespeech/server/conf/application.yaml index 154ef9af41a4fb77318d7a5bab2cc6278a662b95..cc08665eabde72596373cbfdc13bef3f9d4ad314 100644 --- a/paddlespeech/server/conf/application.yaml +++ b/paddlespeech/server/conf/application.yaml @@ -9,12 +9,17 @@ port: 8090 ################################################################## # CONFIG FILE # ################################################################## +# The engine_type of speech task needs to keep the same type as the config file of speech task. +# E.g: The engine_type of asr is 'python', the engine_backend of asr is 'XX/asr.yaml' +# E.g: The engine_type of asr is 'inference', the engine_backend of asr is 'XX/asr_pd.yaml' +# # add engine type (Options: python, inference) engine_type: - asr: 'inference' - # tts: 'inference' + asr: 'python' + tts: 'python' # add engine backend type (Options: asr, tts) and config file here. +# Adding a speech task to engine_backend means starting the service. 
engine_backend: - asr: 'conf/asr/asr_pd.yaml' - #tts: 'conf/tts/tts_pd.yaml' + asr: 'conf/asr/asr.yaml' + tts: 'conf/tts/tts.yaml' diff --git a/paddlespeech/server/conf/asr/asr.yaml b/paddlespeech/server/conf/asr/asr.yaml index 50e55a3ca0534d0534aa719ed426e49b35bc7675..1a805142a9a1a85b2dfd67a22e216c236bcc9664 100644 --- a/paddlespeech/server/conf/asr/asr.yaml +++ b/paddlespeech/server/conf/asr/asr.yaml @@ -5,3 +5,4 @@ cfg_path: # [optional] ckpt_path: # [optional] decode_method: 'attention_rescoring' force_yes: True +device: 'cpu' # set 'gpu:id' or 'cpu' diff --git a/paddlespeech/server/conf/asr/asr_pd.yaml b/paddlespeech/server/conf/asr/asr_pd.yaml index 43a63f1bd8031af387b171d176b31a4ac6368413..6cddb4503fc253ba98585d5e0a9d8a079a26aeaf 100644 --- a/paddlespeech/server/conf/asr/asr_pd.yaml +++ b/paddlespeech/server/conf/asr/asr_pd.yaml @@ -15,7 +15,7 @@ decode_method: force_yes: True am_predictor_conf: - use_gpu: True + device: 'cpu' # set 'gpu:id' or 'cpu' enable_mkldnn: True switch_ir_optim: True diff --git a/paddlespeech/server/conf/tts/tts.yaml b/paddlespeech/server/conf/tts/tts.yaml index d0e128eaee0c14783d23867563ee0275fbceef1b..19e8874e31c04d99cef2cfb66ab1f86f6605d12e 100644 --- a/paddlespeech/server/conf/tts/tts.yaml +++ b/paddlespeech/server/conf/tts/tts.yaml @@ -29,4 +29,4 @@ voc_stat: # OTHERS # ################################################################## lang: 'zh' -device: paddle.get_device() \ No newline at end of file +device: 'cpu' # set 'gpu:id' or 'cpu' diff --git a/paddlespeech/server/conf/tts/tts_pd.yaml b/paddlespeech/server/conf/tts/tts_pd.yaml index c268c6a336bb21be7879980cb3cb3c59611d64cd..019c7ed6a96c97a32fc7b474ab82d8b72d4b4006 100644 --- a/paddlespeech/server/conf/tts/tts_pd.yaml +++ b/paddlespeech/server/conf/tts/tts_pd.yaml @@ -6,18 +6,18 @@ # am choices=['speedyspeech_csmsc', 'fastspeech2_csmsc'] ################################################################## am: 'fastspeech2_csmsc' -am_model: # the pdmodel file of am static 
model -am_params: # the pdiparams file of am static model -am_sample_rate: 24000 +am_model: # the pdmodel file of your am static model (XX.pdmodel) +am_params: # the pdiparams file of your am static model (XX.pdiparams) +am_sample_rate: 24000 # must match the model phones_dict: tones_dict: speaker_dict: spk_id: 0 am_predictor_conf: - use_gpu: True - enable_mkldnn: True - switch_ir_optim: True + device: 'cpu' # set 'gpu:id' or 'cpu' + enable_mkldnn: False + switch_ir_optim: False ################################################################## @@ -25,17 +25,16 @@ am_predictor_conf: # voc choices=['pwgan_csmsc', 'mb_melgan_csmsc','hifigan_csmsc'] ################################################################## voc: 'pwgan_csmsc' -voc_model: # the pdmodel file of vocoder static model -voc_params: # the pdiparams file of vocoder static model -voc_sample_rate: 24000 +voc_model: # the pdmodel file of your vocoder static model (XX.pdmodel) +voc_params: # the pdiparams file of your vocoder static model (XX.pdiparams) +voc_sample_rate: 24000 # must match the model voc_predictor_conf: - use_gpu: True - enable_mkldnn: True - switch_ir_optim: True + device: 'cpu' # set 'gpu:id' or 'cpu' + enable_mkldnn: False + switch_ir_optim: False ################################################################## # OTHERS # ################################################################## lang: 'zh' -device: paddle.get_device() diff --git a/paddlespeech/server/engine/asr/paddleinference/asr_engine.py b/paddlespeech/server/engine/asr/paddleinference/asr_engine.py index 6d072322918bbf8f4ba7dc3d11d00e1209e21f95..5d4c4fa6aba15d3c8501687435559daf26de1445 100644 --- a/paddlespeech/server/engine/asr/paddleinference/asr_engine.py +++ b/paddlespeech/server/engine/asr/paddleinference/asr_engine.py @@ -13,31 +13,24 @@ # limitations under the License. 
import io import os -from typing import List from typing import Optional -from typing import Union -import librosa import paddle -import soundfile from yacs.config import CfgNode -from paddlespeech.cli.utils import MODEL_HOME -from paddlespeech.s2t.modules.ctc import CTCDecoder from paddlespeech.cli.asr.infer import ASRExecutor from paddlespeech.cli.log import logger +from paddlespeech.cli.utils import MODEL_HOME from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer -from paddlespeech.s2t.transform.transformation import Transformation -from paddlespeech.s2t.utils.dynamic_import import dynamic_import +from paddlespeech.s2t.modules.ctc import CTCDecoder from paddlespeech.s2t.utils.utility import UpdateConfig +from paddlespeech.server.engine.base_engine import BaseEngine from paddlespeech.server.utils.config import get_config from paddlespeech.server.utils.paddle_predictor import init_predictor from paddlespeech.server.utils.paddle_predictor import run_model -from paddlespeech.server.engine.base_engine import BaseEngine __all__ = ['ASREngine'] - pretrained_models = { "deepspeech2offline_aishell-zh-16k": { 'url': @@ -143,7 +136,6 @@ class ASRServerExecutor(ASRExecutor): batch_average=True, # sum / batch_size grad_norm_type=self.config.get('ctc_grad_norm_type', None)) - @paddle.no_grad() def infer(self, model_type: str): """ @@ -161,9 +153,8 @@ class ASRServerExecutor(ASRExecutor): cfg.beam_size, cfg.cutoff_prob, cfg.cutoff_top_n, cfg.num_proc_bsearch) - output_data = run_model( - self.am_predictor, - [audio.numpy(), audio_len.numpy()]) + output_data = run_model(self.am_predictor, + [audio.numpy(), audio_len.numpy()]) probs = output_data[0] eouts_len = output_data[1] @@ -208,14 +199,14 @@ class ASREngine(BaseEngine): paddle.set_device(paddle.get_device()) self.executor._init_from_path( - model_type=self.config.model_type, - am_model=self.config.am_model, - am_params=self.config.am_params, - lang=self.config.lang, - 
sample_rate=self.config.sample_rate, - cfg_path=self.config.cfg_path, - decode_method=self.config.decode_method, - am_predictor_conf=self.config.am_predictor_conf) + model_type=self.config.model_type, + am_model=self.config.am_model, + am_params=self.config.am_params, + lang=self.config.lang, + sample_rate=self.config.sample_rate, + cfg_path=self.config.cfg_path, + decode_method=self.config.decode_method, + am_predictor_conf=self.config.am_predictor_conf) logger.info("Initialize ASR server engine successfully.") return True @@ -230,7 +221,8 @@ class ASREngine(BaseEngine): io.BytesIO(audio_data), self.config.sample_rate, self.config.force_yes): logger.info("start running asr engine") - self.executor.preprocess(self.config.model_type, io.BytesIO(audio_data)) + self.executor.preprocess(self.config.model_type, + io.BytesIO(audio_data)) self.executor.infer(self.config.model_type) self.output = self.executor.postprocess() # Retrieve result of asr. logger.info("end inferring asr engine") diff --git a/paddlespeech/server/engine/asr/python/asr_engine.py b/paddlespeech/server/engine/asr/python/asr_engine.py index fd67b0291fd7a2adc09370f2155a859967eb292e..9fac487d777a684abf609e87da2c93e00dd83cb8 100644 --- a/paddlespeech/server/engine/asr/python/asr_engine.py +++ b/paddlespeech/server/engine/asr/python/asr_engine.py @@ -12,21 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import io -import os -from typing import List -from typing import Optional -from typing import Union -import librosa import paddle -import soundfile from paddlespeech.cli.asr.infer import ASRExecutor from paddlespeech.cli.log import logger -from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer -from paddlespeech.s2t.transform.transformation import Transformation -from paddlespeech.s2t.utils.dynamic_import import dynamic_import -from paddlespeech.s2t.utils.utility import UpdateConfig from paddlespeech.server.engine.base_engine import BaseEngine from paddlespeech.server.utils.config import get_config @@ -63,7 +53,10 @@ class ASREngine(BaseEngine): self.executor = ASRServerExecutor() self.config = get_config(config_file) - paddle.set_device(paddle.get_device()) + if self.config.device is None: + paddle.set_device(paddle.get_device()) + else: + paddle.set_device(self.config.device) self.executor._init_from_path( self.config.model, self.config.lang, self.config.sample_rate, self.config.cfg_path, self.config.decode_method, diff --git a/paddlespeech/server/engine/base_engine.py b/paddlespeech/server/engine/base_engine.py index 0cc20209479ea7e033943b799a7e161ac21e3b35..0f020d1c783e194f96af84de9326eba25595435c 100644 --- a/paddlespeech/server/engine/base_engine.py +++ b/paddlespeech/server/engine/base_engine.py @@ -12,8 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. import os -from typing import Any -from typing import List from typing import Union from pattern_singleton import Singleton diff --git a/paddlespeech/server/engine/engine_factory.py b/paddlespeech/server/engine/engine_factory.py index 05f13568106f6646f342fe94885934203036c26c..546541edfcfbfd619275646446dbd4e086536c4f 100644 --- a/paddlespeech/server/engine/engine_factory.py +++ b/paddlespeech/server/engine/engine_factory.py @@ -13,7 +13,6 @@ # limitations under the License. 
from typing import Text - __all__ = ['EngineFactory'] diff --git a/paddlespeech/server/engine/engine_pool.py b/paddlespeech/server/engine/engine_pool.py index 0198bd80aa9e2e32e4c44bf6761b77a78c09abec..f6a4d2aab2c894149efae75afacf6a275a5dd6b0 100644 --- a/paddlespeech/server/engine/engine_pool.py +++ b/paddlespeech/server/engine/engine_pool.py @@ -29,8 +29,10 @@ def init_engine_pool(config) -> bool: """ global ENGINE_POOL for engine in config.engine_backend: - ENGINE_POOL[engine] = EngineFactory.get_engine(engine_name=engine, engine_type=config.engine_type[engine]) - if not ENGINE_POOL[engine].init(config_file=config.engine_backend[engine]): + ENGINE_POOL[engine] = EngineFactory.get_engine( + engine_name=engine, engine_type=config.engine_type[engine]) + if not ENGINE_POOL[engine].init( + config_file=config.engine_backend[engine]): return False return True diff --git a/paddlespeech/server/engine/tts/paddleinference/tts_engine.py b/paddlespeech/server/engine/tts/paddleinference/tts_engine.py index 7679b02f03b2b5bf6f52482ae3a926f1081f3d65..a9dc5f4ea742b903e229c4f3520909667a67881c 100644 --- a/paddlespeech/server/engine/tts/paddleinference/tts_engine.py +++ b/paddlespeech/server/engine/tts/paddleinference/tts_engine.py @@ -344,7 +344,6 @@ class TTSEngine(BaseEngine): try: self.config = get_config(config_file) - self.executor._init_from_path( am=self.config.am, am_model=self.config.am_model, @@ -361,8 +360,8 @@ class TTSEngine(BaseEngine): am_predictor_conf=self.config.am_predictor_conf, voc_predictor_conf=self.config.voc_predictor_conf, ) - except: - logger.info("Initialize TTS server engine Failed.") + except BaseException: + logger.error("Initialize TTS server engine Failed.") return False logger.info("Initialize TTS server engine successfully.") @@ -406,11 +405,13 @@ class TTSEngine(BaseEngine): # transform speed try: # windows not support soxbindings wav_speed = change_speed(wav_vol, speed, target_fs) - except: + except ServerBaseException: raise 
ServerBaseException( ErrorCode.SERVER_INTERNAL_ERR, "Transform speed failed. Can not install soxbindings on your system. \ You need to set speed value 1.0.") + except BaseException: + logger.error("Transform speed failed.") # wav to base64 buf = io.BytesIO() @@ -463,9 +464,11 @@ class TTSEngine(BaseEngine): try: self.executor.infer( text=sentence, lang=lang, am=self.config.am, spk_id=spk_id) - except: + except ServerBaseException: raise ServerBaseException(ErrorCode.SERVER_INTERNAL_ERR, "tts infer failed.") + except BaseException: + logger.error("tts infer failed.") try: target_sample_rate, wav_base64 = self.postprocess( @@ -475,8 +478,10 @@ class TTSEngine(BaseEngine): volume=volume, speed=speed, audio_path=save_path) - except: + except ServerBaseException: raise ServerBaseException(ErrorCode.SERVER_INTERNAL_ERR, "tts postprocess failed.") + except BaseException: + logger.error("tts postprocess failed.") return lang, target_sample_rate, wav_base64 diff --git a/paddlespeech/server/engine/tts/python/tts_engine.py b/paddlespeech/server/engine/tts/python/tts_engine.py index e11cfb1d1671ae26816a8974c1d55bf0d39e3c06..20b4e0fe94589bf831929cdd19f1b77fa6297f39 100644 --- a/paddlespeech/server/engine/tts/python/tts_engine.py +++ b/paddlespeech/server/engine/tts/python/tts_engine.py @@ -54,7 +54,10 @@ class TTSEngine(BaseEngine): try: self.config = get_config(config_file) - paddle.set_device(self.config.device) + if self.config.device is None: + paddle.set_device(paddle.get_device()) + else: + paddle.set_device(self.config.device) self.executor._init_from_path( am=self.config.am, @@ -69,8 +72,8 @@ class TTSEngine(BaseEngine): voc_ckpt=self.config.voc_ckpt, voc_stat=self.config.voc_stat, lang=self.config.lang) - except: - logger.info("Initialize TTS server engine Failed.") + except BaseException: + logger.error("Initialize TTS server engine Failed.") return False logger.info("Initialize TTS server engine successfully.") @@ -114,10 +117,13 @@ class TTSEngine(BaseEngine): # 
transform speed try: # windows not support soxbindings wav_speed = change_speed(wav_vol, speed, target_fs) - except: + except ServerBaseException: raise ServerBaseException( ErrorCode.SERVER_INTERNAL_ERR, - "Can not install soxbindings on your system.") + "Transform speed failed. Can not install soxbindings on your system. \ + You need to set speed value 1.0.") + except BaseException: + logger.error("Transform speed failed.") # wav to base64 buf = io.BytesIO() @@ -170,9 +176,11 @@ class TTSEngine(BaseEngine): try: self.executor.infer( text=sentence, lang=lang, am=self.config.am, spk_id=spk_id) - except: + except ServerBaseException: raise ServerBaseException(ErrorCode.SERVER_INTERNAL_ERR, "tts infer failed.") + except BaseException: + logger.error("tts infer failed.") try: target_sample_rate, wav_base64 = self.postprocess( @@ -182,8 +190,10 @@ class TTSEngine(BaseEngine): volume=volume, speed=speed, audio_path=save_path) - except: + except ServerBaseException: raise ServerBaseException(ErrorCode.SERVER_INTERNAL_ERR, "tts postprocess failed.") + except BaseException: + logger.error("tts postprocess failed.") return lang, target_sample_rate, wav_base64 diff --git a/paddlespeech/server/restful/asr_api.py b/paddlespeech/server/restful/asr_api.py index 4806c0421da0384c20297670869538b4ff17a169..cf46735dcc84dc92c8bfcfa71b426604ed7c1843 100644 --- a/paddlespeech/server/restful/asr_api.py +++ b/paddlespeech/server/restful/asr_api.py @@ -14,6 +14,7 @@ import base64 import traceback from typing import Union + from fastapi import APIRouter from paddlespeech.server.engine.engine_pool import get_engine_pool @@ -83,7 +84,7 @@ def asr(request_body: ASRRequest): except ServerBaseException as e: response = failed_response(e.error_code, e.msg) - except: + except BaseException: response = failed_response(ErrorCode.SERVER_UNKOWN_ERR) traceback.print_exc() diff --git a/paddlespeech/server/restful/request.py b/paddlespeech/server/restful/request.py index 
2be5f0e546dee6c1c042820ac1a3838a446e23ea..28908801977d346e56a24ba075263a33f37e7d34 100644 --- a/paddlespeech/server/restful/request.py +++ b/paddlespeech/server/restful/request.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import List from typing import Optional from pydantic import BaseModel diff --git a/paddlespeech/server/restful/response.py b/paddlespeech/server/restful/response.py index ab5e395ba6914482e320d13abf2744e2fef71ec0..4e18ee0d790248313b6f14f068ac3f37a33aeba6 100644 --- a/paddlespeech/server/restful/response.py +++ b/paddlespeech/server/restful/response.py @@ -11,9 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import List -from typing import Optional - from pydantic import BaseModel __all__ = ['ASRResponse', 'TTSResponse'] diff --git a/paddlespeech/server/restful/tts_api.py b/paddlespeech/server/restful/tts_api.py index d5fa1d42c4db0e822ab2d545ad69225ebb382222..c7e91300da3eabf80755967cfd7eab99c299d7cd 100644 --- a/paddlespeech/server/restful/tts_api.py +++ b/paddlespeech/server/restful/tts_api.py @@ -16,7 +16,7 @@ from typing import Union from fastapi import APIRouter -from paddlespeech.server.engine.tts.paddleinference.tts_engine import TTSEngine +from paddlespeech.server.engine.engine_pool import get_engine_pool from paddlespeech.server.restful.request import TTSRequest from paddlespeech.server.restful.response import ErrorResponse from paddlespeech.server.restful.response import TTSResponse @@ -60,28 +60,41 @@ def tts(request_body: TTSRequest): Returns: json: [description] """ - # json to dict - item_dict = request_body.dict() - sentence = item_dict['text'] - spk_id = item_dict['spk_id'] - speed = item_dict['speed'] - volume = 
item_dict['volume'] - sample_rate = item_dict['sample_rate'] - save_path = item_dict['save_path'] + # get params + text = request_body.text + spk_id = request_body.spk_id + speed = request_body.speed + volume = request_body.volume + sample_rate = request_body.sample_rate + save_path = request_body.save_path # Check parameters - if speed <=0 or speed > 3 or volume <=0 or volume > 3 or \ - sample_rate not in [0, 16000, 8000] or \ - (save_path is not None and not save_path.endswith("pcm") and not save_path.endswith("wav")): - return failed_response(ErrorCode.SERVER_PARAM_ERR) - - # single - tts_engine = TTSEngine() + if speed <= 0 or speed > 3: + return failed_response( + ErrorCode.SERVER_PARAM_ERR, + "invalid speed value, the value should be between 0 and 3.") + if volume <= 0 or volume > 3: + return failed_response( + ErrorCode.SERVER_PARAM_ERR, + "invalid volume value, the value should be between 0 and 3.") + if sample_rate not in [0, 16000, 8000]: + return failed_response( + ErrorCode.SERVER_PARAM_ERR, + "invalid sample_rate value, the choice of value is 0, 8000, 16000.") + if save_path is not None and not save_path.endswith( + "pcm") and not save_path.endswith("wav"): + return failed_response( + ErrorCode.SERVER_PARAM_ERR, + "invalid save_path, saved audio formats support pcm and wav") # run try: + # get single engine from engine pool + engine_pool = get_engine_pool() + tts_engine = engine_pool['tts'] + lang, target_sample_rate, wav_base64 = tts_engine.run( - sentence, spk_id, speed, volume, sample_rate, save_path) + text, spk_id, speed, volume, sample_rate, save_path) response = { "success": True, @@ -101,7 +114,7 @@ def tts(request_body: TTSRequest): } except ServerBaseException as e: response = failed_response(e.error_code, e.msg) - except: + except BaseException: response = failed_response(ErrorCode.SERVER_UNKOWN_ERR) traceback.print_exc() diff --git a/paddlespeech/server/tests/asr/http_client.py b/paddlespeech/server/tests/asr/http_client.py index 
14adb5741989790140fa509bb4e6eeca1b48546f..49f2adf7c28954af1fc2efc42b81169989ad471e 100644 --- a/paddlespeech/server/tests/asr/http_client.py +++ b/paddlespeech/server/tests/asr/http_client.py @@ -10,11 +10,11 @@ # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the -import requests +import base64 import json import time -import base64 -import io + +import requests def readwav2base64(wav_file): @@ -34,23 +34,23 @@ def main(): url = "http://127.0.0.1:8090/paddlespeech/asr" # start Timestamp - time_start=time.time() + time_start = time.time() test_audio_dir = "./16_audio.wav" audio = readwav2base64(test_audio_dir) data = { - "audio": audio, - "audio_format": "wav", - "sample_rate": 16000, - "lang": "zh_cn", - } + "audio": audio, + "audio_format": "wav", + "sample_rate": 16000, + "lang": "zh_cn", + } r = requests.post(url=url, data=json.dumps(data)) # ending Timestamp - time_end=time.time() - print('time cost',time_end - time_start, 's') + time_end = time.time() + print('time cost', time_end - time_start, 's') print(r.json()) diff --git a/paddlespeech/server/tests/tts/test_client.py b/paddlespeech/server/tests/tts/test_client.py index 65f4ccfece121f5ab472fe3a2e9e2f34244136b9..e42c9bcfa1cf586333ca333251f63e9b50a1b62f 100644 --- a/paddlespeech/server/tests/tts/test_client.py +++ b/paddlespeech/server/tests/tts/test_client.py @@ -25,6 +25,7 @@ import soundfile from paddlespeech.server.utils.audio_process import wav2pcm + # Request and response def tts_client(args): """ Request and response @@ -99,5 +100,5 @@ if __name__ == "__main__": print("Inference time: %f" % (time_consume)) print("The duration of synthesized audio: %f" % (duration)) print("The RTF is: %f" % (rtf)) - except: + except BaseException: print("Failed to synthesized audio.") diff --git a/paddlespeech/server/util.py b/paddlespeech/server/util.py index 
48c4b8cbd09a51460c1150f4a45a43ed9244a4c9..1f1b0be1bd82f112bfa7c6162fde42c236739243 100644 --- a/paddlespeech/server/util.py +++ b/paddlespeech/server/util.py @@ -219,7 +219,7 @@ class ConfigCache: try: cfg = yaml.load(file, Loader=yaml.FullLoader) self._data.update(cfg) - except: + except BaseException: self.flush() @property diff --git a/paddlespeech/server/utils/paddle_predictor.py b/paddlespeech/server/utils/paddle_predictor.py index f910161b88896e054439c855da3efcdad10b21ae..f4216d74ca9c1cd1a444e61fe5a775db2eca3d85 100644 --- a/paddlespeech/server/utils/paddle_predictor.py +++ b/paddlespeech/server/utils/paddle_predictor.py @@ -41,8 +41,9 @@ def init_predictor(model_dir: Optional[os.PathLike]=None, config = Config(model_file, params_file) config.enable_memory_optim() - if predictor_conf["use_gpu"]: - config.enable_use_gpu(1000, 0) + if "gpu" in predictor_conf["device"]: + gpu_id = predictor_conf["device"].split(":")[-1] + config.enable_use_gpu(1000, int(gpu_id)) if predictor_conf["enable_mkldnn"]: config.enable_mkldnn() if predictor_conf["switch_ir_optim"]: