Commit b361a738 authored by lym0302

improve server code, test=doc

Parent 2c2e561b
......@@ -43,12 +43,12 @@ tts_online:
device: 'cpu' # set 'gpu:id' or 'cpu'
# am_block and am_pad are only used by the fastspeech2_cnndecoder_onnx model for streaming am inference;
# when am_pad is set to 12, the streaming synthetic audio is identical to the non-streaming synthetic audio
am_block: 42
am_block: 72
am_pad: 12
# voc_pad and voc_block are used by the voc model for streaming voc inference;
# when the voc model is mb_melgan_csmsc, setting voc_pad to 14 makes the streaming synthetic audio identical to the non-streaming synthetic audio; the minimum pad is 7, with which the streaming synthetic audio still sounds normal
# when the voc model is hifigan_csmsc, setting voc_pad to 20 makes the streaming synthetic audio identical to the non-streaming synthetic audio; with voc_pad set to 14, the streaming synthetic audio sounds normal
voc_block: 14
voc_block: 36
voc_pad: 14
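For context: am_block/voc_block is the number of frames synthesized per streaming step, and am_pad/voc_pad is the extra context glued onto each side of a block so that chunked inference matches whole-utterance inference. A minimal sketch of that pad-block-pad chunking (the edge handling here is an assumption; the repo's own helper is get_chunks in paddlespeech/server/utils/util.py, shown later in this diff):

import numpy as np

def chunk_frames(data, block_size, pad_size):
    # Split along axis 0 into windows of up to pad + block + pad frames,
    # stepping by block_size; edge windows are truncated at the boundaries.
    chunks = []
    for start in range(0, data.shape[0], block_size):
        lo = max(0, start - pad_size)
        hi = min(data.shape[0], start + block_size + pad_size)
        chunks.append(data[lo:hi])
    return chunks

mel = np.zeros((200, 80))  # 200 mel frames, 80 bins
print([c.shape[0] for c in chunk_frames(mel, block_size=72, pad_size=12)])
# -> [84, 96, 68]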
......@@ -91,12 +91,12 @@ tts_online-onnx:
lang: 'zh'
# am_block and am_pad are only used by the fastspeech2_cnndecoder_onnx model for streaming am inference;
# when am_pad is set to 12, the streaming synthetic audio is identical to the non-streaming synthetic audio
am_block: 42
am_block: 72
am_pad: 12
# voc_pad and voc_block are used by the voc model for streaming voc inference;
# when the voc model is mb_melgan_csmsc_onnx, setting voc_pad to 14 makes the streaming synthetic audio identical to the non-streaming synthetic audio; the minimum pad is 7, with which the streaming synthetic audio still sounds normal
# when the voc model is hifigan_csmsc_onnx, setting voc_pad to 20 makes the streaming synthetic audio identical to the non-streaming synthetic audio; with voc_pad set to 14, the streaming synthetic audio sounds normal
voc_block: 14
voc_block: 36
voc_pad: 14
# voc_upsample should be the same as n_shift in the voc config.
voc_upsample: 300
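Back-of-the-envelope check on the new block sizes: voc_upsample matches the vocoder hop size (n_shift), so every mel frame expands into voc_upsample audio samples. A quick sketch of the per-chunk audio duration this implies (24 kHz output assumed, matching the csmsc models above):

voc_block, voc_upsample, sample_rate = 36, 300, 24000
print(voc_block * voc_upsample / sample_rate)  # 0.45 s of audio per vocoder chunk
# with the previous voc_block of 14 this was 14 * 300 / 24000 = 0.175 s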
......
......@@ -31,6 +31,7 @@ from ..util import stats_wrapper
from paddlespeech.cli.log import logger
from paddlespeech.server.utils.audio_handler import ASRWsAudioHandler
from paddlespeech.server.utils.audio_process import wav2pcm
from paddlespeech.server.utils.util import compute_delay
from paddlespeech.server.utils.util import wav2base64
__all__ = [
......@@ -221,7 +222,7 @@ class TTSOnlineClientExecutor(BaseExecutor):
play = args.play
try:
res = self(
self(
input=input_,
server_ip=server_ip,
port=port,
......@@ -257,17 +258,42 @@ class TTSOnlineClientExecutor(BaseExecutor):
logger.info("tts http client start")
from paddlespeech.server.utils.audio_handler import TTSHttpHandler
handler = TTSHttpHandler(server_ip, port, play)
handler.run(input, spk_id, speed, volume, sample_rate, output)
first_response, final_response, duration, save_audio_success, receive_time_list, chunk_duration_list = handler.run(
input, spk_id, speed, volume, sample_rate, output)
delay_time_list = compute_delay(receive_time_list,
chunk_duration_list)
elif protocol == "websocket":
from paddlespeech.server.utils.audio_handler import TTSWsHandler
logger.info("tts websocket client start")
handler = TTSWsHandler(server_ip, port, play)
loop = asyncio.get_event_loop()
loop.run_until_complete(handler.run(input, output))
first_response, final_response, duration, save_audio_success, receive_time_list, chunk_duration_list = loop.run_until_complete(
handler.run(input, output))
delay_time_list = compute_delay(receive_time_list,
chunk_duration_list)
else:
logger.error("Please set correct protocol, http or websocket")
return False
logger.info(f"sentence: {input}")
logger.info(f"duration: {duration} s")
logger.info(f"first response: {first_response} s")
logger.info(f"final response: {final_response} s")
logger.info(f"RTF: {final_response/duration}")
if output is not None:
if save_audio_success:
logger.info(f"Audio successfully saved in {output}")
else:
logger.error("Audio save failed.")
if delay_time_list != []:
logger.info(
f"Delay situation: total number of packages: {len(receive_time_list)}, the number of delayed packets: {len(delay_time_list)}, minimum delay time: {min(delay_time_list)} s, maximum delay time: {max(delay_time_list)} s, average delay time: {sum(delay_time_list)/len(delay_time_list)} s, delay rate:{len(delay_time_list)/len(receive_time_list)}"
)
else:
logger.info("The sentence has no delay in streaming synthesis.")
@cli_client_register(
......
# This is the parameter configuration file for PaddleSpeech Serving.
# This is the parameter configuration file for PaddleSpeech Offline Serving.
#################################################################################
# SERVER SETTING #
......@@ -7,9 +7,7 @@ host: 127.0.0.1
port: 8090
# The task format in the engine_list is: <speech task>_<engine type>
# task choices = ['asr_python', 'asr_inference', 'tts_python', 'tts_inference']
# protocol = ['websocket', 'http'] (only one can be selected).
# http only supports the offline engine type.
# task choices = ['asr_python', 'asr_inference', 'tts_python', 'tts_inference', 'cls_python', 'cls_inference']
protocol: 'http'
engine_list: ['asr_python', 'tts_python', 'cls_python', 'text_python', 'vector_python']
......@@ -50,24 +48,6 @@ asr_inference:
summary: True # False -> do not show predictor config
################### speech task: asr; engine_type: online #######################
asr_online:
model_type: 'deepspeech2online_aishell'
am_model: # the pdmodel file of am static model [optional]
am_params: # the pdiparams file of am static model [optional]
lang: 'zh'
sample_rate: 16000
cfg_path:
decode_method:
force_yes: True
am_predictor_conf:
device: # set 'gpu:id' or 'cpu'
switch_ir_optim: True
glog_info: False # True -> print glog
summary: True # False -> do not show predictor config
################################### TTS #########################################
################### speech task: tts; engine_type: python #######################
tts_python:
......
......@@ -43,12 +43,12 @@ tts_online:
device: 'cpu' # set 'gpu:id' or 'cpu'
# am_block and am_pad are only used by the fastspeech2_cnndecoder_onnx model for streaming am inference;
# when am_pad is set to 12, the streaming synthetic audio is identical to the non-streaming synthetic audio
am_block: 42
am_block: 72
am_pad: 12
# voc_pad and voc_block are used by the voc model for streaming voc inference;
# when the voc model is mb_melgan_csmsc, setting voc_pad to 14 makes the streaming synthetic audio identical to the non-streaming synthetic audio; the minimum pad is 7, with which the streaming synthetic audio still sounds normal
# when the voc model is hifigan_csmsc, setting voc_pad to 20 makes the streaming synthetic audio identical to the non-streaming synthetic audio; with voc_pad set to 14, the streaming synthetic audio sounds normal
voc_block: 14
voc_block: 36
voc_pad: 14
......@@ -91,12 +91,12 @@ tts_online-onnx:
lang: 'zh'
# am_block and am_pad are only used by the fastspeech2_cnndecoder_onnx model for streaming am inference;
# when am_pad is set to 12, the streaming synthetic audio is identical to the non-streaming synthetic audio
am_block: 42
am_block: 72
am_pad: 12
# voc_pad and voc_block are used by the voc model for streaming voc inference;
# when the voc model is mb_melgan_csmsc_onnx, setting voc_pad to 14 makes the streaming synthetic audio identical to the non-streaming synthetic audio; the minimum pad is 7, with which the streaming synthetic audio still sounds normal
# when the voc model is hifigan_csmsc_onnx, setting voc_pad to 20 makes the streaming synthetic audio identical to the non-streaming synthetic audio; with voc_pad set to 14, the streaming synthetic audio sounds normal
voc_block: 14
voc_block: 36
voc_pad: 14
# voc_upsample should be the same as n_shift in the voc config.
voc_upsample: 300
......
......@@ -20,10 +20,9 @@ import paddle
from numpy import float32
from yacs.config import CfgNode
from .pretrained_models import pretrained_models
from paddlespeech.cli.asr.infer import ASRExecutor
from paddlespeech.cli.asr.infer import model_alias
from paddlespeech.cli.log import logger
from paddlespeech.cli.utils import download_and_decompress
from paddlespeech.cli.utils import MODEL_HOME
from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
from paddlespeech.s2t.frontend.speech import SpeechSegment
......@@ -40,45 +39,6 @@ from paddlespeech.server.utils.paddle_predictor import init_predictor
__all__ = ['ASREngine']
pretrained_models = {
"deepspeech2online_aishell-zh-16k": {
'url':
'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_fbank161_ckpt_0.2.1.model.tar.gz',
'md5':
'98b87b171b7240b7cae6e07d8d0bc9be',
'cfg_path':
'model.yaml',
'ckpt_path':
'exp/deepspeech2_online/checkpoints/avg_1',
'model':
'exp/deepspeech2_online/checkpoints/avg_1.jit.pdmodel',
'params':
'exp/deepspeech2_online/checkpoints/avg_1.jit.pdiparams',
'lm_url':
'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm',
'lm_md5':
'29e02312deb2e59b3c8686c7966d4fe3'
},
"conformer_online_multicn-zh-16k": {
'url':
'https://paddlespeech.bj.bcebos.com/s2t/multi_cn/asr1/asr1_chunk_conformer_multi_cn_ckpt_0.2.3.model.tar.gz',
'md5':
'0ac93d390552336f2a906aec9e33c5fa',
'cfg_path':
'model.yaml',
'ckpt_path':
'exp/chunk_conformer/checkpoints/multi_cn',
'model':
'exp/chunk_conformer/checkpoints/multi_cn.pdparams',
'params':
'exp/chunk_conformer/checkpoints/multi_cn.pdparams',
'lm_url':
'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm',
'lm_md5':
'29e02312deb2e59b3c8686c7966d4fe3'
},
}
# ASR server connection process class
class PaddleASRConnectionHanddler:
......@@ -625,24 +585,7 @@ class PaddleASRConnectionHanddler:
class ASRServerExecutor(ASRExecutor):
def __init__(self):
super().__init__()
pass
def _get_pretrained_path(self, tag: str) -> os.PathLike:
"""
Download and return the pretrained resources path of the current task.
"""
support_models = list(pretrained_models.keys())
assert tag in pretrained_models, 'The model "{}" you want to use is not supported yet, please choose another model.\nThe supported models include:\n\t\t{}\n'.format(
tag, '\n\t\t'.join(support_models))
res_path = os.path.join(MODEL_HOME, tag)
decompressed_path = download_and_decompress(pretrained_models[tag],
res_path)
decompressed_path = os.path.abspath(decompressed_path)
logger.info(
'Use pretrained model stored in: {}'.format(decompressed_path))
return decompressed_path
self.pretrained_models = pretrained_models
def _init_from_path(self,
model_type: str='deepspeech2online_aishell',
......@@ -658,20 +601,20 @@ class ASRServerExecutor(ASRExecutor):
"""
self.model_type = model_type
self.sample_rate = sample_rate
sample_rate_str = '16k' if sample_rate == 16000 else '8k'
tag = model_type + '-' + lang + '-' + sample_rate_str
if cfg_path is None or am_model is None or am_params is None:
sample_rate_str = '16k' if sample_rate == 16000 else '8k'
tag = model_type + '-' + lang + '-' + sample_rate_str
logger.info(f"Load the pretrained model, tag = {tag}")
res_path = self._get_pretrained_path(tag) # wenetspeech_zh
self.res_path = res_path
self.cfg_path = os.path.join(res_path,
pretrained_models[tag]['cfg_path'])
self.cfg_path = os.path.join(
res_path, self.pretrained_models[tag]['cfg_path'])
self.am_model = os.path.join(res_path,
pretrained_models[tag]['model'])
self.pretrained_models[tag]['model'])
self.am_params = os.path.join(res_path,
pretrained_models[tag]['params'])
self.pretrained_models[tag]['params'])
logger.info(res_path)
else:
self.cfg_path = os.path.abspath(cfg_path)
......@@ -699,8 +642,8 @@ class ASRServerExecutor(ASRExecutor):
self.text_feature = TextFeaturizer(
unit_type=self.config.unit_type, vocab=self.vocab)
lm_url = pretrained_models[tag]['lm_url']
lm_md5 = pretrained_models[tag]['lm_md5']
lm_url = self.pretrained_models[tag]['lm_url']
lm_md5 = self.pretrained_models[tag]['lm_md5']
logger.info(f"Start to load language model {lm_url}")
self.download_lm(
lm_url,
......@@ -773,7 +716,7 @@ class ASRServerExecutor(ASRExecutor):
model_name = model_type[:model_type.rindex(
'_')] # model_type: {model_name}_{dataset}
logger.info(f"model name: {model_name}")
model_class = dynamic_import(model_name, model_alias)
model_class = dynamic_import(model_name, self.model_alias)
model_conf = self.config
model = model_class.from_config(model_conf)
self.model = model
......
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
pretrained_models = {
"deepspeech2online_aishell-zh-16k": {
'url':
'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_fbank161_ckpt_0.2.1.model.tar.gz',
'md5':
'98b87b171b7240b7cae6e07d8d0bc9be',
'cfg_path':
'model.yaml',
'ckpt_path':
'exp/deepspeech2_online/checkpoints/avg_1',
'model':
'exp/deepspeech2_online/checkpoints/avg_1.jit.pdmodel',
'params':
'exp/deepspeech2_online/checkpoints/avg_1.jit.pdiparams',
'lm_url':
'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm',
'lm_md5':
'29e02312deb2e59b3c8686c7966d4fe3'
},
"conformer_online_multicn-zh-16k": {
'url':
'https://paddlespeech.bj.bcebos.com/s2t/multi_cn/asr1/asr1_chunk_conformer_multi_cn_ckpt_0.2.3.model.tar.gz',
'md5':
'0ac93d390552336f2a906aec9e33c5fa',
'cfg_path':
'model.yaml',
'ckpt_path':
'exp/chunk_conformer/checkpoints/multi_cn',
'model':
'exp/chunk_conformer/checkpoints/multi_cn.pdparams',
'params':
'exp/chunk_conformer/checkpoints/multi_cn.pdparams',
'lm_url':
'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm',
'lm_md5':
'29e02312deb2e59b3c8686c7966d4fe3'
},
}
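This registry is consumed by the _get_pretrained_path/_init_from_path logic shown earlier in the diff; a hedged sketch of how a tag resolves to local files once the archive has been downloaded and unpacked under MODEL_HOME (the stand-in path below is an assumption):

import os

tag = "deepspeech2online_aishell-zh-16k"
info = pretrained_models[tag]
res_path = os.path.join(os.path.expanduser("~/.paddlespeech/models"), tag)  # MODEL_HOME stand-in
cfg_path = os.path.join(res_path, info['cfg_path'])  # model.yaml
am_model = os.path.join(res_path, info['model'])     # exp/.../avg_1.jit.pdmodel
am_params = os.path.join(res_path, info['params'])   # exp/.../avg_1.jit.pdiparams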
......@@ -19,6 +19,7 @@ from typing import Optional
import paddle
from yacs.config import CfgNode
from .pretrained_models import pretrained_models
from paddlespeech.cli.asr.infer import ASRExecutor
from paddlespeech.cli.log import logger
from paddlespeech.cli.utils import MODEL_HOME
......@@ -31,32 +32,11 @@ from paddlespeech.server.utils.paddle_predictor import run_model
__all__ = ['ASREngine']
pretrained_models = {
"deepspeech2offline_aishell-zh-16k": {
'url':
'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_aishell_ckpt_0.1.1.model.tar.gz',
'md5':
'932c3593d62fe5c741b59b31318aa314',
'cfg_path':
'model.yaml',
'ckpt_path':
'exp/deepspeech2/checkpoints/avg_1',
'model':
'exp/deepspeech2/checkpoints/avg_1.jit.pdmodel',
'params':
'exp/deepspeech2/checkpoints/avg_1.jit.pdiparams',
'lm_url':
'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm',
'lm_md5':
'29e02312deb2e59b3c8686c7966d4fe3'
},
}
class ASRServerExecutor(ASRExecutor):
def __init__(self):
super().__init__()
pass
self.pretrained_models = pretrained_models
def _init_from_path(self,
model_type: str='wenetspeech',
......@@ -71,18 +51,18 @@ class ASRServerExecutor(ASRExecutor):
Init model and other resources from a specific path.
"""
sample_rate_str = '16k' if sample_rate == 16000 else '8k'
tag = model_type + '-' + lang + '-' + sample_rate_str
if cfg_path is None or am_model is None or am_params is None:
sample_rate_str = '16k' if sample_rate == 16000 else '8k'
tag = model_type + '-' + lang + '-' + sample_rate_str
res_path = self._get_pretrained_path(tag) # wenetspeech_zh
self.res_path = res_path
self.cfg_path = os.path.join(res_path,
pretrained_models[tag]['cfg_path'])
self.cfg_path = os.path.join(
res_path, self.pretrained_models[tag]['cfg_path'])
self.am_model = os.path.join(res_path,
pretrained_models[tag]['model'])
self.pretrained_models[tag]['model'])
self.am_params = os.path.join(res_path,
pretrained_models[tag]['params'])
self.pretrained_models[tag]['params'])
logger.info(res_path)
logger.info(self.cfg_path)
logger.info(self.am_model)
......@@ -109,8 +89,8 @@ class ASRServerExecutor(ASRExecutor):
self.text_feature = TextFeaturizer(
unit_type=self.config.unit_type, vocab=self.vocab)
lm_url = pretrained_models[tag]['lm_url']
lm_md5 = pretrained_models[tag]['lm_md5']
lm_url = self.pretrained_models[tag]['lm_url']
lm_md5 = self.pretrained_models[tag]['lm_md5']
self.download_lm(
lm_url,
os.path.dirname(self.config.decode.lang_model_path), lm_md5)
......
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
pretrained_models = {
"deepspeech2offline_aishell-zh-16k": {
'url':
'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_aishell_ckpt_0.1.1.model.tar.gz',
'md5':
'932c3593d62fe5c741b59b31318aa314',
'cfg_path':
'model.yaml',
'ckpt_path':
'exp/deepspeech2/checkpoints/avg_1',
'model':
'exp/deepspeech2/checkpoints/avg_1.jit.pdmodel',
'params':
'exp/deepspeech2/checkpoints/avg_1.jit.pdiparams',
'lm_url':
'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm',
'lm_md5':
'29e02312deb2e59b3c8686c7966d4fe3'
},
}
......@@ -20,83 +20,20 @@ import numpy as np
import paddle
import yaml
from .pretrained_models import pretrained_models
from paddlespeech.cli.cls.infer import CLSExecutor
from paddlespeech.cli.log import logger
from paddlespeech.cli.utils import download_and_decompress
from paddlespeech.cli.utils import MODEL_HOME
from paddlespeech.server.engine.base_engine import BaseEngine
from paddlespeech.server.utils.paddle_predictor import init_predictor
from paddlespeech.server.utils.paddle_predictor import run_model
__all__ = ['CLSEngine']
pretrained_models = {
"panns_cnn6-32k": {
'url':
'https://paddlespeech.bj.bcebos.com/cls/inference_model/panns_cnn6_static.tar.gz',
'md5':
'da087c31046d23281d8ec5188c1967da',
'cfg_path':
'panns.yaml',
'model_path':
'inference.pdmodel',
'params_path':
'inference.pdiparams',
'label_file':
'audioset_labels.txt',
},
"panns_cnn10-32k": {
'url':
'https://paddlespeech.bj.bcebos.com/cls/inference_model/panns_cnn10_static.tar.gz',
'md5':
'5460cc6eafbfaf0f261cc75b90284ae1',
'cfg_path':
'panns.yaml',
'model_path':
'inference.pdmodel',
'params_path':
'inference.pdiparams',
'label_file':
'audioset_labels.txt',
},
"panns_cnn14-32k": {
'url':
'https://paddlespeech.bj.bcebos.com/cls/inference_model/panns_cnn14_static.tar.gz',
'md5':
'ccc80b194821274da79466862b2ab00f',
'cfg_path':
'panns.yaml',
'model_path':
'inference.pdmodel',
'params_path':
'inference.pdiparams',
'label_file':
'audioset_labels.txt',
},
}
class CLSServerExecutor(CLSExecutor):
def __init__(self):
super().__init__()
pass
def _get_pretrained_path(self, tag: str) -> os.PathLike:
"""
Download and return the pretrained resources path of the current task.
"""
support_models = list(pretrained_models.keys())
assert tag in pretrained_models, 'The model "{}" you want to use is not supported yet, please choose another model.\nThe supported models include:\n\t\t{}\n'.format(
tag, '\n\t\t'.join(support_models))
res_path = os.path.join(MODEL_HOME, tag)
decompressed_path = download_and_decompress(pretrained_models[tag],
res_path)
decompressed_path = os.path.abspath(decompressed_path)
logger.info(
'Use pretrained model stored in: {}'.format(decompressed_path))
return decompressed_path
self.pretrained_models = pretrained_models
def _init_from_path(
self,
......@@ -113,14 +50,14 @@ class CLSServerExecutor(CLSExecutor):
if cfg_path is None or model_path is None or params_path is None or label_file is None:
tag = model_type + '-' + '32k'
self.res_path = self._get_pretrained_path(tag)
self.cfg_path = os.path.join(self.res_path,
pretrained_models[tag]['cfg_path'])
self.model_path = os.path.join(self.res_path,
pretrained_models[tag]['model_path'])
self.cfg_path = os.path.join(
self.res_path, self.pretrained_models[tag]['cfg_path'])
self.model_path = os.path.join(
self.res_path, self.pretrained_models[tag]['model_path'])
self.params_path = os.path.join(
self.res_path, pretrained_models[tag]['params_path'])
self.label_file = os.path.join(self.res_path,
pretrained_models[tag]['label_file'])
self.res_path, self.pretrained_models[tag]['params_path'])
self.label_file = os.path.join(
self.res_path, self.pretrained_models[tag]['label_file'])
else:
self.cfg_path = os.path.abspath(cfg_path)
self.model_path = os.path.abspath(model_path)
......
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
pretrained_models = {
"panns_cnn6-32k": {
'url':
'https://paddlespeech.bj.bcebos.com/cls/inference_model/panns_cnn6_static.tar.gz',
'md5':
'da087c31046d23281d8ec5188c1967da',
'cfg_path':
'panns.yaml',
'model_path':
'inference.pdmodel',
'params_path':
'inference.pdiparams',
'label_file':
'audioset_labels.txt',
},
"panns_cnn10-32k": {
'url':
'https://paddlespeech.bj.bcebos.com/cls/inference_model/panns_cnn10_static.tar.gz',
'md5':
'5460cc6eafbfaf0f261cc75b90284ae1',
'cfg_path':
'panns.yaml',
'model_path':
'inference.pdmodel',
'params_path':
'inference.pdiparams',
'label_file':
'audioset_labels.txt',
},
"panns_cnn14-32k": {
'url':
'https://paddlespeech.bj.bcebos.com/cls/inference_model/panns_cnn14_static.tar.gz',
'md5':
'ccc80b194821274da79466862b2ab00f',
'cfg_path':
'panns.yaml',
'model_path':
'inference.pdmodel',
'params_path':
'inference.pdiparams',
'label_file':
'audioset_labels.txt',
},
}
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# supported online models
pretrained_models = {
# fastspeech2
"fastspeech2_csmsc_onnx-zh": {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_csmsc_onnx_0.2.0.zip',
'md5':
'fd3ad38d83273ad51f0ea4f4abf3ab4e',
'ckpt': ['fastspeech2_csmsc.onnx'],
'phones_dict':
'phone_id_map.txt',
'sample_rate':
24000,
},
"fastspeech2_cnndecoder_csmsc_onnx-zh": {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_streaming_onnx_1.0.0.zip',
'md5':
'5f70e1a6bcd29d72d54e7931aa86f266',
'ckpt': [
'fastspeech2_csmsc_am_encoder_infer.onnx',
'fastspeech2_csmsc_am_decoder.onnx',
'fastspeech2_csmsc_am_postnet.onnx',
],
'speech_stats':
'speech_stats.npy',
'phones_dict':
'phone_id_map.txt',
'sample_rate':
24000,
},
# mb_melgan
"mb_melgan_csmsc_onnx-zh": {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_onnx_0.2.0.zip',
'md5':
'5b83ec746e8414bc29032d954ffd07ec',
'ckpt':
'mb_melgan_csmsc.onnx',
'sample_rate':
24000,
},
# hifigan
"hifigan_csmsc_onnx-zh": {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_onnx_0.2.0.zip',
'md5':
'1a7dc0385875889e46952e50c0994a6b',
'ckpt':
'hifigan_csmsc.onnx',
'sample_rate':
24000,
},
}
......@@ -20,10 +20,9 @@ from typing import Optional
import numpy as np
import paddle
from .pretrained_models import pretrained_models
from paddlespeech.cli.log import logger
from paddlespeech.cli.tts.infer import TTSExecutor
from paddlespeech.cli.utils import download_and_decompress
from paddlespeech.cli.utils import MODEL_HOME
from paddlespeech.server.engine.base_engine import BaseEngine
from paddlespeech.server.utils.audio_process import float2pcm
from paddlespeech.server.utils.onnx_infer import get_sess
......@@ -34,83 +33,6 @@ from paddlespeech.t2s.frontend.zh_frontend import Frontend
__all__ = ['TTSEngine']
# supported online models
pretrained_models = {
# fastspeech2
"fastspeech2_csmsc_onnx-zh": {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_csmsc_onnx_0.2.0.zip',
'md5':
'fd3ad38d83273ad51f0ea4f4abf3ab4e',
'ckpt': ['fastspeech2_csmsc.onnx'],
'phones_dict':
'phone_id_map.txt',
'sample_rate':
24000,
},
"fastspeech2_cnndecoder_csmsc_onnx-zh": {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_streaming_onnx_1.0.0.zip',
'md5':
'5f70e1a6bcd29d72d54e7931aa86f266',
'ckpt': [
'fastspeech2_csmsc_am_encoder_infer.onnx',
'fastspeech2_csmsc_am_decoder.onnx',
'fastspeech2_csmsc_am_postnet.onnx',
],
'speech_stats':
'speech_stats.npy',
'phones_dict':
'phone_id_map.txt',
'sample_rate':
24000,
},
# mb_melgan
"mb_melgan_csmsc_onnx-zh": {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_onnx_0.2.0.zip',
'md5':
'5b83ec746e8414bc29032d954ffd07ec',
'ckpt':
'mb_melgan_csmsc.onnx',
'sample_rate':
24000,
},
# hifigan
"hifigan_csmsc_onnx-zh": {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_onnx_0.2.0.zip',
'md5':
'1a7dc0385875889e46952e50c0994a6b',
'ckpt':
'hifigan_csmsc.onnx',
'sample_rate':
24000,
},
}
model_alias = {
# acoustic model
"fastspeech2":
"paddlespeech.t2s.models.fastspeech2:FastSpeech2",
"fastspeech2_inference":
"paddlespeech.t2s.models.fastspeech2:FastSpeech2Inference",
# voc
"mb_melgan":
"paddlespeech.t2s.models.melgan:MelGANGenerator",
"mb_melgan_inference":
"paddlespeech.t2s.models.melgan:MelGANInference",
"hifigan":
"paddlespeech.t2s.models.hifigan:HiFiGANGenerator",
"hifigan_inference":
"paddlespeech.t2s.models.hifigan:HiFiGANInference",
}
__all__ = ['TTSEngine']
class TTSServerExecutor(TTSExecutor):
def __init__(self, am_block, am_pad, voc_block, voc_pad, voc_upsample):
......@@ -122,23 +44,6 @@ class TTSServerExecutor(TTSExecutor):
self.voc_upsample = voc_upsample
self.pretrained_models = pretrained_models
self.model_alias = model_alias
def _get_pretrained_path(self, tag: str) -> os.PathLike:
"""
Download and return the pretrained resources path of the current task.
"""
support_models = list(pretrained_models.keys())
assert tag in pretrained_models, 'The model "{}" you want to use is not supported yet, please choose another model.\nThe supported models include:\n\t\t{}\n'.format(
tag, '\n\t\t'.join(support_models))
res_path = os.path.join(MODEL_HOME, tag)
decompressed_path = download_and_decompress(pretrained_models[tag],
res_path)
decompressed_path = os.path.abspath(decompressed_path)
logger.info(
'Use pretrained model stored in: {}'.format(decompressed_path))
return decompressed_path
def _init_from_path(
self,
......@@ -173,10 +78,10 @@ class TTSServerExecutor(TTSExecutor):
am_res_path = self._get_pretrained_path(am_tag)
self.am_res_path = am_res_path
self.am_ckpt = os.path.join(
am_res_path, pretrained_models[am_tag]['ckpt'][0])
am_res_path, self.pretrained_models[am_tag]['ckpt'][0])
# must have phones_dict in acoustic
self.phones_dict = os.path.join(
am_res_path, pretrained_models[am_tag]['phones_dict'])
am_res_path, self.pretrained_models[am_tag]['phones_dict'])
else:
self.am_ckpt = os.path.abspath(am_ckpt[0])
......@@ -192,16 +97,16 @@ class TTSServerExecutor(TTSExecutor):
am_res_path = self._get_pretrained_path(am_tag)
self.am_res_path = am_res_path
self.am_encoder_infer = os.path.join(
am_res_path, pretrained_models[am_tag]['ckpt'][0])
am_res_path, self.pretrained_models[am_tag]['ckpt'][0])
self.am_decoder = os.path.join(
am_res_path, pretrained_models[am_tag]['ckpt'][1])
am_res_path, self.pretrained_models[am_tag]['ckpt'][1])
self.am_postnet = os.path.join(
am_res_path, pretrained_models[am_tag]['ckpt'][2])
am_res_path, self.pretrained_models[am_tag]['ckpt'][2])
# must have phones_dict in acoustic
self.phones_dict = os.path.join(
am_res_path, pretrained_models[am_tag]['phones_dict'])
am_res_path, self.pretrained_models[am_tag]['phones_dict'])
self.am_stat = os.path.join(
am_res_path, pretrained_models[am_tag]['speech_stats'])
am_res_path, self.pretrained_models[am_tag]['speech_stats'])
else:
self.am_encoder_infer = os.path.abspath(am_ckpt[0])
......@@ -229,8 +134,8 @@ class TTSServerExecutor(TTSExecutor):
if voc_ckpt is None:
voc_res_path = self._get_pretrained_path(voc_tag)
self.voc_res_path = voc_res_path
self.voc_ckpt = os.path.join(voc_res_path,
pretrained_models[voc_tag]['ckpt'])
self.voc_ckpt = os.path.join(
voc_res_path, self.pretrained_models[voc_tag]['ckpt'])
else:
self.voc_ckpt = os.path.abspath(voc_ckpt)
self.voc_res_path = os.path.dirname(os.path.abspath(self.voc_ckpt))
......@@ -283,7 +188,6 @@ class TTSServerExecutor(TTSExecutor):
"""
Model inference and result stored in self.output.
"""
#import pdb;pdb.set_trace()
am_block = self.am_block
am_pad = self.am_pad
......@@ -453,10 +357,21 @@ class TTSEngine(BaseEngine):
self.config.am_block, self.config.am_pad, self.config.voc_block,
self.config.voc_pad, self.config.voc_upsample)
if "cpu" in self.config.am_sess_conf.device or "cpu" in self.config.voc_sess_conf.device:
paddle.set_device("cpu")
else:
paddle.set_device(self.config.am_sess_conf.device)
try:
if self.config.am_sess_conf.device is not None:
self.device = self.config.am_sess_conf.device
elif self.config.voc_sess_conf.device is not None:
self.device = self.config.voc_sess_conf.device
else:
self.device = paddle.get_device()
paddle.set_device(self.device)
except BaseException as e:
logger.error(
"Set device failed, please check if device is already used and the parameter 'device' in the yaml file"
)
logger.error("Initialize TTS server engine Failed on device: %s." %
(self.device))
return False
try:
self.executor._init_from_path(
......@@ -480,16 +395,17 @@ class TTSEngine(BaseEngine):
(self.config.voc_sess_conf.device))
return False
logger.info("Initialize TTS server engine successfully on device: %s." %
(self.config.voc_sess_conf.device))
# warm up
try:
self.warm_up()
logger.info("Warm up successfully.")
except Exception as e:
logger.error("Failed to warm up on tts engine.")
return False
logger.info("Initialize TTS server engine successfully on device: %s." %
(self.config.voc_sess_conf.device))
return True
def warm_up(self):
......@@ -499,9 +415,7 @@ class TTSEngine(BaseEngine):
sentence = "您好,欢迎使用语音合成服务。"
if self.config.lang == 'en':
sentence = "Hello and welcome to the speech synthesis service."
logger.info(
"*******************************warm up ********************************"
)
logger.info("Start to warm up.")
for i in range(3):
for wav in self.executor.infer(
text=sentence,
......@@ -512,9 +426,6 @@ class TTSEngine(BaseEngine):
f"The first response time of the {i} warm up: {self.executor.first_response_time} s"
)
break
logger.info(
"**********************************************************************"
)
def preprocess(self, text_bese64: str=None, text_bytes: bytes=None):
# Convert byte to text
......
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# supported online models
pretrained_models = {
# fastspeech2
"fastspeech2_csmsc-zh": {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip',
'md5':
'637d28a5e53aa60275612ba4393d5f22',
'config':
'default.yaml',
'ckpt':
'snapshot_iter_76000.pdz',
'speech_stats':
'speech_stats.npy',
'phones_dict':
'phone_id_map.txt',
},
"fastspeech2_cnndecoder_csmsc-zh": {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_ckpt_1.0.0.zip',
'md5':
'6eb28e22ace73e0ebe7845f86478f89f',
'config':
'cnndecoder.yaml',
'ckpt':
'snapshot_iter_153000.pdz',
'speech_stats':
'speech_stats.npy',
'phones_dict':
'phone_id_map.txt',
},
# mb_melgan
"mb_melgan_csmsc-zh": {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_ckpt_0.1.1.zip',
'md5':
'ee5f0604e20091f0d495b6ec4618b90d',
'config':
'default.yaml',
'ckpt':
'snapshot_iter_1000000.pdz',
'speech_stats':
'feats_stats.npy',
},
# hifigan
"hifigan_csmsc-zh": {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_ckpt_0.1.1.zip',
'md5':
'dd40a3d88dfcf64513fba2f0f961ada6',
'config':
'default.yaml',
'ckpt':
'snapshot_iter_2500000.pdz',
'speech_stats':
'feats_stats.npy',
},
}
......@@ -22,10 +22,9 @@ import paddle
import yaml
from yacs.config import CfgNode
from .pretrained_models import pretrained_models
from paddlespeech.cli.log import logger
from paddlespeech.cli.tts.infer import TTSExecutor
from paddlespeech.cli.utils import download_and_decompress
from paddlespeech.cli.utils import MODEL_HOME
from paddlespeech.s2t.utils.dynamic_import import dynamic_import
from paddlespeech.server.engine.base_engine import BaseEngine
from paddlespeech.server.utils.audio_process import float2pcm
......@@ -37,87 +36,6 @@ from paddlespeech.t2s.modules.normalizer import ZScore
__all__ = ['TTSEngine']
# supported online models
pretrained_models = {
# fastspeech2
"fastspeech2_csmsc-zh": {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip',
'md5':
'637d28a5e53aa60275612ba4393d5f22',
'config':
'default.yaml',
'ckpt':
'snapshot_iter_76000.pdz',
'speech_stats':
'speech_stats.npy',
'phones_dict':
'phone_id_map.txt',
},
"fastspeech2_cnndecoder_csmsc-zh": {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_ckpt_1.0.0.zip',
'md5':
'6eb28e22ace73e0ebe7845f86478f89f',
'config':
'cnndecoder.yaml',
'ckpt':
'snapshot_iter_153000.pdz',
'speech_stats':
'speech_stats.npy',
'phones_dict':
'phone_id_map.txt',
},
# mb_melgan
"mb_melgan_csmsc-zh": {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_ckpt_0.1.1.zip',
'md5':
'ee5f0604e20091f0d495b6ec4618b90d',
'config':
'default.yaml',
'ckpt':
'snapshot_iter_1000000.pdz',
'speech_stats':
'feats_stats.npy',
},
# hifigan
"hifigan_csmsc-zh": {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_ckpt_0.1.1.zip',
'md5':
'dd40a3d88dfcf64513fba2f0f961ada6',
'config':
'default.yaml',
'ckpt':
'snapshot_iter_2500000.pdz',
'speech_stats':
'feats_stats.npy',
},
}
model_alias = {
# acoustic model
"fastspeech2":
"paddlespeech.t2s.models.fastspeech2:FastSpeech2",
"fastspeech2_inference":
"paddlespeech.t2s.models.fastspeech2:FastSpeech2Inference",
# voc
"mb_melgan":
"paddlespeech.t2s.models.melgan:MelGANGenerator",
"mb_melgan_inference":
"paddlespeech.t2s.models.melgan:MelGANInference",
"hifigan":
"paddlespeech.t2s.models.hifigan:HiFiGANGenerator",
"hifigan_inference":
"paddlespeech.t2s.models.hifigan:HiFiGANInference",
}
__all__ = ['TTSEngine']
class TTSServerExecutor(TTSExecutor):
def __init__(self, am_block, am_pad, voc_block, voc_pad):
......@@ -126,6 +44,7 @@ class TTSServerExecutor(TTSExecutor):
self.am_pad = am_pad
self.voc_block = voc_block
self.voc_pad = voc_pad
self.pretrained_models = pretrained_models
def get_model_info(self,
field: str,
......@@ -146,7 +65,7 @@ class TTSServerExecutor(TTSExecutor):
[Tensor]: standard deviation
"""
model_class = dynamic_import(model_name, model_alias)
model_class = dynamic_import(model_name, self.model_alias)
if field == "am":
odim = self.am_config.n_mels
......@@ -169,22 +88,6 @@ class TTSServerExecutor(TTSExecutor):
return model, model_mu, model_std
def _get_pretrained_path(self, tag: str) -> os.PathLike:
"""
Download and return the pretrained resources path of the current task.
"""
support_models = list(pretrained_models.keys())
assert tag in pretrained_models, 'The model "{}" you want to use is not supported yet, please choose another model.\nThe supported models include:\n\t\t{}\n'.format(
tag, '\n\t\t'.join(support_models))
res_path = os.path.join(MODEL_HOME, tag)
decompressed_path = download_and_decompress(pretrained_models[tag],
res_path)
decompressed_path = os.path.abspath(decompressed_path)
logger.info(
'Use pretrained model stored in: {}'.format(decompressed_path))
return decompressed_path
def _init_from_path(
self,
am: str='fastspeech2_csmsc',
......@@ -210,15 +113,15 @@ class TTSServerExecutor(TTSExecutor):
if am_ckpt is None or am_config is None or am_stat is None or phones_dict is None:
am_res_path = self._get_pretrained_path(am_tag)
self.am_res_path = am_res_path
self.am_config = os.path.join(am_res_path,
pretrained_models[am_tag]['config'])
self.am_config = os.path.join(
am_res_path, self.pretrained_models[am_tag]['config'])
self.am_ckpt = os.path.join(am_res_path,
pretrained_models[am_tag]['ckpt'])
self.pretrained_models[am_tag]['ckpt'])
self.am_stat = os.path.join(
am_res_path, pretrained_models[am_tag]['speech_stats'])
am_res_path, self.pretrained_models[am_tag]['speech_stats'])
# must have phones_dict in acoustic
self.phones_dict = os.path.join(
am_res_path, pretrained_models[am_tag]['phones_dict'])
am_res_path, self.pretrained_models[am_tag]['phones_dict'])
print("self.phones_dict:", self.phones_dict)
logger.info(am_res_path)
logger.info(self.am_config)
......@@ -239,12 +142,12 @@ class TTSServerExecutor(TTSExecutor):
if voc_ckpt is None or voc_config is None or voc_stat is None:
voc_res_path = self._get_pretrained_path(voc_tag)
self.voc_res_path = voc_res_path
self.voc_config = os.path.join(voc_res_path,
pretrained_models[voc_tag]['config'])
self.voc_ckpt = os.path.join(voc_res_path,
pretrained_models[voc_tag]['ckpt'])
self.voc_config = os.path.join(
voc_res_path, self.pretrained_models[voc_tag]['config'])
self.voc_ckpt = os.path.join(
voc_res_path, self.pretrained_models[voc_tag]['ckpt'])
self.voc_stat = os.path.join(
voc_res_path, pretrained_models[voc_tag]['speech_stats'])
voc_res_path, self.pretrained_models[voc_tag]['speech_stats'])
logger.info(voc_res_path)
logger.info(self.voc_config)
logger.info(self.voc_ckpt)
......@@ -286,7 +189,7 @@ class TTSServerExecutor(TTSExecutor):
self.am_ckpt, self.am_stat)
am_normalizer = ZScore(am_mu, am_std)
am_inference_class = dynamic_import(self.am_name + '_inference',
model_alias)
self.model_alias)
self.am_inference = am_inference_class(am_normalizer, am)
self.am_inference.eval()
print("acoustic model done!")
......@@ -297,7 +200,7 @@ class TTSServerExecutor(TTSExecutor):
self.voc_ckpt, self.voc_stat)
voc_normalizer = ZScore(voc_mu, voc_std)
voc_inference_class = dynamic_import(self.voc_name + '_inference',
model_alias)
self.model_alias)
self.voc_inference = voc_inference_class(voc_normalizer, voc)
self.voc_inference.eval()
print("voc done!")
......@@ -477,7 +380,7 @@ class TTSEngine(BaseEngine):
), "Please set correct voc_block and voc_pad, they should be more than 0."
try:
if self.config.device:
if self.config.device is not None:
self.device = self.config.device
else:
self.device = paddle.get_device()
......@@ -513,16 +416,16 @@ class TTSEngine(BaseEngine):
(self.device))
return False
logger.info("Initialize TTS server engine successfully on device: %s." %
(self.device))
# warm up
try:
self.warm_up()
logger.info("Warm up successfully.")
except Exception as e:
logger.error("Failed to warm up on tts engine.")
return False
logger.info("Initialize TTS server engine successfully on device: %s." %
(self.device))
return True
def warm_up(self):
......@@ -532,9 +435,7 @@ class TTSEngine(BaseEngine):
sentence = "您好,欢迎使用语音合成服务。"
if self.config.lang == 'en':
sentence = "Hello and welcome to the speech synthesis service."
logger.info(
"*******************************warm up ********************************"
)
logger.info("Start to warm up.")
for i in range(3):
for wav in self.executor.infer(
text=sentence,
......@@ -545,9 +446,6 @@ class TTSEngine(BaseEngine):
f"The first response time of the {i} warm up: {self.executor.first_response_time} s"
)
break
logger.info(
"**********************************************************************"
)
def preprocess(self, text_bese64: str=None, text_bytes: bytes=None):
# Convert byte to text
......
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Static models used for paddle inference
pretrained_models = {
# speedyspeech
"speedyspeech_csmsc-zh": {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_nosil_baker_static_0.5.zip',
'md5':
'f10cbdedf47dc7a9668d2264494e1823',
'model':
'speedyspeech_csmsc.pdmodel',
'params':
'speedyspeech_csmsc.pdiparams',
'phones_dict':
'phone_id_map.txt',
'tones_dict':
'tone_id_map.txt',
'sample_rate':
24000,
},
# fastspeech2
"fastspeech2_csmsc-zh": {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_static_0.4.zip',
'md5':
'9788cd9745e14c7a5d12d32670b2a5a7',
'model':
'fastspeech2_csmsc.pdmodel',
'params':
'fastspeech2_csmsc.pdiparams',
'phones_dict':
'phone_id_map.txt',
'sample_rate':
24000,
},
# pwgan
"pwgan_csmsc-zh": {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_static_0.4.zip',
'md5':
'e3504aed9c5a290be12d1347836d2742',
'model':
'pwgan_csmsc.pdmodel',
'params':
'pwgan_csmsc.pdiparams',
'sample_rate':
24000,
},
# mb_melgan
"mb_melgan_csmsc-zh": {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_static_0.1.1.zip',
'md5':
'ac6eee94ba483421d750433f4c3b8d36',
'model':
'mb_melgan_csmsc.pdmodel',
'params':
'mb_melgan_csmsc.pdiparams',
'sample_rate':
24000,
},
# hifigan
"hifigan_csmsc-zh": {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_static_0.1.1.zip',
'md5':
'7edd8c436b3a5546b3a7cb8cff9d5a0c',
'model':
'hifigan_csmsc.pdmodel',
'params':
'hifigan_csmsc.pdiparams',
'sample_rate':
24000,
},
}
......@@ -23,10 +23,9 @@ import paddle
import soundfile as sf
from scipy.io import wavfile
from .pretrained_models import pretrained_models
from paddlespeech.cli.log import logger
from paddlespeech.cli.tts.infer import TTSExecutor
from paddlespeech.cli.utils import download_and_decompress
from paddlespeech.cli.utils import MODEL_HOME
from paddlespeech.server.engine.base_engine import BaseEngine
from paddlespeech.server.utils.audio_process import change_speed
from paddlespeech.server.utils.errors import ErrorCode
......@@ -38,101 +37,11 @@ from paddlespeech.t2s.frontend.zh_frontend import Frontend
__all__ = ['TTSEngine']
# Static models used for paddle inference
pretrained_models = {
# speedyspeech
"speedyspeech_csmsc-zh": {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_nosil_baker_static_0.5.zip',
'md5':
'f10cbdedf47dc7a9668d2264494e1823',
'model':
'speedyspeech_csmsc.pdmodel',
'params':
'speedyspeech_csmsc.pdiparams',
'phones_dict':
'phone_id_map.txt',
'tones_dict':
'tone_id_map.txt',
'sample_rate':
24000,
},
# fastspeech2
"fastspeech2_csmsc-zh": {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_static_0.4.zip',
'md5':
'9788cd9745e14c7a5d12d32670b2a5a7',
'model':
'fastspeech2_csmsc.pdmodel',
'params':
'fastspeech2_csmsc.pdiparams',
'phones_dict':
'phone_id_map.txt',
'sample_rate':
24000,
},
# pwgan
"pwgan_csmsc-zh": {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_static_0.4.zip',
'md5':
'e3504aed9c5a290be12d1347836d2742',
'model':
'pwgan_csmsc.pdmodel',
'params':
'pwgan_csmsc.pdiparams',
'sample_rate':
24000,
},
# mb_melgan
"mb_melgan_csmsc-zh": {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_static_0.1.1.zip',
'md5':
'ac6eee94ba483421d750433f4c3b8d36',
'model':
'mb_melgan_csmsc.pdmodel',
'params':
'mb_melgan_csmsc.pdiparams',
'sample_rate':
24000,
},
# hifigan
"hifigan_csmsc-zh": {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_static_0.1.1.zip',
'md5':
'7edd8c436b3a5546b3a7cb8cff9d5a0c',
'model':
'hifigan_csmsc.pdmodel',
'params':
'hifigan_csmsc.pdiparams',
'sample_rate':
24000,
},
}
class TTSServerExecutor(TTSExecutor):
def __init__(self):
super().__init__()
pass
def _get_pretrained_path(self, tag: str) -> os.PathLike:
"""
Download and return the pretrained resources path of the current task.
"""
assert tag in pretrained_models, 'Cannot find pretrained resources of {}.'.format(
tag)
res_path = os.path.join(MODEL_HOME, tag)
decompressed_path = download_and_decompress(pretrained_models[tag],
res_path)
decompressed_path = os.path.abspath(decompressed_path)
logger.info(
'Use pretrained model stored in: {}'.format(decompressed_path))
return decompressed_path
self.pretrained_models = pretrained_models
def _init_from_path(
self,
......@@ -161,14 +70,14 @@ class TTSServerExecutor(TTSExecutor):
if am_model is None or am_params is None or phones_dict is None:
am_res_path = self._get_pretrained_path(am_tag)
self.am_res_path = am_res_path
self.am_model = os.path.join(am_res_path,
pretrained_models[am_tag]['model'])
self.am_params = os.path.join(am_res_path,
pretrained_models[am_tag]['params'])
self.am_model = os.path.join(
am_res_path, self.pretrained_models[am_tag]['model'])
self.am_params = os.path.join(
am_res_path, self.pretrained_models[am_tag]['params'])
# must have phones_dict in acoustic
self.phones_dict = os.path.join(
am_res_path, pretrained_models[am_tag]['phones_dict'])
self.am_sample_rate = pretrained_models[am_tag]['sample_rate']
am_res_path, self.pretrained_models[am_tag]['phones_dict'])
self.am_sample_rate = self.pretrained_models[am_tag]['sample_rate']
logger.info(am_res_path)
logger.info(self.am_model)
......@@ -183,17 +92,17 @@ class TTSServerExecutor(TTSExecutor):
# for speedyspeech
self.tones_dict = None
if 'tones_dict' in pretrained_models[am_tag]:
if 'tones_dict' in self.pretrained_models[am_tag]:
self.tones_dict = os.path.join(
am_res_path, pretrained_models[am_tag]['tones_dict'])
am_res_path, self.pretrained_models[am_tag]['tones_dict'])
if tones_dict:
self.tones_dict = tones_dict
# for multi speaker fastspeech2
self.speaker_dict = None
if 'speaker_dict' in pretrained_models[am_tag]:
if 'speaker_dict' in self.pretrained_models[am_tag]:
self.speaker_dict = os.path.join(
am_res_path, pretrained_models[am_tag]['speaker_dict'])
am_res_path, self.pretrained_models[am_tag]['speaker_dict'])
if speaker_dict:
self.speaker_dict = speaker_dict
......@@ -202,11 +111,12 @@ class TTSServerExecutor(TTSExecutor):
if voc_model is None or voc_params is None:
voc_res_path = self._get_pretrained_path(voc_tag)
self.voc_res_path = voc_res_path
self.voc_model = os.path.join(voc_res_path,
pretrained_models[voc_tag]['model'])
self.voc_params = os.path.join(voc_res_path,
pretrained_models[voc_tag]['params'])
self.voc_sample_rate = pretrained_models[voc_tag]['sample_rate']
self.voc_model = os.path.join(
voc_res_path, self.pretrained_models[voc_tag]['model'])
self.voc_params = os.path.join(
voc_res_path, self.pretrained_models[voc_tag]['params'])
self.voc_sample_rate = self.pretrained_models[voc_tag][
'sample_rate']
logger.info(voc_res_path)
logger.info(self.voc_model)
logger.info(self.voc_params)
......@@ -352,8 +262,24 @@ class TTSEngine(BaseEngine):
def init(self, config: dict) -> bool:
self.executor = TTSServerExecutor()
self.config = config
try:
if self.config.am_predictor_conf.device is not None:
self.device = self.config.am_predictor_conf.device
elif self.config.voc_predictor_conf.device is not None:
self.device = self.config.voc_predictor_conf.device
else:
self.device = paddle.get_device()
paddle.set_device(self.device)
except BaseException as e:
logger.error(
"Set device failed, please check if device is already used and the parameter 'device' in the yaml file"
)
logger.error("Initialize TTS server engine Failed on device: %s." %
(self.device))
return False
self.executor._init_from_path(
am=self.config.am,
am_model=self.config.am_model,
......@@ -370,9 +296,35 @@ class TTSEngine(BaseEngine):
am_predictor_conf=self.config.am_predictor_conf,
voc_predictor_conf=self.config.voc_predictor_conf, )
# warm up
try:
self.warm_up()
logger.info("Warm up successfully.")
except Exception as e:
logger.error("Failed to warm up on tts engine.")
return False
logger.info("Initialize TTS server engine successfully.")
return True
def warm_up(self):
"""warm up
"""
if self.config.lang == 'zh':
sentence = "您好,欢迎使用语音合成服务。"
if self.config.lang == 'en':
sentence = "Hello and welcome to the speech synthesis service."
logger.info("Start to warm up.")
for i in range(3):
st = time.time()
self.executor.infer(
text=sentence,
lang=self.config.lang,
am=self.config.am,
spk_id=0, )
logger.info(
f"The response time of the {i} warm up: {time.time() - st} s")
def postprocess(self,
wav,
original_fs: int,
......
......@@ -51,15 +51,15 @@ class TTSEngine(BaseEngine):
def init(self, config: dict) -> bool:
self.executor = TTSServerExecutor()
self.config = config
try:
self.config = config
if self.config.device:
if self.config.device is not None:
self.device = self.config.device
else:
self.device = paddle.get_device()
paddle.set_device(self.device)
except BaseException:
except BaseException as e:
logger.error(
"Set device failed, please check if device is already used and the parameter 'device' in the yaml file"
)
......@@ -87,10 +87,36 @@ class TTSEngine(BaseEngine):
(self.device))
return False
# warm up
try:
self.warm_up()
logger.info("Warm up successfully.")
except Exception as e:
logger.error("Failed to warm up on tts engine.")
return False
logger.info("Initialize TTS server engine successfully on device: %s." %
(self.device))
return True
def warm_up(self):
"""warm up
"""
if self.config.lang == 'zh':
sentence = "您好,欢迎使用语音合成服务。"
if self.config.lang == 'en':
sentence = "Hello and welcome to the speech synthesis service."
logger.info("Start to warm up.")
for i in range(3):
st = time.time()
self.executor.infer(
text=sentence,
lang=self.config.lang,
am=self.config.am,
spk_id=0, )
logger.info(
f"The response time of the {i} warm up: {time.time() - st} s")
def postprocess(self,
wav,
original_fs: int,
......
......@@ -128,7 +128,7 @@ def tts(request_body: TTSRequest):
return response
@router.post("/paddlespeech/streaming/tts")
@router.post("/paddlespeech/tts/streaming")
async def stream_tts(request_body: TTSRequest):
text = request_body.text
......
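With the route renamed, a streaming request now posts to /paddlespeech/tts/streaming. A hedged client sketch mirroring what TTSHttpHandler does later in this diff (the exact TTSRequest field set is an assumption):

import base64
import json
import requests

params = {"text": "您好,欢迎使用语音合成服务。", "spk_id": 0, "speed": 1.0,
          "volume": 1.0, "sample_rate": 0, "save_path": None}
resp = requests.post("http://127.0.0.1:8092/paddlespeech/tts/streaming",
                     json.dumps(params), stream=True)
pcm = b"".join(base64.b64decode(chunk)
               for chunk in resp.iter_content(chunk_size=None))  # 16-bit PCM bytes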
......@@ -14,6 +14,7 @@
import argparse
from paddlespeech.server.utils.audio_handler import TTSHttpHandler
from paddlespeech.server.utils.util import compute_delay
if __name__ == "__main__":
parser = argparse.ArgumentParser()
......@@ -43,5 +44,25 @@ if __name__ == "__main__":
print("tts http client start")
handler = TTSHttpHandler(args.server, args.port, args.play)
handler.run(args.text, args.spk_id, args.speed, args.volume,
args.sample_rate, args.output)
first_response, final_response, duration, save_audio_success, receive_time_list, chunk_duration_list = handler.run(
args.text, args.spk_id, args.speed, args.volume, args.sample_rate,
args.output)
delay_time_list = compute_delay(receive_time_list, chunk_duration_list)
print(f"sentence: {args.text}")
print(f"duration: {duration} s")
print(f"first response: {first_response} s")
print(f"final response: {final_response} s")
print(f"RTF: {final_response/duration}")
if args.output is not None:
if save_audio_success:
print(f"Audio successfully saved in {args.output}")
else:
print("Audio save failed.")
if delay_time_list != []:
print(
f"Delay situation: total number of packages: {len(receive_time_list)}, the number of delayed packets: {len(delay_time_list)}, minimum delay time: {min(delay_time_list)} s, maximum delay time: {max(delay_time_list)} s, average delay time: {sum(delay_time_list)/len(delay_time_list)} s, delay rate:{len(delay_time_list)/len(receive_time_list)}"
)
else:
print("The sentence has no delay in streaming synthesis.")
......@@ -15,6 +15,7 @@ import argparse
import asyncio
from paddlespeech.server.utils.audio_handler import TTSWsHandler
from paddlespeech.server.utils.util import compute_delay
if __name__ == "__main__":
parser = argparse.ArgumentParser()
......@@ -35,4 +36,24 @@ if __name__ == "__main__":
print("tts websocket client start")
handler = TTSWsHandler(args.server, args.port, args.play)
loop = asyncio.get_event_loop()
loop.run_until_complete(handler.run(args.text, args.output))
first_response, final_response, duration, save_audio_success, receive_time_list, chunk_duration_list = loop.run_until_complete(
handler.run(args.text, args.output))
delay_time_list = compute_delay(receive_time_list, chunk_duration_list)
print(f"sentence: {args.text}")
print(f"duration: {duration} s")
print(f"first response: {first_response} s")
print(f"final response: {final_response} s")
print(f"RTF: {final_response/duration}")
if args.output is not None:
if save_audio_success:
print(f"Audio successfully saved in {args.output}")
else:
print("Audio save failed.")
if delay_time_list != []:
print(
f"Delay situation: total number of packages: {len(receive_time_list)}, the number of delayed packets: {len(delay_time_list)}, minimum delay time: {min(delay_time_list)} s, maximum delay time: {max(delay_time_list)} s, average delay time: {sum(delay_time_list)/len(delay_time_list)} s, delay rate:{len(delay_time_list)/len(receive_time_list)}"
)
else:
print("The sentence has no delay in streaming synthesis.")
......@@ -259,7 +259,8 @@ class TTSWsHandler:
"""
self.server = server
self.port = port
self.url = "ws://" + self.server + ":" + str(self.port) + "/ws/tts"
self.url = "ws://" + self.server + ":" + str(
self.port) + "/paddlespeech/tts/streaming"
self.play = play
if self.play:
import pyaudio
......@@ -295,6 +296,8 @@ class TTSWsHandler:
output (str): save audio path
"""
all_bytes = b''
receive_time_list = []
chunk_duration_list = []
# 1. Send websocket handshake protocol
async with websockets.connect(self.url) as ws:
......@@ -309,14 +312,15 @@ class TTSWsHandler:
# 3. Process the received response
message = await ws.recv()
logger.info(f"句子:{text}")
logger.info(f"首包响应:{time.time() - st} s")
first_response = time.time() - st
message = json.loads(message)
status = message["status"]
while (status == 1):
receive_time_list.append(time.time())
audio = message["audio"]
audio = base64.b64decode(audio) # bytes
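# each chunk is decoded 16-bit PCM; len(bytes) / 2 bytes-per-sample / 24000 Hz
# gives its duration in seconds at the 24 kHz output rate of the csmsc models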
chunk_duration_list.append(len(audio) / 2.0 / 24000)
all_bytes += audio
if self.play:
self.mutex.acquire()
......@@ -334,15 +338,11 @@ class TTSWsHandler:
if status == 2:
final_response = time.time() - st
duration = len(all_bytes) / 2.0 / 24000
logger.info(f"尾包响应:{final_response} s")
logger.info(f"音频时长:{duration} s")
logger.info(f"RTF: {final_response / duration}")
if output is not None:
if save_audio(all_bytes, output):
logger.info(f"音频保存至:{output}")
else:
logger.error("save audio error")
save_audio_success = save_audio(all_bytes, output)
else:
save_audio_success = False
else:
logger.error("infer error")
......@@ -352,6 +352,8 @@ class TTSWsHandler:
self.stream.close()
self.p.terminate()
return first_response, final_response, duration, save_audio_success, receive_time_list, chunk_duration_list
class TTSHttpHandler:
def __init__(self, server="127.0.0.1", port=8092, play: bool=False):
......@@ -365,7 +367,7 @@ class TTSHttpHandler:
self.server = server
self.port = port
self.url = "http://" + str(self.server) + ":" + str(
self.port) + "/paddlespeech/streaming/tts"
self.port) + "/paddlespeech/tts/streaming"
self.play = play
if self.play:
......@@ -423,13 +425,16 @@ class TTSHttpHandler:
all_bytes = b''
first_flag = 1
receive_time_list = []
chunk_duration_list = []
# 2. Send request
st = time.time()
html = requests.post(self.url, json.dumps(params), stream=True)
# 3. Process the received response
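# chunk_size=None yields each chunk as soon as the server flushes it, instead of re-buffering into fixed 1024-byte blocks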
for chunk in html.iter_content(chunk_size=1024):
for chunk in html.iter_content(chunk_size=None):
receive_time_list.append(time.time())
audio = base64.b64decode(chunk) # bytes
if first_flag:
first_response = time.time() - st
......@@ -443,21 +448,15 @@ class TTSHttpHandler:
self.t.start()
self.start_play = False
all_bytes += audio
chunk_duration_list.append(len(audio) / 2.0 / 24000)
final_response = time.time() - st
duration = len(all_bytes) / 2.0 / 24000
logger.info(f"句子:{text}")
logger.info(f"首包响应:{first_response} s")
logger.info(f"尾包响应:{final_response} s")
logger.info(f"音频时长:{duration} s")
logger.info(f"RTF: {final_response / duration}")
if output is not None:
if save_audio(all_bytes, output):
logger.info(f"音频保存至:{output}")
else:
logger.error("save audio error")
save_audio_success = save_audio(all_bytes, output)
else:
save_audio_success = False
if self.play:
self.t.join()
......@@ -465,6 +464,8 @@ class TTSHttpHandler:
self.stream.close()
self.p.terminate()
return first_response, final_response, duration, save_audio_success, receive_time_list, chunk_duration_list
class VectorHttpHandler:
def __init__(self, server_ip=None, port=None):
......
......@@ -75,3 +75,74 @@ def get_chunks(data, block_size, pad_size, step):
else:
print("Please set correct type to get chunks, am or voc")
return chunks
def compute_delay(receive_time_list, chunk_duration_list):
"""compute delay
Args:
receive_time_list (list): Time to receive each packet
chunk_duration_list (list): The audio duration corresponding to each packet
Returns:
[list]: Delay time list
"""
assert (len(receive_time_list) == len(chunk_duration_list))
delay_time_list = []
play_time = receive_time_list[0] + chunk_duration_list[0]
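# play_time is when playback of the audio received so far would finish;
# a packet arriving after that moment stalls playback and counts as delayed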
for i in range(1, len(receive_time_list)):
receive_time = receive_time_list[i]
delay_time = receive_time - play_time
# delayed
if delay_time > 0:
play_time = play_time + delay_time + chunk_duration_list[i]
delay_time_list.append(delay_time)
# not delayed
else:
play_time = play_time + chunk_duration_list[i]
return delay_time_list
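# Worked example (illustrative numbers only): packets received at
# t = [0.0, 1.5, 2.0] with chunk durations [1.0, 1.0, 1.0] yield
# compute_delay(...) == [0.5], since the second packet arrives 0.5 s
# after the first chunk finishes playing, so playback stalls once.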
def count_engine(logfile: str="./nohup.out"):
"""For inference on the statistical engine side
Args:
logfile (str, optional): server log. Defaults to "./nohup.out".
"""
first_response_list = []
final_response_list = []
duration_list = []
with open(logfile, "r") as f:
for line in f.readlines():
if "- first response time:" in line:
first_response = float(line.split(" ")[-2])
first_response_list.append(first_response)
elif "- final response time:" in line:
final_response = float(line.split(" ")[-2])
final_response_list.append(final_response)
elif "- The durations of audio is:" in line:
duration = float(line.split(" ")[-2])
duration_list.append(duration)
assert (len(first_response_list) == len(final_response_list) and
len(final_response_list) == len(duration_list))
avg_first_response = sum(first_response_list) / len(first_response_list)
avg_final_response = sum(final_response_list) / len(final_response_list)
avg_duration = sum(duration_list) / len(duration_list)
RTF = sum(final_response_list) / sum(duration_list)
print(
"************************* engine result ***************************************"
)
print(
f"test num: {len(duration_list)}, avg first response: {avg_first_response} s, avg final response: {avg_final_response} s, avg duration: {avg_duration}, RTF: {RTF}"
)
print(
f"min duration: {min(duration_list)} s, max duration: {max(duration_list)} s"
)
print(
f"max first response: {max(first_response_list)} s, min first response: {min(first_response_list)} s"
)
print(
f"max final response: {max(final_response_list)} s, min final response: {min(final_response_list)} s"
)
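# A minimal usage sketch, assuming the server wrote its log to ./nohup.out
# in the line format the parser above expects:
#   from paddlespeech.server.utils.util import count_engine
#   count_engine("./nohup.out")  # prints aggregate response times and RTF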
......@@ -24,7 +24,7 @@ from paddlespeech.server.engine.engine_pool import get_engine_pool
router = APIRouter()
@router.websocket('/ws/tts')
@router.websocket('/paddlespeech/tts/streaming')
async def websocket_endpoint(websocket: WebSocket):
await websocket.accept()
......
......@@ -19,7 +19,7 @@ def change_device(yamlfile: str, engine: str, device: str):
if device == 'cpu':
set_device = 'cpu'
elif device == 'gpu':
set_device = 'gpu:0'
set_device = 'gpu:3'
else:
print("Please set correct device: cpu or gpu.")
......
# This is the parameter configuration file for PaddleSpeech Serving.
# This is the parameter configuration file for PaddleSpeech Offline Serving.
#################################################################################
# SERVER SETTING #
......@@ -7,8 +7,8 @@ host: 127.0.0.1
port: 8090
# The task format in the engine_list is: <speech task>_<engine type>
# task choices = ['asr_python', 'asr_inference', 'tts_python', 'tts_inference']
# task choices = ['asr_python', 'asr_inference', 'tts_python', 'tts_inference', 'cls_python', 'cls_inference']
protocol: 'http'
engine_list: ['asr_python', 'tts_python', 'cls_python']
......
......@@ -21,6 +21,8 @@ StartService(){
}
ClientTest(){
echo "aaaaaaaaaaaaaaaaaaaaaaaaaa $server_ip"
echo "aaaaaaaaaaaaaaaaaaaaaaaaaa $port"
# Client test
# test asr client
paddlespeech_client asr --server_ip $server_ip --port $port --input ./zh.wav
......@@ -39,6 +41,7 @@ ClientTest(){
((test_times+=1))
paddlespeech_client cls --server_ip $server_ip --port $port --input ./zh.wav
((test_times+=1))
}
GetTestResult() {
......@@ -58,6 +61,7 @@ rm -rf log/server.log.wf
rm -rf log/server.log
rm -rf log/test_result.log
cp ../../../../demos/speech_server/conf/application.yaml ./conf/
config_file=./conf/application.yaml
server_ip=$(cat $config_file | grep "host" | awk -F " " '{print $2}')
port=$(cat $config_file | grep "port" | awk '/port:/ {print $2}')
......@@ -191,5 +195,4 @@ echo "***************** Here are all the test results ********************"
cat ./log/test_result.log
# Restoring conf is the same as demos/speech_server
rm -rf ./conf
cp ../../../demos/speech_server/conf/ ./ -rf
\ No newline at end of file
cp ../../../../demos/speech_server/conf/application.yaml ./conf/
......@@ -39,9 +39,9 @@ tts_online:
# others
lang: 'zh'
device: 'cpu' # set 'gpu:id' or 'cpu'
am_block: 42
am_block: 72
am_pad: 12
voc_block: 14
voc_block: 36
voc_pad: 14
......@@ -80,9 +80,9 @@ tts_online-onnx:
# others
lang: 'zh'
am_block: 42
am_block: 72
am_pad: 12
voc_block: 14
voc_block: 36
voc_pad: 14
voc_upsample: 300
......@@ -10,7 +10,6 @@ bash test.sh tts_online $log_all_dir/log_tts_online_cpu
python change_yaml.py --change_type engine_type --target_key engine_list --target_value tts_online-onnx
bash test.sh tts_online-onnx $log_all_dir/log_tts_online-onnx_cpu
python change_yaml.py --change_type device --target_key device --target_value gpu:3
bash test.sh tts_online $log_all_dir/log_tts_online_gpu
......
......@@ -39,9 +39,9 @@ tts_online:
# others
lang: 'zh'
device: 'cpu' # set 'gpu:id' or 'cpu'
am_block: 42
am_block: 72
am_pad: 12
voc_block: 14
voc_block: 36
voc_pad: 14
......@@ -80,9 +80,9 @@ tts_online-onnx:
# others
lang: 'zh'
am_block: 42
am_block: 72
am_pad: 12
voc_block: 14
voc_block: 36
voc_pad: 14
voc_upsample: 300
......@@ -12,117 +12,35 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import base64
import json
import asyncio
import os
import time
import requests
from paddlespeech.server.utils.audio_process import pcm2wav
from paddlespeech.server.utils.util import compute_delay
from paddlespeech.t2s.exps.syn_utils import get_sentences
def save_audio(buffer, audio_path) -> bool:
if audio_path.endswith("pcm"):
with open(audio_path, "wb") as f:
f.write(buffer)
elif audio_path.endswith("wav"):
with open("./tmp.pcm", "wb") as f:
f.write(buffer)
pcm2wav("./tmp.pcm", audio_path, channels=1, bits=16, sample_rate=24000)
os.system("rm ./tmp.pcm")
else:
print("Only supports saved audio format is pcm or wav")
return False
return True
def test(args, text, utt_id):
params = {
"text": text,
"spk_id": args.spk_id,
"speed": args.speed,
"volume": args.volume,
"sample_rate": args.sample_rate,
"save_path": ''
}
buffer = b''
flag = 1
url = "http://" + str(args.server) + ":" + str(
args.port) + "/paddlespeech/streaming/tts"
st = time.time()
html = requests.post(url, json.dumps(params), stream=True)
for chunk in html.iter_content(chunk_size=1024):
chunk = base64.b64decode(chunk) # bytes
if flag:
first_response = time.time() - st
print(f"首包响应:{first_response} s")
flag = 0
buffer += chunk
final_response = time.time() - st
duration = len(buffer) / 2.0 / 24000
print(f"sentence: {text}")
print(f"尾包响应:{final_response} s")
print(f"音频时长:{duration} s")
print(f"RTF: {final_response / duration}")
save_path = str(args.output_dir + "/" + utt_id + ".wav")
save_audio(buffer, save_path)
print("音频保存至:", save_path)
return first_response, final_response, duration
def count_engine(logfile: str="./nohup.out"):
"""For inference on the statistical engine side
Args:
logfile (str, optional): server log. Defaults to "./nohup.out".
"""
first_response_list = []
final_response_list = []
duration_list = []
output = str(args.output_dir + "/" + utt_id + ".wav")
if args.protocol == "http":
print("tts http client start")
from paddlespeech.server.utils.audio_handler import TTSHttpHandler
handler = TTSHttpHandler(args.server_ip, args.port, args.play)
first_response, final_response, duration, save_audio_success, receive_time_list, chunk_duration_list = handler.run(
text, args.spk_id, args.speed, args.volume, args.sample_rate,
output)
elif args.protocol == "websocket":
from paddlespeech.server.utils.audio_handler import TTSWsHandler
print("tts websocket client start")
handler = TTSWsHandler(args.server_ip, args.port, args.play)
loop = asyncio.get_event_loop()
first_response, final_response, duration, save_audio_success, receive_time_list, chunk_duration_list = loop.run_until_complete(
handler.run(text, output))
with open(logfile, "r") as f:
for line in f.readlines():
if "- first response time:" in line:
first_response = float(line.split(" ")[-2])
first_response_list.append(first_response)
elif "- final response time:" in line:
final_response = float(line.split(" ")[-2])
final_response_list.append(final_response)
elif "- The durations of audio is:" in line:
duration = float(line.split(" ")[-2])
duration_list.append(duration)
else:
print("Please set correct protocol, http or websocket")
assert (len(first_response_list) == len(final_response_list) and
len(final_response_list) == len(duration_list))
avg_first_response = sum(first_response_list) / len(first_response_list)
avg_final_response = sum(final_response_list) / len(final_response_list)
avg_duration = sum(duration_list) / len(duration_list)
RTF = sum(final_response_list) / sum(duration_list)
print(
"************************* engine result ***************************************"
)
print(
f"test num: {len(duration_list)}, avg first response: {avg_first_response} s, avg final response: {avg_final_response} s, avg duration: {avg_duration}, RTF: {RTF}"
)
print(
f"min duration: {min(duration_list)} s, max duration: {max(duration_list)} s"
)
print(
f"max first response: {max(first_response_list)} s, min first response: {min(first_response_list)} s"
)
print(
f"max final response: {max(final_response_list)} s, min final response: {min(final_response_list)} s"
)
return first_response, final_response, duration, save_audio_success, receive_time_list, chunk_duration_list
if __name__ == "__main__":
......@@ -142,10 +60,18 @@ if __name__ == "__main__":
default=0,
help='Sampling rate, the default is the same as the model')
parser.add_argument(
"--server", type=str, help="server ip", default="127.0.0.1")
"--server_ip", type=str, help="server ip", default="127.0.0.1")
parser.add_argument("--port", type=int, help="server port", default=8092)
parser.add_argument(
"--protocol",
type=str,
choices=['http', 'websocket'],
help="server protocol",
default="http")
parser.add_argument(
"--output_dir", type=str, default="./output", help="output dir")
parser.add_argument(
"--play", type=bool, help="whether to play audio", default=False)
args = parser.parse_args()
......@@ -155,13 +81,35 @@ if __name__ == "__main__":
first_response_list = []
final_response_list = []
duration_list = []
all_delay_list = []
packet_count = 0.0
sentences = get_sentences(text_file=args.text, lang="zh")
for utt_id, sentence in sentences:
first_response, final_response, duration = test(args, sentence, utt_id)
first_response, final_response, duration, save_audio_success, receive_time_list, chunk_duration_list = test(
args, sentence, utt_id)
delay_time_list = compute_delay(receive_time_list, chunk_duration_list)
first_response_list.append(first_response)
final_response_list.append(final_response)
duration_list.append(duration)
packet_count += len(receive_time_list)
print(f"句子:{sentence}")
print(f"首包响应时间:{first_response} s")
print(f"尾包响应时间:{final_response} s")
print(f"音频时长:{duration} s")
print(f"该句RTF:{final_response/duration}")
if delay_time_list != []:
for t in delay_time_list:
all_delay_list.append(t)
print(
f"该句流式合成的延迟情况:总包个数:{len(receive_time_list)},延迟包个数:{len(delay_time_list)}, 最小延迟时间:{min(delay_time_list)} s, 最大延迟时间:{max(delay_time_list)} s, 平均延迟时间:{sum(delay_time_list)/len(delay_time_list)} s, 延迟率:{len(delay_time_list)/len(receive_time_list)}"
)
else:
print("该句流式合成无延迟情况")
assert (len(first_response_list) == len(final_response_list) and
len(final_response_list) == len(duration_list))
......@@ -170,19 +118,35 @@ if __name__ == "__main__":
avg_final_response = sum(final_response_list) / len(final_response_list)
avg_duration = sum(duration_list) / len(duration_list)
RTF = sum(final_response_list) / sum(duration_list)
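# summarize per-packet delay across all sentences; if no packet was ever late, report zeros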
if all_delay_list != []:
delay_count = len(all_delay_list)
avg_delay = sum(all_delay_list) / len(all_delay_list)
delay_ratio = len(all_delay_list) / packet_count
min_delay = min(all_delay_list)
max_delay = max(all_delay_list)
else:
delay_count = 0.0
avg_delay = 0.0
delay_ratio = 0.0
min_delay = 0.0
max_delay = 0.0
print(
"************************* server/client result ***************************************"
)
print(
f"test num: {len(duration_list)}, avg first response: {avg_first_response} s, avg final response: {avg_final_response} s, avg duration: {avg_duration}, RTF: {RTF}"
f"test num: {len(duration_list)}, avg first response: {avg_first_response} s, avg final response: {avg_final_response} s, avg duration: {avg_duration}, RTF: {RTF}."
)
print(
f"test num: {len(duration_list)}, packet count: {packet_count}, delay count: {delay_count}, avg delay time: {avg_delay} s, delay ratio: {delay_ratio} "
)
print(
f"min duration: {min(duration_list)} s, max duration: {max(duration_list)} s"
)
print(
f"max first response: {max(first_response_list)} s, min first response: {min(first_response_list)} s"
f"min first response: {min(first_response_list)} s, max first response: {max(first_response_list)} s."
)
print(
f"max final response: {max(final_response_list)} s, min final response: {min(final_response_list)} s"
f"min final response: {min(final_response_list)} s, max final response: {max(final_response_list)} s."
)
print(f"min delay: {min_delay} s, max delay: {max_delay}")