diff --git a/demos/streaming_tts_server/conf/tts_online_application.yaml b/demos/streaming_tts_server/conf/tts_online_application.yaml index 67d4641a0f75c08d57798ff43cca99be25d35298..714f4a68969b2ec196c483692c4f712baeaad3a3 100644 --- a/demos/streaming_tts_server/conf/tts_online_application.yaml +++ b/demos/streaming_tts_server/conf/tts_online_application.yaml @@ -43,12 +43,12 @@ tts_online: device: 'cpu' # set 'gpu:id' or 'cpu' # am_block and am_pad are only used by the fastspeech2_cnndecoder_onnx model for streaming am inference; # when am_pad is set to 12, streaming synthetic audio is identical to non-streaming synthetic audio - am_block: 42 + am_block: 72 am_pad: 12 # voc_pad and voc_block are used by the voc model for streaming voc inference; # when the voc model is mb_melgan_csmsc and voc_pad is set to 14, streaming synthetic audio is identical to non-streaming synthetic audio; with the minimum pad of 7, streaming synthetic audio still sounds normal # when the voc model is hifigan_csmsc and voc_pad is set to 20, streaming synthetic audio is identical to non-streaming synthetic audio; with voc_pad set to 14, streaming synthetic audio still sounds normal - voc_block: 14 + voc_block: 36 voc_pad: 14 @@ -91,12 +91,12 @@ tts_online-onnx: lang: 'zh' # am_block and am_pad are only used by the fastspeech2_cnndecoder_onnx model for streaming am inference; # when am_pad is set to 12, streaming synthetic audio is identical to non-streaming synthetic audio - am_block: 42 + am_block: 72 am_pad: 12 # voc_pad and voc_block are used by the voc model for streaming voc inference; # when the voc model is mb_melgan_csmsc_onnx and voc_pad is set to 14, streaming synthetic audio is identical to non-streaming synthetic audio; with the minimum pad of 7, streaming synthetic audio still sounds normal # when the voc model is hifigan_csmsc_onnx and voc_pad is set to 20, streaming synthetic audio is identical to non-streaming synthetic audio; with voc_pad set to 14, streaming synthetic audio still sounds normal - voc_block: 14 + voc_block: 36 voc_pad: 14 # voc_upsample should be the same as n_shift in the voc config.
voc_upsample: 300 diff --git a/paddlespeech/server/bin/paddlespeech_client.py b/paddlespeech/server/bin/paddlespeech_client.py index 8677279b77b77ab22d565cc8bce1a392b6adadd1..19bdc10b1ac03f5c197f5801a71c413dfc77b688 100644 --- a/paddlespeech/server/bin/paddlespeech_client.py +++ b/paddlespeech/server/bin/paddlespeech_client.py @@ -31,6 +31,7 @@ from ..util import stats_wrapper from paddlespeech.cli.log import logger from paddlespeech.server.utils.audio_handler import ASRWsAudioHandler from paddlespeech.server.utils.audio_process import wav2pcm +from paddlespeech.server.utils.util import compute_delay from paddlespeech.server.utils.util import wav2base64 __all__ = [ @@ -221,7 +222,7 @@ class TTSOnlineClientExecutor(BaseExecutor): play = args.play try: - res = self( + self( input=input_, server_ip=server_ip, port=port, @@ -257,17 +258,42 @@ class TTSOnlineClientExecutor(BaseExecutor): logger.info("tts http client start") from paddlespeech.server.utils.audio_handler import TTSHttpHandler handler = TTSHttpHandler(server_ip, port, play) - handler.run(input, spk_id, speed, volume, sample_rate, output) + first_response, final_response, duration, save_audio_success, receive_time_list, chunk_duration_list = handler.run( + input, spk_id, speed, volume, sample_rate, output) + delay_time_list = compute_delay(receive_time_list, + chunk_duration_list) elif protocol == "websocket": from paddlespeech.server.utils.audio_handler import TTSWsHandler logger.info("tts websocket client start") handler = TTSWsHandler(server_ip, port, play) loop = asyncio.get_event_loop() - loop.run_until_complete(handler.run(input, output)) + first_response, final_response, duration, save_audio_success, receive_time_list, chunk_duration_list = loop.run_until_complete( + handler.run(input, output)) + delay_time_list = compute_delay(receive_time_list, + chunk_duration_list) else: logger.error("Please set a correct protocol: http or websocket") + return False + + logger.info(f"sentence: {input}") + logger.info(f"duration: {duration} s") + logger.info(f"first response: {first_response} s") + logger.info(f"final response: {final_response} s") + logger.info(f"RTF: {final_response/duration}") + if output is not None: + if save_audio_success: + logger.info(f"Audio successfully saved in {output}") + else: + logger.error("Audio save failed.") + + if delay_time_list != []: + logger.info( + f"Delay situation: total number of packets: {len(receive_time_list)}, the number of delayed packets: {len(delay_time_list)}, minimum delay time: {min(delay_time_list)} s, maximum delay time: {max(delay_time_list)} s, average delay time: {sum(delay_time_list)/len(delay_time_list)} s, delay rate: {len(delay_time_list)/len(receive_time_list)}" + ) + else: + logger.info("The sentence has no delay in streaming synthesis.") @cli_client_register( diff --git a/paddlespeech/server/conf/application.yaml b/paddlespeech/server/conf/application.yaml index b6a9942ed7bfa4248004a2c27d9a32bbb3ac0386..31a37ef04e2dc910314bad88c1e81fdbff07bb4b 100644 --- a/paddlespeech/server/conf/application.yaml +++ b/paddlespeech/server/conf/application.yaml @@ -1,4 +1,4 @@ -# This is the parameter configuration file for PaddleSpeech Serving. +# This is the parameter configuration file for PaddleSpeech Offline Serving.
################################################################################# # SERVER SETTING # @@ -7,9 +7,7 @@ host: 127.0.0.1 port: 8090 # The task format in the engine_list is: <speech task>_<engine type> -# task choices = ['asr_python', 'asr_inference', 'tts_python', 'tts_inference'] -# protocol = ['websocket', 'http'] (only one can be selected). -# http only support offline engine type. +# task choices = ['asr_python', 'asr_inference', 'tts_python', 'tts_inference', 'cls_python', 'cls_inference'] protocol: 'http' engine_list: ['asr_python', 'tts_python', 'cls_python', 'text_python', 'vector_python'] @@ -50,24 +48,6 @@ asr_inference: summary: True # False -> do not show predictor config -################### speech task: asr; engine_type: online ####################### -asr_online: - model_type: 'deepspeech2online_aishell' - am_model: # the pdmodel file of am static model [optional] - am_params: # the pdiparams file of am static model [optional] - lang: 'zh' - sample_rate: 16000 - cfg_path: - decode_method: - force_yes: True - - am_predictor_conf: - device: # set 'gpu:id' or 'cpu' - switch_ir_optim: True - glog_info: False # True -> print glog - summary: True # False -> do not show predictor config - - ################################### TTS ######################################### ################### speech task: tts; engine_type: python ####################### tts_python: diff --git a/paddlespeech/server/conf/tts_online_application.yaml b/paddlespeech/server/conf/tts_online_application.yaml index 67d4641a0f75c08d57798ff43cca99be25d35298..714f4a68969b2ec196c483692c4f712baeaad3a3 100644 --- a/paddlespeech/server/conf/tts_online_application.yaml +++ b/paddlespeech/server/conf/tts_online_application.yaml @@ -43,12 +43,12 @@ tts_online: device: 'cpu' # set 'gpu:id' or 'cpu' # am_block and am_pad are only used by the fastspeech2_cnndecoder_onnx model for streaming am inference; # when am_pad is set to 12, streaming synthetic audio is identical to non-streaming synthetic audio - am_block: 42 + am_block: 72 am_pad: 12 # voc_pad and voc_block are used by the voc model for streaming voc inference; # when the voc model is mb_melgan_csmsc and voc_pad is set to 14, streaming synthetic audio is identical to non-streaming synthetic audio; with the minimum pad of 7, streaming synthetic audio still sounds normal # when the voc model is hifigan_csmsc and voc_pad is set to 20, streaming synthetic audio is identical to non-streaming synthetic audio; with voc_pad set to 14, streaming synthetic audio still sounds normal - voc_block: 14 + voc_block: 36 voc_pad: 14 @@ -91,12 +91,12 @@ tts_online-onnx: lang: 'zh' # am_block and am_pad are only used by the fastspeech2_cnndecoder_onnx model for streaming am inference; # when am_pad is set to 12, streaming synthetic audio is identical to non-streaming synthetic audio - am_block: 42 + am_block: 72 am_pad: 12 # voc_pad and voc_block are used by the voc model for streaming voc inference; # when the voc model is mb_melgan_csmsc_onnx and voc_pad is set to 14, streaming synthetic audio is identical to non-streaming synthetic audio; with the minimum pad of 7, streaming synthetic audio still sounds normal # when the voc model is hifigan_csmsc_onnx and voc_pad is set to 20, streaming synthetic audio is identical to non-streaming synthetic audio; with voc_pad set to 14, streaming synthetic audio still sounds normal - voc_block: 14 + voc_block: 36 voc_pad: 14 # voc_upsample should be the same as n_shift in the voc config.
voc_upsample: 300 diff --git a/paddlespeech/server/engine/asr/online/asr_engine.py b/paddlespeech/server/engine/asr/online/asr_engine.py index 99d34a3050eff54d2185ff6d0dda0ffb9ae27dbe..ad1e6fa390a1b290a052a5eb976fd34149b2a494 100644 --- a/paddlespeech/server/engine/asr/online/asr_engine.py +++ b/paddlespeech/server/engine/asr/online/asr_engine.py @@ -20,10 +20,9 @@ import paddle from numpy import float32 from yacs.config import CfgNode +from .pretrained_models import pretrained_models from paddlespeech.cli.asr.infer import ASRExecutor -from paddlespeech.cli.asr.infer import model_alias from paddlespeech.cli.log import logger -from paddlespeech.cli.utils import download_and_decompress from paddlespeech.cli.utils import MODEL_HOME from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer from paddlespeech.s2t.frontend.speech import SpeechSegment @@ -40,45 +39,6 @@ from paddlespeech.server.utils.paddle_predictor import init_predictor __all__ = ['ASREngine'] -pretrained_models = { - "deepspeech2online_aishell-zh-16k": { - 'url': - 'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_fbank161_ckpt_0.2.1.model.tar.gz', - 'md5': - '98b87b171b7240b7cae6e07d8d0bc9be', - 'cfg_path': - 'model.yaml', - 'ckpt_path': - 'exp/deepspeech2_online/checkpoints/avg_1', - 'model': - 'exp/deepspeech2_online/checkpoints/avg_1.jit.pdmodel', - 'params': - 'exp/deepspeech2_online/checkpoints/avg_1.jit.pdiparams', - 'lm_url': - 'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm', - 'lm_md5': - '29e02312deb2e59b3c8686c7966d4fe3' - }, - "conformer_online_multicn-zh-16k": { - 'url': - 'https://paddlespeech.bj.bcebos.com/s2t/multi_cn/asr1/asr1_chunk_conformer_multi_cn_ckpt_0.2.3.model.tar.gz', - 'md5': - '0ac93d390552336f2a906aec9e33c5fa', - 'cfg_path': - 'model.yaml', - 'ckpt_path': - 'exp/chunk_conformer/checkpoints/multi_cn', - 'model': - 'exp/chunk_conformer/checkpoints/multi_cn.pdparams', - 'params': - 'exp/chunk_conformer/checkpoints/multi_cn.pdparams', - 'lm_url': - 'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm', - 'lm_md5': - '29e02312deb2e59b3c8686c7966d4fe3' - }, -} - # ASR server connection process class class PaddleASRConnectionHanddler: @@ -625,24 +585,7 @@ class PaddleASRConnectionHanddler: class ASRServerExecutor(ASRExecutor): def __init__(self): super().__init__() - pass - - def _get_pretrained_path(self, tag: str) -> os.PathLike: - """ - Download and returns pretrained resources path of current task. 
- """ - support_models = list(pretrained_models.keys()) - assert tag in pretrained_models, 'The model "{}" you want to use has not been supported, please choose other models.\nThe support models includes:\n\t\t{}\n'.format( - tag, '\n\t\t'.join(support_models)) - - res_path = os.path.join(MODEL_HOME, tag) - decompressed_path = download_and_decompress(pretrained_models[tag], - res_path) - decompressed_path = os.path.abspath(decompressed_path) - logger.info( - 'Use pretrained model stored in: {}'.format(decompressed_path)) - - return decompressed_path + self.pretrained_models = pretrained_models def _init_from_path(self, model_type: str='deepspeech2online_aishell', @@ -658,20 +601,20 @@ class ASRServerExecutor(ASRExecutor): """ self.model_type = model_type self.sample_rate = sample_rate + sample_rate_str = '16k' if sample_rate == 16000 else '8k' + tag = model_type + '-' + lang + '-' + sample_rate_str if cfg_path is None or am_model is None or am_params is None: - sample_rate_str = '16k' if sample_rate == 16000 else '8k' - tag = model_type + '-' + lang + '-' + sample_rate_str logger.info(f"Load the pretrained model, tag = {tag}") res_path = self._get_pretrained_path(tag) # wenetspeech_zh self.res_path = res_path - self.cfg_path = os.path.join(res_path, - pretrained_models[tag]['cfg_path']) + self.cfg_path = os.path.join( + res_path, self.pretrained_models[tag]['cfg_path']) self.am_model = os.path.join(res_path, - pretrained_models[tag]['model']) + self.pretrained_models[tag]['model']) self.am_params = os.path.join(res_path, - pretrained_models[tag]['params']) + self.pretrained_models[tag]['params']) logger.info(res_path) else: self.cfg_path = os.path.abspath(cfg_path) @@ -699,8 +642,8 @@ class ASRServerExecutor(ASRExecutor): self.text_feature = TextFeaturizer( unit_type=self.config.unit_type, vocab=self.vocab) - lm_url = pretrained_models[tag]['lm_url'] - lm_md5 = pretrained_models[tag]['lm_md5'] + lm_url = self.pretrained_models[tag]['lm_url'] + lm_md5 = self.pretrained_models[tag]['lm_md5'] logger.info(f"Start to load language model {lm_url}") self.download_lm( lm_url, @@ -773,7 +716,7 @@ class ASRServerExecutor(ASRExecutor): model_name = model_type[:model_type.rindex( '_')] # model_type: {model_name}_{dataset} logger.info(f"model name: {model_name}") - model_class = dynamic_import(model_name, model_alias) + model_class = dynamic_import(model_name, self.model_alias) model_conf = self.config model = model_class.from_config(model_conf) self.model = model diff --git a/paddlespeech/server/engine/asr/online/pretrained_models.py b/paddlespeech/server/engine/asr/online/pretrained_models.py new file mode 100644 index 0000000000000000000000000000000000000000..005977b46b309d42f2b0bdf981fdfdfef22fdcb2 --- /dev/null +++ b/paddlespeech/server/engine/asr/online/pretrained_models.py @@ -0,0 +1,52 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +pretrained_models = { + "deepspeech2online_aishell-zh-16k": { + 'url': + 'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_fbank161_ckpt_0.2.1.model.tar.gz', + 'md5': + '98b87b171b7240b7cae6e07d8d0bc9be', + 'cfg_path': + 'model.yaml', + 'ckpt_path': + 'exp/deepspeech2_online/checkpoints/avg_1', + 'model': + 'exp/deepspeech2_online/checkpoints/avg_1.jit.pdmodel', + 'params': + 'exp/deepspeech2_online/checkpoints/avg_1.jit.pdiparams', + 'lm_url': + 'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm', + 'lm_md5': + '29e02312deb2e59b3c8686c7966d4fe3' + }, + "conformer_online_multicn-zh-16k": { + 'url': + 'https://paddlespeech.bj.bcebos.com/s2t/multi_cn/asr1/asr1_chunk_conformer_multi_cn_ckpt_0.2.3.model.tar.gz', + 'md5': + '0ac93d390552336f2a906aec9e33c5fa', + 'cfg_path': + 'model.yaml', + 'ckpt_path': + 'exp/chunk_conformer/checkpoints/multi_cn', + 'model': + 'exp/chunk_conformer/checkpoints/multi_cn.pdparams', + 'params': + 'exp/chunk_conformer/checkpoints/multi_cn.pdparams', + 'lm_url': + 'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm', + 'lm_md5': + '29e02312deb2e59b3c8686c7966d4fe3' + }, +} diff --git a/paddlespeech/server/engine/asr/paddleinference/asr_engine.py b/paddlespeech/server/engine/asr/paddleinference/asr_engine.py index 1925bf1d623613d073bb028133a348842b591127..e275f1088f648df62947ded43f297cbb8d2c70c2 100644 --- a/paddlespeech/server/engine/asr/paddleinference/asr_engine.py +++ b/paddlespeech/server/engine/asr/paddleinference/asr_engine.py @@ -19,6 +19,7 @@ from typing import Optional import paddle from yacs.config import CfgNode +from .pretrained_models import pretrained_models from paddlespeech.cli.asr.infer import ASRExecutor from paddlespeech.cli.log import logger from paddlespeech.cli.utils import MODEL_HOME @@ -31,32 +32,11 @@ from paddlespeech.server.utils.paddle_predictor import run_model __all__ = ['ASREngine'] -pretrained_models = { - "deepspeech2offline_aishell-zh-16k": { - 'url': - 'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_aishell_ckpt_0.1.1.model.tar.gz', - 'md5': - '932c3593d62fe5c741b59b31318aa314', - 'cfg_path': - 'model.yaml', - 'ckpt_path': - 'exp/deepspeech2/checkpoints/avg_1', - 'model': - 'exp/deepspeech2/checkpoints/avg_1.jit.pdmodel', - 'params': - 'exp/deepspeech2/checkpoints/avg_1.jit.pdiparams', - 'lm_url': - 'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm', - 'lm_md5': - '29e02312deb2e59b3c8686c7966d4fe3' - }, -} - class ASRServerExecutor(ASRExecutor): def __init__(self): super().__init__() - pass + self.pretrained_models = pretrained_models def _init_from_path(self, model_type: str='wenetspeech', @@ -71,18 +51,18 @@ class ASRServerExecutor(ASRExecutor): Init model and other resources from a specific path. 
""" + sample_rate_str = '16k' if sample_rate == 16000 else '8k' + tag = model_type + '-' + lang + '-' + sample_rate_str if cfg_path is None or am_model is None or am_params is None: - sample_rate_str = '16k' if sample_rate == 16000 else '8k' - tag = model_type + '-' + lang + '-' + sample_rate_str res_path = self._get_pretrained_path(tag) # wenetspeech_zh self.res_path = res_path - self.cfg_path = os.path.join(res_path, - pretrained_models[tag]['cfg_path']) + self.cfg_path = os.path.join( + res_path, self.pretrained_models[tag]['cfg_path']) self.am_model = os.path.join(res_path, - pretrained_models[tag]['model']) + self.pretrained_models[tag]['model']) self.am_params = os.path.join(res_path, - pretrained_models[tag]['params']) + self.pretrained_models[tag]['params']) logger.info(res_path) logger.info(self.cfg_path) logger.info(self.am_model) @@ -109,8 +89,8 @@ class ASRServerExecutor(ASRExecutor): self.text_feature = TextFeaturizer( unit_type=self.config.unit_type, vocab=self.vocab) - lm_url = pretrained_models[tag]['lm_url'] - lm_md5 = pretrained_models[tag]['lm_md5'] + lm_url = self.pretrained_models[tag]['lm_url'] + lm_md5 = self.pretrained_models[tag]['lm_md5'] self.download_lm( lm_url, os.path.dirname(self.config.decode.lang_model_path), lm_md5) diff --git a/paddlespeech/server/engine/asr/paddleinference/pretrained_models.py b/paddlespeech/server/engine/asr/paddleinference/pretrained_models.py new file mode 100644 index 0000000000000000000000000000000000000000..c4c23e38cfb0b126e91090053054bcc50dc733e1 --- /dev/null +++ b/paddlespeech/server/engine/asr/paddleinference/pretrained_models.py @@ -0,0 +1,34 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +pretrained_models = { + "deepspeech2offline_aishell-zh-16k": { + 'url': + 'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_aishell_ckpt_0.1.1.model.tar.gz', + 'md5': + '932c3593d62fe5c741b59b31318aa314', + 'cfg_path': + 'model.yaml', + 'ckpt_path': + 'exp/deepspeech2/checkpoints/avg_1', + 'model': + 'exp/deepspeech2/checkpoints/avg_1.jit.pdmodel', + 'params': + 'exp/deepspeech2/checkpoints/avg_1.jit.pdiparams', + 'lm_url': + 'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm', + 'lm_md5': + '29e02312deb2e59b3c8686c7966d4fe3' + }, +} diff --git a/paddlespeech/server/engine/cls/paddleinference/cls_engine.py b/paddlespeech/server/engine/cls/paddleinference/cls_engine.py index 3982effd902c9d79b7b7684a7bd0268d0e8c1049..0906c2412d36f2d27393731da18e994772c2addd 100644 --- a/paddlespeech/server/engine/cls/paddleinference/cls_engine.py +++ b/paddlespeech/server/engine/cls/paddleinference/cls_engine.py @@ -20,83 +20,20 @@ import numpy as np import paddle import yaml +from .pretrained_models import pretrained_models from paddlespeech.cli.cls.infer import CLSExecutor from paddlespeech.cli.log import logger -from paddlespeech.cli.utils import download_and_decompress -from paddlespeech.cli.utils import MODEL_HOME from paddlespeech.server.engine.base_engine import BaseEngine from paddlespeech.server.utils.paddle_predictor import init_predictor from paddlespeech.server.utils.paddle_predictor import run_model __all__ = ['CLSEngine'] -pretrained_models = { - "panns_cnn6-32k": { - 'url': - 'https://paddlespeech.bj.bcebos.com/cls/inference_model/panns_cnn6_static.tar.gz', - 'md5': - 'da087c31046d23281d8ec5188c1967da', - 'cfg_path': - 'panns.yaml', - 'model_path': - 'inference.pdmodel', - 'params_path': - 'inference.pdiparams', - 'label_file': - 'audioset_labels.txt', - }, - "panns_cnn10-32k": { - 'url': - 'https://paddlespeech.bj.bcebos.com/cls/inference_model/panns_cnn10_static.tar.gz', - 'md5': - '5460cc6eafbfaf0f261cc75b90284ae1', - 'cfg_path': - 'panns.yaml', - 'model_path': - 'inference.pdmodel', - 'params_path': - 'inference.pdiparams', - 'label_file': - 'audioset_labels.txt', - }, - "panns_cnn14-32k": { - 'url': - 'https://paddlespeech.bj.bcebos.com/cls/inference_model/panns_cnn14_static.tar.gz', - 'md5': - 'ccc80b194821274da79466862b2ab00f', - 'cfg_path': - 'panns.yaml', - 'model_path': - 'inference.pdmodel', - 'params_path': - 'inference.pdiparams', - 'label_file': - 'audioset_labels.txt', - }, -} - class CLSServerExecutor(CLSExecutor): def __init__(self): super().__init__() - pass - - def _get_pretrained_path(self, tag: str) -> os.PathLike: - """ - Download and returns pretrained resources path of current task. 
- """ - support_models = list(pretrained_models.keys()) - assert tag in pretrained_models, 'The model "{}" you want to use has not been supported, please choose other models.\nThe support models includes:\n\t\t{}\n'.format( - tag, '\n\t\t'.join(support_models)) - - res_path = os.path.join(MODEL_HOME, tag) - decompressed_path = download_and_decompress(pretrained_models[tag], - res_path) - decompressed_path = os.path.abspath(decompressed_path) - logger.info( - 'Use pretrained model stored in: {}'.format(decompressed_path)) - - return decompressed_path + self.pretrained_models = pretrained_models def _init_from_path( self, @@ -113,14 +50,14 @@ class CLSServerExecutor(CLSExecutor): if cfg_path is None or model_path is None or params_path is None or label_file is None: tag = model_type + '-' + '32k' self.res_path = self._get_pretrained_path(tag) - self.cfg_path = os.path.join(self.res_path, - pretrained_models[tag]['cfg_path']) - self.model_path = os.path.join(self.res_path, - pretrained_models[tag]['model_path']) + self.cfg_path = os.path.join( + self.res_path, self.pretrained_models[tag]['cfg_path']) + self.model_path = os.path.join( + self.res_path, self.pretrained_models[tag]['model_path']) self.params_path = os.path.join( - self.res_path, pretrained_models[tag]['params_path']) - self.label_file = os.path.join(self.res_path, - pretrained_models[tag]['label_file']) + self.res_path, self.pretrained_models[tag]['params_path']) + self.label_file = os.path.join( + self.res_path, self.pretrained_models[tag]['label_file']) else: self.cfg_path = os.path.abspath(cfg_path) self.model_path = os.path.abspath(model_path) diff --git a/paddlespeech/server/engine/cls/paddleinference/pretrained_models.py b/paddlespeech/server/engine/cls/paddleinference/pretrained_models.py new file mode 100644 index 0000000000000000000000000000000000000000..e4914874600c2198e434d267c775dea66f3f252a --- /dev/null +++ b/paddlespeech/server/engine/cls/paddleinference/pretrained_models.py @@ -0,0 +1,58 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +pretrained_models = { + "panns_cnn6-32k": { + 'url': + 'https://paddlespeech.bj.bcebos.com/cls/inference_model/panns_cnn6_static.tar.gz', + 'md5': + 'da087c31046d23281d8ec5188c1967da', + 'cfg_path': + 'panns.yaml', + 'model_path': + 'inference.pdmodel', + 'params_path': + 'inference.pdiparams', + 'label_file': + 'audioset_labels.txt', + }, + "panns_cnn10-32k": { + 'url': + 'https://paddlespeech.bj.bcebos.com/cls/inference_model/panns_cnn10_static.tar.gz', + 'md5': + '5460cc6eafbfaf0f261cc75b90284ae1', + 'cfg_path': + 'panns.yaml', + 'model_path': + 'inference.pdmodel', + 'params_path': + 'inference.pdiparams', + 'label_file': + 'audioset_labels.txt', + }, + "panns_cnn14-32k": { + 'url': + 'https://paddlespeech.bj.bcebos.com/cls/inference_model/panns_cnn14_static.tar.gz', + 'md5': + 'ccc80b194821274da79466862b2ab00f', + 'cfg_path': + 'panns.yaml', + 'model_path': + 'inference.pdmodel', + 'params_path': + 'inference.pdiparams', + 'label_file': + 'audioset_labels.txt', + }, +} diff --git a/paddlespeech/server/engine/tts/online/onnx/pretrained_models.py b/paddlespeech/server/engine/tts/online/onnx/pretrained_models.py new file mode 100644 index 0000000000000000000000000000000000000000..789f5be7d7ca16965459fec6df7e40f7713ee104 --- /dev/null +++ b/paddlespeech/server/engine/tts/online/onnx/pretrained_models.py @@ -0,0 +1,69 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
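One detail worth noticing in the ONNX registry below: 'ckpt' is a list, because the streaming fastspeech2_cnndecoder model ships as three separate ONNX graphs (encoder, decoder, postnet) that the engine loads individually, while the plain fastspeech2 model and the vocoders use a single file. A sketch of the indexing, mirroring the _init_from_path changes later in this diff:

    am_tag = 'fastspeech2_cnndecoder_csmsc_onnx' + '-' + 'zh'
    ckpt_files = pretrained_models[am_tag]['ckpt']
    # Three graphs for the streaming acoustic model:
    am_encoder_infer, am_decoder, am_postnet = ckpt_files
    # A single graph for the vocoder:
    voc_ckpt = pretrained_models['mb_melgan_csmsc_onnx' + '-' + 'zh']['ckpt']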
+# support online model +pretrained_models = { + # fastspeech2 + "fastspeech2_csmsc_onnx-zh": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_csmsc_onnx_0.2.0.zip', + 'md5': + 'fd3ad38d83273ad51f0ea4f4abf3ab4e', + 'ckpt': ['fastspeech2_csmsc.onnx'], + 'phones_dict': + 'phone_id_map.txt', + 'sample_rate': + 24000, + }, + "fastspeech2_cnndecoder_csmsc_onnx-zh": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_streaming_onnx_1.0.0.zip', + 'md5': + '5f70e1a6bcd29d72d54e7931aa86f266', + 'ckpt': [ + 'fastspeech2_csmsc_am_encoder_infer.onnx', + 'fastspeech2_csmsc_am_decoder.onnx', + 'fastspeech2_csmsc_am_postnet.onnx', + ], + 'speech_stats': + 'speech_stats.npy', + 'phones_dict': + 'phone_id_map.txt', + 'sample_rate': + 24000, + }, + + # mb_melgan + "mb_melgan_csmsc_onnx-zh": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_onnx_0.2.0.zip', + 'md5': + '5b83ec746e8414bc29032d954ffd07ec', + 'ckpt': + 'mb_melgan_csmsc.onnx', + 'sample_rate': + 24000, + }, + + # hifigan + "hifigan_csmsc_onnx-zh": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_onnx_0.2.0.zip', + 'md5': + '1a7dc0385875889e46952e50c0994a6b', + 'ckpt': + 'hifigan_csmsc.onnx', + 'sample_rate': + 24000, + }, +} diff --git a/paddlespeech/server/engine/tts/online/onnx/tts_engine.py b/paddlespeech/server/engine/tts/online/onnx/tts_engine.py index 22c1c960700d74d003914f59595679cb8dbad9f1..792442065074af9168f84b1ce695bb484b01e388 100644 --- a/paddlespeech/server/engine/tts/online/onnx/tts_engine.py +++ b/paddlespeech/server/engine/tts/online/onnx/tts_engine.py @@ -20,10 +20,9 @@ from typing import Optional import numpy as np import paddle +from .pretrained_models import pretrained_models from paddlespeech.cli.log import logger from paddlespeech.cli.tts.infer import TTSExecutor -from paddlespeech.cli.utils import download_and_decompress -from paddlespeech.cli.utils import MODEL_HOME from paddlespeech.server.engine.base_engine import BaseEngine from paddlespeech.server.utils.audio_process import float2pcm from paddlespeech.server.utils.onnx_infer import get_sess @@ -34,83 +33,6 @@ from paddlespeech.t2s.frontend.zh_frontend import Frontend __all__ = ['TTSEngine'] -# support online model -pretrained_models = { - # fastspeech2 - "fastspeech2_csmsc_onnx-zh": { - 'url': - 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_csmsc_onnx_0.2.0.zip', - 'md5': - 'fd3ad38d83273ad51f0ea4f4abf3ab4e', - 'ckpt': ['fastspeech2_csmsc.onnx'], - 'phones_dict': - 'phone_id_map.txt', - 'sample_rate': - 24000, - }, - "fastspeech2_cnndecoder_csmsc_onnx-zh": { - 'url': - 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_streaming_onnx_1.0.0.zip', - 'md5': - '5f70e1a6bcd29d72d54e7931aa86f266', - 'ckpt': [ - 'fastspeech2_csmsc_am_encoder_infer.onnx', - 'fastspeech2_csmsc_am_decoder.onnx', - 'fastspeech2_csmsc_am_postnet.onnx', - ], - 'speech_stats': - 'speech_stats.npy', - 'phones_dict': - 'phone_id_map.txt', - 'sample_rate': - 24000, - }, - - # mb_melgan - "mb_melgan_csmsc_onnx-zh": { - 'url': - 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_onnx_0.2.0.zip', - 'md5': - '5b83ec746e8414bc29032d954ffd07ec', - 'ckpt': - 'mb_melgan_csmsc.onnx', - 'sample_rate': - 24000, - }, - - # hifigan - "hifigan_csmsc_onnx-zh": { - 'url': - 
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_onnx_0.2.0.zip', - 'md5': - '1a7dc0385875889e46952e50c0994a6b', - 'ckpt': - 'hifigan_csmsc.onnx', - 'sample_rate': - 24000, - }, -} - -model_alias = { - # acoustic model - "fastspeech2": - "paddlespeech.t2s.models.fastspeech2:FastSpeech2", - "fastspeech2_inference": - "paddlespeech.t2s.models.fastspeech2:FastSpeech2Inference", - - # voc - "mb_melgan": - "paddlespeech.t2s.models.melgan:MelGANGenerator", - "mb_melgan_inference": - "paddlespeech.t2s.models.melgan:MelGANInference", - "hifigan": - "paddlespeech.t2s.models.hifigan:HiFiGANGenerator", - "hifigan_inference": - "paddlespeech.t2s.models.hifigan:HiFiGANInference", -} - -__all__ = ['TTSEngine'] - class TTSServerExecutor(TTSExecutor): def __init__(self, am_block, am_pad, voc_block, voc_pad, voc_upsample): @@ -122,23 +44,6 @@ class TTSServerExecutor(TTSExecutor): self.voc_upsample = voc_upsample self.pretrained_models = pretrained_models - self.model_alias = model_alias - - def _get_pretrained_path(self, tag: str) -> os.PathLike: - """ - #Download and returns pretrained resources path of current task. - """ - support_models = list(pretrained_models.keys()) - assert tag in pretrained_models, 'The model "{}" you want to use has not been supported, please choose other models.\nThe support models includes:\n\t\t{}\n'.format( - tag, '\n\t\t'.join(support_models)) - - res_path = os.path.join(MODEL_HOME, tag) - decompressed_path = download_and_decompress(pretrained_models[tag], - res_path) - decompressed_path = os.path.abspath(decompressed_path) - logger.info( - 'Use pretrained model stored in: {}'.format(decompressed_path)) - return decompressed_path def _init_from_path( self, @@ -173,10 +78,10 @@ class TTSServerExecutor(TTSExecutor): am_res_path = self._get_pretrained_path(am_tag) self.am_res_path = am_res_path self.am_ckpt = os.path.join( - am_res_path, pretrained_models[am_tag]['ckpt'][0]) + am_res_path, self.pretrained_models[am_tag]['ckpt'][0]) # must have phones_dict in acoustic self.phones_dict = os.path.join( - am_res_path, pretrained_models[am_tag]['phones_dict']) + am_res_path, self.pretrained_models[am_tag]['phones_dict']) else: self.am_ckpt = os.path.abspath(am_ckpt[0]) @@ -192,16 +97,16 @@ class TTSServerExecutor(TTSExecutor): am_res_path = self._get_pretrained_path(am_tag) self.am_res_path = am_res_path self.am_encoder_infer = os.path.join( - am_res_path, pretrained_models[am_tag]['ckpt'][0]) + am_res_path, self.pretrained_models[am_tag]['ckpt'][0]) self.am_decoder = os.path.join( - am_res_path, pretrained_models[am_tag]['ckpt'][1]) + am_res_path, self.pretrained_models[am_tag]['ckpt'][1]) self.am_postnet = os.path.join( - am_res_path, pretrained_models[am_tag]['ckpt'][2]) + am_res_path, self.pretrained_models[am_tag]['ckpt'][2]) # must have phones_dict in acoustic self.phones_dict = os.path.join( - am_res_path, pretrained_models[am_tag]['phones_dict']) + am_res_path, self.pretrained_models[am_tag]['phones_dict']) self.am_stat = os.path.join( - am_res_path, pretrained_models[am_tag]['speech_stats']) + am_res_path, self.pretrained_models[am_tag]['speech_stats']) else: self.am_encoder_infer = os.path.abspath(am_ckpt[0]) @@ -229,8 +134,8 @@ class TTSServerExecutor(TTSExecutor): if voc_ckpt is None: voc_res_path = self._get_pretrained_path(voc_tag) self.voc_res_path = voc_res_path - self.voc_ckpt = os.path.join(voc_res_path, - pretrained_models[voc_tag]['ckpt']) + self.voc_ckpt = os.path.join( + voc_res_path, self.pretrained_models[voc_tag]['ckpt']) 
else: self.voc_ckpt = os.path.abspath(voc_ckpt) self.voc_res_path = os.path.dirname(os.path.abspath(self.voc_ckpt)) @@ -283,7 +188,6 @@ class TTSServerExecutor(TTSExecutor): """ Model inference and result stored in self.output. """ - #import pdb;pdb.set_trace() am_block = self.am_block am_pad = self.am_pad @@ -453,10 +357,21 @@ class TTSEngine(BaseEngine): self.config.am_block, self.config.am_pad, self.config.voc_block, self.config.voc_pad, self.config.voc_upsample) - if "cpu" in self.config.am_sess_conf.device or "cpu" in self.config.voc_sess_conf.device: - paddle.set_device("cpu") - else: - paddle.set_device(self.config.am_sess_conf.device) + try: + if self.config.am_sess_conf.device is not None: + self.device = self.config.am_sess_conf.device + elif self.config.voc_sess_conf.device is not None: + self.device = self.config.voc_sess_conf.device + else: + self.device = paddle.get_device() + paddle.set_device(self.device) + except BaseException as e: + logger.error( + "Set device failed, please check if device is already used and the parameter 'device' in the yaml file" + ) + logger.error("Initialize TTS server engine Failed on device: %s." % + (self.device)) + return False try: self.executor._init_from_path( @@ -480,16 +395,17 @@ class TTSEngine(BaseEngine): (self.config.voc_sess_conf.device)) return False - logger.info("Initialize TTS server engine successfully on device: %s." % - (self.config.voc_sess_conf.device)) - # warm up try: self.warm_up() + logger.info("Warm up successfully.") except Exception as e: logger.error("Failed to warm up on tts engine.") return False + logger.info("Initialize TTS server engine successfully on device: %s." % + (self.config.voc_sess_conf.device)) + return True def warm_up(self): @@ -499,9 +415,7 @@ class TTSEngine(BaseEngine): sentence = "您好,欢迎使用语音合成服务。" if self.config.lang == 'en': sentence = "Hello and welcome to the speech synthesis service." - logger.info( - "*******************************warm up ********************************" - ) + logger.info("Start to warm up.") for i in range(3): for wav in self.executor.infer( text=sentence, @@ -512,9 +426,6 @@ class TTSEngine(BaseEngine): f"The first response time of the {i} warm up: {self.executor.first_response_time} s" ) break - logger.info( - "**********************************************************************" - ) def preprocess(self, text_bese64: str=None, text_bytes: bytes=None): # Convert byte to text diff --git a/paddlespeech/server/engine/tts/online/python/pretrained_models.py b/paddlespeech/server/engine/tts/online/python/pretrained_models.py new file mode 100644 index 0000000000000000000000000000000000000000..bf6aded51168c2c21172ec8101413b4cb0e05154 --- /dev/null +++ b/paddlespeech/server/engine/tts/online/python/pretrained_models.py @@ -0,0 +1,73 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
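The am_block/am_pad and voc_block/voc_pad values raised at the top of this diff control how the streaming engines slice their input: every chunk is block frames wide, plus pad frames of overlapping context on each side that are computed and then discarded. A minimal illustration of that windowing, assuming frame-indexed data (this is a sketch, not the repo's chunking utility):

    def iter_chunks(n_frames: int, block: int, pad: int):
        """Yield (start, end) windows of block frames plus pad context per side."""
        for begin in range(0, n_frames, block):
            yield max(0, begin - pad), min(n_frames, begin + block + pad)

    # With voc_block=36 and voc_pad=14, a 100-frame mel is covered by:
    # (0, 50), (22, 86), (58, 100)
    print(list(iter_chunks(100, 36, 14)))

Larger blocks mean fewer, bigger inference calls (better RTF, higher first-chunk latency), while pad trades extra computation for chunk seams that match the non-streaming output.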
+# support online model +pretrained_models = { + # fastspeech2 + "fastspeech2_csmsc-zh": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip', + 'md5': + '637d28a5e53aa60275612ba4393d5f22', + 'config': + 'default.yaml', + 'ckpt': + 'snapshot_iter_76000.pdz', + 'speech_stats': + 'speech_stats.npy', + 'phones_dict': + 'phone_id_map.txt', + }, + "fastspeech2_cnndecoder_csmsc-zh": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_ckpt_1.0.0.zip', + 'md5': + '6eb28e22ace73e0ebe7845f86478f89f', + 'config': + 'cnndecoder.yaml', + 'ckpt': + 'snapshot_iter_153000.pdz', + 'speech_stats': + 'speech_stats.npy', + 'phones_dict': + 'phone_id_map.txt', + }, + + # mb_melgan + "mb_melgan_csmsc-zh": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_ckpt_0.1.1.zip', + 'md5': + 'ee5f0604e20091f0d495b6ec4618b90d', + 'config': + 'default.yaml', + 'ckpt': + 'snapshot_iter_1000000.pdz', + 'speech_stats': + 'feats_stats.npy', + }, + + # hifigan + "hifigan_csmsc-zh": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_ckpt_0.1.1.zip', + 'md5': + 'dd40a3d88dfcf64513fba2f0f961ada6', + 'config': + 'default.yaml', + 'ckpt': + 'snapshot_iter_2500000.pdz', + 'speech_stats': + 'feats_stats.npy', + }, +} diff --git a/paddlespeech/server/engine/tts/online/python/tts_engine.py b/paddlespeech/server/engine/tts/online/python/tts_engine.py index 1f51586bc19149db4d4aac1142470cf824bbd197..1fca5283745325a21b8299c1fdbc661100af7aaf 100644 --- a/paddlespeech/server/engine/tts/online/python/tts_engine.py +++ b/paddlespeech/server/engine/tts/online/python/tts_engine.py @@ -22,10 +22,9 @@ import paddle import yaml from yacs.config import CfgNode +from .pretrained_models import pretrained_models from paddlespeech.cli.log import logger from paddlespeech.cli.tts.infer import TTSExecutor -from paddlespeech.cli.utils import download_and_decompress -from paddlespeech.cli.utils import MODEL_HOME from paddlespeech.s2t.utils.dynamic_import import dynamic_import from paddlespeech.server.engine.base_engine import BaseEngine from paddlespeech.server.utils.audio_process import float2pcm @@ -37,87 +36,6 @@ from paddlespeech.t2s.modules.normalizer import ZScore __all__ = ['TTSEngine'] -# support online model -pretrained_models = { - # fastspeech2 - "fastspeech2_csmsc-zh": { - 'url': - 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip', - 'md5': - '637d28a5e53aa60275612ba4393d5f22', - 'config': - 'default.yaml', - 'ckpt': - 'snapshot_iter_76000.pdz', - 'speech_stats': - 'speech_stats.npy', - 'phones_dict': - 'phone_id_map.txt', - }, - "fastspeech2_cnndecoder_csmsc-zh": { - 'url': - 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_ckpt_1.0.0.zip', - 'md5': - '6eb28e22ace73e0ebe7845f86478f89f', - 'config': - 'cnndecoder.yaml', - 'ckpt': - 'snapshot_iter_153000.pdz', - 'speech_stats': - 'speech_stats.npy', - 'phones_dict': - 'phone_id_map.txt', - }, - - # mb_melgan - "mb_melgan_csmsc-zh": { - 'url': - 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_ckpt_0.1.1.zip', - 'md5': - 'ee5f0604e20091f0d495b6ec4618b90d', - 'config': - 'default.yaml', - 'ckpt': - 'snapshot_iter_1000000.pdz', - 'speech_stats': - 'feats_stats.npy', - }, - - # hifigan - "hifigan_csmsc-zh": { - 'url': - 
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_ckpt_0.1.1.zip', - 'md5': - 'dd40a3d88dfcf64513fba2f0f961ada6', - 'config': - 'default.yaml', - 'ckpt': - 'snapshot_iter_2500000.pdz', - 'speech_stats': - 'feats_stats.npy', - }, -} - -model_alias = { - # acoustic model - "fastspeech2": - "paddlespeech.t2s.models.fastspeech2:FastSpeech2", - "fastspeech2_inference": - "paddlespeech.t2s.models.fastspeech2:FastSpeech2Inference", - - # voc - "mb_melgan": - "paddlespeech.t2s.models.melgan:MelGANGenerator", - "mb_melgan_inference": - "paddlespeech.t2s.models.melgan:MelGANInference", - "hifigan": - "paddlespeech.t2s.models.hifigan:HiFiGANGenerator", - "hifigan_inference": - "paddlespeech.t2s.models.hifigan:HiFiGANInference", -} - -__all__ = ['TTSEngine'] - class TTSServerExecutor(TTSExecutor): def __init__(self, am_block, am_pad, voc_block, voc_pad): @@ -126,6 +44,7 @@ class TTSServerExecutor(TTSExecutor): self.am_pad = am_pad self.voc_block = voc_block self.voc_pad = voc_pad + self.pretrained_models = pretrained_models def get_model_info(self, field: str, @@ -146,7 +65,7 @@ class TTSServerExecutor(TTSExecutor): [Tensor]: standard deviation """ - model_class = dynamic_import(model_name, model_alias) + model_class = dynamic_import(model_name, self.model_alias) if field == "am": odim = self.am_config.n_mels @@ -169,22 +88,6 @@ class TTSServerExecutor(TTSExecutor): return model, model_mu, model_std - def _get_pretrained_path(self, tag: str) -> os.PathLike: - """ - Download and returns pretrained resources path of current task. - """ - support_models = list(pretrained_models.keys()) - assert tag in pretrained_models, 'The model "{}" you want to use has not been supported, please choose other models.\nThe support models includes:\n\t\t{}\n'.format( - tag, '\n\t\t'.join(support_models)) - - res_path = os.path.join(MODEL_HOME, tag) - decompressed_path = download_and_decompress(pretrained_models[tag], - res_path) - decompressed_path = os.path.abspath(decompressed_path) - logger.info( - 'Use pretrained model stored in: {}'.format(decompressed_path)) - return decompressed_path - def _init_from_path( self, am: str='fastspeech2_csmsc', @@ -210,15 +113,15 @@ class TTSServerExecutor(TTSExecutor): if am_ckpt is None or am_config is None or am_stat is None or phones_dict is None: am_res_path = self._get_pretrained_path(am_tag) self.am_res_path = am_res_path - self.am_config = os.path.join(am_res_path, - pretrained_models[am_tag]['config']) + self.am_config = os.path.join( + am_res_path, self.pretrained_models[am_tag]['config']) self.am_ckpt = os.path.join(am_res_path, - pretrained_models[am_tag]['ckpt']) + self.pretrained_models[am_tag]['ckpt']) self.am_stat = os.path.join( - am_res_path, pretrained_models[am_tag]['speech_stats']) + am_res_path, self.pretrained_models[am_tag]['speech_stats']) # must have phones_dict in acoustic self.phones_dict = os.path.join( - am_res_path, pretrained_models[am_tag]['phones_dict']) + am_res_path, self.pretrained_models[am_tag]['phones_dict']) print("self.phones_dict:", self.phones_dict) logger.info(am_res_path) logger.info(self.am_config) @@ -239,12 +142,12 @@ class TTSServerExecutor(TTSExecutor): if voc_ckpt is None or voc_config is None or voc_stat is None: voc_res_path = self._get_pretrained_path(voc_tag) self.voc_res_path = voc_res_path - self.voc_config = os.path.join(voc_res_path, - pretrained_models[voc_tag]['config']) - self.voc_ckpt = os.path.join(voc_res_path, - pretrained_models[voc_tag]['ckpt']) + self.voc_config = os.path.join( + 
voc_res_path, self.pretrained_models[voc_tag]['config']) + self.voc_ckpt = os.path.join( + voc_res_path, self.pretrained_models[voc_tag]['ckpt']) self.voc_stat = os.path.join( - voc_res_path, pretrained_models[voc_tag]['speech_stats']) + voc_res_path, self.pretrained_models[voc_tag]['speech_stats']) logger.info(voc_res_path) logger.info(self.voc_config) logger.info(self.voc_ckpt) @@ -286,7 +189,7 @@ class TTSServerExecutor(TTSExecutor): self.am_ckpt, self.am_stat) am_normalizer = ZScore(am_mu, am_std) am_inference_class = dynamic_import(self.am_name + '_inference', - model_alias) + self.model_alias) self.am_inference = am_inference_class(am_normalizer, am) self.am_inference.eval() print("acoustic model done!") @@ -297,7 +200,7 @@ class TTSServerExecutor(TTSExecutor): self.voc_ckpt, self.voc_stat) voc_normalizer = ZScore(voc_mu, voc_std) voc_inference_class = dynamic_import(self.voc_name + '_inference', - model_alias) + self.model_alias) self.voc_inference = voc_inference_class(voc_normalizer, voc) self.voc_inference.eval() print("voc done!") @@ -477,7 +380,7 @@ class TTSEngine(BaseEngine): ), "Please set correct voc_block and voc_pad, they should be more than 0." try: - if self.config.device: + if self.config.device is not None: self.device = self.config.device else: self.device = paddle.get_device() @@ -513,16 +416,16 @@ class TTSEngine(BaseEngine): (self.device)) return False - logger.info("Initialize TTS server engine successfully on device: %s." % - (self.device)) - # warm up try: self.warm_up() + logger.info("Warm up successfully.") except Exception as e: logger.error("Failed to warm up on tts engine.") return False + logger.info("Initialize TTS server engine successfully on device: %s." % + (self.device)) return True def warm_up(self): @@ -532,9 +435,7 @@ class TTSEngine(BaseEngine): sentence = "您好,欢迎使用语音合成服务。" if self.config.lang == 'en': sentence = "Hello and welcome to the speech synthesis service." - logger.info( - "*******************************warm up ********************************" - ) + logger.info("Start to warm up.") for i in range(3): for wav in self.executor.infer( text=sentence, @@ -545,9 +446,6 @@ class TTSEngine(BaseEngine): f"The first response time of the {i} warm up: {self.executor.first_response_time} s" ) break - logger.info( - "**********************************************************************" - ) def preprocess(self, text_bese64: str=None, text_bytes: bytes=None): # Convert byte to text diff --git a/paddlespeech/server/engine/tts/paddleinference/pretrained_models.py b/paddlespeech/server/engine/tts/paddleinference/pretrained_models.py new file mode 100644 index 0000000000000000000000000000000000000000..9618a7a697765f532a172c551b6be733a68a1bec --- /dev/null +++ b/paddlespeech/server/engine/tts/paddleinference/pretrained_models.py @@ -0,0 +1,87 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
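Each TTS engine's init now resolves its device through the same fallback chain before touching the executor: an explicit device from the engine config if present, otherwise whatever paddle reports as the current default. Condensed into a sketch (the ONNX engine reads am_sess_conf/voc_sess_conf, the paddleinference engine below reads am_predictor_conf/voc_predictor_conf; the helper name here is hypothetical):

    import paddle


    def resolve_device(am_conf_device, voc_conf_device):
        # Prefer the am-side setting, then the voc side, then paddle's default.
        if am_conf_device is not None:
            device = am_conf_device
        elif voc_conf_device is not None:
            device = voc_conf_device
        else:
            device = paddle.get_device()
        paddle.set_device(device)  # may raise if the device is busy or invalid
        return device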
+# Static model applied on paddle inference +pretrained_models = { + # speedyspeech + "speedyspeech_csmsc-zh": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_nosil_baker_static_0.5.zip', + 'md5': + 'f10cbdedf47dc7a9668d2264494e1823', + 'model': + 'speedyspeech_csmsc.pdmodel', + 'params': + 'speedyspeech_csmsc.pdiparams', + 'phones_dict': + 'phone_id_map.txt', + 'tones_dict': + 'tone_id_map.txt', + 'sample_rate': + 24000, + }, + # fastspeech2 + "fastspeech2_csmsc-zh": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_static_0.4.zip', + 'md5': + '9788cd9745e14c7a5d12d32670b2a5a7', + 'model': + 'fastspeech2_csmsc.pdmodel', + 'params': + 'fastspeech2_csmsc.pdiparams', + 'phones_dict': + 'phone_id_map.txt', + 'sample_rate': + 24000, + }, + # pwgan + "pwgan_csmsc-zh": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_static_0.4.zip', + 'md5': + 'e3504aed9c5a290be12d1347836d2742', + 'model': + 'pwgan_csmsc.pdmodel', + 'params': + 'pwgan_csmsc.pdiparams', + 'sample_rate': + 24000, + }, + # mb_melgan + "mb_melgan_csmsc-zh": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_static_0.1.1.zip', + 'md5': + 'ac6eee94ba483421d750433f4c3b8d36', + 'model': + 'mb_melgan_csmsc.pdmodel', + 'params': + 'mb_melgan_csmsc.pdiparams', + 'sample_rate': + 24000, + }, + # hifigan + "hifigan_csmsc-zh": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_static_0.1.1.zip', + 'md5': + '7edd8c436b3a5546b3a7cb8cff9d5a0c', + 'model': + 'hifigan_csmsc.pdmodel', + 'params': + 'hifigan_csmsc.pdiparams', + 'sample_rate': + 24000, + }, +} diff --git a/paddlespeech/server/engine/tts/paddleinference/tts_engine.py b/paddlespeech/server/engine/tts/paddleinference/tts_engine.py index db8813ba901a93fa935ce003b8a7abdeec245485..f1ce8b76e2eacd378ccb8657486716ffb5ad4036 100644 --- a/paddlespeech/server/engine/tts/paddleinference/tts_engine.py +++ b/paddlespeech/server/engine/tts/paddleinference/tts_engine.py @@ -23,10 +23,9 @@ import paddle import soundfile as sf from scipy.io import wavfile +from .pretrained_models import pretrained_models from paddlespeech.cli.log import logger from paddlespeech.cli.tts.infer import TTSExecutor -from paddlespeech.cli.utils import download_and_decompress -from paddlespeech.cli.utils import MODEL_HOME from paddlespeech.server.engine.base_engine import BaseEngine from paddlespeech.server.utils.audio_process import change_speed from paddlespeech.server.utils.errors import ErrorCode @@ -38,101 +37,11 @@ from paddlespeech.t2s.frontend.zh_frontend import Frontend __all__ = ['TTSEngine'] -# Static model applied on paddle inference -pretrained_models = { - # speedyspeech - "speedyspeech_csmsc-zh": { - 'url': - 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_nosil_baker_static_0.5.zip', - 'md5': - 'f10cbdedf47dc7a9668d2264494e1823', - 'model': - 'speedyspeech_csmsc.pdmodel', - 'params': - 'speedyspeech_csmsc.pdiparams', - 'phones_dict': - 'phone_id_map.txt', - 'tones_dict': - 'tone_id_map.txt', - 'sample_rate': - 24000, - }, - # fastspeech2 - "fastspeech2_csmsc-zh": { - 'url': - 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_static_0.4.zip', - 'md5': - '9788cd9745e14c7a5d12d32670b2a5a7', - 'model': - 'fastspeech2_csmsc.pdmodel', - 'params': - 'fastspeech2_csmsc.pdiparams', - 
'phones_dict': - 'phone_id_map.txt', - 'sample_rate': - 24000, - }, - # pwgan - "pwgan_csmsc-zh": { - 'url': - 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_static_0.4.zip', - 'md5': - 'e3504aed9c5a290be12d1347836d2742', - 'model': - 'pwgan_csmsc.pdmodel', - 'params': - 'pwgan_csmsc.pdiparams', - 'sample_rate': - 24000, - }, - # mb_melgan - "mb_melgan_csmsc-zh": { - 'url': - 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_static_0.1.1.zip', - 'md5': - 'ac6eee94ba483421d750433f4c3b8d36', - 'model': - 'mb_melgan_csmsc.pdmodel', - 'params': - 'mb_melgan_csmsc.pdiparams', - 'sample_rate': - 24000, - }, - # hifigan - "hifigan_csmsc-zh": { - 'url': - 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_static_0.1.1.zip', - 'md5': - '7edd8c436b3a5546b3a7cb8cff9d5a0c', - 'model': - 'hifigan_csmsc.pdmodel', - 'params': - 'hifigan_csmsc.pdiparams', - 'sample_rate': - 24000, - }, -} - class TTSServerExecutor(TTSExecutor): def __init__(self): super().__init__() - pass - - def _get_pretrained_path(self, tag: str) -> os.PathLike: - """ - Download and returns pretrained resources path of current task. - """ - assert tag in pretrained_models, 'Can not find pretrained resources of {}.'.format( - tag) - - res_path = os.path.join(MODEL_HOME, tag) - decompressed_path = download_and_decompress(pretrained_models[tag], - res_path) - decompressed_path = os.path.abspath(decompressed_path) - logger.info( - 'Use pretrained model stored in: {}'.format(decompressed_path)) - return decompressed_path + self.pretrained_models = pretrained_models def _init_from_path( self, @@ -161,14 +70,14 @@ class TTSServerExecutor(TTSExecutor): if am_model is None or am_params is None or phones_dict is None: am_res_path = self._get_pretrained_path(am_tag) self.am_res_path = am_res_path - self.am_model = os.path.join(am_res_path, - pretrained_models[am_tag]['model']) - self.am_params = os.path.join(am_res_path, - pretrained_models[am_tag]['params']) + self.am_model = os.path.join( + am_res_path, self.pretrained_models[am_tag]['model']) + self.am_params = os.path.join( + am_res_path, self.pretrained_models[am_tag]['params']) # must have phones_dict in acoustic self.phones_dict = os.path.join( - am_res_path, pretrained_models[am_tag]['phones_dict']) - self.am_sample_rate = pretrained_models[am_tag]['sample_rate'] + am_res_path, self.pretrained_models[am_tag]['phones_dict']) + self.am_sample_rate = self.pretrained_models[am_tag]['sample_rate'] logger.info(am_res_path) logger.info(self.am_model) @@ -183,17 +92,17 @@ class TTSServerExecutor(TTSExecutor): # for speedyspeech self.tones_dict = None - if 'tones_dict' in pretrained_models[am_tag]: + if 'tones_dict' in self.pretrained_models[am_tag]: self.tones_dict = os.path.join( - am_res_path, pretrained_models[am_tag]['tones_dict']) + am_res_path, self.pretrained_models[am_tag]['tones_dict']) if tones_dict: self.tones_dict = tones_dict # for multi speaker fastspeech2 self.speaker_dict = None - if 'speaker_dict' in pretrained_models[am_tag]: + if 'speaker_dict' in self.pretrained_models[am_tag]: self.speaker_dict = os.path.join( - am_res_path, pretrained_models[am_tag]['speaker_dict']) + am_res_path, self.pretrained_models[am_tag]['speaker_dict']) if speaker_dict: self.speaker_dict = speaker_dict @@ -202,11 +111,12 @@ class TTSServerExecutor(TTSExecutor): if voc_model is None or voc_params is None: voc_res_path = self._get_pretrained_path(voc_tag) self.voc_res_path = voc_res_path - self.voc_model 
= os.path.join(voc_res_path, - pretrained_models[voc_tag]['model']) - self.voc_params = os.path.join(voc_res_path, - pretrained_models[voc_tag]['params']) - self.voc_sample_rate = pretrained_models[voc_tag]['sample_rate'] + self.voc_model = os.path.join( + voc_res_path, self.pretrained_models[voc_tag]['model']) + self.voc_params = os.path.join( + voc_res_path, self.pretrained_models[voc_tag]['params']) + self.voc_sample_rate = self.pretrained_models[voc_tag][ + 'sample_rate'] logger.info(voc_res_path) logger.info(self.voc_model) logger.info(self.voc_params) @@ -352,8 +262,24 @@ class TTSEngine(BaseEngine): def init(self, config: dict) -> bool: self.executor = TTSServerExecutor() - self.config = config + + try: + if self.config.am_predictor_conf.device is not None: + self.device = self.config.am_predictor_conf.device + elif self.config.voc_predictor_conf.device is not None: + self.device = self.config.voc_predictor_conf.device + else: + self.device = paddle.get_device() + paddle.set_device(self.device) + except BaseException as e: + logger.error( + "Set device failed, please check if device is already used and the parameter 'device' in the yaml file" + ) + logger.error("Initialize TTS server engine Failed on device: %s." % + (self.device)) + return False + self.executor._init_from_path( am=self.config.am, am_model=self.config.am_model, @@ -370,9 +296,35 @@ class TTSEngine(BaseEngine): am_predictor_conf=self.config.am_predictor_conf, voc_predictor_conf=self.config.voc_predictor_conf, ) + # warm up + try: + self.warm_up() + logger.info("Warm up successfully.") + except Exception as e: + logger.error("Failed to warm up on tts engine.") + return False + logger.info("Initialize TTS server engine successfully.") return True + def warm_up(self): + """warm up + """ + if self.config.lang == 'zh': + sentence = "您好,欢迎使用语音合成服务。" + if self.config.lang == 'en': + sentence = "Hello and welcome to the speech synthesis service." + logger.info("Start to warm up.") + for i in range(3): + st = time.time() + self.executor.infer( + text=sentence, + lang=self.config.lang, + am=self.config.am, + spk_id=0, ) + logger.info( + f"The response time of the {i} warm up: {time.time() - st} s") + def postprocess(self, wav, original_fs: int, diff --git a/paddlespeech/server/engine/tts/python/tts_engine.py b/paddlespeech/server/engine/tts/python/tts_engine.py index f153f60b966682fea72418643b29adc38ffa1f07..d0002baa4f46c949e8258a7bea527a18b781b657 100644 --- a/paddlespeech/server/engine/tts/python/tts_engine.py +++ b/paddlespeech/server/engine/tts/python/tts_engine.py @@ -51,15 +51,15 @@ class TTSEngine(BaseEngine): def init(self, config: dict) -> bool: self.executor = TTSServerExecutor() + self.config = config try: - self.config = config - if self.config.device: + if self.config.device is not None: self.device = self.config.device else: self.device = paddle.get_device() paddle.set_device(self.device) - except BaseException: + except BaseException as e: logger.error( "Set device failed, please check if device is already used and the parameter 'device' in the yaml file" ) @@ -87,10 +87,36 @@ class TTSEngine(BaseEngine): (self.device)) return False + # warm up + try: + self.warm_up() + logger.info("Warm up successfully.") + except Exception as e: + logger.error("Failed to warm up on tts engine.") + return False + logger.info("Initialize TTS server engine successfully on device: %s." 
diff --git a/paddlespeech/server/engine/tts/python/tts_engine.py b/paddlespeech/server/engine/tts/python/tts_engine.py
index f153f60b966682fea72418643b29adc38ffa1f07..d0002baa4f46c949e8258a7bea527a18b781b657 100644
--- a/paddlespeech/server/engine/tts/python/tts_engine.py
+++ b/paddlespeech/server/engine/tts/python/tts_engine.py
@@ -51,15 +51,15 @@ class TTSEngine(BaseEngine):

     def init(self, config: dict) -> bool:
         self.executor = TTSServerExecutor()
+        self.config = config

         try:
-            self.config = config
-            if self.config.device:
+            if self.config.device is not None:
                 self.device = self.config.device
             else:
                 self.device = paddle.get_device()
             paddle.set_device(self.device)
-        except BaseException:
+        except BaseException as e:
             logger.error(
                 "Set device failed, please check if device is already used and the parameter 'device' in the yaml file"
             )
@@ -87,10 +87,36 @@
                          (self.device))
             return False

+        # warm up
+        try:
+            self.warm_up()
+            logger.info("Warm up successfully.")
+        except Exception as e:
+            logger.error("Failed to warm up on tts engine.")
+            return False
+
         logger.info("Initialize TTS server engine successfully on device: %s."
                     % (self.device))
         return True

+    def warm_up(self):
+        """warm up
+        """
+        if self.config.lang == 'zh':
+            sentence = "您好,欢迎使用语音合成服务。"
+        elif self.config.lang == 'en':
+            sentence = "Hello and welcome to the speech synthesis service."
+        logger.info("Start to warm up.")
+        for i in range(3):
+            st = time.time()
+            self.executor.infer(
+                text=sentence,
+                lang=self.config.lang,
+                am=self.config.am,
+                spk_id=0, )
+            logger.info(
+                f"The response time of the {i} warm up: {time.time() - st} s")
+
     def postprocess(self,
                     wav,
                     original_fs: int,
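
Both engines now resolve the device the same way: take the yaml `device` field if set, otherwise fall back to whatever Paddle reports, then pin it with `paddle.set_device`. A minimal sketch of that fallback (the `resolve_device` name is ours, not the engine's):

    import paddle

    def resolve_device(configured=None) -> str:
        """Pick the configured device ('cpu' or 'gpu:0'), else let Paddle decide."""
        device = configured if configured is not None else paddle.get_device()
        paddle.set_device(device)  # raises if the device is unavailable
        return device
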
diff --git a/paddlespeech/server/restful/tts_api.py b/paddlespeech/server/restful/tts_api.py
index d1268428a0b53d41fdf9abb8fd7dbff4d485decc..15d618d9324fcda2616d571a4d074ea0876f0fb5 100644
--- a/paddlespeech/server/restful/tts_api.py
+++ b/paddlespeech/server/restful/tts_api.py
@@ -128,7 +128,7 @@ def tts(request_body: TTSRequest):
     return response


-@router.post("/paddlespeech/streaming/tts")
+@router.post("/paddlespeech/tts/streaming")
 async def stream_tts(request_body: TTSRequest):
     text = request_body.text

diff --git a/paddlespeech/server/tests/tts/online/http_client.py b/paddlespeech/server/tests/tts/online/http_client.py
index 756f7b5be204cbd3ae3dd125c0f04a78b9879421..47b781ed9030e55a33a6a8383f83eb1ba61b617d 100644
--- a/paddlespeech/server/tests/tts/online/http_client.py
+++ b/paddlespeech/server/tests/tts/online/http_client.py
@@ -14,6 +14,7 @@
 import argparse

 from paddlespeech.server.utils.audio_handler import TTSHttpHandler
+from paddlespeech.server.utils.util import compute_delay

 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
@@ -43,5 +44,25 @@ if __name__ == "__main__":

     print("tts http client start")
     handler = TTSHttpHandler(args.server, args.port, args.play)
-    handler.run(args.text, args.spk_id, args.speed, args.volume,
-                args.sample_rate, args.output)
+    first_response, final_response, duration, save_audio_success, receive_time_list, chunk_duration_list = handler.run(
+        args.text, args.spk_id, args.speed, args.volume, args.sample_rate,
+        args.output)
+    delay_time_list = compute_delay(receive_time_list, chunk_duration_list)
+
+    print(f"sentence: {args.text}")
+    print(f"duration: {duration} s")
+    print(f"first response: {first_response} s")
+    print(f"final response: {final_response} s")
+    print(f"RTF: {final_response/duration}")
+    if args.output is not None:
+        if save_audio_success:
+            print(f"Audio successfully saved in {args.output}")
+        else:
+            print("Audio save failed.")
+
+    if delay_time_list != []:
+        print(
+            f"Delay situation: total number of packages: {len(receive_time_list)}, the number of delayed packets: {len(delay_time_list)}, minimum delay time: {min(delay_time_list)} s, maximum delay time: {max(delay_time_list)} s, average delay time: {sum(delay_time_list)/len(delay_time_list)} s, delay rate: {len(delay_time_list)/len(receive_time_list)}"
+        )
+    else:
+        print("The sentence has no delay in streaming synthesis.")
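
The metrics printed by the client relate as follows: `first response` is the time from request to the first audio packet, `final response` the time to the last packet, and RTF divides the latter by the audio `duration`. With illustrative numbers (not measurements):

    # A 5.0 s utterance whose last packet arrives 1.5 s after the request:
    first_response = 0.2   # s, latency until audio starts flowing
    final_response = 1.5   # s, latency until synthesis is complete
    duration = 5.0         # s, length of the synthesized audio

    rtf = final_response / duration
    print(rtf)  # 0.3 -- RTF < 1.0 means synthesis runs faster than real time
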
diff --git a/paddlespeech/server/tests/tts/online/ws_client.py b/paddlespeech/server/tests/tts/online/ws_client.py
index 821d82a9a6e254987cb4774a6a7e50f8b076b3d2..0b1794c8aaef4dc3af3cea3f80b9166548a7a39c 100644
--- a/paddlespeech/server/tests/tts/online/ws_client.py
+++ b/paddlespeech/server/tests/tts/online/ws_client.py
@@ -15,6 +15,7 @@ import argparse
 import asyncio

 from paddlespeech.server.utils.audio_handler import TTSWsHandler
+from paddlespeech.server.utils.util import compute_delay

 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
@@ -35,4 +36,24 @@ if __name__ == "__main__":
     print("tts websocket client start")
     handler = TTSWsHandler(args.server, args.port, args.play)
     loop = asyncio.get_event_loop()
-    loop.run_until_complete(handler.run(args.text, args.output))
+    first_response, final_response, duration, save_audio_success, receive_time_list, chunk_duration_list = loop.run_until_complete(
+        handler.run(args.text, args.output))
+    delay_time_list = compute_delay(receive_time_list, chunk_duration_list)
+
+    print(f"sentence: {args.text}")
+    print(f"duration: {duration} s")
+    print(f"first response: {first_response} s")
+    print(f"final response: {final_response} s")
+    print(f"RTF: {final_response/duration}")
+    if args.output is not None:
+        if save_audio_success:
+            print(f"Audio successfully saved in {args.output}")
+        else:
+            print("Audio save failed.")
+
+    if delay_time_list != []:
+        print(
+            f"Delay situation: total number of packages: {len(receive_time_list)}, the number of delayed packets: {len(delay_time_list)}, minimum delay time: {min(delay_time_list)} s, maximum delay time: {max(delay_time_list)} s, average delay time: {sum(delay_time_list)/len(delay_time_list)} s, delay rate: {len(delay_time_list)/len(receive_time_list)}"
+        )
+    else:
+        print("The sentence has no delay in streaming synthesis.")
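
Both handlers convert each received packet to playable seconds with `len(audio) / 2.0 / 24000`: the streamed payload is 16-bit (2 bytes per sample) mono PCM at the 24 kHz rate of the csmsc models. The same arithmetic as a standalone helper:

    SAMPLE_RATE = 24000   # Hz, fixed by the csmsc acoustic/vocoder models
    BYTES_PER_SAMPLE = 2  # 16-bit mono PCM

    def chunk_seconds(audio: bytes) -> float:
        """Duration of one raw PCM chunk, as appended to chunk_duration_list."""
        return len(audio) / BYTES_PER_SAMPLE / SAMPLE_RATE

    print(chunk_seconds(b"\x00" * 48000))  # 1.0 -- 48000 bytes is 1 s of audio
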
diff --git a/paddlespeech/server/utils/audio_handler.py b/paddlespeech/server/utils/audio_handler.py
index a088929f2ec60c8cbee79feb06fb6d914c5b2476..75f4a10bebe3db1759bfe948a471087b7abe3ec2 100644
--- a/paddlespeech/server/utils/audio_handler.py
+++ b/paddlespeech/server/utils/audio_handler.py
@@ -259,7 +259,8 @@
         """
         self.server = server
         self.port = port
-        self.url = "ws://" + self.server + ":" + str(self.port) + "/ws/tts"
+        self.url = "ws://" + self.server + ":" + str(
+            self.port) + "/paddlespeech/tts/streaming"
         self.play = play
         if self.play:
             import pyaudio
@@ -295,6 +296,8 @@ class TTSWsHandler:
             output (str): save audio path
         """
         all_bytes = b''
+        receive_time_list = []
+        chunk_duration_list = []

         # 1. Send websocket handshake protocal
         async with websockets.connect(self.url) as ws:
@@ -309,14 +312,15 @@
             # 3. Process the received response
             message = await ws.recv()
-            logger.info(f"句子:{text}")
-            logger.info(f"首包响应:{time.time() - st} s")
+            first_response = time.time() - st
             message = json.loads(message)
             status = message["status"]

             while (status == 1):
+                receive_time_list.append(time.time())
                 audio = message["audio"]
                 audio = base64.b64decode(audio)  # bytes
+                chunk_duration_list.append(len(audio) / 2.0 / 24000)
                 all_bytes += audio
                 if self.play:
                     self.mutex.acquire()
@@ -334,15 +338,11 @@
             if status == 2:
                 final_response = time.time() - st
                 duration = len(all_bytes) / 2.0 / 24000
-                logger.info(f"尾包响应:{final_response} s")
-                logger.info(f"音频时长:{duration} s")
-                logger.info(f"RTF: {final_response / duration}")

                 if output is not None:
-                    if save_audio(all_bytes, output):
-                        logger.info(f"音频保存至:{output}")
-                    else:
-                        logger.error("save audio error")
+                    save_audio_success = save_audio(all_bytes, output)
+                else:
+                    save_audio_success = False
             else:
                 logger.error("infer error")

@@ -352,6 +352,8 @@
             self.stream.close()
             self.p.terminate()

+        return first_response, final_response, duration, save_audio_success, receive_time_list, chunk_duration_list
+

 class TTSHttpHandler:
     def __init__(self, server="127.0.0.1", port=8092, play: bool=False):
@@ -365,7 +367,7 @@
         self.server = server
         self.port = port
         self.url = "http://" + str(self.server) + ":" + str(
-            self.port) + "/paddlespeech/streaming/tts"
+            self.port) + "/paddlespeech/tts/streaming"
         self.play = play

         if self.play:
@@ -423,13 +425,16 @@
         all_bytes = b''
         first_flag = 1
+        receive_time_list = []
+        chunk_duration_list = []

         # 2. Send request
         st = time.time()
         html = requests.post(self.url, json.dumps(params), stream=True)

         # 3. Process the received response
-        for chunk in html.iter_content(chunk_size=1024):
+        for chunk in html.iter_content(chunk_size=None):
+            receive_time_list.append(time.time())
             audio = base64.b64decode(chunk)  # bytes
             if first_flag:
                 first_response = time.time() - st
@@ -443,21 +448,15 @@
                     self.t.start()
                     self.start_play = False
             all_bytes += audio
+            chunk_duration_list.append(len(audio) / 2.0 / 24000)

         final_response = time.time() - st
         duration = len(all_bytes) / 2.0 / 24000

-        logger.info(f"句子:{text}")
-        logger.info(f"首包响应:{first_response} s")
-        logger.info(f"尾包响应:{final_response} s")
-        logger.info(f"音频时长:{duration} s")
-        logger.info(f"RTF: {final_response / duration}")
-
         if output is not None:
-            if save_audio(all_bytes, output):
-                logger.info(f"音频保存至:{output}")
-            else:
-                logger.error("save audio error")
+            save_audio_success = save_audio(all_bytes, output)
+        else:
+            save_audio_success = False

         if self.play:
             self.t.join()
@@ -465,6 +464,8 @@
             self.stream.close()
             self.p.terminate()

+        return first_response, final_response, duration, save_audio_success, receive_time_list, chunk_duration_list
+

 class VectorHttpHandler:
     def __init__(self, server_ip=None, port=None):
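
The switch from `iter_content(chunk_size=1024)` to `chunk_size=None` matters here: with `stream=True`, `chunk_size=None` yields data in the chunks in which it arrives, so each iteration sees one complete base64 payload as flushed by the server, whereas a fixed 1024-byte window can split a payload, corrupting `b64decode` and the per-packet timing. A minimal sketch of the consuming side (URL and request body are placeholders, not the full parameter set):

    import base64
    import requests

    url = "http://127.0.0.1:8092/paddlespeech/tts/streaming"
    resp = requests.post(url, json={"text": "您好"}, stream=True)

    pcm = b""
    for chunk in resp.iter_content(chunk_size=None):
        # Each chunk is one server-sent unit, so it decodes cleanly.
        pcm += base64.b64decode(chunk)
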
diff --git a/paddlespeech/server/utils/util.py b/paddlespeech/server/utils/util.py
index 72ee0060e246d437052b916362b2b55b1946fc65..061b213c78360d523d1cc3cc180f93cfaac387ab 100644
--- a/paddlespeech/server/utils/util.py
+++ b/paddlespeech/server/utils/util.py
@@ -75,3 +75,74 @@ def get_chunks(data, block_size, pad_size, step):
     else:
         print("Please set correct type to get chunks, am or voc")
     return chunks
+
+
+def compute_delay(receive_time_list, chunk_duration_list):
+    """compute delay
+    Args:
+        receive_time_list (list): Time to receive each packet
+        chunk_duration_list (list): The audio duration corresponding to each packet
+    Returns:
+        [list]: Delay time list
+    """
+    assert (len(receive_time_list) == len(chunk_duration_list))
+    delay_time_list = []
+    play_time = receive_time_list[0] + chunk_duration_list[0]
+    for i in range(1, len(receive_time_list)):
+        receive_time = receive_time_list[i]
+        delay_time = receive_time - play_time
+        # delayed: the packet arrived after the previous audio finished playing
+        if delay_time > 0:
+            play_time = play_time + delay_time + chunk_duration_list[i]
+            delay_time_list.append(delay_time)
+        # not delayed
+        else:
+            play_time = play_time + chunk_duration_list[i]
+
+    return delay_time_list
+
+
+def count_engine(logfile: str="./nohup.out"):
+    """Compute engine-side inference statistics from the server log.
+    Args:
+        logfile (str, optional): server log. Defaults to "./nohup.out".
+    """
+    first_response_list = []
+    final_response_list = []
+    duration_list = []
+
+    with open(logfile, "r") as f:
+        for line in f.readlines():
+            if "- first response time:" in line:
+                first_response = float(line.split(" ")[-2])
+                first_response_list.append(first_response)
+            elif "- final response time:" in line:
+                final_response = float(line.split(" ")[-2])
+                final_response_list.append(final_response)
+            elif "- The durations of audio is:" in line:
+                duration = float(line.split(" ")[-2])
+                duration_list.append(duration)
+
+    assert (len(first_response_list) == len(final_response_list) and
+            len(final_response_list) == len(duration_list))
+
+    avg_first_response = sum(first_response_list) / len(first_response_list)
+    avg_final_response = sum(final_response_list) / len(final_response_list)
+    avg_duration = sum(duration_list) / len(duration_list)
+    RTF = sum(final_response_list) / sum(duration_list)
+
+    print(
+        "************************* engine result ***************************************"
+    )
+    print(
+        f"test num: {len(duration_list)}, avg first response: {avg_first_response} s, avg final response: {avg_final_response} s, avg duration: {avg_duration}, RTF: {RTF}"
+    )
+    print(
+        f"min duration: {min(duration_list)} s, max duration: {max(duration_list)} s"
+    )
+    print(
+        f"max first response: {max(first_response_list)} s, min first response: {min(first_response_list)} s"
+    )
+    print(
+        f"max final response: {max(final_response_list)} s, min final response: {min(final_response_list)} s"
+    )
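
`compute_delay` models gapless playback: the play head starts when the first packet ends, each later packet must arrive before the play head reaches it, and a late packet both counts as a delay and pushes the play head back. A worked run with illustrative numbers:

    from paddlespeech.server.utils.util import compute_delay

    # Four packets, each carrying 0.45 s of audio.
    receive_time_list = [0.20, 0.40, 1.30, 1.60]    # arrival times (s)
    chunk_duration_list = [0.45, 0.45, 0.45, 0.45]  # playable audio (s)

    # Play head after packet 0: 0.20 + 0.45 = 0.65 s.
    # Packet 1 arrives at 0.40 < 0.65 -> no delay, play head moves to 1.10 s.
    # Packet 2 arrives at 1.30 > 1.10 -> 0.20 s stall, play head moves to 1.75 s.
    # Packet 3 arrives at 1.60 < 1.75 -> no delay.
    print(compute_delay(receive_time_list, chunk_duration_list))
    # [0.2] -- one stall of about 0.2 s
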
diff --git a/paddlespeech/server/ws/tts_socket.py b/paddlespeech/server/ws/tts_socket.py
index 699ee412bb43a2b8f39d164e96360afd88cda689..482aeb79b0dc36aa028f3fa9be44205926dcd8a9 100644
--- a/paddlespeech/server/ws/tts_socket.py
+++ b/paddlespeech/server/ws/tts_socket.py
@@ -24,7 +24,7 @@ from paddlespeech.server.engine.engine_pool import get_engine_pool
 router = APIRouter()


-@router.websocket('/ws/tts')
+@router.websocket('/paddlespeech/tts/streaming')
 async def websocket_endpoint(websocket: WebSocket):
     await websocket.accept()

diff --git a/tests/unit/server/offline/change_yaml.py b/tests/unit/server/offline/change_yaml.py
index cdeaebdbcf35b8c4345d1bdaca5ccfa6ebffbc1b..d51a6259178c981f30d8864c64682d257e2eb1cd 100644
--- a/tests/unit/server/offline/change_yaml.py
+++ b/tests/unit/server/offline/change_yaml.py
@@ -19,7 +19,7 @@ def change_device(yamlfile: str, engine: str, device: str):
     if device == 'cpu':
         set_device = 'cpu'
     elif device == 'gpu':
-        set_device = 'gpu:0'
+        set_device = 'gpu:3'
     else:
         print("Please set correct device: cpu or gpu.")

diff --git a/tests/unit/server/offline/conf/application.yaml b/tests/unit/server/offline/conf/application.yaml
index 2b1a05998083e08377d63ee02bc77323a7c4dce5..762f4af6e952fad3c671b452899584ffcfe81aeb 100644
--- a/tests/unit/server/offline/conf/application.yaml
+++ b/tests/unit/server/offline/conf/application.yaml
@@ -1,4 +1,4 @@
-# This is the parameter configuration file for PaddleSpeech Serving.
+# This is the parameter configuration file for PaddleSpeech Offline Serving.

 #################################################################################
 #                             SERVER SETTING                                    #
@@ -7,8 +7,8 @@ host: 127.0.0.1
 port: 8090

 # The task format in the engine_list is: <speech task>_<engine type>
-# task choices = ['asr_python', 'asr_inference', 'tts_python', 'tts_inference']
-
+# task choices = ['asr_python', 'asr_inference', 'tts_python', 'tts_inference', 'cls_python', 'cls_inference']
+protocol: 'http'
 engine_list: ['asr_python', 'tts_python', 'cls_python']

diff --git a/tests/unit/server/offline/test_server_client.sh b/tests/unit/server/offline/test_server_client.sh
index e7ae7604d177f4fa0f654a10e64e5eb9ba45669e..875008a75f5ac903f3d024ba1250fd653b4c7ebd 100644
--- a/tests/unit/server/offline/test_server_client.sh
+++ b/tests/unit/server/offline/test_server_client.sh
@@ -21,6 +21,8 @@ StartService(){
 }

 ClientTest(){
+    echo "server_ip: $server_ip"
+    echo "port: $port"
     # Client test
     # test asr client
     paddlespeech_client asr --server_ip $server_ip --port $port --input ./zh.wav
@@ -39,6 +41,7 @@ ClientTest(){
     ((test_times+=1))
     paddlespeech_client cls --server_ip $server_ip --port $port --input ./zh.wav
     ((test_times+=1))
+
 }

 GetTestResult() {
@@ -58,6 +61,7 @@ rm -rf log/server.log.wf
 rm -rf log/server.log
 rm -rf log/test_result.log

+cp ../../../../demos/speech_server/conf/application.yaml ./conf/
 config_file=./conf/application.yaml
 server_ip=$(cat $config_file | grep "host" | awk -F " " '{print $2}')
 port=$(cat $config_file | grep "port" | awk '/port:/ {print $2}')
@@ -191,5 +195,4 @@ echo "***************** Here are all the test results ********************"
 cat ./log/test_result.log

 # Restoring conf is the same as demos/speech_server
-rm -rf ./conf
-cp ../../../demos/speech_server/conf/ ./ -rf
\ No newline at end of file
+cp ../../../../demos/speech_server/conf/application.yaml ./conf/
- """ - first_response_list = [] - final_response_list = [] - duration_list = [] + output = str(args.output_dir + "/" + utt_id + ".wav") + if args.protocol == "http": + print("tts http client start") + from paddlespeech.server.utils.audio_handler import TTSHttpHandler + handler = TTSHttpHandler(args.server_ip, args.port, args.play) + first_response, final_response, duration, save_audio_success, receive_time_list, chunk_duration_list = handler.run( + text, args.spk_id, args.speed, args.volume, args.sample_rate, + output) + + elif args.protocol == "websocket": + from paddlespeech.server.utils.audio_handler import TTSWsHandler + print("tts websocket client start") + handler = TTSWsHandler(args.server_ip, args.port, args.play) + loop = asyncio.get_event_loop() + first_response, final_response, duration, save_audio_success, receive_time_list, chunk_duration_list = loop.run_until_complete( + handler.run(text, output)) - with open(logfile, "r") as f: - for line in f.readlines(): - if "- first response time:" in line: - first_response = float(line.splie(" ")[-2]) - first_response_list.append(first_response) - elif "- final response time:" in line: - final_response = float(line.splie(" ")[-2]) - final_response_list.append(final_response) - elif "- The durations of audio is:" in line: - duration = float(line.splie(" ")[-2]) - duration_list.append(duration) + else: + print("Please set correct protocol, http or websocket") - assert (len(first_response_list) == len(final_response_list) and - len(final_response_list) == len(duration_list)) - - avg_first_response = sum(first_response_list) / len(first_response_list) - avg_final_response = sum(final_response_list) / len(final_response_list) - avg_duration = sum(duration_list) / len(duration_list) - RTF = sum(final_response_list) / sum(duration_list) - - print( - "************************* engine result ***************************************" - ) - print( - f"test num: {len(duration_list)}, avg first response: {avg_first_response} s, avg final response: {avg_final_response} s, avg duration: {avg_duration}, RTF: {RTF}" - ) - print( - f"min duration: {min(duration_list)} s, max duration: {max(duration_list)} s" - ) - print( - f"max first response: {max(first_response_list)} s, min first response: {min(first_response_list)} s" - ) - print( - f"max final response: {max(final_response_list)} s, min final response: {min(final_response_list)} s" - ) + return first_response, final_response, duration, save_audio_success, receive_time_list, chunk_duration_list if __name__ == "__main__": @@ -142,10 +60,18 @@ if __name__ == "__main__": default=0, help='Sampling rate, the default is the same as the model') parser.add_argument( - "--server", type=str, help="server ip", default="127.0.0.1") + "--server_ip", type=str, help="server ip", default="127.0.0.1") parser.add_argument("--port", type=int, help="server port", default=8092) + parser.add_argument( + "--protocol", + type=str, + choices=['http', 'websocket'], + help="server protocol", + default="http") parser.add_argument( "--output_dir", type=str, default="./output", help="output dir") + parser.add_argument( + "--play", type=bool, help="whether to play audio", default=False) args = parser.parse_args() @@ -155,13 +81,35 @@ if __name__ == "__main__": first_response_list = [] final_response_list = [] duration_list = [] + all_delay_list = [] + packet_count = 0.0 sentences = get_sentences(text_file=args.text, lang="zh") for utt_id, sentence in sentences: - first_response, final_response, duration = test(args, sentence, 
diff --git a/tests/unit/server/online/tts/test_server/test_http_client.py b/tests/unit/server/online/tts/test_server/test_http_client.py
index 96372ab37c141825d7d59d79f876ab6dccd22b9e..7fdb4e00c45df002138732314c884e204ea567fc 100644
--- a/tests/unit/server/online/tts/test_server/test_http_client.py
+++ b/tests/unit/server/online/tts/test_server/test_http_client.py
@@ -12,117 +12,35 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import argparse
-import base64
-import json
+import asyncio
 import os
-import time
-
-import requests
-
-from paddlespeech.server.utils.audio_process import pcm2wav
+from paddlespeech.server.utils.util import compute_delay
 from paddlespeech.t2s.exps.syn_utils import get_sentences


-def save_audio(buffer, audio_path) -> bool:
-    if audio_path.endswith("pcm"):
-        with open(audio_path, "wb") as f:
-            f.write(buffer)
-    elif audio_path.endswith("wav"):
-        with open("./tmp.pcm", "wb") as f:
-            f.write(buffer)
-        pcm2wav("./tmp.pcm", audio_path, channels=1, bits=16, sample_rate=24000)
-        os.system("rm ./tmp.pcm")
-    else:
-        print("Only supports saved audio format is pcm or wav")
-        return False
-
-    return True
-
-
 def test(args, text, utt_id):
-    params = {
-        "text": text,
-        "spk_id": args.spk_id,
-        "speed": args.speed,
-        "volume": args.volume,
-        "sample_rate": args.sample_rate,
-        "save_path": ''
-    }
-
-    buffer = b''
-    flag = 1
-    url = "http://" + str(args.server) + ":" + str(
-        args.port) + "/paddlespeech/streaming/tts"
-    st = time.time()
-    html = requests.post(url, json.dumps(params), stream=True)
-    for chunk in html.iter_content(chunk_size=1024):
-        chunk = base64.b64decode(chunk)  # bytes
-        if flag:
-            first_response = time.time() - st
-            print(f"首包响应:{first_response} s")
-            flag = 0
-        buffer += chunk
-
-    final_response = time.time() - st
-    duration = len(buffer) / 2.0 / 24000
-
-    print(f"sentence: {text}")
-    print(f"尾包响应:{final_response} s")
-    print(f"音频时长:{duration} s")
-    print(f"RTF: {final_response / duration}")
-
-    save_path = str(args.output_dir + "/" + utt_id + ".wav")
-    save_audio(buffer, save_path)
-    print("音频保存至:", save_path)
-
-    return first_response, final_response, duration
-
-
-def count_engine(logfile: str="./nohup.out"):
-    """For inference on the statistical engine side
-
-    Args:
-        logfile (str, optional): server log. Defaults to "./nohup.out".
-    """
-    first_response_list = []
-    final_response_list = []
-    duration_list = []
+    output = str(args.output_dir + "/" + utt_id + ".wav")
+    if args.protocol == "http":
+        print("tts http client start")
+        from paddlespeech.server.utils.audio_handler import TTSHttpHandler
+        handler = TTSHttpHandler(args.server_ip, args.port, args.play)
+        first_response, final_response, duration, save_audio_success, receive_time_list, chunk_duration_list = handler.run(
+            text, args.spk_id, args.speed, args.volume, args.sample_rate,
+            output)
+
+    elif args.protocol == "websocket":
+        from paddlespeech.server.utils.audio_handler import TTSWsHandler
+        print("tts websocket client start")
+        handler = TTSWsHandler(args.server_ip, args.port, args.play)
+        loop = asyncio.get_event_loop()
+        first_response, final_response, duration, save_audio_success, receive_time_list, chunk_duration_list = loop.run_until_complete(
+            handler.run(text, output))

-    with open(logfile, "r") as f:
-        for line in f.readlines():
-            if "- first response time:" in line:
-                first_response = float(line.splie(" ")[-2])
-                first_response_list.append(first_response)
-            elif "- final response time:" in line:
-                final_response = float(line.splie(" ")[-2])
-                final_response_list.append(final_response)
-            elif "- The durations of audio is:" in line:
-                duration = float(line.splie(" ")[-2])
-                duration_list.append(duration)
+    else:
+        raise ValueError("Please set correct protocol, http or websocket")

-    assert (len(first_response_list) == len(final_response_list) and
-            len(final_response_list) == len(duration_list))
-
-    avg_first_response = sum(first_response_list) / len(first_response_list)
-    avg_final_response = sum(final_response_list) / len(final_response_list)
-    avg_duration = sum(duration_list) / len(duration_list)
-    RTF = sum(final_response_list) / sum(duration_list)
-
-    print(
-        "************************* engine result ***************************************"
-    )
-    print(
-        f"test num: {len(duration_list)}, avg first response: {avg_first_response} s, avg final response: {avg_final_response} s, avg duration: {avg_duration}, RTF: {RTF}"
-    )
-    print(
-        f"min duration: {min(duration_list)} s, max duration: {max(duration_list)} s"
-    )
-    print(
-        f"max first response: {max(first_response_list)} s, min first response: {min(first_response_list)} s"
-    )
-    print(
-        f"max final response: {max(final_response_list)} s, min final response: {min(final_response_list)} s"
-    )
+    return first_response, final_response, duration, save_audio_success, receive_time_list, chunk_duration_list


 if __name__ == "__main__":
@@ -142,10 +60,18 @@ if __name__ == "__main__":
         default=0,
         help='Sampling rate, the default is the same as the model')
     parser.add_argument(
-        "--server", type=str, help="server ip", default="127.0.0.1")
+        "--server_ip", type=str, help="server ip", default="127.0.0.1")
     parser.add_argument("--port", type=int, help="server port", default=8092)
+    parser.add_argument(
+        "--protocol",
+        type=str,
+        choices=['http', 'websocket'],
+        help="server protocol",
+        default="http")
     parser.add_argument(
         "--output_dir", type=str, default="./output", help="output dir")
+    parser.add_argument(
+        "--play", type=bool, help="whether to play audio", default=False)

     args = parser.parse_args()

@@ -155,13 +81,35 @@ if __name__ == "__main__":
     first_response_list = []
     final_response_list = []
     duration_list = []
+    all_delay_list = []
+    packet_count = 0.0

     sentences = get_sentences(text_file=args.text, lang="zh")
     for utt_id, sentence in sentences:
-        first_response, final_response, duration = test(args, sentence,
-                                                        utt_id)
+        first_response, final_response, duration, save_audio_success, receive_time_list, chunk_duration_list = test(
+            args, sentence, utt_id)
+        delay_time_list = compute_delay(receive_time_list, chunk_duration_list)
         first_response_list.append(first_response)
         final_response_list.append(final_response)
         duration_list.append(duration)
+        packet_count += len(receive_time_list)
+
+        print(f"sentence: {sentence}")
+        print(f"first response time: {first_response} s")
+        print(f"final response time: {final_response} s")
+        print(f"audio duration: {duration} s")
+        print(f"RTF of this sentence: {final_response/duration}")
+
+        if delay_time_list != []:
+            for t in delay_time_list:
+                all_delay_list.append(t)
+            print(
+                f"Streaming delay of this sentence: total number of packets: {len(receive_time_list)}, the number of delayed packets: {len(delay_time_list)}, minimum delay time: {min(delay_time_list)} s, maximum delay time: {max(delay_time_list)} s, average delay time: {sum(delay_time_list)/len(delay_time_list)} s, delay rate: {len(delay_time_list)/len(receive_time_list)}"
+            )
+        else:
+            print("This sentence has no delay in streaming synthesis.")

     assert (len(first_response_list) == len(final_response_list) and
             len(final_response_list) == len(duration_list))
@@ -170,19 +118,35 @@ if __name__ == "__main__":
     avg_final_response = sum(final_response_list) / len(final_response_list)
     avg_duration = sum(duration_list) / len(duration_list)
     RTF = sum(final_response_list) / sum(duration_list)
+    if all_delay_list != []:
+        delay_count = len(all_delay_list)
+        avg_delay = sum(all_delay_list) / len(all_delay_list)
+        delay_ratio = len(all_delay_list) / packet_count
+        min_delay = min(all_delay_list)
+        max_delay = max(all_delay_list)
+    else:
+        delay_count = 0.0
+        avg_delay = 0.0
+        delay_ratio = 0.0
+        min_delay = 0.0
+        max_delay = 0.0

     print(
         "************************* server/client result ***************************************"
     )
     print(
-        f"test num: {len(duration_list)}, avg first response: {avg_first_response} s, avg final response: {avg_final_response} s, avg duration: {avg_duration}, RTF: {RTF}"
+        f"test num: {len(duration_list)}, avg first response: {avg_first_response} s, avg final response: {avg_final_response} s, avg duration: {avg_duration}, RTF: {RTF}."
+    )
+    print(
+        f"test num: {len(duration_list)}, packet count: {packet_count}, delay count: {delay_count}, avg delay time: {avg_delay} s, delay ratio: {delay_ratio}"
     )
     print(
         f"min duration: {min(duration_list)} s, max duration: {max(duration_list)} s"
     )
     print(
-        f"max first response: {max(first_response_list)} s, min first response: {min(first_response_list)} s"
+        f"min first response: {min(first_response_list)} s, max first response: {max(first_response_list)} s."
     )
     print(
-        f"max final response: {max(final_response_list)} s, min final response: {min(final_response_list)} s"
+        f"min final response: {min(final_response_list)} s, max final response: {max(final_response_list)} s."
     )
+    print(f"min delay: {min_delay} s, max delay: {max_delay} s")