diff --git a/docs/source/tts/models_introduction.md b/docs/source/tts/models_introduction.md
index edf6f312c260a946cae09f754ccef3541d618266..202fe491ad500c9804faf22ada1d109e16bab2cd 100644
--- a/docs/source/tts/models_introduction.md
+++ b/docs/source/tts/models_introduction.md
@@ -251,8 +251,10 @@ Vocoders based on neural networks usually is speech synthesis, which learns the
 - GAN
     - WaveGAN
     - **Parallel WaveGAN**
-    - MelGAN
-    - HiFi-GAN
+    - **MelGAN**
+    - **Style MelGAN**
+    - **Multi Band MelGAN**
+    - **HiFi-GAN**
 - VAE
     - Wave-VAE
 - Diffusion
diff --git a/examples/csmsc/tts3/README.md b/examples/csmsc/tts3/README.md
index cb7d49f900050f0cde1bf0ecea1bac29d52722cf..488ede2ec132f43afbaf4d2ccb2c01b4c11db76c 100644
--- a/examples/csmsc/tts3/README.md
+++ b/examples/csmsc/tts3/README.md
@@ -203,7 +203,9 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/inference.sh ${train_output_path}
 ```
 
 ## Pretrained Model
-Pretrained FastSpeech2 model with no silence in the edge of audios [fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip).
+Pretrained FastSpeech2 models with no silence at the edges of audios:
+- [fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip)
+- [fastspeech2_conformer_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_conformer_baker_ckpt_0.5.zip)
 
 Static model can be downloaded here [fastspeech2_nosil_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_static_0.4.zip).
 
diff --git a/paddlespeech/cli/README.md b/paddlespeech/cli/README.md
index 56afb939c2a36f998a70993f44f4d402f6538a3d..264d66f72b44f091737a003278518826e6987159 100644
--- a/paddlespeech/cli/README.md
+++ b/paddlespeech/cli/README.md
@@ -7,3 +7,6 @@
 
  ## ASR
  `paddlespeech asr --input ./test_audio.wav`
+
+ ## Multi-label Classification
+ `paddlespeech cls --input ./test_audio.wav`
diff --git a/paddlespeech/cli/__init__.py b/paddlespeech/cli/__init__.py
index b4b2e22d4bba7943cf2a7c8196546173130f0667..246d0f381279dd7263415c9e1a6c7ad76b8dbb53 100644
--- a/paddlespeech/cli/__init__.py
+++ b/paddlespeech/cli/__init__.py
@@ -14,4 +14,5 @@
 from .asr import ASRExecutor
 from .base_commands import BaseCommand
 from .base_commands import HelpCommand
+from .cls import CLSExecutor
 from .st import STExecutor
diff --git a/paddlespeech/cli/asr/infer.py b/paddlespeech/cli/asr/infer.py
index b40516e955a3a5f1b1ab36ca0be605bd6df8c586..1d235201d06080c2268033d78792ef4ccebd5152 100644
--- a/paddlespeech/cli/asr/infer.py
+++ b/paddlespeech/cli/asr/infer.py
@@ -39,7 +39,11 @@ from paddlespeech.s2t.utils.utility import UpdateConfig
 __all__ = ['ASRExecutor']
 
 pretrained_models = {
-    "wenetspeech_zh_16k": {
+    # The tags for pretrained_models should be "{model_name}[_{dataset}][-{lang}][-...]".
+    # e.g. "conformer_wenetspeech-zh-16k", "transformer_aishell-zh-16k" and "panns_cnn6-32k".
+    # Command line and python api use "{model_name}[_{dataset}]" as --model, usage:
+    # "paddlespeech asr --model conformer_wenetspeech --lang zh --sr 16000 --input ./input.wav"
+    "conformer_wenetspeech-zh-16k": {
         'url':
         'https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/conformer.model.tar.gz',
         'md5':
@@ -49,7 +53,7 @@ pretrained_models = {
         'ckpt_path':
         'exp/conformer/checkpoints/wenetspeech',
     },
-    "transformer_zh_16k": {
+    "transformer_aishell-zh-16k": {
         'url':
         'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/transformer.model.tar.gz',
         'md5':
@@ -83,7 +87,7 @@ class ASRExecutor(BaseExecutor):
         self.parser.add_argument(
             '--model',
             type=str,
-            default='wenetspeech',
+            default='conformer_wenetspeech',
             help='Choose model type of asr task.')
         self.parser.add_argument(
             '--lang',
@@ -137,9 +141,13 @@ class ASRExecutor(BaseExecutor):
         """
             Init model and other resources from a specific path.
         """
+        if hasattr(self, 'model'):
+            logger.info('Model had been initialized.')
+            return
+
         if cfg_path is None or ckpt_path is None:
             sample_rate_str = '16k' if sample_rate == 16000 else '8k'
-            tag = model_type + '_' + lang + '_' + sample_rate_str
+            tag = model_type + '-' + lang + '-' + sample_rate_str
             res_path = self._get_pretrained_path(tag)  # wenetspeech_zh
             self.res_path = res_path
             self.cfg_path = os.path.join(res_path,
@@ -161,7 +169,7 @@ class ASRExecutor(BaseExecutor):
         self.config.decoding.decoding_method = "attention_rescoring"
 
         with UpdateConfig(self.config):
-            if model_type == "ds2_online" or model_type == "ds2_offline":
+            if "ds2_online" in model_type or "ds2_offline" in model_type:
                 from paddlespeech.s2t.io.collator import SpeechCollator
                 self.config.collator.vocab_filepath = os.path.join(
                     res_path, self.config.collator.vocab_filepath)
@@ -174,7 +182,7 @@ class ASRExecutor(BaseExecutor):
                     spm_model_prefix=self.config.collator.spm_model_prefix)
                 self.config.model.input_dim = self.collate_fn_test.feature_size
                 self.config.model.output_dim = text_feature.vocab_size
-            elif model_type == "conformer" or model_type == "transformer" or model_type == "wenetspeech":
+            elif "conformer" in model_type or "transformer" in model_type or "wenetspeech" in model_type:
                 self.config.collator.vocab_filepath = os.path.join(
                     res_path, self.config.collator.vocab_filepath)
                 self.config.collator.augmentation_config = os.path.join(
@@ -192,7 +200,9 @@ class ASRExecutor(BaseExecutor):
                 raise Exception("wrong type")
         # Enter the path of model root
 
-        model_class = dynamic_import(model_type, model_alias)
+        # model_type is "{model_name}_{dataset}"; keep "_" inside model_name (ds2_online).
+        model_name = model_type.rsplit('_', 1)[0]
+        model_class = dynamic_import(model_name, model_alias)
         model_conf = self.config.model
         logger.info(model_conf)
         model = model_class.from_config(model_conf)
@@ -213,7 +223,7 @@ class ASRExecutor(BaseExecutor):
         logger.info("Preprocess audio_file:" + audio_file)
 
         # Get the object for feature extraction
-        if model_type == "ds2_online" or model_type == "ds2_offline":
+        if "ds2_online" in model_type or "ds2_offline" in model_type:
             audio, _ = self.collate_fn_test.process_utterance(
                 audio_file=audio_file, transcript=" ")
             audio_len = audio.shape[0]
@@ -225,7 +235,7 @@ class ASRExecutor(BaseExecutor):
             self._inputs["audio_len"] = audio_len
             logger.info(f"audio feat shape: {audio.shape}")
 
-        elif model_type == "conformer" or model_type == "transformer" or model_type == "wenetspeech":
+        elif "conformer" in model_type or "transformer" in model_type or "wenetspeech" in model_type:
             logger.info("get the preprocess conf")
             preprocess_conf_file = self.config.collator.augmentation_config
             # redirect the cmvn path
@@ -289,7 +299,7 @@ class ASRExecutor(BaseExecutor):
         cfg = self.config.decoding
         audio = self._inputs["audio"]
         audio_len = self._inputs["audio_len"]
-        if model_type == "ds2_online" or model_type == "ds2_offline":
+        if "ds2_online" in model_type or "ds2_offline" in model_type:
             result_transcripts = self.model.decode(
                 audio,
                 audio_len,
@@ -304,7 +314,7 @@ class ASRExecutor(BaseExecutor):
                 num_processes=cfg.num_proc_bsearch)
             self._outputs["result"] = result_transcripts[0]
 
-        elif model_type == "conformer" or model_type == "transformer" or model_type == "wenetspeech":
+        elif "conformer" in model_type or "transformer" in model_type or "wenetspeech" in model_type:
             result_transcripts = self.model.decode(
                 audio,
                 audio_len,
@@ -361,7 +371,7 @@ class ASRExecutor(BaseExecutor):
             audio, audio_sample_rate = soundfile.read(
                 audio_file, dtype="int16", always_2d=True)
         except Exception as e:
-            logger.error(str(e))
+            logger.exception(e)
             logger.error(
                 "can not open the audio file, please check the audio file format is 'wav'. \n \
                  you can try to use sox to change the file format.\n \
@@ -421,7 +431,7 @@ class ASRExecutor(BaseExecutor):
             logger.info('ASR Result: {}'.format(res))
             return True
         except Exception as e:
-            print(e)
+            logger.exception(e)
             return False
 
     def __call__(self, model, lang, sample_rate, config, ckpt_path, audio_file,
diff --git a/paddlespeech/cli/cls/__init.__py b/paddlespeech/cli/cls/__init.__py
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/paddlespeech/cli/cls/__init__.py b/paddlespeech/cli/cls/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..13e316f8f687b34743a92bd0723b944741b74516
--- /dev/null
+++ b/paddlespeech/cli/cls/__init__.py
@@ -0,0 +1,14 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .infer import CLSExecutor
diff --git a/paddlespeech/cli/cls/infer.py b/paddlespeech/cli/cls/infer.py
new file mode 100644
index 0000000000000000000000000000000000000000..0b4982d157b8768160d47d4c4eb0e39533a9884d
--- /dev/null
+++ b/paddlespeech/cli/cls/infer.py
@@ -0,0 +1,260 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import os
+from typing import List
+from typing import Optional
+from typing import Union
+
+import numpy as np
+import paddle
+import yaml
+
+from ..executor import BaseExecutor
+from ..utils import cli_register
+from ..utils import download_and_decompress
+from ..utils import logger
+from ..utils import MODEL_HOME
+from paddleaudio import load
+from paddleaudio.features import LogMelSpectrogram
+from paddlespeech.s2t.utils.dynamic_import import dynamic_import
+
+__all__ = ['CLSExecutor']
+
+pretrained_models = {
+    # The tags for pretrained_models should be "{model_name}[_{dataset}][-{lang}][-...]".
+    # e.g. "conformer_wenetspeech-zh-16k", "transformer_aishell-zh-16k" and "panns_cnn6-32k".
+    # Command line and python api use "{model_name}[_{dataset}]" as --model, usage:
+    # "paddlespeech cls --model panns_cnn14 --input ./input.wav"
+    "panns_cnn6-32k": {
+        'url': 'https://paddlespeech.bj.bcebos.com/cls/panns_cnn6.tar.gz',
+        'md5': '4cf09194a95df024fd12f84712cf0f9c',
+        'cfg_path': 'panns.yaml',
+        'ckpt_path': 'cnn6.pdparams',
+        'label_file': 'audioset_labels.txt',
+    },
+    "panns_cnn10-32k": {
+        'url': 'https://paddlespeech.bj.bcebos.com/cls/panns_cnn10.tar.gz',
+        'md5': 'cb8427b22176cc2116367d14847f5413',
+        'cfg_path': 'panns.yaml',
+        'ckpt_path': 'cnn10.pdparams',
+        'label_file': 'audioset_labels.txt',
+    },
+    "panns_cnn14-32k": {
+        'url': 'https://paddlespeech.bj.bcebos.com/cls/panns_cnn14.tar.gz',
+        'md5': 'e3b9b5614a1595001161d0ab95edee97',
+        'cfg_path': 'panns.yaml',
+        'ckpt_path': 'cnn14.pdparams',
+        'label_file': 'audioset_labels.txt',
+    },
+}
+
+model_alias = {
+    "panns_cnn6": "paddlespeech.cls.models.panns:CNN6",
+    "panns_cnn10": "paddlespeech.cls.models.panns:CNN10",
+    "panns_cnn14": "paddlespeech.cls.models.panns:CNN14",
+}
+
+
+@cli_register(
+    name='paddlespeech.cls', description='Audio classification infer command.')
+class CLSExecutor(BaseExecutor):
+    def __init__(self):
+        super(CLSExecutor, self).__init__()
+        # CLI options; `execute` parses them and forwards the values to `__call__`.
+        self.parser = argparse.ArgumentParser(
+            prog='paddlespeech.cls', add_help=True)
+        self.parser.add_argument(
+            '--input', type=str, required=True, help='Audio file to classify.')
+        self.parser.add_argument(
+            '--model',
+            type=str,
+            default='panns_cnn14',
+            help='Choose model type of cls task.')
+        self.parser.add_argument(
+            '--config',
+            type=str,
+            default=None,
+            help='Config of cls task. Use default config when it is None.')
+        self.parser.add_argument(
+            '--ckpt_path',
+            type=str,
+            default=None,
+            help='Checkpoint file of model.')
+        self.parser.add_argument(
+            '--label_file',
+            type=str,
+            default=None,
+            help='Label file of cls task.')
+        self.parser.add_argument(
+            '--topk',
+            type=int,
+            default=1,
+            help='Return topk scores of classification result.')
+        self.parser.add_argument(
+            '--device',
+            type=str,
+            default=paddle.get_device(),
+            help='Choose device to execute model inference.')
+
+    def _get_pretrained_path(self, tag: str) -> os.PathLike:
+        """
+            Fetch the pretrained resources registered under `tag`; return their absolute path.
+        """
+        assert tag in pretrained_models, 'Can not find pretrained resources of {}.'.format(
+            tag)
+
+        # Per-tag location under MODEL_HOME that is handed to the downloader.
+        target_dir = os.path.join(MODEL_HOME, tag)
+        resource_dir = os.path.abspath(
+            download_and_decompress(pretrained_models[tag], target_dir))
+        logger.info(
+            'Use pretrained model stored in: {}'.format(resource_dir))
+
+        return resource_dir
+
+    def _init_from_path(self,
+                        model_type: str='panns_cnn14',
+                        cfg_path: Optional[os.PathLike]=None,
+                        ckpt_path: Optional[os.PathLike]=None,
+                        label_file: Optional[os.PathLike]=None):
+        """
+            Load config, labels and weights once; later calls reuse the loaded model.
+        """
+        if hasattr(self, 'model'):
+            logger.info('Model had been initialized.')
+            return
+
+        # Custom resources are only usable as a complete set: if any of the three
+        # paths is missing, fall back to the pretrained bundle of `model_type`.
+        if cfg_path is None or ckpt_path is None or label_file is None:
+            tag = model_type + '-' + '32k'  # panns_cnn14-32k
+            self.res_path = self._get_pretrained_path(tag)
+            self.cfg_path = os.path.join(self.res_path,
+                                         pretrained_models[tag]['cfg_path'])
+            self.label_file = os.path.join(self.res_path,
+                                           pretrained_models[tag]['label_file'])
+            self.ckpt_path = os.path.join(self.res_path,
+                                          pretrained_models[tag]['ckpt_path'])
+        else:
+            self.cfg_path = os.path.abspath(cfg_path)
+            self.label_file = os.path.abspath(label_file)
+            self.ckpt_path = os.path.abspath(ckpt_path)
+
+        # config
+        with open(self.cfg_path, 'r') as f:
+            self._conf = yaml.safe_load(f)
+
+        # labels: line i of the file names output class i
+        with open(self.label_file, 'r') as f:
+            self._label_list = [line.strip() for line in f]
+
+        # model
+        model_class = dynamic_import(model_type, model_alias)
+        model_dict = paddle.load(self.ckpt_path)
+        self.model = model_class(extract_embedding=False)
+        self.model.set_state_dict(model_dict)
+        self.model.eval()
+
+    def preprocess(self, audio_file: Union[str, os.PathLike]):
+        """
+            Load `audio_file`, extract log-mel features with the settings under the
+            'feature' key of the config and store them in self._inputs['feats'].
+        """
+        feat_conf = self._conf['feature']
+        logger.info(feat_conf)
+        waveform, _ = load(
+            file=audio_file,
+            sr=feat_conf['sample_rate'],
+            mono=True,
+            dtype='float32')
+        logger.info("Preprocessing audio_file:" + audio_file)
+
+        # Feature extraction
+        feature_extractor = LogMelSpectrogram(
+            sr=feat_conf['sample_rate'],
+            n_fft=feat_conf['n_fft'],
+            hop_length=feat_conf['hop_length'],
+            window=feat_conf['window'],
+            win_length=feat_conf['window_length'],
+            f_min=feat_conf['f_min'],
+            f_max=feat_conf['f_max'],
+            n_mels=feat_conf['n_mels'], )
+        feats = feature_extractor(
+            paddle.to_tensor(waveform).unsqueeze(0))
+        self._inputs['feats'] = paddle.transpose(feats, [0, 2, 1]).unsqueeze(
+            1)  # [B, N, T] -> [B, 1, T, N]
+
+    @paddle.no_grad()
+    def infer(self):
+        """
+            Forward self._inputs['feats'] through the model into self._outputs['logits'].
+        """
+        self._outputs['logits'] = self.model(self._inputs['feats'])
+
+    def _generate_topk_label(self, result: np.ndarray, topk: int) -> str:
+        assert topk <= len(
+            self._label_list), 'Value of topk is larger than number of labels.'
+
+        # Indices of the `topk` highest scores, best first.
+        best_indices = (-result).argsort()[:topk]
+        lines = [
+            f'{self._label_list[i]}: {result[i]}\n' for i in best_indices
+        ]
+        return ''.join(lines)
+
+    def postprocess(self, topk: int) -> Union[str, os.PathLike]:
+        """
+            Turn the stored logits into a printable top-k "label: score" report.
+        """
+        scores = self._outputs['logits'].squeeze(0).numpy()
+        return self._generate_topk_label(result=scores, topk=topk)
+
+    def execute(self, argv: List[str]) -> bool:
+        """
+            Command line entry: classify `--input`; True on success, False on logged failure.
+        """
+        parser_args = self.parser.parse_args(argv)
+        model_type = parser_args.model
+        cfg_path = parser_args.config
+        ckpt_path = parser_args.ckpt_path
+        label_file = parser_args.label_file
+        audio_file = parser_args.input
+        topk = parser_args.topk
+        device = parser_args.device
+
+        try:
+            # Positional order must follow __call__: ckpt_path before label_file.
+            res = self(model_type, cfg_path, ckpt_path, label_file, audio_file,
+                       topk, device)
+            logger.info('CLS Result:\n{}'.format(res))
+            return True
+        except Exception as e:
+            logger.exception(e)
+            return False
+
+    def __call__(self, model, config, ckpt_path, label_file, audio_file, topk,
+                 device):
+        """
+            Python API: classify `audio_file` and return the top-`topk` "label: score" lines.
+        """
+        audio_file = os.path.abspath(audio_file)
+        # NOTE(review): no input check is done here; `sample_rate` is not a parameter.
+        paddle.set_device(device)
+        self._init_from_path(model, config, ckpt_path, label_file)
+        self.preprocess(audio_file)
+        self.infer()
+        res = self.postprocess(topk)  # Retrieve result of cls.
+
+        return res
diff --git a/paddlespeech/cli/st/infer.py b/paddlespeech/cli/st/infer.py
index 9f97d8731e913e0513a24a0de5c10838d151a521..534b9e3b9458f75fd89710ee20dd4b06feb38c4d 100644
--- a/paddlespeech/cli/st/infer.py
+++ b/paddlespeech/cli/st/infer.py
@@ -21,6 +21,7 @@ from typing import Union
 import kaldi_io
 import numpy as np
 import paddle
+import soundfile
 from kaldiio import WriteHelper
 from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
 from paddlespeech.s2t.utils.dynamic_import import dynamic_import
@@ -36,19 +37,19 @@ from ..utils import MODEL_HOME
 __all__ = ["STExecutor"]
 
 pretrained_models = {
-    "fat_st_ted_en_zh": {
+    "fat_st_ted_en-zh": {
         "url":
-        "https://paddlespeech.bj.bcebos.com/s2t/ted_en_zh/st1/fat_st_mtl.model.tar.gz",
+        "https://paddlespeech.bj.bcebos.com/s2t/ted_en_zh/st1/fat_st_ted-en-zh.tar.gz",
         "md5":
-        "210b8eacc390d9965334fa8e96c49a13",
+        "fa0a7425b91b4f8d259c70b2aca5ae67",
         "cfg_path":
         "conf/transformer_mtl_noam.yaml",
         "ckpt_path":
-        "exp/transformer_mtl_noam/checkpoints/fat_st_ted_en_zh",
+        "exp/transformer_mtl_noam/checkpoints/fat_st_ted-en-zh.pdparams",
     }
 }
 
-model_alias = {"fat_st": "paddlespeech.s2t.models.u2_st:U2STModel"}
+model_alias = {"fat_st_ted": "paddlespeech.s2t.models.u2_st:U2STModel"}
 
 kaldi_bins = {
     "url":
@@ -69,17 +70,28 @@ class STExecutor(BaseExecutor):
         self.parser.add_argument(
             "--input", type=str, required=True, help="Audio file to translate.")
         self.parser.add_argument(
-            "--model",
+            "--model_type",
             type=str,
-            default="fat_st",
+            default="fat_st_ted",
             help="Choose model type of st task.")
         self.parser.add_argument(
-            "--lang",
+            "--src_lang",
             type=str,
-            default="ted_en_zh",
-            help="Choose model language.")
+            default="en",
+            help="Choose model source language.")
         self.parser.add_argument(
-            "--config",
+            "--tgt_lang",
+            type=str,
+            default="zh",
+            help="Choose model target language.")
+        self.parser.add_argument(
+            "--sample_rate",
+            type=int,
+            default=16000,
+            choices=[16000],
+            help='Choose the audio sample rate of the model. 8000 or 16000')
+        self.parser.add_argument(
+            "--cfg_path",
             type=str,
             default=None,
             help="Config of st task. Use deault config when it is None.")
@@ -117,20 +129,28 @@ class STExecutor(BaseExecutor):
         decompressed_path = download_and_decompress(kaldi_bins, MODEL_HOME)
         decompressed_path = os.path.abspath(decompressed_path)
         logger.info("Kaldi_bins stored in: {}".format(decompressed_path))
-        os.environ['LD_LIBRARY_PATH'] += f':{decompressed_path}'
+        if "LD_LIBRARY_PATH" in os.environ:
+            os.environ["LD_LIBRARY_PATH"] += f":{decompressed_path}"
+        else:
+            os.environ["LD_LIBRARY_PATH"] = f"{decompressed_path}"
         os.environ["PATH"] += f":{decompressed_path}"
         return decompressed_path
 
     def _init_from_path(self,
-                        model_type: str="fat_st",
-                        lang: str="zh",
+                        model_type: str="fat_st_ted",
+                        src_lang: str="en",
+                        tgt_lang: str="zh",
                         cfg_path: Optional[os.PathLike]=None,
                         ckpt_path: Optional[os.PathLike]=None):
         """
             Init model and other resources from a specific path.
         """
+        if hasattr(self, 'model'):
+            logger.info('Model had been initialized.')
+            return
+
         if cfg_path is None or ckpt_path is None:
-            tag = model_type + "_" + lang
+            tag = model_type + "_" + src_lang + "-" + tgt_lang
             res_path = self._get_pretrained_path(tag)
             self.cfg_path = os.path.join(res_path,
                                          pretrained_models[tag]["cfg_path"])
@@ -171,13 +191,20 @@ class STExecutor(BaseExecutor):
         self.model.eval()
 
         # load model
-        params_path = self.ckpt_path + ".pdparams"
+        params_path = self.ckpt_path
         model_dict = paddle.load(params_path)
         self.model.set_state_dict(model_dict)
 
         # set kaldi bins
         self._set_kaldi_bins()
 
+    def _check(self, audio_file: str, sample_rate: int):
+        """Raise if the sample rate of `audio_file` differs from `sample_rate`."""
+        _, audio_sample_rate = soundfile.read(
+            audio_file, dtype="int16", always_2d=True)
+        if audio_sample_rate != sample_rate:
+            raise Exception("invalid sample rate")
+
     def preprocess(self, wav_file: Union[str, os.PathLike], model_type: str):
         """
             Input preprocess and return paddle.Tensor stored in self.input.
@@ -186,7 +213,7 @@ class STExecutor(BaseExecutor):
         audio_file = os.path.abspath(wav_file)
         logger.info("Preprocess audio_file:" + audio_file)
 
-        if model_type == "fat_st":
+        if model_type == "fat_st_ted":
             cmvn = self.config.collator.cmvn_path
             utt_name = "_tmp"
 
@@ -198,7 +225,8 @@ class STExecutor(BaseExecutor):
             fbank_extract_process = subprocess.Popen(
                 fbank_extract_command,
                 stdin=subprocess.PIPE,
-                stdout=subprocess.PIPE)
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE)
             fbank_extract_process.stdin.write(
                 f"{utt_name} {wav_file}".encode("utf8"))
             fbank_extract_process.stdin.close()
@@ -207,14 +235,18 @@ class STExecutor(BaseExecutor):
 
             extract_command = ["compute-kaldi-pitch-feats", "scp:-", "ark:-"]
             pitch_extract_process = subprocess.Popen(
-                extract_command, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
+                extract_command,
+                stdin=subprocess.PIPE,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE)
             pitch_extract_process.stdin.write(
                 f"{utt_name} {wav_file}".encode("utf8"))
             process_command = ["process-kaldi-pitch-feats", "ark:", "ark:-"]
             pitch_process = subprocess.Popen(
                 process_command,
                 stdin=pitch_extract_process.stdout,
-                stdout=subprocess.PIPE)
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE)
             pitch_extract_process.stdin.close()
             pitch_feat = dict(
                 kaldi_io.read_mat_ark(pitch_process.stdout))[utt_name]
@@ -228,19 +260,19 @@ class STExecutor(BaseExecutor):
                 "ark:-"
             ]
             cmvn_process = subprocess.Popen(
-                cmvn_command, stdout=subprocess.PIPE)
+                cmvn_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
             process_command = [
                 "copy-feats", "--compress=true", "ark:-", "ark:-"
             ]
             process = subprocess.Popen(
                 process_command,
                 stdin=cmvn_process.stdout,
-                stdout=subprocess.PIPE)
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE)
             norm_feat = dict(kaldi_io.read_mat_ark(process.stdout))[utt_name]
-            self.audio = paddle.to_tensor(norm_feat).unsqueeze(0)
-            self.audio_len = paddle.to_tensor(
-                self.audio.shape[1], dtype="int64")
-            logger.info(f"audio feat shape: {self.audio.shape}")
+            self._inputs["audio"] = paddle.to_tensor(norm_feat).unsqueeze(0)
+            self._inputs["audio_len"] = paddle.to_tensor(
+                self._inputs["audio"].shape[1], dtype="int64")
         else:
             raise ValueError("Wrong model type.")
 
@@ -250,9 +282,9 @@ class STExecutor(BaseExecutor):
             Model inference and result stored in self.output.
         """
         cfg = self.config.decoding
-        audio = self.audio
-        audio_len = self.audio_len
-        if model_type == "fat_st":
+        audio = self._inputs["audio"]
+        audio_len = self._inputs["audio_len"]
+        if model_type == "fat_st_ted":
             hyps = self.model.decode(
                 audio,
                 audio_len,
@@ -270,7 +302,7 @@ class STExecutor(BaseExecutor):
                 decoding_chunk_size=cfg.decoding_chunk_size,
                 num_decoding_left_chunks=cfg.num_decoding_left_chunks,
                 simulate_streaming=cfg.simulate_streaming)
-            self.result_transcripts = hyps
+            self._outputs["result"] = hyps
         else:
             raise ValueError("Wrong model type.")
 
@@ -278,8 +310,8 @@ class STExecutor(BaseExecutor):
         """
             Output postprocess and return human-readable results such as texts and audio files.
         """
-        if model_type == "fat_st":
-            return self.result_transcripts
+        if model_type == "fat_st_ted":
+            return self._outputs["result"]
         else:
             raise ValueError("Wrong model type.")
 
@@ -289,30 +321,36 @@ class STExecutor(BaseExecutor):
         """
         parser_args = self.parser.parse_args(argv)
 
-        model = parser_args.model
-        lang = parser_args.lang
-        config = parser_args.config
+        model_type = parser_args.model_type
+        src_lang = parser_args.src_lang
+        tgt_lang = parser_args.tgt_lang
+        sample_rate = parser_args.sample_rate
+        cfg_path = parser_args.cfg_path
         ckpt_path = parser_args.ckpt_path
         audio_file = parser_args.input
         device = parser_args.device
 
         try:
-            res = self(model, lang, config, ckpt_path, audio_file, device)
-            logger.info('ST Result: {}'.format(res))
+            res = self(model_type, src_lang, tgt_lang, sample_rate, cfg_path,
+                       ckpt_path, audio_file, device)
+            logger.info("ST Result: {}".format(res))
             return True
         except Exception as e:
             print(e)
             return False
 
-    def __call__(self, model, lang, config, ckpt_path, audio_file, device):
+    def __call__(self, model_type, src_lang, tgt_lang, sample_rate, cfg_path,
+                 ckpt_path, audio_file, device):
         """
             Python API to call an executor.
         """
         audio_file = os.path.abspath(audio_file)
+        self._check(audio_file, sample_rate)
         paddle.set_device(device)
-        self._init_from_path(model, lang, config, ckpt_path)
-        self.preprocess(audio_file, model)
-        self.infer(model)
-        res = self.postprocess(model)
+        self._init_from_path(model_type, src_lang, tgt_lang, cfg_path,
+                             ckpt_path)
+        self.preprocess(audio_file, model_type)
+        self.infer(model_type)
+        res = self.postprocess(model_type)
 
         return res
diff --git a/paddlespeech/cli/utils.py b/paddlespeech/cli/utils.py
index edf579f71634b07281eded2479ecbd7d20b7c308..eb023c11ba88b001fc0ce0171508ea16bed8ffea 100644
--- a/paddlespeech/cli/utils.py
+++ b/paddlespeech/cli/utils.py
@@ -12,10 +12,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import functools
+import hashlib
 import logging
 import os
+import tarfile
+import zipfile
 from typing import Any
 from typing import Dict
+from typing import List
 
 from paddle.framework import load
 from paddle.utils import download
@@ -55,12 +59,69 @@ def get_command(name: str) -> Any:
     return com['_entry']
 
 
-def decompress(file: str) -> os.PathLike:
-    """
-    Extracts all files from a compressed file.
-    """
-    assert os.path.isfile(file), "File: {} not exists.".format(file)
-    return download._decompress(file)
+def _md5check(filepath: os.PathLike, md5sum: str) -> bool:
+    """Hash ``filepath`` with MD5 and report whether it equals ``md5sum``."""
+    logger.info("File {} md5 checking...".format(filepath))
+    digest = hashlib.md5()
+    with open(filepath, 'rb') as stream:
+        # Feed the hash 4096 bytes at a time until read() returns b"".
+        for block in iter(functools.partial(stream.read, 4096), b""):
+            digest.update(block)
+    actual = digest.hexdigest()
+    if actual == md5sum:
+        logger.info("File {} md5 check passed.".format(filepath))
+        return True
+    logger.info("File {} md5 check failed, {}(calc) != "
+                "{}(base)".format(filepath, actual, md5sum))
+    return False
+
+
+def _get_uncompress_path(filepath: os.PathLike) -> os.PathLike:
+    """Predict where ``filepath`` lands once extracted next to itself.
+
+    Returns the archive's directory when it is neither a tar nor a zip file.
+    """
+    file_dir = os.path.dirname(filepath)
+    # Only the member names are needed, so each handle is released right
+    # away and a failure in the layout checks below cannot leak it.
+    if tarfile.is_tarfile(filepath):
+        with tarfile.open(filepath, "r:*") as files:
+            file_list = files.getnames()
+    elif zipfile.is_zipfile(filepath):
+        with zipfile.ZipFile(filepath, 'r') as files:
+            file_list = files.namelist()
+    else:
+        return file_dir
+
+    if _is_a_single_file(file_list):
+        rootpath = file_list[0]
+    elif _is_a_single_dir(file_list):
+        rootpath = os.path.splitext(file_list[0])[0].split(os.sep)[-1]
+    else:
+        rootpath = os.path.splitext(filepath)[0].split(os.sep)[-1]
+    return os.path.join(file_dir, rootpath)
+
+
+def _is_a_single_file(file_list: List[os.PathLike]) -> bool:
+    """Return True when the archive lists exactly one separator-free entry."""
+    # str.find() never goes below -1, so test membership, not `find() < -1`.
+    return len(file_list) == 1 and os.sep not in file_list[0]
+
+
+def _is_a_single_dir(file_list: List[os.PathLike]) -> bool:
+    """Tell whether every archive entry shares one top-level path component."""
+
+    def _top(entry):
+        # Only one separator style is rewritten; '/' wins if both appear.
+        if '/' in entry:
+            entry = entry.replace('/', os.sep)
+        elif '\\' in entry:
+            entry = entry.replace('\\', os.sep)
+        return entry.split(os.sep)[0]
+
+    tops = [_top(entry) for entry in file_list]
+    first = tops[0]
+    return all(top == first for top in tops[1:])
 
 
 def download_and_decompress(archive: Dict[str, str], path: str) -> os.PathLike:
@@ -72,7 +133,17 @@ def download_and_decompress(archive: Dict[str, str], path: str) -> os.PathLike:
 
     assert 'url' in archive and 'md5' in archive, \
         'Dictionary keys of "url" and "md5" are required in the archive, but got: {}'.format(list(archive.keys()))
-    return download.get_path_from_url(archive['url'], path, archive['md5'])
+
+    filepath = os.path.join(path, os.path.basename(archive['url']))
+    if os.path.isfile(filepath) and _md5check(filepath, archive['md5']):
+        uncompress_path = _get_uncompress_path(filepath)
+        if not os.path.isdir(uncompress_path):
+            download._decompress(filepath)
+    else:
+        uncompress_path = download.get_path_from_url(archive['url'], path,
+                                                     archive['md5'])
+
+    return uncompress_path
 
 
 def load_state_dict_from_url(url: str, path: str, md5: str=None) -> os.PathLike:
@@ -128,11 +199,16 @@ class Logger(object):
             'EVAL': 22,
             'WARNING': 30,
             'ERROR': 40,
-            'CRITICAL': 50
+            'CRITICAL': 50,
+            'EXCEPTION': 100,
         }
         for key, level in log_config.items():
             logging.addLevelName(level, key)
-            self.__dict__[key.lower()] = functools.partial(self.__call__, level)
+            if key == 'EXCEPTION':
+                self.__dict__[key.lower()] = self.logger.exception
+            else:
+                self.__dict__[key.lower()] = functools.partial(self.__call__,
+                                                               level)
 
         self.format = logging.Formatter(
             fmt='[%(asctime)-15s] [%(levelname)8s] [%(filename)s] [L%(lineno)d] - %(message)s'
diff --git a/requirements.txt b/requirements.txt
index 658e64c05d654780de8347e3862f92fd385255db..3708bb0620b39f0282438a98ecc8a5e8066a027f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -14,6 +14,7 @@ loguru
 matplotlib
 nara_wpe
 nltk
+paddleaudio
 paddlespeech_ctcdecoders
 paddlespeech_feat
 pandas
diff --git a/setup.py b/setup.py
index fbb3a24a455acef6c04cdee060b0d77e230abe30..7720ba3fd9d4e64ef4f85f1b37919f896bb546d2 100644
--- a/setup.py
+++ b/setup.py
@@ -43,6 +43,7 @@ requirements = {
         "nara_wpe",
         "nltk",
         "pandas",
+        "paddleaudio",
         "paddlespeech_ctcdecoders",
         "paddlespeech_feat",
         "praatio~=4.1",
@@ -197,7 +198,7 @@ setup_info = dict(
         "pwgan",
         "gan",
     ],
-    python_requires='>=3.6',
+    python_requires='>=3.7',
     install_requires=requirements["install"],
     extras_require={
         'develop':
diff --git a/tests/benchmark/conformer/run.sh b/tests/benchmark/conformer/run.sh
index 9fd13fbcb1691f2a6a6ab9ac42fce97912efd062..fcd0c2359690373e73acf077e3b3edeeeb6e470f 100644
--- a/tests/benchmark/conformer/run.sh
+++ b/tests/benchmark/conformer/run.sh
@@ -20,7 +20,7 @@ mkdir -p conf/benchmark
 cp conf/conformer.yaml  conf/benchmark/conformer.yaml
 sed -i "s/  accum_grad: 2/  accum_grad: 1/g" conf/benchmark/conformer.yaml
 fp_item_list=(fp32)
-bs_item=(16 30)
+bs_item=(16)
 config_path=conf/benchmark/conformer.yaml
 seed=0
 output=exp/conformer
diff --git a/tests/benchmark/pwgan/run_all.sh b/tests/benchmark/pwgan/run_all.sh
index 51deaf9ff149f8bee28608bf9fae43ece3172054..874e9aa252e74a2fec1daec7645af29acdd998e7 100755
--- a/tests/benchmark/pwgan/run_all.sh
+++ b/tests/benchmark/pwgan/run_all.sh
@@ -38,7 +38,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
       model_mode_list=(pwgan)
       fp_item_list=(fp32)
       # 满 bs 是 26
-      bs_item_list=(6 26)
+      bs_item_list=(6)
       for model_mode in ${model_mode_list[@]}; do
             for fp_item in ${fp_item_list[@]}; do
             for bs_item in ${bs_item_list[@]}; do
@@ -55,4 +55,4 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
                   done
             done
       done
-fi
\ No newline at end of file
+fi