fix conflict

fd1116bd · Yang Zhou · dc8efca2 · ab656aab · fd1116bd · fd1116bd
55 changed files
--- a/.gitignore
+++ b/.gitignore
@@ -33,6 +33,12 @@ tools/Miniconda3-latest-Linux-x86_64.sh
 tools/activate_python.sh
 tools/miniconda.sh
 tools/CRF++-0.58/
+tools/liblbfgs-1.10/
+tools/srilm/
+tools/env.sh
+tools/openfst-1.8.1/
+tools/libsndfile/
+tools/python-soundfile/
 speechx/fc_patch/

--- a/examples/other/ngram_lm/s0/local/build_zh_lm.sh
+++ b/examples/other/ngram_lm/s0/local/build_zh_lm.sh
@@ -27,7 +27,7 @@ arpa=$3
 if [ $stage -le 0 ] && [ $stop_stage -ge 0 ];then
    # text tn & wordseg preprocess
    echo "process text."
-    python3 ${MAIN_ROOT}/utils/zh_tn.py ${type} ${text} ${text}.${type}.tn
+    python3 ${MAIN_ROOT}/utils/zh_tn.py --token_type ${type} ${text} ${text}.${type}.tn
 fi
 if [ $stage -le 1 ] && [ $stop_stage -ge 1 ];then

--- a/examples/other/ngram_lm/s0/local/download_lm_zh.sh
+++ b/examples/other/ngram_lm/s0/local/download_lm_zh.sh
@@ -10,6 +10,11 @@ MD5="29e02312deb2e59b3c8686c7966d4fe3"
 TARGET=${DIR}/zh_giga.no_cna_cmn.prune01244.klm
+if [ -e $TARGET ];then
+    echo "already have lm"
+    exit 0;
+fi
 echo "Download language model ..."
 download $URL $MD5 $TARGET
 if [ $? -ne 0 ]; then

--- a/paddlespeech/cli/asr/infer.py
+++ b/paddlespeech/cli/asr/infer.py
@@ -29,9 +29,10 @@ from ..download import get_path_from_url
 from ..executor import BaseExecutor
 from ..log import logger
 from ..utils import cli_register
-from ..utils import download_and_decompress
 from ..utils import MODEL_HOME
 from ..utils import stats_wrapper
+from .pretrained_models import model_alias
+from .pretrained_models import pretrained_models
 from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
 from paddlespeech.s2t.transform.transformation import Transformation
 from paddlespeech.s2t.utils.dynamic_import import dynamic_import
@@ -39,94 +40,14 @@ from paddlespeech.s2t.utils.utility import UpdateConfig
 __all__ = ['ASRExecutor']
-pretrained_models = {
-    # The tags for pretrained_models should be "{model_name}[_{dataset}][-{lang}][-...]".
-    # e.g. "conformer_wenetspeech-zh-16k" and "panns_cnn6-32k".
-    # Command line and python api use "{model_name}[_{dataset}]" as --model, usage:
-    # "paddlespeech asr --model conformer_wenetspeech --lang zh --sr 16000 --input ./input.wav"
-    "conformer_wenetspeech-zh-16k": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1_conformer_wenetspeech_ckpt_0.1.1.model.tar.gz',
-        'md5':
-        '76cb19ed857e6623856b7cd7ebbfeda4',
-        'cfg_path':
-        'model.yaml',
-        'ckpt_path':
-        'exp/conformer/checkpoints/wenetspeech',
-    },
-    "transformer_librispeech-en-16k": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr1/asr1_transformer_librispeech_ckpt_0.1.1.model.tar.gz',
-        'md5':
-        '2c667da24922aad391eacafe37bc1660',
-        'cfg_path':
-        'model.yaml',
-        'ckpt_path':
-        'exp/transformer/checkpoints/avg_10',
-    },
-    "deepspeech2offline_aishell-zh-16k": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_aishell_ckpt_0.1.1.model.tar.gz',
-        'md5':
-        '932c3593d62fe5c741b59b31318aa314',
-        'cfg_path':
-        'model.yaml',
-        'ckpt_path':
-        'exp/deepspeech2/checkpoints/avg_1',
-        'lm_url':
-        'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm',
-        'lm_md5':
-        '29e02312deb2e59b3c8686c7966d4fe3'
-    },
-    "deepspeech2online_aishell-zh-16k": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz',
-        'md5':
-        '23e16c69730a1cb5d735c98c83c21e16',
-        'cfg_path':
-        'model.yaml',
-        'ckpt_path':
-        'exp/deepspeech2_online/checkpoints/avg_1',
-        'lm_url':
-        'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm',
-        'lm_md5':
-        '29e02312deb2e59b3c8686c7966d4fe3'
-    },
-    "deepspeech2offline_librispeech-en-16k": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr0/asr0_deepspeech2_librispeech_ckpt_0.1.1.model.tar.gz',
-        'md5':
-        'f5666c81ad015c8de03aac2bc92e5762',
-        'cfg_path':
-        'model.yaml',
-        'ckpt_path':
-        'exp/deepspeech2/checkpoints/avg_1',
-        'lm_url':
-        'https://deepspeech.bj.bcebos.com/en_lm/common_crawl_00.prune01111.trie.klm',
-        'lm_md5':
-        '099a601759d467cd0a8523ff939819c5'
-    },
-}
-model_alias = {
-    "deepspeech2offline":
-    "paddlespeech.s2t.models.ds2:DeepSpeech2Model",
-    "deepspeech2online":
-    "paddlespeech.s2t.models.ds2_online:DeepSpeech2ModelOnline",
-    "conformer":
-    "paddlespeech.s2t.models.u2:U2Model",
-    "transformer":
-    "paddlespeech.s2t.models.u2:U2Model",
-    "wenetspeech":
-    "paddlespeech.s2t.models.u2:U2Model",
-}
 @cli_register(
    name='paddlespeech.asr', description='Speech to text infer command.')
 class ASRExecutor(BaseExecutor):
    def __init__(self):
-        super(ASRExecutor, self).__init__()
+        super().__init__()
+        self.model_alias = model_alias
+        self.pretrained_models = pretrained_models
        self.parser = argparse.ArgumentParser(
            prog='paddlespeech.asr', add_help=True)
@@ -136,7 +57,9 @@ class ASRExecutor(BaseExecutor):
            '--model',
            type=str,
            default='conformer_wenetspeech',
-            choices=[tag[:tag.index('-')] for tag in pretrained_models.keys()],
+            choices=[
+                tag[:tag.index('-')] for tag in self.pretrained_models.keys()
+            ],
            help='Choose model type of asr task.')
        self.parser.add_argument(
            '--lang',
@@ -192,23 +115,6 @@ class ASRExecutor(BaseExecutor):
            action='store_true',
            help='Increase logger verbosity of current task.')
-    def _get_pretrained_path(self, tag: str) -> os.PathLike:
-        """
-        Download and returns pretrained resources path of current task.
-        """
-        support_models = list(pretrained_models.keys())
-        assert tag in pretrained_models, 'The model "{}" you want to use has not been supported, please choose other models.\nThe support models includes:\n\t\t{}\n'.format(
-            tag, '\n\t\t'.join(support_models))
-        res_path = os.path.join(MODEL_HOME, tag)
-        decompressed_path = download_and_decompress(pretrained_models[tag],
-                                                    res_path)
-        decompressed_path = os.path.abspath(decompressed_path)
-        logger.info(
-            'Use pretrained model stored in: {}'.format(decompressed_path))
-        return decompressed_path
    def _init_from_path(self,
                        model_type: str='wenetspeech',
                        lang: str='zh',
@@ -228,10 +134,11 @@ class ASRExecutor(BaseExecutor):
            tag = model_type + '-' + lang + '-' + sample_rate_str
            res_path = self._get_pretrained_path(tag)  # wenetspeech_zh
            self.res_path = res_path
-            self.cfg_path = os.path.join(res_path,
+            self.cfg_path = os.path.join(
-                                         pretrained_models[tag]['cfg_path'])
+                res_path, self.pretrained_models[tag]['cfg_path'])
            self.ckpt_path = os.path.join(
-                res_path, pretrained_models[tag]['ckpt_path'] + ".pdparams")
+                res_path,
+                self.pretrained_models[tag]['ckpt_path'] + ".pdparams")
            logger.info(res_path)
            logger.info(self.cfg_path)
            logger.info(self.ckpt_path)
@@ -255,8 +162,8 @@ class ASRExecutor(BaseExecutor):
                self.collate_fn_test = SpeechCollator.from_config(self.config)
                self.text_feature = TextFeaturizer(
                    unit_type=self.config.unit_type, vocab=self.vocab)
-                lm_url = pretrained_models[tag]['lm_url']
+                lm_url = self.pretrained_models[tag]['lm_url']
-                lm_md5 = pretrained_models[tag]['lm_md5']
+                lm_md5 = self.pretrained_models[tag]['lm_md5']
                self.download_lm(
                    lm_url,
                    os.path.dirname(self.config.decode.lang_model_path), lm_md5)
@@ -274,7 +181,7 @@ class ASRExecutor(BaseExecutor):
                raise Exception("wrong type")
        model_name = model_type[:model_type.rindex(
            '_')]  # model_type: {model_name}_{dataset}
-        model_class = dynamic_import(model_name, model_alias)
+        model_class = dynamic_import(model_name, self.model_alias)
        model_conf = self.config
        model = model_class.from_config(model_conf)
        self.model = model

--- a/paddlespeech/cli/asr/pretrained_models.py
+++ b/paddlespeech/cli/asr/pretrained_models.py
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+pretrained_models = {
+    # The tags for pretrained_models should be "{model_name}[_{dataset}][-{lang}][-...]".
+    # e.g. "conformer_wenetspeech-zh-16k" and "panns_cnn6-32k".
+    # Command line and python api use "{model_name}[_{dataset}]" as --model, usage:
+    # "paddlespeech asr --model conformer_wenetspeech --lang zh --sr 16000 --input ./input.wav"
+    "conformer_wenetspeech-zh-16k": {
+        'url':
+        'https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1_conformer_wenetspeech_ckpt_0.1.1.model.tar.gz',
+        'md5':
+        '76cb19ed857e6623856b7cd7ebbfeda4',
+        'cfg_path':
+        'model.yaml',
+        'ckpt_path':
+        'exp/conformer/checkpoints/wenetspeech',
+    },
+    "transformer_librispeech-en-16k": {
+        'url':
+        'https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr1/asr1_transformer_librispeech_ckpt_0.1.1.model.tar.gz',
+        'md5':
+        '2c667da24922aad391eacafe37bc1660',
+        'cfg_path':
+        'model.yaml',
+        'ckpt_path':
+        'exp/transformer/checkpoints/avg_10',
+    },
+    "deepspeech2offline_aishell-zh-16k": {
+        'url':
+        'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_aishell_ckpt_0.1.1.model.tar.gz',
+        'md5':
+        '932c3593d62fe5c741b59b31318aa314',
+        'cfg_path':
+        'model.yaml',
+        'ckpt_path':
+        'exp/deepspeech2/checkpoints/avg_1',
+        'lm_url':
+        'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm',
+        'lm_md5':
+        '29e02312deb2e59b3c8686c7966d4fe3'
+    },
+    "deepspeech2online_aishell-zh-16k": {
+        'url':
+        'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz',
+        'md5':
+        '23e16c69730a1cb5d735c98c83c21e16',
+        'cfg_path':
+        'model.yaml',
+        'ckpt_path':
+        'exp/deepspeech2_online/checkpoints/avg_1',
+        'lm_url':
+        'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm',
+        'lm_md5':
+        '29e02312deb2e59b3c8686c7966d4fe3'
+    },
+    "deepspeech2offline_librispeech-en-16k": {
+        'url':
+        'https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr0/asr0_deepspeech2_librispeech_ckpt_0.1.1.model.tar.gz',
+        'md5':
+        'f5666c81ad015c8de03aac2bc92e5762',
+        'cfg_path':
+        'model.yaml',
+        'ckpt_path':
+        'exp/deepspeech2/checkpoints/avg_1',
+        'lm_url':
+        'https://deepspeech.bj.bcebos.com/en_lm/common_crawl_00.prune01111.trie.klm',
+        'lm_md5':
+        '099a601759d467cd0a8523ff939819c5'
+    },
+}
+model_alias = {
+    "deepspeech2offline":
+    "paddlespeech.s2t.models.ds2:DeepSpeech2Model",
+    "deepspeech2online":
+    "paddlespeech.s2t.models.ds2_online:DeepSpeech2ModelOnline",
+    "conformer":
+    "paddlespeech.s2t.models.u2:U2Model",
+    "transformer":
+    "paddlespeech.s2t.models.u2:U2Model",
+    "wenetspeech":
+    "paddlespeech.s2t.models.u2:U2Model",
+}
--- a/paddlespeech/cli/cls/infer.py
+++ b/paddlespeech/cli/cls/infer.py
@@ -25,55 +25,23 @@ import yaml
 from ..executor import BaseExecutor
 from ..log import logger
 from ..utils import cli_register
-from ..utils import download_and_decompress
-from ..utils import MODEL_HOME
 from ..utils import stats_wrapper
+from .pretrained_models import model_alias
+from .pretrained_models import pretrained_models
 from paddleaudio import load
 from paddleaudio.features import LogMelSpectrogram
 from paddlespeech.s2t.utils.dynamic_import import dynamic_import
 __all__ = ['CLSExecutor']
-pretrained_models = {
-    # The tags for pretrained_models should be "{model_name}[_{dataset}][-{lang}][-...]".
-    # e.g. "conformer_wenetspeech-zh-16k", "transformer_aishell-zh-16k" and "panns_cnn6-32k".
-    # Command line and python api use "{model_name}[_{dataset}]" as --model, usage:
-    # "paddlespeech asr --model conformer_wenetspeech --lang zh --sr 16000 --input ./input.wav"
-    "panns_cnn6-32k": {
-        'url': 'https://paddlespeech.bj.bcebos.com/cls/panns_cnn6.tar.gz',
-        'md5': '4cf09194a95df024fd12f84712cf0f9c',
-        'cfg_path': 'panns.yaml',
-        'ckpt_path': 'cnn6.pdparams',
-        'label_file': 'audioset_labels.txt',
-    },
-    "panns_cnn10-32k": {
-        'url': 'https://paddlespeech.bj.bcebos.com/cls/panns_cnn10.tar.gz',
-        'md5': 'cb8427b22176cc2116367d14847f5413',
-        'cfg_path': 'panns.yaml',
-        'ckpt_path': 'cnn10.pdparams',
-        'label_file': 'audioset_labels.txt',
-    },
-    "panns_cnn14-32k": {
-        'url': 'https://paddlespeech.bj.bcebos.com/cls/panns_cnn14.tar.gz',
-        'md5': 'e3b9b5614a1595001161d0ab95edee97',
-        'cfg_path': 'panns.yaml',
-        'ckpt_path': 'cnn14.pdparams',
-        'label_file': 'audioset_labels.txt',
-    },
-}
-model_alias = {
-    "panns_cnn6": "paddlespeech.cls.models.panns:CNN6",
-    "panns_cnn10": "paddlespeech.cls.models.panns:CNN10",
-    "panns_cnn14": "paddlespeech.cls.models.panns:CNN14",
-}
 @cli_register(
    name='paddlespeech.cls', description='Audio classification infer command.')
 class CLSExecutor(BaseExecutor):
    def __init__(self):
-        super(CLSExecutor, self).__init__()
+        super().__init__()
+        self.model_alias = model_alias
+        self.pretrained_models = pretrained_models
        self.parser = argparse.ArgumentParser(
            prog='paddlespeech.cls', add_help=True)
@@ -83,7 +51,9 @@ class CLSExecutor(BaseExecutor):
            '--model',
            type=str,
            default='panns_cnn14',
-            choices=[tag[:tag.index('-')] for tag in pretrained_models.keys()],
+            choices=[
+                tag[:tag.index('-')] for tag in self.pretrained_models.keys()
+            ],
            help='Choose model type of cls task.')
        self.parser.add_argument(
            '--config',
@@ -121,23 +91,6 @@ class CLSExecutor(BaseExecutor):
            action='store_true',
            help='Increase logger verbosity of current task.')
-    def _get_pretrained_path(self, tag: str) -> os.PathLike:
-        """
-            Download and returns pretrained resources path of current task.
-        """
-        support_models = list(pretrained_models.keys())
-        assert tag in pretrained_models, 'The model "{}" you want to use has not been supported, please choose other models.\nThe support models includes:\n\t\t{}\n'.format(
-            tag, '\n\t\t'.join(support_models))
-        res_path = os.path.join(MODEL_HOME, tag)
-        decompressed_path = download_and_decompress(pretrained_models[tag],
-                                                    res_path)
-        decompressed_path = os.path.abspath(decompressed_path)
-        logger.info(
-            'Use pretrained model stored in: {}'.format(decompressed_path))
-        return decompressed_path
    def _init_from_path(self,
                        model_type: str='panns_cnn14',
                        cfg_path: Optional[os.PathLike]=None,
@@ -153,12 +106,12 @@ class CLSExecutor(BaseExecutor):
        if label_file is None or ckpt_path is None:
            tag = model_type + '-' + '32k'  # panns_cnn14-32k
            self.res_path = self._get_pretrained_path(tag)
-            self.cfg_path = os.path.join(self.res_path,
+            self.cfg_path = os.path.join(
-                                         pretrained_models[tag]['cfg_path'])
+                self.res_path, self.pretrained_models[tag]['cfg_path'])
-            self.label_file = os.path.join(self.res_path,
+            self.label_file = os.path.join(
-                                           pretrained_models[tag]['label_file'])
+                self.res_path, self.pretrained_models[tag]['label_file'])
-            self.ckpt_path = os.path.join(self.res_path,
+            self.ckpt_path = os.path.join(
-                                          pretrained_models[tag]['ckpt_path'])
+                self.res_path, self.pretrained_models[tag]['ckpt_path'])
        else:
            self.cfg_path = os.path.abspath(cfg_path)
            self.label_file = os.path.abspath(label_file)
@@ -175,7 +128,7 @@ class CLSExecutor(BaseExecutor):
                self._label_list.append(line.strip())
        # model
-        model_class = dynamic_import(model_type, model_alias)
+        model_class = dynamic_import(model_type, self.model_alias)
        model_dict = paddle.load(self.ckpt_path)
        self.model = model_class(extract_embedding=False)
        self.model.set_state_dict(model_dict)

--- a/paddlespeech/cli/cls/pretrained_models.py
+++ b/paddlespeech/cli/cls/pretrained_models.py
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+pretrained_models = {
+    # The tags for pretrained_models should be "{model_name}[_{dataset}][-{lang}][-...]".
+    # e.g. "conformer_wenetspeech-zh-16k", "transformer_aishell-zh-16k" and "panns_cnn6-32k".
+    # Command line and python api use "{model_name}[_{dataset}]" as --model, usage:
+    # "paddlespeech asr --model conformer_wenetspeech --lang zh --sr 16000 --input ./input.wav"
+    "panns_cnn6-32k": {
+        'url': 'https://paddlespeech.bj.bcebos.com/cls/panns_cnn6.tar.gz',
+        'md5': '4cf09194a95df024fd12f84712cf0f9c',
+        'cfg_path': 'panns.yaml',
+        'ckpt_path': 'cnn6.pdparams',
+        'label_file': 'audioset_labels.txt',
+    },
+    "panns_cnn10-32k": {
+        'url': 'https://paddlespeech.bj.bcebos.com/cls/panns_cnn10.tar.gz',
+        'md5': 'cb8427b22176cc2116367d14847f5413',
+        'cfg_path': 'panns.yaml',
+        'ckpt_path': 'cnn10.pdparams',
+        'label_file': 'audioset_labels.txt',
+    },
+    "panns_cnn14-32k": {
+        'url': 'https://paddlespeech.bj.bcebos.com/cls/panns_cnn14.tar.gz',
+        'md5': 'e3b9b5614a1595001161d0ab95edee97',
+        'cfg_path': 'panns.yaml',
+        'ckpt_path': 'cnn14.pdparams',
+        'label_file': 'audioset_labels.txt',
+    },
+}
+model_alias = {
+    "panns_cnn6": "paddlespeech.cls.models.panns:CNN6",
+    "panns_cnn10": "paddlespeech.cls.models.panns:CNN10",
+    "panns_cnn14": "paddlespeech.cls.models.panns:CNN14",
+}
--- a/paddlespeech/cli/executor.py
+++ b/paddlespeech/cli/executor.py
@@ -25,6 +25,8 @@ from typing import Union
 import paddle
 from .log import logger
+from .utils import download_and_decompress
+from .utils import MODEL_HOME
 class BaseExecutor(ABC):
@@ -35,19 +37,8 @@ class BaseExecutor(ABC):
    def __init__(self):
        self._inputs = OrderedDict()
        self._outputs = OrderedDict()
+        self.pretrained_models = OrderedDict()
-    @abstractmethod
+        self.model_alias = OrderedDict()
-    def _get_pretrained_path(self, tag: str) -> os.PathLike:
-        """
-        Download and returns pretrained resources path of current task.
-        Args:
-            tag (str): A tag of pretrained model.
-        Returns:
-            os.PathLike: The path on which resources of pretrained model locate. 
-        """
-        pass
    @abstractmethod
    def _init_from_path(self, *args, **kwargs):
@@ -227,3 +218,20 @@ class BaseExecutor(ABC):
        ]
        for l in loggers:
            l.disabled = True
+    def _get_pretrained_path(self, tag: str) -> os.PathLike:
+        """
+        Download and returns pretrained resources path of current task.
+        """
+        support_models = list(self.pretrained_models.keys())
+        assert tag in self.pretrained_models, 'The model "{}" you want to use has not been supported, please choose other models.\nThe support models includes:\n\t\t{}\n'.format(
+            tag, '\n\t\t'.join(support_models))
+        res_path = os.path.join(MODEL_HOME, tag)
+        decompressed_path = download_and_decompress(self.pretrained_models[tag],
+                                                    res_path)
+        decompressed_path = os.path.abspath(decompressed_path)
+        logger.info(
+            'Use pretrained model stored in: {}'.format(decompressed_path))
+        return decompressed_path
--- a/paddlespeech/cli/st/infer.py
+++ b/paddlespeech/cli/st/infer.py
@@ -32,40 +32,24 @@ from ..utils import cli_register
 from ..utils import download_and_decompress
 from ..utils import MODEL_HOME
 from ..utils import stats_wrapper
+from .pretrained_models import kaldi_bins
+from .pretrained_models import model_alias
+from .pretrained_models import pretrained_models
 from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
 from paddlespeech.s2t.utils.dynamic_import import dynamic_import
 from paddlespeech.s2t.utils.utility import UpdateConfig
 __all__ = ["STExecutor"]
-pretrained_models = {
-    "fat_st_ted-en-zh": {
-        "url":
-        "https://paddlespeech.bj.bcebos.com/s2t/ted_en_zh/st1/st1_transformer_mtl_noam_ted-en-zh_ckpt_0.1.1.model.tar.gz",
-        "md5":
-        "d62063f35a16d91210a71081bd2dd557",
-        "cfg_path":
-        "model.yaml",
-        "ckpt_path":
-        "exp/transformer_mtl_noam/checkpoints/fat_st_ted-en-zh.pdparams",
-    }
-}
-model_alias = {"fat_st": "paddlespeech.s2t.models.u2_st:U2STModel"}
-kaldi_bins = {
-    "url":
-    "https://paddlespeech.bj.bcebos.com/s2t/ted_en_zh/st1/kaldi_bins.tar.gz",
-    "md5":
-    "c0682303b3f3393dbf6ed4c4e35a53eb",
-}
 @cli_register(
    name="paddlespeech.st", description="Speech translation infer command.")
 class STExecutor(BaseExecutor):
    def __init__(self):
-        super(STExecutor, self).__init__()
+        super().__init__()
+        self.model_alias = model_alias
+        self.pretrained_models = pretrained_models
+        self.kaldi_bins = kaldi_bins
        self.parser = argparse.ArgumentParser(
            prog="paddlespeech.st", add_help=True)
@@ -75,7 +59,9 @@ class STExecutor(BaseExecutor):
            "--model",
            type=str,
            default="fat_st_ted",
-            choices=[tag[:tag.index('-')] for tag in pretrained_models.keys()],
+            choices=[
+                tag[:tag.index('-')] for tag in self.pretrained_models.keys()
+            ],
            help="Choose model type of st task.")
        self.parser.add_argument(
            "--src_lang",
@@ -119,28 +105,11 @@ class STExecutor(BaseExecutor):
            action='store_true',
            help='Increase logger verbosity of current task.')
-    def _get_pretrained_path(self, tag: str) -> os.PathLike:
-        """
-            Download and returns pretrained resources path of current task.
-        """
-        support_models = list(pretrained_models.keys())
-        assert tag in pretrained_models, 'The model "{}" you want to use has not been supported, please choose other models.\nThe support models includes:\n\t\t{}\n'.format(
-            tag, '\n\t\t'.join(support_models))
-        res_path = os.path.join(MODEL_HOME, tag)
-        decompressed_path = download_and_decompress(pretrained_models[tag],
-                                                    res_path)
-        decompressed_path = os.path.abspath(decompressed_path)
-        logger.info(
-            "Use pretrained model stored in: {}".format(decompressed_path))
-        return decompressed_path
    def _set_kaldi_bins(self) -> os.PathLike:
        """
            Download and returns kaldi_bins resources path of current task.
        """
-        decompressed_path = download_and_decompress(kaldi_bins, MODEL_HOME)
+        decompressed_path = download_and_decompress(self.kaldi_bins, MODEL_HOME)
        decompressed_path = os.path.abspath(decompressed_path)
        logger.info("Kaldi_bins stored in: {}".format(decompressed_path))
        if "LD_LIBRARY_PATH" in os.environ:
@@ -197,7 +166,7 @@ class STExecutor(BaseExecutor):
        model_conf = self.config
        model_name = model_type[:model_type.rindex(
            '_')]  # model_type: {model_name}_{dataset}
-        model_class = dynamic_import(model_name, model_alias)
+        model_class = dynamic_import(model_name, self.model_alias)
        self.model = model_class.from_config(model_conf)
        self.model.eval()

--- a/paddlespeech/cli/st/pretrained_models.py
+++ b/paddlespeech/cli/st/pretrained_models.py
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+pretrained_models = {
+    "fat_st_ted-en-zh": {
+        "url":
+        "https://paddlespeech.bj.bcebos.com/s2t/ted_en_zh/st1/st1_transformer_mtl_noam_ted-en-zh_ckpt_0.1.1.model.tar.gz",
+        "md5":
+        "d62063f35a16d91210a71081bd2dd557",
+        "cfg_path":
+        "model.yaml",
+        "ckpt_path":
+        "exp/transformer_mtl_noam/checkpoints/fat_st_ted-en-zh.pdparams",
+    }
+}
+model_alias = {"fat_st": "paddlespeech.s2t.models.u2_st:U2STModel"}
+kaldi_bins = {
+    "url":
+    "https://paddlespeech.bj.bcebos.com/s2t/ted_en_zh/st1/kaldi_bins.tar.gz",
+    "md5":
+    "c0682303b3f3393dbf6ed4c4e35a53eb",
+}
--- a/paddlespeech/cli/stats/infer.py
+++ b/paddlespeech/cli/stats/infer.py
@@ -16,7 +16,6 @@ from typing import List
 from prettytable import PrettyTable
-from ..log import logger
 from ..utils import cli_register
 from ..utils import stats_wrapper
@@ -27,7 +26,8 @@ model_name_format = {
    'cls': 'Model-Sample Rate',
    'st': 'Model-Source language-Target language',
    'text': 'Model-Task-Language',
-    'tts': 'Model-Language'
+    'tts': 'Model-Language',
+    'vector': 'Model-Sample Rate'
 }
@@ -36,18 +36,18 @@ model_name_format = {
    description='Get speech tasks support models list.')
 class StatsExecutor():
    def __init__(self):
-        super(StatsExecutor, self).__init__()
+        super().__init__()
        self.parser = argparse.ArgumentParser(
            prog='paddlespeech.stats', add_help=True)
+        self.task_choices = ['asr', 'cls', 'st', 'text', 'tts', 'vector']
        self.parser.add_argument(
            '--task',
            type=str,
            default='asr',
-            choices=['asr', 'cls', 'st', 'text', 'tts'],
+            choices=self.task_choices,
            help='Choose speech task.',
            required=True)
-        self.task_choices = ['asr', 'cls', 'st', 'text', 'tts']
    def show_support_models(self, pretrained_models: dict):
        fields = model_name_format[self.task].split("-")
@@ -61,73 +61,15 @@ class StatsExecutor():
            Command line entry.
        """
        parser_args = self.parser.parse_args(argv)
-        self.task = parser_args.task
+        has_exceptions = False
-        if self.task not in self.task_choices:
+        try:
-            logger.error(
+            self(parser_args.task)
-                "Please input correct speech task, choices = ['asr', 'cls', 'st', 'text', 'tts']"
+        except Exception as e:
-            )
+            has_exceptions = True
+        if has_exceptions:
            return False
+        else:
-        elif self.task == 'asr':
+            return True
-            try:
-                from ..asr.infer import pretrained_models
-                logger.info(
-                    "Here is the list of ASR pretrained models released by PaddleSpeech that can be used by command line and python API"
-                )
-                self.show_support_models(pretrained_models)
-                return True
-            except BaseException:
-                logger.error("Failed to get the list of ASR pretrained models.")
-                return False
-        elif self.task == 'cls':
-            try:
-                from ..cls.infer import pretrained_models
-                logger.info(
-                    "Here is the list of CLS pretrained models released by PaddleSpeech that can be used by command line and python API"
-                )
-                self.show_support_models(pretrained_models)
-                return True
-            except BaseException:
-                logger.error("Failed to get the list of CLS pretrained models.")
-                return False
-        elif self.task == 'st':
-            try:
-                from ..st.infer import pretrained_models
-                logger.info(
-                    "Here is the list of ST pretrained models released by PaddleSpeech that can be used by command line and python API"
-                )
-                self.show_support_models(pretrained_models)
-                return True
-            except BaseException:
-                logger.error("Failed to get the list of ST pretrained models.")
-                return False
-        elif self.task == 'text':
-            try:
-                from ..text.infer import pretrained_models
-                logger.info(
-                    "Here is the list of TEXT pretrained models released by PaddleSpeech that can be used by command line and python API"
-                )
-                self.show_support_models(pretrained_models)
-                return True
-            except BaseException:
-                logger.error(
-                    "Failed to get the list of TEXT pretrained models.")
-                return False
-        elif self.task == 'tts':
-            try:
-                from ..tts.infer import pretrained_models
-                logger.info(
-                    "Here is the list of TTS pretrained models released by PaddleSpeech that can be used by command line and python API"
-                )
-                self.show_support_models(pretrained_models)
-                return True
-            except BaseException:
-                logger.error("Failed to get the list of TTS pretrained models.")
-                return False
    @stats_wrapper
    def __call__(
@@ -138,13 +80,12 @@ class StatsExecutor():
        """
        self.task = task
        if self.task not in self.task_choices:
-            print(
+            print("Please input correct speech task, choices = " + str(
-                "Please input correct speech task, choices = ['asr', 'cls', 'st', 'text', 'tts']"
+                self.task_choices))
-            )
        elif self.task == 'asr':
            try:
-                from ..asr.infer import pretrained_models
+                from ..asr.pretrained_models import pretrained_models
                print(
                    "Here is the list of ASR pretrained models released by PaddleSpeech that can be used by command line and python API"
                )
@@ -154,7 +95,7 @@ class StatsExecutor():
        elif self.task == 'cls':
            try:
-                from ..cls.infer import pretrained_models
+                from ..cls.pretrained_models import pretrained_models
                print(
                    "Here is the list of CLS pretrained models released by PaddleSpeech that can be used by command line and python API"
                )
@@ -164,7 +105,7 @@ class StatsExecutor():
        elif self.task == 'st':
            try:
-                from ..st.infer import pretrained_models
+                from ..st.pretrained_models import pretrained_models
                print(
                    "Here is the list of ST pretrained models released by PaddleSpeech that can be used by command line and python API"
                )
@@ -174,7 +115,7 @@ class StatsExecutor():
        elif self.task == 'text':
            try:
-                from ..text.infer import pretrained_models
+                from ..text.pretrained_models import pretrained_models
                print(
                    "Here is the list of TEXT pretrained models released by PaddleSpeech that can be used by command line and python API"
                )
@@ -184,10 +125,22 @@ class StatsExecutor():
        elif self.task == 'tts':
            try:
-                from ..tts.infer import pretrained_models
+                from ..tts.pretrained_models import pretrained_models
                print(
                    "Here is the list of TTS pretrained models released by PaddleSpeech that can be used by command line and python API"
                )
                self.show_support_models(pretrained_models)
            except BaseException:
                print("Failed to get the list of TTS pretrained models.")
+        elif self.task == 'vector':
+            try:
+                from ..vector.pretrained_models import pretrained_models
+                print(
+                    "Here is the list of Speaker Recognition pretrained models released by PaddleSpeech that can be used by command line and python API"
+                )
+                self.show_support_models(pretrained_models)
+            except BaseException:
+                print(
+                    "Failed to get the list of Speaker Recognition pretrained models."
+                )
--- a/paddlespeech/cli/text/infer.py
+++ b/paddlespeech/cli/text/infer.py
@@ -25,58 +25,21 @@ from ...s2t.utils.dynamic_import import dynamic_import
 from ..executor import BaseExecutor
 from ..log import logger
 from ..utils import cli_register
-from ..utils import download_and_decompress
-from ..utils import MODEL_HOME
 from ..utils import stats_wrapper
+from .pretrained_models import model_alias
+from .pretrained_models import pretrained_models
+from .pretrained_models import tokenizer_alias
 __all__ = ['TextExecutor']
-pretrained_models = {
-    # The tags for pretrained_models should be "{model_name}[_{dataset}][-{lang}][-...]".
-    # e.g. "conformer_wenetspeech-zh-16k", "transformer_aishell-zh-16k" and "panns_cnn6-32k".
-    # Command line and python api use "{model_name}[_{dataset}]" as --model, usage:
-    # "paddlespeech asr --model conformer_wenetspeech --lang zh --sr 16000 --input ./input.wav"
-    "ernie_linear_p7_wudao-punc-zh": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/text/ernie_linear_p7_wudao-punc-zh.tar.gz',
-        'md5':
-        '12283e2ddde1797c5d1e57036b512746',
-        'cfg_path':
-        'ckpt/model_config.json',
-        'ckpt_path':
-        'ckpt/model_state.pdparams',
-        'vocab_file':
-        'punc_vocab.txt',
-    },
-    "ernie_linear_p3_wudao-punc-zh": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/text/ernie_linear_p3_wudao-punc-zh.tar.gz',
-        'md5':
-        '448eb2fdf85b6a997e7e652e80c51dd2',
-        'cfg_path':
-        'ckpt/model_config.json',
-        'ckpt_path':
-        'ckpt/model_state.pdparams',
-        'vocab_file':
-        'punc_vocab.txt',
-    },
-}
-model_alias = {
-    "ernie_linear_p7": "paddlespeech.text.models:ErnieLinear",
-    "ernie_linear_p3": "paddlespeech.text.models:ErnieLinear",
-}
-tokenizer_alias = {
-    "ernie_linear_p7": "paddlenlp.transformers:ErnieTokenizer",
-    "ernie_linear_p3": "paddlenlp.transformers:ErnieTokenizer",
-}
 @cli_register(name='paddlespeech.text', description='Text infer command.')
 class TextExecutor(BaseExecutor):
    def __init__(self):
-        super(TextExecutor, self).__init__()
+        super().__init__()
+        self.model_alias = model_alias
+        self.pretrained_models = pretrained_models
+        self.tokenizer_alias = tokenizer_alias
        self.parser = argparse.ArgumentParser(
            prog='paddlespeech.text', add_help=True)
@@ -92,7 +55,9 @@ class TextExecutor(BaseExecutor):
            '--model',
            type=str,
            default='ernie_linear_p7_wudao',
-            choices=[tag[:tag.index('-')] for tag in pretrained_models.keys()],
+            choices=[
+                tag[:tag.index('-')] for tag in self.pretrained_models.keys()
+            ],
            help='Choose model type of text task.')
        self.parser.add_argument(
            '--lang',
@@ -131,23 +96,6 @@ class TextExecutor(BaseExecutor):
            action='store_true',
            help='Increase logger verbosity of current task.')
-    def _get_pretrained_path(self, tag: str) -> os.PathLike:
-        """
-            Download and returns pretrained resources path of current task.
-        """
-        support_models = list(pretrained_models.keys())
-        assert tag in pretrained_models, 'The model "{}" you want to use has not been supported, please choose other models.\nThe support models includes:\n\t\t{}\n'.format(
-            tag, '\n\t\t'.join(support_models))
-        res_path = os.path.join(MODEL_HOME, tag)
-        decompressed_path = download_and_decompress(pretrained_models[tag],
-                                                    res_path)
-        decompressed_path = os.path.abspath(decompressed_path)
-        logger.info(
-            'Use pretrained model stored in: {}'.format(decompressed_path))
-        return decompressed_path
    def _init_from_path(self,
                        task: str='punc',
                        model_type: str='ernie_linear_p7_wudao',
@@ -167,12 +115,12 @@ class TextExecutor(BaseExecutor):
        if cfg_path is None or ckpt_path is None or vocab_file is None:
            tag = '-'.join([model_type, task, lang])
            self.res_path = self._get_pretrained_path(tag)
-            self.cfg_path = os.path.join(self.res_path,
+            self.cfg_path = os.path.join(
-                                         pretrained_models[tag]['cfg_path'])
+                self.res_path, self.pretrained_models[tag]['cfg_path'])
-            self.ckpt_path = os.path.join(self.res_path,
+            self.ckpt_path = os.path.join(
-                                          pretrained_models[tag]['ckpt_path'])
+                self.res_path, self.pretrained_models[tag]['ckpt_path'])
-            self.vocab_file = os.path.join(self.res_path,
+            self.vocab_file = os.path.join(
-                                           pretrained_models[tag]['vocab_file'])
+                self.res_path, self.pretrained_models[tag]['vocab_file'])
        else:
            self.cfg_path = os.path.abspath(cfg_path)
            self.ckpt_path = os.path.abspath(ckpt_path)
@@ -187,8 +135,8 @@ class TextExecutor(BaseExecutor):
                    self._punc_list.append(line.strip())
            # model
-            model_class = dynamic_import(model_name, model_alias)
+            model_class = dynamic_import(model_name, self.model_alias)
-            tokenizer_class = dynamic_import(model_name, tokenizer_alias)
+            tokenizer_class = dynamic_import(model_name, self.tokenizer_alias)
            self.model = model_class(
                cfg_path=self.cfg_path, ckpt_path=self.ckpt_path)
            self.tokenizer = tokenizer_class.from_pretrained('ernie-1.0')

--- a/paddlespeech/cli/text/pretrained_models.py
+++ b/paddlespeech/cli/text/pretrained_models.py
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+pretrained_models = {
+    # The tags for pretrained_models of the text task should be "{model_name}[_{dataset}]-{task}-{lang}",
+    # e.g. "ernie_linear_p7_wudao-punc-zh" (model "ernie_linear_p7", dataset "wudao", task "punc", lang "zh").
+    # Command line and python api use "{model_name}[_{dataset}]" (the tag part before the first "-") as --model, usage:
+    # "paddlespeech text --model ernie_linear_p7_wudao --lang zh ..."
+    "ernie_linear_p7_wudao-punc-zh": {
+        'url':
+        'https://paddlespeech.bj.bcebos.com/text/ernie_linear_p7_wudao-punc-zh.tar.gz',
+        'md5':
+        '12283e2ddde1797c5d1e57036b512746',
+        'cfg_path':
+        'ckpt/model_config.json',
+        'ckpt_path':
+        'ckpt/model_state.pdparams',
+        'vocab_file':
+        'punc_vocab.txt',
+    },
+    "ernie_linear_p3_wudao-punc-zh": {
+        'url':
+        'https://paddlespeech.bj.bcebos.com/text/ernie_linear_p3_wudao-punc-zh.tar.gz',
+        'md5':
+        '448eb2fdf85b6a997e7e652e80c51dd2',
+        'cfg_path':
+        'ckpt/model_config.json',
+        'ckpt_path':
+        'ckpt/model_state.pdparams',
+        'vocab_file':
+        'punc_vocab.txt',
+    },
+}
+model_alias = {
+    "ernie_linear_p7": "paddlespeech.text.models:ErnieLinear",
+    "ernie_linear_p3": "paddlespeech.text.models:ErnieLinear",
+}
+tokenizer_alias = {
+    "ernie_linear_p7": "paddlenlp.transformers:ErnieTokenizer",
+    "ernie_linear_p3": "paddlenlp.transformers:ErnieTokenizer",
+}
--- a/paddlespeech/cli/tts/infer.py
+++ b/paddlespeech/cli/tts/infer.py
@@ -29,9 +29,9 @@ from yacs.config import CfgNode
 from ..executor import BaseExecutor
 from ..log import logger
 from ..utils import cli_register
-from ..utils import download_and_decompress
-from ..utils import MODEL_HOME
 from ..utils import stats_wrapper
+from .pretrained_models import model_alias
+from .pretrained_models import pretrained_models
 from paddlespeech.s2t.utils.dynamic_import import dynamic_import
 from paddlespeech.t2s.frontend import English
 from paddlespeech.t2s.frontend.zh_frontend import Frontend
@@ -39,299 +39,14 @@ from paddlespeech.t2s.modules.normalizer import ZScore
 __all__ = ['TTSExecutor']
-pretrained_models = {
-    # speedyspeech
-    "speedyspeech_csmsc-zh": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_csmsc_ckpt_0.2.0.zip',
-        'md5':
-        '6f6fa967b408454b6662c8c00c0027cb',
-        'config':
-        'default.yaml',
-        'ckpt':
-        'snapshot_iter_30600.pdz',
-        'speech_stats':
-        'feats_stats.npy',
-        'phones_dict':
-        'phone_id_map.txt',
-        'tones_dict':
-        'tone_id_map.txt',
-    },
-    # fastspeech2
-    "fastspeech2_csmsc-zh": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip',
-        'md5':
-        '637d28a5e53aa60275612ba4393d5f22',
-        'config':
-        'default.yaml',
-        'ckpt':
-        'snapshot_iter_76000.pdz',
-        'speech_stats':
-        'speech_stats.npy',
-        'phones_dict':
-        'phone_id_map.txt',
-    },
-    "fastspeech2_ljspeech-en": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_ljspeech_ckpt_0.5.zip',
-        'md5':
-        'ffed800c93deaf16ca9b3af89bfcd747',
-        'config':
-        'default.yaml',
-        'ckpt':
-        'snapshot_iter_100000.pdz',
-        'speech_stats':
-        'speech_stats.npy',
-        'phones_dict':
-        'phone_id_map.txt',
-    },
-    "fastspeech2_aishell3-zh": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_aishell3_ckpt_0.4.zip',
-        'md5':
-        'f4dd4a5f49a4552b77981f544ab3392e',
-        'config':
-        'default.yaml',
-        'ckpt':
-        'snapshot_iter_96400.pdz',
-        'speech_stats':
-        'speech_stats.npy',
-        'phones_dict':
-        'phone_id_map.txt',
-        'speaker_dict':
-        'speaker_id_map.txt',
-    },
-    "fastspeech2_vctk-en": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_vctk_ckpt_0.5.zip',
-        'md5':
-        '743e5024ca1e17a88c5c271db9779ba4',
-        'config':
-        'default.yaml',
-        'ckpt':
-        'snapshot_iter_66200.pdz',
-        'speech_stats':
-        'speech_stats.npy',
-        'phones_dict':
-        'phone_id_map.txt',
-        'speaker_dict':
-        'speaker_id_map.txt',
-    },
-    # tacotron2
-    "tacotron2_csmsc-zh": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_csmsc_ckpt_0.2.0.zip',
-        'md5':
-        '0df4b6f0bcbe0d73c5ed6df8867ab91a',
-        'config':
-        'default.yaml',
-        'ckpt':
-        'snapshot_iter_30600.pdz',
-        'speech_stats':
-        'speech_stats.npy',
-        'phones_dict':
-        'phone_id_map.txt',
-    },
-    "tacotron2_ljspeech-en": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_ljspeech_ckpt_0.2.0.zip',
-        'md5':
-        '6a5eddd81ae0e81d16959b97481135f3',
-        'config':
-        'default.yaml',
-        'ckpt':
-        'snapshot_iter_60300.pdz',
-        'speech_stats':
-        'speech_stats.npy',
-        'phones_dict':
-        'phone_id_map.txt',
-    },
-    # pwgan
-    "pwgan_csmsc-zh": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip',
-        'md5':
-        '2e481633325b5bdf0a3823c714d2c117',
-        'config':
-        'pwg_default.yaml',
-        'ckpt':
-        'pwg_snapshot_iter_400000.pdz',
-        'speech_stats':
-        'pwg_stats.npy',
-    },
-    "pwgan_ljspeech-en": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_ljspeech_ckpt_0.5.zip',
-        'md5':
-        '53610ba9708fd3008ccaf8e99dacbaf0',
-        'config':
-        'pwg_default.yaml',
-        'ckpt':
-        'pwg_snapshot_iter_400000.pdz',
-        'speech_stats':
-        'pwg_stats.npy',
-    },
-    "pwgan_aishell3-zh": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_aishell3_ckpt_0.5.zip',
-        'md5':
-        'd7598fa41ad362d62f85ffc0f07e3d84',
-        'config':
-        'default.yaml',
-        'ckpt':
-        'snapshot_iter_1000000.pdz',
-        'speech_stats':
-        'feats_stats.npy',
-    },
-    "pwgan_vctk-en": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_vctk_ckpt_0.1.1.zip',
-        'md5':
-        'b3da1defcde3e578be71eb284cb89f2c',
-        'config':
-        'default.yaml',
-        'ckpt':
-        'snapshot_iter_1500000.pdz',
-        'speech_stats':
-        'feats_stats.npy',
-    },
-    # mb_melgan
-    "mb_melgan_csmsc-zh": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_ckpt_0.1.1.zip',
-        'md5':
-        'ee5f0604e20091f0d495b6ec4618b90d',
-        'config':
-        'default.yaml',
-        'ckpt':
-        'snapshot_iter_1000000.pdz',
-        'speech_stats':
-        'feats_stats.npy',
-    },
-    # style_melgan
-    "style_melgan_csmsc-zh": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/style_melgan/style_melgan_csmsc_ckpt_0.1.1.zip',
-        'md5':
-        '5de2d5348f396de0c966926b8c462755',
-        'config':
-        'default.yaml',
-        'ckpt':
-        'snapshot_iter_1500000.pdz',
-        'speech_stats':
-        'feats_stats.npy',
-    },
-    # hifigan
-    "hifigan_csmsc-zh": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_ckpt_0.1.1.zip',
-        'md5':
-        'dd40a3d88dfcf64513fba2f0f961ada6',
-        'config':
-        'default.yaml',
-        'ckpt':
-        'snapshot_iter_2500000.pdz',
-        'speech_stats':
-        'feats_stats.npy',
-    },
-    "hifigan_ljspeech-en": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_ljspeech_ckpt_0.2.0.zip',
-        'md5':
-        '70e9131695decbca06a65fe51ed38a72',
-        'config':
-        'default.yaml',
-        'ckpt':
-        'snapshot_iter_2500000.pdz',
-        'speech_stats':
-        'feats_stats.npy',
-    },
-    "hifigan_aishell3-zh": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_aishell3_ckpt_0.2.0.zip',
-        'md5':
-        '3bb49bc75032ed12f79c00c8cc79a09a',
-        'config':
-        'default.yaml',
-        'ckpt':
-        'snapshot_iter_2500000.pdz',
-        'speech_stats':
-        'feats_stats.npy',
-    },
-    "hifigan_vctk-en": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_vctk_ckpt_0.2.0.zip',
-        'md5':
-        '7da8f88359bca2457e705d924cf27bd4',
-        'config':
-        'default.yaml',
-        'ckpt':
-        'snapshot_iter_2500000.pdz',
-        'speech_stats':
-        'feats_stats.npy',
-    },
-    # wavernn
-    "wavernn_csmsc-zh": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/wavernn/wavernn_csmsc_ckpt_0.2.0.zip',
-        'md5':
-        'ee37b752f09bcba8f2af3b777ca38e13',
-        'config':
-        'default.yaml',
-        'ckpt':
-        'snapshot_iter_400000.pdz',
-        'speech_stats':
-        'feats_stats.npy',
-    }
-}
-model_alias = {
-    # acoustic model
-    "speedyspeech":
-    "paddlespeech.t2s.models.speedyspeech:SpeedySpeech",
-    "speedyspeech_inference":
-    "paddlespeech.t2s.models.speedyspeech:SpeedySpeechInference",
-    "fastspeech2":
-    "paddlespeech.t2s.models.fastspeech2:FastSpeech2",
-    "fastspeech2_inference":
-    "paddlespeech.t2s.models.fastspeech2:FastSpeech2Inference",
-    "tacotron2":
-    "paddlespeech.t2s.models.tacotron2:Tacotron2",
-    "tacotron2_inference":
-    "paddlespeech.t2s.models.tacotron2:Tacotron2Inference",
-    # voc
-    "pwgan":
-    "paddlespeech.t2s.models.parallel_wavegan:PWGGenerator",
-    "pwgan_inference":
-    "paddlespeech.t2s.models.parallel_wavegan:PWGInference",
-    "mb_melgan":
-    "paddlespeech.t2s.models.melgan:MelGANGenerator",
-    "mb_melgan_inference":
-    "paddlespeech.t2s.models.melgan:MelGANInference",
-    "style_melgan":
-    "paddlespeech.t2s.models.melgan:StyleMelGANGenerator",
-    "style_melgan_inference":
-    "paddlespeech.t2s.models.melgan:StyleMelGANInference",
-    "hifigan":
-    "paddlespeech.t2s.models.hifigan:HiFiGANGenerator",
-    "hifigan_inference":
-    "paddlespeech.t2s.models.hifigan:HiFiGANInference",
-    "wavernn":
-    "paddlespeech.t2s.models.wavernn:WaveRNN",
-    "wavernn_inference":
-    "paddlespeech.t2s.models.wavernn:WaveRNNInference",
-}
 @cli_register(
    name='paddlespeech.tts', description='Text to Speech infer command.')
 class TTSExecutor(BaseExecutor):
    def __init__(self):
        super().__init__()
+        self.model_alias = model_alias
+        self.pretrained_models = pretrained_models
        self.parser = argparse.ArgumentParser(
            prog='paddlespeech.tts', add_help=True)
@@ -449,22 +164,6 @@ class TTSExecutor(BaseExecutor):
            action='store_true',
            help='Increase logger verbosity of current task.')
-    def _get_pretrained_path(self, tag: str) -> os.PathLike:
-        """
-        Download and returns pretrained resources path of current task.
-        """
-        support_models = list(pretrained_models.keys())
-        assert tag in pretrained_models, 'The model "{}" you want to use has not been supported, please choose other models.\nThe support models includes:\n\t\t{}\n'.format(
-            tag, '\n\t\t'.join(support_models))
-        res_path = os.path.join(MODEL_HOME, tag)
-        decompressed_path = download_and_decompress(pretrained_models[tag],
-                                                    res_path)
-        decompressed_path = os.path.abspath(decompressed_path)
-        logger.info(
-            'Use pretrained model stored in: {}'.format(decompressed_path))
-        return decompressed_path
    def _init_from_path(
            self,
            am: str='fastspeech2_csmsc',
@@ -490,16 +189,15 @@ class TTSExecutor(BaseExecutor):
        if am_ckpt is None or am_config is None or am_stat is None or phones_dict is None:
            am_res_path = self._get_pretrained_path(am_tag)
            self.am_res_path = am_res_path
-            self.am_config = os.path.join(am_res_path,
+            self.am_config = os.path.join(
-                                          pretrained_models[am_tag]['config'])
+                am_res_path, self.pretrained_models[am_tag]['config'])
            self.am_ckpt = os.path.join(am_res_path,
-                                        pretrained_models[am_tag]['ckpt'])
+                                        self.pretrained_models[am_tag]['ckpt'])
            self.am_stat = os.path.join(
-                am_res_path, pretrained_models[am_tag]['speech_stats'])
+                am_res_path, self.pretrained_models[am_tag]['speech_stats'])
            # must have phones_dict in acoustic
            self.phones_dict = os.path.join(
-                am_res_path, pretrained_models[am_tag]['phones_dict'])
+                am_res_path, self.pretrained_models[am_tag]['phones_dict'])
-            print("self.phones_dict:", self.phones_dict)
            logger.info(am_res_path)
            logger.info(self.am_config)
            logger.info(self.am_ckpt)
@@ -509,21 +207,20 @@ class TTSExecutor(BaseExecutor):
            self.am_stat = os.path.abspath(am_stat)
            self.phones_dict = os.path.abspath(phones_dict)
            self.am_res_path = os.path.dirname(os.path.abspath(self.am_config))
-        print("self.phones_dict:", self.phones_dict)
        # for speedyspeech
        self.tones_dict = None
-        if 'tones_dict' in pretrained_models[am_tag]:
+        if 'tones_dict' in self.pretrained_models[am_tag]:
            self.tones_dict = os.path.join(
-                am_res_path, pretrained_models[am_tag]['tones_dict'])
+                self.am_res_path, self.pretrained_models[am_tag]['tones_dict'])
            if tones_dict:
                self.tones_dict = tones_dict
        # for multi speaker fastspeech2
        self.speaker_dict = None
-        if 'speaker_dict' in pretrained_models[am_tag]:
+        if 'speaker_dict' in self.pretrained_models[am_tag]:
            self.speaker_dict = os.path.join(
-                am_res_path, pretrained_models[am_tag]['speaker_dict'])
+                self.am_res_path, self.pretrained_models[am_tag]['speaker_dict'])
            if speaker_dict:
                self.speaker_dict = speaker_dict
@@ -532,12 +229,12 @@ class TTSExecutor(BaseExecutor):
        if voc_ckpt is None or voc_config is None or voc_stat is None:
            voc_res_path = self._get_pretrained_path(voc_tag)
            self.voc_res_path = voc_res_path
-            self.voc_config = os.path.join(voc_res_path,
+            self.voc_config = os.path.join(
-                                           pretrained_models[voc_tag]['config'])
+                voc_res_path, self.pretrained_models[voc_tag]['config'])
-            self.voc_ckpt = os.path.join(voc_res_path,
+            self.voc_ckpt = os.path.join(
-                                         pretrained_models[voc_tag]['ckpt'])
+                voc_res_path, self.pretrained_models[voc_tag]['ckpt'])
            self.voc_stat = os.path.join(
-                voc_res_path, pretrained_models[voc_tag]['speech_stats'])
+                voc_res_path, self.pretrained_models[voc_tag]['speech_stats'])
            logger.info(voc_res_path)
            logger.info(self.voc_config)
            logger.info(self.voc_ckpt)
@@ -588,8 +285,9 @@ class TTSExecutor(BaseExecutor):
        # model: {model_name}_{dataset}
        am_name = am[:am.rindex('_')]
-        am_class = dynamic_import(am_name, model_alias)
+        am_class = dynamic_import(am_name, self.model_alias)
-        am_inference_class = dynamic_import(am_name + '_inference', model_alias)
+        am_inference_class = dynamic_import(am_name + '_inference',
+                                            self.model_alias)
        if am_name == 'fastspeech2':
            am = am_class(
@@ -618,9 +316,9 @@ class TTSExecutor(BaseExecutor):
        # vocoder
        # model: {model_name}_{dataset}
        voc_name = voc[:voc.rindex('_')]
-        voc_class = dynamic_import(voc_name, model_alias)
+        voc_class = dynamic_import(voc_name, self.model_alias)
        voc_inference_class = dynamic_import(voc_name + '_inference',
-                                             model_alias)
+                                             self.model_alias)
        if voc_name != 'wavernn':
            voc = voc_class(**self.voc_config["generator_params"])
            voc.set_state_dict(paddle.load(self.voc_ckpt)["generator_params"])
@@ -735,7 +433,6 @@ class TTSExecutor(BaseExecutor):
        am_ckpt = args.am_ckpt
        am_stat = args.am_stat
        phones_dict = args.phones_dict
-        print("phones_dict:", phones_dict)
        tones_dict = args.tones_dict
        speaker_dict = args.speaker_dict
        voc = args.voc

--- a/paddlespeech/cli/tts/pretrained_models.py
+++ b/paddlespeech/cli/tts/pretrained_models.py
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+pretrained_models = {
+    # speedyspeech
+    "speedyspeech_csmsc-zh": {
+        'url':
+        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_csmsc_ckpt_0.2.0.zip',
+        'md5':
+        '6f6fa967b408454b6662c8c00c0027cb',
+        'config':
+        'default.yaml',
+        'ckpt':
+        'snapshot_iter_30600.pdz',
+        'speech_stats':
+        'feats_stats.npy',
+        'phones_dict':
+        'phone_id_map.txt',
+        'tones_dict':
+        'tone_id_map.txt',
+    },
+    # fastspeech2
+    "fastspeech2_csmsc-zh": {
+        'url':
+        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip',
+        'md5':
+        '637d28a5e53aa60275612ba4393d5f22',
+        'config':
+        'default.yaml',
+        'ckpt':
+        'snapshot_iter_76000.pdz',
+        'speech_stats':
+        'speech_stats.npy',
+        'phones_dict':
+        'phone_id_map.txt',
+    },
+    "fastspeech2_ljspeech-en": {
+        'url':
+        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_ljspeech_ckpt_0.5.zip',
+        'md5':
+        'ffed800c93deaf16ca9b3af89bfcd747',
+        'config':
+        'default.yaml',
+        'ckpt':
+        'snapshot_iter_100000.pdz',
+        'speech_stats':
+        'speech_stats.npy',
+        'phones_dict':
+        'phone_id_map.txt',
+    },
+    "fastspeech2_aishell3-zh": {
+        'url':
+        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_aishell3_ckpt_0.4.zip',
+        'md5':
+        'f4dd4a5f49a4552b77981f544ab3392e',
+        'config':
+        'default.yaml',
+        'ckpt':
+        'snapshot_iter_96400.pdz',
+        'speech_stats':
+        'speech_stats.npy',
+        'phones_dict':
+        'phone_id_map.txt',
+        'speaker_dict':
+        'speaker_id_map.txt',
+    },
+    "fastspeech2_vctk-en": {
+        'url':
+        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_vctk_ckpt_0.5.zip',
+        'md5':
+        '743e5024ca1e17a88c5c271db9779ba4',
+        'config':
+        'default.yaml',
+        'ckpt':
+        'snapshot_iter_66200.pdz',
+        'speech_stats':
+        'speech_stats.npy',
+        'phones_dict':
+        'phone_id_map.txt',
+        'speaker_dict':
+        'speaker_id_map.txt',
+    },
+    # tacotron2
+    "tacotron2_csmsc-zh": {
+        'url':
+        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_csmsc_ckpt_0.2.0.zip',
+        'md5':
+        '0df4b6f0bcbe0d73c5ed6df8867ab91a',
+        'config':
+        'default.yaml',
+        'ckpt':
+        'snapshot_iter_30600.pdz',
+        'speech_stats':
+        'speech_stats.npy',
+        'phones_dict':
+        'phone_id_map.txt',
+    },
+    "tacotron2_ljspeech-en": {
+        'url':
+        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_ljspeech_ckpt_0.2.0.zip',
+        'md5':
+        '6a5eddd81ae0e81d16959b97481135f3',
+        'config':
+        'default.yaml',
+        'ckpt':
+        'snapshot_iter_60300.pdz',
+        'speech_stats':
+        'speech_stats.npy',
+        'phones_dict':
+        'phone_id_map.txt',
+    },
+    # pwgan
+    "pwgan_csmsc-zh": {
+        'url':
+        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip',
+        'md5':
+        '2e481633325b5bdf0a3823c714d2c117',
+        'config':
+        'pwg_default.yaml',
+        'ckpt':
+        'pwg_snapshot_iter_400000.pdz',
+        'speech_stats':
+        'pwg_stats.npy',
+    },
+    "pwgan_ljspeech-en": {
+        'url':
+        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_ljspeech_ckpt_0.5.zip',
+        'md5':
+        '53610ba9708fd3008ccaf8e99dacbaf0',
+        'config':
+        'pwg_default.yaml',
+        'ckpt':
+        'pwg_snapshot_iter_400000.pdz',
+        'speech_stats':
+        'pwg_stats.npy',
+    },
+    "pwgan_aishell3-zh": {
+        'url':
+        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_aishell3_ckpt_0.5.zip',
+        'md5':
+        'd7598fa41ad362d62f85ffc0f07e3d84',
+        'config':
+        'default.yaml',
+        'ckpt':
+        'snapshot_iter_1000000.pdz',
+        'speech_stats':
+        'feats_stats.npy',
+    },
+    "pwgan_vctk-en": {
+        'url':
+        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_vctk_ckpt_0.1.1.zip',
+        'md5':
+        'b3da1defcde3e578be71eb284cb89f2c',
+        'config':
+        'default.yaml',
+        'ckpt':
+        'snapshot_iter_1500000.pdz',
+        'speech_stats':
+        'feats_stats.npy',
+    },
+    # mb_melgan
+    "mb_melgan_csmsc-zh": {
+        'url':
+        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_ckpt_0.1.1.zip',
+        'md5':
+        'ee5f0604e20091f0d495b6ec4618b90d',
+        'config':
+        'default.yaml',
+        'ckpt':
+        'snapshot_iter_1000000.pdz',
+        'speech_stats':
+        'feats_stats.npy',
+    },
+    # style_melgan
+    "style_melgan_csmsc-zh": {
+        'url':
+        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/style_melgan/style_melgan_csmsc_ckpt_0.1.1.zip',
+        'md5':
+        '5de2d5348f396de0c966926b8c462755',
+        'config':
+        'default.yaml',
+        'ckpt':
+        'snapshot_iter_1500000.pdz',
+        'speech_stats':
+        'feats_stats.npy',
+    },
+    # hifigan
+    "hifigan_csmsc-zh": {
+        'url':
+        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_ckpt_0.1.1.zip',
+        'md5':
+        'dd40a3d88dfcf64513fba2f0f961ada6',
+        'config':
+        'default.yaml',
+        'ckpt':
+        'snapshot_iter_2500000.pdz',
+        'speech_stats':
+        'feats_stats.npy',
+    },
+    "hifigan_ljspeech-en": {
+        'url':
+        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_ljspeech_ckpt_0.2.0.zip',
+        'md5':
+        '70e9131695decbca06a65fe51ed38a72',
+        'config':
+        'default.yaml',
+        'ckpt':
+        'snapshot_iter_2500000.pdz',
+        'speech_stats':
+        'feats_stats.npy',
+    },
+    "hifigan_aishell3-zh": {
+        'url':
+        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_aishell3_ckpt_0.2.0.zip',
+        'md5':
+        '3bb49bc75032ed12f79c00c8cc79a09a',
+        'config':
+        'default.yaml',
+        'ckpt':
+        'snapshot_iter_2500000.pdz',
+        'speech_stats':
+        'feats_stats.npy',
+    },
+    "hifigan_vctk-en": {
+        'url':
+        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_vctk_ckpt_0.2.0.zip',
+        'md5':
+        '7da8f88359bca2457e705d924cf27bd4',
+        'config':
+        'default.yaml',
+        'ckpt':
+        'snapshot_iter_2500000.pdz',
+        'speech_stats':
+        'feats_stats.npy',
+    },
+    # wavernn
+    "wavernn_csmsc-zh": {
+        'url':
+        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/wavernn/wavernn_csmsc_ckpt_0.2.0.zip',
+        'md5':
+        'ee37b752f09bcba8f2af3b777ca38e13',
+        'config':
+        'default.yaml',
+        'ckpt':
+        'snapshot_iter_400000.pdz',
+        'speech_stats':
+        'feats_stats.npy',
+    }
+}
+# Maps a short model name (and its "<name>_inference" counterpart) to the
+# "module.path:ClassName" string naming the class that implements it.
+# NOTE(review): presumably resolved with dynamic_import(), as the vector CLI
+# does with its own model_alias -- confirm against the caller.
+model_alias = {
+    # acoustic model
+    "speedyspeech":
+    "paddlespeech.t2s.models.speedyspeech:SpeedySpeech",
+    "speedyspeech_inference":
+    "paddlespeech.t2s.models.speedyspeech:SpeedySpeechInference",
+    "fastspeech2":
+    "paddlespeech.t2s.models.fastspeech2:FastSpeech2",
+    "fastspeech2_inference":
+    "paddlespeech.t2s.models.fastspeech2:FastSpeech2Inference",
+    "tacotron2":
+    "paddlespeech.t2s.models.tacotron2:Tacotron2",
+    "tacotron2_inference":
+    "paddlespeech.t2s.models.tacotron2:Tacotron2Inference",
+    # voc
+    "pwgan":
+    "paddlespeech.t2s.models.parallel_wavegan:PWGGenerator",
+    "pwgan_inference":
+    "paddlespeech.t2s.models.parallel_wavegan:PWGInference",
+    "mb_melgan":
+    "paddlespeech.t2s.models.melgan:MelGANGenerator",
+    "mb_melgan_inference":
+    "paddlespeech.t2s.models.melgan:MelGANInference",
+    "style_melgan":
+    "paddlespeech.t2s.models.melgan:StyleMelGANGenerator",
+    "style_melgan_inference":
+    "paddlespeech.t2s.models.melgan:StyleMelGANInference",
+    "hifigan":
+    "paddlespeech.t2s.models.hifigan:HiFiGANGenerator",
+    "hifigan_inference":
+    "paddlespeech.t2s.models.hifigan:HiFiGANInference",
+    "wavernn":
+    "paddlespeech.t2s.models.wavernn:WaveRNN",
+    "wavernn_inference":
+    "paddlespeech.t2s.models.wavernn:WaveRNNInference",
+}
--- a/paddlespeech/cli/vector/infer.py
+++ b/paddlespeech/cli/vector/infer.py
@@ -27,45 +27,24 @@ from yacs.config import CfgNode
 from ..executor import BaseExecutor
 from ..log import logger
 from ..utils import cli_register
-from ..utils import download_and_decompress
-from ..utils import MODEL_HOME
 from ..utils import stats_wrapper
+from .pretrained_models import model_alias
+from .pretrained_models import pretrained_models
 from paddleaudio.backends import load as load_audio
 from paddleaudio.compliance.librosa import melspectrogram
 from paddlespeech.s2t.utils.dynamic_import import dynamic_import
 from paddlespeech.vector.io.batch import feature_normalize
 from paddlespeech.vector.modules.sid_model import SpeakerIdetification
-pretrained_models = {
-    # The tags for pretrained_models should be "{model_name}[-{dataset}][-{sr}][-...]".
-    # e.g. "ecapatdnn_voxceleb12-16k".
-    # Command line and python api use "{model_name}[-{dataset}]" as --model, usage:
-    # "paddlespeech vector --task spk --model ecapatdnn_voxceleb12-16k --sr 16000 --input ./input.wav"
-    "ecapatdnn_voxceleb12-16k": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/vector/voxceleb/sv0_ecapa_tdnn_voxceleb12_ckpt_0_2_0.tar.gz',
-        'md5':
-        'cc33023c54ab346cd318408f43fcaf95',
-        'cfg_path':
-        'conf/model.yaml',  # the yaml config path
-        'ckpt_path':
-        'model/model',  # the format is ${dir}/{model_name}, 
-        # so the first 'model' is dir, the second 'model' is the name
-        # this means we have a model stored as model/model.pdparams
-    },
-}
-model_alias = {
-    "ecapatdnn": "paddlespeech.vector.models.ecapa_tdnn:EcapaTdnn",
-}
 @cli_register(
    name="paddlespeech.vector",
    description="Speech to vector embedding infer command.")
 class VectorExecutor(BaseExecutor):
    def __init__(self):
-        super(VectorExecutor, self).__init__()
+        super().__init__()
+        self.model_alias = model_alias
+        self.pretrained_models = pretrained_models
        self.parser = argparse.ArgumentParser(
            prog="paddlespeech.vector", add_help=True)
@@ -128,8 +107,8 @@ class VectorExecutor(BaseExecutor):
        Returns:
            bool: 
-                 False: some audio occurs error
+                False: some audio occurs error
-                 True: all audio process success
+                True: all audio process success
        """
        # stage 0: parse the args and get the required args
        parser_args = self.parser.parse_args(argv)
@@ -289,32 +268,6 @@ class VectorExecutor(BaseExecutor):
        return res
-    def _get_pretrained_path(self, tag: str) -> os.PathLike:
-        """get the neural network path from the pretrained model list
-           we stored all the pretained mode in the variable `pretrained_models`
-        Args:
-            tag (str): model tag in the pretrained model list
-        Returns:
-            os.PathLike: the downloaded pretrained model path in the disk
-        """
-        support_models = list(pretrained_models.keys())
-        assert tag in pretrained_models, \
-            'The model "{}" you want to use has not been supported,'\
-            'please choose other models.\n' \
-            'The support models includes\n\t\t{}'.format(tag, "\n\t\t".join(support_models))
-        res_path = os.path.join(MODEL_HOME, tag)
-        decompressed_path = download_and_decompress(pretrained_models[tag],
-                                                    res_path)
-        decompressed_path = os.path.abspath(decompressed_path)
-        logger.info(
-            'Use pretrained model stored in: {}'.format(decompressed_path))
-        return decompressed_path
    def _init_from_path(self,
                        model_type: str='ecapatdnn_voxceleb12',
                        sample_rate: int=16000,
@@ -350,10 +303,11 @@ class VectorExecutor(BaseExecutor):
            res_path = self._get_pretrained_path(tag)
            self.res_path = res_path
-            self.cfg_path = os.path.join(res_path,
+            self.cfg_path = os.path.join(
-                                         pretrained_models[tag]['cfg_path'])
+                res_path, self.pretrained_models[tag]['cfg_path'])
            self.ckpt_path = os.path.join(
-                res_path, pretrained_models[tag]['ckpt_path'] + '.pdparams')
+                res_path,
+                self.pretrained_models[tag]['ckpt_path'] + '.pdparams')
        else:
            # get the model from disk
            self.cfg_path = os.path.abspath(cfg_path)
@@ -373,7 +327,7 @@ class VectorExecutor(BaseExecutor):
        logger.info("start to dynamic import the model class")
        model_name = model_type[:model_type.rindex('_')]
        logger.info(f"model name {model_name}")
-        model_class = dynamic_import(model_name, model_alias)
+        model_class = dynamic_import(model_name, self.model_alias)
        model_conf = self.config.model
        backbone = model_class(**model_conf)
        model = SpeakerIdetification(

--- a/paddlespeech/cli/vector/pretrained_models.py
+++ b/paddlespeech/cli/vector/pretrained_models.py
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Registry of downloadable pretrained speaker-embedding models, keyed by tag.
+pretrained_models = {
+    # Tags have the form "{model_name}[-{dataset}][-{sr}][-...]",
+    # e.g. "ecapatdnn_voxceleb12-16k".
+    # The command line and the Python API take "{model_name}[-{dataset}]" as --model, usage:
+    # "paddlespeech vector --task spk --model ecapatdnn_voxceleb12-16k --sr 16000 --input ./input.wav"
+    # NOTE(review): the usage example above passes the full tag (with "-16k"),
+    # while VectorExecutor._init_from_path defaults model_type to
+    # 'ecapatdnn_voxceleb12' -- confirm which form --model expects.
+    "ecapatdnn_voxceleb12-16k": {
+        'url':
+        'https://paddlespeech.bj.bcebos.com/vector/voxceleb/sv0_ecapa_tdnn_voxceleb12_ckpt_0_2_0.tar.gz',
+        'md5':
+        'cc33023c54ab346cd318408f43fcaf95',
+        'cfg_path':
+        'conf/model.yaml',  # path of the yaml config, relative to the model directory
+        'ckpt_path':
+        'model/model',  # format is ${dir}/{model_name}, without file extension:
+        # the first 'model' is the directory, the second 'model' is the file stem;
+        # the executor appends '.pdparams', so the weights live in model/model.pdparams
+    },
+}
+# Maps a short model name to the "module.path:ClassName" string that
+# dynamic_import() resolves to the backbone class.
+model_alias = {
+    "ecapatdnn": "paddlespeech.vector.models.ecapa_tdnn:EcapaTdnn",
+}
--- a/paddlespeech/server/engine/tts/online/tts_engine.py
+++ b/paddlespeech/server/engine/tts/online/tts_engine.py
--- a/paddlespeech/server/utils/util.py
+++ b/paddlespeech/server/utils/util.py
@@ -52,6 +52,10 @@ def get_chunks(data, block_size, pad_size, step):
    Returns:
        list: chunks list
    """
+    if block_size == -1:
+        return [data]
    if step == "am":
        data_len = data.shape[1]
    elif step == "voc":

--- a/paddlespeech/vector/cluster/diarization.py
+++ b/paddlespeech/vector/cluster/diarization.py
-# Copyright (c) 2022 SpeechBrain Authors. All Rights Reserved.
+# Copyright (c) 2022 PaddlePaddle and SpeechBrain Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -18,12 +18,14 @@ This script has an optional dependency on open source sklearn library.
 A few sklearn functions are modified in this script as per requirement.
 """
 import argparse
+import copy
 import warnings
 from distutils.util import strtobool
 import numpy as np
 import scipy
 import sklearn
+from scipy import linalg
 from scipy import sparse
 from scipy.sparse.csgraph import connected_components
 from scipy.sparse.csgraph import laplacian as csgraph_laplacian
@@ -346,6 +348,8 @@ class EmbeddingMeta:
    ---------
    segset : list
        List of session IDs as an array of strings.
+    modelset : list
+        List of model IDs as an array of strings.
    stats : tensor
        An ndarray of float64. Each line contains embedding
        from the corresponding session.
@@ -354,15 +358,20 @@ class EmbeddingMeta:
    def __init__(
            self,
            segset=None,
+            modelset=None,
            stats=None, ):
+        # When segset is None, modelset and stats are ignored and empty arrays are used.
        if segset is None:
-            self.segset = numpy.empty(0, dtype="|O")
+            self.segset = np.empty(0, dtype="|O")
-            self.stats = numpy.array([], dtype=np.float64)
+            self.modelset = np.empty(0, dtype="|O")
+            self.stats = np.array([], dtype=np.float64)
        else:
            self.segset = segset
+            self.modelset = modelset
            self.stats = stats
+        # Zero-order statistics: one [1.0] row for every row of self.stats.
+        self.stat0 = np.array([[1.0]] * self.stats.shape[0])
    def norm_stats(self):
        """
        Divide all first-order statistics by their Euclidean norm.
@@ -371,6 +380,188 @@ class EmbeddingMeta:
        vect_norm = np.clip(np.linalg.norm(self.stats, axis=1), 1e-08, np.inf)
        self.stats = (self.stats.transpose() / vect_norm).transpose()
+    def get_mean_stats(self):
+        """
+        Average the first-order statistics along axis 0 and return the result.
+        """
+        return np.mean(self.stats, axis=0)
+    def get_total_covariance_stats(self):
+        """
+        Return the total covariance matrix of the first-order statistics:
+        the mean-centered statistics, transposed and multiplied by themselves,
+        divided by the number of rows.
+        """
+        n_rows = self.stats.shape[0]
+        centered = self.stats - self.stats.mean(axis=0)
+        return np.dot(centered.transpose(), centered) / n_rows
+    def get_model_stat0(self, mod_id):
+        """Select the zero-order statistics rows that belong to one model.
+        Arguments
+        ---------
+        mod_id : str
+            ID of the model whose stat0 rows are returned.
+        """
+        belongs_to_model = self.modelset == mod_id
+        return self.stat0[belongs_to_model, :]
+    def get_model_stats(self, mod_id):
+        """Select the first-order statistics rows that belong to one model.
+        Arguments
+        ---------
+        mod_id : str
+            ID of the model whose first-order statistics rows are returned.
+        """
+        belongs_to_model = self.modelset == mod_id
+        return self.stats[belongs_to_model, :]
+    def sum_stat_per_model(self):
+        """
+        Accumulate the zero- and first-order statistics of every model into a
+        fresh EmbeddingMeta (one row per unique model ID).
+        Returns the new EmbeddingMeta together with a numpy array holding the
+        number of sessions (rows) that were summed for each model.
+        """
+        unique_models = np.unique(self.modelset)  # unique model (speaker) ids
+        n_models = unique_models.shape[0]
+        summed = EmbeddingMeta()
+        summed.modelset = unique_models
+        summed.segset = copy.deepcopy(unique_models)
+        summed.stat0 = np.zeros(
+            (n_models, self.stat0.shape[1]), dtype=np.float64)
+        summed.stats = np.zeros(
+            (n_models, self.stats.shape[1]), dtype=np.float64)
+        session_per_model = np.zeros(n_models)
+        # Fill one output row per model.
+        for row, model_id in enumerate(unique_models):
+            model_stats = self.get_model_stats(model_id)
+            summed.stat0[row, :] = self.get_model_stat0(model_id).sum(axis=0)
+            summed.stats[row, :] = model_stats.sum(axis=0)
+            session_per_model[row] += model_stats.shape[0]
+        return summed, session_per_model
+    def center_stats(self, mu):
+        """
+        Center first order statistics.
+        Subtracts `mu`, weighted by the matching zero-order statistic, from
+        every row of the first-order statistics (in place on self.stats).
+        Arguments
+        ---------
+        mu : array
+            Array to center on.
+        """
+        # Integer division: np.repeat requires an integer repeat count, and a
+        # float from "/" is rejected by NumPy's safe-casting rule.
+        dim = self.stats.shape[1] // self.stat0.shape[1]
+        # Expand each stat0 column so it lines up with its `dim` stats columns.
+        index_map = np.repeat(np.arange(self.stat0.shape[1]), dim)
+        self.stats = self.stats - (self.stat0[:, index_map] *
+                                   mu.astype(np.float64))
+    def rotate_stats(self, R):
+        """
+        Replace the first-order statistics by their right-product with R.
+        Arguments
+        ---------
+        R : ndarray
+            Matrix that the first-order statistics are right-multiplied by.
+        """
+        rotated = np.dot(self.stats, R)
+        self.stats = rotated
+    def whiten_stats(self, mu, sigma, isSqrInvSigma=False):
+        """
+        Whiten first-order statistics
+        If sigma.ndim == 1, case of a diagonal covariance.
+        If sigma.ndim == 2, case of a single Gaussian with full covariance.
+        If sigma.ndim == 3, case of a full covariance UBM.
+        Arguments
+        ---------
+        mu : array
+            Mean vector to be subtracted from the statistics.
+        sigma : ndarray
+            Co-variance matrix or covariance super-vector.
+        isSqrInvSigma : bool
+            True if the input Sigma matrix is the inverse of the square root of a covariance matrix.
+            Only consulted when sigma.ndim == 2.
+        Raises
+        ------
+        ValueError
+            If sigma.ndim is not 1, 2 or 3.
+        """
+        if sigma.ndim == 1:
+            self.center_stats(mu)
+            self.stats = self.stats / np.sqrt(sigma.astype(np.float64))
+        elif sigma.ndim == 2:
+            # Compute the inverse square root of the co-variance matrix Sigma,
+            # unless the caller already passed it in that form.
+            sqr_inv_sigma = sigma
+            if not isSqrInvSigma:
+                eigen_values, eigen_vectors = linalg.eigh(sigma)
+                # Order the eigenpairs by decreasing eigenvalue.
+                ind = eigen_values.real.argsort()[::-1]
+                eigen_values = eigen_values.real[ind]
+                eigen_vectors = eigen_vectors.real[:, ind]
+                sqr_inv_eval_sigma = 1 / np.sqrt(eigen_values)
+                sqr_inv_sigma = np.dot(eigen_vectors,
+                                       np.diag(sqr_inv_eval_sigma))
+            # Whitening of the first-order statistics
+            self.center_stats(mu)  # CENTERING
+            self.rotate_stats(sqr_inv_sigma)
+        elif sigma.ndim == 3:
+            # we assume that sigma is a 3D ndarray of size D x n x n
+            # where D is the number of distributions and n is the dimension of a single distribution
+            n = self.stats.shape[1] // self.stat0.shape[1]
+            sess_nb = self.stat0.shape[0]
+            self.center_stats(mu)
+            self.stats = (np.einsum("ikj,ikl->ilj",
+                                    self.stats.T.reshape(-1, n, sess_nb), sigma)
+                          .reshape(-1, sess_nb).T)
+        else:
+            # ValueError is still an Exception, so existing handlers keep working.
+            raise ValueError("Wrong dimension of Sigma, must be 1, 2 or 3")
+    def align_models(self, model_list):
+        """
+        Reorder (and possibly shrink) this EmbeddingMeta so that its rows
+            follow the given list of models. For every requested model the
+            first row whose modelset entry equals it is kept.
+        Arguments
+        ---------
+        model_list : ndarray of strings
+            List of models to match.
+        """
+        first_hits = []
+        for model_id in model_list:
+            matches = np.argwhere(self.modelset == model_id)
+            first_hits.append(matches[0][0])
+        order = np.array(first_hits)
+        self.segset = self.segset[order]
+        self.modelset = self.modelset[order]
+        self.stat0 = self.stat0[order, :]
+        self.stats = self.stats[order, :]
+    def align_segments(self, segment_list):
+        """
+        Reorder (and possibly shrink) this EmbeddingMeta so that its rows
+            follow the given list of segments. For every requested segment the
+            first row whose segset entry equals it is kept.
+        Arguments
+        ---------
+        segment_list: ndarray of strings
+            list of segments to match
+        """
+        first_hits = []
+        for segment_id in segment_list:
+            matches = np.argwhere(self.segset == segment_id)
+            first_hits.append(matches[0][0])
+        order = np.array(first_hits)
+        self.segset = self.segset[order]
+        self.modelset = self.modelset[order]
+        self.stat0 = self.stat0[order, :]
+        self.stats = self.stats[order, :]
 class SpecClustUnorm:
    """

--- a/paddlespeech/vector/cluster/plda.py
+++ b/paddlespeech/vector/cluster/plda.py
--- a/paddlespeech/vector/io/dataset_from_json.py
+++ b/paddlespeech/vector/io/dataset_from_json.py
@@ -26,14 +26,14 @@ from paddleaudio.compliance.librosa import mfcc
 class meta_info:
    """the audio meta info in the vector JSONDataset
    Args:
-        id (str): the segment name
+        utt_id (str): the segment name
        duration (float): segment time
        wav (str): wav file path
        start (int): start point in the original wav file
        stop (int): stop point in the original wav file
        lab_id (str): the record id
    """
-    id: str
+    utt_id: str
    duration: float
    wav: str
    start: int

--- a/setup.py
+++ b/setup.py
@@ -65,6 +65,7 @@ base = [
    "webrtcvad",
    "yacs~=0.1.8",
    "prettytable",
+    "zhon",
 ]
 server = [
@@ -91,7 +92,6 @@ requirements = {
        "unidecode",
        "yq",
        "pre-commit",
-        "zhon",
    ]
 }

--- a/speechx/examples/README.md
+++ b/speechx/examples/README.md
 # Examples for SpeechX
-* dev - for speechx developer, using for test.
-* ngram - using to build NGram ARPA lm.
 * ds2_ol - ds2 streaming test under `aishell-1` test dataset.
- The entrypoint is `ds2_ol/aishell/run.sh`
+   The entrypoint is `ds2_ol/aishell/run.sh`
-## How to run
+## How to run  
 `run.sh` is the entry point.
@@ -17,9 +15,23 @@ pushd ds2_ol/aishell
 bash run.sh
 ```
-## Display Model with [Netron](https://github.com/lutzroeder/netron)
+## Display Model with [Netron](https://github.com/lutzroeder/netron)  
 ```
 pip install netron
 netron exp/deepspeech2_online/checkpoints/avg_1.jit.pdmodel  --port 8022 --host 10.21.55.20
 ```
+## For Developer  
+> Warning: For developers only. Make sure you know what it does before using it.
+* dev - for SpeechX developers; used for testing.
+## Build WFST  
+> Warning: Use the examples below only if you know what they do.
+* text_lm - processes text for building the LM.
+* ngram - builds the n-gram ARPA LM.
+* wfst - builds the WFST for TLG.
--- a/speechx/examples/ds2_ol/aishell/README.md
+++ b/speechx/examples/ds2_ol/aishell/README.md
@@ -10,12 +10,18 @@ Other -> 0.00 % N=0 C=0 S=0 D=0 I=0
 ## CTC Prefix Beam Search w LM
+LM: zh_giga.no_cna_cmn.prune01244.klm
 ```
+Overall -> 7.86 % N=104768 C=96865 S=7573 D=330 I=327
+Mandarin -> 7.86 % N=104768 C=96865 S=7573 D=330 I=327
+Other -> 0.00 % N=0 C=0 S=0 D=0 I=0
 ```
 ## CTC WFST
+LM: aishell train
+```
+Overall -> 11.14 % N=103017 C=93363 S=9583 D=71 I=1819
+Mandarin -> 11.14 % N=103017 C=93363 S=9583 D=71 I=1818
+Other -> 0.00 % N=0 C=0 S=0 D=0 I=1
 ```
-```
\ No newline at end of file
--- a/speechx/examples/ds2_ol/aishell/run.sh
+++ b/speechx/examples/ds2_ol/aishell/run.sh
@@ -5,7 +5,10 @@ set -e
 . path.sh
 nj=40
+stage=0
+stop_stage=100
+. utils/parse_options.sh
 # 1. compile
 if [ ! -d ${SPEECHX_EXAMPLES} ]; then
@@ -26,102 +29,112 @@ vocb_dir=$ckpt_dir/data/lang_char/
 mkdir -p exp
 exp=$PWD/exp
-aishell_wav_scp=aishell_test.scp
+if [ $stage -le 0 ] && [ $stop_stage -ge 0 ];then
-if [ ! -d $data/test ]; then
+    aishell_wav_scp=aishell_test.scp
-    pushd $data
+    if [ ! -d $data/test ]; then
-    wget -c https://paddlespeech.bj.bcebos.com/s2t/paddle_asr_online/aishell_test.zip
+        pushd $data
-    unzip  aishell_test.zip
+        wget -c https://paddlespeech.bj.bcebos.com/s2t/paddle_asr_online/aishell_test.zip
-    popd
+        unzip  aishell_test.zip
+        popd
-    realpath $data/test/*/*.wav > $data/wavlist
-    awk -F '/' '{ print $(NF) }' $data/wavlist | awk -F '.' '{ print $1 }' > $data/utt_id
+        realpath $data/test/*/*.wav > $data/wavlist
-    paste $data/utt_id $data/wavlist > $data/$aishell_wav_scp
+        awk -F '/' '{ print $(NF) }' $data/wavlist | awk -F '.' '{ print $1 }' > $data/utt_id
-fi
+        paste $data/utt_id $data/wavlist > $data/$aishell_wav_scp
+    fi
-if [ ! -d $ckpt_dir ]; then
-    mkdir -p $ckpt_dir
+    if [ ! -d $ckpt_dir ]; then
-    wget -P $ckpt_dir -c https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz
+        mkdir -p $ckpt_dir
-    tar xzfv $ckpt_dir/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz -C $ckpt_dir
+        wget -P $ckpt_dir -c https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz
-fi
+        # wget -P saved the archive in $ckpt_dir, so extract it from there.
+        tar xzfv $ckpt_dir/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz -C $ckpt_dir
+    fi
-lm=$data/zh_giga.no_cna_cmn.prune01244.klm
-if [ ! -f $lm ]; then
+    lm=$data/zh_giga.no_cna_cmn.prune01244.klm
-    pushd $data
+    if [ ! -f $lm ]; then
-    wget -c https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm
+        pushd $data
-    popd
+        wget -c https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm
+        popd
+    fi
 fi
 # 3. make feature
+text=$data/test/text
 label_file=./aishell_result
 wer=./aishell_wer
 export GLOG_logtostderr=1
-# 3. gen linear feat
-cmvn=$PWD/cmvn.ark
-cmvn-json2kaldi --json_file=$ckpt_dir/data/mean_std.json --cmvn_write_path=$cmvn
+if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
+    # 3. gen linear feat
+    cmvn=$data/cmvn.ark
+    cmvn-json2kaldi --json_file=$ckpt_dir/data/mean_std.json --cmvn_write_path=$cmvn
-./local/split_data.sh $data $data/$aishell_wav_scp $aishell_wav_scp $nj
+    ./local/split_data.sh $data $data/$aishell_wav_scp $aishell_wav_scp $nj
-utils/run.pl JOB=1:$nj $data/split${nj}/JOB/feat.log \
+    utils/run.pl JOB=1:$nj $data/split${nj}/JOB/feat.log \
-linear-spectrogram-wo-db-norm-ol \
+    linear-spectrogram-wo-db-norm-ol \
-    --wav_rspecifier=scp:$data/split${nj}/JOB/${aishell_wav_scp} \
+        --wav_rspecifier=scp:$data/split${nj}/JOB/${aishell_wav_scp} \
-    --feature_wspecifier=ark,scp:$data/split${nj}/JOB/feat.ark,$data/split${nj}/JOB/feat.scp \
+        --feature_wspecifier=ark,scp:$data/split${nj}/JOB/feat.ark,$data/split${nj}/JOB/feat.scp \
-    --cmvn_file=$cmvn \
+        --cmvn_file=$cmvn \
-    --streaming_chunk=0.36
+        --streaming_chunk=0.36
+fi
-text=$data/test/text
-# 4. recognizer
+if [ $stage -le 2 ] && [ $stop_stage -ge 2 ];then
-utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.wolm.log \
+    #  recognizer
-  ctc-prefix-beam-search-decoder-ol \
+    utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.wolm.log \
-    --feature_rspecifier=scp:$data/split${nj}/JOB/feat.scp \
+    ctc-prefix-beam-search-decoder-ol \
-    --model_path=$model_dir/avg_1.jit.pdmodel \
+        --feature_rspecifier=scp:$data/split${nj}/JOB/feat.scp \
-    --params_path=$model_dir/avg_1.jit.pdiparams \
+        --model_path=$model_dir/avg_1.jit.pdmodel \
-    --model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \
+        --params_path=$model_dir/avg_1.jit.pdiparams \
-    --dict_file=$vocb_dir/vocab.txt \
+        --model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \
-    --result_wspecifier=ark,t:$data/split${nj}/JOB/result
+        --dict_file=$vocb_dir/vocab.txt \
+        --result_wspecifier=ark,t:$data/split${nj}/JOB/result
-cat $data/split${nj}/*/result > ${label_file}
-utils/compute-wer.py --char=1 --v=1 ${label_file} $text > ${wer}
+    cat $data/split${nj}/*/result > $exp/${label_file}
+    utils/compute-wer.py --char=1 --v=1 $exp/${label_file} $text > $exp/${wer}
-# 4. decode with lm
-utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.lm.log \
-  ctc-prefix-beam-search-decoder-ol \
-    --feature_rspecifier=scp:$data/split${nj}/JOB/feat.scp \
-    --model_path=$model_dir/avg_1.jit.pdmodel \
-    --params_path=$model_dir/avg_1.jit.pdiparams \
-    --model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \
-    --dict_file=$vocb_dir/vocab.txt \
-    --lm_path=$lm \
-    --result_wspecifier=ark,t:$data/split${nj}/JOB/result_lm
-cat $data/split${nj}/*/result_lm > ${label_file}_lm
-utils/compute-wer.py --char=1 --v=1 ${label_file}_lm $text > ${wer}_lm
-graph_dir=./aishell_graph
-if [ ! -d $graph_dir ]; then
-    wget -c https://paddlespeech.bj.bcebos.com/s2t/paddle_asr_online/aishell_graph.zip
-    unzip  aishell_graph.zip
 fi
+if [ $stage -le 3 ] && [ $stop_stage -ge 3 ];then
+    #  decode with lm
+    utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.lm.log \
+    ctc-prefix-beam-search-decoder-ol \
+        --feature_rspecifier=scp:$data/split${nj}/JOB/feat.scp \
+        --model_path=$model_dir/avg_1.jit.pdmodel \
+        --params_path=$model_dir/avg_1.jit.pdiparams \
+        --model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \
+        --dict_file=$vocb_dir/vocab.txt \
+        --lm_path=$lm \
+        --result_wspecifier=ark,t:$data/split${nj}/JOB/result_lm
+    cat $data/split${nj}/*/result_lm > $exp/${label_file}_lm
+    utils/compute-wer.py --char=1 --v=1 $exp/${label_file}_lm $text > $exp/${wer}_lm
+fi
-# 5. test TLG decoder
-utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.wfst.log \
-  wfst-decoder-ol \
-    --feature_rspecifier=scp:$data/split${nj}/JOB/feat.scp \
-    --model_path=$model_dir/avg_1.jit.pdmodel \
-    --params_path=$model_dir/avg_1.jit.pdiparams \
-    --word_symbol_table=$graph_dir/words.txt \
-    --model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \
-     --graph_path=$graph_dir/TLG.fst --max_active=7500 \
-    --acoustic_scale=1.2 \
-    --result_wspecifier=ark,t:$data/split${nj}/JOB/result_tlg
+wfst=$data/wfst/
+mkdir -p $wfst
+if [ ! -f $wfst/aishell_graph.zip ]; then
+    pushd $wfst
+    wget -c https://paddlespeech.bj.bcebos.com/s2t/paddle_asr_online/aishell_graph.zip
+    unzip aishell_graph.zip
+    popd
+fi
-cat $data/split${nj}/*/result_tlg > ${label_file}_tlg
+graph_dir=$wfst/aishell_graph
-utils/compute-wer.py --char=1 --v=1 ${label_file}_tlg $text > ${wer}_tlg
+if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
\ No newline at end of file
+    #  TLG decoder
+    utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.wfst.log \
+    wfst-decoder-ol \
+        --feature_rspecifier=scp:$data/split${nj}/JOB/feat.scp \
+        --model_path=$model_dir/avg_1.jit.pdmodel \
+        --params_path=$model_dir/avg_1.jit.pdiparams \
+        --word_symbol_table=$graph_dir/words.txt \
+        --model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \
+        --graph_path=$graph_dir/TLG.fst --max_active=7500 \
+        --acoustic_scale=1.2 \
+        --result_wspecifier=ark,t:$data/split${nj}/JOB/result_tlg
+    cat $data/split${nj}/*/result_tlg > $exp/${label_file}_tlg
+    utils/compute-wer.py --char=1 --v=1 $exp/${label_file}_tlg $text > $exp/${wer}_tlg
+fi
--- a/speechx/examples/ngram/README.md
+++ b/speechx/examples/ngram/README.md
-# NGram Train
--- a/speechx/examples/ngram/en/README.md
+++ b/speechx/examples/ngram/en/README.md
--- a/speechx/examples/ngram/zh/README.md
+++ b/speechx/examples/ngram/zh/README.md
+# N-gram LM training for Mandarin
+Quick run:
+```
+bash run.sh --stage -1
+```
+## input
+input files:
+```
+data/
+├── lexicon.txt
+├── text
+└── vocab.txt
+```
+```
+==> data/text <==
+BAC009S0002W0122 而 对 楼市 成交 抑制 作用 最 大 的 限 购
+BAC009S0002W0123 也 成为 地方 政府 的 眼中 钉
+BAC009S0002W0124 自 六月 底 呼和浩特 市 率先 宣布 取消 限 购 后
+BAC009S0002W0125 各地 政府 便 纷纷 跟进
+BAC009S0002W0126 仅 一 个 多 月 的 时间 里
+BAC009S0002W0127 除了 北京 上海 广州 深圳 四 个 一 线 城市 和 三亚 之外
+BAC009S0002W0128 四十六 个 限 购 城市 当中
+BAC009S0002W0129 四十一 个 已 正式 取消 或 变相 放松 了 限 购
+BAC009S0002W0130 财政 金融 政策 紧随 其后 而来
+BAC009S0002W0131 显示 出 了 极 强 的 威力
+==> data/lexicon.txt <==
+SIL sil
+<SPOKEN_NOISE> sil
+啊 aa a1
+啊 aa a2
+啊 aa a4
+啊 aa a5
+啊啊啊 aa a2 aa a2 aa a2
+啊啊啊 aa a5 aa a5 aa a5
+坐地 z uo4 d i4
+坐实 z uo4 sh ix2
+坐视 z uo4 sh ix4
+坐稳 z uo4 uu un3
+坐拥 z uo4 ii iong1
+坐诊 z uo4 zh en3
+坐庄 z uo4 zh uang1
+坐姿 z uo4 z iy1
+==> data/vocab.txt <==
+<blank>
+<unk>
+A
+B
+C
+D
+E
+龙
+龚
+龛
+<eos>
+```
+## output
+```
+data/
+├── local
+│   ├── dict
+│   │   ├── lexicon.txt
+│   │   └── units.txt
+│   └── lm
+│       ├── heldout
+│       ├── lm.arpa
+│       ├── text
+│       ├── text.no_oov
+│       ├── train
+│       ├── unigram.counts
+│       ├── word.counts
+│       └── wordlist
+```
+```
+/workspace/srilm/bin/i686-m64/ngram-count
+Namespace(bpemodel=None, in_lexicon='data/lexicon.txt', out_lexicon='data/local/dict/lexicon.txt', unit_file='data/vocab.txt')
+Ignoring words 矽, which contains oov unit
+Ignoring words 傩, which contains oov unit
+Ignoring words 堀, which contains oov unit
+Ignoring words 莼, which contains oov unit
+Ignoring words 菰, which contains oov unit
+Ignoring words 摭, which contains oov unit
+Ignoring words 帙, which contains oov unit
+Ignoring words 迨, which contains oov unit
+Ignoring words 孥, which contains oov unit
+Ignoring words 瑗, which contains oov unit
+...
+...
+...
+file data/local/lm/heldout: 10000 sentences, 89496 words, 0 OOVs
+0 zeroprobs, logprob= -270337.9 ppl= 521.2819 ppl1= 1048.745
+build LM done.
+```
--- a/speechx/examples/ngram/zh/local/aishell_train_lms.sh
+++ b/speechx/examples/ngram/zh/local/aishell_train_lms.sh
+#!/bin/bash
+# To be run from one directory above this script.
+. ./path.sh
+text=data/local/lm/text
+lexicon=data/local/dict/lexicon.txt
+# Abort early when a required input is missing.
+# The test must read the loop variable `f`; an unset variable makes `[ ! -f ]`
+# evaluate to false, so the check would silently pass for missing files.
+for f in "$text" "$lexicon"; do
+  if [ ! -f "$f" ]; then
+    echo "$0: No such file $f"
+    exit 1
+  fi
+done
+# Check SRILM tools
+if ! which ngram-count > /dev/null; then
+    echo "srilm tools are not found, please download it and install it from: "
+    echo "http://www.speech.sri.com/projects/srilm/download.html"
+    echo "Then add the tools to your PATH"
+    exit 1
+fi
+# This script takes no arguments.  It assumes you have already run
+# aishell_data_prep.sh.
+# It takes as input the files
+# data/local/lm/text
+# data/local/dict/lexicon.txt
+dir=data/local/lm
+mkdir -p $dir
+cleantext=$dir/text.no_oov
+# oov to <SPOKEN_NOISE>
+# lexicon line: word char0 ... charn
+# text line: utt word0 ... wordn -> line: <SPOKEN_NOISE> word0 ... wordn
+cat $text | awk -v lex=$lexicon 'BEGIN{while((getline<lex) >0){ seen[$1]=1; } }
+  {for(n=1; n<=NF;n++) {  if (seen[$n]) { printf("%s ", $n); } else {printf("<SPOKEN_NOISE> ");} } printf("\n");}' \
+  > $cleantext || exit 1;
+# compute word counts, sort in descending order
+# line: count word
+cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | sort | uniq -c | \
+   sort -nr > $dir/word.counts || exit 1;
+# Get counts from acoustic training transcripts, and add  one-count
+# for each word in the lexicon (but not silence, we don't want it
+# in the LM-- we'll add it optionally later).
+cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | \
+  cat - <(grep -w -v '!SIL' $lexicon | awk '{print $1}') | \
+   sort | uniq -c | sort -nr > $dir/unigram.counts || exit 1;
+# word with <s> </s>
+cat $dir/unigram.counts | awk '{print $2}' | cat - <(echo "<s>"; echo "</s>" ) > $dir/wordlist
+# hold out to compute ppl
+heldout_sent=10000 # Don't change this if you want result to be comparable with kaldi_lm results
+mkdir -p $dir
+cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n<NF) printf " "; else print ""; }}' | \
+  head -$heldout_sent > $dir/heldout
+cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n<NF) printf " "; else print ""; }}' | \
+  tail -n +$heldout_sent > $dir/train
+ngram-count -text $dir/train -order 3 -limit-vocab -vocab $dir/wordlist -unk \
+  -map-unk "<UNK>" -kndiscount -interpolate -lm $dir/lm.arpa
+ngram -lm $dir/lm.arpa -ppl $dir/heldout
\ No newline at end of file
--- a/speechx/examples/ngram/zh/local/text_to_lexicon.py
+++ b/speechx/examples/ngram/zh/local/text_to_lexicon.py
+#!/usr/bin/env python3
+import argparse
+from collections import Counter
+def main(args):
+    """Write a character-level lexicon for every distinct word in a text file.
+    Each output line is `word<TAB>c0 c1 ... cn`, where the right-hand side is
+    the word split into its characters.
+    Args:
+        args: parsed arguments providing `text` (input path), `lexicon`
+            (output path) and `has_key` (truthy when each input line starts
+            with an utterance id that must be skipped).
+    """
+    counter = Counter()
+    # The corpus is Chinese text: pin the encoding instead of relying on the locale.
+    with open(args.text, 'r', encoding='utf-8') as fin, \
+            open(args.lexicon, 'w', encoding='utf-8') as fout:
+        for line in fin:
+            words = line.split()
+            if args.has_key:
+                # Drop the utterance id. Slicing (rather than unpacking) keeps
+                # blank lines and id-only lines from raising ValueError.
+                words = words[1:]
+            counter.update(words)
+        for word in counter:
+            chars = " ".join(word)
+            fout.write(f"{word}\t{chars}\n")
+if __name__ == '__main__':
+    def str2bool(value):
+        """Convert a command-line value such as `true`/`false` to a bool.
+        Without this converter argparse keeps the raw string, and any
+        non-empty string (including 'false') is truthy.
+        """
+        lowered = str(value).strip().lower()
+        if lowered in ('true', 't', 'yes', 'y', '1'):
+            return True
+        if lowered in ('false', 'f', 'no', 'n', '0'):
+            return False
+        raise argparse.ArgumentTypeError(
+            f"expected a boolean (true/false), got {value!r}")
+    parser = argparse.ArgumentParser(
+        description='text(line:utt1 中国 人) to lexicon（line:中国 中 国).')
+    parser.add_argument(
+        '--has_key',
+        type=str2bool,
+        default=True,
+        help='whether each text line starts with an utterance id (true/false)')
+    parser.add_argument(
+        '--text',
+        required=True,
+        help='text path. line: utt1 中国 人 or 中国 人')
+    parser.add_argument(
+        '--lexicon',
+        required=True,
+        help='lexicon path. line:中国 中 国')
+    args = parser.parse_args()
+    print(args)
+    main(args)
--- a/speechx/examples/ngram/zh/path.sh
+++ b/speechx/examples/ngram/zh/path.sh
+# Locations of the built binaries required for running the examples.
+# Must be sourced from the example directory: MAIN_ROOT is derived from $PWD.
+MAIN_ROOT=`realpath $PWD/../../../../`
+SPEECHX_ROOT=`realpath $MAIN_ROOT/speechx`
+# LC_ALL is the variable that overrides the locale; a misspelled name is
+# exported as an ordinary variable and leaves the locale untouched.
+export LC_ALL=C
+# srilm
+export LIBLBFGS=${MAIN_ROOT}/tools/liblbfgs-1.10
+export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-}:${LIBLBFGS}/lib/.libs
+export SRILM=${MAIN_ROOT}/tools/srilm
+export PATH=${PATH}:${SRILM}/bin:${SRILM}/bin/i686-m64
--- a/speechx/examples/ngram/zh/run.sh
+++ b/speechx/examples/ngram/zh/run.sh
+#!/bin/bash
+# Build an n-gram LM: download the example data (stage -1), prepare the
+# dictionary (stage 0) and train the LM (stage 1).
+set -eo pipefail
+. path.sh
+# Stages in [stage, stop_stage] are executed.
+stage=-1
+stop_stage=100
+corpus=aishell
+unit=data/vocab.txt       # vocab file, line: char/spm_piece
+lexicon=data/lexicon.txt  # line: word ph0 ... phn, aishell/resource_aishell/lexicon.txt
+text=data/text            # line: utt text, aishell/data_aishell/transcript/aishell_transcript_v0.8.txt
+# NOTE(review): presumably lets command-line options such as --stage override the variables above; confirm in utils/parse_options.sh
+. utils/parse_options.sh
+data=$PWD/data
+mkdir -p $data
+# stage -1: fetch and unpack the example archive unless the tarball is already present
+if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then
+    if [ ! -f $data/speech.ngram.zh.tar.gz ];then
+        pushd $data
+        wget -c http://paddlespeech.bj.bcebos.com/speechx/examples/ngram/zh/speech.ngram.zh.tar.gz
+        tar xvzf speech.ngram.zh.tar.gz
+        popd
+    fi
+fi
+# the vocab file is required by every later stage
+if [ ! -f $unit ]; then
+    echo "$0: No such file $unit"
+    exit 1;
+fi
+# build SRILM through the tools Makefile when ngram-count is not on PATH
+if ! which ngram-count; then
+    pushd $MAIN_ROOT/tools
+    make srilm.done
+    popd
+fi
+mkdir -p data/local/dict
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    # 7.1 Prepare dict
+    # line: char/spm_pieces
+    cp $unit data/local/dict/units.txt
+    # generate a lexicon from the text only when none is provided
+    if [ ! -f $lexicon ];then
+        local/text_to_lexicon.py --has_key true --text $text --lexicon $lexicon
+        echo "Generate $lexicon from $text"
+    fi
+    # filter by vocab
+    # line: word ph0 ... phn -> line: word char0 ... charn
+    utils/fst/prepare_dict.py \
+        --unit_file $unit \
+        --in_lexicon ${lexicon} \
+        --out_lexicon data/local/dict/lexicon.txt
+fi
+lm=data/local/lm
+mkdir -p $lm
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    # 7.2 Train lm
+    # the training script reads data/local/lm/text and data/local/dict/lexicon.txt
+    cp $text $lm/text
+    local/aishell_train_lms.sh
+fi
+echo "build LM done."
+exit 0
--- a/speechx/examples/ngram/zh/utils
+++ b/speechx/examples/ngram/zh/utils
+../../../../utils/
\ No newline at end of file
--- a/speechx/examples/text_lm/.gitignore
+++ b/speechx/examples/text_lm/.gitignore
+data
--- a/speechx/examples/text_lm/README.md
+++ b/speechx/examples/text_lm/README.md
+# Text PreProcess for building ngram LM
+Output `text` file like this:
+```
+BAC009S0002W0122 而 对 楼市 成交 抑制 作用 最 大 的 限 购
+BAC009S0002W0123 也 成为 地方 政府 的 眼中 钉
+BAC009S0002W0124 自 六月 底 呼和浩特 市 率先 宣布 取消 限 购 后
+BAC009S0002W0125 各地 政府 便 纷纷 跟进
+BAC009S0002W0126 仅 一 个 多 月 的 时间 里
+BAC009S0002W0127 除了 北京 上海 广州 深圳 四 个 一 线 城市 和 三亚 之外
+BAC009S0002W0128 四十六 个 限 购 城市 当中
+BAC009S0002W0129 四十一 个 已 正式 取消 或 变相 放松 了 限 购
+BAC009S0002W0130 财政 金融 政策 紧随 其后 而来
+```
--- a/speechx/examples/text_lm/path.sh
+++ b/speechx/examples/text_lm/path.sh
+# Must be sourced from speechx/examples/text_lm: the repository root is three
+# levels up (examples -> speechx -> root), matching the `utils` symlink here.
+MAIN_ROOT=`realpath $PWD/../../../`
+SPEECHX_ROOT=`realpath $MAIN_ROOT/speechx`
+# LC_ALL is the variable that overrides the locale; a misspelled name has no effect.
+export LC_ALL=C
--- a/speechx/examples/text_lm/run.sh
+++ b/speechx/examples/text_lm/run.sh
+#!/bin/bash
+set -eo pipefail
+. path.sh
+stage=0
+stop_stage=100
+has_key=true
+token_type=word
+. utils/parse_options.sh || exit -1;
+text=data/text
+# Report the path that was actually checked; the positional parameter $1 does
+# not hold the text path here.
+if [ ! -f $text ]; then
+    echo "$0: No such file $text";
+    exit -1;
+fi
+if [ $stage -le 0 ] && [ $stop_stage -ge 0 ];then
+    echo "text tn & wordseg preprocess"
+    rm -rf ${text}.tn
+    python3 utils/zh_tn.py --has_key $has_key --token_type $token_type ${text} ${text}.tn
+fi
\ No newline at end of file
--- a/speechx/examples/text_lm/utils
+++ b/speechx/examples/text_lm/utils
+../../../utils/
\ No newline at end of file
--- a/speechx/examples/wfst/.gitignore
+++ b/speechx/examples/wfst/.gitignore
+data
--- a/speechx/examples/wfst/README.md
+++ b/speechx/examples/wfst/README.md
+# Build TLG WFST
+## Input
+```
+data/local/
+├── dict
+│   ├── lexicon.txt
+│   └── units.txt
+└── lm
+    ├── heldout
+    ├── lm.arpa
+    ├── text
+    ├── text.no_oov
+    ├── train
+    ├── unigram.counts
+    ├── word.counts
+    └── wordlist
+```
+```
+==> data/local/dict/lexicon.txt <==
+啊 啊
+啊啊啊 啊 啊 啊
+阿 阿
+阿尔 阿 尔
+阿根廷 阿 根 廷
+阿九 阿 九
+阿克 阿 克
+阿拉伯数字 阿 拉 伯 数 字
+阿拉法特 阿 拉 法 特
+阿拉木图 阿 拉 木 图
+==> data/local/dict/units.txt <==
+<blank>
+<unk>
+A
+B
+C
+D
+E
+F
+G
+H
+==> data/local/lm/heldout <==
+而 对 楼市 成交 抑制 作用 最 大 的 限 购
+也 成为 地方 政府 的 眼中 钉
+自 六月 底 呼和浩特 市 率先 宣布 取消 限 购 后
+各地 政府 便 纷纷 跟进
+仅 一 个 多 月 的 时间 里
+除了 北京 上海 广州 深圳 四 个 一 线 城市 和 三亚 之外
+四十六 个 限 购 城市 当中
+四十一 个 已 正式 取消 或 变相 放松 了 限 购
+财政 金融 政策 紧随 其后 而来
+显示 出 了 极 强 的 威力
+==> data/local/lm/lm.arpa <==
+\data\
+ngram 1=129356
+ngram 2=504661
+ngram 3=123455
+\1-grams:
+-1.531278       </s>
+-3.828829       <SPOKEN_NOISE>  -0.1600094
+-6.157292       <UNK>
+==> data/local/lm/text <==
+BAC009S0002W0122 而 对 楼市 成交 抑制 作用 最 大 的 限 购
+BAC009S0002W0123 也 成为 地方 政府 的 眼中 钉
+BAC009S0002W0124 自 六月 底 呼和浩特 市 率先 宣布 取消 限 购 后
+BAC009S0002W0125 各地 政府 便 纷纷 跟进
+BAC009S0002W0126 仅 一 个 多 月 的 时间 里
+BAC009S0002W0127 除了 北京 上海 广州 深圳 四 个 一 线 城市 和 三亚 之外
+BAC009S0002W0128 四十六 个 限 购 城市 当中
+BAC009S0002W0129 四十一 个 已 正式 取消 或 变相 放松 了 限 购
+BAC009S0002W0130 财政 金融 政策 紧随 其后 而来
+BAC009S0002W0131 显示 出 了 极 强 的 威力
+==> data/local/lm/text.no_oov <==
+<SPOKEN_NOISE> 而 对 楼市 成交 抑制 作用 最 大 的 限 购 
+<SPOKEN_NOISE> 也 成为 地方 政府 的 眼中 钉 
+<SPOKEN_NOISE> 自 六月 底 呼和浩特 市 率先 宣布 取消 限 购 后 
+<SPOKEN_NOISE> 各地 政府 便 纷纷 跟进 
+<SPOKEN_NOISE> 仅 一 个 多 月 的 时间 里 
+<SPOKEN_NOISE> 除了 北京 上海 广州 深圳 四 个 一 线 城市 和 三亚 之外 
+<SPOKEN_NOISE> 四十六 个 限 购 城市 当中 
+<SPOKEN_NOISE> 四十一 个 已 正式 取消 或 变相 放松 了 限 购 
+<SPOKEN_NOISE> 财政 金融 政策 紧随 其后 而来 
+<SPOKEN_NOISE> 显示 出 了 极 强 的 威力 
+==> data/local/lm/train <==
+汉莎 不 得 不 通过 这样 的 方式 寻求 新 的 发展 点
+并 计划 朝云 计算 方面 发展
+汉莎 的 基础 设施 部门 拥有 一千四百 名 员工
+媒体 就 曾 披露 这笔 交易
+虽然 双方 已经 正式 签署 了 外包 协议
+但是 这笔 交易 还 需要 得到 反 垄断 部门 的 批准
+陈 黎明 一九八九 年 获得 美国 康乃尔 大学 硕士 学位
+并 于 二零零三 年 顺利 完成 美国 哈佛 商学 院 高级 管理 课程
+曾 在 多家 国际 公司 任职
+拥有 业务 开发 商务 及 企业 治理
+==> data/local/lm/unigram.counts <==
+  57487 的
+  13099 在
+  11862 一
+  11397 了
+  10998 不
+   9913 是
+   7952 有
+   6250 和
+   6152 个
+   5422 将
+==> data/local/lm/word.counts <==
+  57486 的
+  13098 在
+  11861 一
+  11396 了
+  10997 不
+   9912 是
+   7951 有
+   6249 和
+   6151 个
+   5421 将
+==> data/local/lm/wordlist <==
+的
+在
+一
+了
+不
+是
+有
+和
+个
+将
+```
+## Output
+```
+fstaddselfloops 'echo 4234 |' 'echo 123660 |' 
+Lexicon and Token FSTs compiling succeeded
+arpa2fst --read-symbol-table=data/lang_test/words.txt --keep-symbols=true - 
+LOG (arpa2fst[5.5.0~1-5a37]:Read():arpa-file-parser.cc:94) Reading \data\ section.
+LOG (arpa2fst[5.5.0~1-5a37]:Read():arpa-file-parser.cc:149) Reading \1-grams: section.
+LOG (arpa2fst[5.5.0~1-5a37]:Read():arpa-file-parser.cc:149) Reading \2-grams: section.
+LOG (arpa2fst[5.5.0~1-5a37]:Read():arpa-file-parser.cc:149) Reading \3-grams: section.
+Checking how stochastic G is (the first of these numbers should be small):
+fstisstochastic data/lang_test/G.fst 
+0 -1.14386
+fsttablecompose data/lang_test/L.fst data/lang_test/G.fst 
+fstminimizeencoded 
+fstdeterminizestar --use-log=true 
+fsttablecompose data/lang_test/T.fst data/lang_test/LG.fst 
+Composing decoding graph TLG.fst succeeded
+Aishell build TLG done.
+```
+```
+data/
+├── lang_test
+│   ├── G.fst
+│   ├── L.fst
+│   ├── LG.fst
+│   ├── T.fst
+│   ├── TLG.fst
+│   ├── tokens.txt
+│   ├── units.txt
+│   └── words.txt
+└── local
+    ├── lang
+    │   ├── L.fst
+    │   ├── T.fst
+    │   ├── tokens.txt
+    │   ├── units.txt
+    │   └── words.txt
+    └── tmp
+        ├── disambig.list
+        ├── lexiconp_disambig.txt
+        ├── lexiconp.txt
+        └── units.list
+```
\ No newline at end of file
--- a/speechx/examples/wfst/path.sh
+++ b/speechx/examples/wfst/path.sh
+# Locations of the built binaries required for running the examples.
+# Must be sourced from the example directory: MAIN_ROOT is derived from $PWD.
+MAIN_ROOT=`realpath $PWD/../../../`
+SPEECHX_ROOT=`realpath $MAIN_ROOT/speechx`
+# LC_ALL is the variable that overrides the locale; a misspelled name is
+# exported as an ordinary variable and leaves the locale untouched.
+export LC_ALL=C
+# srilm
+export LIBLBFGS=${MAIN_ROOT}/tools/liblbfgs-1.10
+export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-}:${LIBLBFGS}/lib/.libs
+export SRILM=${MAIN_ROOT}/tools/srilm
+export PATH=${PATH}:${SRILM}/bin:${SRILM}/bin/i686-m64
+# Kaldi
+export KALDI_ROOT=${MAIN_ROOT}/tools/kaldi
+[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh
+export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH
+# warn (without aborting) when Kaldi's common path file is missing, source it otherwise
+[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present, can not using Kaldi!"
+[ -f $KALDI_ROOT/tools/config/common_path.sh ] && . $KALDI_ROOT/tools/config/common_path.sh
--- a/speechx/examples/wfst/run.sh
+++ b/speechx/examples/wfst/run.sh
+#!/bin/bash
+# Compile the T and L FSTs from data/local/dict, then build G, LG and TLG
+# into data/lang_test using the LM in data/local/lm.
+set -eo pipefail
+. path.sh
+# Stages in [stage, stop_stage] are executed.
+stage=-1
+stop_stage=100
+# NOTE(review): presumably lets command-line options such as --stage override the variables above; confirm in utils/parse_options.sh
+. utils/parse_options.sh
+# build Kaldi through the tools Makefile when fstprint is not on PATH
+if ! which fstprint ; then
+    pushd $MAIN_ROOT/tools
+    make kaldi.done
+    popd
+fi
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then 
+    # build T & L
+    # utils/fst/compile_lexicon_token_fst.sh <dict-src-dir> <tmp-dir> <lang-dir>
+    utils/fst/compile_lexicon_token_fst.sh \
+        data/local/dict data/local/tmp data/local/lang
+    # build G & LG & TLG
+    # utils/fst/make_tlg.sh <lm_dir> <src_lang> <tgt_lang>
+    utils/fst/make_tlg.sh data/local/lm data/local/lang data/lang_test || exit 1;
+fi
+echo "build TLG done."
+exit 0
--- a/speechx/examples/wfst/utils
+++ b/speechx/examples/wfst/utils
+../../../utils/
\ No newline at end of file
--- a/speechx/tools/install_srilm.sh
+++ b/speechx/tools/install_srilm.sh
-#!/usr/bin/env bash
-current_path=`pwd`
-current_dir=`basename "$current_path"`
-if [ "tools" != "$current_dir" ]; then
-    echo "You should run this script in tools/ directory!!"
-    exit 1
-fi
-if [ ! -d liblbfgs-1.10 ]; then
-    echo Installing libLBFGS library to support MaxEnt LMs
-    bash extras/install_liblbfgs.sh || exit 1
-fi
-! command -v gawk > /dev/null && \
-   echo "GNU awk is not installed so SRILM will probably not work correctly: refusing to install" && exit 1;
-if [ $# -ne 3 ]; then
-    echo "SRILM download requires some information about you"
-    echo
-    echo "Usage: $0 <name> <organization> <email>"
-    exit 1
-fi
-srilm_url="http://www.speech.sri.com/projects/srilm/srilm_download.php"
-post_data="WWW_file=srilm-1.7.3.tar.gz&WWW_name=$1&WWW_org=$2&WWW_email=$3"
-if ! wget --post-data "$post_data" -O ./srilm.tar.gz "$srilm_url"; then
-    echo 'There was a problem downloading the file.'
-    echo 'Check you internet connection and try again.'
-    exit 1
-fi
-mkdir -p srilm
-cd srilm
-if [ -f ../srilm.tgz ]; then
-    tar -xvzf ../srilm.tgz # Old SRILM format
-elif [  -f ../srilm.tar.gz ]; then
-    tar -xvzf ../srilm.tar.gz # Changed format type from tgz to tar.gz
-fi
-major=`gawk -F. '{ print $1 }' RELEASE`
-minor=`gawk -F. '{ print $2 }' RELEASE`
-micro=`gawk -F. '{ print $3 }' RELEASE`
-if [ $major -le 1 ] && [ $minor -le 7 ] && [ $micro -le 1 ]; then
-  echo "Detected version 1.7.1 or earlier. Applying patch."
-  patch -p0 < ../extras/srilm.patch
-fi
-# set the SRILM variable in the top-level Makefile to this directory.
-cp Makefile tmpf
-cat tmpf | gawk -v pwd=`pwd` '/SRILM =/{printf("SRILM = %s\n", pwd); next;} {print;}' \
-  > Makefile || exit 1
-rm tmpf
-mtype=`sbin/machine-type`
-echo HAVE_LIBLBFGS=1 >> common/Makefile.machine.$mtype
-grep ADDITIONAL_INCLUDES common/Makefile.machine.$mtype | \
-    sed 's|$| -I$(SRILM)/../liblbfgs-1.10/include|' \
-    >> common/Makefile.machine.$mtype
-grep ADDITIONAL_LDFLAGS common/Makefile.machine.$mtype | \
-    sed 's|$| -L$(SRILM)/../liblbfgs-1.10/lib/ -Wl,-rpath -Wl,$(SRILM)/../liblbfgs-1.10/lib/|' \
-    >> common/Makefile.machine.$mtype
-make || exit
-cd ..
-(
-  [ ! -z "${SRILM}" ] && \
-    echo >&2 "SRILM variable is aleady defined. Undefining..." && \
-    unset SRILM
-  [ -f ./env.sh ] && . ./env.sh
-  [ ! -z "${SRILM}" ] && \
-    echo >&2 "SRILM config is already in env.sh" && exit
-  wd=`pwd`
-  wd=`readlink -f $wd || pwd`
-  echo "export SRILM=$wd/srilm"
-  dirs="\${PATH}"
-  for directory in $(cd srilm && find bin -type d ) ; do
-    dirs="$dirs:\${SRILM}/$directory"
-  done
-  echo "export PATH=$dirs"
-) >> env.sh
-echo >&2 "Installation of SRILM finished successfully"
-echo >&2 "Please source the tools/env.sh in your path.sh to enable it"
--- a/tests/unit/cli/test_cli.sh
+++ b/tests/unit/cli/test_cli.sh
 #!/bin/bash
 set -e
 # Audio classification
 wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/cat.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/dog.wav
 paddlespeech cls --input ./cat.wav --topk 10
@@ -28,26 +29,16 @@ paddlespeech tts --am tacotron2_csmsc --input "你好，欢迎使用百度飞桨
 paddlespeech tts --am tacotron2_csmsc --voc wavernn_csmsc --input "你好，欢迎使用百度飞桨深度学习框架！"
 paddlespeech tts --am tacotron2_ljspeech --voc pwgan_ljspeech --lang en --input "Life was like a box of chocolates, you never know what you're gonna get."
 # Speech Translation (only support linux)
 paddlespeech st --input ./en.wav
-# batch process
-echo -e "1 欢迎光临。\n2 谢谢惠顾。" | paddlespeech tts
-# shell pipeline
-paddlespeech asr --input ./zh.wav | paddlespeech text --task punc
-# stats
-paddlespeech stats --task asr
-paddlespeech stats --task tts
-paddlespeech stats --task cls
 # Speaker Verification 
 wget -c https://paddlespeech.bj.bcebos.com/vector/audio/85236145389.wav
 paddlespeech vector --task spk --input 85236145389.wav
+# batch process
+echo -e "1 欢迎光临。\n2 谢谢惠顾。" | paddlespeech tts
 echo -e "demo1 85236145389.wav \n demo2 85236145389.wav" > vec.job
 paddlespeech vector --task spk --input vec.job
@@ -55,4 +46,13 @@ echo -e "demo3 85236145389.wav \n demo4 85236145389.wav" | paddlespeech vector -
 rm 85236145389.wav 
 rm vec.job
+# shell pipeline
+paddlespeech asr --input ./zh.wav | paddlespeech text --task punc
+# stats
+paddlespeech stats --task asr
+paddlespeech stats --task tts
+paddlespeech stats --task cls
+paddlespeech stats --task text
+paddlespeech stats --task vector
+paddlespeech stats --task st
--- a/tools/Makefile
+++ b/tools/Makefile
@@ -25,7 +25,7 @@ clean:
 apt.done:
 	apt update -y
-	apt install -y bc flac jq vim tig tree pkg-config libsndfile1 libflac-dev libogg-dev libvorbis-dev libboost-dev swig python3-dev 
+	apt install -y bc flac jq vim tig tree sox pkg-config libsndfile1 libflac-dev libogg-dev libvorbis-dev libboost-dev swig python3-dev 
 	echo "check_certificate = off" >> ~/.wgetrc
 	touch apt.done
@@ -50,7 +50,7 @@ openblas.done:
 	bash extras/install_openblas.sh
 	touch openblas.done
-kaldi.done: openblas.done
+kaldi.done: apt.done openblas.done
 	bash extras/install_kaldi.sh
 	touch kaldi.done
@@ -58,6 +58,11 @@ sctk.done:
 	./extras/install_sclite.sh
 	touch sctk.done
+srilm.done:
+	./extras/install_liblbfgs.sh
+	extras/install_srilm.sh
+	touch srilm.done
 ######################
 dev: python conda_packages.done sctk.done
@@ -96,4 +101,4 @@ conda_packages.done: bc.done cmake.done flac.done ffmpeg.done sox.done sndfile.d
 else
 conda_packages.done:
 endif
 	touch conda_packages.done
\ No newline at end of file
--- a/tools/extras/install_openfst.sh
+++ b/tools/extras/install_openfst.sh
@@ -7,8 +7,9 @@ set -x
 # openfst
 openfst=openfst-1.8.1
 shared=true
+WGET="wget -c --no-check-certificate"
-test -e ${openfst}.tar.gz || wget http://www.openfst.org/twiki/pub/FST/FstDownload/${openfst}.tar.gz
+test -e ${openfst}.tar.gz || $WGET http://www.openfst.org/twiki/pub/FST/FstDownload/${openfst}.tar.gz
 test -d ${openfst} || tar -xvf ${openfst}.tar.gz && chown -R root:root ${openfst}

--- a/utils/compute-wer.py
+++ b/utils/compute-wer.py
--- a/utils/espnet_json_to_manifest.py
+++ b/utils/espnet_json_to_manifest.py
--- a/utils/fst/prepare_dict.py
+++ b/utils/fst/prepare_dict.py
@@ -3,7 +3,8 @@ import argparse
 def main(args):
-    # load `unit` or `vocab` file
+    # load vocab file
+    # line: token
    unit_table = set()
    with open(args.unit_file, 'r') as fin:
        for line in fin:
@@ -11,27 +12,41 @@ def main(args):
            unit_table.add(unit)
    def contain_oov(units):
+        """Check whether any unit is missing from the vocab.
+        Args:
+            units (Iterable[str]): tokens to look up in `unit_table`.
+        Returns:
+            bool: True if at least one unit is not in the vocab, else False.
+        """
        for unit in units:
            if unit not in unit_table:
                return True
        return False
-    # load spm model
+    # load spm model, for English
    bpemode = args.bpemodel
    if bpemode:
        import sentencepiece as spm
        sp = spm.SentencePieceProcessor()
        sp.Load(sys.bpemodel)
-    # used to filter polyphone
+    # used to filter polyphone and invalid word
    lexicon_table = set()
+    in_n = 0  # in lexicon word count
+    out_n = 0 # out lexicon word cout
    with open(args.in_lexicon, 'r') as fin, \
            open(args.out_lexicon, 'w') as fout:
        for line in fin:
            word = line.split()[0]
+            in_n += 1
            if word == 'SIL' and not bpemode:  # `sil` might be a valid piece in bpemodel
+                # filter 'SIL' for mandarin, keep it in English
                continue
            elif word == '<SPOKEN_NOISE>':
+                # filter <SPOKEN_NOISE>
                continue
            else:
                # each word only has one pronunciation for e2e system
@@ -39,12 +54,14 @@ def main(args):
                    continue
                if bpemode:
+                    # for english
                    pieces = sp.EncodeAsPieces(word)
                    if contain_oov(pieces):
                        print('Ignoring words {}, which contains oov unit'.
                              format(''.join(word).strip('▁')))
                        continue
+                    # word is piece list, which not have <unk> piece, filter out by `contain_oov(pieces)`
                    chars = ' '.join(
                        [p if p in unit_table else '<unk>' for p in pieces])
                else:
@@ -58,11 +75,14 @@ def main(args):
                    # we assume the model unit of our e2e system is char now.
                    if word.encode('utf8').isalpha() and '▁' in unit_table:
                        word = '▁' + word
                    chars = ' '.join(word)  # word is a char list
                fout.write('{} {}\n'.format(word, chars))
                lexicon_table.add(word)
+                out_n += 1
+    print(f"Filter lexicon by unit table: filter out {in_n - out_n}, {out_n}/{in_n}")
 if __name__ == '__main__':
    parser = argparse.ArgumentParser(

--- a/utils/generate_infer_yaml.py
+++ b/utils/generate_infer_yaml.py
--- a/utils/link_wav.py
+++ b/utils/link_wav.py
--- a/utils/manifest_key_value.py
+++ b/utils/manifest_key_value.py
@@ -26,23 +26,39 @@ def main(args):
    with wav_scp.open('w') as fwav, dur_scp.open('w') as fdur, text_scp.open(
            'w') as ftxt:
        for line_json in manifest_jsons:
+            # utt:str
+            # utt2spk:str
+            # input: [{name:str, shape:[dur_in_sec, feat_dim], feat:str, filetype:str}, ]
+            # output: [{name:str, shape:[tokenlen, vocab_dim], text:str, token:str, tokenid:str}, ] 
            utt = line_json['utt']
-            feat = line_json['feat']
+            utt2spk = line_json['utt2spk']
+            # input
+            assert (len(line_json['input']) == 1), "only support one input now"
+            input_json = line_json['input'][0]
+            feat = input_json['feat']
+            feat_shape = input_json['shape']
+            file_type = input_json['filetype']
            file_ext = Path(feat).suffix  # .wav
-            text = line_json['text']
-            feat_shape = line_json['feat_shape']
            dur = feat_shape[0]
            feat_dim = feat_shape[1]
-            if 'token' in line_json:
-                tokens = line_json['token']
-                tokenids = line_json['token_id']
-                token_shape = line_json['token_shape']
-                token_len = token_shape[0]
-                vocab_dim = token_shape[1]
            if file_ext == '.wav':
                fwav.write(f"{utt} {feat}\n")
            fdur.write(f"{utt} {dur}\n")
+            # output
+            assert (
+                len(line_json['output']) == 1), "only support one output now"
+            output_json = line_json['output'][0]
+            text = output_json['text']
+            if 'token' in output_json:
+                tokens = output_json['token']
+                tokenids = output_json['tokenid']
+                token_shape = output_json['shape']
+                token_len = token_shape[0]
+                vocab_dim = token_shape[1]
            ftxt.write(f"{utt} {text}\n")
            count += 1

--- a/utils/zh_tn.py
+++ b/utils/zh_tn.py