Commit dcab04a7, authored by xiongxinlei

merge develop to server

@@ -33,6 +33,12 @@ tools/Miniconda3-latest-Linux-x86_64.sh
tools/activate_python.sh
tools/miniconda.sh
tools/CRF++-0.58/
+tools/liblbfgs-1.10/
+tools/srilm/
+tools/env.sh
+tools/openfst-1.8.1/
+tools/libsndfile/
+tools/python-soundfile/
speechx/fc_patch/
......
@@ -30,21 +30,8 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
        --tones_dict=dump/tone_id_map.txt
fi
-# style melgan
-# style melgan's Dygraph to Static Graph is not ready now
-if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
-    python3 ${BIN_DIR}/../inference.py \
-        --inference_dir=${train_output_path}/inference \
-        --am=speedyspeech_csmsc \
-        --voc=style_melgan_csmsc \
-        --text=${BIN_DIR}/../sentences.txt \
-        --output_dir=${train_output_path}/pd_infer_out \
-        --phones_dict=dump/phone_id_map.txt \
-        --tones_dict=dump/tone_id_map.txt
-fi
# hifigan
-if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    python3 ${BIN_DIR}/../inference.py \
        --inference_dir=${train_output_path}/inference \
        --am=speedyspeech_csmsc \
......
@@ -231,14 +231,19 @@ Pretrained FastSpeech2 model with no silence in the edge of audios:
The static model can be downloaded here:
- [fastspeech2_nosil_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_static_0.4.zip)
- [fastspeech2_csmsc_static_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_csmsc_static_0.2.0.zip)
+- [fastspeech2_cnndecoder_csmsc_static_1.0.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_static_1.0.0.zip)
+- [fastspeech2_cnndecoder_csmsc_streaming_static_1.0.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_streaming_static_1.0.0.zip)

The ONNX model can be downloaded here:
- [fastspeech2_csmsc_onnx_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_csmsc_onnx_0.2.0.zip)
+- [fastspeech2_cnndecoder_csmsc_onnx_1.0.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_onnx_1.0.0.zip)
+- [fastspeech2_cnndecoder_csmsc_streaming_onnx_1.0.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_streaming_onnx_1.0.0.zip)

Model | Step | eval/loss | eval/l1_loss | eval/duration_loss | eval/pitch_loss| eval/energy_loss
:-------------:| :------------:| :-----: | :-----: | :--------: |:--------:|:---------:
default| 2(gpu) x 76000|1.0991|0.59132|0.035815|0.31915|0.15287|
conformer| 2(gpu) x 76000|1.0675|0.56103|0.035869|0.31553|0.15509|
+cnndecoder| 1(gpu) x 153000|1.1153|0.61475|0.03380|0.30414|0.14707|

FastSpeech2 checkpoint contains files listed below.
```text
......
@@ -5,6 +5,7 @@ train_output_path=$1
stage=0
stop_stage=0
+# pwgan
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    python3 ${BIN_DIR}/../inference.py \
        --inference_dir=${train_output_path}/inference \
@@ -27,20 +28,9 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
        --phones_dict=dump/phone_id_map.txt
fi
-# style melgan
-# style melgan's Dygraph to Static Graph is not ready now
-if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
-    python3 ${BIN_DIR}/../inference.py \
-        --inference_dir=${train_output_path}/inference \
-        --am=fastspeech2_csmsc \
-        --voc=style_melgan_csmsc \
-        --text=${BIN_DIR}/../sentences.txt \
-        --output_dir=${train_output_path}/pd_infer_out \
-        --phones_dict=dump/phone_id_map.txt
-fi
# hifigan
-if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    python3 ${BIN_DIR}/../inference.py \
        --inference_dir=${train_output_path}/inference \
        --am=fastspeech2_csmsc \
@@ -51,7 +41,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
fi
# wavernn
-if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    python3 ${BIN_DIR}/../inference.py \
        --inference_dir=${train_output_path}/inference \
        --am=fastspeech2_csmsc \
......
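These local scripts feed exported static graphs to Paddle Inference via `inference.py`. For orientation, a minimal sketch of that flow using the Paddle Inference Python API; the file names under `${train_output_path}/inference` and the single phone-id input are assumptions for illustration, not taken from this diff:

```python
import numpy as np
from paddle.inference import Config, create_predictor

# Hypothetical exported files; real names depend on the --am argument.
config = Config("inference/fastspeech2_csmsc.pdmodel",
                "inference/fastspeech2_csmsc.pdiparams")
predictor = create_predictor(config)

# Feed a toy phone-id sequence and read back the predicted mel spectrogram.
phone_ids = np.array([[1, 2, 3, 4]], dtype=np.int64)
input_handle = predictor.get_input_handle(predictor.get_input_names()[0])
input_handle.copy_from_cpu(phone_ids)
predictor.run()
mel = predictor.get_output_handle(predictor.get_output_names()[0]).copy_to_cpu()
print(mel.shape)
```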
#!/bin/bash

train_output_path=$1
stage=0
stop_stage=0

# pwgan
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    python3 ${BIN_DIR}/../inference_streaming.py \
        --inference_dir=${train_output_path}/inference_streaming \
        --am=fastspeech2_csmsc \
        --am_stat=dump/train/speech_stats.npy \
        --voc=pwgan_csmsc \
        --text=${BIN_DIR}/../sentences.txt \
        --output_dir=${train_output_path}/pd_infer_out_streaming \
        --phones_dict=dump/phone_id_map.txt \
        --am_streaming=True
fi

# for more GAN Vocoders
# multi band melgan
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    python3 ${BIN_DIR}/../inference_streaming.py \
        --inference_dir=${train_output_path}/inference_streaming \
        --am=fastspeech2_csmsc \
        --am_stat=dump/train/speech_stats.npy \
        --voc=mb_melgan_csmsc \
        --text=${BIN_DIR}/../sentences.txt \
        --output_dir=${train_output_path}/pd_infer_out_streaming \
        --phones_dict=dump/phone_id_map.txt \
        --am_streaming=True
fi

# hifigan
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    python3 ${BIN_DIR}/../inference_streaming.py \
        --inference_dir=${train_output_path}/inference_streaming \
        --am=fastspeech2_csmsc \
        --am_stat=dump/train/speech_stats.npy \
        --voc=hifigan_csmsc \
        --text=${BIN_DIR}/../sentences.txt \
        --output_dir=${train_output_path}/pd_infer_out_streaming \
        --phones_dict=dump/phone_id_map.txt \
        --am_streaming=True
fi
train_output_path=$1
stage=0
stop_stage=0

# e2e, synthesize from text
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    python3 ${BIN_DIR}/../ort_predict_streaming.py \
        --inference_dir=${train_output_path}/inference_onnx_streaming \
        --am=fastspeech2_csmsc \
        --am_stat=dump/train/speech_stats.npy \
        --voc=hifigan_csmsc \
        --output_dir=${train_output_path}/onnx_infer_out_streaming \
        --text=${BIN_DIR}/../csmsc_test.txt \
        --phones_dict=dump/phone_id_map.txt \
        --device=cpu \
        --cpu_threads=2 \
        --am_streaming=True
fi
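`ort_predict_streaming.py` takes `--device=cpu --cpu_threads=2`, which map naturally onto onnxruntime session options. A minimal sketch of opening one of the exported streaming parts; the model file name is an assumption based on the paddle2onnx stage below:

```python
import onnxruntime as ort

sess_options = ort.SessionOptions()
sess_options.intra_op_num_threads = 2  # mirrors --cpu_threads=2

# Assumed file name; paddle2onnx is invoked with fastspeech2_csmsc_am_encoder_infer.
session = ort.InferenceSession(
    "inference_onnx_streaming/fastspeech2_csmsc_am_encoder_infer.onnx",
    sess_options,
    providers=["CPUExecutionProvider"])  # mirrors --device=cpu
print([inp.name for inp in session.get_inputs()])
```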
@@ -88,5 +88,6 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
        --text=${BIN_DIR}/../sentences.txt \
        --output_dir=${train_output_path}/test_e2e_streaming \
        --phones_dict=dump/phone_id_map.txt \
-        --am_streaming=True
+        --am_streaming=True \
+        --inference_dir=${train_output_path}/inference_streaming
fi
@@ -31,18 +31,75 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi
+# synthesize_e2e non-streaming
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    # synthesize_e2e, vocoder is pwgan
    CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi
+# inference non-streaming
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
    # inference with static model
    CUDA_VISIBLE_DEVICES=${gpus} ./local/inference.sh ${train_output_path} || exit -1
fi
+# synthesize_e2e streaming
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
    # synthesize_e2e, vocoder is pwgan
    CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_streaming.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi
+# inference streaming
+if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
+    # inference with static model
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/inference_streaming.sh ${train_output_path} || exit -1
+fi
+# paddle2onnx non-streaming
+if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then
+    # install paddle2onnx
+    version=$(echo `pip list |grep "paddle2onnx"` |awk -F" " '{print $2}')
+    if [[ -z "$version" || ${version} != '0.9.4' ]]; then
+        pip install paddle2onnx==0.9.4
+    fi
+    ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx fastspeech2_csmsc
+    ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx hifigan_csmsc
+fi
+# onnxruntime non-streaming
+# inference with onnxruntime, use fastspeech2 + hifigan by default
+if [ ${stage} -le 8 ] && [ ${stop_stage} -ge 8 ]; then
+    # install onnxruntime
+    version=$(echo `pip list |grep "onnxruntime"` |awk -F" " '{print $2}')
+    if [[ -z "$version" || ${version} != '1.10.0' ]]; then
+        pip install onnxruntime==1.10.0
+    fi
+    ./local/ort_predict.sh ${train_output_path}
+fi
+# paddle2onnx streaming
+if [ ${stage} -le 9 ] && [ ${stop_stage} -ge 9 ]; then
+    # install paddle2onnx
+    version=$(echo `pip list |grep "paddle2onnx"` |awk -F" " '{print $2}')
+    if [[ -z "$version" || ${version} != '0.9.4' ]]; then
+        pip install paddle2onnx==0.9.4
+    fi
+    # streaming acoustic model
+    ./local/paddle2onnx.sh ${train_output_path} inference_streaming inference_onnx_streaming fastspeech2_csmsc_am_encoder_infer
+    ./local/paddle2onnx.sh ${train_output_path} inference_streaming inference_onnx_streaming fastspeech2_csmsc_am_decoder
+    ./local/paddle2onnx.sh ${train_output_path} inference_streaming inference_onnx_streaming fastspeech2_csmsc_am_postnet
+    # vocoder
+    ./local/paddle2onnx.sh ${train_output_path} inference_streaming inference_onnx_streaming hifigan_csmsc
+fi
+# onnxruntime streaming
+if [ ${stage} -le 10 ] && [ ${stop_stage} -ge 10 ]; then
+    # install onnxruntime
+    version=$(echo `pip list |grep "onnxruntime"` |awk -F" " '{print $2}')
+    if [[ -z "$version" || ${version} != '1.10.0' ]]; then
+        pip install onnxruntime==1.10.0
+    fi
+    ./local/ort_predict_streaming.sh ${train_output_path}
+fi
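The `pip list | grep | awk` pinning above can be checked without spawning subprocesses; a standard-library sketch of the same version gate:

```python
from importlib.metadata import PackageNotFoundError, version

def needs_install(package: str, wanted: str) -> bool:
    """True when `pip install {package}=={wanted}` should be run."""
    try:
        return version(package) != wanted
    except PackageNotFoundError:
        return True

print(needs_install("paddle2onnx", "0.9.4"))
print(needs_install("onnxruntime", "1.10.0"))
```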
@@ -27,7 +27,7 @@ arpa=$3
if [ $stage -le 0 ] && [ $stop_stage -ge 0 ];then
    # text tn & wordseg preprocess
    echo "process text."
-    python3 ${MAIN_ROOT}/utils/zh_tn.py ${type} ${text} ${text}.${type}.tn
+    python3 ${MAIN_ROOT}/utils/zh_tn.py --token_type ${type} ${text} ${text}.${type}.tn
fi
if [ $stage -le 1 ] && [ $stop_stage -ge 1 ];then
......
@@ -10,6 +10,11 @@ MD5="29e02312deb2e59b3c8686c7966d4fe3"
TARGET=${DIR}/zh_giga.no_cna_cmn.prune01244.klm

+if [ -e $TARGET ];then
+    echo "already have lm"
+    exit 0;
+fi
+
echo "Download language model ..."
download $URL $MD5 $TARGET
if [ $? -ne 0 ]; then
......
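The `download` helper invoked above is defined elsewhere in the repo; a stand-in with the same contract (fetch, then verify the MD5), written only to illustrate the check:

```python
import hashlib
import os
from urllib.request import urlretrieve

def download(url: str, md5: str, target: str) -> bool:
    """Fetch url to target and verify its MD5 checksum."""
    if not os.path.exists(target):
        urlretrieve(url, target)
    with open(target, "rb") as f:
        return hashlib.md5(f.read()).hexdigest() == md5

ok = download(
    "https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm",
    "29e02312deb2e59b3c8686c7966d4fe3",
    "zh_giga.no_cna_cmn.prune01244.klm")
```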
@@ -29,9 +29,10 @@ from ..download import get_path_from_url
from ..executor import BaseExecutor
from ..log import logger
from ..utils import cli_register
-from ..utils import download_and_decompress
from ..utils import MODEL_HOME
from ..utils import stats_wrapper
+from .pretrained_models import model_alias
+from .pretrained_models import pretrained_models
from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
from paddlespeech.s2t.transform.transformation import Transformation
from paddlespeech.s2t.utils.dynamic_import import dynamic_import
@@ -39,110 +40,13 @@ from paddlespeech.s2t.utils.utility import UpdateConfig
__all__ = ['ASRExecutor']

-pretrained_models = {
-    # The tags for pretrained_models should be "{model_name}[_{dataset}][-{lang}][-...]".
-    # e.g. "conformer_wenetspeech-zh-16k" and "panns_cnn6-32k".
-    # Command line and python api use "{model_name}[_{dataset}]" as --model, usage:
-    # "paddlespeech asr --model conformer_wenetspeech --lang zh --sr 16000 --input ./input.wav"
-    "conformer_wenetspeech-zh-16k": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1_conformer_wenetspeech_ckpt_0.1.1.model.tar.gz',
-        'md5':
-        '76cb19ed857e6623856b7cd7ebbfeda4',
-        'cfg_path':
-        'model.yaml',
-        'ckpt_path':
-        'exp/conformer/checkpoints/wenetspeech',
-    },
-    "transformer_librispeech-en-16k": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr1/asr1_transformer_librispeech_ckpt_0.1.1.model.tar.gz',
-        'md5':
-        '2c667da24922aad391eacafe37bc1660',
-        'cfg_path':
-        'model.yaml',
-        'ckpt_path':
-        'exp/transformer/checkpoints/avg_10',
-    },
-    "deepspeech2offline_aishell-zh-16k": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_aishell_ckpt_0.1.1.model.tar.gz',
-        'md5':
-        '932c3593d62fe5c741b59b31318aa314',
-        'cfg_path':
-        'model.yaml',
-        'ckpt_path':
-        'exp/deepspeech2/checkpoints/avg_1',
-        'lm_url':
-        'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm',
-        'lm_md5':
-        '29e02312deb2e59b3c8686c7966d4fe3'
-    },
-    "deepspeech2online_aishell-zh-16k": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz',
-        'md5':
-        '23e16c69730a1cb5d735c98c83c21e16',
-        'cfg_path':
-        'model.yaml',
-        'ckpt_path':
-        'exp/deepspeech2_online/checkpoints/avg_1',
-        'lm_url':
-        'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm',
-        'lm_md5':
-        '29e02312deb2e59b3c8686c7966d4fe3'
-    },
-    "conformer2online_aishell-zh-16k": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_chunk_conformer_aishell_ckpt_0.1.2.model.tar.gz',
-        'md5':
-        '4814e52e0fc2fd48899373f95c84b0c9',
-        'cfg_path':
-        'config.yaml',
-        'ckpt_path':
-        'exp/deepspeech2_online/checkpoints/avg_30',
-        'lm_url':
-        'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm',
-        'lm_md5':
-        '29e02312deb2e59b3c8686c7966d4fe3'
-    },
-    "deepspeech2offline_librispeech-en-16k": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr0/asr0_deepspeech2_librispeech_ckpt_0.1.1.model.tar.gz',
-        'md5':
-        'f5666c81ad015c8de03aac2bc92e5762',
-        'cfg_path':
-        'model.yaml',
-        'ckpt_path':
-        'exp/deepspeech2/checkpoints/avg_1',
-        'lm_url':
-        'https://deepspeech.bj.bcebos.com/en_lm/common_crawl_00.prune01111.trie.klm',
-        'lm_md5':
-        '099a601759d467cd0a8523ff939819c5'
-    },
-}
-
-model_alias = {
-    "deepspeech2offline":
-    "paddlespeech.s2t.models.ds2:DeepSpeech2Model",
-    "deepspeech2online":
-    "paddlespeech.s2t.models.ds2_online:DeepSpeech2ModelOnline",
-    "conformer":
-    "paddlespeech.s2t.models.u2:U2Model",
-    "conformer_online":
-    "paddlespeech.s2t.models.u2:U2Model",
-    "transformer":
-    "paddlespeech.s2t.models.u2:U2Model",
-    "wenetspeech":
-    "paddlespeech.s2t.models.u2:U2Model",
-}
-
@cli_register(
    name='paddlespeech.asr', description='Speech to text infer command.')
class ASRExecutor(BaseExecutor):
    def __init__(self):
-        super(ASRExecutor, self).__init__()
+        super().__init__()
+        self.model_alias = model_alias
+        self.pretrained_models = pretrained_models
        self.parser = argparse.ArgumentParser(
            prog='paddlespeech.asr', add_help=True)
@@ -152,7 +56,9 @@ class ASRExecutor(BaseExecutor):
            '--model',
            type=str,
            default='conformer_wenetspeech',
-            choices=[tag[:tag.index('-')] for tag in pretrained_models.keys()],
+            choices=[
+                tag[:tag.index('-')] for tag in self.pretrained_models.keys()
+            ],
            help='Choose model type of asr task.')
        self.parser.add_argument(
            '--lang',
@@ -208,23 +114,6 @@ class ASRExecutor(BaseExecutor):
            action='store_true',
            help='Increase logger verbosity of current task.')

-    def _get_pretrained_path(self, tag: str) -> os.PathLike:
-        """
-        Download and returns pretrained resources path of current task.
-        """
-        support_models = list(pretrained_models.keys())
-        assert tag in pretrained_models, 'The model "{}" you want to use has not been supported, please choose other models.\nThe support models includes:\n\t\t{}\n'.format(
-            tag, '\n\t\t'.join(support_models))
-        res_path = os.path.join(MODEL_HOME, tag)
-        decompressed_path = download_and_decompress(pretrained_models[tag],
-                                                    res_path)
-        decompressed_path = os.path.abspath(decompressed_path)
-        logger.info(
-            'Use pretrained model stored in: {}'.format(decompressed_path))
-        return decompressed_path
-
    def _init_from_path(self,
                        model_type: str='wenetspeech',
                        lang: str='zh',
@@ -245,10 +134,11 @@ class ASRExecutor(BaseExecutor):
            tag = model_type + '-' + lang + '-' + sample_rate_str
            res_path = self._get_pretrained_path(tag)  # wenetspeech_zh
            self.res_path = res_path
-            self.cfg_path = os.path.join(res_path,
-                                         pretrained_models[tag]['cfg_path'])
-            self.ckpt_path = os.path.join(
-                res_path, pretrained_models[tag]['ckpt_path'] + ".pdparams")
+            self.cfg_path = os.path.join(
+                res_path, self.pretrained_models[tag]['cfg_path'])
+            self.ckpt_path = os.path.join(
+                res_path,
+                self.pretrained_models[tag]['ckpt_path'] + ".pdparams")
            logger.info(res_path)
        else:
@@ -273,8 +163,8 @@ class ASRExecutor(BaseExecutor):
            self.collate_fn_test = SpeechCollator.from_config(self.config)
            self.text_feature = TextFeaturizer(
                unit_type=self.config.unit_type, vocab=self.vocab)
-            lm_url = pretrained_models[tag]['lm_url']
-            lm_md5 = pretrained_models[tag]['lm_md5']
+            lm_url = self.pretrained_models[tag]['lm_url']
+            lm_md5 = self.pretrained_models[tag]['lm_md5']
            self.download_lm(
                lm_url,
                os.path.dirname(self.config.decode.lang_model_path), lm_md5)
@@ -291,7 +181,7 @@ class ASRExecutor(BaseExecutor):
            raise Exception("wrong type")
        model_name = model_type[:model_type.rindex(
            '_')]  # model_type: {model_name}_{dataset}
-        model_class = dynamic_import(model_name, model_alias)
+        model_class = dynamic_import(model_name, self.model_alias)
        model_conf = self.config
        model = model_class.from_config(model_conf)
        self.model = model
......
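The `model_alias` values follow a `"module.path:ClassName"` convention that `dynamic_import` resolves at run time. A minimal re-implementation of the idea (the real helper lives in `paddlespeech.s2t.utils.dynamic_import` and may differ in detail):

```python
import importlib

def dynamic_import_sketch(name: str, alias: dict):
    """Resolve a short model name to a class via 'module.path:ClassName'."""
    module_path, class_name = alias[name].split(":")
    return getattr(importlib.import_module(module_path), class_name)

# e.g. dynamic_import_sketch("conformer", model_alias) returns U2Model.
```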
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
pretrained_models = {
    # The tags for pretrained_models should be "{model_name}[_{dataset}][-{lang}][-...]".
    # e.g. "conformer_wenetspeech-zh-16k" and "panns_cnn6-32k".
    # Command line and python api use "{model_name}[_{dataset}]" as --model, usage:
    # "paddlespeech asr --model conformer_wenetspeech --lang zh --sr 16000 --input ./input.wav"
    "conformer_wenetspeech-zh-16k": {
        'url':
        'https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1_conformer_wenetspeech_ckpt_0.1.1.model.tar.gz',
        'md5':
        '76cb19ed857e6623856b7cd7ebbfeda4',
        'cfg_path':
        'model.yaml',
        'ckpt_path':
        'exp/conformer/checkpoints/wenetspeech',
    },
    "transformer_librispeech-en-16k": {
        'url':
        'https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr1/asr1_transformer_librispeech_ckpt_0.1.1.model.tar.gz',
        'md5':
        '2c667da24922aad391eacafe37bc1660',
        'cfg_path':
        'model.yaml',
        'ckpt_path':
        'exp/transformer/checkpoints/avg_10',
    },
    "deepspeech2offline_aishell-zh-16k": {
        'url':
        'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_aishell_ckpt_0.1.1.model.tar.gz',
        'md5':
        '932c3593d62fe5c741b59b31318aa314',
        'cfg_path':
        'model.yaml',
        'ckpt_path':
        'exp/deepspeech2/checkpoints/avg_1',
        'lm_url':
        'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm',
        'lm_md5':
        '29e02312deb2e59b3c8686c7966d4fe3'
    },
    "deepspeech2online_aishell-zh-16k": {
        'url':
        'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz',
        'md5':
        '23e16c69730a1cb5d735c98c83c21e16',
        'cfg_path':
        'model.yaml',
        'ckpt_path':
        'exp/deepspeech2_online/checkpoints/avg_1',
        'lm_url':
        'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm',
        'lm_md5':
        '29e02312deb2e59b3c8686c7966d4fe3'
    },
    "deepspeech2offline_librispeech-en-16k": {
        'url':
        'https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr0/asr0_deepspeech2_librispeech_ckpt_0.1.1.model.tar.gz',
        'md5':
        'f5666c81ad015c8de03aac2bc92e5762',
        'cfg_path':
        'model.yaml',
        'ckpt_path':
        'exp/deepspeech2/checkpoints/avg_1',
        'lm_url':
        'https://deepspeech.bj.bcebos.com/en_lm/common_crawl_00.prune01111.trie.klm',
        'lm_md5':
        '099a601759d467cd0a8523ff939819c5'
    },
}

model_alias = {
    "deepspeech2offline":
    "paddlespeech.s2t.models.ds2:DeepSpeech2Model",
    "deepspeech2online":
    "paddlespeech.s2t.models.ds2_online:DeepSpeech2ModelOnline",
    "conformer":
    "paddlespeech.s2t.models.u2:U2Model",
    "transformer":
    "paddlespeech.s2t.models.u2:U2Model",
    "wenetspeech":
    "paddlespeech.s2t.models.u2:U2Model",
}
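The tag convention in the comment above is what lets the CLI derive its `--model` choices and then rebuild the full lookup key; mirroring `tag = model_type + '-' + lang + '-' + sample_rate_str` from the executor:

```python
tag = "conformer_wenetspeech-zh-16k"

# --model accepts everything before the first '-' ...
model_type = tag[:tag.index("-")]              # "conformer_wenetspeech"
# ... and the executor reassembles the tag from --lang and --sr.
lang, sample_rate_str = tag[tag.index("-") + 1:].split("-")
assert model_type + "-" + lang + "-" + sample_rate_str == tag
```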
@@ -25,55 +25,23 @@ import yaml
from ..executor import BaseExecutor
from ..log import logger
from ..utils import cli_register
-from ..utils import download_and_decompress
-from ..utils import MODEL_HOME
from ..utils import stats_wrapper
+from .pretrained_models import model_alias
+from .pretrained_models import pretrained_models
from paddleaudio import load
from paddleaudio.features import LogMelSpectrogram
from paddlespeech.s2t.utils.dynamic_import import dynamic_import

__all__ = ['CLSExecutor']

-pretrained_models = {
-    # The tags for pretrained_models should be "{model_name}[_{dataset}][-{lang}][-...]".
-    # e.g. "conformer_wenetspeech-zh-16k", "transformer_aishell-zh-16k" and "panns_cnn6-32k".
-    # Command line and python api use "{model_name}[_{dataset}]" as --model, usage:
-    # "paddlespeech asr --model conformer_wenetspeech --lang zh --sr 16000 --input ./input.wav"
-    "panns_cnn6-32k": {
-        'url': 'https://paddlespeech.bj.bcebos.com/cls/panns_cnn6.tar.gz',
-        'md5': '4cf09194a95df024fd12f84712cf0f9c',
-        'cfg_path': 'panns.yaml',
-        'ckpt_path': 'cnn6.pdparams',
-        'label_file': 'audioset_labels.txt',
-    },
-    "panns_cnn10-32k": {
-        'url': 'https://paddlespeech.bj.bcebos.com/cls/panns_cnn10.tar.gz',
-        'md5': 'cb8427b22176cc2116367d14847f5413',
-        'cfg_path': 'panns.yaml',
-        'ckpt_path': 'cnn10.pdparams',
-        'label_file': 'audioset_labels.txt',
-    },
-    "panns_cnn14-32k": {
-        'url': 'https://paddlespeech.bj.bcebos.com/cls/panns_cnn14.tar.gz',
-        'md5': 'e3b9b5614a1595001161d0ab95edee97',
-        'cfg_path': 'panns.yaml',
-        'ckpt_path': 'cnn14.pdparams',
-        'label_file': 'audioset_labels.txt',
-    },
-}
-
-model_alias = {
-    "panns_cnn6": "paddlespeech.cls.models.panns:CNN6",
-    "panns_cnn10": "paddlespeech.cls.models.panns:CNN10",
-    "panns_cnn14": "paddlespeech.cls.models.panns:CNN14",
-}
-
@cli_register(
    name='paddlespeech.cls', description='Audio classification infer command.')
class CLSExecutor(BaseExecutor):
    def __init__(self):
-        super(CLSExecutor, self).__init__()
+        super().__init__()
+        self.model_alias = model_alias
+        self.pretrained_models = pretrained_models
        self.parser = argparse.ArgumentParser(
            prog='paddlespeech.cls', add_help=True)
@@ -83,7 +51,9 @@ class CLSExecutor(BaseExecutor):
            '--model',
            type=str,
            default='panns_cnn14',
-            choices=[tag[:tag.index('-')] for tag in pretrained_models.keys()],
+            choices=[
+                tag[:tag.index('-')] for tag in self.pretrained_models.keys()
+            ],
            help='Choose model type of cls task.')
        self.parser.add_argument(
            '--config',
@@ -121,23 +91,6 @@ class CLSExecutor(BaseExecutor):
            action='store_true',
            help='Increase logger verbosity of current task.')

-    def _get_pretrained_path(self, tag: str) -> os.PathLike:
-        """
-        Download and returns pretrained resources path of current task.
-        """
-        support_models = list(pretrained_models.keys())
-        assert tag in pretrained_models, 'The model "{}" you want to use has not been supported, please choose other models.\nThe support models includes:\n\t\t{}\n'.format(
-            tag, '\n\t\t'.join(support_models))
-        res_path = os.path.join(MODEL_HOME, tag)
-        decompressed_path = download_and_decompress(pretrained_models[tag],
-                                                    res_path)
-        decompressed_path = os.path.abspath(decompressed_path)
-        logger.info(
-            'Use pretrained model stored in: {}'.format(decompressed_path))
-        return decompressed_path
-
    def _init_from_path(self,
                        model_type: str='panns_cnn14',
                        cfg_path: Optional[os.PathLike]=None,
@@ -153,12 +106,12 @@ class CLSExecutor(BaseExecutor):
        if label_file is None or ckpt_path is None:
            tag = model_type + '-' + '32k'  # panns_cnn14-32k
            self.res_path = self._get_pretrained_path(tag)
-            self.cfg_path = os.path.join(self.res_path,
-                                         pretrained_models[tag]['cfg_path'])
-            self.label_file = os.path.join(self.res_path,
-                                           pretrained_models[tag]['label_file'])
-            self.ckpt_path = os.path.join(self.res_path,
-                                          pretrained_models[tag]['ckpt_path'])
+            self.cfg_path = os.path.join(
+                self.res_path, self.pretrained_models[tag]['cfg_path'])
+            self.label_file = os.path.join(
+                self.res_path, self.pretrained_models[tag]['label_file'])
+            self.ckpt_path = os.path.join(
+                self.res_path, self.pretrained_models[tag]['ckpt_path'])
        else:
            self.cfg_path = os.path.abspath(cfg_path)
            self.label_file = os.path.abspath(label_file)
@@ -175,7 +128,7 @@ class CLSExecutor(BaseExecutor):
                self._label_list.append(line.strip())

        # model
-        model_class = dynamic_import(model_type, model_alias)
+        model_class = dynamic_import(model_type, self.model_alias)
        model_dict = paddle.load(self.ckpt_path)
        self.model = model_class(extract_embedding=False)
        self.model.set_state_dict(model_dict)
......
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
pretrained_models = {
    # The tags for pretrained_models should be "{model_name}[_{dataset}][-{lang}][-...]".
    # e.g. "conformer_wenetspeech-zh-16k", "transformer_aishell-zh-16k" and "panns_cnn6-32k".
    # Command line and python api use "{model_name}[_{dataset}]" as --model, usage:
    # "paddlespeech asr --model conformer_wenetspeech --lang zh --sr 16000 --input ./input.wav"
    "panns_cnn6-32k": {
        'url': 'https://paddlespeech.bj.bcebos.com/cls/panns_cnn6.tar.gz',
        'md5': '4cf09194a95df024fd12f84712cf0f9c',
        'cfg_path': 'panns.yaml',
        'ckpt_path': 'cnn6.pdparams',
        'label_file': 'audioset_labels.txt',
    },
    "panns_cnn10-32k": {
        'url': 'https://paddlespeech.bj.bcebos.com/cls/panns_cnn10.tar.gz',
        'md5': 'cb8427b22176cc2116367d14847f5413',
        'cfg_path': 'panns.yaml',
        'ckpt_path': 'cnn10.pdparams',
        'label_file': 'audioset_labels.txt',
    },
    "panns_cnn14-32k": {
        'url': 'https://paddlespeech.bj.bcebos.com/cls/panns_cnn14.tar.gz',
        'md5': 'e3b9b5614a1595001161d0ab95edee97',
        'cfg_path': 'panns.yaml',
        'ckpt_path': 'cnn14.pdparams',
        'label_file': 'audioset_labels.txt',
    },
}

model_alias = {
    "panns_cnn6": "paddlespeech.cls.models.panns:CNN6",
    "panns_cnn10": "paddlespeech.cls.models.panns:CNN10",
    "panns_cnn14": "paddlespeech.cls.models.panns:CNN14",
}
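Combining the alias table with a checkpoint's `ckpt_path` field, loading one of these classifiers follows the pattern in `CLSExecutor._init_from_path`; a condensed sketch, with a placeholder path standing in for the decompressed resource:

```python
import paddle
from paddlespeech.s2t.utils.dynamic_import import dynamic_import

model_class = dynamic_import("panns_cnn14", model_alias)
model = model_class(extract_embedding=False)
model.set_state_dict(paddle.load("cnn14.pdparams"))  # placeholder path
model.eval()
```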
@@ -25,6 +25,8 @@ from typing import Union
import paddle

from .log import logger
+from .utils import download_and_decompress
+from .utils import MODEL_HOME


class BaseExecutor(ABC):
@@ -35,19 +37,8 @@ class BaseExecutor(ABC):
    def __init__(self):
        self._inputs = OrderedDict()
        self._outputs = OrderedDict()
+        self.pretrained_models = OrderedDict()
+        self.model_alias = OrderedDict()

-    @abstractmethod
-    def _get_pretrained_path(self, tag: str) -> os.PathLike:
-        """
-        Download and returns pretrained resources path of current task.
-
-        Args:
-            tag (str): A tag of pretrained model.
-
-        Returns:
-            os.PathLike: The path on which resources of pretrained model locate.
-        """
-        pass
-
    @abstractmethod
    def _init_from_path(self, *args, **kwargs):
@@ -227,3 +218,20 @@ class BaseExecutor(ABC):
        ]
        for l in loggers:
            l.disabled = True
+
+    def _get_pretrained_path(self, tag: str) -> os.PathLike:
+        """
+        Download and returns pretrained resources path of current task.
+        """
+        support_models = list(self.pretrained_models.keys())
+        assert tag in self.pretrained_models, 'The model "{}" you want to use has not been supported, please choose other models.\nThe support models includes:\n\t\t{}\n'.format(
+            tag, '\n\t\t'.join(support_models))
+        res_path = os.path.join(MODEL_HOME, tag)
+        decompressed_path = download_and_decompress(self.pretrained_models[tag],
+                                                    res_path)
+        decompressed_path = os.path.abspath(decompressed_path)
+        logger.info(
+            'Use pretrained model stored in: {}'.format(decompressed_path))
+        return decompressed_path
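With the hook now on the base class, any executor that fills `self.pretrained_models` in its `__init__` inherits the download-and-cache resolution; sketched usage (tag and import path are illustrative):

```python
from paddlespeech.cli.cls.infer import CLSExecutor

executor = CLSExecutor()
# Resolves the tag against executor.pretrained_models, downloads into
# MODEL_HOME/<tag> on first use, and returns the decompressed path.
res_path = executor._get_pretrained_path("panns_cnn14-32k")
```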
@@ -32,40 +32,24 @@ from ..utils import cli_register
from ..utils import download_and_decompress
from ..utils import MODEL_HOME
from ..utils import stats_wrapper
+from .pretrained_models import kaldi_bins
+from .pretrained_models import model_alias
+from .pretrained_models import pretrained_models
from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
from paddlespeech.s2t.utils.dynamic_import import dynamic_import
from paddlespeech.s2t.utils.utility import UpdateConfig

__all__ = ["STExecutor"]

-pretrained_models = {
-    "fat_st_ted-en-zh": {
-        "url":
-        "https://paddlespeech.bj.bcebos.com/s2t/ted_en_zh/st1/st1_transformer_mtl_noam_ted-en-zh_ckpt_0.1.1.model.tar.gz",
-        "md5":
-        "d62063f35a16d91210a71081bd2dd557",
-        "cfg_path":
-        "model.yaml",
-        "ckpt_path":
-        "exp/transformer_mtl_noam/checkpoints/fat_st_ted-en-zh.pdparams",
-    }
-}
-
-model_alias = {"fat_st": "paddlespeech.s2t.models.u2_st:U2STModel"}
-
-kaldi_bins = {
-    "url":
-    "https://paddlespeech.bj.bcebos.com/s2t/ted_en_zh/st1/kaldi_bins.tar.gz",
-    "md5":
-    "c0682303b3f3393dbf6ed4c4e35a53eb",
-}
-
@cli_register(
    name="paddlespeech.st", description="Speech translation infer command.")
class STExecutor(BaseExecutor):
    def __init__(self):
-        super(STExecutor, self).__init__()
+        super().__init__()
+        self.model_alias = model_alias
+        self.pretrained_models = pretrained_models
+        self.kaldi_bins = kaldi_bins
        self.parser = argparse.ArgumentParser(
            prog="paddlespeech.st", add_help=True)
@@ -75,7 +59,9 @@ class STExecutor(BaseExecutor):
            "--model",
            type=str,
            default="fat_st_ted",
-            choices=[tag[:tag.index('-')] for tag in pretrained_models.keys()],
+            choices=[
+                tag[:tag.index('-')] for tag in self.pretrained_models.keys()
+            ],
            help="Choose model type of st task.")
        self.parser.add_argument(
            "--src_lang",
@@ -119,28 +105,11 @@ class STExecutor(BaseExecutor):
            action='store_true',
            help='Increase logger verbosity of current task.')

-    def _get_pretrained_path(self, tag: str) -> os.PathLike:
-        """
-        Download and returns pretrained resources path of current task.
-        """
-        support_models = list(pretrained_models.keys())
-        assert tag in pretrained_models, 'The model "{}" you want to use has not been supported, please choose other models.\nThe support models includes:\n\t\t{}\n'.format(
-            tag, '\n\t\t'.join(support_models))
-        res_path = os.path.join(MODEL_HOME, tag)
-        decompressed_path = download_and_decompress(pretrained_models[tag],
-                                                    res_path)
-        decompressed_path = os.path.abspath(decompressed_path)
-        logger.info(
-            "Use pretrained model stored in: {}".format(decompressed_path))
-        return decompressed_path
-
    def _set_kaldi_bins(self) -> os.PathLike:
        """
        Download and returns kaldi_bins resources path of current task.
        """
-        decompressed_path = download_and_decompress(kaldi_bins, MODEL_HOME)
+        decompressed_path = download_and_decompress(self.kaldi_bins, MODEL_HOME)
        decompressed_path = os.path.abspath(decompressed_path)
        logger.info("Kaldi_bins stored in: {}".format(decompressed_path))
        if "LD_LIBRARY_PATH" in os.environ:
@@ -197,7 +166,7 @@ class STExecutor(BaseExecutor):
        model_conf = self.config
        model_name = model_type[:model_type.rindex(
            '_')]  # model_type: {model_name}_{dataset}
-        model_class = dynamic_import(model_name, model_alias)
+        model_class = dynamic_import(model_name, self.model_alias)
        self.model = model_class.from_config(model_conf)
        self.model.eval()
......
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
pretrained_models = {
    "fat_st_ted-en-zh": {
        "url":
        "https://paddlespeech.bj.bcebos.com/s2t/ted_en_zh/st1/st1_transformer_mtl_noam_ted-en-zh_ckpt_0.1.1.model.tar.gz",
        "md5":
        "d62063f35a16d91210a71081bd2dd557",
        "cfg_path":
        "model.yaml",
        "ckpt_path":
        "exp/transformer_mtl_noam/checkpoints/fat_st_ted-en-zh.pdparams",
    }
}

model_alias = {"fat_st": "paddlespeech.s2t.models.u2_st:U2STModel"}

kaldi_bins = {
    "url":
    "https://paddlespeech.bj.bcebos.com/s2t/ted_en_zh/st1/kaldi_bins.tar.gz",
    "md5":
    "c0682303b3f3393dbf6ed4c4e35a53eb",
}
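`_set_kaldi_bins` in the executor prepends the decompressed `kaldi_bins` directory to the process environment; only the `LD_LIBRARY_PATH` branch is visible in this diff, so the following is an assumed reconstruction of the core manipulation:

```python
import os

kaldi_dir = "/path/to/MODEL_HOME/kaldi_bins"  # illustrative location
old = os.environ.get("LD_LIBRARY_PATH", "")
os.environ["LD_LIBRARY_PATH"] = kaldi_dir + (":" + old if old else "")
os.environ["PATH"] = kaldi_dir + ":" + os.environ["PATH"]
```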
@@ -16,7 +16,6 @@ from typing import List
from prettytable import PrettyTable

-from ..log import logger
from ..utils import cli_register
from ..utils import stats_wrapper
@@ -27,7 +26,8 @@ model_name_format = {
    'cls': 'Model-Sample Rate',
    'st': 'Model-Source language-Target language',
    'text': 'Model-Task-Language',
-    'tts': 'Model-Language'
+    'tts': 'Model-Language',
+    'vector': 'Model-Sample Rate'
}
@@ -36,18 +36,18 @@ model_name_format = {
    description='Get speech tasks support models list.')
class StatsExecutor():
    def __init__(self):
-        super(StatsExecutor, self).__init__()
+        super().__init__()
        self.parser = argparse.ArgumentParser(
            prog='paddlespeech.stats', add_help=True)
+        self.task_choices = ['asr', 'cls', 'st', 'text', 'tts', 'vector']
        self.parser.add_argument(
            '--task',
            type=str,
            default='asr',
-            choices=['asr', 'cls', 'st', 'text', 'tts'],
+            choices=self.task_choices,
            help='Choose speech task.',
            required=True)
-        self.task_choices = ['asr', 'cls', 'st', 'text', 'tts']

    def show_support_models(self, pretrained_models: dict):
        fields = model_name_format[self.task].split("-")
@@ -61,73 +61,15 @@ class StatsExecutor():
        Command line entry.
        """
        parser_args = self.parser.parse_args(argv)
-        self.task = parser_args.task
-        if self.task not in self.task_choices:
-            logger.error(
-                "Please input correct speech task, choices = ['asr', 'cls', 'st', 'text', 'tts']"
-            )
-            return False
-
-        elif self.task == 'asr':
-            try:
-                from ..asr.infer import pretrained_models
-                logger.info(
-                    "Here is the list of ASR pretrained models released by PaddleSpeech that can be used by command line and python API"
-                )
-                self.show_support_models(pretrained_models)
-                return True
-            except BaseException:
-                logger.error("Failed to get the list of ASR pretrained models.")
-                return False
-
-        elif self.task == 'cls':
-            try:
-                from ..cls.infer import pretrained_models
-                logger.info(
-                    "Here is the list of CLS pretrained models released by PaddleSpeech that can be used by command line and python API"
-                )
-                self.show_support_models(pretrained_models)
-                return True
-            except BaseException:
-                logger.error("Failed to get the list of CLS pretrained models.")
-                return False
-
-        elif self.task == 'st':
-            try:
-                from ..st.infer import pretrained_models
-                logger.info(
-                    "Here is the list of ST pretrained models released by PaddleSpeech that can be used by command line and python API"
-                )
-                self.show_support_models(pretrained_models)
-                return True
-            except BaseException:
-                logger.error("Failed to get the list of ST pretrained models.")
-                return False
-
-        elif self.task == 'text':
-            try:
-                from ..text.infer import pretrained_models
-                logger.info(
-                    "Here is the list of TEXT pretrained models released by PaddleSpeech that can be used by command line and python API"
-                )
-                self.show_support_models(pretrained_models)
-                return True
-            except BaseException:
-                logger.error(
-                    "Failed to get the list of TEXT pretrained models.")
-                return False
-
-        elif self.task == 'tts':
-            try:
-                from ..tts.infer import pretrained_models
-                logger.info(
-                    "Here is the list of TTS pretrained models released by PaddleSpeech that can be used by command line and python API"
-                )
-                self.show_support_models(pretrained_models)
-                return True
-            except BaseException:
-                logger.error("Failed to get the list of TTS pretrained models.")
-                return False
+        has_exceptions = False
+        try:
+            self(parser_args.task)
+        except Exception as e:
+            has_exceptions = True
+        if has_exceptions:
+            return False
+        else:
+            return True

    @stats_wrapper
    def __call__(
@@ -138,13 +80,12 @@ class StatsExecutor():
        """
        self.task = task
        if self.task not in self.task_choices:
-            print(
-                "Please input correct speech task, choices = ['asr', 'cls', 'st', 'text', 'tts']"
-            )
+            print("Please input correct speech task, choices = " + str(
+                self.task_choices))

        elif self.task == 'asr':
            try:
-                from ..asr.infer import pretrained_models
+                from ..asr.pretrained_models import pretrained_models
                print(
                    "Here is the list of ASR pretrained models released by PaddleSpeech that can be used by command line and python API"
                )
@@ -154,7 +95,7 @@ class StatsExecutor():
        elif self.task == 'cls':
            try:
-                from ..cls.infer import pretrained_models
+                from ..cls.pretrained_models import pretrained_models
                print(
                    "Here is the list of CLS pretrained models released by PaddleSpeech that can be used by command line and python API"
                )
@@ -164,7 +105,7 @@ class StatsExecutor():
        elif self.task == 'st':
            try:
-                from ..st.infer import pretrained_models
+                from ..st.pretrained_models import pretrained_models
                print(
                    "Here is the list of ST pretrained models released by PaddleSpeech that can be used by command line and python API"
                )
@@ -174,7 +115,7 @@ class StatsExecutor():
        elif self.task == 'text':
            try:
-                from ..text.infer import pretrained_models
+                from ..text.pretrained_models import pretrained_models
                print(
                    "Here is the list of TEXT pretrained models released by PaddleSpeech that can be used by command line and python API"
                )
@@ -184,10 +125,22 @@ class StatsExecutor():
        elif self.task == 'tts':
            try:
-                from ..tts.infer import pretrained_models
+                from ..tts.pretrained_models import pretrained_models
                print(
                    "Here is the list of TTS pretrained models released by PaddleSpeech that can be used by command line and python API"
                )
                self.show_support_models(pretrained_models)
            except BaseException:
                print("Failed to get the list of TTS pretrained models.")
+
+        elif self.task == 'vector':
+            try:
+                from ..vector.pretrained_models import pretrained_models
+                print(
+                    "Here is the list of Speaker Recognition pretrained models released by PaddleSpeech that can be used by command line and python API"
+                )
+                self.show_support_models(pretrained_models)
+            except BaseException:
+                print(
+                    "Failed to get the list of Speaker Recognition pretrained models."
+                )
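`show_support_models` splits the `model_name_format` entry into column headers and lists one row per tag; its body is collapsed in this diff, so the following rendering sketch is an assumption about the details:

```python
from prettytable import PrettyTable

model_name_format = {'cls': 'Model-Sample Rate'}
pretrained_models = {"panns_cnn6-32k": {}, "panns_cnn14-32k": {}}

fields = model_name_format['cls'].split("-")
table = PrettyTable(fields)
for tag in pretrained_models:
    table.add_row(tag.split("-", len(fields) - 1))
print(table)
```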
@@ -25,58 +25,21 @@ from ...s2t.utils.dynamic_import import dynamic_import
from ..executor import BaseExecutor
from ..log import logger
from ..utils import cli_register
-from ..utils import download_and_decompress
-from ..utils import MODEL_HOME
from ..utils import stats_wrapper
+from .pretrained_models import model_alias
+from .pretrained_models import pretrained_models
+from .pretrained_models import tokenizer_alias

__all__ = ['TextExecutor']

-pretrained_models = {
-    # The tags for pretrained_models should be "{model_name}[_{dataset}][-{lang}][-...]".
-    # e.g. "conformer_wenetspeech-zh-16k", "transformer_aishell-zh-16k" and "panns_cnn6-32k".
-    # Command line and python api use "{model_name}[_{dataset}]" as --model, usage:
-    # "paddlespeech asr --model conformer_wenetspeech --lang zh --sr 16000 --input ./input.wav"
-    "ernie_linear_p7_wudao-punc-zh": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/text/ernie_linear_p7_wudao-punc-zh.tar.gz',
-        'md5':
-        '12283e2ddde1797c5d1e57036b512746',
-        'cfg_path':
-        'ckpt/model_config.json',
-        'ckpt_path':
-        'ckpt/model_state.pdparams',
-        'vocab_file':
-        'punc_vocab.txt',
-    },
-    "ernie_linear_p3_wudao-punc-zh": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/text/ernie_linear_p3_wudao-punc-zh.tar.gz',
-        'md5':
-        '448eb2fdf85b6a997e7e652e80c51dd2',
-        'cfg_path':
-        'ckpt/model_config.json',
-        'ckpt_path':
-        'ckpt/model_state.pdparams',
-        'vocab_file':
-        'punc_vocab.txt',
-    },
-}
-
-model_alias = {
-    "ernie_linear_p7": "paddlespeech.text.models:ErnieLinear",
-    "ernie_linear_p3": "paddlespeech.text.models:ErnieLinear",
-}
-
-tokenizer_alias = {
-    "ernie_linear_p7": "paddlenlp.transformers:ErnieTokenizer",
-    "ernie_linear_p3": "paddlenlp.transformers:ErnieTokenizer",
-}
-
@cli_register(name='paddlespeech.text', description='Text infer command.')
class TextExecutor(BaseExecutor):
    def __init__(self):
-        super(TextExecutor, self).__init__()
+        super().__init__()
+        self.model_alias = model_alias
+        self.pretrained_models = pretrained_models
+        self.tokenizer_alias = tokenizer_alias
        self.parser = argparse.ArgumentParser(
            prog='paddlespeech.text', add_help=True)
@@ -92,7 +55,9 @@ class TextExecutor(BaseExecutor):
            '--model',
            type=str,
            default='ernie_linear_p7_wudao',
-            choices=[tag[:tag.index('-')] for tag in pretrained_models.keys()],
+            choices=[
+                tag[:tag.index('-')] for tag in self.pretrained_models.keys()
+            ],
            help='Choose model type of text task.')
        self.parser.add_argument(
            '--lang',
@@ -131,23 +96,6 @@ class TextExecutor(BaseExecutor):
            action='store_true',
            help='Increase logger verbosity of current task.')

-    def _get_pretrained_path(self, tag: str) -> os.PathLike:
-        """
-        Download and returns pretrained resources path of current task.
-        """
-        support_models = list(pretrained_models.keys())
-        assert tag in pretrained_models, 'The model "{}" you want to use has not been supported, please choose other models.\nThe support models includes:\n\t\t{}\n'.format(
-            tag, '\n\t\t'.join(support_models))
-        res_path = os.path.join(MODEL_HOME, tag)
-        decompressed_path = download_and_decompress(pretrained_models[tag],
-                                                    res_path)
-        decompressed_path = os.path.abspath(decompressed_path)
-        logger.info(
-            'Use pretrained model stored in: {}'.format(decompressed_path))
-        return decompressed_path
-
    def _init_from_path(self,
                        task: str='punc',
                        model_type: str='ernie_linear_p7_wudao',
@@ -167,12 +115,12 @@ class TextExecutor(BaseExecutor):
        if cfg_path is None or ckpt_path is None or vocab_file is None:
            tag = '-'.join([model_type, task, lang])
            self.res_path = self._get_pretrained_path(tag)
-            self.cfg_path = os.path.join(self.res_path,
-                                         pretrained_models[tag]['cfg_path'])
-            self.ckpt_path = os.path.join(self.res_path,
-                                          pretrained_models[tag]['ckpt_path'])
-            self.vocab_file = os.path.join(self.res_path,
-                                           pretrained_models[tag]['vocab_file'])
+            self.cfg_path = os.path.join(
+                self.res_path, self.pretrained_models[tag]['cfg_path'])
+            self.ckpt_path = os.path.join(
+                self.res_path, self.pretrained_models[tag]['ckpt_path'])
+            self.vocab_file = os.path.join(
+                self.res_path, self.pretrained_models[tag]['vocab_file'])
        else:
            self.cfg_path = os.path.abspath(cfg_path)
            self.ckpt_path = os.path.abspath(ckpt_path)
@@ -187,8 +135,8 @@ class TextExecutor(BaseExecutor):
            self._punc_list.append(line.strip())

        # model
-        model_class = dynamic_import(model_name, model_alias)
-        tokenizer_class = dynamic_import(model_name, tokenizer_alias)
+        model_class = dynamic_import(model_name, self.model_alias)
+        tokenizer_class = dynamic_import(model_name, self.tokenizer_alias)
        self.model = model_class(
            cfg_path=self.cfg_path, ckpt_path=self.ckpt_path)
        self.tokenizer = tokenizer_class.from_pretrained('ernie-1.0')
......
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
pretrained_models = {
    # The tags for pretrained_models should be "{model_name}[_{dataset}][-{lang}][-...]".
    # e.g. "conformer_wenetspeech-zh-16k", "transformer_aishell-zh-16k" and "panns_cnn6-32k".
    # Command line and python api use "{model_name}[_{dataset}]" as --model, usage:
    # "paddlespeech asr --model conformer_wenetspeech --lang zh --sr 16000 --input ./input.wav"
    "ernie_linear_p7_wudao-punc-zh": {
        'url':
        'https://paddlespeech.bj.bcebos.com/text/ernie_linear_p7_wudao-punc-zh.tar.gz',
        'md5':
        '12283e2ddde1797c5d1e57036b512746',
        'cfg_path':
        'ckpt/model_config.json',
        'ckpt_path':
        'ckpt/model_state.pdparams',
        'vocab_file':
        'punc_vocab.txt',
    },
    "ernie_linear_p3_wudao-punc-zh": {
        'url':
        'https://paddlespeech.bj.bcebos.com/text/ernie_linear_p3_wudao-punc-zh.tar.gz',
        'md5':
        '448eb2fdf85b6a997e7e652e80c51dd2',
        'cfg_path':
        'ckpt/model_config.json',
        'ckpt_path':
        'ckpt/model_state.pdparams',
        'vocab_file':
        'punc_vocab.txt',
    },
}

model_alias = {
    "ernie_linear_p7": "paddlespeech.text.models:ErnieLinear",
    "ernie_linear_p3": "paddlespeech.text.models:ErnieLinear",
}

tokenizer_alias = {
    "ernie_linear_p7": "paddlenlp.transformers:ErnieTokenizer",
    "ernie_linear_p3": "paddlenlp.transformers:ErnieTokenizer",
}
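For the text task the tag carries an extra task segment, assembled in `_init_from_path` as `'-'.join([model_type, task, lang])`; for example:

```python
model_type, task, lang = "ernie_linear_p7_wudao", "punc", "zh"
tag = "-".join([model_type, task, lang])  # "ernie_linear_p7_wudao-punc-zh"
info = pretrained_models[tag]
print(info["url"], info["vocab_file"])
```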
@@ -29,9 +29,9 @@ from yacs.config import CfgNode
from ..executor import BaseExecutor
from ..log import logger
from ..utils import cli_register
-from ..utils import download_and_decompress
-from ..utils import MODEL_HOME
from ..utils import stats_wrapper
+from .pretrained_models import model_alias
+from .pretrained_models import pretrained_models
from paddlespeech.s2t.utils.dynamic_import import dynamic_import
from paddlespeech.t2s.frontend import English
from paddlespeech.t2s.frontend.zh_frontend import Frontend
@@ -39,299 +39,14 @@ from paddlespeech.t2s.modules.normalizer import ZScore
__all__ = ['TTSExecutor']

-pretrained_models = {
-    # speedyspeech
-    "speedyspeech_csmsc-zh": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_csmsc_ckpt_0.2.0.zip',
-        'md5':
-        '6f6fa967b408454b6662c8c00c0027cb',
-        'config':
-        'default.yaml',
-        'ckpt':
-        'snapshot_iter_30600.pdz',
-        'speech_stats':
-        'feats_stats.npy',
-        'phones_dict':
-        'phone_id_map.txt',
-        'tones_dict':
-        'tone_id_map.txt',
-    },
-    # fastspeech2
-    "fastspeech2_csmsc-zh": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip',
-        'md5':
-        '637d28a5e53aa60275612ba4393d5f22',
-        'config':
-        'default.yaml',
-        'ckpt':
-        'snapshot_iter_76000.pdz',
-        'speech_stats':
-        'speech_stats.npy',
-        'phones_dict':
-        'phone_id_map.txt',
-    },
-    "fastspeech2_ljspeech-en": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_ljspeech_ckpt_0.5.zip',
-        'md5':
-        'ffed800c93deaf16ca9b3af89bfcd747',
-        'config':
-        'default.yaml',
-        'ckpt':
-        'snapshot_iter_100000.pdz',
-        'speech_stats':
-        'speech_stats.npy',
-        'phones_dict':
-        'phone_id_map.txt',
-    },
-    "fastspeech2_aishell3-zh": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_aishell3_ckpt_0.4.zip',
-        'md5':
-        'f4dd4a5f49a4552b77981f544ab3392e',
-        'config':
-        'default.yaml',
-        'ckpt':
-        'snapshot_iter_96400.pdz',
-        'speech_stats':
-        'speech_stats.npy',
-        'phones_dict':
-        'phone_id_map.txt',
-        'speaker_dict':
-        'speaker_id_map.txt',
-    },
-    "fastspeech2_vctk-en": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_vctk_ckpt_0.5.zip',
-        'md5':
-        '743e5024ca1e17a88c5c271db9779ba4',
-        'config':
-        'default.yaml',
-        'ckpt':
-        'snapshot_iter_66200.pdz',
-        'speech_stats':
-        'speech_stats.npy',
-        'phones_dict':
-        'phone_id_map.txt',
-        'speaker_dict':
-        'speaker_id_map.txt',
-    },
-    # tacotron2
-    "tacotron2_csmsc-zh": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_csmsc_ckpt_0.2.0.zip',
-        'md5':
-        '0df4b6f0bcbe0d73c5ed6df8867ab91a',
-        'config':
-        'default.yaml',
-        'ckpt':
-        'snapshot_iter_30600.pdz',
-        'speech_stats':
-        'speech_stats.npy',
-        'phones_dict':
-        'phone_id_map.txt',
-    },
-    "tacotron2_ljspeech-en": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_ljspeech_ckpt_0.2.0.zip',
-        'md5':
-        '6a5eddd81ae0e81d16959b97481135f3',
-        'config':
-        'default.yaml',
-        'ckpt':
-        'snapshot_iter_60300.pdz',
-        'speech_stats':
-        'speech_stats.npy',
-        'phones_dict':
-        'phone_id_map.txt',
-    },
-    # pwgan
-    "pwgan_csmsc-zh": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip',
-        'md5':
-        '2e481633325b5bdf0a3823c714d2c117',
-        'config':
-        'pwg_default.yaml',
-        'ckpt':
-        'pwg_snapshot_iter_400000.pdz',
-        'speech_stats':
-        'pwg_stats.npy',
-    },
-    "pwgan_ljspeech-en": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_ljspeech_ckpt_0.5.zip',
-        'md5':
-        '53610ba9708fd3008ccaf8e99dacbaf0',
-        'config':
-        'pwg_default.yaml',
-        'ckpt':
-        'pwg_snapshot_iter_400000.pdz',
-        'speech_stats':
-        'pwg_stats.npy',
-    },
-    "pwgan_aishell3-zh": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_aishell3_ckpt_0.5.zip',
-        'md5':
-        'd7598fa41ad362d62f85ffc0f07e3d84',
-        'config':
-        'default.yaml',
-        'ckpt':
-        'snapshot_iter_1000000.pdz',
-        'speech_stats':
-        'feats_stats.npy',
-    },
-    "pwgan_vctk-en": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_vctk_ckpt_0.1.1.zip',
-        'md5':
-        'b3da1defcde3e578be71eb284cb89f2c',
-        'config':
-        'default.yaml',
-        'ckpt':
-        'snapshot_iter_1500000.pdz',
-        'speech_stats':
-        'feats_stats.npy',
-    },
-    # mb_melgan
-    "mb_melgan_csmsc-zh": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_ckpt_0.1.1.zip',
-        'md5':
-        'ee5f0604e20091f0d495b6ec4618b90d',
-        'config':
-        'default.yaml',
-        'ckpt':
-        'snapshot_iter_1000000.pdz',
-        'speech_stats':
'feats_stats.npy',
},
# style_melgan
"style_melgan_csmsc-zh": {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/style_melgan/style_melgan_csmsc_ckpt_0.1.1.zip',
'md5':
'5de2d5348f396de0c966926b8c462755',
'config':
'default.yaml',
'ckpt':
'snapshot_iter_1500000.pdz',
'speech_stats':
'feats_stats.npy',
},
# hifigan
"hifigan_csmsc-zh": {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_ckpt_0.1.1.zip',
'md5':
'dd40a3d88dfcf64513fba2f0f961ada6',
'config':
'default.yaml',
'ckpt':
'snapshot_iter_2500000.pdz',
'speech_stats':
'feats_stats.npy',
},
"hifigan_ljspeech-en": {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_ljspeech_ckpt_0.2.0.zip',
'md5':
'70e9131695decbca06a65fe51ed38a72',
'config':
'default.yaml',
'ckpt':
'snapshot_iter_2500000.pdz',
'speech_stats':
'feats_stats.npy',
},
"hifigan_aishell3-zh": {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_aishell3_ckpt_0.2.0.zip',
'md5':
'3bb49bc75032ed12f79c00c8cc79a09a',
'config':
'default.yaml',
'ckpt':
'snapshot_iter_2500000.pdz',
'speech_stats':
'feats_stats.npy',
},
"hifigan_vctk-en": {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_vctk_ckpt_0.2.0.zip',
'md5':
'7da8f88359bca2457e705d924cf27bd4',
'config':
'default.yaml',
'ckpt':
'snapshot_iter_2500000.pdz',
'speech_stats':
'feats_stats.npy',
},
# wavernn
"wavernn_csmsc-zh": {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/wavernn/wavernn_csmsc_ckpt_0.2.0.zip',
'md5':
'ee37b752f09bcba8f2af3b777ca38e13',
'config':
'default.yaml',
'ckpt':
'snapshot_iter_400000.pdz',
'speech_stats':
'feats_stats.npy',
}
}
model_alias = {
# acoustic model
"speedyspeech":
"paddlespeech.t2s.models.speedyspeech:SpeedySpeech",
"speedyspeech_inference":
"paddlespeech.t2s.models.speedyspeech:SpeedySpeechInference",
"fastspeech2":
"paddlespeech.t2s.models.fastspeech2:FastSpeech2",
"fastspeech2_inference":
"paddlespeech.t2s.models.fastspeech2:FastSpeech2Inference",
"tacotron2":
"paddlespeech.t2s.models.tacotron2:Tacotron2",
"tacotron2_inference":
"paddlespeech.t2s.models.tacotron2:Tacotron2Inference",
# voc
"pwgan":
"paddlespeech.t2s.models.parallel_wavegan:PWGGenerator",
"pwgan_inference":
"paddlespeech.t2s.models.parallel_wavegan:PWGInference",
"mb_melgan":
"paddlespeech.t2s.models.melgan:MelGANGenerator",
"mb_melgan_inference":
"paddlespeech.t2s.models.melgan:MelGANInference",
"style_melgan":
"paddlespeech.t2s.models.melgan:StyleMelGANGenerator",
"style_melgan_inference":
"paddlespeech.t2s.models.melgan:StyleMelGANInference",
"hifigan":
"paddlespeech.t2s.models.hifigan:HiFiGANGenerator",
"hifigan_inference":
"paddlespeech.t2s.models.hifigan:HiFiGANInference",
"wavernn":
"paddlespeech.t2s.models.wavernn:WaveRNN",
"wavernn_inference":
"paddlespeech.t2s.models.wavernn:WaveRNNInference",
}
@cli_register(
    name='paddlespeech.tts', description='Text to Speech infer command.')
class TTSExecutor(BaseExecutor):
    def __init__(self):
        super().__init__()
        self.model_alias = model_alias
        self.pretrained_models = pretrained_models
        self.parser = argparse.ArgumentParser(
            prog='paddlespeech.tts', add_help=True)
...@@ -449,22 +164,6 @@ class TTSExecutor(BaseExecutor):
        action='store_true',
        help='Increase logger verbosity of current task.')
def _get_pretrained_path(self, tag: str) -> os.PathLike:
"""
        Download and return the pretrained resources path for the current task.
"""
support_models = list(pretrained_models.keys())
assert tag in pretrained_models, 'The model "{}" you want to use has not been supported, please choose other models.\nThe support models includes:\n\t\t{}\n'.format(
tag, '\n\t\t'.join(support_models))
res_path = os.path.join(MODEL_HOME, tag)
decompressed_path = download_and_decompress(pretrained_models[tag],
res_path)
decompressed_path = os.path.abspath(decompressed_path)
logger.info(
'Use pretrained model stored in: {}'.format(decompressed_path))
return decompressed_path
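
Both executors drop their private copy of _get_pretrained_path in this commit while registering self.pretrained_models, which suggests the download logic now lives once in the shared BaseExecutor. A sketch of what such a shared helper plausibly looks like, assuming the same MODEL_HOME and download_and_decompress utilities (an assumption, not shown in this diff):

    def _get_pretrained_path(self, tag: str) -> os.PathLike:
        # hypothetical shared version keyed off self.pretrained_models
        assert tag in self.pretrained_models, \
            'The model "{}" is not supported.'.format(tag)
        res_path = os.path.join(MODEL_HOME, tag)
        decompressed_path = download_and_decompress(
            self.pretrained_models[tag], res_path)
        return os.path.abspath(decompressed_path)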
    def _init_from_path(
            self,
            am: str='fastspeech2_csmsc',
...@@ -490,16 +189,15 @@ class TTSExecutor(BaseExecutor):
        if am_ckpt is None or am_config is None or am_stat is None or phones_dict is None:
            am_res_path = self._get_pretrained_path(am_tag)
            self.am_res_path = am_res_path
            self.am_config = os.path.join(
                am_res_path, self.pretrained_models[am_tag]['config'])
            self.am_ckpt = os.path.join(
                am_res_path, self.pretrained_models[am_tag]['ckpt'])
            self.am_stat = os.path.join(
                am_res_path, self.pretrained_models[am_tag]['speech_stats'])
            # must have phones_dict in acoustic
            self.phones_dict = os.path.join(
                am_res_path, self.pretrained_models[am_tag]['phones_dict'])
            print("self.phones_dict:", self.phones_dict)
            logger.info(am_res_path)
            logger.info(self.am_config)
            logger.info(self.am_ckpt)
...@@ -509,21 +207,20 @@ class TTSExecutor(BaseExecutor):
            self.am_stat = os.path.abspath(am_stat)
            self.phones_dict = os.path.abspath(phones_dict)
            self.am_res_path = os.path.dirname(os.path.abspath(self.am_config))
            print("self.phones_dict:", self.phones_dict)

        # for speedyspeech
        self.tones_dict = None
        if 'tones_dict' in self.pretrained_models[am_tag]:
            self.tones_dict = os.path.join(
                am_res_path, self.pretrained_models[am_tag]['tones_dict'])
            if tones_dict:
                self.tones_dict = tones_dict

        # for multi speaker fastspeech2
        self.speaker_dict = None
        if 'speaker_dict' in self.pretrained_models[am_tag]:
            self.speaker_dict = os.path.join(
                am_res_path, self.pretrained_models[am_tag]['speaker_dict'])
            if speaker_dict:
                self.speaker_dict = speaker_dict
...@@ -532,12 +229,12 @@ class TTSExecutor(BaseExecutor):
        if voc_ckpt is None or voc_config is None or voc_stat is None:
            voc_res_path = self._get_pretrained_path(voc_tag)
            self.voc_res_path = voc_res_path
            self.voc_config = os.path.join(
                voc_res_path, self.pretrained_models[voc_tag]['config'])
            self.voc_ckpt = os.path.join(
                voc_res_path, self.pretrained_models[voc_tag]['ckpt'])
            self.voc_stat = os.path.join(
                voc_res_path, self.pretrained_models[voc_tag]['speech_stats'])
            logger.info(voc_res_path)
            logger.info(self.voc_config)
            logger.info(self.voc_ckpt)
...@@ -588,8 +285,9 @@ class TTSExecutor(BaseExecutor):
        # model: {model_name}_{dataset}
        am_name = am[:am.rindex('_')]
        am_class = dynamic_import(am_name, self.model_alias)
        am_inference_class = dynamic_import(am_name + '_inference',
                                            self.model_alias)
        if am_name == 'fastspeech2':
            am = am_class(
...@@ -618,9 +316,9 @@ class TTSExecutor(BaseExecutor):
        # vocoder
        # model: {model_name}_{dataset}
        voc_name = voc[:voc.rindex('_')]
        voc_class = dynamic_import(voc_name, self.model_alias)
        voc_inference_class = dynamic_import(voc_name + '_inference',
                                             self.model_alias)
        if voc_name != 'wavernn':
            voc = voc_class(**self.voc_config["generator_params"])
            voc.set_state_dict(paddle.load(self.voc_ckpt)["generator_params"])
...@@ -735,7 +433,6 @@ class TTSExecutor(BaseExecutor):
        am_ckpt = args.am_ckpt
        am_stat = args.am_stat
        phones_dict = args.phones_dict
        print("phones_dict:", phones_dict)
        tones_dict = args.tones_dict
        speaker_dict = args.speaker_dict
        voc = args.voc
...
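
For reference, the tags looked up above follow the "{model_name}_{dataset}-{lang}" convention from pretrained_models, so a lookup key can be formed like this (illustrative helper only; the executor builds the tag inline from --am/--voc and --lang):

def build_tag(model: str, lang: str) -> str:
    # "fastspeech2_csmsc" + "zh" -> "fastspeech2_csmsc-zh"
    return '{}-{}'.format(model, lang)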
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
pretrained_models = {
# speedyspeech
"speedyspeech_csmsc-zh": {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_csmsc_ckpt_0.2.0.zip',
'md5':
'6f6fa967b408454b6662c8c00c0027cb',
'config':
'default.yaml',
'ckpt':
'snapshot_iter_30600.pdz',
'speech_stats':
'feats_stats.npy',
'phones_dict':
'phone_id_map.txt',
'tones_dict':
'tone_id_map.txt',
},
# fastspeech2
"fastspeech2_csmsc-zh": {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip',
'md5':
'637d28a5e53aa60275612ba4393d5f22',
'config':
'default.yaml',
'ckpt':
'snapshot_iter_76000.pdz',
'speech_stats':
'speech_stats.npy',
'phones_dict':
'phone_id_map.txt',
},
"fastspeech2_ljspeech-en": {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_ljspeech_ckpt_0.5.zip',
'md5':
'ffed800c93deaf16ca9b3af89bfcd747',
'config':
'default.yaml',
'ckpt':
'snapshot_iter_100000.pdz',
'speech_stats':
'speech_stats.npy',
'phones_dict':
'phone_id_map.txt',
},
"fastspeech2_aishell3-zh": {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_aishell3_ckpt_0.4.zip',
'md5':
'f4dd4a5f49a4552b77981f544ab3392e',
'config':
'default.yaml',
'ckpt':
'snapshot_iter_96400.pdz',
'speech_stats':
'speech_stats.npy',
'phones_dict':
'phone_id_map.txt',
'speaker_dict':
'speaker_id_map.txt',
},
"fastspeech2_vctk-en": {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_vctk_ckpt_0.5.zip',
'md5':
'743e5024ca1e17a88c5c271db9779ba4',
'config':
'default.yaml',
'ckpt':
'snapshot_iter_66200.pdz',
'speech_stats':
'speech_stats.npy',
'phones_dict':
'phone_id_map.txt',
'speaker_dict':
'speaker_id_map.txt',
},
# tacotron2
"tacotron2_csmsc-zh": {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_csmsc_ckpt_0.2.0.zip',
'md5':
'0df4b6f0bcbe0d73c5ed6df8867ab91a',
'config':
'default.yaml',
'ckpt':
'snapshot_iter_30600.pdz',
'speech_stats':
'speech_stats.npy',
'phones_dict':
'phone_id_map.txt',
},
"tacotron2_ljspeech-en": {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_ljspeech_ckpt_0.2.0.zip',
'md5':
'6a5eddd81ae0e81d16959b97481135f3',
'config':
'default.yaml',
'ckpt':
'snapshot_iter_60300.pdz',
'speech_stats':
'speech_stats.npy',
'phones_dict':
'phone_id_map.txt',
},
# pwgan
"pwgan_csmsc-zh": {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip',
'md5':
'2e481633325b5bdf0a3823c714d2c117',
'config':
'pwg_default.yaml',
'ckpt':
'pwg_snapshot_iter_400000.pdz',
'speech_stats':
'pwg_stats.npy',
},
"pwgan_ljspeech-en": {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_ljspeech_ckpt_0.5.zip',
'md5':
'53610ba9708fd3008ccaf8e99dacbaf0',
'config':
'pwg_default.yaml',
'ckpt':
'pwg_snapshot_iter_400000.pdz',
'speech_stats':
'pwg_stats.npy',
},
"pwgan_aishell3-zh": {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_aishell3_ckpt_0.5.zip',
'md5':
'd7598fa41ad362d62f85ffc0f07e3d84',
'config':
'default.yaml',
'ckpt':
'snapshot_iter_1000000.pdz',
'speech_stats':
'feats_stats.npy',
},
"pwgan_vctk-en": {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_vctk_ckpt_0.1.1.zip',
'md5':
'b3da1defcde3e578be71eb284cb89f2c',
'config':
'default.yaml',
'ckpt':
'snapshot_iter_1500000.pdz',
'speech_stats':
'feats_stats.npy',
},
# mb_melgan
"mb_melgan_csmsc-zh": {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_ckpt_0.1.1.zip',
'md5':
'ee5f0604e20091f0d495b6ec4618b90d',
'config':
'default.yaml',
'ckpt':
'snapshot_iter_1000000.pdz',
'speech_stats':
'feats_stats.npy',
},
# style_melgan
"style_melgan_csmsc-zh": {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/style_melgan/style_melgan_csmsc_ckpt_0.1.1.zip',
'md5':
'5de2d5348f396de0c966926b8c462755',
'config':
'default.yaml',
'ckpt':
'snapshot_iter_1500000.pdz',
'speech_stats':
'feats_stats.npy',
},
# hifigan
"hifigan_csmsc-zh": {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_ckpt_0.1.1.zip',
'md5':
'dd40a3d88dfcf64513fba2f0f961ada6',
'config':
'default.yaml',
'ckpt':
'snapshot_iter_2500000.pdz',
'speech_stats':
'feats_stats.npy',
},
"hifigan_ljspeech-en": {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_ljspeech_ckpt_0.2.0.zip',
'md5':
'70e9131695decbca06a65fe51ed38a72',
'config':
'default.yaml',
'ckpt':
'snapshot_iter_2500000.pdz',
'speech_stats':
'feats_stats.npy',
},
"hifigan_aishell3-zh": {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_aishell3_ckpt_0.2.0.zip',
'md5':
'3bb49bc75032ed12f79c00c8cc79a09a',
'config':
'default.yaml',
'ckpt':
'snapshot_iter_2500000.pdz',
'speech_stats':
'feats_stats.npy',
},
"hifigan_vctk-en": {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_vctk_ckpt_0.2.0.zip',
'md5':
'7da8f88359bca2457e705d924cf27bd4',
'config':
'default.yaml',
'ckpt':
'snapshot_iter_2500000.pdz',
'speech_stats':
'feats_stats.npy',
},
# wavernn
"wavernn_csmsc-zh": {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/wavernn/wavernn_csmsc_ckpt_0.2.0.zip',
'md5':
'ee37b752f09bcba8f2af3b777ca38e13',
'config':
'default.yaml',
'ckpt':
'snapshot_iter_400000.pdz',
'speech_stats':
'feats_stats.npy',
}
}
model_alias = {
# acoustic model
"speedyspeech":
"paddlespeech.t2s.models.speedyspeech:SpeedySpeech",
"speedyspeech_inference":
"paddlespeech.t2s.models.speedyspeech:SpeedySpeechInference",
"fastspeech2":
"paddlespeech.t2s.models.fastspeech2:FastSpeech2",
"fastspeech2_inference":
"paddlespeech.t2s.models.fastspeech2:FastSpeech2Inference",
"tacotron2":
"paddlespeech.t2s.models.tacotron2:Tacotron2",
"tacotron2_inference":
"paddlespeech.t2s.models.tacotron2:Tacotron2Inference",
# voc
"pwgan":
"paddlespeech.t2s.models.parallel_wavegan:PWGGenerator",
"pwgan_inference":
"paddlespeech.t2s.models.parallel_wavegan:PWGInference",
"mb_melgan":
"paddlespeech.t2s.models.melgan:MelGANGenerator",
"mb_melgan_inference":
"paddlespeech.t2s.models.melgan:MelGANInference",
"style_melgan":
"paddlespeech.t2s.models.melgan:StyleMelGANGenerator",
"style_melgan_inference":
"paddlespeech.t2s.models.melgan:StyleMelGANInference",
"hifigan":
"paddlespeech.t2s.models.hifigan:HiFiGANGenerator",
"hifigan_inference":
"paddlespeech.t2s.models.hifigan:HiFiGANInference",
"wavernn":
"paddlespeech.t2s.models.wavernn:WaveRNN",
"wavernn_inference":
"paddlespeech.t2s.models.wavernn:WaveRNNInference",
}
...@@ -27,45 +27,24 @@ from yacs.config import CfgNode
from ..executor import BaseExecutor
from ..log import logger
from ..utils import cli_register
from ..utils import download_and_decompress
from ..utils import MODEL_HOME
from ..utils import stats_wrapper
from .pretrained_models import model_alias
from .pretrained_models import pretrained_models
from paddleaudio.backends import load as load_audio
from paddleaudio.compliance.librosa import melspectrogram
from paddlespeech.s2t.utils.dynamic_import import dynamic_import
from paddlespeech.vector.io.batch import feature_normalize
from paddlespeech.vector.modules.sid_model import SpeakerIdetification
pretrained_models = {
# The tags for pretrained_models should be "{model_name}[-{dataset}][-{sr}][-...]".
# e.g. "ecapatdnn_voxceleb12-16k".
# Command line and python api use "{model_name}[-{dataset}]" as --model, usage:
# "paddlespeech vector --task spk --model ecapatdnn_voxceleb12-16k --sr 16000 --input ./input.wav"
"ecapatdnn_voxceleb12-16k": {
'url':
'https://paddlespeech.bj.bcebos.com/vector/voxceleb/sv0_ecapa_tdnn_voxceleb12_ckpt_0_2_0.tar.gz',
'md5':
'cc33023c54ab346cd318408f43fcaf95',
'cfg_path':
'conf/model.yaml', # the yaml config path
'ckpt_path':
'model/model', # the format is ${dir}/{model_name},
# so the first 'model' is dir, the second 'model' is the name
# this means we have a model stored as model/model.pdparams
},
}
model_alias = {
"ecapatdnn": "paddlespeech.vector.models.ecapa_tdnn:EcapaTdnn",
}
@cli_register(
    name="paddlespeech.vector",
    description="Speech to vector embedding infer command.")
class VectorExecutor(BaseExecutor):
    def __init__(self):
        super().__init__()
        self.model_alias = model_alias
        self.pretrained_models = pretrained_models
        self.parser = argparse.ArgumentParser(
            prog="paddlespeech.vector", add_help=True)
...@@ -128,8 +107,8 @@ class VectorExecutor(BaseExecutor):
        Returns:
            bool:
                False: some audio failed to process
                True: all audio processed successfully
        """
        # stage 0: parse the args and get the required args
        parser_args = self.parser.parse_args(argv)
...@@ -289,32 +268,6 @@ class VectorExecutor(BaseExecutor):
        return res
def _get_pretrained_path(self, tag: str) -> os.PathLike:
"""get the neural network path from the pretrained model list
        we store all the pretrained models in the variable `pretrained_models`
Args:
tag (str): model tag in the pretrained model list
Returns:
os.PathLike: the downloaded pretrained model path in the disk
"""
support_models = list(pretrained_models.keys())
assert tag in pretrained_models, \
'The model "{}" you want to use has not been supported,'\
'please choose other models.\n' \
'The support models includes\n\t\t{}'.format(tag, "\n\t\t".join(support_models))
res_path = os.path.join(MODEL_HOME, tag)
decompressed_path = download_and_decompress(pretrained_models[tag],
res_path)
decompressed_path = os.path.abspath(decompressed_path)
logger.info(
'Use pretrained model stored in: {}'.format(decompressed_path))
return decompressed_path
    def _init_from_path(self,
                        model_type: str='ecapatdnn_voxceleb12',
                        sample_rate: int=16000,
...@@ -350,10 +303,11 @@ class VectorExecutor(BaseExecutor):
            res_path = self._get_pretrained_path(tag)
            self.res_path = res_path
            self.cfg_path = os.path.join(
                res_path, self.pretrained_models[tag]['cfg_path'])
            self.ckpt_path = os.path.join(
                res_path,
                self.pretrained_models[tag]['ckpt_path'] + '.pdparams')
        else:
            # get the model from disk
            self.cfg_path = os.path.abspath(cfg_path)
...@@ -373,7 +327,7 @@ class VectorExecutor(BaseExecutor):
        logger.info("start to dynamic import the model class")
        model_name = model_type[:model_type.rindex('_')]
        logger.info(f"model name {model_name}")
        model_class = dynamic_import(model_name, self.model_alias)
        model_conf = self.config.model
        backbone = model_class(**model_conf)
        model = SpeakerIdetification(
...
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
pretrained_models = {
# The tags for pretrained_models should be "{model_name}[-{dataset}][-{sr}][-...]".
# e.g. "ecapatdnn_voxceleb12-16k".
# Command line and python api use "{model_name}[-{dataset}]" as --model, usage:
# "paddlespeech vector --task spk --model ecapatdnn_voxceleb12-16k --sr 16000 --input ./input.wav"
"ecapatdnn_voxceleb12-16k": {
'url':
'https://paddlespeech.bj.bcebos.com/vector/voxceleb/sv0_ecapa_tdnn_voxceleb12_ckpt_0_2_0.tar.gz',
'md5':
'cc33023c54ab346cd318408f43fcaf95',
'cfg_path':
'conf/model.yaml', # the yaml config path
'ckpt_path':
'model/model', # the format is ${dir}/{model_name},
# so the first 'model' is dir, the second 'model' is the name
# this means we have a model stored as model/model.pdparams
},
}
model_alias = {
"ecapatdnn": "paddlespeech.vector.models.ecapa_tdnn:EcapaTdnn",
}
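
As a quick illustration of how these two tables interact (not part of the module): the CLI tag selects an entry in pretrained_models, and the prefix before the last underscore selects the alias:

tag = "ecapatdnn_voxceleb12-16k"
model_type = tag.split('-')[0]                      # "ecapatdnn_voxceleb12"
model_name = model_type[:model_type.rindex('_')]   # "ecapatdnn"
print(model_alias[model_name])
# -> paddlespeech.vector.models.ecapa_tdnn:EcapaTdnn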
...@@ -14,92 +14,17 @@
import argparse
from pathlib import Path

import numpy
import soundfile as sf
from paddle import inference
from timer import timer

from paddlespeech.t2s.exps.syn_utils import get_am_output
from paddlespeech.t2s.exps.syn_utils import get_frontend
from paddlespeech.t2s.exps.syn_utils import get_predictor
from paddlespeech.t2s.exps.syn_utils import get_sentences
from paddlespeech.t2s.exps.syn_utils import get_voc_output
from paddlespeech.t2s.utils import str2bool
def get_predictor(args, filed='am'):
full_name = ''
if filed == 'am':
full_name = args.am
elif filed == 'voc':
full_name = args.voc
model_name = full_name[:full_name.rindex('_')]
config = inference.Config(
str(Path(args.inference_dir) / (full_name + ".pdmodel")),
str(Path(args.inference_dir) / (full_name + ".pdiparams")))
if args.device == "gpu":
config.enable_use_gpu(100, 0)
elif args.device == "cpu":
config.disable_gpu()
config.enable_memory_optim()
predictor = inference.create_predictor(config)
return predictor
def get_am_output(args, am_predictor, frontend, merge_sentences, input):
am_name = args.am[:args.am.rindex('_')]
am_dataset = args.am[args.am.rindex('_') + 1:]
am_input_names = am_predictor.get_input_names()
get_tone_ids = False
get_spk_id = False
if am_name == 'speedyspeech':
get_tone_ids = True
if am_dataset in {"aishell3", "vctk"} and args.speaker_dict:
get_spk_id = True
spk_id = numpy.array([args.spk_id])
if args.lang == 'zh':
input_ids = frontend.get_input_ids(
input, merge_sentences=merge_sentences, get_tone_ids=get_tone_ids)
phone_ids = input_ids["phone_ids"]
elif args.lang == 'en':
input_ids = frontend.get_input_ids(
input, merge_sentences=merge_sentences)
phone_ids = input_ids["phone_ids"]
else:
print("lang should in {'zh', 'en'}!")
if get_tone_ids:
tone_ids = input_ids["tone_ids"]
tones = tone_ids[0].numpy()
tones_handle = am_predictor.get_input_handle(am_input_names[1])
tones_handle.reshape(tones.shape)
tones_handle.copy_from_cpu(tones)
if get_spk_id:
spk_id_handle = am_predictor.get_input_handle(am_input_names[1])
spk_id_handle.reshape(spk_id.shape)
spk_id_handle.copy_from_cpu(spk_id)
phones = phone_ids[0].numpy()
phones_handle = am_predictor.get_input_handle(am_input_names[0])
phones_handle.reshape(phones.shape)
phones_handle.copy_from_cpu(phones)
am_predictor.run()
am_output_names = am_predictor.get_output_names()
am_output_handle = am_predictor.get_output_handle(am_output_names[0])
am_output_data = am_output_handle.copy_to_cpu()
return am_output_data
def get_voc_output(args, voc_predictor, input):
voc_input_names = voc_predictor.get_input_names()
mel_handle = voc_predictor.get_input_handle(voc_input_names[0])
mel_handle.reshape(input.shape)
mel_handle.copy_from_cpu(input)
voc_predictor.run()
voc_output_names = voc_predictor.get_output_names()
voc_output_handle = voc_predictor.get_output_handle(voc_output_names[0])
wav = voc_output_handle.copy_to_cpu()
return wav
def parse_args():
    parser = argparse.ArgumentParser(
        description="Paddle Inference with acoustic model & vocoder.")
...@@ -204,7 +129,7 @@ def main():
            merge_sentences=merge_sentences,
            input=sentence)
        wav = get_voc_output(
            voc_predictor=voc_predictor, input=am_output_data)
        speed = wav.size / t.elapse
        rtf = fs / speed
        print(
...@@ -224,7 +149,7 @@ def main():
            merge_sentences=merge_sentences,
            input=sentence)
        wav = get_voc_output(
            voc_predictor=voc_predictor, input=am_output_data)
        N += wav.size
        T += t.elapse
...
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
from pathlib import Path
import numpy as np
import soundfile as sf
from timer import timer
from paddlespeech.t2s.exps.syn_utils import denorm
from paddlespeech.t2s.exps.syn_utils import get_am_sublayer_output
from paddlespeech.t2s.exps.syn_utils import get_chunks
from paddlespeech.t2s.exps.syn_utils import get_frontend
from paddlespeech.t2s.exps.syn_utils import get_predictor
from paddlespeech.t2s.exps.syn_utils import get_sentences
from paddlespeech.t2s.exps.syn_utils import get_streaming_am_output
from paddlespeech.t2s.exps.syn_utils import get_streaming_am_predictor
from paddlespeech.t2s.exps.syn_utils import get_voc_output
from paddlespeech.t2s.utils import str2bool
def parse_args():
parser = argparse.ArgumentParser(
description="Paddle Infernce with acoustic model & vocoder.")
# acoustic model
parser.add_argument(
'--am',
type=str,
default='fastspeech2_csmsc',
choices=['fastspeech2_csmsc'],
help='Choose acoustic model type of tts task.')
parser.add_argument(
"--am_stat",
type=str,
default=None,
help="mean and standard deviation used to normalize spectrogram when training acoustic model."
)
parser.add_argument(
"--phones_dict", type=str, default=None, help="phone vocabulary file.")
parser.add_argument(
"--tones_dict", type=str, default=None, help="tone vocabulary file.")
parser.add_argument(
"--speaker_dict", type=str, default=None, help="speaker id map file.")
parser.add_argument(
'--spk_id',
type=int,
default=0,
help='spk id for multi speaker acoustic model')
# voc
parser.add_argument(
'--voc',
type=str,
default='pwgan_csmsc',
choices=['pwgan_csmsc', 'mb_melgan_csmsc', 'hifigan_csmsc'],
help='Choose vocoder type of tts task.')
# other
parser.add_argument(
'--lang',
type=str,
default='zh',
help='Choose model language. zh or en')
parser.add_argument(
"--text",
type=str,
help="text to synthesize, a 'utt_id sentence' pair per line")
parser.add_argument(
"--inference_dir", type=str, help="dir to save inference models")
parser.add_argument("--output_dir", type=str, help="output dir")
# inference
parser.add_argument(
"--device",
default="gpu",
choices=["gpu", "cpu"],
help="Device selected for inference.", )
# streaming related
parser.add_argument(
"--am_streaming",
type=str2bool,
default=False,
help="whether use streaming acoustic model")
parser.add_argument(
"--chunk_size", type=int, default=42, help="chunk size of am streaming")
parser.add_argument(
"--pad_size", type=int, default=12, help="pad size of am streaming")
args, _ = parser.parse_known_args()
return args
# only inference for models trained with csmsc now
def main():
args = parse_args()
# frontend
frontend = get_frontend(args)
# am_predictor
am_encoder_infer_predictor, am_decoder_predictor, am_postnet_predictor = get_streaming_am_predictor(
args)
am_mu, am_std = np.load(args.am_stat)
# model: {model_name}_{dataset}
am_dataset = args.am[args.am.rindex('_') + 1:]
# voc_predictor
voc_predictor = get_predictor(args, filed='voc')
output_dir = Path(args.output_dir)
output_dir.mkdir(parents=True, exist_ok=True)
sentences = get_sentences(args)
merge_sentences = True
fs = 24000 if am_dataset != 'ljspeech' else 22050
# warmup
for utt_id, sentence in sentences[:3]:
with timer() as t:
normalized_mel = get_streaming_am_output(
args,
am_encoder_infer_predictor=am_encoder_infer_predictor,
am_decoder_predictor=am_decoder_predictor,
am_postnet_predictor=am_postnet_predictor,
frontend=frontend,
merge_sentences=merge_sentences,
input=sentence)
mel = denorm(normalized_mel, am_mu, am_std)
wav = get_voc_output(voc_predictor=voc_predictor, input=mel)
speed = wav.size / t.elapse
rtf = fs / speed
print(
f"{utt_id}, mel: {mel.shape}, wave: {wav.shape}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}."
)
print("warm up done!")
N = 0
T = 0
chunk_size = args.chunk_size
pad_size = args.pad_size
get_tone_ids = False
for utt_id, sentence in sentences:
with timer() as t:
# frontend
if args.lang == 'zh':
input_ids = frontend.get_input_ids(
sentence,
merge_sentences=merge_sentences,
get_tone_ids=get_tone_ids)
phone_ids = input_ids["phone_ids"]
else:
print("lang should be 'zh' here!")
phones = phone_ids[0].numpy()
# acoustic model
orig_hs = get_am_sublayer_output(
am_encoder_infer_predictor, input=phones)
if args.am_streaming:
hss = get_chunks(orig_hs, chunk_size, pad_size)
chunk_num = len(hss)
mel_list = []
for i, hs in enumerate(hss):
am_decoder_output = get_am_sublayer_output(
am_decoder_predictor, input=hs)
am_postnet_output = get_am_sublayer_output(
am_postnet_predictor,
input=np.transpose(am_decoder_output, (0, 2, 1)))
am_output_data = am_decoder_output + np.transpose(
am_postnet_output, (0, 2, 1))
normalized_mel = am_output_data[0]
sub_mel = denorm(normalized_mel, am_mu, am_std)
# clip output part of pad
if i == 0:
sub_mel = sub_mel[:-pad_size]
elif i == chunk_num - 1:
                        # the right side of the last chunk is definitely not padded enough
sub_mel = sub_mel[pad_size:]
else:
                        # the right side of the last few chunks may also not be padded enough
sub_mel = sub_mel[pad_size:(chunk_size + pad_size) -
sub_mel.shape[0]]
mel_list.append(sub_mel)
mel = np.concatenate(mel_list, axis=0)
else:
am_decoder_output = get_am_sublayer_output(
am_decoder_predictor, input=orig_hs)
am_postnet_output = get_am_sublayer_output(
am_postnet_predictor,
input=np.transpose(am_decoder_output, (0, 2, 1)))
am_output_data = am_decoder_output + np.transpose(
am_postnet_output, (0, 2, 1))
normalized_mel = am_output_data[0]
mel = denorm(normalized_mel, am_mu, am_std)
# vocoder
wav = get_voc_output(voc_predictor=voc_predictor, input=mel)
N += wav.size
T += t.elapse
speed = wav.size / t.elapse
rtf = fs / speed
sf.write(output_dir / (utt_id + ".wav"), wav, samplerate=24000)
print(
f"{utt_id}, mel: {mel.shape}, wave: {wav.shape}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}."
)
print(f"{utt_id} done!")
print(f"generation speed: {N / T}Hz, RTF: {fs / (N / T) }")
if __name__ == "__main__":
main()
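
To see how the pad trimming above lines up across chunk boundaries, here is a tiny worked check with the default chunk_size=42 and pad_size=12 (the data length of 100 is made up; for full-length middle chunks the slice [pad_size:(chunk_size + pad_size) - len] reduces to [12:-12]):

import math

chunk_size, pad_size, data_len = 42, 12, 100
n = math.ceil(data_len / chunk_size)  # 3 chunks
for i in range(n):
    start = max(0, i * chunk_size - pad_size)
    end = min((i + 1) * chunk_size + pad_size, data_len)
    print(i, start, end)
# chunk 0: frames  0..54,  keep [:-12]   -> mel frames  0..41
# chunk 1: frames 30..96,  keep [12:-12] -> mel frames 42..83
# chunk 2: frames 72..100, keep [12:]    -> mel frames 84..99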
...@@ -16,39 +16,14 @@ from pathlib import Path
import jsonlines
import numpy as np
import onnxruntime as ort
import soundfile as sf
from timer import timer

from paddlespeech.t2s.exps.syn_utils import get_sess
from paddlespeech.t2s.exps.syn_utils import get_test_dataset
from paddlespeech.t2s.utils import str2bool
def get_sess(args, filed='am'):
full_name = ''
if filed == 'am':
full_name = args.am
elif filed == 'voc':
full_name = args.voc
model_dir = str(Path(args.inference_dir) / (full_name + ".onnx"))
sess_options = ort.SessionOptions()
sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
sess_options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
if args.device == "gpu":
# fastspeech2/mb_melgan can't use trt now!
if args.use_trt:
providers = ['TensorrtExecutionProvider']
else:
providers = ['CUDAExecutionProvider']
elif args.device == "cpu":
providers = ['CPUExecutionProvider']
sess_options.intra_op_num_threads = args.cpu_threads
sess = ort.InferenceSession(
model_dir, providers=providers, sess_options=sess_options)
return sess
def ort_predict(args):
    # construct dataset for evaluation
    with jsonlines.open(args.test_metadata, 'r') as reader:
...@@ -131,7 +106,7 @@ def parse_args():
        '--voc',
        type=str,
        default='hifigan_csmsc',
        choices=['hifigan_csmsc', 'mb_melgan_csmsc', 'pwgan_csmsc'],
        help='Choose vocoder type of tts task.')
    # other
    parser.add_argument(
...
...@@ -15,40 +15,15 @@ import argparse
from pathlib import Path

import numpy as np
import onnxruntime as ort
import soundfile as sf
from timer import timer

from paddlespeech.t2s.exps.syn_utils import get_frontend
from paddlespeech.t2s.exps.syn_utils import get_sentences
from paddlespeech.t2s.exps.syn_utils import get_sess
from paddlespeech.t2s.utils import str2bool
def get_sess(args, filed='am'):
full_name = ''
if filed == 'am':
full_name = args.am
elif filed == 'voc':
full_name = args.voc
model_dir = str(Path(args.inference_dir) / (full_name + ".onnx"))
sess_options = ort.SessionOptions()
sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
sess_options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
if args.device == "gpu":
# fastspeech2/mb_melgan can't use trt now!
if args.use_trt:
providers = ['TensorrtExecutionProvider']
else:
providers = ['CUDAExecutionProvider']
elif args.device == "cpu":
providers = ['CPUExecutionProvider']
sess_options.intra_op_num_threads = args.cpu_threads
sess = ort.InferenceSession(
model_dir, providers=providers, sess_options=sess_options)
return sess
def ort_predict(args):
    # frontend
...@@ -156,7 +131,7 @@ def parse_args():
        '--voc',
        type=str,
        default='hifigan_csmsc',
        choices=['hifigan_csmsc', 'mb_melgan_csmsc', 'pwgan_csmsc'],
        help='Choose vocoder type of tts task.')
    # other
    parser.add_argument(
...
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
from pathlib import Path
import numpy as np
import soundfile as sf
from timer import timer
from paddlespeech.t2s.exps.syn_utils import denorm
from paddlespeech.t2s.exps.syn_utils import get_chunks
from paddlespeech.t2s.exps.syn_utils import get_frontend
from paddlespeech.t2s.exps.syn_utils import get_sentences
from paddlespeech.t2s.exps.syn_utils import get_sess
from paddlespeech.t2s.exps.syn_utils import get_streaming_am_sess
from paddlespeech.t2s.utils import str2bool
def ort_predict(args):
# frontend
frontend = get_frontend(args)
output_dir = Path(args.output_dir)
output_dir.mkdir(parents=True, exist_ok=True)
sentences = get_sentences(args)
am_name = args.am[:args.am.rindex('_')]
am_dataset = args.am[args.am.rindex('_') + 1:]
fs = 24000 if am_dataset != 'ljspeech' else 22050
# am
am_encoder_infer_sess, am_decoder_sess, am_postnet_sess = get_streaming_am_sess(
args)
am_mu, am_std = np.load(args.am_stat)
# vocoder
voc_sess = get_sess(args, filed='voc')
# frontend warmup
# Loading model cost 0.5+ seconds
if args.lang == 'zh':
frontend.get_input_ids("你好,欢迎使用飞桨框架进行深度学习研究!", merge_sentences=True)
else:
print("lang should in be 'zh' here!")
# am warmup
for T in [27, 38, 54]:
phone_ids = np.random.randint(1, 266, size=(T, ))
am_encoder_infer_sess.run(None, input_feed={'text': phone_ids})
am_decoder_input = np.random.rand(1, T * 15, 384).astype('float32')
am_decoder_sess.run(None, input_feed={'xs': am_decoder_input})
am_postnet_input = np.random.rand(1, 80, T * 15).astype('float32')
am_postnet_sess.run(None, input_feed={'xs': am_postnet_input})
# voc warmup
for T in [227, 308, 544]:
data = np.random.rand(T, 80).astype("float32")
voc_sess.run(None, input_feed={"logmel": data})
print("warm up done!")
N = 0
T = 0
merge_sentences = True
get_tone_ids = False
chunk_size = args.chunk_size
pad_size = args.pad_size
for utt_id, sentence in sentences:
with timer() as t:
if args.lang == 'zh':
input_ids = frontend.get_input_ids(
sentence,
merge_sentences=merge_sentences,
get_tone_ids=get_tone_ids)
phone_ids = input_ids["phone_ids"]
else:
print("lang should in be 'zh' here!")
# merge_sentences=True here, so we only use the first item of phone_ids
phone_ids = phone_ids[0].numpy()
orig_hs = am_encoder_infer_sess.run(
None, input_feed={'text': phone_ids})
if args.am_streaming:
hss = get_chunks(orig_hs[0], chunk_size, pad_size)
chunk_num = len(hss)
mel_list = []
for i, hs in enumerate(hss):
am_decoder_output = am_decoder_sess.run(
None, input_feed={'xs': hs})
am_postnet_output = am_postnet_sess.run(
None,
input_feed={
'xs': np.transpose(am_decoder_output[0], (0, 2, 1))
})
am_output_data = am_decoder_output + np.transpose(
am_postnet_output[0], (0, 2, 1))
normalized_mel = am_output_data[0][0]
sub_mel = denorm(normalized_mel, am_mu, am_std)
# clip output part of pad
if i == 0:
sub_mel = sub_mel[:-pad_size]
elif i == chunk_num - 1:
                        # the right side of the last chunk is definitely not padded enough
sub_mel = sub_mel[pad_size:]
else:
                        # the right side of the last few chunks may also not be padded enough
sub_mel = sub_mel[pad_size:(chunk_size + pad_size) -
sub_mel.shape[0]]
mel_list.append(sub_mel)
mel = np.concatenate(mel_list, axis=0)
else:
am_decoder_output = am_decoder_sess.run(
None, input_feed={'xs': orig_hs[0]})
am_postnet_output = am_postnet_sess.run(
None,
input_feed={
'xs': np.transpose(am_decoder_output[0], (0, 2, 1))
})
am_output_data = am_decoder_output + np.transpose(
am_postnet_output[0], (0, 2, 1))
normalized_mel = am_output_data[0]
mel = denorm(normalized_mel, am_mu, am_std)
mel = mel[0]
# vocoder
wav = voc_sess.run(output_names=None, input_feed={'logmel': mel})
N += len(wav[0])
T += t.elapse
speed = len(wav[0]) / t.elapse
rtf = fs / speed
sf.write(
str(output_dir / (utt_id + ".wav")),
np.array(wav)[0],
samplerate=fs)
print(
f"{utt_id}, mel: {mel.shape}, wave: {len(wav[0])}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}."
)
print(f"generation speed: {N / T}Hz, RTF: {fs / (N / T) }")
def parse_args():
    parser = argparse.ArgumentParser(description="Inference with onnxruntime.")
# acoustic model
parser.add_argument(
'--am',
type=str,
default='fastspeech2_csmsc',
choices=['fastspeech2_csmsc'],
help='Choose acoustic model type of tts task.')
parser.add_argument(
"--am_stat",
type=str,
default=None,
help="mean and standard deviation used to normalize spectrogram when training acoustic model."
)
parser.add_argument(
"--phones_dict", type=str, default=None, help="phone vocabulary file.")
parser.add_argument(
"--tones_dict", type=str, default=None, help="tone vocabulary file.")
# voc
parser.add_argument(
'--voc',
type=str,
default='hifigan_csmsc',
choices=['hifigan_csmsc', 'mb_melgan_csmsc', 'pwgan_csmsc'],
help='Choose vocoder type of tts task.')
# other
parser.add_argument(
"--inference_dir", type=str, help="dir to save inference models")
parser.add_argument(
"--text",
type=str,
help="text to synthesize, a 'utt_id sentence' pair per line")
parser.add_argument("--output_dir", type=str, help="output dir")
parser.add_argument(
'--lang',
type=str,
default='zh',
help='Choose model language. zh or en')
# inference
parser.add_argument(
"--use_trt",
type=str2bool,
default=False,
help="Whether to use inference engin TensorRT.", )
parser.add_argument(
"--device",
default="gpu",
choices=["gpu", "cpu"],
help="Device selected for inference.", )
parser.add_argument('--cpu_threads', type=int, default=1)
# streaming related
parser.add_argument(
"--am_streaming",
type=str2bool,
default=False,
help="whether use streaming acoustic model")
parser.add_argument(
"--chunk_size", type=int, default=42, help="chunk size of am streaming")
parser.add_argument(
"--pad_size", type=int, default=12, help="pad size of am streaming")
args, _ = parser.parse_known_args()
return args
def main():
args = parse_args()
ort_predict(args)
if __name__ == "__main__":
main()
...@@ -11,10 +11,14 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import os
from pathlib import Path

import numpy as np
import onnxruntime as ort
import paddle
from paddle import inference
from paddle import jit
from paddle.static import InputSpec
...@@ -62,6 +66,21 @@ model_alias = {
}
def denorm(data, mean, std):
return data * std + mean
def get_chunks(data, chunk_size, pad_size):
data_len = data.shape[1]
chunks = []
n = math.ceil(data_len / chunk_size)
for i in range(n):
start = max(0, i * chunk_size - pad_size)
end = min((i + 1) * chunk_size + pad_size, data_len)
chunks.append(data[:, start:end, :])
return chunks
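
A quick usage sketch for the two helpers above (shapes follow the (batch, frames, channels) layout the streaming callers use; the values are made up and this is not part of the module):

hs = np.zeros((1, 100, 384), dtype='float32')       # e.g. encoder output
chunks = get_chunks(hs, chunk_size=42, pad_size=12)
print([c.shape[1] for c in chunks])                 # [54, 66, 28]

# denorm undoes the ZScore normalization: data * std + mean
mel = denorm(np.zeros((5, 80), dtype='float32'), mean=0.0, std=1.0)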
# input
def get_sentences(args):
    # construct dataset for evaluation
...@@ -241,3 +260,221 @@ def voc_to_static(args, voc_inference):
    paddle.jit.save(voc_inference, os.path.join(args.inference_dir, args.voc))
    voc_inference = paddle.jit.load(os.path.join(args.inference_dir, args.voc))
    return voc_inference
# inference
def get_predictor(args, filed='am'):
full_name = ''
if filed == 'am':
full_name = args.am
elif filed == 'voc':
full_name = args.voc
config = inference.Config(
str(Path(args.inference_dir) / (full_name + ".pdmodel")),
str(Path(args.inference_dir) / (full_name + ".pdiparams")))
if args.device == "gpu":
config.enable_use_gpu(100, 0)
elif args.device == "cpu":
config.disable_gpu()
config.enable_memory_optim()
predictor = inference.create_predictor(config)
return predictor
def get_am_output(args, am_predictor, frontend, merge_sentences, input):
am_name = args.am[:args.am.rindex('_')]
am_dataset = args.am[args.am.rindex('_') + 1:]
am_input_names = am_predictor.get_input_names()
get_tone_ids = False
get_spk_id = False
if am_name == 'speedyspeech':
get_tone_ids = True
if am_dataset in {"aishell3", "vctk"} and args.speaker_dict:
get_spk_id = True
spk_id = np.array([args.spk_id])
if args.lang == 'zh':
input_ids = frontend.get_input_ids(
input, merge_sentences=merge_sentences, get_tone_ids=get_tone_ids)
phone_ids = input_ids["phone_ids"]
elif args.lang == 'en':
input_ids = frontend.get_input_ids(
input, merge_sentences=merge_sentences)
phone_ids = input_ids["phone_ids"]
else:
print("lang should in {'zh', 'en'}!")
if get_tone_ids:
tone_ids = input_ids["tone_ids"]
tones = tone_ids[0].numpy()
tones_handle = am_predictor.get_input_handle(am_input_names[1])
tones_handle.reshape(tones.shape)
tones_handle.copy_from_cpu(tones)
if get_spk_id:
spk_id_handle = am_predictor.get_input_handle(am_input_names[1])
spk_id_handle.reshape(spk_id.shape)
spk_id_handle.copy_from_cpu(spk_id)
phones = phone_ids[0].numpy()
phones_handle = am_predictor.get_input_handle(am_input_names[0])
phones_handle.reshape(phones.shape)
phones_handle.copy_from_cpu(phones)
am_predictor.run()
am_output_names = am_predictor.get_output_names()
am_output_handle = am_predictor.get_output_handle(am_output_names[0])
am_output_data = am_output_handle.copy_to_cpu()
return am_output_data
def get_voc_output(voc_predictor, input):
voc_input_names = voc_predictor.get_input_names()
mel_handle = voc_predictor.get_input_handle(voc_input_names[0])
mel_handle.reshape(input.shape)
mel_handle.copy_from_cpu(input)
voc_predictor.run()
voc_output_names = voc_predictor.get_output_names()
voc_output_handle = voc_predictor.get_output_handle(voc_output_names[0])
wav = voc_output_handle.copy_to_cpu()
return wav
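
Taken together, the non-streaming Paddle Inference path wires these helpers up roughly as follows (a caller-side sketch; args, frontend, and sentence come from the surrounding script, and the output filename is arbitrary):

import soundfile as sf

am_predictor = get_predictor(args, filed='am')
voc_predictor = get_predictor(args, filed='voc')
mel = get_am_output(
    args, am_predictor, frontend, merge_sentences=True, input=sentence)
wav = get_voc_output(voc_predictor=voc_predictor, input=mel)
sf.write("demo.wav", wav, samplerate=24000)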
# streaming am
def get_streaming_am_predictor(args):
full_name = args.am
am_encoder_infer_config = inference.Config(
str(
Path(args.inference_dir) /
(full_name + "_am_encoder_infer" + ".pdmodel")),
str(
Path(args.inference_dir) /
(full_name + "_am_encoder_infer" + ".pdiparams")))
am_decoder_config = inference.Config(
str(
Path(args.inference_dir) /
(full_name + "_am_decoder" + ".pdmodel")),
str(
Path(args.inference_dir) /
(full_name + "_am_decoder" + ".pdiparams")))
am_postnet_config = inference.Config(
str(
Path(args.inference_dir) /
(full_name + "_am_postnet" + ".pdmodel")),
str(
Path(args.inference_dir) /
(full_name + "_am_postnet" + ".pdiparams")))
if args.device == "gpu":
am_encoder_infer_config.enable_use_gpu(100, 0)
am_decoder_config.enable_use_gpu(100, 0)
am_postnet_config.enable_use_gpu(100, 0)
elif args.device == "cpu":
am_encoder_infer_config.disable_gpu()
am_decoder_config.disable_gpu()
am_postnet_config.disable_gpu()
am_encoder_infer_config.enable_memory_optim()
am_decoder_config.enable_memory_optim()
am_postnet_config.enable_memory_optim()
am_encoder_infer_predictor = inference.create_predictor(
am_encoder_infer_config)
am_decoder_predictor = inference.create_predictor(am_decoder_config)
am_postnet_predictor = inference.create_predictor(am_postnet_config)
return am_encoder_infer_predictor, am_decoder_predictor, am_postnet_predictor
def get_am_sublayer_output(am_sublayer_predictor, input):
am_sublayer_input_names = am_sublayer_predictor.get_input_names()
input_handle = am_sublayer_predictor.get_input_handle(
am_sublayer_input_names[0])
input_handle.reshape(input.shape)
input_handle.copy_from_cpu(input)
am_sublayer_predictor.run()
am_sublayer_names = am_sublayer_predictor.get_output_names()
am_sublayer_handle = am_sublayer_predictor.get_output_handle(
am_sublayer_names[0])
am_sublayer_output = am_sublayer_handle.copy_to_cpu()
return am_sublayer_output
def get_streaming_am_output(args, am_encoder_infer_predictor,
am_decoder_predictor, am_postnet_predictor,
frontend, merge_sentences, input):
get_tone_ids = False
if args.lang == 'zh':
input_ids = frontend.get_input_ids(
input, merge_sentences=merge_sentences, get_tone_ids=get_tone_ids)
phone_ids = input_ids["phone_ids"]
else:
print("lang should be 'zh' here!")
phones = phone_ids[0].numpy()
am_encoder_infer_output = get_am_sublayer_output(
am_encoder_infer_predictor, input=phones)
am_decoder_output = get_am_sublayer_output(
am_decoder_predictor, input=am_encoder_infer_output)
am_postnet_output = get_am_sublayer_output(
am_postnet_predictor, input=np.transpose(am_decoder_output, (0, 2, 1)))
am_output_data = am_decoder_output + np.transpose(am_postnet_output,
(0, 2, 1))
normalized_mel = am_output_data[0]
return normalized_mel
def get_sess(args, filed='am'):
full_name = ''
if filed == 'am':
full_name = args.am
elif filed == 'voc':
full_name = args.voc
model_dir = str(Path(args.inference_dir) / (full_name + ".onnx"))
sess_options = ort.SessionOptions()
sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
sess_options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
if args.device == "gpu":
# fastspeech2/mb_melgan can't use trt now!
if args.use_trt:
providers = ['TensorrtExecutionProvider']
else:
providers = ['CUDAExecutionProvider']
elif args.device == "cpu":
providers = ['CPUExecutionProvider']
sess_options.intra_op_num_threads = args.cpu_threads
sess = ort.InferenceSession(
model_dir, providers=providers, sess_options=sess_options)
return sess
# streaming am
def get_streaming_am_sess(args):
full_name = args.am
am_encoder_infer_model_dir = str(
Path(args.inference_dir) / (full_name + "_am_encoder_infer" + ".onnx"))
am_decoder_model_dir = str(
Path(args.inference_dir) / (full_name + "_am_decoder" + ".onnx"))
am_postnet_model_dir = str(
Path(args.inference_dir) / (full_name + "_am_postnet" + ".onnx"))
sess_options = ort.SessionOptions()
sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
sess_options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
if args.device == "gpu":
# fastspeech2/mb_melgan can't use trt now!
if args.use_trt:
providers = ['TensorrtExecutionProvider']
else:
providers = ['CUDAExecutionProvider']
elif args.device == "cpu":
providers = ['CPUExecutionProvider']
sess_options.intra_op_num_threads = args.cpu_threads
am_encoder_infer_sess = ort.InferenceSession(
am_encoder_infer_model_dir,
providers=providers,
sess_options=sess_options)
am_decoder_sess = ort.InferenceSession(
am_decoder_model_dir, providers=providers, sess_options=sess_options)
am_postnet_sess = ort.InferenceSession(
am_postnet_model_dir, providers=providers, sess_options=sess_options)
return am_encoder_infer_sess, am_decoder_sess, am_postnet_sess
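# The three ONNX sessions above mirror the three Paddle predictors used by
# get_streaming_am_output(). A minimal sketch of chaining them, assuming each
# session takes a single tensor and returns a single tensor (the function and
# variable names here are illustrative, not part of the released API):
def get_streaming_am_output_onnx(am_encoder_infer_sess, am_decoder_sess,
                                 am_postnet_sess, phones):
    # phones: int64 phone-id array produced by the frontend
    enc_in = am_encoder_infer_sess.get_inputs()[0].name
    orig_hs = am_encoder_infer_sess.run(None, {enc_in: phones})[0]
    dec_in = am_decoder_sess.get_inputs()[0].name
    am_decoder_output = am_decoder_sess.run(None, {dec_in: orig_hs})[0]
    post_in = am_postnet_sess.get_inputs()[0].name
    am_postnet_output = am_postnet_sess.run(
        None, {post_in: np.transpose(am_decoder_output, (0, 2, 1))})[0]
    # residual connection, same as in the static-graph path
    am_output_data = am_decoder_output + np.transpose(am_postnet_output,
                                                      (0, 2, 1))
    return am_output_data[0]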
...@@ -12,39 +12,29 @@ ...@@ -12,39 +12,29 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import argparse import argparse
import math import os
from pathlib import Path from pathlib import Path
import numpy as np import numpy as np
import paddle import paddle
import soundfile as sf import soundfile as sf
import yaml import yaml
from paddle import jit
from paddle.static import InputSpec
from timer import timer from timer import timer
from yacs.config import CfgNode from yacs.config import CfgNode
from paddlespeech.s2t.utils.dynamic_import import dynamic_import from paddlespeech.s2t.utils.dynamic_import import dynamic_import
from paddlespeech.t2s.exps.syn_utils import denorm
from paddlespeech.t2s.exps.syn_utils import get_chunks
from paddlespeech.t2s.exps.syn_utils import get_frontend from paddlespeech.t2s.exps.syn_utils import get_frontend
from paddlespeech.t2s.exps.syn_utils import get_sentences from paddlespeech.t2s.exps.syn_utils import get_sentences
from paddlespeech.t2s.exps.syn_utils import get_voc_inference from paddlespeech.t2s.exps.syn_utils import get_voc_inference
from paddlespeech.t2s.exps.syn_utils import model_alias from paddlespeech.t2s.exps.syn_utils import model_alias
from paddlespeech.t2s.exps.syn_utils import voc_to_static
from paddlespeech.t2s.utils import str2bool from paddlespeech.t2s.utils import str2bool
def denorm(data, mean, std):
return data * std + mean
def get_chunks(data, chunk_size, pad_size):
data_len = data.shape[1]
chunks = []
n = math.ceil(data_len / chunk_size)
for i in range(n):
start = max(0, i * chunk_size - pad_size)
end = min((i + 1) * chunk_size + pad_size, data_len)
chunks.append(data[:, start:end, :])
return chunks
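# Worked example: chunk_size=3 and pad_size=1 on a length-7 sequence yield
# slices [0:4], [2:7] and [5:7] along axis 1, so every chunk carries up to
# pad_size frames of left/right context for the streaming decoder.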
def evaluate(args): def evaluate(args):
# Init body. # Init body.
...@@ -84,9 +74,49 @@ def evaluate(args): ...@@ -84,9 +74,49 @@ def evaluate(args):
am_mu = paddle.to_tensor(am_mu) am_mu = paddle.to_tensor(am_mu)
am_std = paddle.to_tensor(am_std) am_std = paddle.to_tensor(am_std)
# am sub layers
am_encoder_infer = am.encoder_infer
am_decoder = am.decoder
am_postnet = am.postnet
# vocoder # vocoder
voc_inference = get_voc_inference(args, voc_config) voc_inference = get_voc_inference(args, voc_config)
# whether to convert dygraph to static graph
if args.inference_dir:
# fastspeech2 cnndecoder to static
# am.encoder_infer
am_encoder_infer = jit.to_static(
am_encoder_infer, input_spec=[InputSpec([-1], dtype=paddle.int64)])
paddle.jit.save(am_encoder_infer,
os.path.join(args.inference_dir,
args.am + "_am_encoder_infer"))
am_encoder_infer = paddle.jit.load(
os.path.join(args.inference_dir, args.am + "_am_encoder_infer"))
# am.decoder
am_decoder = jit.to_static(
am_decoder,
input_spec=[InputSpec([1, -1, 384], dtype=paddle.float32)])
paddle.jit.save(am_decoder,
os.path.join(args.inference_dir,
args.am + "_am_decoder"))
am_decoder = paddle.jit.load(
os.path.join(args.inference_dir, args.am + "_am_decoder"))
# am.postnet
am_postnet = jit.to_static(
am_postnet,
input_spec=[InputSpec([1, 80, -1], dtype=paddle.float32)])
paddle.jit.save(am_postnet,
os.path.join(args.inference_dir,
args.am + "_am_postnet"))
am_postnet = paddle.jit.load(
os.path.join(args.inference_dir, args.am + "_am_postnet"))
# vocoder
voc_inference = voc_to_static(args, voc_inference)
output_dir = Path(args.output_dir) output_dir = Path(args.output_dir)
output_dir.mkdir(parents=True, exist_ok=True) output_dir.mkdir(parents=True, exist_ok=True)
merge_sentences = True merge_sentences = True
...@@ -107,20 +137,19 @@ def evaluate(args): ...@@ -107,20 +137,19 @@ def evaluate(args):
phone_ids = input_ids["phone_ids"] phone_ids = input_ids["phone_ids"]
else: else:
print("lang should in be 'zh' here!") print("lang should be 'zh' here!")
# merge_sentences=True here, so we only use the first item of phone_ids # merge_sentences=True here, so we only use the first item of phone_ids
phone_ids = phone_ids[0] phone_ids = phone_ids[0]
with paddle.no_grad(): with paddle.no_grad():
# acoustic model # acoustic model
orig_hs, h_masks = am.encoder_infer(phone_ids) orig_hs = am_encoder_infer(phone_ids)
if args.am_streaming: if args.am_streaming:
hss = get_chunks(orig_hs, chunk_size, pad_size) hss = get_chunks(orig_hs, chunk_size, pad_size)
chunk_num = len(hss) chunk_num = len(hss)
mel_list = [] mel_list = []
for i, hs in enumerate(hss): for i, hs in enumerate(hss):
before_outs, _ = am.decoder(hs) before_outs = am_decoder(hs)
after_outs = before_outs + am.postnet( after_outs = before_outs + am_postnet(
before_outs.transpose((0, 2, 1))).transpose( before_outs.transpose((0, 2, 1))).transpose(
(0, 2, 1)) (0, 2, 1))
normalized_mel = after_outs[0] normalized_mel = after_outs[0]
...@@ -139,8 +168,8 @@ def evaluate(args): ...@@ -139,8 +168,8 @@ def evaluate(args):
mel = paddle.concat(mel_list, axis=0) mel = paddle.concat(mel_list, axis=0)
else: else:
before_outs, _ = am.decoder(orig_hs) before_outs = am_decoder(orig_hs)
after_outs = before_outs + am.postnet( after_outs = before_outs + am_postnet(
before_outs.transpose((0, 2, 1))).transpose((0, 2, 1)) before_outs.transpose((0, 2, 1))).transpose((0, 2, 1))
normalized_mel = after_outs[0] normalized_mel = after_outs[0]
mel = denorm(normalized_mel, am_mu, am_std) mel = denorm(normalized_mel, am_mu, am_std)
...@@ -201,16 +230,9 @@ def parse_args(): ...@@ -201,16 +230,9 @@ def parse_args():
default='pwgan_csmsc', default='pwgan_csmsc',
choices=[ choices=[
'pwgan_csmsc', 'pwgan_csmsc',
'pwgan_ljspeech',
'pwgan_aishell3',
'pwgan_vctk',
'mb_melgan_csmsc', 'mb_melgan_csmsc',
'style_melgan_csmsc', 'style_melgan_csmsc',
'hifigan_csmsc', 'hifigan_csmsc',
'hifigan_ljspeech',
'hifigan_aishell3',
'hifigan_vctk',
'wavernn_csmsc',
], ],
help='Choose vocoder type of tts task.') help='Choose vocoder type of tts task.')
parser.add_argument( parser.add_argument(
...@@ -233,13 +255,19 @@ def parse_args(): ...@@ -233,13 +255,19 @@ def parse_args():
default='zh', default='zh',
help='Choose model language. zh or en') help='Choose model language. zh or en')
parser.add_argument(
"--inference_dir",
type=str,
default=None,
help="dir to save inference models")
parser.add_argument( parser.add_argument(
"--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.")
parser.add_argument( parser.add_argument(
"--text", "--text",
type=str, type=str,
help="text to synthesize, a 'utt_id sentence' pair per line.") help="text to synthesize, a 'utt_id sentence' pair per line.")
# streaming related
parser.add_argument( parser.add_argument(
"--am_streaming", "--am_streaming",
type=str2bool, type=str2bool,
......
...@@ -590,15 +590,17 @@ class FastSpeech2(nn.Layer): ...@@ -590,15 +590,17 @@ class FastSpeech2(nn.Layer):
h_masks = self._source_mask(olens_in) h_masks = self._source_mask(olens_in)
else: else:
h_masks = None h_masks = None
if return_after_enc: if return_after_enc:
return hs, h_masks return hs, h_masks
# (B, Lmax, adim)
zs, _ = self.decoder(hs, h_masks)
# (B, Lmax, odim)
if self.decoder_type == 'cnndecoder': if self.decoder_type == 'cnndecoder':
# remove output masks for dygraph to static graph
zs = self.decoder(hs, h_masks)
before_outs = zs before_outs = zs
else: else:
# (B, Lmax, adim)
zs, _ = self.decoder(hs, h_masks)
# (B, Lmax, odim)
before_outs = self.feat_out(zs).reshape( before_outs = self.feat_out(zs).reshape(
(paddle.shape(zs)[0], -1, self.odim)) (paddle.shape(zs)[0], -1, self.odim))
...@@ -633,7 +635,8 @@ class FastSpeech2(nn.Layer): ...@@ -633,7 +635,8 @@ class FastSpeech2(nn.Layer):
tone_id = tone_id.unsqueeze(0) tone_id = tone_id.unsqueeze(0)
# (1, L, odim) # (1, L, odim)
hs, h_masks = self._forward( # use *_ to avoid bug in dygraph to static graph
hs, *_ = self._forward(
xs, xs,
ilens, ilens,
is_inference=True, is_inference=True,
...@@ -642,7 +645,7 @@ class FastSpeech2(nn.Layer): ...@@ -642,7 +645,7 @@ class FastSpeech2(nn.Layer):
spk_emb=spk_emb, spk_emb=spk_emb,
spk_id=spk_id, spk_id=spk_id,
tone_id=tone_id) tone_id=tone_id)
return hs, h_masks return hs
def inference( def inference(
self, self,
......
...@@ -602,7 +602,7 @@ class CNNDecoder(nn.Layer): ...@@ -602,7 +602,7 @@ class CNNDecoder(nn.Layer):
if masks is not None: if masks is not None:
outputs = outputs * masks outputs = outputs * masks
outputs = outputs.transpose([0, 2, 1]) outputs = outputs.transpose([0, 2, 1])
return outputs, masks return outputs
class CNNPostnet(nn.Layer): class CNNPostnet(nn.Layer):
......
# Copyright (c) 2022 SpeechBrain Authors. All Rights Reserved. # Copyright (c) 2022 PaddlePaddle and SpeechBrain Authors. All Rights Reserved.
# #
# Licensed under the Apache License, Version 2.0 (the "License"); # Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License. # you may not use this file except in compliance with the License.
...@@ -18,12 +18,14 @@ This script has an optional dependency on open source sklearn library. ...@@ -18,12 +18,14 @@ This script has an optional dependency on open source sklearn library.
A few sklearn functions are modified in this script as per requirement. A few sklearn functions are modified in this script as per requirement.
""" """
import argparse import argparse
import copy
import warnings import warnings
from distutils.util import strtobool from distutils.util import strtobool
import numpy as np import numpy as np
import scipy import scipy
import sklearn import sklearn
from scipy import linalg
from scipy import sparse from scipy import sparse
from scipy.sparse.csgraph import connected_components from scipy.sparse.csgraph import connected_components
from scipy.sparse.csgraph import laplacian as csgraph_laplacian from scipy.sparse.csgraph import laplacian as csgraph_laplacian
...@@ -346,6 +348,8 @@ class EmbeddingMeta: ...@@ -346,6 +348,8 @@ class EmbeddingMeta:
--------- ---------
segset : list segset : list
List of session IDs as an array of strings. List of session IDs as an array of strings.
modelset : list
List of model IDs as an array of strings.
stats : tensor stats : tensor
An ndarray of float64. Each line contains embedding An ndarray of float64. Each line contains embedding
from the corresponding session. from the corresponding session.
...@@ -354,15 +358,20 @@ class EmbeddingMeta: ...@@ -354,15 +358,20 @@ class EmbeddingMeta:
def __init__( def __init__(
self, self,
segset=None, segset=None,
modelset=None,
stats=None, ): stats=None, ):
if segset is None: if segset is None:
self.segset = numpy.empty(0, dtype="|O") self.segset = np.empty(0, dtype="|O")
self.stats = numpy.array([], dtype=np.float64) self.modelset = np.empty(0, dtype="|O")
self.stats = np.array([], dtype=np.float64)
else: else:
self.segset = segset self.segset = segset
self.modelset = modelset
self.stats = stats self.stats = stats
self.stat0 = np.array([[1.0]] * self.stats.shape[0])
def norm_stats(self): def norm_stats(self):
""" """
Divide all first-order statistics by their Euclidean norm. Divide all first-order statistics by their Euclidean norm.
...@@ -371,6 +380,188 @@ class EmbeddingMeta: ...@@ -371,6 +380,188 @@ class EmbeddingMeta:
vect_norm = np.clip(np.linalg.norm(self.stats, axis=1), 1e-08, np.inf) vect_norm = np.clip(np.linalg.norm(self.stats, axis=1), 1e-08, np.inf)
self.stats = (self.stats.transpose() / vect_norm).transpose() self.stats = (self.stats.transpose() / vect_norm).transpose()
def get_mean_stats(self):
"""
Return the mean of first order statistics.
"""
mu = np.mean(self.stats, axis=0)
return mu
    def get_total_covariance_stats(self):
        """
        Compute and return the total covariance matrix of the first-order
        statistics, i.e. Sigma = (X - mu)^T (X - mu) / N, where X is the
        stats matrix and mu its column-wise mean.
        """
        C = self.stats - self.stats.mean(axis=0)
        return np.dot(C.transpose(), C) / self.stats.shape[0]
def get_model_stat0(self, mod_id):
"""Return zero-order statistics of a given model
Arguments
---------
mod_id : str
            ID of the model whose stat0 will be returned.
"""
S = self.stat0[self.modelset == mod_id, :]
return S
def get_model_stats(self, mod_id):
"""Return first-order statistics of a given model.
Arguments
---------
mod_id : str
            ID of the model whose first-order statistics will be returned.
"""
return self.stats[self.modelset == mod_id, :]
def sum_stat_per_model(self):
"""
Sum the zero- and first-order statistics per model and store them
in a new EmbeddingMeta.
        Returns an EmbeddingMeta object with the statistics summed per model
        and a numpy array with the number of sessions per model.
"""
sts_per_model = EmbeddingMeta()
sts_per_model.modelset = np.unique(
self.modelset) # nd: get uniq spkr ids
sts_per_model.segset = copy.deepcopy(sts_per_model.modelset)
sts_per_model.stat0 = np.zeros(
(sts_per_model.modelset.shape[0], self.stat0.shape[1]),
dtype=np.float64, )
sts_per_model.stats = np.zeros(
(sts_per_model.modelset.shape[0], self.stats.shape[1]),
dtype=np.float64, )
session_per_model = np.zeros(np.unique(self.modelset).shape[0])
# For each model sum the stats
for idx, model in enumerate(sts_per_model.modelset):
sts_per_model.stat0[idx, :] = self.get_model_stat0(model).sum(
axis=0)
sts_per_model.stats[idx, :] = self.get_model_stats(model).sum(
axis=0)
session_per_model[idx] += self.get_model_stats(model).shape[0]
return sts_per_model, session_per_model
def center_stats(self, mu):
"""
Center first order statistics.
Arguments
---------
mu : array
Array to center on.
"""
        # integer division: the stats dim is a whole multiple of the stat0 dim
        dim = self.stats.shape[1] // self.stat0.shape[1]
index_map = np.repeat(np.arange(self.stat0.shape[1]), dim)
self.stats = self.stats - (self.stat0[:, index_map] *
mu.astype(np.float64))
def rotate_stats(self, R):
"""
Rotate first-order statistics by a right-product.
Arguments
---------
R : ndarray
Matrix to use for right product on the first order statistics.
"""
self.stats = np.dot(self.stats, R)
def whiten_stats(self, mu, sigma, isSqrInvSigma=False):
"""
Whiten first-order statistics
If sigma.ndim == 1, case of a diagonal covariance.
If sigma.ndim == 2, case of a single Gaussian with full covariance.
If sigma.ndim == 3, case of a full covariance UBM.
Arguments
---------
mu : array
Mean vector to be subtracted from the statistics.
        sigma : ndarray
            Covariance matrix or covariance super-vector.
isSqrInvSigma : bool
True if the input Sigma matrix is the inverse of the square root of a covariance matrix.
"""
if sigma.ndim == 1:
self.center_stats(mu)
self.stats = self.stats / np.sqrt(sigma.astype(np.float64))
elif sigma.ndim == 2:
# Compute the inverse square root of the co-variance matrix Sigma
sqr_inv_sigma = sigma
if not isSqrInvSigma:
# eigen_values, eigen_vectors = scipy.linalg.eigh(sigma)
eigen_values, eigen_vectors = linalg.eigh(sigma)
ind = eigen_values.real.argsort()[::-1]
eigen_values = eigen_values.real[ind]
eigen_vectors = eigen_vectors.real[:, ind]
sqr_inv_eval_sigma = 1 / np.sqrt(eigen_values.real)
sqr_inv_sigma = np.dot(eigen_vectors,
np.diag(sqr_inv_eval_sigma))
else:
pass
# Whitening of the first-order statistics
self.center_stats(mu) # CENTERING
self.rotate_stats(sqr_inv_sigma)
elif sigma.ndim == 3:
# we assume that sigma is a 3D ndarray of size D x n x n
# where D is the number of distributions and n is the dimension of a single distribution
n = self.stats.shape[1] // self.stat0.shape[1]
sess_nb = self.stat0.shape[0]
self.center_stats(mu)
self.stats = (np.einsum("ikj,ikl->ilj",
self.stats.T.reshape(-1, n, sess_nb), sigma)
.reshape(-1, sess_nb).T)
else:
            raise Exception("Wrong dimension of Sigma, must be 1, 2 or 3")
def align_models(self, model_list):
"""
Align models of the current EmbeddingMeta to match a list of models
        provided as an input parameter. The size of the EmbeddingMeta might be
        reduced to match the input list of models.
Arguments
---------
model_list : ndarray of strings
List of models to match.
"""
indx = np.array(
[np.argwhere(self.modelset == v)[0][0] for v in model_list])
self.segset = self.segset[indx]
self.modelset = self.modelset[indx]
self.stat0 = self.stat0[indx, :]
self.stats = self.stats[indx, :]
def align_segments(self, segment_list):
"""
        Align segments of the current EmbeddingMeta to match a list of segments
        provided as an input parameter. The size of the EmbeddingMeta might be
        reduced to match the input list of segments.
Arguments
---------
segment_list: ndarray of strings
list of segments to match
"""
indx = np.array(
[np.argwhere(self.segset == v)[0][0] for v in segment_list])
self.segset = self.segset[indx]
self.modelset = self.modelset[indx]
self.stat0 = self.stat0[indx, :]
self.stats = self.stats[indx, :]
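# A minimal usage sketch (synthetic values; the embedding dim is illustrative):
#   segs = np.array(["utt1", "utt2", "utt3"], dtype="|O")
#   spks = np.array(["spk1", "spk1", "spk2"], dtype="|O")
#   emb = np.random.rand(3, 192)                   # one embedding per session
#   meta = EmbeddingMeta(segset=segs, modelset=spks, stats=emb)
#   meta.norm_stats()                              # unit-normalize each row
#   per_spk, n_sess = meta.sum_stat_per_model()    # stats summed per speaker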
class SpecClustUnorm: class SpecClustUnorm:
""" """
......
This diff has been collapsed.
...@@ -26,14 +26,14 @@ from paddleaudio.compliance.librosa import mfcc ...@@ -26,14 +26,14 @@ from paddleaudio.compliance.librosa import mfcc
class meta_info: class meta_info:
"""the audio meta info in the vector JSONDataset """the audio meta info in the vector JSONDataset
Args: Args:
id (str): the segment name utt_id (str): the segment name
duration (float): segment time duration (float): segment time
wav (str): wav file path wav (str): wav file path
start (int): start point in the original wav file start (int): start point in the original wav file
stop (int): stop point in the original wav file stop (int): stop point in the original wav file
lab_id (str): the record id lab_id (str): the record id
""" """
id: str utt_id: str
duration: float duration: float
wav: str wav: str
start: int start: int
......
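# Illustrative construction of one entry (hypothetical values; assumes
# meta_info is the @dataclass defined in this file):
#   info = meta_info(utt_id="BAC009S0002W0122", duration=2.5,
#                    wav="data/wav/BAC009S0002W0122.wav",
#                    start=0, stop=40000, lab_id="S0002")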
...@@ -42,6 +42,7 @@ base = [ ...@@ -42,6 +42,7 @@ base = [
"loguru", "loguru",
"matplotlib", "matplotlib",
"nara_wpe", "nara_wpe",
"onnxruntime",
"pandas", "pandas",
"paddleaudio", "paddleaudio",
"paddlenlp", "paddlenlp",
...@@ -64,6 +65,7 @@ base = [ ...@@ -64,6 +65,7 @@ base = [
"webrtcvad", "webrtcvad",
"yacs~=0.1.8", "yacs~=0.1.8",
"prettytable", "prettytable",
"zhon",
] ]
server = [ server = [
...@@ -90,7 +92,6 @@ requirements = { ...@@ -90,7 +92,6 @@ requirements = {
"unidecode", "unidecode",
"yq", "yq",
"pre-commit", "pre-commit",
"zhon",
] ]
} }
......
# Examples for SpeechX # Examples for SpeechX
* dev - for speechx developer, using for test.
* ngram - using to build NGram ARPA lm.
* ds2_ol - ds2 streaming test under `aishell-1` test dataset. * ds2_ol - ds2 streaming test under `aishell-1` test dataset.
The entrypoint is `ds2_ol/aishell/run.sh` The entrypoint is `ds2_ol/aishell/run.sh`
## How to run ## How to run
`run.sh` is the entry point. `run.sh` is the entry point.
...@@ -17,9 +15,23 @@ pushd ds2_ol/aishell ...@@ -17,9 +15,23 @@ pushd ds2_ol/aishell
bash run.sh bash run.sh
``` ```
## Display Model with [Netron](https://github.com/lutzroeder/netron) ## Display Model with [Netron](https://github.com/lutzroeder/netron)
``` ```
pip install netron pip install netron
netron exp/deepspeech2_online/checkpoints/avg_1.jit.pdmodel --port 8022 --host 10.21.55.20 netron exp/deepspeech2_online/checkpoints/avg_1.jit.pdmodel --port 8022 --host 10.21.55.20
``` ```
## For Developers
> Warning: For developers only; make sure you know what you are doing.
* dev - for speechx developers, used for testing.
## Build WFST
> Warning: Use the examples below only when you know what they do.
* text_lm - preprocess text for building the LM
* ngram - used to build the n-gram ARPA LM
* wfst - build the TLG WFST for decoding
...@@ -10,12 +10,18 @@ Other -> 0.00 % N=0 C=0 S=0 D=0 I=0 ...@@ -10,12 +10,18 @@ Other -> 0.00 % N=0 C=0 S=0 D=0 I=0
## CTC Prefix Beam Search w LM ## CTC Prefix Beam Search w LM
LM: zh_giga.no_cna_cmn.prune01244.klm
``` ```
Overall -> 7.86 % N=104768 C=96865 S=7573 D=330 I=327
Mandarin -> 7.86 % N=104768 C=96865 S=7573 D=330 I=327
Other -> 0.00 % N=0 C=0 S=0 D=0 I=0
``` ```
## CTC WFST ## CTC WFST
LM: trained on the aishell train-set transcripts
```
Overall -> 11.14 % N=103017 C=93363 S=9583 D=71 I=1819
Mandarin -> 11.14 % N=103017 C=93363 S=9583 D=71 I=1818
Other -> 0.00 % N=0 C=0 S=0 D=0 I=1
``` ```
```
\ No newline at end of file
...@@ -11,4 +11,4 @@ TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin ...@@ -11,4 +11,4 @@ TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin
export LC_ALL=C export LC_ALL=C
SPEECHX_BIN=$SPEECHX_EXAMPLES/ds2_ol/decoder:$SPEECHX_EXAMPLES/ds2_ol/feat SPEECHX_BIN=$SPEECHX_EXAMPLES/ds2_ol/decoder:$SPEECHX_EXAMPLES/ds2_ol/feat
export PATH=$PATH:$SPEECHX_BIN:$TOOLS_BIN export PATH=$PATH:$SPEECHX_BIN:$TOOLS_BIN
\ No newline at end of file
...@@ -5,7 +5,10 @@ set -e ...@@ -5,7 +5,10 @@ set -e
. path.sh . path.sh
nj=40 nj=40
stage=0
stop_stage=100
. utils/parse_options.sh
# 1. compile # 1. compile
if [ ! -d ${SPEECHX_EXAMPLES} ]; then if [ ! -d ${SPEECHX_EXAMPLES} ]; then
...@@ -26,102 +29,112 @@ vocb_dir=$ckpt_dir/data/lang_char/ ...@@ -26,102 +29,112 @@ vocb_dir=$ckpt_dir/data/lang_char/
mkdir -p exp mkdir -p exp
exp=$PWD/exp exp=$PWD/exp
aishell_wav_scp=aishell_test.scp if [ $stage -le 0 ] && [ $stop_stage -ge 0 ];then
if [ ! -d $data/test ]; then aishell_wav_scp=aishell_test.scp
pushd $data if [ ! -d $data/test ]; then
wget -c https://paddlespeech.bj.bcebos.com/s2t/paddle_asr_online/aishell_test.zip pushd $data
unzip aishell_test.zip wget -c https://paddlespeech.bj.bcebos.com/s2t/paddle_asr_online/aishell_test.zip
popd unzip aishell_test.zip
popd
realpath $data/test/*/*.wav > $data/wavlist
awk -F '/' '{ print $(NF) }' $data/wavlist | awk -F '.' '{ print $1 }' > $data/utt_id realpath $data/test/*/*.wav > $data/wavlist
paste $data/utt_id $data/wavlist > $data/$aishell_wav_scp awk -F '/' '{ print $(NF) }' $data/wavlist | awk -F '.' '{ print $1 }' > $data/utt_id
fi paste $data/utt_id $data/wavlist > $data/$aishell_wav_scp
fi
if [ ! -d $ckpt_dir ]; then
mkdir -p $ckpt_dir if [ ! -d $ckpt_dir ]; then
wget -P $ckpt_dir -c https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz mkdir -p $ckpt_dir
tar xzfv $model_dir/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz -C $ckpt_dir wget -P $ckpt_dir -c https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz
fi tar xzfv $model_dir/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz -C $ckpt_dir
fi
lm=$data/zh_giga.no_cna_cmn.prune01244.klm
if [ ! -f $lm ]; then lm=$data/zh_giga.no_cna_cmn.prune01244.klm
pushd $data if [ ! -f $lm ]; then
wget -c https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm pushd $data
popd wget -c https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm
popd
fi
fi fi
# 3. make feature # 3. make feature
text=$data/test/text
label_file=./aishell_result label_file=./aishell_result
wer=./aishell_wer wer=./aishell_wer
export GLOG_logtostderr=1 export GLOG_logtostderr=1
# 3. gen linear feat
cmvn=$PWD/cmvn.ark
cmvn-json2kaldi --json_file=$ckpt_dir/data/mean_std.json --cmvn_write_path=$cmvn
if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
# 3. gen linear feat
cmvn=$data/cmvn.ark
cmvn-json2kaldi --json_file=$ckpt_dir/data/mean_std.json --cmvn_write_path=$cmvn
./local/split_data.sh $data $data/$aishell_wav_scp $aishell_wav_scp $nj ./local/split_data.sh $data $data/$aishell_wav_scp $aishell_wav_scp $nj
utils/run.pl JOB=1:$nj $data/split${nj}/JOB/feat.log \ utils/run.pl JOB=1:$nj $data/split${nj}/JOB/feat.log \
linear-spectrogram-wo-db-norm-ol \ linear-spectrogram-wo-db-norm-ol \
--wav_rspecifier=scp:$data/split${nj}/JOB/${aishell_wav_scp} \ --wav_rspecifier=scp:$data/split${nj}/JOB/${aishell_wav_scp} \
--feature_wspecifier=ark,scp:$data/split${nj}/JOB/feat.ark,$data/split${nj}/JOB/feat.scp \ --feature_wspecifier=ark,scp:$data/split${nj}/JOB/feat.ark,$data/split${nj}/JOB/feat.scp \
--cmvn_file=$cmvn \ --cmvn_file=$cmvn \
--streaming_chunk=0.36 --streaming_chunk=0.36
fi
text=$data/test/text
# 4. recognizer if [ $stage -le 2 ] && [ $stop_stage -ge 2 ];then
utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.wolm.log \ # recognizer
ctc-prefix-beam-search-decoder-ol \ utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.wolm.log \
--feature_rspecifier=scp:$data/split${nj}/JOB/feat.scp \ ctc-prefix-beam-search-decoder-ol \
--model_path=$model_dir/avg_1.jit.pdmodel \ --feature_rspecifier=scp:$data/split${nj}/JOB/feat.scp \
--param_path=$model_dir/avg_1.jit.pdiparams \ --model_path=$model_dir/avg_1.jit.pdmodel \
--model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \ --param_path=$model_dir/avg_1.jit.pdiparams \
--dict_file=$vocb_dir/vocab.txt \ --model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \
--result_wspecifier=ark,t:$data/split${nj}/JOB/result --dict_file=$vocb_dir/vocab.txt \
--result_wspecifier=ark,t:$data/split${nj}/JOB/result
cat $data/split${nj}/*/result > ${label_file}
utils/compute-wer.py --char=1 --v=1 ${label_file} $text > ${wer} cat $data/split${nj}/*/result > $exp/${label_file}
utils/compute-wer.py --char=1 --v=1 $exp/${label_file} $text > $exp/${wer}
# 4. decode with lm
utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.lm.log \
ctc-prefix-beam-search-decoder-ol \
--feature_rspecifier=scp:$data/split${nj}/JOB/feat.scp \
--model_path=$model_dir/avg_1.jit.pdmodel \
--param_path=$model_dir/avg_1.jit.pdiparams \
--model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \
--dict_file=$vocb_dir/vocab.txt \
--lm_path=$lm \
--result_wspecifier=ark,t:$data/split${nj}/JOB/result_lm
cat $data/split${nj}/*/result_lm > ${label_file}_lm
utils/compute-wer.py --char=1 --v=1 ${label_file}_lm $text > ${wer}_lm
graph_dir=./aishell_graph
if [ ! -d $ ]; then
wget -c https://paddlespeech.bj.bcebos.com/s2t/paddle_asr_online/aishell_graph.zip
unzip -d aishell_graph.zip
fi fi
if [ $stage -le 3 ] && [ $stop_stage -ge 3 ];then
# decode with lm
utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.lm.log \
ctc-prefix-beam-search-decoder-ol \
--feature_rspecifier=scp:$data/split${nj}/JOB/feat.scp \
--model_path=$model_dir/avg_1.jit.pdmodel \
--param_path=$model_dir/avg_1.jit.pdiparams \
--model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \
--dict_file=$vocb_dir/vocab.txt \
--lm_path=$lm \
--result_wspecifier=ark,t:$data/split${nj}/JOB/result_lm
cat $data/split${nj}/*/result_lm > $exp/${label_file}_lm
utils/compute-wer.py --char=1 --v=1 $exp/${label_file}_lm $text > $exp/${wer}_lm
fi
# 5. test TLG decoder
utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.wfst.log \
wfst-decoder-ol \
--feature_rspecifier=scp:$data/split${nj}/JOB/feat.scp \
--model_path=$model_dir/avg_1.jit.pdmodel \
--param_path=$model_dir/avg_1.jit.pdiparams \
--word_symbol_table=$graph_dir/words.txt \
--model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \
--graph_path=$graph_dir/TLG.fst --max_active=7500 \
--acoustic_scale=1.2 \
--result_wspecifier=ark,t:$data/split${nj}/JOB/result_tlg
wfst=$data/wfst/
mkdir -p $wfst
if [ ! -f $wfst/aishell_graph.zip ]; then
pushd $wfst
wget -c https://paddlespeech.bj.bcebos.com/s2t/paddle_asr_online/aishell_graph.zip
unzip aishell_graph.zip
popd
fi
cat $data/split${nj}/*/result_tlg > ${label_file}_tlg graph_dir=$wfst/aishell_graph
utils/compute-wer.py --char=1 --v=1 ${label_file}_tlg $text > ${wer}_tlg if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
\ No newline at end of file # TLG decoder
utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.wfst.log \
wfst-decoder-ol \
--feature_rspecifier=scp:$data/split${nj}/JOB/feat.scp \
--model_path=$model_dir/avg_1.jit.pdmodel \
--param_path=$model_dir/avg_1.jit.pdiparams \
--word_symbol_table=$graph_dir/words.txt \
--model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \
--graph_path=$graph_dir/TLG.fst --max_active=7500 \
--acoustic_scale=1.2 \
--result_wspecifier=ark,t:$data/split${nj}/JOB/result_tlg
cat $data/split${nj}/*/result_tlg > $exp/${label_file}_tlg
utils/compute-wer.py --char=1 --v=1 $exp/${label_file}_tlg $text > $exp/${wer}_tlg
fi
\ No newline at end of file
# N-gram LM Training for Mandarin
Quick run:
```
bash run.sh --stage -1
```
## input
input files:
```
data/
├── lexicon.txt
├── text
└── vocab.txt
```
```
==> data/text <==
BAC009S0002W0122 而 对 楼市 成交 抑制 作用 最 大 的 限 购
BAC009S0002W0123 也 成为 地方 政府 的 眼中 钉
BAC009S0002W0124 自 六月 底 呼和浩特 市 率先 宣布 取消 限 购 后
BAC009S0002W0125 各地 政府 便 纷纷 跟进
BAC009S0002W0126 仅 一 个 多 月 的 时间 里
BAC009S0002W0127 除了 北京 上海 广州 深圳 四 个 一 线 城市 和 三亚 之外
BAC009S0002W0128 四十六 个 限 购 城市 当中
BAC009S0002W0129 四十一 个 已 正式 取消 或 变相 放松 了 限 购
BAC009S0002W0130 财政 金融 政策 紧随 其后 而来
BAC009S0002W0131 显示 出 了 极 强 的 威力
==> data/lexicon.txt <==
SIL sil
<SPOKEN_NOISE> sil
啊 aa a1
啊 aa a2
啊 aa a4
啊 aa a5
啊啊啊 aa a2 aa a2 aa a2
啊啊啊 aa a5 aa a5 aa a5
坐地 z uo4 d i4
坐实 z uo4 sh ix2
坐视 z uo4 sh ix4
坐稳 z uo4 uu un3
坐拥 z uo4 ii iong1
坐诊 z uo4 zh en3
坐庄 z uo4 zh uang1
坐姿 z uo4 z iy1
==> data/vocab.txt <==
<blank>
<unk>
A
B
C
D
E
<eos>
```
## output
```
data/
├── local
│ ├── dict
│ │ ├── lexicon.txt
│ │ └── units.txt
│ └── lm
│ ├── heldout
│ ├── lm.arpa
│ ├── text
│ ├── text.no_oov
│ ├── train
│ ├── unigram.counts
│ ├── word.counts
│ └── wordlist
```
```
/workspace/srilm/bin/i686-m64/ngram-count
Namespace(bpemodel=None, in_lexicon='data/lexicon.txt', out_lexicon='data/local/dict/lexicon.txt', unit_file='data/vocab.txt')
Ignoring words 矽, which contains oov unit
Ignoring words 傩, which contains oov unit
Ignoring words 堀, which contains oov unit
Ignoring words 莼, which contains oov unit
Ignoring words 菰, which contains oov unit
Ignoring words 摭, which contains oov unit
Ignoring words 帙, which contains oov unit
Ignoring words 迨, which contains oov unit
Ignoring words 孥, which contains oov unit
Ignoring words 瑗, which contains oov unit
...
...
...
file data/local/lm/heldout: 10000 sentences, 89496 words, 0 OOVs
0 zeroprobs, logprob= -270337.9 ppl= 521.2819 ppl1= 1048.745
build LM done.
```
#!/bin/bash
# To be run from one directory above this script.
. ./path.sh
text=data/local/lm/text
lexicon=data/local/dict/lexicon.txt
for f in "$text" "$lexicon"; do
[ ! -f $f ] && echo "$0: No such file $f" && exit 1;
done
# Check SRILM tools
if ! which ngram-count > /dev/null; then
echo "srilm tools are not found, please download it and install it from: "
echo "http://www.speech.sri.com/projects/srilm/download.html"
echo "Then add the tools to your PATH"
exit 1
fi
# This script takes no arguments. It assumes you have already run
# aishell_data_prep.sh.
# It takes as input the files
# data/local/lm/text
# data/local/dict/lexicon.txt
dir=data/local/lm
mkdir -p $dir
cleantext=$dir/text.no_oov
# oov to <SPOKEN_NOISE>
# lexicon line: word char0 ... charn
# text line: utt word0 ... wordn -> line: <SPOKEN_NOISE> word0 ... wordn
cat $text | awk -v lex=$lexicon 'BEGIN{while((getline<lex) >0){ seen[$1]=1; } }
{for(n=1; n<=NF;n++) { if (seen[$n]) { printf("%s ", $n); } else {printf("<SPOKEN_NOISE> ");} } printf("\n");}' \
> $cleantext || exit 1;
# compute word counts, sort in descending order
# line: count word
cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | sort | uniq -c | \
sort -nr > $dir/word.counts || exit 1;
# Get counts from acoustic training transcripts, and add one-count
# for each word in the lexicon (but not silence, we don't want it
# in the LM-- we'll add it optionally later).
cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | \
cat - <(grep -w -v '!SIL' $lexicon | awk '{print $1}') | \
sort | uniq -c | sort -nr > $dir/unigram.counts || exit 1;
# word with <s> </s>
cat $dir/unigram.counts | awk '{print $2}' | cat - <(echo "<s>"; echo "</s>" ) > $dir/wordlist
# hold out to compute ppl
heldout_sent=10000 # Don't change this if you want result to be comparable with kaldi_lm results
mkdir -p $dir
cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n<NF) printf " "; else print ""; }}' | \
head -$heldout_sent > $dir/heldout
cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n<NF) printf " "; else print ""; }}' | \
tail -n +$heldout_sent > $dir/train
ngram-count -text $dir/train -order 3 -limit-vocab -vocab $dir/wordlist -unk \
-map-unk "<UNK>" -kndiscount -interpolate -lm $dir/lm.arpa
ngram -lm $dir/lm.arpa -ppl $dir/heldout
\ No newline at end of file
#!/usr/bin/env python3
import argparse
from collections import Counter
def main(args):
counter = Counter()
with open(args.text, 'r') as fin, open(args.lexicon, 'w') as fout:
for line in fin:
line = line.strip()
if args.has_key:
utt, text = line.split(maxsplit=1)
words = text.split()
else:
words = line.split()
counter.update(words)
for word in counter:
val = " ".join(list(word))
fout.write(f"{word}\t{val}\n")
fout.flush()
if __name__ == '__main__':
parser = argparse.ArgumentParser(
description='text(line:utt1 中国 人) to lexicon(line:中国 中 国).')
parser.add_argument(
'--has_key',
default=True,
help='whether each text line starts with an utt key')
parser.add_argument(
'--text',
required=True,
help='text path. line: utt1 中国 人 or 中国 人')
parser.add_argument(
'--lexicon',
required=True,
help='lexicon path. line:中国 中 国')
args = parser.parse_args()
print(args)
main(args)
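# Example: with --has_key true, an input line "utt1 中国 人" produces the
# lexicon entries "中国 -> 中 国" and "人 -> 人" (tab-separated in the file,
# each word split into its characters).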
# This contains the locations of the binaries required for running the examples.
MAIN_ROOT=`realpath $PWD/../../../../`
SPEECHX_ROOT=`realpath $MAIN_ROOT/speechx`
export LC_ALL=C
# srilm
export LIBLBFGS=${MAIN_ROOT}/tools/liblbfgs-1.10
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-}:${LIBLBFGS}/lib/.libs
export SRILM=${MAIN_ROOT}/tools/srilm
export PATH=${PATH}:${SRILM}/bin:${SRILM}/bin/i686-m64
#!/bin/bash
set -eo pipefail
. path.sh
stage=-1
stop_stage=100
corpus=aishell
unit=data/vocab.txt # vocab file, line: char/spm_pice
lexicon=data/lexicon.txt # line: word ph0 ... phn, aishell/resource_aishell/lexicon.txt
text=data/text # line: utt text, aishell/data_aishell/transcript/aishell_transcript_v0.8.txt
. utils/parse_options.sh
data=$PWD/data
mkdir -p $data
if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then
if [ ! -f $data/speech.ngram.zh.tar.gz ];then
pushd $data
wget -c http://paddlespeech.bj.bcebos.com/speechx/examples/ngram/zh/speech.ngram.zh.tar.gz
tar xvzf speech.ngram.zh.tar.gz
popd
fi
fi
if [ ! -f $unit ]; then
echo "$0: No such file $unit"
exit 1;
fi
if ! which ngram-count; then
pushd $MAIN_ROOT/tools
make srilm.done
popd
fi
mkdir -p data/local/dict
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# 7.1 Prepare dict
# line: char/spm_pices
cp $unit data/local/dict/units.txt
if [ ! -f $lexicon ];then
local/text_to_lexicon.py --has_key true --text $text --lexicon $lexicon
echo "Generate $lexicon from $text"
fi
# filter by vocab
# line: word ph0 ... phn -> line: word char0 ... charn
utils/fst/prepare_dict.py \
--unit_file $unit \
--in_lexicon ${lexicon} \
--out_lexicon data/local/dict/lexicon.txt
fi
lm=data/local/lm
mkdir -p $lm
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# 7.2 Train lm
cp $text $lm/text
local/aishell_train_lms.sh
fi
echo "build LM done."
exit 0
../../../../utils/
\ No newline at end of file
# Text Preprocessing for Building the N-gram LM
The output `text` file looks like this:
```
BAC009S0002W0122 而 对 楼市 成交 抑制 作用 最 大 的 限 购
BAC009S0002W0123 也 成为 地方 政府 的 眼中 钉
BAC009S0002W0124 自 六月 底 呼和浩特 市 率先 宣布 取消 限 购 后
BAC009S0002W0125 各地 政府 便 纷纷 跟进
BAC009S0002W0126 仅 一 个 多 月 的 时间 里
BAC009S0002W0127 除了 北京 上海 广州 深圳 四 个 一 线 城市 和 三亚 之外
BAC009S0002W0128 四十六 个 限 购 城市 当中
BAC009S0002W0129 四十一 个 已 正式 取消 或 变相 放松 了 限 购
BAC009S0002W0130 财政 金融 政策 紧随 其后 而来
```
MAIN_ROOT=`realpath $PWD/../../../../`
SPEECHX_ROOT=`realpath $MAIN_ROOT/speechx`
export LC_ALL=C
#!/bin/bash
set -eo pipefail
. path.sh
stage=0
stop_stage=100
has_key=true
token_type=word
. utils/parse_options.sh || exit -1;
text=data/text
if [ ! -f $text ]; then
    echo "$0: No such file $text";
    exit -1;
fi
if [ $stage -le 0 ] && [ $stop_stage -ge 0 ];then
echo "text tn & wordseg preprocess"
rm -rf ${text}.tn
python3 utils/zh_tn.py --has_key $has_key --token_type $token_type ${text} ${text}.tn
fi
\ No newline at end of file
../../../utils/
\ No newline at end of file
# Build TLG WFST
## Input
```
data/local/
├── dict
│ ├── lexicon.txt
│ └── units.txt
└── lm
├── heldout
├── lm.arpa
├── text
├── text.no_oov
├── train
├── unigram.counts
├── word.counts
└── wordlist
```
```
==> data/local/dict/lexicon.txt <==
啊 啊
啊啊啊 啊 啊 啊
阿 阿
阿尔 阿 尔
阿根廷 阿 根 廷
阿九 阿 九
阿克 阿 克
阿拉伯数字 阿 拉 伯 数 字
阿拉法特 阿 拉 法 特
阿拉木图 阿 拉 木 图
==> data/local/dict/units.txt <==
<blank>
<unk>
A
B
C
D
E
F
G
H
==> data/local/lm/heldout <==
而 对 楼市 成交 抑制 作用 最 大 的 限 购
也 成为 地方 政府 的 眼中 钉
自 六月 底 呼和浩特 市 率先 宣布 取消 限 购 后
各地 政府 便 纷纷 跟进
仅 一 个 多 月 的 时间 里
除了 北京 上海 广州 深圳 四 个 一 线 城市 和 三亚 之外
四十六 个 限 购 城市 当中
四十一 个 已 正式 取消 或 变相 放松 了 限 购
财政 金融 政策 紧随 其后 而来
显示 出 了 极 强 的 威力
==> data/local/lm/lm.arpa <==
\data\
ngram 1=129356
ngram 2=504661
ngram 3=123455
\1-grams:
-1.531278 </s>
-3.828829 <SPOKEN_NOISE> -0.1600094
-6.157292 <UNK>
==> data/local/lm/text <==
BAC009S0002W0122 而 对 楼市 成交 抑制 作用 最 大 的 限 购
BAC009S0002W0123 也 成为 地方 政府 的 眼中 钉
BAC009S0002W0124 自 六月 底 呼和浩特 市 率先 宣布 取消 限 购 后
BAC009S0002W0125 各地 政府 便 纷纷 跟进
BAC009S0002W0126 仅 一 个 多 月 的 时间 里
BAC009S0002W0127 除了 北京 上海 广州 深圳 四 个 一 线 城市 和 三亚 之外
BAC009S0002W0128 四十六 个 限 购 城市 当中
BAC009S0002W0129 四十一 个 已 正式 取消 或 变相 放松 了 限 购
BAC009S0002W0130 财政 金融 政策 紧随 其后 而来
BAC009S0002W0131 显示 出 了 极 强 的 威力
==> data/local/lm/text.no_oov <==
<SPOKEN_NOISE> 而 对 楼市 成交 抑制 作用 最 大 的 限 购
<SPOKEN_NOISE> 也 成为 地方 政府 的 眼中 钉
<SPOKEN_NOISE> 自 六月 底 呼和浩特 市 率先 宣布 取消 限 购 后
<SPOKEN_NOISE> 各地 政府 便 纷纷 跟进
<SPOKEN_NOISE> 仅 一 个 多 月 的 时间 里
<SPOKEN_NOISE> 除了 北京 上海 广州 深圳 四 个 一 线 城市 和 三亚 之外
<SPOKEN_NOISE> 四十六 个 限 购 城市 当中
<SPOKEN_NOISE> 四十一 个 已 正式 取消 或 变相 放松 了 限 购
<SPOKEN_NOISE> 财政 金融 政策 紧随 其后 而来
<SPOKEN_NOISE> 显示 出 了 极 强 的 威力
==> data/local/lm/train <==
汉莎 不 得 不 通过 这样 的 方式 寻求 新 的 发展 点
并 计划 朝云 计算 方面 发展
汉莎 的 基础 设施 部门 拥有 一千四百 名 员工
媒体 就 曾 披露 这笔 交易
虽然 双方 已经 正式 签署 了 外包 协议
但是 这笔 交易 还 需要 得到 反 垄断 部门 的 批准
陈 黎明 一九八九 年 获得 美国 康乃尔 大学 硕士 学位
并 于 二零零三 年 顺利 完成 美国 哈佛 商学 院 高级 管理 课程
曾 在 多家 国际 公司 任职
拥有 业务 开发 商务 及 企业 治理
==> data/local/lm/unigram.counts <==
57487 的
13099 在
11862 一
11397 了
10998 不
9913 是
7952 有
6250 和
6152 个
5422 将
==> data/local/lm/word.counts <==
57486 的
13098 在
11861 一
11396 了
10997 不
9912 是
7951 有
6249 和
6151 个
5421 将
==> data/local/lm/wordlist <==
```
## Output
```
fstaddselfloops 'echo 4234 |' 'echo 123660 |'
Lexicon and Token FSTs compiling succeeded
arpa2fst --read-symbol-table=data/lang_test/words.txt --keep-symbols=true -
LOG (arpa2fst[5.5.0~1-5a37]:Read():arpa-file-parser.cc:94) Reading \data\ section.
LOG (arpa2fst[5.5.0~1-5a37]:Read():arpa-file-parser.cc:149) Reading \1-grams: section.
LOG (arpa2fst[5.5.0~1-5a37]:Read():arpa-file-parser.cc:149) Reading \2-grams: section.
LOG (arpa2fst[5.5.0~1-5a37]:Read():arpa-file-parser.cc:149) Reading \3-grams: section.
Checking how stochastic G is (the first of these numbers should be small):
fstisstochastic data/lang_test/G.fst
0 -1.14386
fsttablecompose data/lang_test/L.fst data/lang_test/G.fst
fstminimizeencoded
fstdeterminizestar --use-log=true
fsttablecompose data/lang_test/T.fst data/lang_test/LG.fst
Composing decoding graph TLG.fst succeeded
Aishell build TLG done.
```
```
data/
├── lang_test
│ ├── G.fst
│ ├── L.fst
│ ├── LG.fst
│ ├── T.fst
│ ├── TLG.fst
│ ├── tokens.txt
│ ├── units.txt
│ └── words.txt
└── local
├── lang
│ ├── L.fst
│ ├── T.fst
│ ├── tokens.txt
│ ├── units.txt
│ └── words.txt
└── tmp
├── disambig.list
├── lexiconp_disambig.txt
├── lexiconp.txt
└── units.list
```
\ No newline at end of file
# This contains the locations of the binaries required for running the examples.
MAIN_ROOT=`realpath $PWD/../../../`
SPEECHX_ROOT=`realpath $MAIN_ROOT/speechx`
export LC_ALL=C
# srilm
export LIBLBFGS=${MAIN_ROOT}/tools/liblbfgs-1.10
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-}:${LIBLBFGS}/lib/.libs
export SRILM=${MAIN_ROOT}/tools/srilm
export PATH=${PATH}:${SRILM}/bin:${SRILM}/bin/i686-m64
# Kaldi
export KALDI_ROOT=${MAIN_ROOT}/tools/kaldi
[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh
export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH
[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present, cannot use Kaldi!"
[ -f $KALDI_ROOT/tools/config/common_path.sh ] && . $KALDI_ROOT/tools/config/common_path.sh
#!/bin/bash
set -eo pipefail
. path.sh
stage=-1
stop_stage=100
. utils/parse_options.sh
if ! which fstprint ; then
pushd $MAIN_ROOT/tools
make kaldi.done
popd
fi
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# build T & L
# utils/fst/compile_lexicon_token_fst.sh <dict-src-dir> <tmp-dir> <lang-dir>
utils/fst/compile_lexicon_token_fst.sh \
data/local/dict data/local/tmp data/local/lang
# build G & LG & TLG
# utils/fst/make_tlg.sh <lm_dir> <src_lang> <tgt_lang>
utils/fst/make_tlg.sh data/local/lm data/local/lang data/lang_test || exit 1;
fi
echo "build TLG done."
exit 0
../../../utils/
\ No newline at end of file
This diff has been collapsed.
#!/bin/bash #!/bin/bash
set -e set -e
# Audio classification # Audio classification
wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/cat.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/dog.wav wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/cat.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/dog.wav
paddlespeech cls --input ./cat.wav --topk 10 paddlespeech cls --input ./cat.wav --topk 10
...@@ -28,26 +29,16 @@ paddlespeech tts --am tacotron2_csmsc --input "你好,欢迎使用百度飞桨 ...@@ -28,26 +29,16 @@ paddlespeech tts --am tacotron2_csmsc --input "你好,欢迎使用百度飞桨
paddlespeech tts --am tacotron2_csmsc --voc wavernn_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!" paddlespeech tts --am tacotron2_csmsc --voc wavernn_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!"
paddlespeech tts --am tacotron2_ljspeech --voc pwgan_ljspeech --lang en --input "Life was like a box of chocolates, you never know what you're gonna get." paddlespeech tts --am tacotron2_ljspeech --voc pwgan_ljspeech --lang en --input "Life was like a box of chocolates, you never know what you're gonna get."
# Speech Translation (only supports Linux) # Speech Translation (only supports Linux)
paddlespeech st --input ./en.wav paddlespeech st --input ./en.wav
# batch process
echo -e "1 欢迎光临。\n2 谢谢惠顾。" | paddlespeech tts
# shell pipeline
paddlespeech asr --input ./zh.wav | paddlespeech text --task punc
# stats
paddlespeech stats --task asr
paddlespeech stats --task tts
paddlespeech stats --task cls
# Speaker Verification # Speaker Verification
wget -c https://paddlespeech.bj.bcebos.com/vector/audio/85236145389.wav wget -c https://paddlespeech.bj.bcebos.com/vector/audio/85236145389.wav
paddlespeech vector --task spk --input 85236145389.wav paddlespeech vector --task spk --input 85236145389.wav
# batch process
echo -e "1 欢迎光临。\n2 谢谢惠顾。" | paddlespeech tts
echo -e "demo1 85236145389.wav \n demo2 85236145389.wav" > vec.job echo -e "demo1 85236145389.wav \n demo2 85236145389.wav" > vec.job
paddlespeech vector --task spk --input vec.job paddlespeech vector --task spk --input vec.job
...@@ -55,4 +46,13 @@ echo -e "demo3 85236145389.wav \n demo4 85236145389.wav" | paddlespeech vector - ...@@ -55,4 +46,13 @@ echo -e "demo3 85236145389.wav \n demo4 85236145389.wav" | paddlespeech vector -
rm 85236145389.wav rm 85236145389.wav
rm vec.job rm vec.job
# shell pipeline
paddlespeech asr --input ./zh.wav | paddlespeech text --task punc
# stats
paddlespeech stats --task asr
paddlespeech stats --task tts
paddlespeech stats --task cls
paddlespeech stats --task text
paddlespeech stats --task vector
paddlespeech stats --task st
This diff has been collapsed.
This diff has been collapsed.
This diff has been collapsed.
File mode changed from 100644 to 100755
This diff has been collapsed.
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
This diff has been collapsed.
This diff has been collapsed.