diff --git a/examples/csmsc/vits/local/export2lite.sh b/examples/csmsc/vits/local/export2lite.sh
new file mode 120000
index 0000000000000000000000000000000000000000..402fd8334571997359f63c79096da0288175c19c
--- /dev/null
+++ b/examples/csmsc/vits/local/export2lite.sh
@@ -0,0 +1 @@
+../../tts3/local/export2lite.sh
\ No newline at end of file
diff --git a/examples/csmsc/vits/local/inference.sh b/examples/csmsc/vits/local/inference.sh
new file mode 100755
index 0000000000000000000000000000000000000000..0a79c255ca35edda43e846c995c0cc7d7c8685fd
--- /dev/null
+++ b/examples/csmsc/vits/local/inference.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+
+train_output_path=$1
+add_blank=$2
+
+stage=0
+stop_stage=0
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    python3 ${BIN_DIR}/inference.py \
+        --inference_dir=${train_output_path}/inference \
+        --am=vits_csmsc \
+        --text=${BIN_DIR}/../sentences.txt \
+        --output_dir=${train_output_path}/pd_infer_out \
+        --phones_dict=dump/phone_id_map.txt \
+        --add-blank=${add_blank}
+fi
\ No newline at end of file
diff --git a/examples/csmsc/vits/local/lite_predict.sh b/examples/csmsc/vits/local/lite_predict.sh
new file mode 100755
index 0000000000000000000000000000000000000000..9ed57b727326162d2d3155e059c1822ee9225b0d
--- /dev/null
+++ b/examples/csmsc/vits/local/lite_predict.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+train_output_path=$1
+add_blank=$2
+
+stage=0
+stop_stage=0
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    python3 ${BIN_DIR}/../lite_predict.py \
+        --inference_dir=${train_output_path}/pdlite \
+        --am=vits_csmsc \
+        --text=${BIN_DIR}/../sentences.txt \
+        --output_dir=${train_output_path}/lite_infer_out \
+        --phones_dict=dump/phone_id_map.txt \
+        --add-blank=${add_blank}
+fi
+
diff --git a/examples/csmsc/vits/run.sh b/examples/csmsc/vits/run.sh
index 74505d9b926cf5cdf721bffe49034fa49fa73c07..ac190bfa89b23277aa528e667200b7b3c9457f9d 100755
--- a/examples/csmsc/vits/run.sh
+++ b/examples/csmsc/vits/run.sh
@@ -35,3 +35,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
     # synthesize_e2e, vocoder is pwgan
     CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} ${add_blank}|| exit -1
 fi
+
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/inference.sh ${train_output_path} ${add_blank}|| exit -1
+fi
diff --git a/paddlespeech/t2s/exps/lite_syn_utils.py b/paddlespeech/t2s/exps/lite_syn_utils.py
index 2c67edae6c06f24c220e902f05289b7c3bbbb34c..65783e1a9d7f1ee9248c3965c77d6d07ab80e765 100644
--- a/paddlespeech/t2s/exps/lite_syn_utils.py
+++ b/paddlespeech/t2s/exps/lite_syn_utils.py
@@ -19,15 +19,15 @@ def get_lite_predictor(model_dir: Optional[os.PathLike]=None,
     return predictor
 
 
-def get_lite_am_output(
-        input: str,
-        am_predictor,
-        am: str,
-        frontend: object,
-        lang: str='zh',
-        merge_sentences: bool=True,
-        speaker_dict: Optional[os.PathLike]=None,
-        spk_id: int=0, ):
+def get_lite_am_output(input: str,
+                       am_predictor,
+                       am: str,
+                       frontend: object,
+                       lang: str='zh',
+                       merge_sentences: bool=True,
+                       speaker_dict: Optional[os.PathLike]=None,
+                       spk_id: int=0,
+                       add_blank: bool=False):
     am_name = am[:am.rindex('_')]
     am_dataset = am[am.rindex('_') + 1:]
     get_spk_id = False
@@ -43,7 +43,8 @@ def get_lite_am_output(
         text=input,
         merge_sentences=merge_sentences,
         get_tone_ids=get_tone_ids,
-        lang=lang)
+        lang=lang,
+        add_blank=add_blank, )
     if get_tone_ids:
         tone_ids = frontend_dict['tone_ids']
 
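The new `add_blank` parameter threads through to the frontend's `get_input_ids()`; in VITS-style recipes it intersperses a blank token between consecutive phone ids, matching what the model saw during training. A minimal sketch of that interspersing step — the helper name and blank id are illustrative, not the repo's actual frontend code:

```python
# Illustrative sketch of what "add blank between phones" typically means in
# VITS-style frontends; the real logic lives inside the text frontend's
# get_input_ids(), and the names here are hypothetical.
def intersperse(phone_ids, blank_id=0):
    # [p1, p2, p3] -> [blank, p1, blank, p2, blank, p3, blank]
    result = [blank_id] * (len(phone_ids) * 2 + 1)
    result[1::2] = phone_ids
    return result

assert intersperse([7, 8, 9]) == [0, 7, 0, 8, 0, 9, 0]
```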
diff --git a/paddlespeech/t2s/exps/syn_utils.py b/paddlespeech/t2s/exps/syn_utils.py
index ebe104c1fb667ce985965fc2c4ec73aa6a02c1ed..12b75615e8f8f7dabada50f888627f42a2292877 100644
--- a/paddlespeech/t2s/exps/syn_utils.py
+++ b/paddlespeech/t2s/exps/syn_utils.py
@@ -284,7 +284,8 @@ def run_frontend(frontend: object,
                  merge_sentences: bool=False,
                  get_tone_ids: bool=False,
                  lang: str='zh',
-                 to_tensor: bool=True):
+                 to_tensor: bool=True,
+                 add_blank: bool=False):
     outs = dict()
     if lang == 'zh':
         input_ids = {}
@@ -300,7 +301,8 @@ def run_frontend(frontend: object,
             text,
             merge_sentences=merge_sentences,
             get_tone_ids=get_tone_ids,
-            to_tensor=to_tensor)
+            to_tensor=to_tensor,
+            add_blank=add_blank)
         phone_ids = input_ids["phone_ids"]
         if get_tone_ids:
             tone_ids = input_ids["tone_ids"]
@@ -576,15 +578,15 @@ def get_predictor(
     return predictor
 
 
-def get_am_output(
-        input: str,
-        am_predictor: paddle.nn.Layer,
-        am: str,
-        frontend: object,
-        lang: str='zh',
-        merge_sentences: bool=True,
-        speaker_dict: Optional[os.PathLike]=None,
-        spk_id: int=0, ):
+def get_am_output(input: str,
+                  am_predictor: paddle.nn.Layer,
+                  am: str,
+                  frontend: object,
+                  lang: str='zh',
+                  merge_sentences: bool=True,
+                  speaker_dict: Optional[os.PathLike]=None,
+                  spk_id: int=0,
+                  add_blank: bool=False):
     am_name = am[:am.rindex('_')]
     am_dataset = am[am.rindex('_') + 1:]
     am_input_names = am_predictor.get_input_names()
@@ -601,7 +603,8 @@ def get_am_output(
         text=input,
         merge_sentences=merge_sentences,
         get_tone_ids=get_tone_ids,
-        lang=lang)
+        lang=lang,
+        add_blank=add_blank, )
     if get_tone_ids:
         tone_ids = frontend_dict['tone_ids']
 
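With the changes above, `run_frontend()` simply forwards the flag to `frontend.get_input_ids()`. A sketch of a call site exercising the new keyword, with names taken from the signature in this diff (the `frontend` object and the input text are placeholders):

```python
# Sketch of calling the extended run_frontend(); keyword names follow the
# signature in the diff above. `frontend` is assumed to come from
# get_frontend(lang='zh', phones_dict=...).
frontend_dict = run_frontend(
    frontend=frontend,
    text="你好。",  # placeholder input
    merge_sentences=True,
    get_tone_ids=False,
    lang='zh',
    to_tensor=True,
    add_blank=True)  # new flag; defaults to False
phone_ids = frontend_dict["phone_ids"]
```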
diff --git a/paddlespeech/t2s/exps/vits/inference.py b/paddlespeech/t2s/exps/vits/inference.py
new file mode 100644
index 0000000000000000000000000000000000000000..08c1ac566db5798a905437cf8a62b576512741f1
--- /dev/null
+++ b/paddlespeech/t2s/exps/vits/inference.py
@@ -0,0 +1,174 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+from pathlib import Path
+
+import paddle
+import soundfile as sf
+from timer import timer
+
+from paddlespeech.t2s.exps.syn_utils import get_am_output
+from paddlespeech.t2s.exps.syn_utils import get_frontend
+from paddlespeech.t2s.exps.syn_utils import get_predictor
+from paddlespeech.t2s.exps.syn_utils import get_sentences
+from paddlespeech.t2s.utils import str2bool
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description="Paddle Inference with acoustic model & vocoder.")
+    # acoustic model
+    parser.add_argument(
+        '--am',
+        type=str,
+        default='vits_csmsc',
+        choices=['vits_csmsc', 'vits_aishell3'],
+        help='Choose acoustic model type of tts task.')
+    parser.add_argument(
+        "--phones_dict", type=str, default=None, help="phone vocabulary file.")
+    parser.add_argument(
+        "--speaker_dict", type=str, default=None, help="speaker id map file.")
+    parser.add_argument(
+        '--spk_id',
+        type=int,
+        default=0,
+        help='spk id for multi speaker acoustic model')
+    # other
+    parser.add_argument(
+        '--lang',
+        type=str,
+        default='zh',
+        help='Choose model language. zh or en or mix')
+    parser.add_argument(
+        "--text",
+        type=str,
+        help="text to synthesize, a 'utt_id sentence' pair per line")
+    parser.add_argument(
+        "--add-blank",
+        type=str2bool,
+        default=True,
+        help="whether to add blank between phones")
+    parser.add_argument(
+        "--inference_dir", type=str, help="dir to save inference models")
+    parser.add_argument("--output_dir", type=str, help="output dir")
+    # inference
+    parser.add_argument(
+        "--use_trt",
+        type=str2bool,
+        default=False,
+        help="whether to use TensorRT or not in GPU", )
+    parser.add_argument(
+        "--use_mkldnn",
+        type=str2bool,
+        default=False,
+        help="whether to use MKLDNN or not in CPU.", )
+    parser.add_argument(
+        "--precision",
+        type=str,
+        default='fp32',
+        choices=['fp32', 'fp16', 'bf16', 'int8'],
+        help="mode of running")
+    parser.add_argument(
+        "--device",
+        default="gpu",
+        choices=["gpu", "cpu"],
+        help="Device selected for inference.", )
+    parser.add_argument('--cpu_threads', type=int, default=1)
+
+    args, _ = parser.parse_known_args()
+    return args
+
+
+# only inference for models trained with csmsc now
+def main():
+    args = parse_args()
+
+    paddle.set_device(args.device)
+
+    # frontend
+    frontend = get_frontend(lang=args.lang, phones_dict=args.phones_dict)
+
+    # am_predictor
+    am_predictor = get_predictor(
+        model_dir=args.inference_dir,
+        model_file=args.am + ".pdmodel",
+        params_file=args.am + ".pdiparams",
+        device=args.device,
+        use_trt=args.use_trt,
+        use_mkldnn=args.use_mkldnn,
+        cpu_threads=args.cpu_threads,
+        precision=args.precision)
+    # model: {model_name}_{dataset}
+    am_dataset = args.am[args.am.rindex('_') + 1:]
+
+    output_dir = Path(args.output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    sentences = get_sentences(text_file=args.text, lang=args.lang)
+
+    merge_sentences = True
+    add_blank = args.add_blank
+    # vits's fs is 22050
+    fs = 22050
+    # warmup
+    for utt_id, sentence in sentences[:3]:
+        with timer() as t:
+            wav = get_am_output(
+                input=sentence,
+                am_predictor=am_predictor,
+                am=args.am,
+                frontend=frontend,
+                lang=args.lang,
+                merge_sentences=merge_sentences,
+                speaker_dict=args.speaker_dict,
+                spk_id=args.spk_id,
+                add_blank=add_blank)
+        speed = wav.size / t.elapse
+        rtf = fs / speed
+        print(
+            f"{utt_id}, wave: {wav.shape}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}."
+        )
+
+    print("warm up done!")
+
+    N = 0
+    T = 0
+    for utt_id, sentence in sentences:
+        with timer() as t:
+            wav = get_am_output(
+                input=sentence,
+                am_predictor=am_predictor,
+                am=args.am,
+                frontend=frontend,
+                lang=args.lang,
+                merge_sentences=merge_sentences,
+                speaker_dict=args.speaker_dict,
+                spk_id=args.spk_id,
+                add_blank=add_blank)
+
+        N += wav.size
+        T += t.elapse
+        speed = wav.size / t.elapse
+        rtf = fs / speed
+        sf.write(output_dir / (utt_id + ".wav"), wav, samplerate=fs)
+        print(
+            f"{utt_id}, wave: {wav.shape}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}."
+        )
+
+        print(f"{utt_id} done!")
+    print(f"generation speed: {N / T}Hz, RTF: {fs / (N / T) }")
+
+
+if __name__ == "__main__":
+    main()
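The speed and RTF figures that `inference.py` prints follow directly from the waveform length and the timer: speed is samples generated per second of wall time, and RTF is synthesis time divided by audio duration. A small arithmetic check, with all values invented for illustration:

```python
# Pure arithmetic behind the script's logging; the numbers here are invented.
fs = 22050                 # VITS output sample rate, as set in the script
wav_size = 66150           # a hypothetical 3.0 s waveform (66150 / 22050)
elapse = 0.5               # hypothetical synthesis wall time in seconds
speed = wav_size / elapse  # samples per wall-clock second: 132300.0
rtf = fs / speed           # ~0.167; RTF < 1 means faster than real time
assert abs(rtf - elapse / (wav_size / fs)) < 1e-12
```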
diff --git a/paddlespeech/t2s/exps/vits/lite_predict.py b/paddlespeech/t2s/exps/vits/lite_predict.py
new file mode 100644
index 0000000000000000000000000000000000000000..790cd48e3d307246455ccbfeec53006419bdf64f
--- /dev/null
+++ b/paddlespeech/t2s/exps/vits/lite_predict.py
@@ -0,0 +1,147 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+from pathlib import Path
+
+import soundfile as sf
+from timer import timer
+
+from paddlespeech.t2s.exps.lite_syn_utils import get_lite_am_output
+from paddlespeech.t2s.exps.lite_syn_utils import get_lite_predictor
+from paddlespeech.t2s.exps.syn_utils import get_frontend
+from paddlespeech.t2s.exps.syn_utils import get_sentences
+from paddlespeech.t2s.utils import str2bool
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description="Paddle Inference with acoustic model & vocoder.")
+    # acoustic model
+    parser.add_argument(
+        '--am',
+        type=str,
+        default='vits_csmsc',
+        choices=[
+            'vits_csmsc',
+            'vits_aishell3',
+        ],
+        help='Choose acoustic model type of tts task.')
+    parser.add_argument(
+        "--phones_dict", type=str, default=None, help="phone vocabulary file.")
+    parser.add_argument(
+        "--speaker_dict", type=str, default=None, help="speaker id map file.")
+    parser.add_argument(
+        '--spk_id',
+        type=int,
+        default=0,
+        help='spk id for multi speaker acoustic model')
+    # other
+    parser.add_argument(
+        '--lang',
+        type=str,
+        default='zh',
+        help='Choose model language. zh or en or mix')
+    parser.add_argument(
+        "--text",
+        type=str,
+        help="text to synthesize, a 'utt_id sentence' pair per line")
+    parser.add_argument(
+        "--add-blank",
+        type=str2bool,
+        default=True,
+        help="whether to add blank between phones")
+    parser.add_argument(
+        "--inference_dir", type=str, help="dir to save inference models")
+    parser.add_argument("--output_dir", type=str, help="output dir")
+
+    args, _ = parser.parse_known_args()
+    return args
+
+
+# only inference for models trained with csmsc now
+def main():
+    args = parse_args()
+
+    # frontend
+    frontend = get_frontend(
+        lang=args.lang,
+        phones_dict=args.phones_dict)
+
+    # am_predictor
+    am_predictor = get_lite_predictor(
+        model_dir=args.inference_dir, model_file=args.am + "_x86.nb")
+    # model: {model_name}_{dataset}
+    am_dataset = args.am[args.am.rindex('_') + 1:]
+
+    output_dir = Path(args.output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    sentences = get_sentences(text_file=args.text, lang=args.lang)
+
+    merge_sentences = True
+    add_blank = args.add_blank
+    fs = 22050
+    # warmup
+    for utt_id, sentence in sentences[:3]:
+        with timer() as t:
+            wav = get_lite_am_output(
+                input=sentence,
+                am_predictor=am_predictor,
+                am=args.am,
+                frontend=frontend,
+                lang=args.lang,
+                merge_sentences=merge_sentences,
+                speaker_dict=args.speaker_dict,
+                spk_id=args.spk_id,
+                add_blank=add_blank)
+
+        speed = wav.size / t.elapse
+        rtf = fs / speed
+        print(
+            f"{utt_id}, wave: {wav.shape}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}."
+        )
+
+    print("warm up done!")
+
+    N = 0
+    T = 0
+    for utt_id, sentence in sentences:
+        with timer() as t:
+            wav = get_lite_am_output(
+                input=sentence,
+                am_predictor=am_predictor,
+                am=args.am,
+                frontend=frontend,
+                lang=args.lang,
+                merge_sentences=merge_sentences,
+                speaker_dict=args.speaker_dict,
+                spk_id=args.spk_id,
+                add_blank=add_blank)
+
+        N += wav.size
+        T += t.elapse
+        speed = wav.size / t.elapse
+        rtf = fs / speed
+
+        sf.write(output_dir / (utt_id + ".wav"), wav, samplerate=fs)
+        print(
+            f"{utt_id}, wave: {wav.shape}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}."
+        )
+
+        print(f"{utt_id} done!")
+    print(f"generation speed: {N / T}Hz, RTF: {fs / (N / T) }")
+
+
+if __name__ == "__main__":
+    main()
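`lite_predict.py` expects an optimized Paddle-Lite model named `{am}_x86.nb` (e.g. `vits_csmsc_x86.nb`, as produced by `export2lite.sh`) under `--inference_dir`. A minimal sketch of how such a model is loaded, assuming Paddle-Lite's Python `MobileConfig` API; the path is illustrative, and the actual loading code lives in `paddlespeech/t2s/exps/lite_syn_utils.py`:

```python
# Minimal sketch, assuming the paddlelite.lite Python API; treat the path
# as a placeholder. lite_predict.sh passes ${train_output_path}/pdlite
# as --inference_dir.
from paddlelite.lite import MobileConfig, create_paddle_predictor

config = MobileConfig()
config.set_model_from_file("exp/default/pdlite/vits_csmsc_x86.nb")
predictor = create_paddle_predictor(config)
```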