...
 
Commits (16)
- https://gitcode.net/paddlepaddle/DeepSpeech/-/commit/1f7eabee0f50f3b15eacb3b1fddc85f25edf7061 Update phonecode.py (2023-05-24T11:06:26+08:00, shuishu <990941859@qq.com>). Body (translated from Chinese): fix an error in the landline-number regex, following https://github.com/speechio/chinese_text_normalization/blob/master/python/cn_tn.py; the landline regex becomes `pattern = re.compile(r"\D((0(10|2[1-3]|[3-9]\d{2})-?)?[1-9]\d{6,7})\D")`
- https://gitcode.net/paddlepaddle/DeepSpeech/-/commit/83d93da8d023a7df319f2911af72f150b53f7807 add scripts for tts code switch (2023-06-02T07:12:17+00:00, Hui Zhang <zhtclz@foxmail.com>)
- https://gitcode.net/paddlepaddle/DeepSpeech/-/commit/6b4d1f80ac34fe855a458c803664ee924c704d1e add t2s assets (2023-06-02T07:50:40+00:00, Hui Zhang <zhtclz@foxmail.com>)
- https://gitcode.net/paddlepaddle/DeepSpeech/-/commit/46de1b03796f34e4edbc755fbc89d8cd37c48153 Merge pull request #3268 from shuishu/patch-1 (2023-06-06T11:54:17+08:00, Hui Zhang <zhtclz@foxmail.com>): [TTS][tn]Update phonecode.py
- https://gitcode.net/paddlepaddle/DeepSpeech/-/commit/8aa9790c7518e7857fd2b8a894284cc24a9de51a Merge pull request #3305 from zh794390558/tts (2023-06-06T15:14:58+08:00, Hui Zhang <zhtclz@foxmail.com>): [t2s] add assets and tts codeswitch scripts
- https://gitcode.net/paddlepaddle/DeepSpeech/-/commit/42f2186d71c047938d0c94c2b7dfc797ee92f5c6 more comment on tts frontend (2023-06-07T07:47:12+00:00, Hui Zhang <zhtclz@foxmail.com>)
- https://gitcode.net/paddlepaddle/DeepSpeech/-/commit/558cfc72334b92e1fa8aa619258f914d704cd1ab fix librosa==0.8.1 numpy==1.23.5 for paddleaudio align with this version (2023-06-07T07:48:10+00:00, Hui Zhang <zhtclz@foxmail.com>)
- https://gitcode.net/paddlepaddle/DeepSpeech/-/commit/4d867700ebca854fd1d375f98a5919aa7f5227ea move ssl into t2s.frontend; fix spk_id for 0-D tensor (2023-06-07T07:49:38+00:00, Hui Zhang <zhtclz@foxmail.com>)
- https://gitcode.net/paddlepaddle/DeepSpeech/-/commit/9727e67a3fbc2779a64ae3372fb0fbd79edefe24 add ssml unit test (2023-06-07T09:37:08+00:00, Hui Zhang <zhtclz@foxmail.com>)
- https://gitcode.net/paddlepaddle/DeepSpeech/-/commit/40124ed34fa556f301dff1900f6e157ab81743eb add en_frontend file (2023-06-07T11:37:30+00:00, Hui Zhang <zhtclz@foxmail.com>)
- https://gitcode.net/paddlepaddle/DeepSpeech/-/commit/108e73e1a0e8d2836972c87c043b7b1d2c22203e add mix frontend test (2023-06-08T03:57:55+00:00, Hui Zhang <zhtclz@foxmail.com>)
- https://gitcode.net/paddlepaddle/DeepSpeech/-/commit/d53c4994474a22d2663d06e87a589946e0360639 fix long text oom using ssml; filter comma; update polyphonic (2023-06-08T09:54:02+00:00, Hui Zhang <zhtclz@foxmail.com>)
- https://gitcode.net/paddlepaddle/DeepSpeech/-/commit/89d959fc8ece38d36df00e752ad6e975a19b6d34 remove print (2023-06-08T09:55:47+00:00, Hui Zhang <zhtclz@foxmail.com>)
- https://gitcode.net/paddlepaddle/DeepSpeech/-/commit/ffb17a250afde848cc86c07b8beecb193b250392 hotfix english G2P (2023-06-08T12:03:34+00:00, Hui Zhang <zhtclz@foxmail.com>)
- https://gitcode.net/paddlepaddle/DeepSpeech/-/commit/7f88cc9e0c6bd01ea3aa3b9492ede7830d20253b en frontend unit text (2023-06-08T12:05:30+00:00, Hui Zhang <zhtclz@foxmail.com>)
- https://gitcode.net/paddlepaddle/DeepSpeech/-/commit/b06074853bc4ba657d513e33306a0b8f61dad4fe Merge pull request #3316 from zh794390558/frontend (2023-06-09T10:09:30+08:00, Hui Zhang <zhtclz@foxmail.com>): [t2s] fix Frontend for lang sentence OOM / update Polyphonic dict / fix 0d-tensor / move ssml to frontend / fix librosa version / hotfix english G2P
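The landline regex quoted in the first commit message can be sanity-checked in isolation. A minimal sketch; the helper function and the sample numbers are ours, made up for illustration:

```python
import re

# Landline regex quoted in the phonecode.py commit message above.
pattern = re.compile(r"\D((0(10|2[1-3]|[3-9]\d{2})-?)?[1-9]\d{6,7})\D")

def find_landline(text):
    """Return the first landline-looking number in text, or None."""
    # Pad with spaces so the leading/trailing \D can match at string edges.
    match = pattern.search(f" {text} ")
    return match.group(1) if match else None

print(find_landline("010-12345678"))  # Beijing-style 010 code + 8 digits
print(find_landline("0571-8765432"))  # 3-digit area code + 7 digits
print(find_landline("13912345678"))   # 11-digit mobile number: no match
```

The optional group accepts area codes 010, 02x, or 0 plus three digits, with an optional hyphen; an 11-digit mobile number cannot satisfy the 7-to-8-digit body followed by a non-digit, which is the error the commit fixes.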
@@ -26,12 +26,12 @@ repos:
 - --no-sort-keys
 - --autofix
 - id: check-merge-conflict
-- id: flake8
-aergs:
-- --ignore=E501,E228,E226,E261,E266,E128,E402,W503
-- --builtins=G,request
-- --jobs=1
-exclude: (?=runtime/engine/kaldi|audio/paddleaudio/src|third_party).*(\.cpp|\.cc|\.h\.hpp|\.py)$
+# - id: flake8
+# aergs:
+# - --ignore=E501,E228,E226,E261,E266,E128,E402,W503
+# - --builtins=G,request
+# - --jobs=1
+# exclude: (?=runtime/engine/kaldi|audio/paddleaudio/src|third_party).*(\.cpp|\.cc|\.h\.hpp|\.py)$
 - repo : https://github.com/Lucas-C/pre-commit-hooks
 rev: v1.0.1
@@ -38,8 +38,10 @@ VERSION = '1.2.0'
 COMMITID = 'none'
 base = [
+# paddleaudio align with librosa==0.8.1, which need numpy==1.23.x
+"librosa==0.8.1",
+"numpy==1.23.5",
 "kaldiio",
-"librosa>=0.10.0",
 "pathos",
 "pybind11",
 "parameterized",
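The paired pins above are not arbitrary: librosa 0.8.1 is commonly reported to break on NumPy 1.24 and later, which removed deprecated aliases such as np.complex that 0.8.1 still uses, so any numpy 1.23.x works while 1.24+ fails. A sketch of that constraint; the helper is ours, not part of setup.py:

```python
# Assumption (commonly reported): librosa 0.8.1 needs NumPy < 1.24,
# since 1.24 removed deprecated aliases like np.complex.
def numpy_ok_for_librosa_081(version):
    major, minor = (int(part) for part in version.split(".")[:2])
    return (major, minor) < (1, 24)

print(numpy_ok_for_librosa_081("1.23.5"))  # True: the version pinned above
print(numpy_ok_for_librosa_081("1.24.0"))  # False
```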
@@ -241,7 +241,7 @@ fastspeech2_aishell3_ckpt_1.1.0
 ├── speaker_id_map.txt # speaker id map file when training a multi-speaker fastspeech2
 └── speech_stats.npy # statistics used to normalize spectrogram when training fastspeech2
 ```
-You can use the following scripts to synthesize for `${BIN_DIR}/../sentences.txt` using pretrained fastspeech2 and parallel wavegan models.
+You can use the following scripts to synthesize for `${BIN_DIR}/../../assets/sentences.txt` using pretrained fastspeech2 and parallel wavegan models.
 ```bash
 source path.sh
@@ -257,7 +257,7 @@ python3 ${BIN_DIR}/../synthesize_e2e.py \
 --voc_ckpt=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \
 --voc_stat=pwg_aishell3_ckpt_0.5/feats_stats.npy \
 --lang=zh \
---text=${BIN_DIR}/../sentences.txt \
+--text=${BIN_DIR}/../../assets/sentences.txt \
 --output_dir=exp/default/test_e2e \
 --phones_dict=fastspeech2_aishell3_ckpt_1.1.0/phone_id_map.txt \
 --speaker_dict=fastspeech2_aishell3_ckpt_1.1.0/speaker_id_map.txt \
@@ -10,7 +10,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
 --inference_dir=${train_output_path}/inference \
 --am=fastspeech2_aishell3 \
 --voc=pwgan_aishell3 \
---text=${BIN_DIR}/../sentences.txt \
+--text=${BIN_DIR}/../../assets/sentences.txt \
 --output_dir=${train_output_path}/pd_infer_out \
 --phones_dict=dump/phone_id_map.txt \
 --speaker_dict=dump/speaker_id_map.txt \
@@ -22,7 +22,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
 --inference_dir=${train_output_path}/inference \
 --am=fastspeech2_aishell3 \
 --voc=hifigan_aishell3 \
---text=${BIN_DIR}/../sentences.txt \
+--text=${BIN_DIR}/../../assets/sentences.txt \
 --output_dir=${train_output_path}/pd_infer_out \
 --phones_dict=dump/phone_id_map.txt \
 --speaker_dict=dump/speaker_id_map.txt \
@@ -11,7 +11,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
 --inference_dir=${train_output_path}/pdlite \
 --am=fastspeech2_aishell3 \
 --voc=pwgan_aishell3 \
---text=${BIN_DIR}/../sentences.txt \
+--text=${BIN_DIR}/../../assets/sentences.txt \
 --output_dir=${train_output_path}/lite_infer_out \
 --phones_dict=dump/phone_id_map.txt \
 --speaker_dict=dump/speaker_id_map.txt \
@@ -24,7 +24,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
 --inference_dir=${train_output_path}/pdlite \
 --am=fastspeech2_aishell3 \
 --voc=hifigan_aishell3 \
---text=${BIN_DIR}/../sentences.txt \
+--text=${BIN_DIR}/../../assets/sentences.txt \
 --output_dir=${train_output_path}/lite_infer_out \
 --phones_dict=dump/phone_id_map.txt \
 --speaker_dict=dump/speaker_id_map.txt \
@@ -10,7 +10,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
 --am=fastspeech2_aishell3 \
 --voc=pwgan_aishell3 \
 --output_dir=${train_output_path}/onnx_infer_out_e2e \
---text=${BIN_DIR}/../csmsc_test.txt \
+--text=${BIN_DIR}/../../assets/csmsc_test.txt \
 --phones_dict=dump/phone_id_map.txt \
 --device=cpu \
 --cpu_threads=2 \
@@ -24,7 +24,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
 --am=fastspeech2_aishell3 \
 --voc=hifigan_aishell3 \
 --output_dir=${train_output_path}/onnx_infer_out_e2e \
---text=${BIN_DIR}/../csmsc_test.txt \
+--text=${BIN_DIR}/../../assets/csmsc_test.txt \
 --phones_dict=dump/phone_id_map.txt \
 --device=cpu \
 --cpu_threads=2 \
@@ -21,7 +21,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
 --voc_ckpt=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \
 --voc_stat=pwg_aishell3_ckpt_0.5/feats_stats.npy \
 --lang=zh \
---text=${BIN_DIR}/../sentences.txt \
+--text=${BIN_DIR}/../../assets/sentences.txt \
 --output_dir=${train_output_path}/test_e2e \
 --phones_dict=dump/phone_id_map.txt \
 --speaker_dict=dump/speaker_id_map.txt \
@@ -44,7 +44,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
 --voc_ckpt=hifigan_aishell3_ckpt_0.2.0/snapshot_iter_2500000.pdz \
 --voc_stat=hifigan_aishell3_ckpt_0.2.0/feats_stats.npy \
 --lang=zh \
---text=${BIN_DIR}/../sentences.txt \
+--text=${BIN_DIR}/../../assets/sentences.txt \
 --output_dir=${train_output_path}/test_e2e \
 --phones_dict=dump/phone_id_map.txt \
 --speaker_dict=dump/speaker_id_map.txt \
@@ -196,7 +196,7 @@ python3 ${BIN_DIR}/synthesize_e2e.py \
 --phones_dict=vits_aishell3_ckpt_1.1.0/phone_id_map.txt \
 --speaker_dict=vits_aishell3_ckpt_1.1.0/speaker_id_map.txt \
 --output_dir=exp/default/test_e2e \
---text=${BIN_DIR}/../sentences.txt \
+--text=${BIN_DIR}/../../assets/sentences.txt \
 --add-blank=${add_blank}
 ```
 -->
@@ -20,6 +20,6 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
 --speaker_dict=dump/speaker_id_map.txt \
 --spk_id=0 \
 --output_dir=${train_output_path}/test_e2e \
---text=${BIN_DIR}/../sentences.txt \
+--text=${BIN_DIR}/../../assets/sentences.txt \
 --add-blank=${add_blank}
 fi
@@ -102,7 +102,7 @@ Download the pretrained parallel wavegan model from [pwg_aishell3_ckpt_0.5.zip](
 unzip pwg_aishell3_ckpt_0.5.zip
 ```
-You can use the following scripts to synthesize for `${BIN_DIR}/../sentences_canton.txt` using pretrained fastspeech2 and parallel wavegan models.
+You can use the following scripts to synthesize for `${BIN_DIR}/../../assets/sentences_canton.txt` using pretrained fastspeech2 and parallel wavegan models.
 ```bash
 source path.sh
@@ -118,7 +118,7 @@ python3 ${BIN_DIR}/../synthesize_e2e.py \
 --voc_ckpt=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \
 --voc_stat=pwg_aishell3_ckpt_0.5/feats_stats.npy \
 --lang=canton \
---text=${BIN_DIR}/../sentences_canton.txt \
+--text=${BIN_DIR}/../../assets/sentences_canton.txt \
 --output_dir=exp/default/test_e2e \
 --phones_dict=fastspeech2_canton_ckpt_1.4.0/phone_id_map.txt \
 --speaker_dict=fastspeech2_canton_ckpt_1.4.0/speaker_id_map.txt \
@@ -12,7 +12,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
 --am=fastspeech2_canton \
 --voc=pwgan_aishell3 \
 --spk_id=10 \
---text=${BIN_DIR}/../sentences_canton.txt \
+--text=${BIN_DIR}/../../assets/sentences_canton.txt \
 --output_dir=${train_output_path}/pd_infer_out \
 --phones_dict=dump/phone_id_map.txt \
 --speaker_dict=dump/speaker_id_map.txt \
@@ -27,7 +27,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
 --am=fastspeech2_canton \
 --voc=mb_melgan_csmsc \
 --spk_id=10 \
---text=${BIN_DIR}/../sentences_canton.txt \
+--text=${BIN_DIR}/../../assets/sentences_canton.txt \
 --output_dir=${train_output_path}/pd_infer_out \
 --phones_dict=dump/phone_id_map.txt \
 --speaker_dict=dump/speaker_id_map.txt \
@@ -41,7 +41,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
 --am=fastspeech2_canton \
 --voc=hifigan_csmsc \
 --spk_id=10 \
---text=${BIN_DIR}/../sentences_canton.txt \
+--text=${BIN_DIR}/../../assets/sentences_canton.txt \
 --output_dir=${train_output_path}/pd_infer_out \
 --phones_dict=dump/phone_id_map.txt \
 --speaker_dict=dump/speaker_id_map.txt \
@@ -55,7 +55,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
 --am=fastspeech2_canton \
 --voc=wavernn_csmsc \
 --spk_id=10 \
---text=${BIN_DIR}/../sentences_canton.txt \
+--text=${BIN_DIR}/../../assets/sentences_canton.txt \
 --output_dir=${train_output_path}/pd_infer_out \
 --phones_dict=dump/phone_id_map.txt \
 --speaker_dict=dump/speaker_id_map.txt \
@@ -11,7 +11,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
 --voc=pwgan_aishell3 \
 --spk_id=10 \
 --output_dir=${train_output_path}/onnx_infer_out_e2e \
---text=${BIN_DIR}/../sentences_canton.txt \
+--text=${BIN_DIR}/../../assets/sentences_canton.txt \
 --phones_dict=dump/phone_id_map.txt \
 --speaker_dict=dump/speaker_id_map.txt \
 --lang=canton \
@@ -26,7 +26,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
 --voc=mb_melgan_csmsc \
 --spk_id=10 \
 --output_dir=${train_output_path}/onnx_infer_out_e2e \
---text=${BIN_DIR}/../sentences_canton.txt \
+--text=${BIN_DIR}/../../assets/sentences_canton.txt \
 --phones_dict=dump/phone_id_map.txt \
 --speaker_dict=dump/speaker_id_map.txt \
 --lang=canton \
@@ -40,7 +40,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
 --am=fastspeech2_canton \
 --voc=hifigan_csmsc \
 --output_dir=${train_output_path}/onnx_infer_out_e2e \
---text=${BIN_DIR}/../sentences_canton.txt \
+--text=${BIN_DIR}/../../assets/sentences_canton.txt \
 --phones_dict=dump/phone_id_map.txt \
 --speaker_dict=dump/speaker_id_map.txt \
 --lang=canton \
@@ -21,7 +21,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
 --voc_ckpt=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \
 --voc_stat=pwg_aishell3_ckpt_0.5/feats_stats.npy \
 --lang=canton \
---text=${BIN_DIR}/../sentences_canton.txt \
+--text=${BIN_DIR}/../../assets/sentences_canton.txt \
 --output_dir=${train_output_path}/test_e2e \
 --phones_dict=dump/phone_id_map.txt \
 --speaker_dict=dump/speaker_id_map.txt \
@@ -44,7 +44,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
 --voc_ckpt=hifigan_aishell3_ckpt_0.2.0/snapshot_iter_2500000.pdz \
 --voc_stat=hifigan_aishell3_ckpt_0.2.0/feats_stats.npy \
 --lang=canton \
---text=${BIN_DIR}/../sentences_canton.txt \
+--text=${BIN_DIR}/../../assets/sentences_canton.txt \
 --output_dir=${train_output_path}/test_e2e \
 --phones_dict=dump/phone_id_map.txt \
 --speaker_dict=dump/speaker_id_map.txt \
@@ -9,7 +9,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
 python3 ${BIN_DIR}/inference.py \
 --inference_dir=${train_output_path}/inference \
 --am=jets_csmsc \
---text=${BIN_DIR}/../sentences.txt \
+--text=${BIN_DIR}/../../assets/sentences.txt \
 --output_dir=${train_output_path}/pd_infer_out \
 --phones_dict=dump/phone_id_map.txt
 fi
@@ -17,6 +17,6 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
 --ckpt=${train_output_path}/checkpoints/${ckpt_name} \
 --phones_dict=dump/phone_id_map.txt \
 --output_dir=${train_output_path}/test_e2e \
---text=${BIN_DIR}/../sentences.txt \
+--text=${BIN_DIR}/../../assets/sentences.txt \
 --inference_dir=${train_output_path}/inference
 fi
@@ -226,7 +226,7 @@ tacotron2_csmsc_ckpt_0.2.0
 ├── snapshot_iter_30600.pdz # model parameters and optimizer states
 └── speech_stats.npy # statistics used to normalize spectrogram when training Tacotron2
 ```
-You can use the following scripts to synthesize for `${BIN_DIR}/../sentences.txt` using pretrained Tacotron2 and parallel wavegan models.
+You can use the following scripts to synthesize for `${BIN_DIR}/../../assets/sentences.txt` using pretrained Tacotron2 and parallel wavegan models.
 ```bash
 source path.sh
@@ -242,7 +242,7 @@ python3 ${BIN_DIR}/../synthesize_e2e.py \
 --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
 --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
 --lang=zh \
---text=${BIN_DIR}/../sentences.txt \
+--text=${BIN_DIR}/../../assets/sentences.txt \
 --output_dir=exp/default/test_e2e \
 --inference_dir=exp/default/inference \
 --phones_dict=tacotron2_csmsc_ckpt_0.2.0/phone_id_map.txt
@@ -10,7 +10,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
 --inference_dir=${train_output_path}/inference \
 --am=tacotron2_csmsc \
 --voc=pwgan_csmsc \
---text=${BIN_DIR}/../sentences.txt \
+--text=${BIN_DIR}/../../assets/sentences.txt \
 --output_dir=${train_output_path}/pd_infer_out \
 --phones_dict=dump/phone_id_map.txt
 fi
@@ -22,7 +22,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
 --inference_dir=${train_output_path}/inference \
 --am=tacotron2_csmsc \
 --voc=mb_melgan_csmsc \
---text=${BIN_DIR}/../sentences.txt \
+--text=${BIN_DIR}/../../assets/sentences.txt \
 --output_dir=${train_output_path}/pd_infer_out \
 --phones_dict=dump/phone_id_map.txt
 fi
@@ -33,7 +33,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
 --inference_dir=${train_output_path}/inference \
 --am=tacotron2_csmsc \
 --voc=hifigan_csmsc \
---text=${BIN_DIR}/../sentences.txt \
+--text=${BIN_DIR}/../../assets/sentences.txt \
 --output_dir=${train_output_path}/pd_infer_out \
 --phones_dict=dump/phone_id_map.txt
 fi
\ No newline at end of file
@@ -22,7 +22,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
 --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
 --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
 --lang=zh \
---text=${BIN_DIR}/../sentences.txt \
+--text=${BIN_DIR}/../../assets/sentences.txt \
 --output_dir=${train_output_path}/test_e2e \
 --phones_dict=dump/phone_id_map.txt \
 --inference_dir=${train_output_path}/inference
@@ -44,7 +44,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
 --voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\
 --voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
 --lang=zh \
---text=${BIN_DIR}/../sentences.txt \
+--text=${BIN_DIR}/../../assets/sentences.txt \
 --output_dir=${train_output_path}/test_e2e \
 --phones_dict=dump/phone_id_map.txt \
 --inference_dir=${train_output_path}/inference
@@ -66,7 +66,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
 --voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
 --voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
 --lang=zh \
---text=${BIN_DIR}/../sentences.txt \
+--text=${BIN_DIR}/../../assets/sentences.txt \
 --output_dir=${train_output_path}/test_e2e \
 --phones_dict=dump/phone_id_map.txt
 # --inference_dir=${train_output_path}/inference
@@ -87,7 +87,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
 --voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
 --voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
 --lang=zh \
---text=${BIN_DIR}/../sentences.txt \
+--text=${BIN_DIR}/../../assets/sentences.txt \
 --output_dir=${train_output_path}/test_e2e \
 --phones_dict=dump/phone_id_map.txt \
 --inference_dir=${train_output_path}/inference
@@ -108,7 +108,7 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
 --voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \
 --voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \
 --lang=zh \
---text=${BIN_DIR}/../sentences.txt \
+--text=${BIN_DIR}/../../assets/sentences.txt \
 --output_dir=${train_output_path}/test_e2e \
 --phones_dict=dump/phone_id_map.txt \
 --inference_dir=${train_output_path}/inference
@@ -248,7 +248,7 @@ speedyspeech_csmsc_ckpt_0.2.0
 ├── snapshot_iter_30600.pdz # model parameters and optimizer states
 └── tone_id_map.txt # tone vocabulary file when training speedyspeech
 ```
-You can use the following scripts to synthesize for `${BIN_DIR}/../sentences.txt` using pretrained speedyspeech and parallel wavegan models.
+You can use the following scripts to synthesize for `${BIN_DIR}/../../assets/sentences.txt` using pretrained speedyspeech and parallel wavegan models.
 ```bash
 source path.sh
@@ -264,7 +264,7 @@ python3 ${BIN_DIR}/../synthesize_e2e.py \
 --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
 --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
 --lang=zh \
---text=${BIN_DIR}/../sentences.txt \
+--text=${BIN_DIR}/../../assets/sentences.txt \
 --output_dir=exp/default/test_e2e \
 --inference_dir=exp/default/inference \
 --phones_dict=speedyspeech_csmsc_ckpt_0.2.0/phone_id_map.txt \
@@ -11,7 +11,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
 --inference_dir=${train_output_path}/inference \
 --am=speedyspeech_csmsc \
 --voc=pwgan_csmsc \
---text=${BIN_DIR}/../sentences.txt \
+--text=${BIN_DIR}/../../assets/sentences.txt \
 --output_dir=${train_output_path}/pd_infer_out \
 --phones_dict=dump/phone_id_map.txt \
 --tones_dict=dump/tone_id_map.txt
@@ -24,7 +24,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
 --inference_dir=${train_output_path}/inference \
 --am=speedyspeech_csmsc \
 --voc=mb_melgan_csmsc \
---text=${BIN_DIR}/../sentences.txt \
+--text=${BIN_DIR}/../../assets/sentences.txt \
 --output_dir=${train_output_path}/pd_infer_out \
 --phones_dict=dump/phone_id_map.txt \
 --tones_dict=dump/tone_id_map.txt
@@ -36,7 +36,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
 --inference_dir=${train_output_path}/inference \
 --am=speedyspeech_csmsc \
 --voc=hifigan_csmsc \
---text=${BIN_DIR}/../sentences.txt \
+--text=${BIN_DIR}/../../assets/sentences.txt \
 --output_dir=${train_output_path}/pd_infer_out \
 --phones_dict=dump/phone_id_map.txt \
 --tones_dict=dump/tone_id_map.txt
@@ -11,7 +11,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
 --inference_dir=${train_output_path}/pdlite \
 --am=speedyspeech_csmsc \
 --voc=pwgan_csmsc \
---text=${BIN_DIR}/../sentences.txt \
+--text=${BIN_DIR}/../../assets/sentences.txt \
 --output_dir=${train_output_path}/lite_infer_out \
 --phones_dict=dump/phone_id_map.txt \
 --tones_dict=dump/tone_id_map.txt
@@ -24,7 +24,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
 --inference_dir=${train_output_path}/pdlite \
 --am=speedyspeech_csmsc \
 --voc=mb_melgan_csmsc \
---text=${BIN_DIR}/../sentences.txt \
+--text=${BIN_DIR}/../../assets/sentences.txt \
 --output_dir=${train_output_path}/lite_infer_out \
 --phones_dict=dump/phone_id_map.txt \
 --tones_dict=dump/tone_id_map.txt
@@ -36,7 +36,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
 --inference_dir=${train_output_path}/pdlite \
 --am=speedyspeech_csmsc \
 --voc=hifigan_csmsc \
---text=${BIN_DIR}/../sentences.txt \
+--text=${BIN_DIR}/../../assets/sentences.txt \
 --output_dir=${train_output_path}/lite_infer_out \
 --phones_dict=dump/phone_id_map.txt \
 --tones_dict=dump/tone_id_map.txt
@@ -10,7 +10,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--am=speedyspeech_csmsc \
--voc=pwgan_csmsc \
--output_dir=${train_output_path}/onnx_infer_out_e2e \
- --text=${BIN_DIR}/../csmsc_test.txt \
+ --text=${BIN_DIR}/../../assets/csmsc_test.txt \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
--device=cpu \
@@ -23,7 +23,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--am=speedyspeech_csmsc \
--voc=mb_melgan_csmsc \
--output_dir=${train_output_path}/onnx_infer_out_e2e \
- --text=${BIN_DIR}/../csmsc_test.txt \
+ --text=${BIN_DIR}/../../assets/csmsc_test.txt \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
--device=cpu \
@@ -36,7 +36,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
--am=speedyspeech_csmsc \
--voc=hifigan_csmsc \
--output_dir=${train_output_path}/onnx_infer_out_e2e \
- --text=${BIN_DIR}/../csmsc_test.txt \
+ --text=${BIN_DIR}/../../assets/csmsc_test.txt \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
--device=cpu \
...
@@ -21,7 +21,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
--voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
--lang=zh \
- --text=${BIN_DIR}/../sentences.txt \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
@@ -43,7 +43,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\
--voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=zh \
- --text=${BIN_DIR}/../sentences.txt \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
@@ -66,7 +66,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
--voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
--voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=zh \
- --text=${BIN_DIR}/../sentences.txt \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt
@@ -87,7 +87,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
--voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
--voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=zh \
- --text=${BIN_DIR}/../sentences.txt \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
@@ -109,7 +109,7 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
--voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \
--voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \
--lang=zh \
- --text=${BIN_DIR}/../sentences.txt \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
...
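Every run script touched by this commit gates its vocoder variants behind the same `stage`/`stop_stage` checks visible in the hunk headers above. A minimal sketch of that control flow, with `echo` placeholders standing in for the real inference commands:

```shell
#!/usr/bin/env bash
# Sketch of the stage/stop_stage gating used by the run scripts.
# The echo bodies are placeholders, not the actual synthesis commands.
stage=0
stop_stage=1

if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    echo "stage 0: pwgan_csmsc vocoder"
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    echo "stage 1: mb_melgan_csmsc vocoder"
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    echo "stage 2: hifigan_csmsc vocoder"
fi
```

With `stage=0` and `stop_stage=1`, only the first two blocks run; setting both to the same value selects a single vocoder stage.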
@@ -258,7 +258,7 @@ fastspeech2_nosil_baker_ckpt_0.4
├── snapshot_iter_76000.pdz # model parameters and optimizer states
└── speech_stats.npy # statistics used to normalize spectrogram when training fastspeech2
```
- You can use the following scripts to synthesize for `${BIN_DIR}/../sentences.txt` using pretrained fastspeech2 and parallel wavegan models.
+ You can use the following scripts to synthesize the sentences in `${BIN_DIR}/../../assets/sentences.txt` using the pretrained fastspeech2 and parallel wavegan models.
If you want to use fastspeech2_conformer, you must delete the line `--inference_dir=exp/default/inference \` to skip the dygraph-to-static-graph step, because dygraph-to-static export has not yet been tested for fastspeech2_conformer.
```bash
@@ -276,7 +276,7 @@ python3 ${BIN_DIR}/../synthesize_e2e.py \
--voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
--voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
--lang=zh \
- --text=${BIN_DIR}/../sentences.txt \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=exp/default/test_e2e \
--inference_dir=exp/default/inference \
--phones_dict=fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt
...
@@ -248,7 +248,7 @@ fastspeech2_nosil_baker_ckpt_0.4
├── snapshot_iter_76000.pdz # model parameters and optimizer states
└── speech_stats.npy # statistics used to normalize the spectrogram when training fastspeech2
```
- You can use the following script to synthesize the sentences in `${BIN_DIR}/../sentences.txt` with the pretrained fastspeech2 and parallel wavegan models.
+ You can use the following script to synthesize the sentences in `${BIN_DIR}/../../assets/sentences.txt` with the pretrained fastspeech2 and parallel wavegan models.
```bash
source path.sh
@@ -264,7 +264,7 @@ python3 ${BIN_DIR}/../synthesize_e2e.py \
--voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
--voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
--lang=zh \
- --text=${BIN_DIR}/../sentences.txt \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=exp/default/test_e2e \
--inference_dir=exp/default/inference \
--phones_dict=fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt
...
@@ -11,7 +11,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--inference_dir=${train_output_path}/inference \
--am=fastspeech2_csmsc \
--voc=pwgan_csmsc \
- --text=${BIN_DIR}/../sentences.txt \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt
fi
@@ -23,7 +23,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--inference_dir=${train_output_path}/inference \
--am=fastspeech2_csmsc \
--voc=mb_melgan_csmsc \
- --text=${BIN_DIR}/../sentences.txt \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt
fi
@@ -34,7 +34,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
--inference_dir=${train_output_path}/inference \
--am=fastspeech2_csmsc \
--voc=hifigan_csmsc \
- --text=${BIN_DIR}/../sentences.txt \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt
fi
@@ -45,7 +45,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
--inference_dir=${train_output_path}/inference \
--am=fastspeech2_csmsc \
--voc=wavernn_csmsc \
- --text=${BIN_DIR}/../sentences.txt \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt
fi
\ No newline at end of file
@@ -12,7 +12,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--am=fastspeech2_csmsc \
--am_stat=dump/train/speech_stats.npy \
--voc=pwgan_csmsc \
- --text=${BIN_DIR}/../sentences.txt \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/pd_infer_out_streaming \
--phones_dict=dump/phone_id_map.txt \
--am_streaming=True
@@ -26,7 +26,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--am=fastspeech2_csmsc \
--am_stat=dump/train/speech_stats.npy \
--voc=mb_melgan_csmsc \
- --text=${BIN_DIR}/../sentences.txt \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/pd_infer_out_streaming \
--phones_dict=dump/phone_id_map.txt \
--am_streaming=True
@@ -39,7 +39,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
--am=fastspeech2_csmsc \
--am_stat=dump/train/speech_stats.npy \
--voc=hifigan_csmsc \
- --text=${BIN_DIR}/../sentences.txt \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/pd_infer_out_streaming \
--phones_dict=dump/phone_id_map.txt \
--am_streaming=True
...
@@ -11,7 +11,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--inference_dir=${train_output_path}/pdlite \
--am=fastspeech2_csmsc \
--voc=pwgan_csmsc \
- --text=${BIN_DIR}/../sentences.txt \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/lite_infer_out \
--phones_dict=dump/phone_id_map.txt
fi
@@ -23,7 +23,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--inference_dir=${train_output_path}/pdlite \
--am=fastspeech2_csmsc \
--voc=mb_melgan_csmsc \
- --text=${BIN_DIR}/../sentences.txt \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/lite_infer_out \
--phones_dict=dump/phone_id_map.txt
fi
@@ -34,7 +34,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
--inference_dir=${train_output_path}/pdlite \
--am=fastspeech2_csmsc \
--voc=hifigan_csmsc \
- --text=${BIN_DIR}/../sentences.txt \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/lite_infer_out \
--phones_dict=dump/phone_id_map.txt
fi
@@ -12,7 +12,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--am=fastspeech2_csmsc \
--am_stat=dump/train/speech_stats.npy \
--voc=pwgan_csmsc \
- --text=${BIN_DIR}/../sentences.txt \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/lite_infer_out_streaming \
--phones_dict=dump/phone_id_map.txt \
--am_streaming=True
@@ -26,7 +26,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--am=fastspeech2_csmsc \
--am_stat=dump/train/speech_stats.npy \
--voc=mb_melgan_csmsc \
- --text=${BIN_DIR}/../sentences.txt \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/lite_infer_out_streaming \
--phones_dict=dump/phone_id_map.txt \
--am_streaming=True
@@ -39,7 +39,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
--am=fastspeech2_csmsc \
--am_stat=dump/train/speech_stats.npy \
--voc=hifigan_csmsc \
- --text=${BIN_DIR}/../sentences.txt \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/lite_infer_out_streaming \
--phones_dict=dump/phone_id_map.txt \
--am_streaming=True
...
@@ -10,7 +10,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--am=fastspeech2_csmsc \
--voc=pwgan_csmsc \
--output_dir=${train_output_path}/onnx_infer_out_e2e \
- --text=${BIN_DIR}/../csmsc_test.txt \
+ --text=${BIN_DIR}/../../assets/csmsc_test.txt \
--phones_dict=dump/phone_id_map.txt \
--device=cpu \
--cpu_threads=2
@@ -22,7 +22,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--am=fastspeech2_csmsc \
--voc=mb_melgan_csmsc \
--output_dir=${train_output_path}/onnx_infer_out_e2e \
- --text=${BIN_DIR}/../csmsc_test.txt \
+ --text=${BIN_DIR}/../../assets/csmsc_test.txt \
--phones_dict=dump/phone_id_map.txt \
--device=cpu \
--cpu_threads=2
@@ -34,7 +34,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
--am=fastspeech2_csmsc \
--voc=hifigan_csmsc \
--output_dir=${train_output_path}/onnx_infer_out_e2e \
- --text=${BIN_DIR}/../csmsc_test.txt \
+ --text=${BIN_DIR}/../../assets/csmsc_test.txt \
--phones_dict=dump/phone_id_map.txt \
--device=cpu \
--cpu_threads=2
...
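After this commit, the inference scripts resolve their `--text` input from an `assets/` directory two levels above `${BIN_DIR}`. A hedged sketch, not part of the original scripts, of a pre-flight check that fails fast when the relocated asset is missing; the `demo/` layout and file contents here are purely illustrative:

```shell
#!/usr/bin/env bash
# Hypothetical guard: verify the text asset exists before launching inference.
check_text_asset() {
    if [ ! -f "$1" ]; then
        echo "missing text file: $1" >&2
        return 1
    fi
}

# Illustrative layout mirroring the new assets/ location.
mkdir -p demo/assets
printf '001 hello\n' > demo/assets/csmsc_test.txt
check_text_asset demo/assets/csmsc_test.txt && echo "asset found"
```

Calling such a guard at the top of a run script turns a confusing mid-pipeline failure into an immediate, readable error.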
@@ -11,7 +11,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--am_stat=dump/train/speech_stats.npy \
--voc=pwgan_csmsc \
--output_dir=${train_output_path}/onnx_infer_out_streaming \
- --text=${BIN_DIR}/../csmsc_test.txt \
+ --text=${BIN_DIR}/../../assets/csmsc_test.txt \
--phones_dict=dump/phone_id_map.txt \
--device=cpu \
--cpu_threads=2 \
@@ -25,7 +25,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--am_stat=dump/train/speech_stats.npy \
--voc=mb_melgan_csmsc \
--output_dir=${train_output_path}/onnx_infer_out_streaming \
- --text=${BIN_DIR}/../csmsc_test.txt \
+ --text=${BIN_DIR}/../../assets/csmsc_test.txt \
--phones_dict=dump/phone_id_map.txt \
--device=cpu \
--cpu_threads=2 \
@@ -39,7 +39,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
--am_stat=dump/train/speech_stats.npy \
--voc=hifigan_csmsc \
--output_dir=${train_output_path}/onnx_infer_out_streaming \
- --text=${BIN_DIR}/../csmsc_test.txt \
+ --text=${BIN_DIR}/../../assets/csmsc_test.txt \
--phones_dict=dump/phone_id_map.txt \
--device=cpu \
--cpu_threads=2 \
...
@@ -21,7 +21,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
--voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
--lang=zh \
- --text=${BIN_DIR}/../sentences.txt \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--inference_dir=${train_output_path}/inference
@@ -42,7 +42,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\
--voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=zh \
- --text=${BIN_DIR}/../sentences.txt \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--inference_dir=${train_output_path}/inference
@@ -64,7 +64,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
--voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
--voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=zh \
- --text=${BIN_DIR}/../sentences.txt \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt
# --inference_dir=${train_output_path}/inference
@@ -85,7 +85,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
--voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
--voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=zh \
- --text=${BIN_DIR}/../sentences.txt \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--inference_dir=${train_output_path}/inference
@@ -107,7 +107,7 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
--voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \
--voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \
--lang=zh \
- --text=${BIN_DIR}/../sentences.txt \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--inference_dir=${train_output_path}/inference
...
@@ -21,7 +21,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
--voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
--lang=zh \
- --text=${BIN_DIR}/../sentences.txt \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e_streaming \
--phones_dict=dump/phone_id_map.txt \
--am_streaming=True \
@@ -43,7 +43,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\
--voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=zh \
- --text=${BIN_DIR}/../sentences.txt \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e_streaming \
--phones_dict=dump/phone_id_map.txt \
--am_streaming=True \
@@ -66,7 +66,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
--voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
--voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=zh \
- --text=${BIN_DIR}/../sentences.txt \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e_streaming \
--phones_dict=dump/phone_id_map.txt \
--am_streaming=True
@@ -87,7 +87,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
--voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
--voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=zh \
- --text=${BIN_DIR}/../sentences.txt \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e_streaming \
--phones_dict=dump/phone_id_map.txt \
--am_streaming=True \
...
@@ -21,7 +21,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
--voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
--lang=zh \
- --text=${BIN_DIR}/../sentences.txt \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--inference_dir=${train_output_path}/inference \
@@ -43,7 +43,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\
--voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=zh \
- --text=${BIN_DIR}/../sentences.txt \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--inference_dir=${train_output_path}/inference \
@@ -66,7 +66,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
--voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
--voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=zh \
- --text=${BIN_DIR}/../sentences.txt \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--use_rhy=True
@@ -88,7 +88,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
--voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
--voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=zh \
- --text=${BIN_DIR}/../sentences.txt \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--inference_dir=${train_output_path}/inference \
@@ -111,7 +111,7 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
--voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \
--voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \
--lang=zh \
- --text=${BIN_DIR}/../sentences.txt \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--inference_dir=${train_output_path}/inference \
...
...@@ -172,6 +172,6 @@ python3 ${BIN_DIR}/synthesize_e2e.py \ ...@@ -172,6 +172,6 @@ python3 ${BIN_DIR}/synthesize_e2e.py \
--ckpt=vits_csmsc_ckpt_1.4.0/snapshot_iter_150000.pdz \ --ckpt=vits_csmsc_ckpt_1.4.0/snapshot_iter_150000.pdz \
--phones_dict=vits_csmsc_ckpt_1.4.0/phone_id_map.txt \ --phones_dict=vits_csmsc_ckpt_1.4.0/phone_id_map.txt \
--output_dir=exp/default/test_e2e \ --output_dir=exp/default/test_e2e \
--text=${BIN_DIR}/../sentences.txt \ --text=${BIN_DIR}/../../assets/sentences.txt \
--add-blank=${add_blank} --add-blank=${add_blank}
``` ```
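Every hunk above makes the same change: the sentence-prompt file moved from one level above `${BIN_DIR}` into a shared `assets/` directory two levels above it. A minimal sketch of how the new relative path resolves, using hypothetical directory names that merely mirror this layout (not the actual repo tree):

```shell
# Hypothetical layout:
#   $demo/examples/assets/sentences.txt   <- new shared location
#   $demo/examples/csmsc/bin              <- stands in for ${BIN_DIR}
demo=/tmp/t2s_path_demo
mkdir -p $demo/examples/assets $demo/examples/csmsc/bin
printf '001 test sentence\n' > $demo/examples/assets/sentences.txt
BIN_DIR=$demo/examples/csmsc/bin
# The old flag value pointed one level up; the new value climbs two levels
# and enters assets/, so one copy of the file serves every example:
cat ${BIN_DIR}/../../assets/sentences.txt
```

Keeping the prompts in one `assets/` directory means the per-example scripts no longer need their own copies.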
@@ -10,7 +10,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
python3 ${BIN_DIR}/inference.py \
--inference_dir=${train_output_path}/inference \
--am=vits_csmsc \
-  --text=${BIN_DIR}/../sentences.txt \
+  --text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt \
--add-blank=${add_blank}
...
@@ -10,7 +10,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
python3 ${BIN_DIR}/lite_predict.py \
--inference_dir=${train_output_path}/pdlite \
--am=vits_csmsc \
-  --text=${BIN_DIR}/../sentences.txt \
+  --text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/lite_infer_out \
--phones_dict=dump/phone_id_map.txt \
--add-blank=${add_blank}
...
@@ -18,7 +18,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--phones_dict=dump/phone_id_map.txt \
--output_dir=${train_output_path}/test_e2e \
-  --text=${BIN_DIR}/../sentences.txt \
+  --text=${BIN_DIR}/../../assets/sentences.txt \
--add-blank=${add_blank} #\
# --inference_dir=${train_output_path}/inference
fi
@@ -239,7 +239,7 @@ python3 ${BIN_DIR}/../synthesize_e2e.py \
--voc_ckpt=pwg_ljspeech_ckpt_0.5/pwg_snapshot_iter_400000.pdz \
--voc_stat=pwg_ljspeech_ckpt_0.5/pwg_stats.npy \
--lang=en \
-  --text=${BIN_DIR}/../sentences_en.txt \
+  --text=${BIN_DIR}/../../assets/sentences_en.txt \
--output_dir=exp/default/test_e2e \
--phones_dict=tacotron2_ljspeech_ckpt_0.2.0/phone_id_map.txt
```
@@ -16,7 +16,7 @@ python3 ${BIN_DIR}/../synthesize_e2e.py \
--voc_ckpt=pwg_ljspeech_ckpt_0.5/pwg_snapshot_iter_400000.pdz \
--voc_stat=pwg_ljspeech_ckpt_0.5/pwg_stats.npy \
--lang=en \
-  --text=${BIN_DIR}/../sentences_en.txt \
+  --text=${BIN_DIR}/../../assets/sentences_en.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
# --inference_dir=${train_output_path}/inference
\ No newline at end of file
@@ -191,7 +191,7 @@ python3 ${BIN_DIR}/synthesize_e2e.py \
--transformer-tts-stat=transformer_tts_ljspeech_ckpt_0.4/speech_stats.npy \
--waveflow-config=waveflow_ljspeech_ckpt_0.3/config.yaml \
--waveflow-checkpoint=waveflow_ljspeech_ckpt_0.3/step-2000000.pdparams \
-  --text=${BIN_DIR}/../sentences_en.txt \
+  --text=${BIN_DIR}/../../assets/sentences_en.txt \
--output-dir=exp/default/test_e2e \
--phones-dict=transformer_tts_ljspeech_ckpt_0.4/phone_id_map.txt
```
@@ -12,6 +12,6 @@ python3 ${BIN_DIR}/synthesize_e2e.py \
--transformer-tts-stat=dump/train/speech_stats.npy \
--waveflow-config=waveflow_ljspeech_ckpt_0.3/config.yaml \
--waveflow-checkpoint=waveflow_ljspeech_ckpt_0.3/step-2000000.pdparams \
-  --text=${BIN_DIR}/../sentences_en.txt \
+  --text=${BIN_DIR}/../../assets/sentences_en.txt \
--output-dir=${train_output_path}/test_e2e \
--phones-dict=dump/phone_id_map.txt
@@ -254,7 +254,7 @@ python3 ${BIN_DIR}/../synthesize_e2e.py \
--voc_ckpt=pwg_ljspeech_ckpt_0.5/pwg_snapshot_iter_400000.pdz \
--voc_stat=pwg_ljspeech_ckpt_0.5/pwg_stats.npy \
--lang=en \
-  --text=${BIN_DIR}/../sentences_en.txt \
+  --text=${BIN_DIR}/../../assets/sentences_en.txt \
--output_dir=exp/default/test_e2e \
--inference_dir=exp/default/inference \
--phones_dict=fastspeech2_nosil_ljspeech_ckpt_0.5/phone_id_map.txt
...
@@ -11,7 +11,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--inference_dir=${train_output_path}/inference \
--am=fastspeech2_ljspeech \
--voc=pwgan_ljspeech \
-  --text=${BIN_DIR}/../sentences_en.txt \
+  --text=${BIN_DIR}/../../assets/sentences_en.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt \
--lang=en
@@ -23,7 +23,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--inference_dir=${train_output_path}/inference \
--am=fastspeech2_ljspeech \
--voc=hifigan_ljspeech \
-  --text=${BIN_DIR}/../sentences_en.txt \
+  --text=${BIN_DIR}/../../assets/sentences_en.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt \
--lang=en
...
@@ -11,7 +11,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--inference_dir=${train_output_path}/pdlite \
--am=fastspeech2_ljspeech \
--voc=pwgan_ljspeech \
-  --text=${BIN_DIR}/../sentences_en.txt \
+  --text=${BIN_DIR}/../../assets/sentences_en.txt \
--output_dir=${train_output_path}/lite_infer_out \
--phones_dict=dump/phone_id_map.txt \
--lang=en
@@ -23,7 +23,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--inference_dir=${train_output_path}/pdlite \
--am=fastspeech2_ljspeech \
--voc=hifigan_ljspeech \
-  --text=${BIN_DIR}/../sentences_en.txt \
+  --text=${BIN_DIR}/../../assets/sentences_en.txt \
--output_dir=${train_output_path}/lite_infer_out \
--phones_dict=dump/phone_id_map.txt \
--lang=en
...
@@ -10,7 +10,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--am=fastspeech2_ljspeech \
--voc=pwgan_ljspeech\
--output_dir=${train_output_path}/onnx_infer_out_e2e \
-  --text=${BIN_DIR}/../sentences_en.txt \
+  --text=${BIN_DIR}/../../assets/sentences_en.txt \
--phones_dict=dump/phone_id_map.txt \
--device=cpu \
--cpu_threads=2 \
@@ -24,7 +24,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--am=fastspeech2_ljspeech \
--voc=hifigan_ljspeech \
--output_dir=${train_output_path}/onnx_infer_out_e2e \
-  --text=${BIN_DIR}/../sentences_en.txt \
+  --text=${BIN_DIR}/../../assets/sentences_en.txt \
--phones_dict=dump/phone_id_map.txt \
--device=cpu \
--cpu_threads=2 \
...
@@ -21,7 +21,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--voc_ckpt=pwg_ljspeech_ckpt_0.5/pwg_snapshot_iter_400000.pdz \
--voc_stat=pwg_ljspeech_ckpt_0.5/pwg_stats.npy \
--lang=en \
-  --text=${BIN_DIR}/../sentences_en.txt \
+  --text=${BIN_DIR}/../../assets/sentences_en.txt \
--output_dir=${train_output_path}/test_e2e \
--inference_dir=${train_output_path}/inference \
--phones_dict=dump/phone_id_map.txt
@@ -41,7 +41,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--voc_ckpt=hifigan_ljspeech_ckpt_0.2.0/snapshot_iter_2500000.pdz \
--voc_stat=hifigan_ljspeech_ckpt_0.2.0/feats_stats.npy \
--lang=en \
-  --text=${BIN_DIR}/../sentences_en.txt \
+  --text=${BIN_DIR}/../../assets/sentences_en.txt \
--output_dir=${train_output_path}/test_e2e \
--inference_dir=${train_output_path}/inference \
--phones_dict=dump/phone_id_map.txt
...
@@ -267,7 +267,7 @@ python3 ${BIN_DIR}/../synthesize_e2e.py \
--voc_ckpt=pwgan_opencpop_ckpt_1.4.0/snapshot_iter_100000.pdz \
--voc_stat=pwgan_opencpop_ckpt_1.4.0/feats_stats.npy \
--lang=sing \
-  --text=${BIN_DIR}/../sentences_sing.txt \
+  --text=${BIN_DIR}/../../assets/sentences_sing.txt \
--output_dir=exp/default/test_e2e \
--phones_dict=diffsinger_opencpop_ckpt_1.4.0/phone_id_map.txt \
--pinyin_phone=diffsinger_opencpop_ckpt_1.4.0/pinyin_to_phone.txt \
...
@@ -271,7 +271,7 @@ python3 ${BIN_DIR}/../synthesize_e2e.py \
--voc_ckpt=pwgan_opencpop_ckpt_1.4.0/snapshot_iter_100000.pdz \
--voc_stat=pwgan_opencpop_ckpt_1.4.0/feats_stats.npy \
--lang=sing \
-  --text=${BIN_DIR}/../sentences_sing.txt \
+  --text=${BIN_DIR}/../../assets/sentences_sing.txt \
--output_dir=exp/default/test_e2e \
--phones_dict=diffsinger_opencpop_ckpt_1.4.0/phone_id_map.txt \
--pinyin_phone=diffsinger_opencpop_ckpt_1.4.0/pinyin_to_phone.txt \
...
@@ -21,7 +21,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--voc_ckpt=pwgan_opencpop_ckpt_1.4.0/snapshot_iter_100000.pdz \
--voc_stat=pwgan_opencpop_ckpt_1.4.0/feats_stats.npy \
--lang=sing \
-  --text=${BIN_DIR}/../sentences_sing.txt \
+  --text=${BIN_DIR}/../../assets/sentences_sing.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--speech_stretchs=dump/train/speech_stretchs.npy \
@@ -44,7 +44,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--voc_ckpt=hifigan_opencpop_ckpt_1.4.0/snapshot_iter_625000.pdz \
--voc_stat=hifigan_opencpop_ckpt_1.4.0/feats_stats.npy \
--lang=sing \
-  --text=${BIN_DIR}/../sentences_sing.txt \
+  --text=${BIN_DIR}/../../assets/sentences_sing.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--speech_stretchs=dump/train/speech_stretchs.npy \
...
@@ -99,7 +99,7 @@ if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
--voc_ckpt=pretrained_models/hifigan_aishell3_ckpt_0.2.0/snapshot_iter_2500000.pdz \
--voc_stat=pretrained_models/hifigan_aishell3_ckpt_0.2.0/feats_stats.npy \
--lang=zh \
-  --text=${BIN_DIR}/../sentences.txt \
+  --text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=./test_e2e/ \
--phones_dict=${dump_dir}/phone_id_map.txt \
--speaker_dict=${dump_dir}/speaker_id_map.txt \
...
@@ -98,7 +98,7 @@ if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
--voc_ckpt=pretrained_models/hifigan_vctk_ckpt_0.2.0/snapshot_iter_2500000.pdz \
--voc_stat=pretrained_models/hifigan_vctk_ckpt_0.2.0/feats_stats.npy \
--lang=en \
-  --text=${BIN_DIR}/../sentences_en.txt \
+  --text=${BIN_DIR}/../../assets/sentences_en.txt \
--output_dir=./test_e2e/ \
--phones_dict=${dump_dir}/phone_id_map.txt \
--speaker_dict=${dump_dir}/speaker_id_map.txt \
...
@@ -100,7 +100,7 @@ if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
--voc_ckpt=pretrained_models/hifigan_aishell3_ckpt_0.2.0/snapshot_iter_2500000.pdz \
--voc_stat=pretrained_models/hifigan_aishell3_ckpt_0.2.0/feats_stats.npy \
--lang=mix \
-  --text=${BIN_DIR}/../sentences_mix.txt \
+  --text=${BIN_DIR}/../../assets/sentences_mix.txt \
--output_dir=./test_e2e/ \
--phones_dict=${dump_dir}/phone_id_map.txt \
--speaker_dict=${dump_dir}/speaker_id_map.txt \
...
@@ -254,7 +254,7 @@ python3 ${BIN_DIR}/../synthesize_e2e.py \
--voc_ckpt=pwg_vctk_ckpt_0.1.1/snapshot_iter_1500000.pdz \
--voc_stat=pwg_vctk_ckpt_0.1.1/feats_stats.npy \
--lang=en \
-  --text=${BIN_DIR}/../sentences_en.txt \
+  --text=${BIN_DIR}/../../assets/sentences_en.txt \
--output_dir=exp/default/test_e2e \
--phones_dict=fastspeech2_vctk_ckpt_1.2.0/phone_id_map.txt \
--speaker_dict=fastspeech2_vctk_ckpt_1.2.0/speaker_id_map.txt \
...
@@ -10,7 +10,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--inference_dir=${train_output_path}/inference \
--am=fastspeech2_vctk \
--voc=pwgan_vctk \
-  --text=${BIN_DIR}/../sentences_en.txt \
+  --text=${BIN_DIR}/../../assets/sentences_en.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt \
--speaker_dict=dump/speaker_id_map.txt \
@@ -23,7 +23,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--inference_dir=${train_output_path}/inference \
--am=fastspeech2_vctk \
--voc=hifigan_vctk \
-  --text=${BIN_DIR}/../sentences_en.txt \
+  --text=${BIN_DIR}/../../assets/sentences_en.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt \
--speaker_dict=dump/speaker_id_map.txt \
...
@@ -11,7 +11,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--inference_dir=${train_output_path}/pdlite \
--am=fastspeech2_vctk \
--voc=pwgan_vctk \
-  --text=${BIN_DIR}/../sentences_en.txt \
+  --text=${BIN_DIR}/../../assets/sentences_en.txt \
--output_dir=${train_output_path}/lite_infer_out \
--phones_dict=dump/phone_id_map.txt \
--speaker_dict=dump/speaker_id_map.txt \
@@ -25,7 +25,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--inference_dir=${train_output_path}/pdlite \
--am=fastspeech2_vctk \
--voc=hifigan_vctk \
-  --text=${BIN_DIR}/../sentences_en.txt \
+  --text=${BIN_DIR}/../../assets/sentences_en.txt \
--output_dir=${train_output_path}/lite_infer_out \
--phones_dict=dump/phone_id_map.txt \
--speaker_dict=dump/speaker_id_map.txt \
...
@@ -10,7 +10,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--am=fastspeech2_vctk \
--voc=pwgan_vctk \
--output_dir=${train_output_path}/onnx_infer_out_e2e \
-  --text=${BIN_DIR}/../sentences_en.txt \
+  --text=${BIN_DIR}/../../assets/sentences_en.txt \
--phones_dict=dump/phone_id_map.txt \
--device=cpu \
--cpu_threads=2 \
@@ -25,7 +25,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--am=fastspeech2_vctk \
--voc=hifigan_vctk \
--output_dir=${train_output_path}/onnx_infer_out_e2e \
-  --text=${BIN_DIR}/../sentences_en.txt \
+  --text=${BIN_DIR}/../../assets/sentences_en.txt \
--phones_dict=dump/phone_id_map.txt \
--device=cpu \
--cpu_threads=2 \
...
@@ -21,7 +21,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--voc_ckpt=pwg_vctk_ckpt_0.1.1/snapshot_iter_1500000.pdz \
--voc_stat=pwg_vctk_ckpt_0.1.1/feats_stats.npy \
--lang=en \
-  --text=${BIN_DIR}/../sentences_en.txt \
+  --text=${BIN_DIR}/../../assets/sentences_en.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--speaker_dict=dump/speaker_id_map.txt \
@@ -43,7 +43,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--voc_ckpt=hifigan_vctk_ckpt_0.2.0/snapshot_iter_2500000.pdz \
--voc_stat=hifigan_vctk_ckpt_0.2.0/feats_stats.npy \
--lang=en \
-  --text=${BIN_DIR}/../sentences_en.txt \
+  --text=${BIN_DIR}/../../assets/sentences_en.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--speaker_dict=dump/speaker_id_map.txt \
...
@@ -6,11 +6,11 @@ This example contains code used to train a [Fastspeech2](https://arxiv.org/abs/2
## Dataset
### Download and Extract
-Download all datasets and extract it to `~/datasets`:
+Download all datasets and extract it to `./data`:
-- The CSMSC dataset is in the directory `~/datasets/BZNSYP`
+- The CSMSC dataset is in the directory `./data/BZNSYP`
-- The Ljspeech dataset is in the directory `~/datasets/LJSpeech-1.1`
+- The Ljspeech dataset is in the directory `./data/LJSpeech-1.1`
-- The aishell3 dataset is in the directory `~/datasets/data_aishell3`
+- The aishell3 dataset is in the directory `./data/data_aishell3`
-- The vctk dataset is in the directory `~/datasets/VCTK-Corpus-0.92`
+- The vctk dataset is in the directory `./data/VCTK-Corpus-0.92`
### Get MFA Result and Extract
We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for the fastspeech2 training.
@@ -24,16 +24,16 @@ Or train your MFA model reference to [mfa example](https://github.com/PaddlePadd
## Get Started
Assume the paths to the datasets are:
-- `~/datasets/BZNSYP`
+- `./data/BZNSYP`
-- `~/datasets/LJSpeech-1.1`
+- `./data/LJSpeech-1.1`
-- `~/datasets/data_aishell3`
+- `./data/data_aishell3`
-- `~/datasets/VCTK-Corpus-0.92`
+- `./data/VCTK-Corpus-0.92`
Assume the path to the MFA results of the datasets are:
-- `./mfa_results/baker_alignment_tone`
+- `./data/mfa/baker_alignment_tone`
-- `./mfa_results/ljspeech_alignment`
+- `./data/mfa/ljspeech_alignment`
-- `./mfa_results/aishell3_alignment_tone`
+- `./data/mfa/aishell3_alignment_tone`
-- `./mfa_results/vctk_alignment`
+- `./data/mfa/vctk_alignment`
Run the command below to
1. **source path**.
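The relocated layout that the README now assumes can be sketched up front; a minimal example that creates only the directory skeleton (the datasets and MFA archives themselves still have to be downloaded and extracted separately, and the root path here is illustrative):

```shell
# Pre-create the directory layout assumed by the updated README:
# datasets under ./data and MFA alignments under ./data/mfa.
root=/tmp/fs2_mix_layout_demo
mkdir -p $root/data/BZNSYP \
         $root/data/LJSpeech-1.1 \
         $root/data/data_aishell3 \
         $root/data/VCTK-Corpus-0.92 \
         $root/data/mfa/baker_alignment_tone \
         $root/data/mfa/ljspeech_alignment \
         $root/data/mfa/aishell3_alignment_tone \
         $root/data/mfa/vctk_alignment
ls $root/data
```

Keeping everything under one `./data` root (instead of the old `~/datasets` plus a separate `./mfa_results`) makes the example self-contained in its working directory.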
@@ -252,8 +252,10 @@ optional arguments:
## Pretrained Model
Pretrained FastSpeech2 model with no silence in the edge of audios:
- [fastspeech2_mix_ckpt_1.2.0.zip](https://paddlespeech.bj.bcebos.com/t2s/chinse_english_mixed/models/fastspeech2_mix_ckpt_1.2.0.zip)
+- [pwg_aishell3_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_aishell3_ckpt_0.5.zip)
The static model can be downloaded here:
- [fastspeech2_mix_static_0.2.0.zip](https://paddlespeech.bj.bcebos.com/t2s/chinse_english_mixed/models/fastspeech2_mix_static_0.2.0.zip)
@@ -285,18 +287,18 @@ FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ${BIN_DIR}/../synthesize_e2e.py \
--am=fastspeech2_mix \
-  --am_config=fastspeech2_mix_ckpt_1.2.0/default.yaml \
+  --am_config=exp/pretrain/fastspeech2_mix_ckpt_1.2.0/default.yaml \
-  --am_ckpt=fastspeech2_mix_ckpt_1.2.0/snapshot_iter_99200.pdz \
+  --am_ckpt=exp/pretrain/fastspeech2_mix_ckpt_1.2.0/snapshot_iter_99200.pdz \
-  --am_stat=fastspeech2_mix_ckpt_1.2.0/speech_stats.npy \
+  --am_stat=exp/pretrain/fastspeech2_mix_ckpt_1.2.0/speech_stats.npy \
+  --phones_dict=exp/pretrain/fastspeech2_mix_ckpt_1.2.0/phone_id_map.txt \
+  --speaker_dict=exp/pretrain/fastspeech2_mix_ckpt_1.2.0/speaker_id_map.txt \
+  --spk_id=174 \
--voc=pwgan_aishell3 \
-  --voc_config=pwg_aishell3_ckpt_0.5/default.yaml \
+  --voc_config=exp/pretrain/pwg_aishell3_ckpt_0.5/default.yaml \
-  --voc_ckpt=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \
+  --voc_ckpt=exp/pretrain/pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \
-  --voc_stat=pwg_aishell3_ckpt_0.5/feats_stats.npy \
+  --voc_stat=exp/pretrain/pwg_aishell3_ckpt_0.5/feats_stats.npy \
--lang=mix \
-  --text=${BIN_DIR}/../sentences_mix.txt \
+  --text=${BIN_DIR}/../../assets/sentences_mix.txt \
--output_dir=exp/default/test_e2e \
-  --phones_dict=fastspeech2_mix_ckpt_1.2.0/phone_id_map.txt \
-  --speaker_dict=fastspeech2_mix_ckpt_1.2.0/speaker_id_map.txt \
-  --spk_id=174 \
--inference_dir=exp/default/inference
```
@@ -13,7 +13,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--inference_dir=${train_output_path}/inference \
--am=fastspeech2_mix \
--voc=pwgan_aishell3 \
-  --text=${BIN_DIR}/../sentences_mix.txt \
+  --text=${BIN_DIR}/../../assets/sentences_mix.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt \
--speaker_dict=dump/speaker_id_map.txt \
@@ -30,7 +30,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--inference_dir=${train_output_path}/inference \
--am=fastspeech2_mix \
--voc=hifigan_aishell3 \
-  --text=${BIN_DIR}/../sentences_mix.txt \
+  --text=${BIN_DIR}/../../assets/sentences_mix.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt \
--speaker_dict=dump/speaker_id_map.txt \
@@ -45,7 +45,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
--inference_dir=${train_output_path}/inference \
--am=fastspeech2_mix \
--voc=hifigan_csmsc \
-  --text=${BIN_DIR}/../sentences_mix.txt \
+  --text=${BIN_DIR}/../../assets/sentences_mix.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt \
--speaker_dict=dump/speaker_id_map.txt \
...
#!/bin/bash
exp=exp
mfa=$exp/mfa
mkdir -p $mfa
pushd $mfa
wget -c https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz &
wget -c https://paddlespeech.bj.bcebos.com/MFA/LJSpeech-1.1/ljspeech_alignment.tar.gz &
wget -c https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/with_tone/aishell3_alignment_tone.tar.gz &
wget -c https://paddlespeech.bj.bcebos.com/MFA/VCTK-Corpus-0.92/vctk_alignment.tar.gz &
wait
popd
#!/bin/bash
exp=exp
pretrain=$exp/pretrain
mkdir -p $pretrain
pushd $pretrain
wget -c https://paddlespeech.bj.bcebos.com/t2s/chinse_english_mixed/models/fastspeech2_mix_ckpt_1.2.0.zip &
wget -c https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_aishell3_ckpt_0.5.zip &
wait
popd
...@@ -13,7 +13,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then ...@@ -13,7 +13,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--am=fastspeech2_mix \ --am=fastspeech2_mix \
--voc=pwgan_aishell3 \ --voc=pwgan_aishell3 \
--output_dir=${train_output_path}/onnx_infer_out_e2e \ --output_dir=${train_output_path}/onnx_infer_out_e2e \
--text=${BIN_DIR}/../sentences_mix.txt \ --text=${BIN_DIR}/../../assets/sentences_mix.txt \
--phones_dict=dump/phone_id_map.txt \ --phones_dict=dump/phone_id_map.txt \
--device=cpu \ --device=cpu \
--cpu_threads=4 \ --cpu_threads=4 \
...@@ -31,7 +31,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then ...@@ -31,7 +31,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--am=fastspeech2_mix \ --am=fastspeech2_mix \
--voc=hifigan_aishell3 \ --voc=hifigan_aishell3 \
--output_dir=${train_output_path}/onnx_infer_out_e2e \ --output_dir=${train_output_path}/onnx_infer_out_e2e \
--text=${BIN_DIR}/../sentences_mix.txt \ --text=${BIN_DIR}/../../assets/sentences_mix.txt \
--phones_dict=dump/phone_id_map.txt \ --phones_dict=dump/phone_id_map.txt \
--device=cpu \ --device=cpu \
--cpu_threads=4 \ --cpu_threads=4 \
...@@ -45,7 +45,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then ...@@ -45,7 +45,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
--am=fastspeech2_mix \ --am=fastspeech2_mix \
--voc=hifigan_csmsc \ --voc=hifigan_csmsc \
--output_dir=${train_output_path}/onnx_infer_out_e2e \ --output_dir=${train_output_path}/onnx_infer_out_e2e \
--text=${BIN_DIR}/../sentences_mix.txt \ --text=${BIN_DIR}/../../assets/sentences_mix.txt \
--phones_dict=dump/phone_id_map.txt \ --phones_dict=dump/phone_id_map.txt \
--device=cpu \ --device=cpu \
--cpu_threads=4 \ --cpu_threads=4 \
......
...@@ -23,7 +23,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then ...@@ -23,7 +23,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--voc_ckpt=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \ --voc_ckpt=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \
--voc_stat=pwg_aishell3_ckpt_0.5/feats_stats.npy \ --voc_stat=pwg_aishell3_ckpt_0.5/feats_stats.npy \
--lang=mix \ --lang=mix \
--text=${BIN_DIR}/../sentences_mix.txt \ --text=${BIN_DIR}/../../assets/sentences_mix.txt \
--output_dir=${train_output_path}/test_e2e \ --output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \ --phones_dict=dump/phone_id_map.txt \
--speaker_dict=dump/speaker_id_map.txt \ --speaker_dict=dump/speaker_id_map.txt \
...@@ -48,7 +48,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then ...@@ -48,7 +48,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--voc_ckpt=hifigan_aishell3_ckpt_0.2.0/snapshot_iter_2500000.pdz \ --voc_ckpt=hifigan_aishell3_ckpt_0.2.0/snapshot_iter_2500000.pdz \
--voc_stat=hifigan_aishell3_ckpt_0.2.0/feats_stats.npy \ --voc_stat=hifigan_aishell3_ckpt_0.2.0/feats_stats.npy \
--lang=mix \ --lang=mix \
--text=${BIN_DIR}/../sentences_mix.txt \ --text=${BIN_DIR}/../../assets/sentences_mix.txt \
--output_dir=${train_output_path}/test_e2e \ --output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \ --phones_dict=dump/phone_id_map.txt \
--speaker_dict=dump/speaker_id_map.txt \ --speaker_dict=dump/speaker_id_map.txt \
...@@ -73,7 +73,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then ...@@ -73,7 +73,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
--voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \ --voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
--voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \ --voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=mix \ --lang=mix \
--text=${BIN_DIR}/../sentences_mix.txt \ --text=${BIN_DIR}/../../assets/sentences_mix.txt \
--output_dir=${train_output_path}/test_e2e \ --output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \ --phones_dict=dump/phone_id_map.txt \
--speaker_dict=dump/speaker_id_map.txt \ --speaker_dict=dump/speaker_id_map.txt \
......
...@@ -7,8 +7,8 @@ gpus=0,1 ...@@ -7,8 +7,8 @@ gpus=0,1
stage=0 stage=0
stop_stage=100 stop_stage=100
datasets_root_dir=~/datasets datasets_root_dir=./data
mfa_root_dir=./mfa_results/ mfa_root_dir=./data/mfa
conf_path=conf/default.yaml conf_path=conf/default.yaml
train_output_path=exp/default train_output_path=exp/default
ckpt_name=snapshot_iter_99200.pdz ckpt_name=snapshot_iter_99200.pdz
......
...@@ -28,7 +28,7 @@ from paddlespeech.server.utils.audio_process import float2pcm ...@@ -28,7 +28,7 @@ from paddlespeech.server.utils.audio_process import float2pcm
from paddlespeech.server.utils.onnx_infer import get_sess from paddlespeech.server.utils.onnx_infer import get_sess
from paddlespeech.server.utils.util import denorm from paddlespeech.server.utils.util import denorm
from paddlespeech.server.utils.util import get_chunks from paddlespeech.server.utils.util import get_chunks
from paddlespeech.t2s.frontend import English from paddlespeech.t2s.frontend.en_frontend import English
from paddlespeech.t2s.frontend.zh_frontend import Frontend from paddlespeech.t2s.frontend.zh_frontend import Frontend
__all__ = ['TTSEngine', 'PaddleTTSConnectionHandler'] __all__ = ['TTSEngine', 'PaddleTTSConnectionHandler']
......
...@@ -29,7 +29,7 @@ from paddlespeech.server.engine.base_engine import BaseEngine ...@@ -29,7 +29,7 @@ from paddlespeech.server.engine.base_engine import BaseEngine
from paddlespeech.server.utils.audio_process import float2pcm from paddlespeech.server.utils.audio_process import float2pcm
from paddlespeech.server.utils.util import denorm from paddlespeech.server.utils.util import denorm
from paddlespeech.server.utils.util import get_chunks from paddlespeech.server.utils.util import get_chunks
from paddlespeech.t2s.frontend import English from paddlespeech.t2s.frontend.en_frontend import English
from paddlespeech.t2s.frontend.zh_frontend import Frontend from paddlespeech.t2s.frontend.zh_frontend import Frontend
from paddlespeech.t2s.modules.normalizer import ZScore from paddlespeech.t2s.modules.normalizer import ZScore
......
...@@ -32,7 +32,7 @@ from paddlespeech.server.utils.errors import ErrorCode ...@@ -32,7 +32,7 @@ from paddlespeech.server.utils.errors import ErrorCode
from paddlespeech.server.utils.exception import ServerBaseException from paddlespeech.server.utils.exception import ServerBaseException
from paddlespeech.server.utils.paddle_predictor import init_predictor from paddlespeech.server.utils.paddle_predictor import init_predictor
from paddlespeech.server.utils.paddle_predictor import run_model from paddlespeech.server.utils.paddle_predictor import run_model
from paddlespeech.t2s.frontend import English from paddlespeech.t2s.frontend.en_frontend import English
from paddlespeech.t2s.frontend.zh_frontend import Frontend from paddlespeech.t2s.frontend.zh_frontend import Frontend
__all__ = ['TTSEngine', 'PaddleTTSConnectionHandler'] __all__ = ['TTSEngine', 'PaddleTTSConnectionHandler']
......
...@@ -18,6 +18,5 @@ from . import exps ...@@ -18,6 +18,5 @@ from . import exps
from . import frontend from . import frontend
from . import models from . import models
from . import modules from . import modules
from . import ssml
from . import training from . import training
from . import utils from . import utils
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
...@@ -5,4 +5,5 @@ ...@@ -5,4 +5,5 @@
005 Paddle Bo Bo: 使用 Paddle Speech 的语音合成模块生成虚拟人的声音。 005 Paddle Bo Bo: 使用 Paddle Speech 的语音合成模块生成虚拟人的声音。
006 热烈欢迎您在 Discussions 中提交问题,并在 Issues 中指出发现的 bug。此外,我们非常希望您参与到 Paddle Speech 的开发中! 006 热烈欢迎您在 Discussions 中提交问题,并在 Issues 中指出发现的 bug。此外,我们非常希望您参与到 Paddle Speech 的开发中!
007 我喜欢 eat apple, 你喜欢 drink milk。 007 我喜欢 eat apple, 你喜欢 drink milk。
008 我们要去云南 team building, 非常非常 happy. 008 我们要去云南 team building, 非常非常 happy.
\ No newline at end of file 009 AI for Science 平台。
\ No newline at end of file
...@@ -33,8 +33,8 @@ from yacs.config import CfgNode ...@@ -33,8 +33,8 @@ from yacs.config import CfgNode
from paddlespeech.t2s.datasets.am_batch_fn import * from paddlespeech.t2s.datasets.am_batch_fn import *
from paddlespeech.t2s.datasets.data_table import DataTable from paddlespeech.t2s.datasets.data_table import DataTable
from paddlespeech.t2s.datasets.vocoder_batch_fn import Clip_static from paddlespeech.t2s.datasets.vocoder_batch_fn import Clip_static
from paddlespeech.t2s.frontend import English
from paddlespeech.t2s.frontend.canton_frontend import CantonFrontend from paddlespeech.t2s.frontend.canton_frontend import CantonFrontend
from paddlespeech.t2s.frontend.en_frontend import English
from paddlespeech.t2s.frontend.mix_frontend import MixFrontend from paddlespeech.t2s.frontend.mix_frontend import MixFrontend
from paddlespeech.t2s.frontend.sing_frontend import SingFrontend from paddlespeech.t2s.frontend.sing_frontend import SingFrontend
from paddlespeech.t2s.frontend.zh_frontend import Frontend from paddlespeech.t2s.frontend.zh_frontend import Frontend
...@@ -99,14 +99,23 @@ def norm(data, mean, std): ...@@ -99,14 +99,23 @@ def norm(data, mean, std):
return (data - mean) / std return (data - mean) / std
def get_chunks(data, block_size: int, pad_size: int): def get_chunks(mel, chunk_size: int, pad_size: int):
data_len = data.shape[1] """
Split mel by chunk size with left and right context.
Args:
mel (paddle.Tensor): mel spectrogram, shape (B, T, D)
chunk_size (int): chunk size
pad_size (int): size for left and right context.
"""
T = mel.shape[1]
n = math.ceil(T / chunk_size)
chunks = [] chunks = []
n = math.ceil(data_len / block_size)
for i in range(n): for i in range(n):
start = max(0, i * block_size - pad_size) start = max(0, i * chunk_size - pad_size)
end = min((i + 1) * block_size + pad_size, data_len) end = min((i + 1) * chunk_size + pad_size, T)
chunks.append(data[:, start:end, :]) chunks.append(mel[:, start:end, :])
return chunks return chunks
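The renamed `get_chunks` splits the mel spectrogram along the time axis, padding each chunk with left/right context frames. A plain-Python sketch (nested lists stand in for a paddle tensor; names follow the diff):

```python
import math

def get_chunks(mel, chunk_size, pad_size):
    """Split a (B, T, D)-shaped nested list along T, keeping pad_size
    frames of left/right context around each chunk."""
    T = len(mel[0])
    n = math.ceil(T / chunk_size)
    chunks = []
    for i in range(n):
        start = max(0, i * chunk_size - pad_size)
        end = min((i + 1) * chunk_size + pad_size, T)
        chunks.append([row[start:end] for row in mel])
    return chunks

# 1 batch, 10 frames, 1-dim "mel" whose value equals its frame index
mel = [[[t] for t in range(10)]]
chunks = get_chunks(mel, chunk_size=4, pad_size=1)
```

With `chunk_size=4` and `pad_size=1`, frames 0..9 split into [0..4], [3..8], [7..9]: adjacent chunks overlap by the context frames, which the streaming vocoder later trims.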
...@@ -117,14 +126,10 @@ def get_sentences(text_file: Optional[os.PathLike], lang: str='zh'): ...@@ -117,14 +126,10 @@ def get_sentences(text_file: Optional[os.PathLike], lang: str='zh'):
with open(text_file, 'rt', encoding='utf-8') as f: with open(text_file, 'rt', encoding='utf-8') as f:
for line in f: for line in f:
if line.strip() != "": if line.strip() != "":
items = re.split(r"\s+", line.strip(), 1) items = re.split(r"\s+", line.strip(), maxsplit=1)
assert len(items) == 2
utt_id = items[0] utt_id = items[0]
if lang in {'zh', 'canton'}: sentence = items[1]
sentence = "".join(items[1:])
elif lang == 'en':
sentence = " ".join(items[1:])
elif lang == 'mix':
sentence = " ".join(items[1:])
sentences.append((utt_id, sentence)) sentences.append((utt_id, sentence))
return sentences return sentences
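The simplified `get_sentences` parsing drops the per-language branches: every line is split once on the first whitespace run, so the sentence part may itself contain spaces. A sketch of the per-line behavior:

```python
import re

def parse_line(line):
    # split "<utt_id> <sentence>" on the first run of whitespace only
    items = re.split(r"\s+", line.strip(), maxsplit=1)
    assert len(items) == 2
    return items[0], items[1]

utt_id, sentence = parse_line("008 我们要去云南 team building, 非常非常 happy.")
```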
...@@ -319,6 +324,7 @@ def run_frontend( ...@@ -319,6 +324,7 @@ def run_frontend(
input_ids = {} input_ids = {}
if text.strip() != "" and re.match(r".*?<speak>.*?</speak>.*", text, if text.strip() != "" and re.match(r".*?<speak>.*?</speak>.*", text,
re.DOTALL): re.DOTALL):
# using ssml
input_ids = frontend.get_input_ids_ssml( input_ids = frontend.get_input_ids_ssml(
text, text,
merge_sentences=merge_sentences, merge_sentences=merge_sentences,
...@@ -359,6 +365,7 @@ def run_frontend( ...@@ -359,6 +365,7 @@ def run_frontend(
outs.update({'is_slurs': is_slurs}) outs.update({'is_slurs': is_slurs})
else: else:
print("lang should be in {'zh', 'en', 'mix', 'canton', 'sing'}!") print("lang should be in {'zh', 'en', 'mix', 'canton', 'sing'}!")
outs.update({'phone_ids': phone_ids}) outs.update({'phone_ids': phone_ids})
return outs return outs
......
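The SSML branch in `run_frontend` fires whenever the text contains a `<speak>…</speak>` pair; `re.DOTALL` lets the pair span newlines, which is what makes the long-text OOM fix work on multi-line SSML input. The check can be exercised in isolation:

```python
import re

def is_ssml(text):
    # mirrors the run_frontend condition from the diff
    return (text.strip() != "" and
            re.match(r".*?<speak>.*?</speak>.*", text, re.DOTALL) is not None)

assert is_ssml("<speak>你好<say-as pinyin='hao3'>好</say-as></speak>")
assert is_ssml("prefix <speak>line one\nline two</speak> suffix")
assert not is_ssml("plain text without ssml")
```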
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
# limitations under the License. # limitations under the License.
import argparse import argparse
from pathlib import Path from pathlib import Path
from pprint import pprint
import paddle import paddle
import soundfile as sf import soundfile as sf
...@@ -78,6 +79,7 @@ def evaluate(args): ...@@ -78,6 +79,7 @@ def evaluate(args):
# whether dygraph to static # whether dygraph to static
if args.inference_dir: if args.inference_dir:
print("convert am and voc to static model.")
# acoustic model # acoustic model
am_inference = am_to_static( am_inference = am_to_static(
am_inference=am_inference, am_inference=am_inference,
...@@ -92,6 +94,7 @@ def evaluate(args): ...@@ -92,6 +94,7 @@ def evaluate(args):
output_dir = Path(args.output_dir) output_dir = Path(args.output_dir)
output_dir.mkdir(parents=True, exist_ok=True) output_dir.mkdir(parents=True, exist_ok=True)
merge_sentences = False merge_sentences = False
# Avoid not stopping at the end of a sub sentence when tacotron2_ljspeech dygraph to static graph # Avoid not stopping at the end of a sub sentence when tacotron2_ljspeech dygraph to static graph
# but still not stopping in the end (NOTE by yuantian01 Feb 9 2022) # but still not stopping in the end (NOTE by yuantian01 Feb 9 2022)
...@@ -102,13 +105,19 @@ def evaluate(args): ...@@ -102,13 +105,19 @@ def evaluate(args):
if am_name == 'speedyspeech': if am_name == 'speedyspeech':
get_tone_ids = True get_tone_ids = True
# wav samples
N = 0 N = 0
# inference time cost
T = 0 T = 0
# [(uid, text), ]
if am_name == 'diffsinger': if am_name == 'diffsinger':
sentences = get_sentences_svs(text_file=args.text) sentences = get_sentences_svs(text_file=args.text)
else: else:
sentences = get_sentences(text_file=args.text, lang=args.lang) sentences = get_sentences(text_file=args.text, lang=args.lang)
for utt_id, sentence in sentences: for utt_id, sentence in sentences:
print(f"{utt_id} {sentence}")
with timer() as t: with timer() as t:
if am_name == "diffsinger": if am_name == "diffsinger":
text = "" text = ""
...@@ -116,6 +125,8 @@ def evaluate(args): ...@@ -116,6 +125,8 @@ def evaluate(args):
else: else:
text = sentence text = sentence
svs_input = None svs_input = None
# frontend
frontend_dict = run_frontend( frontend_dict = run_frontend(
frontend=frontend, frontend=frontend,
text=text, text=text,
...@@ -124,25 +135,33 @@ def evaluate(args): ...@@ -124,25 +135,33 @@ def evaluate(args):
lang=args.lang, lang=args.lang,
svs_input=svs_input) svs_input=svs_input)
phone_ids = frontend_dict['phone_ids'] phone_ids = frontend_dict['phone_ids']
# pprint(f"{utt_id} {phone_ids}")
with paddle.no_grad(): with paddle.no_grad():
flags = 0 flags = 0
for i in range(len(phone_ids)): for i in range(len(phone_ids)):
# sub phone, split by `sp` or punctuation.
part_phone_ids = phone_ids[i] part_phone_ids = phone_ids[i]
# acoustic model # acoustic model
if am_name == 'fastspeech2': if am_name == 'fastspeech2':
# multi speaker # multi speaker
if am_dataset in {"aishell3", "vctk", "mix", "canton"}: if am_dataset in {"aishell3", "vctk", "mix", "canton"}:
spk_id = paddle.to_tensor(args.spk_id) # multi-speaker
spk_id = paddle.to_tensor([args.spk_id])
mel = am_inference(part_phone_ids, spk_id) mel = am_inference(part_phone_ids, spk_id)
else: else:
# single-speaker
mel = am_inference(part_phone_ids) mel = am_inference(part_phone_ids)
elif am_name == 'speedyspeech': elif am_name == 'speedyspeech':
part_tone_ids = frontend_dict['tone_ids'][i] part_tone_ids = frontend_dict['tone_ids'][i]
if am_dataset in {"aishell3", "vctk", "mix"}: if am_dataset in {"aishell3", "vctk", "mix"}:
spk_id = paddle.to_tensor(args.spk_id) # multi-speaker
spk_id = paddle.to_tensor([args.spk_id])
mel = am_inference(part_phone_ids, part_tone_ids, mel = am_inference(part_phone_ids, part_tone_ids,
spk_id) spk_id)
else: else:
# single-speaker
mel = am_inference(part_phone_ids, part_tone_ids) mel = am_inference(part_phone_ids, part_tone_ids)
elif am_name == 'tacotron2': elif am_name == 'tacotron2':
mel = am_inference(part_phone_ids) mel = am_inference(part_phone_ids)
...@@ -155,6 +174,7 @@ def evaluate(args): ...@@ -155,6 +174,7 @@ def evaluate(args):
note=part_note_ids, note=part_note_ids,
note_dur=part_note_durs, note_dur=part_note_durs,
is_slur=part_is_slurs, ) is_slur=part_is_slurs, )
# vocoder # vocoder
wav = voc_inference(mel) wav = voc_inference(mel)
if flags == 0: if flags == 0:
...@@ -162,17 +182,23 @@ def evaluate(args): ...@@ -162,17 +182,23 @@ def evaluate(args):
flags = 1 flags = 1
else: else:
wav_all = paddle.concat([wav_all, wav]) wav_all = paddle.concat([wav_all, wav])
wav = wav_all.numpy() wav = wav_all.numpy()
N += wav.size N += wav.size
T += t.elapse T += t.elapse
# samples per second
speed = wav.size / t.elapse speed = wav.size / t.elapse
# generating one second of wav takes `RTF` seconds
rtf = am_config.fs / speed rtf = am_config.fs / speed
print( print(
f"{utt_id}, mel: {mel.shape}, wave: {wav.shape}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}." f"{utt_id}, mel: {mel.shape}, wave: {wav.shape}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}."
) )
sf.write( sf.write(
str(output_dir / (utt_id + ".wav")), wav, samplerate=am_config.fs) str(output_dir / (utt_id + ".wav")), wav, samplerate=am_config.fs)
print(f"{utt_id} done!") print(f"{utt_id} done!")
print(f"generation speed: {N / T}Hz, RTF: {am_config.fs / (N / T) }") print(f"generation speed: {N / T}Hz, RTF: {am_config.fs / (N / T) }")
......
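The new `speed`/`rtf` comments reduce to simple arithmetic: `speed` is samples generated per wall-clock second, and RTF is compute seconds per second of audio. A sketch with a hypothetical 24 kHz config and made-up timings:

```python
fs = 24000          # sample rate (assumed, e.g. aishell3 models)
wav_size = 48000    # 2 s of generated audio
elapse = 0.5        # inference wall-clock time in seconds (made up)

speed = wav_size / elapse   # samples generated per second
rtf = fs / speed            # seconds of compute per second of audio

assert speed == 96000.0
assert rtf == 0.25          # 4x faster than real time
```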
...@@ -27,7 +27,7 @@ import yaml ...@@ -27,7 +27,7 @@ import yaml
from yacs.config import CfgNode as Configuration from yacs.config import CfgNode as Configuration
from paddlespeech.t2s.datasets.get_feats import LogMelFBank from paddlespeech.t2s.datasets.get_feats import LogMelFBank
from paddlespeech.t2s.frontend import English from paddlespeech.t2s.frontend.en_frontend import English
def get_lj_sentences(file_name, frontend): def get_lj_sentences(file_name, frontend):
......
...@@ -21,7 +21,7 @@ import soundfile as sf ...@@ -21,7 +21,7 @@ import soundfile as sf
import yaml import yaml
from yacs.config import CfgNode from yacs.config import CfgNode
from paddlespeech.t2s.frontend import English from paddlespeech.t2s.frontend.en_frontend import English
from paddlespeech.t2s.models.transformer_tts import TransformerTTS from paddlespeech.t2s.models.transformer_tts import TransformerTTS
from paddlespeech.t2s.models.transformer_tts import TransformerTTSInference from paddlespeech.t2s.models.transformer_tts import TransformerTTSInference
from paddlespeech.t2s.models.waveflow import ConditionalWaveFlow from paddlespeech.t2s.models.waveflow import ConditionalWaveFlow
......
...@@ -13,8 +13,8 @@ ...@@ -13,8 +13,8 @@
# limitations under the License. # limitations under the License.
from .generate_lexicon import * from .generate_lexicon import *
from .normalizer import * from .normalizer import *
from .phonectic import *
from .punctuation import * from .punctuation import *
from .ssml import *
from .tone_sandhi import * from .tone_sandhi import *
from .vocab import * from .vocab import *
from .zh_normalization import * from .zh_normalization import *
...@@ -11,7 +11,6 @@ ...@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from paddlespeech.t2s.frontend.phonectic import Phonetics
""" """
A phonology system with ARPABET symbols and limited punctuations. The G2P A phonology system with ARPABET symbols and limited punctuations. The G2P
conversion is done by g2p_en. conversion is done by g2p_en.
...@@ -19,55 +18,68 @@ conversion is done by g2p_en. ...@@ -19,55 +18,68 @@ conversion is done by g2p_en.
Note that g2p_en does not handle words with hyphens well. So make sure the input Note that g2p_en does not handle words with hyphens well. So make sure the input
sentence is first normalized. sentence is first normalized.
""" """
from paddlespeech.t2s.frontend.vocab import Vocab
from g2p_en import G2p from g2p_en import G2p
from paddlespeech.t2s.frontend.phonectic import Phonetics
from paddlespeech.t2s.frontend.vocab import Vocab
class ARPABET(Phonetics): class ARPABET(Phonetics):
"""A phonology for English that uses ARPABET as the phoneme vocabulary. """A phonology for English that uses ARPABET without stress as the phoneme vocabulary.
47 symbols = 39 phones + 4 punctuations + 4 special tokens (<pad> <unk> <s> </s>)
The current phoneme set contains 39 phonemes; vowels carry a lexical stress marker:
0 — No stress
1 — Primary stress
2 — Secondary stress
Phoneme Set:
Phoneme Example Translation
------- ------- -----------
AA odd AA D
AE at AE T
AH hut HH AH T
AO ought AO T
AW cow K AW
AY hide HH AY D
B be B IY
CH cheese CH IY Z
D dee D IY
DH thee DH IY
EH Ed EH D
ER hurt HH ER T
EY ate EY T
F fee F IY
G green G R IY N
HH he HH IY
IH it IH T
IY eat IY T
JH gee JH IY
K key K IY
L lee L IY
M me M IY
N knee N IY
NG ping P IH NG
OW oat OW T
OY toy T OY
P pee P IY
R read R IY D
S sea S IY
SH she SH IY
T tea T IY
TH theta TH EY T AH
UH hood HH UH D
UW two T UW
V vee V IY
W we W IY
Y yield Y IY L D
Z zee Z IY
ZH seizure S IY ZH ER
See http://www.speech.cs.cmu.edu/cgi-bin/cmudict for more details. See http://www.speech.cs.cmu.edu/cgi-bin/cmudict for more details.
Phoneme Example Translation
------- ------- -----------
AA odd AA D
AE at AE T
AH hut HH AH T
AO ought AO T
AW cow K AW
AY hide HH AY D
B be B IY
CH cheese CH IY Z
D dee D IY
DH thee DH IY
EH Ed EH D
ER hurt HH ER T
EY ate EY T
F fee F IY
G green G R IY N
HH he HH IY
IH it IH T
IY eat IY T
JH gee JH IY
K key K IY
L lee L IY
M me M IY
N knee N IY
NG ping P IH NG
OW oat OW T
OY toy T OY
P pee P IY
R read R IY D
S sea S IY
SH she SH IY
T tea T IY
TH theta TH EY T AH
UH hood HH UH D
UW two T UW
V vee V IY
W we W IY
Y yield Y IY L D
Z zee Z IY
ZH seizure S IY ZH ER
""" """
# 39 phonemes
phonemes = [ phonemes = [
'AA', 'AE', 'AH', 'AO', 'AW', 'AY', 'B', 'CH', 'D', 'DH', 'EH', 'ER', 'AA', 'AE', 'AH', 'AO', 'AW', 'AY', 'B', 'CH', 'D', 'DH', 'EH', 'ER',
'EY', 'F', 'G', 'HH', 'IH', 'IY', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW', 'EY', 'F', 'G', 'HH', 'IH', 'IY', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW',
...@@ -76,6 +88,8 @@ class ARPABET(Phonetics): ...@@ -76,6 +88,8 @@ class ARPABET(Phonetics):
] ]
punctuations = [',', '.', '?', '!'] punctuations = [',', '.', '?', '!']
symbols = phonemes + punctuations symbols = phonemes + punctuations
# vowels carry a lexical stress marker:
# 0 unstressed, 1 primary stress and 2 secondary stress
_stress_to_no_stress_ = { _stress_to_no_stress_ = {
'AA0': 'AA', 'AA0': 'AA',
'AA1': 'AA', 'AA1': 'AA',
...@@ -124,7 +138,12 @@ class ARPABET(Phonetics): ...@@ -124,7 +138,12 @@ class ARPABET(Phonetics):
'UW2': 'UW' 'UW2': 'UW'
} }
def __repr__(self):
fmt = "ARPABETWithoutStress(phonemes: {}, punctuations: {})"
return fmt.format(len(self.phonemes), self.punctuations)
def __init__(self): def __init__(self):
# https://github.com/Kyubyong/g2p/blob/master/g2p_en/g2p.py
self.backend = G2p() self.backend = G2p()
self.vocab = Vocab(self.phonemes + self.punctuations) self.vocab = Vocab(self.phonemes + self.punctuations)
...@@ -139,6 +158,7 @@ class ARPABET(Phonetics): ...@@ -139,6 +158,7 @@ class ARPABET(Phonetics):
Returns: Returns:
List[str]: The list of pronunciation sequence. List[str]: The list of pronunciation sequence.
""" """
# g2p and remove vowel stress
phonemes = [ phonemes = [
self._remove_vowels(item) for item in self.backend(sentence) self._remove_vowels(item) for item in self.backend(sentence)
] ]
...@@ -158,6 +178,7 @@ class ARPABET(Phonetics): ...@@ -158,6 +178,7 @@ class ARPABET(Phonetics):
Returns: Returns:
List[int]: The list of pronunciation id sequence. List[int]: The list of pronunciation id sequence.
""" """
# phonemes to ids
ids = [self.vocab.lookup(item) for item in phonemes] ids = [self.vocab.lookup(item) for item in phonemes]
return ids return ids
...@@ -189,11 +210,16 @@ class ARPABET(Phonetics): ...@@ -189,11 +210,16 @@ class ARPABET(Phonetics):
def vocab_size(self): def vocab_size(self):
""" Vocab size. """ Vocab size.
""" """
# 47 = 39 phones + 4 punctuations + 4 special tokens # 47 = 39 phones + 4 punctuations + 4 special tokens(<pad> <unk> <s> </s>)
return len(self.vocab) return len(self.vocab)
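The `_stress_to_no_stress_` table maps each stressed vowel (e.g. `AA0`, `AA1`, `AA2`) to its bare form; it is equivalent to dropping the trailing stress digit, which this sketch does directly:

```python
def remove_stress(phone):
    # 'AA0'/'AA1'/'AA2' -> 'AA'; consonants and punctuation pass through
    return phone[:-1] if phone[-1] in "012" else phone

phones = ["HH", "AH0", "T", ","]   # g2p_en-style output for "hut,"
assert [remove_stress(p) for p in phones] == ["HH", "AH", "T", ","]
```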
class ARPABETWithStress(Phonetics): class ARPABETWithStress(Phonetics):
"""
A phonology for English that uses ARPABET with stress as the phoneme vocabulary.
77 symbols = 69 phones + 4 punctuations + 4 special tokens
"""
phonemes = [ phonemes = [
'AA0', 'AA1', 'AA2', 'AE0', 'AE1', 'AE2', 'AH0', 'AH1', 'AH2', 'AO0', 'AA0', 'AA1', 'AA2', 'AE0', 'AE1', 'AE2', 'AH0', 'AH1', 'AH2', 'AO0',
'AO1', 'AO2', 'AW0', 'AW1', 'AW2', 'AY0', 'AY1', 'AY2', 'B', 'CH', 'D', 'AO1', 'AO2', 'AW0', 'AW1', 'AW2', 'AY0', 'AY1', 'AY2', 'B', 'CH', 'D',
...@@ -206,6 +232,10 @@ class ARPABETWithStress(Phonetics): ...@@ -206,6 +232,10 @@ class ARPABETWithStress(Phonetics):
punctuations = [',', '.', '?', '!'] punctuations = [',', '.', '?', '!']
symbols = phonemes + punctuations symbols = phonemes + punctuations
def __repr__(self):
fmt = "ARPABETWithStress(phonemes: {}, punctuations: {})"
return fmt.format(len(self.phonemes), self.punctuations)
def __init__(self): def __init__(self):
self.backend = G2p() self.backend = G2p()
self.vocab = Vocab(self.phonemes + self.punctuations) self.vocab = Vocab(self.phonemes + self.punctuations)
......
...@@ -29,7 +29,8 @@ INITIALS = [ ...@@ -29,7 +29,8 @@ INITIALS = [
INITIALS += ['sp', 'spl', 'spn', 'sil'] INITIALS += ['sp', 'spl', 'spn', 'sil']
def get_lines(cantons: List[str]): def jyuping_to_phonemes(cantons: List[str]):
# jyutping to initial and final
phones = [] phones = []
for canton in cantons: for canton in cantons:
for consonant in INITIALS: for consonant in INITIALS:
...@@ -47,7 +48,7 @@ def get_lines(cantons: List[str]): ...@@ -47,7 +48,7 @@ def get_lines(cantons: List[str]):
class CantonFrontend(): class CantonFrontend():
def __init__(self, phone_vocab_path: str): def __init__(self, phone_vocab_path: str):
self.text_normalizer = TextNormalizer() self.text_normalizer = TextNormalizer()
self.punc = ":,;。?!“”‘’':,;.?!" self.punc = ":,;。?!“”‘’':,;.?!"
self.vocab_phones = {} self.vocab_phones = {}
if phone_vocab_path: if phone_vocab_path:
...@@ -61,8 +62,11 @@ class CantonFrontend(): ...@@ -61,8 +62,11 @@ class CantonFrontend():
merge_sentences: bool=True) -> List[List[str]]: merge_sentences: bool=True) -> List[List[str]]:
phones_list = [] phones_list = []
for sentence in sentences: for sentence in sentences:
# jyuping
# 'gam3 ngaam1 lou5 sai3 jiu1 kau4 keoi5 dang2 zan6 jiu3 hoi1 wui2, zing6 dai1 ge2 je5 ngo5 wui5 gaau2 dim6 ga3 laa3.'
phones_str = ToJyutping.get_jyutping_text(sentence) phones_str = ToJyutping.get_jyutping_text(sentence)
phones_split = get_lines(phones_str.split(' ')) # phonemes
phones_split = jyuping_to_phonemes(phones_str.split(' '))
phones_list.append(phones_split) phones_list.append(phones_split)
return phones_list return phones_list
...@@ -78,8 +82,11 @@ class CantonFrontend(): ...@@ -78,8 +82,11 @@ class CantonFrontend():
sentence: str, sentence: str,
merge_sentences: bool=True, merge_sentences: bool=True,
print_info: bool=False) -> List[List[str]]: print_info: bool=False) -> List[List[str]]:
# TN & Text Segmentation
sentences = self.text_normalizer.normalize(sentence) sentences = self.text_normalizer.normalize(sentence)
# G2P
phonemes = self._g2p(sentences, merge_sentences=merge_sentences) phonemes = self._g2p(sentences, merge_sentences=merge_sentences)
if print_info: if print_info:
print("----------------------------") print("----------------------------")
print("text norm results:") print("text norm results:")
...@@ -88,6 +95,7 @@ class CantonFrontend(): ...@@ -88,6 +95,7 @@ class CantonFrontend():
print("g2p results:") print("g2p results:")
print(phonemes) print(phonemes)
print("----------------------------") print("----------------------------")
return phonemes return phonemes
def get_input_ids(self, def get_input_ids(self,
...@@ -98,9 +106,9 @@ class CantonFrontend(): ...@@ -98,9 +106,9 @@ class CantonFrontend():
phonemes = self.get_phonemes( phonemes = self.get_phonemes(
sentence, merge_sentences=merge_sentences, print_info=print_info) sentence, merge_sentences=merge_sentences, print_info=print_info)
result = {} result = {}
temp_phone_ids = [] temp_phone_ids = []
for phones in phonemes: for phones in phonemes:
if phones: if phones:
phone_ids = self._p2id(phones) phone_ids = self._p2id(phones)
...@@ -108,6 +116,8 @@ class CantonFrontend(): ...@@ -108,6 +116,8 @@ class CantonFrontend():
if to_tensor: if to_tensor:
phone_ids = paddle.to_tensor(phone_ids) phone_ids = paddle.to_tensor(phone_ids)
temp_phone_ids.append(phone_ids) temp_phone_ids.append(phone_ids)
if temp_phone_ids: if temp_phone_ids:
result["phone_ids"] = temp_phone_ids result["phone_ids"] = temp_phone_ids
return result return result
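The renamed `jyuping_to_phonemes` splits each Jyutping syllable into initial + final by matching a consonant prefix against `INITIALS`; the tone digit stays attached to the final. A reduced sketch with a tiny hypothetical initial list (longest match first):

```python
# small illustrative subset of Cantonese initials (not the full list)
INITIALS = ["gw", "g", "ng", "n", "z", "l", "h", "d", "j", "k"]

def split_syllable(canton):
    # longest matching initial wins; the remainder (with tone) is the final
    for consonant in sorted(INITIALS, key=len, reverse=True):
        if canton.startswith(consonant) and canton != consonant:
            return [consonant, canton[len(consonant):]]
    return [canton]

assert split_syllable("gam3") == ["g", "am3"]
assert split_syllable("ngaam1") == ["ng", "aam1"]
assert split_syllable("hoi1") == ["h", "oi1"]
```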
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .phonectic import English
...@@ -18,9 +18,9 @@ from typing import List ...@@ -18,9 +18,9 @@ from typing import List
import numpy as np import numpy as np
import paddle import paddle
from paddlespeech.t2s.frontend import English from paddlespeech.t2s.frontend.en_frontend import English as EnFrontend
from paddlespeech.t2s.frontend.zh_frontend import Frontend from paddlespeech.t2s.frontend.ssml.xml_processor import MixTextProcessor
from paddlespeech.t2s.ssml.xml_processor import MixTextProcessor from paddlespeech.t2s.frontend.zh_frontend import Frontend as ZhFrontend
class MixFrontend(): class MixFrontend():
...@@ -28,10 +28,9 @@ class MixFrontend(): ...@@ -28,10 +28,9 @@ class MixFrontend():
g2p_model="pypinyin", g2p_model="pypinyin",
phone_vocab_path=None, phone_vocab_path=None,
tone_vocab_path=None): tone_vocab_path=None):
self.zh_frontend = ZhFrontend(
self.zh_frontend = Frontend(
phone_vocab_path=phone_vocab_path, tone_vocab_path=tone_vocab_path) phone_vocab_path=phone_vocab_path, tone_vocab_path=tone_vocab_path)
self.en_frontend = English(phone_vocab_path=phone_vocab_path) self.en_frontend = EnFrontend(phone_vocab_path=phone_vocab_path)
self.sp_id = self.zh_frontend.vocab_phones["sp"] self.sp_id = self.zh_frontend.vocab_phones["sp"]
self.sp_id_numpy = np.array([self.sp_id]) self.sp_id_numpy = np.array([self.sp_id])
self.sp_id_tensor = paddle.to_tensor([self.sp_id]) self.sp_id_tensor = paddle.to_tensor([self.sp_id])
...@@ -55,15 +54,12 @@ class MixFrontend(): ...@@ -55,15 +54,12 @@ class MixFrontend():
else: else:
return False return False
def get_segment(self, text: str) -> List[str]: def split_by_lang(self, text: str) -> List[str]:
# sentence --> [ch_part, en_part, ch_part, ...] # sentence --> [ch_part, en_part, ch_part, ...]
segments = [] segments = []
types = [] types = []
flag = 0
temp_seg = ""
temp_lang = ""
# Determine the type of each character. type: blank, chinese, alphabet, number, unk and point. # Determine the type of each character. type: chinese, alphabet, other.
for ch in text: for ch in text:
if self.is_chinese(ch): if self.is_chinese(ch):
types.append("zh") types.append("zh")
...@@ -74,31 +70,31 @@ class MixFrontend():
        assert len(types) == len(text)

        flag = 0
        temp_seg = ""
        temp_lang = ""

        for i in range(len(text)):
            # find the first char of the seg
            if flag == 0:
                temp_seg += text[i]
                temp_lang = types[i]
                flag = 1
            else:
                if temp_lang == "other":
                    # the text does not start with a language segment
                    temp_seg += text[i]
                    if types[i] != temp_lang:
                        temp_lang = types[i]
                else:
                    if types[i] == temp_lang or types[i] == "other":
                        # merge same lang or other
                        temp_seg += text[i]
                    else:
                        # change lang
                        segments.append((temp_seg, temp_lang))
                        temp_seg = text[i]
                        temp_lang = types[i]  # new lang

        segments.append((temp_seg, temp_lang))
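The character-merging loop above can be sketched as a standalone function. This is a minimal, illustrative sketch: the hypothetical `char_type` helper only checks the CJK Unified Ideographs block, while the real `is_chinese` class method covers more cases.

```python
def split_by_lang(text):
    # classify each character as zh, en (alphabet) or other
    def char_type(ch):
        if '\u4e00' <= ch <= '\u9fff':
            return "zh"
        if ch.isalpha():
            return "en"
        return "other"

    types = [char_type(ch) for ch in text]
    segments = []
    temp_seg, temp_lang, flag = "", "", 0
    for i in range(len(text)):
        if flag == 0:
            # first char starts the first segment
            temp_seg += text[i]
            temp_lang = types[i]
            flag = 1
        elif temp_lang == "other":
            # segment so far is only punctuation/space: absorb and adopt lang
            temp_seg += text[i]
            if types[i] != temp_lang:
                temp_lang = types[i]
        elif types[i] == temp_lang or types[i] == "other":
            # same lang, or punctuation glued to the current segment
            temp_seg += text[i]
        else:
            # language change: close the segment and start a new one
            segments.append((temp_seg, temp_lang))
            temp_seg, temp_lang = text[i], types[i]
    segments.append((temp_seg, temp_lang))
    return segments

print(split_by_lang("hello, 我爱北京"))
# [('hello, ', 'en'), ('我爱北京', 'zh')]
```

Note how trailing punctuation and spaces stay attached to the preceding language segment rather than forming their own.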
...@@ -110,76 +106,95 @@ class MixFrontend():
            get_tone_ids: bool=False,
            add_sp: bool=True,
            to_tensor: bool=True) -> Dict[str, List[paddle.Tensor]]:
        # XML Document Object Model (DOM)
        doms = MixTextProcessor.get_dom_split(sentence)

        lang_splits = []
        for dom in doms:
            if dom.lower().startswith("<say-as pinyin="):
                # `<say-as pinyin=` marks a zh segment
                lang_splits.append((dom, "zh"))
            else:
                # process zh, en and mixed zh/en text
                lang_splits.extend(self.split_by_lang(dom))

        # merge adjacent zh segments to avoid pauses in the synthesized result
        segments = []
        currentSeg = ["", ""]
        for seg in lang_splits:
            if seg[1] == "en" or seg[1] == "other":
                if currentSeg[0] == '':
                    # first segment seen
                    segments.append(seg)
                else:
                    # close the pending zh run
                    currentSeg[0] = "<speak>" + currentSeg[0] + "</speak>"
                    segments.append(tuple(currentSeg))
                    # en
                    segments.append(seg)
                    # reset
                    currentSeg = ["", ""]
            else:
                # zh
                if currentSeg[0] == '':
                    # first segment seen
                    currentSeg[0] = seg[0]
                    currentSeg[1] = seg[1]
                else:
                    # merge zh
                    currentSeg[0] = currentSeg[0] + seg[0]
        if currentSeg[0] != '':
            # last zh run
            currentSeg[0] = "<speak>" + currentSeg[0] + "</speak>"
            segments.append(tuple(currentSeg))

        phones_list = []
        result = {}

        # 008 我们要去云南 team building, 非常非常 happy.
        # seg ('我们要去云南 ', 'zh')
        # seg ('team building, ', 'en')
        # seg ('非常非常 ', 'zh')
        # seg ('happy.', 'en')
        # [('<speak>我们要去云南 </speak>', 'zh'), ('team building, ', 'en'), ('<speak>非常非常 </speak>', 'zh'), ('happy.', 'en')]
        for seg in segments:
            content = seg[0]
            lang = seg[1]

            if not content:
                continue

            if lang == "en":
                input_ids = self.en_frontend.get_input_ids(
                    content, merge_sentences=False, to_tensor=to_tensor)
            else:
                if content.strip() != "" and \
                        re.match(r".*?<speak>.*?</speak>.*", content, re.DOTALL):
                    # process ssml
                    input_ids = self.zh_frontend.get_input_ids_ssml(
                        content,
                        merge_sentences=False,
                        get_tone_ids=get_tone_ids,
                        to_tensor=to_tensor)
                else:
                    # process plain text
                    input_ids = self.zh_frontend.get_input_ids(
                        content,
                        merge_sentences=False,
                        get_tone_ids=get_tone_ids,
                        to_tensor=to_tensor)

            if add_sp:
                # add sp between zh and en
                if to_tensor:
                    input_ids["phone_ids"][-1] = paddle.concat(
                        [input_ids["phone_ids"][-1], self.sp_id_tensor])
                else:
                    input_ids["phone_ids"][-1] = np.concatenate(
                        (input_ids["phone_ids"][-1], self.sp_id_numpy))

            phones_list.extend(input_ids["phone_ids"])

        if merge_sentences:
            merge_list = paddle.concat(phones_list)
......
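The "merge adjacent zh segments into one `<speak>` block" step above can be sketched in isolation. This hypothetical `merge_zh` assumes the same `(text, lang)` tuples the method builds:

```python
def merge_zh(lang_splits):
    # accumulate consecutive zh segments; wrap each run in <speak>...</speak>
    segments, cur = [], ["", ""]
    for text, lang in lang_splits:
        if lang in ("en", "other"):
            if cur[0]:
                # close the pending zh run before the en/other segment
                segments.append(("<speak>" + cur[0] + "</speak>", cur[1]))
                cur = ["", ""]
            segments.append((text, lang))
        else:
            # zh: keep accumulating
            cur = [cur[0] + text, lang]
    if cur[0]:
        # trailing zh run
        segments.append(("<speak>" + cur[0] + "</speak>", cur[1]))
    return segments

splits = [('我们要去云南 ', 'zh'), ('team building, ', 'en'),
          ('非常非常 ', 'zh'), ('happy.', 'en')]
print(merge_zh(splits))
# [('<speak>我们要去云南 </speak>', 'zh'), ('team building, ', 'en'),
#  ('<speak>非常非常 </speak>', 'zh'), ('happy.', 'en')]
```

Merging the zh runs first is what lets `get_input_ids_ssml` see each Chinese stretch as one sentence, which avoids both the long-text OOM and the unnatural pause between adjacent zh fragments.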
...@@ -47,15 +47,34 @@ class Phonetics(ABC):

class English(Phonetics):
    """ Normalize the input text sequence and convert into pronunciation id sequence.

    https://github.com/Kyubyong/g2p/blob/master/g2p_en/g2p.py

    phonemes = ["<pad>", "<unk>", "<s>", "</s>"] + [
        'AA0', 'AA1', 'AA2', 'AE0', 'AE1', 'AE2', 'AH0', 'AH1', 'AH2', 'AO0',
        'AO1', 'AO2', 'AW0', 'AW1', 'AW2', 'AY0', 'AY1', 'AY2', 'B', 'CH', 'D', 'DH',
        'EH0', 'EH1', 'EH2', 'ER0', 'ER1', 'ER2', 'EY0', 'EY1',
        'EY2', 'F', 'G', 'HH',
        'IH0', 'IH1', 'IH2', 'IY0', 'IY1', 'IY2', 'JH', 'K', 'L',
        'M', 'N', 'NG', 'OW0', 'OW1',
        'OW2', 'OY0', 'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH',
        'UH0', 'UH1', 'UH2', 'UW',
        'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH']
    """

    LEXICON = {
        # key using lowercase
        "AI".lower(): [["EY0", "AY1"]],
    }

    def __init__(self, phone_vocab_path=None):
        self.backend = G2p()
        self.backend.cmu.update(English.LEXICON)
        self.phonemes = list(self.backend.phonemes)
        self.punctuations = get_punctuations("en")
        self.vocab = Vocab(self.phonemes + self.punctuations)
        self.vocab_phones = {}
        self.punc = ":,;。?!“”‘’':,;.?!"
        self.text_normalizer = TextNormalizer()
        if phone_vocab_path:
            with open(phone_vocab_path, 'rt', encoding='utf-8') as f:
...@@ -86,8 +105,8 @@ class English(Phonetics):
            sentence: str,
            merge_sentences: bool=False,
            to_tensor: bool=True) -> paddle.Tensor:
        sentences = self.text_normalizer._split(sentence, lang="en")
        phones_list = []
        temp_phone_ids = []
        for sentence in sentences:
...@@ -118,7 +137,10 @@ class English(Phonetics):
            if to_tensor:
                phone_ids = paddle.to_tensor(phone_ids)
            temp_phone_ids.append(phone_ids)

        result = {}
        result["phone_ids"] = temp_phone_ids

        return result

    def numericalize(self, phonemes):
......
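The `LEXICON` override works because g2p_en consults its `cmu` dictionary before falling back to the neural model, so updating `self.backend.cmu` lets "AI" be read as two letters instead of the word "eye". A toy sketch of that lookup order (the `cmu` dict and `lookup` helper here are illustrative stand-ins, not the g2p_en API):

```python
# user lexicon checked first; value format mirrors cmudict: a list of
# candidate pronunciations, each a list of ARPAbet phonemes
LEXICON = {"ai": [["EY0", "AY1"]]}

def lookup(word, cmu):
    key = word.lower()
    if key in LEXICON:
        # user override wins
        return LEXICON[key][0]
    # fall back to the dictionary (a real backend would fall back
    # further to a model for out-of-vocabulary words)
    return cmu.get(key, [])

cmu = {"eight": ["EY1", "T"]}
print(lookup("AI", cmu))     # ['EY0', 'AY1']
print(lookup("eight", cmu))  # ['EY1', 'T']
```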
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os

import yaml


class Polyphonic():
    def __init__(self):
        with open(
                os.path.join(
                    os.path.dirname(os.path.abspath(__file__)),
                    'polyphonic.yaml'),
                'r',
                encoding='utf-8') as polyphonic_file:
            # parse yaml
            polyphonic_dict = yaml.load(polyphonic_file, Loader=yaml.FullLoader)
        self.polyphonic_words = polyphonic_dict["polyphonic"]

    def correct_pronunciation(self, word, pinyin):
        # if the word is in the dict, return the corrected pronunciation
        if word in self.polyphonic_words.keys():
            pinyin = self.polyphonic_words[word]
        # otherwise return the original pronunciation
        return pinyin
...@@ -47,4 +47,8 @@ polyphonic:
  恶行: ['e4','xing2']
  : ['ai4']
  扎实: ['zha1','shi2']
  干将: ['gan4','jiang4']
  陈威行: ['chen2', 'wei1', 'hang2']
  郭晟: ['guo1', 'sheng4']
  中标: ['zhong4', 'biao1']
  抗住: ['kang2', 'zhu4']
\ No newline at end of file
...@@ -29,7 +29,7 @@ class SingFrontend():
            pinyin_phone_path (str): pinyin to phone file path, a 'pinyin|phones' (like: ba|b a ) pair per line.
            phone_vocab_path (str): phone to phone id file path, a 'phone phone id' (like: a 4 ) pair per line.
        """
        self.punc = '[:,;。?!“”‘’\':,;.?!]'
        self.pinyin_phones = {'AP': 'AP', 'SP': 'SP'}
        if pinyin_phone_path:
......
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
# -*- coding: utf-8 -*-
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re
import xml.dom.minidom
import xml.parsers.expat
...@@ -17,7 +30,6 @@ Note: xml has 5 special characters, &<>"'
    ' &apos;
e.g.:
    <TitleName>&quot;姓名&quot;</TitleName>
'''
...@@ -61,17 +73,29 @@ class MixTextProcessor():
        patn = re.compile(r'(.*\s*?)(<speak>.*?</speak>)(.*\s*)$', re.M | re.S)
        mat = re.match(patn, mixstr)
        if mat:
            # before <speak>
            pre_xml = mat.group(1)
            # between <speak> ... </speak>
            in_xml = mat.group(2)
            # after </speak>
            after_xml = mat.group(3)

            # pre part, with no syllables
            if pre_xml:
                ctlist.append([pre_xml, []])

            # middle part, with syllables
            # [(sub sentence, [syllables]), ...]
            dom = DomXml(in_xml)
            pinyinlist = dom.get_pinyins_for_xml()
            ctlist = ctlist + pinyinlist

            # post part, with no syllables
            if after_xml:
                ctlist.append([after_xml, []])
        else:
            ctlist.append([mixstr, []])
        return ctlist
    @classmethod
...@@ -86,17 +110,21 @@ class MixTextProcessor():
            in_xml = mat.group(2)
            after_xml = mat.group(3)

            if pre_xml:
                ctlist.append(pre_xml)

            dom = DomXml(in_xml)
            tags = dom.get_text_and_sayas_tags()
            ctlist.extend(tags)

            if after_xml:
                ctlist.append(after_xml)
        else:
            ctlist.append(mixstr)

        return ctlist


class DomXml():
    def __init__(self, xmlstr):
        self.tdom = parseString(xmlstr)  # Document
......
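The pre/`<speak>`/post split both methods rely on can be checked in isolation with the same pattern the class compiles:

```python
import re

# group 1: text before <speak>, group 2: the <speak> element, group 3: text after
patn = re.compile(r'(.*\s*?)(<speak>.*?</speak>)(.*\s*)$', re.M | re.S)

mat = re.match(patn, "你好,<speak>早上好</speak>再见")
print(mat.group(1))  # 你好,
print(mat.group(2))  # <speak>早上好</speak>
print(mat.group(3))  # 再见
```

Guarding the appends with `if pre_xml:` / `if after_xml:` matters because these groups match the empty string when the input starts or ends with the `<speak>` element; without the guards, empty segments would be pushed into the list.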
...@@ -20,6 +20,9 @@ from pypinyin import Style

class ToneSandhi():
    def __repr__(self):
        return "MandarinToneSandhi"

    def __init__(self):
        self.must_neural_tone_words = {
            '麻烦', '麻利', '鸳鸯', '高粱', '骨头', '骆驼', '马虎', '首饰', '馒头', '馄饨', '风筝',
...@@ -65,9 +68,22 @@ class ToneSandhi():
            '男子', '女子', '分子', '原子', '量子', '莲子', '石子', '瓜子', '电子', '人人', '虎虎',
            '幺幺', '干嘛', '学子', '哈哈', '数数', '袅袅', '局地', '以下', '娃哈哈', '花花草草', '留得',
            '耕地', '想想', '熙熙', '攘攘', '卵子', '死死', '冉冉', '恳恳', '佼佼', '吵吵', '打打',
            '考考', '整整', '莘莘', '落地', '算子', '家家户户', '青青'
        }
        self.punc = "、:,;。?!“”‘’':,;.?!"
    def _split_word(self, word: str) -> List[str]:
        word_list = jieba.cut_for_search(word)
        word_list = sorted(word_list, key=lambda i: len(i), reverse=False)
        first_subword = word_list[0]
        first_begin_idx = word.find(first_subword)
        if first_begin_idx == 0:
            second_subword = word[len(first_subword):]
            new_word_list = [first_subword, second_subword]
        else:
            second_subword = word[:-len(first_subword)]
            new_word_list = [second_subword, first_subword]
        return new_word_list

    # the meaning of jieba pos tag: https://blog.csdn.net/weixin_44174352/article/details/113731041
    # e.g.
...@@ -154,18 +170,8 @@ class ToneSandhi():
                finals[i] = finals[i][:-1] + "4"
        return finals

    def _all_tone_three(self, finals: List[str]) -> bool:
        return all(x[-1] == "3" for x in finals)

    def _three_sandhi(self, word: str, finals: List[str]) -> List[str]:
...@@ -207,9 +213,6 @@ class ToneSandhi():
        return finals

    # merge "不" and the word behind it
    # if we don't merge, "不" sometimes appears alone according to jieba, which may cause a sandhi error
    def _merge_bu(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
...@@ -336,6 +339,9 @@ class ToneSandhi():
    def pre_merge_for_modify(
            self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
        """
        seg: [(word, pos), ...]
        """
        seg = self._merge_bu(seg)
        seg = self._merge_yi(seg)
        seg = self._merge_reduplication(seg)
...@@ -346,7 +352,11 @@ class ToneSandhi():
    def modified_tone(self, word: str, pos: str,
                      finals: List[str]) -> List[str]:
        """
        word: segmented word
        pos: part-of-speech tag
        finals: toned finals, [final1, ..., finaln]
        """
        finals = self._bu_sandhi(word, finals)
        finals = self._yi_sandhi(word, finals)
        finals = self._neural_sandhi(word, pos, finals)
......
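The `_all_tone_three` helper that was moved in this diff gates the third-tone sandhi: the rule only fires when every final in the word carries tone 3. Its logic is small enough to verify standalone (toned finals end in a digit, e.g. `"ni3"`):

```python
def all_tone_three(finals):
    # True iff every toned final ends with the tone-3 digit
    return all(x[-1] == "3" for x in finals)

print(all_tone_three(["ni3", "hao3"]))   # True  (你好: sandhi applies)
print(all_tone_three(["ni3", "men5"]))   # False (你们: neutral tone, no sandhi)
```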
...@@ -24,7 +24,7 @@ from .num import verbalize_digit
RE_MOBILE_PHONE = re.compile(
    r"(?<!\d)((\+?86 ?)?1([38]\d|5[0-35-9]|7[678]|9[89])\d{8})(?!\d)")
RE_TELEPHONE = re.compile(
    r"(?<!\d)((0(10|2[1-3]|[3-9]\d{2})-?)?[1-9]\d{6,7})(?!\d)")
# nationwide unified numbers start with 400
RE_NATIONAL_UNIFORM_NUMBER = re.compile(r"(400)(-)?\d{3}(-)?\d{4}")
......
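A quick check of the corrected `RE_TELEPHONE` pattern: a landline is an optional area code (010 / 02x / 0xxx, optionally followed by a dash) plus a 7-8 digit local number, with lookarounds ensuring it is not embedded in a longer digit run:

```python
import re

RE_TELEPHONE = re.compile(
    r"(?<!\d)((0(10|2[1-3]|[3-9]\d{2})-?)?[1-9]\d{6,7})(?!\d)")

m = RE_TELEPHONE.search("电话:010-62345678。")
print(m.group(1))  # 010-62345678

# an 11-digit mobile number is NOT matched; RE_MOBILE_PHONE handles those
print(RE_TELEPHONE.search("13812345678"))  # None
```

With the old `[1-9]\d{7,8}` quantifier, the local-number part could swallow 8-9 digits, so fixing it to `\d{6,7}` keeps landlines at 7-8 digits and stops the pattern from overlapping the mobile-number range.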
...@@ -40,6 +40,8 @@ base = [
    "hyperpyyaml",
    "inflect",
    "jsonlines",
    # paddleaudio aligns with librosa==0.8.1, which needs numpy==1.23.x
    "numpy==1.23.5",
    "librosa==0.8.1",
    "scipy>=1.4.0",
    "loguru",
...@@ -260,6 +262,7 @@ setup_info = dict(
    long_description=read("README.md"),
    long_description_content_type="text/markdown",
    keywords=[
        "SSL",
        "speech",
        "asr",
        "tts",
...@@ -268,12 +271,19 @@ setup_info = dict(
        "text frontend",
        "MFA",
        "paddlepaddle",
        "paddleaudio",
        "streaming asr",
        "streaming tts",
        "beam search",
        "ctcdecoder",
        "deepspeech2",
        "wav2vec2",
        "hubert",
        "wavlm",
        "transformer",
        "conformer",
        "fastspeech2",
        "hifigan",
        "gan vocoders",
    ],
    python_requires='>=3.7',
......
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddlespeech.t2s.frontend.en_frontend import English as EnFrontend

if __name__ == '__main__':
    fe = EnFrontend()

    text = "AI for Science"
    phonemes = fe.phoneticize(text)
    print(text)
    print(phonemes)

    text = "eight"
    phonemes = fe.phoneticize(text)
    print(text)
    print(phonemes)
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re
import tempfile
from paddlespeech.t2s.frontend.mix_frontend import MixFrontend
# mix zh & en phonemes
phone_id_str = """
<pad> 0
<unk> 1
AA0 2
AA1 3
AA2 4
AE0 5
AE1 6
AE2 7
AH0 8
AH1 9
AH2 10
AO0 11
AO1 12
AO2 13
AW0 14
AW1 15
AW2 16
AY0 17
AY1 18
AY2 19
B 20
CH 21
D 22
DH 23
EH0 24
EH1 25
EH2 26
ER0 27
ER1 28
ER2 29
EY0 30
EY1 31
EY2 32
F 33
G 34
HH 35
IH0 36
IH1 37
IH2 38
IY0 39
IY1 40
IY2 41
JH 42
K 43
L 44
M 45
N 46
NG 47
OW0 48
OW1 49
OW2 50
OY0 51
OY1 52
OY2 53
P 54
R 55
S 56
SH 57
T 58
TH 59
UH0 60
UH1 61
UH2 62
UW0 63
UW1 64
UW2 65
V 66
W 67
Y 68
Z 69
ZH 70
a1 71
a2 72
a3 73
a4 74
a5 75
ai1 76
ai2 77
ai3 78
ai4 79
ai5 80
air2 81
air3 82
air4 83
an1 84
an2 85
an3 86
an4 87
an5 88
ang1 89
ang2 90
ang3 91
ang4 92
ang5 93
angr2 94
angr4 95
anr1 96
anr3 97
anr4 98
ao1 99
ao2 100
ao3 101
ao4 102
ao5 103
aor1 104
aor3 105
aor4 106
aor5 107
ar2 108
ar3 109
ar4 110
ar5 111
b 112
c 113
ch 114
d 115
e1 116
e2 117
e3 118
e4 119
e5 120
ei1 121
ei2 122
ei3 123
ei4 124
ei5 125
eir4 126
en1 127
en2 128
en3 129
en4 130
en5 131
eng1 132
eng2 133
eng3 134
eng4 135
eng5 136
engr4 137
enr1 138
enr2 139
enr3 140
enr4 141
enr5 142
er1 143
er2 144
er3 145
er4 146
er5 147
f 148
g 149
h 150
i1 151
i2 152
i3 153
i4 154
i5 155
ia1 156
ia2 157
ia3 158
ia4 159
ia5 160
ian1 161
ian2 162
ian3 163
ian4 164
ian5 165
iang1 166
iang2 167
iang3 168
iang4 169
iang5 170
iangr4 171
ianr1 172
ianr2 173
ianr3 174
ianr4 175
ianr5 176
iao1 177
iao2 178
iao3 179
iao4 180
iao5 181
iaor1 182
iaor2 183
iaor3 184
iaor4 185
iar1 186
iar3 187
iar4 188
ie1 189
ie2 190
ie3 191
ie4 192
ie5 193
ii1 194
ii2 195
ii3 196
ii4 197
ii5 198
iii1 199
iii2 200
iii3 201
iii4 202
iii5 203
iiir1 204
iiir4 205
iir2 206
in1 207
in2 208
in3 209
in4 210
in5 211
ing1 212
ing2 213
ing3 214
ing4 215
ing5 216
ingr1 217
ingr2 218
ingr3 219
ingr4 220
inr1 221
inr4 222
io1 223
io3 224
io5 225
iong1 226
iong2 227
iong3 228
iong4 229
iong5 230
iou1 231
iou2 232
iou3 233
iou4 234
iou5 235
iour1 236
iour2 237
iour3 238
iour4 239
ir1 240
ir2 241
ir3 242
ir4 243
ir5 244
j 245
k 246
l 247
m 248
n 249
o1 250
o2 251
o3 252
o4 253
o5 254
ong1 255
ong2 256
ong3 257
ong4 258
ong5 259
ongr4 260
or2 261
ou1 262
ou2 263
ou3 264
ou4 265
ou5 266
our2 267
our3 268
our4 269
our5 270
p 271
q 272
r 273
s 274
sh 275
sil 276
sp 277
spl 278
spn 279
t 280
u1 281
u2 282
u3 283
u4 284
u5 285
ua1 286
ua2 287
ua3 288
ua4 289
ua5 290
uai1 291
uai2 292
uai3 293
uai4 294
uai5 295
uair4 296
uan1 297
uan2 298
uan3 299
uan4 300
uan5 301
uang1 302
uang2 303
uang3 304
uang4 305
uang5 306
uangr4 307
uanr1 308
uanr2 309
uanr3 310
uanr4 311
uanr5 312
uar1 313
uar2 314
uar4 315
uei1 316
uei2 317
uei3 318
uei4 319
uei5 320
ueir1 321
ueir2 322
ueir3 323
ueir4 324
uen1 325
uen2 326
uen3 327
uen4 328
uen5 329
ueng1 330
ueng2 331
ueng3 332
ueng4 333
uenr1 334
uenr2 335
uenr3 336
uenr4 337
uo1 338
uo2 339
uo3 340
uo4 341
uo5 342
uor1 343
uor2 344
uor3 345
uor5 346
ur1 347
ur2 348
ur3 349
ur4 350
ur5 351
v1 352
v2 353
v3 354
v4 355
v5 356
van1 357
van2 358
van3 359
van4 360
van5 361
vanr1 362
vanr2 363
vanr3 364
vanr4 365
ve1 366
ve2 367
ve3 368
ve4 369
ve5 370
ver3 371
ver4 372
vn1 373
vn2 374
vn3 375
vn4 376
vn5 377
vnr2 378
vr3 379
x 380
z 381
zh 382
, 383
. 384
? 385
! 386
<eos> 387
"""
if __name__ == '__main__':
    with tempfile.NamedTemporaryFile(mode='wt') as f:
        phone_ids = phone_id_str.split()
        for phone, id in zip(phone_ids[::2], phone_ids[1::2]):
            f.write(f"{phone} {id}")
            f.write('\n')
        f.flush()

        frontend = MixFrontend(phone_vocab_path=f.name)

        text = "hello, 我爱北京天安们,what about you."
        print(text)
        # [('hello, ', 'en'), ('我爱北京天安们,', 'zh'), ('what about you.', 'en')]
        segs = frontend.split_by_lang(text)
        print(segs)

        text = "hello?!!我爱北京天安们,what about you."
        print(text)
        # [('hello?!!', 'en'), ('我爱北京天安们,', 'zh'), ('what about you.', 'en')]
        segs = frontend.split_by_lang(text)
        print(segs)

        text = "<speak> hello,我爱北京天安们,what about you."
        print(text)
        # [('<speak> hello,', 'en'), ('我爱北京天安们,', 'zh'), ('what about you.', 'en')]
        segs = frontend.split_by_lang(text)
        print(segs)

        # split_by_lang does not handle SSML/XML tags well; parse the SSML
        # first, then split the zh/en parts.
        text = "<speak>我们的声学模型使用了 Fast Speech Two。前浪<say-as pinyin='dao3'>倒</say-as>在沙滩上,沙滩上倒了一堆<say-as pinyin='tu3'>土</say-as>。 想象<say-as pinyin='gan1 gan1'>干干</say-as>的树干<say-as pinyin='dao3'>倒</say-as>了, 里面有个干尸,不知是被谁<say-as pinyin='gan4'>干</say-as>死的。</speak>"
        print(text)
        # [('<speak>', 'en'), ('我们的声学模型使用了 ', 'zh'), ('Fast Speech Two。', 'en'), ('前浪<', 'zh'), ("say-as pinyin='dao3'>", 'en'), ('倒</', 'zh'), ('say-as>', 'en'), ('在沙滩上,沙滩上倒了一堆<', 'zh'), ("say-as pinyin='tu3'>", 'en'), ('土</', 'zh'), ('say-as>。 ', 'en'), ('想象<', 'zh'), ("say-as pinyin='gan1 gan1'>", 'en'), ('干干</', 'zh'), ('say-as>', 'en'), ('的树干<', 'zh'), ("say-as pinyin='dao3'>", 'en'), ('倒</', 'zh'), ('say-as>', 'en'), ('了, 里面有个干尸,不知是被谁<', 'zh'), ("say-as pinyin='gan4'>", 'en'), ('干</', 'zh'), ('say-as>', 'en'), ('死的。</', 'zh'), ('speak>', 'en')]
        segs = frontend.split_by_lang(text)
        print(segs)
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddlespeech.t2s.frontend.ssml.xml_processor import MixTextProcessor
if __name__ == '__main__':
    text = "你好吗,<speak>我们的声学模型使用了 Fast Speech Two。前浪<say-as pinyin='dao3'>倒</say-as>在沙滩上,沙滩上倒了一堆<say-as pinyin='tu3'>土</say-as>。 想象<say-as pinyin='gan1 gan1'>干干</say-as>的树干<say-as pinyin='dao3'>倒</say-as>了, 里面有个干尸,不知是被谁<say-as pinyin='gan4'>干</say-as>死的。</speak>thank you."

    # SSML get_pinyin_split: 13
    # 0 ['你好吗,', []]
    # 1 ['我们的声学模型使用了FastSpeechTwo。前浪', []]
    # 2 ['倒', ['dao3']]
    # 3 ['在沙滩上,沙滩上倒了一堆', []]
    # 4 ['土', ['tu3']]
    # 5 ['。想象', []]
    # 6 ['干干', ['gan1', 'gan1']]
    # 7 ['的树干', []]
    # 8 ['倒', ['dao3']]
    # 9 ['了,里面有个干尸,不知是被谁', []]
    # 10 ['干', ['gan4']]
    # 11 ['死的。', []]
    # 12 ['thank you.', []]
    inputs = MixTextProcessor.get_pinyin_split(text)
    print(f"SSML get_pinyin_split: {len(inputs)}")
    for i, sub in enumerate(inputs):
        print(i, sub)
    print()

    # SSML get_dom_split: 13
    # 0 你好吗,
    # 1 我们的声学模型使用了 Fast Speech Two。前浪
    # 2 <say-as pinyin="dao3">倒</say-as>
    # 3 在沙滩上,沙滩上倒了一堆
    # 4 <say-as pinyin="tu3">土</say-as>
    # 5 。 想象
    # 6 <say-as pinyin="gan1 gan1">干干</say-as>
    # 7 的树干
    # 8 <say-as pinyin="dao3">倒</say-as>
    # 9 了, 里面有个干尸,不知是被谁
    # 10 <say-as pinyin="gan4">干</say-as>
    # 11 死的。
    # 12 thank you.
    inputs = MixTextProcessor.get_dom_split(text)
    print(f"SSML get_dom_split: {len(inputs)}")
    for i, sub in enumerate(inputs):
        print(i, sub)
    print()

    # SSML object.get_xml_content: 246
    # <speak>我们的声学模型使用了 Fast Speech Two。前浪<say-as pinyin='dao3'>倒</say-as>在沙滩上,沙滩上倒了一堆<say-as pinyin='tu3'>土</say-as>。 想象<say-as pinyin='gan1 gan1'>干干</say-as>的树干<say-as pinyin='dao3'>倒</say-as>了, 里面有个干尸,不知是被谁<say-as pinyin='gan4'>干</say-as>死的。</speak>
    outs = MixTextProcessor().get_xml_content(text)
    print(f"SSML object.get_xml_content: {len(outs)}")
    print(outs)
    print()

    # SSML object.get_content_split: 3
    # 0 你好吗,
    # 1 <speak>我们的声学模型使用了 Fast Speech Two。前浪<say-as pinyin='dao3'>倒</say-as>在沙滩上,沙滩上倒了一堆<say-as pinyin='tu3'>土</say-as>。 想象<say-as pinyin='gan1 gan1'>干干</say-as>的树干<say-as pinyin='dao3'>
    # 倒</say-as>了, 里面有个干尸,不知是被谁<say-as pinyin='gan4'>干</say-as>死的。</speak>
    # 2 thank you.
    outs = MixTextProcessor().get_content_split(text)
    print(f"SSML object.get_content_split: {len(outs)}")
    for i, sub in enumerate(outs):
        print(i, sub)
    print()