PaddlePaddle / DeepSpeech

Commit 94688264
Authored on Jul 04, 2022 by 小湉湉
add ernie sat model file and config
Parent: 0ea9def0

Showing 46 changed files with 3,604 additions and 81 deletions (+3604 −81).
Changed files (46):

examples/aishell3/ernie_sat/conf/default.yaml             +282    −0
examples/aishell3/ernie_sat/local/preprocess.sh            +61    −0
examples/aishell3/ernie_sat/local/synthesize.sh             +1    −0
examples/aishell3/ernie_sat/local/train.sh                 +12    −0
examples/aishell3/ernie_sat/path.sh                        +13    −0
examples/aishell3/ernie_sat/run.sh                         +32    −0
examples/aishell3/tts3/conf/conformer.yaml                  +2    −2
examples/aishell3/tts3/conf/default.yaml                    +2    −2
examples/aishell3_vctk/ernie_sat/conf/default.yaml        +351    −0
examples/aishell3_vctk/ernie_sat/local/preprocess.sh       +67    −0
examples/aishell3_vctk/ernie_sat/local/synthesize.sh        +1    −0
examples/aishell3_vctk/ernie_sat/local/train.sh            +12    −0
examples/aishell3_vctk/ernie_sat/path.sh                   +13    −0
examples/aishell3_vctk/ernie_sat/run.sh                    +32    −0
examples/csmsc/tts2/conf/default.yaml                      +11   −11
examples/csmsc/voc3/conf/default.yaml                       +1    −1
examples/csmsc/voc3/conf/finetune.yaml                      +1    −1
examples/ernie_sat/local/align.py                          +13    −0
examples/ernie_sat/local/inference.py                      +14    −6
examples/ernie_sat/local/inference_new.py                 +622    −0
examples/ernie_sat/local/sedit_arg_parser.py               +13    −0
examples/ernie_sat/local/utils.py                          +13    −0
examples/ernie_sat/run_clone_en_to_zh_new.sh               +27    −0
examples/ernie_sat/run_gen_en_new.sh                       +26    −0
examples/ernie_sat/run_sedit_en_new.sh                     +27    −0
examples/ernie_sat/test_run_new.sh                          +6    −0
examples/vctk/ernie_sat/conf/default.yaml                 +162    −0
examples/vctk/ernie_sat/local/preprocess.sh                +61    −0
examples/vctk/ernie_sat/local/synthesize.sh                 +1    −0
examples/vctk/ernie_sat/local/train.sh                     +12    −0
examples/vctk/ernie_sat/path.sh                            +13    −0
examples/vctk/ernie_sat/run.sh                             +32    −0
examples/vctk/tts3/conf/default.yaml                        +3    −3
paddlespeech/t2s/datasets/am_batch_fn.py                  +145    −2
paddlespeech/t2s/exps/ernie_sat/__init__.py                 +0    −0
paddlespeech/t2s/exps/ernie_sat/normalize.py              +130    −0
paddlespeech/t2s/exps/ernie_sat/preprocess.py             +341    −0
paddlespeech/t2s/exps/ernie_sat/synthesize.py               +0    −0
paddlespeech/t2s/exps/ernie_sat/train.py                  +194    −0
paddlespeech/t2s/models/ernie_sat/__init__.py               +3    −1
paddlespeech/t2s/models/ernie_sat/ernie_sat.py            +670    −0
paddlespeech/t2s/models/ernie_sat/ernie_sat_updater.py    +148    −0
paddlespeech/t2s/models/ernie_sat/mlm.py                   +14   −36
paddlespeech/t2s/models/fastspeech2/fastspeech2.py          +0    −2
paddlespeech/t2s/modules/losses.py                         +20   −13
paddlespeech/t2s/modules/nets_utils.py                      +0    −1
examples/aishell3/ernie_sat/conf/default.yaml
0 → 100644

###########################################################
#                FEATURE EXTRACTION SETTING               #
###########################################################
fs: 24000          # sr
n_fft: 2048        # FFT size (samples).
n_shift: 300       # Hop size (samples). 12.5ms
win_length: 1200   # Window length (samples). 50ms
                   # If set to null, it will be the same as fft_size.
window: "hann"     # Window function.
# Only used for feats_type != raw
fmin: 80           # Minimum frequency of Mel basis.
fmax: 7600         # Maximum frequency of Mel basis.
n_mels: 80         # The number of mel basis.
mean_phn_span: 8
mlm_prob: 0.8

###########################################################
#                       DATA SETTING                      #
###########################################################
batch_size: 64
num_workers: 2

###########################################################
#                      MODEL SETTING                      #
###########################################################
model:
    text_masking: false
    postnet_layers: 5
    postnet_filts: 5
    postnet_chans: 256
    encoder_type: conformer
    decoder_type: conformer
    enc_input_layer: sega_mlm
    enc_pre_speech_layer: 0
    enc_cnn_module_kernel: 7
    enc_attention_dim: 384
    enc_attention_heads: 2
    enc_linear_units: 1536
    enc_num_blocks: 4
    enc_dropout_rate: 0.2
    enc_positional_dropout_rate: 0.2
    enc_attention_dropout_rate: 0.2
    enc_normalize_before: true
    enc_macaron_style: true
    enc_use_cnn_module: true
    enc_selfattention_layer_type: legacy_rel_selfattn
    enc_activation_type: swish
    enc_pos_enc_layer_type: legacy_rel_pos
    enc_positionwise_layer_type: conv1d
    enc_positionwise_conv_kernel_size: 3
    dec_cnn_module_kernel: 31
    dec_attention_dim: 384
    dec_attention_heads: 2
    dec_linear_units: 1536
    dec_num_blocks: 4
    dec_dropout_rate: 0.2
    dec_positional_dropout_rate: 0.2
    dec_attention_dropout_rate: 0.2
    dec_macaron_style: true
    dec_use_cnn_module: true
    dec_selfattention_layer_type: legacy_rel_selfattn
    dec_activation_type: swish
    dec_pos_enc_layer_type: legacy_rel_pos
    dec_positionwise_layer_type: conv1d
    dec_positionwise_conv_kernel_size: 3

###########################################################
#                    OPTIMIZER SETTING                    #
###########################################################
optimizer:
    optim: adam            # optimizer type
    learning_rate: 0.001   # learning rate

###########################################################
#                     TRAINING SETTING                    #
###########################################################
max_epoch: 200
num_snapshots: 5

###########################################################
#                      OTHER SETTING                      #
###########################################################
seed: 10086

token_list:
- <blank>
- <unk>
- d
- sp
- sh
- ii
- j
- zh
- l
- x
- b
- g
- uu
- e5
- h
- q
- m
- i1
- t
- z
- ch
- f
- s
- u4
- ix4
- i4
- n
- i3
- iu3
- vv
- ian4
- ix2
- r
- e4
- ai4
- k
- ing2
- a1
- en2
- ui4
- ong1
- uo3
- u2
- u3
- ao4
- ee
- p
- an1
- eng2
- i2
- in1
- c
- ai2
- ian2
- e2
- an4
- ing4
- v4
- ai3
- a5
- ian3
- eng1
- ong4
- ang4
- ian1
- ing1
- iy4
- ao3
- ang1
- uo4
- u1
- iao4
- iu4
- a4
- van2
- ie4
- ang2
- ou4
- iang4
- ix1
- er4
- iy1
- e1
- en1
- ui2
- an3
- ei4
- ong2
- uo1
- ou3
- uo2
- iao1
- ou1
- an2
- uan4
- ia4
- ia1
- ang3
- v3
- iu2
- iao3
- in4
- a3
- ei3
- iang3
- v2
- eng4
- en3
- aa
- uan1
- v1
- ao1
- ve4
- ie3
- ai1
- ing3
- iang1
- a2
- ui1
- en4
- en5
- in3
- uan3
- e3
- ie1
- ve2
- ei2
- in2
- ix3
- uan2
- iang2
- ie2
- ua4
- ou2
- uai4
- er2
- eng3
- uang3
- un1
- ong3
- uang4
- vn4
- un2
- iy3
- iz4
- ui3
- iao2
- iong4
- un4
- van4
- ao2
- uang1
- iy5
- o2
- ei1
- ua1
- iu1
- uang2
- er5
- o1
- un3
- vn1
- vn2
- o4
- ve1
- van3
- ua2
- er3
- iong3
- van1
- ia2
- iy2
- ia3
- iong1
- uo5
- oo
- ve3
- ou5
- uai3
- ian5
- iong2
- uai2
- uai1
- ua3
- vn3
- ia5
- ie5
- ueng1
- o5
- o3
- iang5
- ei5
- <sos/eos>
\ No newline at end of file
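This config is consumed later in the commit by load_model() in examples/ernie_sat/local/inference_new.py: token_list sizes the input vocabulary and n_mels sizes the mel output. A minimal sketch of that loading pattern, assuming the repository root as the working directory (everything else mirrors load_model):

    import yaml
    from yacs.config import CfgNode

    from paddlespeech.t2s.models.ernie_sat.ernie_sat import ErnieSAT

    # Parse the training config; CfgNode gives attribute access to YAML keys.
    with open("examples/aishell3/ernie_sat/conf/default.yaml") as f:
        conf = CfgNode(yaml.safe_load(f))

    # idim = vocabulary size, odim = mel dimension, remaining keys come
    # from the `model:` section of the config.
    model = ErnieSAT(idim=len(conf.token_list), odim=conf.n_mels, **conf["model"])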
examples/aishell3/ernie_sat/local/preprocess.sh
0 → 100755

#!/bin/bash

stage=0
stop_stage=100

config_path=$1

if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    # get durations from MFA's result
    echo "Generate durations.txt from MFA results ..."
    python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \
        --inputdir=./aishell3_alignment_tone \
        --output durations.txt \
        --config=${config_path}
fi

if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    # extract features
    echo "Extract features ..."
    python3 ${BIN_DIR}/preprocess.py \
        --dataset=aishell3 \
        --rootdir=~/datasets/data_aishell3/ \
        --dumpdir=dump \
        --dur-file=durations.txt \
        --config=${config_path} \
        --num-cpu=20 \
        --cut-sil=True
fi

if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    # get features' stats (mean and std)
    echo "Get features' stats ..."
    python3 ${MAIN_ROOT}/utils/compute_statistics.py \
        --metadata=dump/train/raw/metadata.jsonl \
        --field-name="speech"
fi

if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    # normalize and convert phone/speaker to id; dev and test should use train's stats
    echo "Normalize ..."
    python3 ${BIN_DIR}/normalize.py \
        --metadata=dump/train/raw/metadata.jsonl \
        --dumpdir=dump/train/norm \
        --speech-stats=dump/train/speech_stats.npy \
        --phones-dict=dump/phone_id_map.txt \
        --speaker-dict=dump/speaker_id_map.txt

    python3 ${BIN_DIR}/normalize.py \
        --metadata=dump/dev/raw/metadata.jsonl \
        --dumpdir=dump/dev/norm \
        --speech-stats=dump/train/speech_stats.npy \
        --phones-dict=dump/phone_id_map.txt \
        --speaker-dict=dump/speaker_id_map.txt

    python3 ${BIN_DIR}/normalize.py \
        --metadata=dump/test/raw/metadata.jsonl \
        --dumpdir=dump/test/norm \
        --speech-stats=dump/train/speech_stats.npy \
        --phones-dict=dump/phone_id_map.txt \
        --speaker-dict=dump/speaker_id_map.txt
fi
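Stages 2–3 compute per-dimension mean/std over the train split only and then apply those same train statistics when normalizing train, dev, and test. A minimal numpy sketch of that step, with hypothetical array names (the real compute_statistics.py/normalize.py read metadata.jsonl and exchange stats via .npy files):

    import numpy as np

    # Hypothetical stand-ins: rows are frames, columns are mel bins.
    train_feats = np.random.rand(1000, 80).astype(np.float32)
    dev_feats = np.random.rand(200, 80).astype(np.float32)

    # Stage 2: statistics come from the train split only.
    mean = train_feats.mean(axis=0)
    std = train_feats.std(axis=0)

    # Stage 3: dev/test reuse the train stats, as the script's comment insists.
    dev_norm = (dev_feats - mean) / std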
examples/aishell3/ernie_sat/local/synthesize.sh
0 → 100755

#!/bin/bash
\ No newline at end of file
examples/aishell3/ernie_sat/local/train.sh
0 → 100755

#!/bin/bash

config_path=$1
train_output_path=$2

python3 ${BIN_DIR}/train.py \
    --train-metadata=dump/train/norm/metadata.jsonl \
    --dev-metadata=dump/dev/norm/metadata.jsonl \
    --config=${config_path} \
    --output-dir=${train_output_path} \
    --ngpu=1 \
    --phones-dict=dump/phone_id_map.txt
\ No newline at end of file
examples/aishell3/ernie_sat/path.sh
0 → 100755

#!/bin/bash
export MAIN_ROOT=`realpath ${PWD}/../../../`

export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
export LC_ALL=C

export PYTHONDONTWRITEBYTECODE=1
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}

MODEL=ernie_sat
export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL}
\ No newline at end of file
examples/aishell3/ernie_sat/run.sh
0 → 100755

#!/bin/bash

set -e
source path.sh

gpus=0,1
stage=0
stop_stage=100

conf_path=conf/default.yaml
train_output_path=exp/default
ckpt_name=snapshot_iter_153.pdz

# with the following command, you can choose the stage range you want to run
# such as `./run.sh --stage 0 --stop-stage 0`
# this cannot be mixed with positional arguments `$1`, `$2` ...
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1

if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    # prepare data
    ./local/preprocess.sh ${conf_path} || exit -1
fi

if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    # train model, all `ckpt` under `train_output_path/checkpoints/` dir
    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1
fi

if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    # synthesize, vocoder is pwgan
    CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi
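Each stage in these run scripts executes only when `stage <= N <= stop_stage`, so `--stage`/`--stop-stage` select a contiguous range of steps. The same predicate as a tiny Python sketch, for clarity:

    def stage_enabled(stage: int, stop_stage: int, n: int) -> bool:
        """Mirrors the shell test: [ ${stage} -le n ] && [ ${stop_stage} -ge n ]."""
        return stage <= n <= stop_stage

    # e.g. ./run.sh --stage 1 --stop-stage 1 runs only the training stage
    assert stage_enabled(1, 1, 1) and not stage_enabled(1, 1, 0)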
examples/aishell3/tts3/conf/conformer.yaml (+2 −2, diff not expanded in this view)

examples/aishell3/tts3/conf/default.yaml (+2 −2, diff not expanded in this view)
examples/aishell3_vctk/ernie_sat/conf/default.yaml
0 → 100644

###########################################################
#                FEATURE EXTRACTION SETTING               #
###########################################################
fs: 24000          # sr
n_fft: 2048        # FFT size (samples).
n_shift: 300       # Hop size (samples). 12.5ms
win_length: 1200   # Window length (samples). 50ms
                   # If set to null, it will be the same as fft_size.
window: "hann"     # Window function.
# Only used for feats_type != raw
fmin: 80           # Minimum frequency of Mel basis.
fmax: 7600         # Maximum frequency of Mel basis.
n_mels: 80         # The number of mel basis.
mean_phn_span: 8
mlm_prob: 0.8

###########################################################
#                       DATA SETTING                      #
###########################################################
batch_size: 64
num_workers: 2

###########################################################
#                      MODEL SETTING                      #
###########################################################
model:
    text_masking: true
    postnet_layers: 5
    postnet_filts: 5
    postnet_chans: 256
    encoder_type: conformer
    decoder_type: conformer
    enc_input_layer: sega_mlm
    enc_pre_speech_layer: 0
    enc_cnn_module_kernel: 7
    enc_attention_dim: 384
    enc_attention_heads: 2
    enc_linear_units: 1536
    enc_num_blocks: 4
    enc_dropout_rate: 0.2
    enc_positional_dropout_rate: 0.2
    enc_attention_dropout_rate: 0.2
    enc_normalize_before: true
    enc_macaron_style: true
    enc_use_cnn_module: true
    enc_selfattention_layer_type: legacy_rel_selfattn
    enc_activation_type: swish
    enc_pos_enc_layer_type: legacy_rel_pos
    enc_positionwise_layer_type: conv1d
    enc_positionwise_conv_kernel_size: 3
    dec_cnn_module_kernel: 31
    dec_attention_dim: 384
    dec_attention_heads: 2
    dec_linear_units: 1536
    dec_num_blocks: 4
    dec_dropout_rate: 0.2
    dec_positional_dropout_rate: 0.2
    dec_attention_dropout_rate: 0.2
    dec_macaron_style: true
    dec_use_cnn_module: true
    dec_selfattention_layer_type: legacy_rel_selfattn
    dec_activation_type: swish
    dec_pos_enc_layer_type: legacy_rel_pos
    dec_positionwise_layer_type: conv1d
    dec_positionwise_conv_kernel_size: 3

###########################################################
#                    OPTIMIZER SETTING                    #
###########################################################
optimizer:
    optim: adam            # optimizer type
    learning_rate: 0.001   # learning rate

###########################################################
#                     TRAINING SETTING                    #
###########################################################
max_epoch: 100
num_snapshots: 5

###########################################################
#                      OTHER SETTING                      #
###########################################################
seed: 10086

token_list:
- <blank>
- <unk>
- AH0
- T
- N
- sp
- S
- R
- D
- L
- Z
- DH
- IH1
- K
- W
- M
- EH1
- AE1
- ER0
- B
- IY1
- P
- V
- IY0
- F
- HH
- AA1
- AY1
- AH1
- EY1
- IH0
- AO1
- OW1
- UW1
- G
- NG
- SH
- Y
- TH
- ER1
- JH
- UH1
- AW1
- CH
- IH2
- OW0
- OW2
- EY2
- EH2
- UW0
- OY1
- ZH
- EH0
- AY2
- AW2
- AA2
- AE2
- IY2
- AH2
- AE0
- AO2
- AY0
- AO0
- UW2
- UH2
- AA0
- EY0
- AW0
- UH0
- ER2
- OY2
- OY0
- d
- sh
- ii
- j
- zh
- l
- x
- b
- g
- uu
- e5
- h
- q
- m
- i1
- t
- z
- ch
- f
- s
- u4
- ix4
- i4
- n
- i3
- iu3
- vv
- ian4
- ix2
- r
- e4
- ai4
- k
- ing2
- a1
- en2
- ui4
- ong1
- uo3
- u2
- u3
- ao4
- ee
- p
- an1
- eng2
- i2
- in1
- c
- ai2
- ian2
- e2
- an4
- ing4
- v4
- ai3
- a5
- ian3
- eng1
- ong4
- ang4
- ian1
- ing1
- iy4
- ao3
- ang1
- uo4
- u1
- iao4
- iu4
- a4
- van2
- ie4
- ang2
- ou4
- iang4
- ix1
- er4
- iy1
- e1
- en1
- ui2
- an3
- ei4
- ong2
- uo1
- ou3
- uo2
- iao1
- ou1
- an2
- uan4
- ia4
- ia1
- ang3
- v3
- iu2
- iao3
- in4
- a3
- ei3
- iang3
- v2
- eng4
- en3
- aa
- uan1
- v1
- ao1
- ve4
- ie3
- ai1
- ing3
- iang1
- a2
- ui1
- en4
- en5
- in3
- uan3
- e3
- ie1
- ve2
- ei2
- in2
- ix3
- uan2
- iang2
- ie2
- ua4
- ou2
- uai4
- er2
- eng3
- uang3
- un1
- ong3
- uang4
- vn4
- un2
- iy3
- iz4
- ui3
- iao2
- iong4
- un4
- van4
- ao2
- uang1
- iy5
- o2
- ei1
- ua1
- iu1
- uang2
- er5
- o1
- un3
- vn1
- vn2
- o4
- ve1
- van3
- ua2
- er3
- iong3
- van1
- ia2
- iy2
- ia3
- iong1
- uo5
- oo
- ve3
- ou5
- uai3
- ian5
- iong2
- uai2
- uai1
- ua3
- vn3
- ia5
- ie5
- ueng1
- o5
- o3
- iang5
- ei5
- <sos/eos>
examples/aishell3_vctk/ernie_sat/local/preprocess.sh
0 → 100755

#!/bin/bash

stage=0
stop_stage=100

config_path=$1

if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    # get durations from MFA's result
    echo "Generate durations.txt from MFA results ..."
    python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \
        --inputdir=./aishell3_alignment_tone \
        --output durations.txt \
        --config=${config_path}
fi

if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    # extract features
    echo "Extract features ..."
    python3 ${BIN_DIR}/preprocess.py \
        --dataset=aishell3 \
        --rootdir=~/datasets/data_aishell3/ \
        --dumpdir=dump \
        --dur-file=durations.txt \
        --config=${config_path} \
        --num-cpu=20 \
        --cut-sil=True
fi

if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    # get features' stats (mean and std)
    echo "Get features' stats ..."
    python3 ${MAIN_ROOT}/utils/compute_statistics.py \
        --metadata=dump/train/raw/metadata.jsonl \
        --field-name="speech"
fi

if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    # normalize and convert phone/speaker to id; dev and test should use train's stats
    echo "Normalize ..."
    python3 ${BIN_DIR}/normalize.py \
        --metadata=dump/train/raw/metadata.jsonl \
        --dumpdir=dump/train/norm \
        --speech-stats=dump/train/speech_stats.npy \
        --pitch-stats=dump/train/pitch_stats.npy \
        --energy-stats=dump/train/energy_stats.npy \
        --phones-dict=dump/phone_id_map.txt \
        --speaker-dict=dump/speaker_id_map.txt

    python3 ${BIN_DIR}/normalize.py \
        --metadata=dump/dev/raw/metadata.jsonl \
        --dumpdir=dump/dev/norm \
        --speech-stats=dump/train/speech_stats.npy \
        --pitch-stats=dump/train/pitch_stats.npy \
        --energy-stats=dump/train/energy_stats.npy \
        --phones-dict=dump/phone_id_map.txt \
        --speaker-dict=dump/speaker_id_map.txt

    python3 ${BIN_DIR}/normalize.py \
        --metadata=dump/test/raw/metadata.jsonl \
        --dumpdir=dump/test/norm \
        --speech-stats=dump/train/speech_stats.npy \
        --pitch-stats=dump/train/pitch_stats.npy \
        --energy-stats=dump/train/energy_stats.npy \
        --phones-dict=dump/phone_id_map.txt \
        --speaker-dict=dump/speaker_id_map.txt
fi
examples/aishell3_vctk/ernie_sat/local/synthesize.sh
0 → 100755

#!/bin/bash
\ No newline at end of file
examples/aishell3_vctk/ernie_sat/local/train.sh
0 → 100755

#!/bin/bash

config_path=$1
train_output_path=$2

python3 ${BIN_DIR}/train.py \
    --train-metadata=dump/train/norm/metadata.jsonl \
    --dev-metadata=dump/dev/norm/metadata.jsonl \
    --config=${config_path} \
    --output-dir=${train_output_path} \
    --ngpu=1 \
    --phones-dict=dump/phone_id_map.txt
\ No newline at end of file
examples/aishell3_vctk/ernie_sat/path.sh
0 → 100755

#!/bin/bash
export MAIN_ROOT=`realpath ${PWD}/../../`

export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
export LC_ALL=C

export PYTHONDONTWRITEBYTECODE=1
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}

MODEL=ernie_sat
export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL}
\ No newline at end of file
examples/aishell3_vctk/ernie_sat/run.sh
0 → 100755

#!/bin/bash

set -e
source path.sh

gpus=0,1
stage=0
stop_stage=100

conf_path=conf/default.yaml
train_output_path=exp/default
ckpt_name=snapshot_iter_153.pdz

# with the following command, you can choose the stage range you want to run
# such as `./run.sh --stage 0 --stop-stage 0`
# this cannot be mixed with positional arguments `$1`, `$2` ...
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1

if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    # prepare data
    ./local/preprocess.sh ${conf_path} || exit -1
fi

if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    # train model, all `ckpt` under `train_output_path/checkpoints/` dir
    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1
fi

if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    # synthesize, vocoder is pwgan
    CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi
examples/csmsc/tts2/conf/default.yaml (+11 −11, diff not expanded in this view)
examples/csmsc/voc3/conf/default.yaml

...
@@ -29,7 +29,7 @@ generator_params:
     out_channels: 4               # Number of output channels.
     kernel_size: 7                # Kernel size of initial and final conv layers.
     channels: 384                 # Initial number of channels for conv layers.
-    upsample_scales: [5, 5, 3]    # List of Upsampling scales. prod(upsample_scales) == n_shift
+    upsample_scales: [5, 5, 3]    # List of Upsampling scales. prod(upsample_scales) x out_channels == n_shift
     stack_kernel_size: 3          # Kernel size of dilated conv layers in residual stack.
     stacks: 4                     # Number of stacks in a single residual stack module.
     use_weight_norm: True         # Whether to use weight normalization.
...
examples/csmsc/voc3/conf/finetune.yaml

...
@@ -29,7 +29,7 @@ generator_params:
     out_channels: 4               # Number of output channels.
     kernel_size: 7                # Kernel size of initial and final conv layers.
     channels: 384                 # Initial number of channels for conv layers.
-    upsample_scales: [5, 5, 3]    # List of Upsampling scales. prod(upsample_scales) == n_shift
+    upsample_scales: [5, 5, 3]    # List of Upsampling scales. prod(upsample_scales) x out_channels == n_shift
     stack_kernel_size: 3          # Kernel size of dilated conv layers in residual stack.
     stacks: 4                     # Number of stacks in a single residual stack module.
     use_weight_norm: True         # Whether to use weight normalization.
...
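The corrected comment in these two vocoder configs states a shape constraint: for a multi-band generator, the product of the upsampling scales times the number of output sub-bands must equal the hop size, not the product alone. A quick sanity check with the values from this diff, assuming the 300-sample hop (n_shift: 300) used throughout these 24 kHz recipes:

    import math

    upsample_scales = [5, 5, 3]
    out_channels = 4     # number of sub-bands of the multi-band generator
    n_shift = 300        # hop size (samples)

    # prod(upsample_scales) x out_channels == n_shift
    assert math.prod(upsample_scales) * out_channels == n_shift  # 75 * 4 == 300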
examples/ernie_sat/local/align.py

# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Usage:
    align.py wavfile trsfile outwordfile outphonefile
"""
...
examples/ernie_sat/local/inference.py

#!/usr/bin/env python3
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import random
from typing import Dict
...
@@ -305,7 +317,6 @@ def get_dur_adj_factor(orig_dur: List[int],
 def prep_feats_with_dur(wav_path: str,
-                        mlm_model: nn.Layer,
                         source_lang: str="English",
                         target_lang: str="English",
                         old_str: str="",
...
@@ -425,8 +436,7 @@ def prep_feats_with_dur(wav_path: str,
     return new_wav, new_phns, new_mfa_start, new_mfa_end, old_span_bdy, new_span_bdy

-def prep_feats(mlm_model: nn.Layer,
-               wav_path: str,
+def prep_feats(wav_path: str,
                source_lang: str="english",
                target_lang: str="english",
                old_str: str="",
...
@@ -440,7 +450,6 @@ def prep_feats(mlm_model: nn.Layer,
     wav, phns, mfa_start, mfa_end, old_span_bdy, new_span_bdy = prep_feats_with_dur(
         source_lang=source_lang,
         target_lang=target_lang,
-        mlm_model=mlm_model,
         old_str=old_str,
         new_str=new_str,
         wav_path=wav_path,
...
@@ -482,7 +491,6 @@ def decode_with_model(mlm_model: nn.Layer,
     batch, old_span_bdy, new_span_bdy = prep_feats(
         source_lang=source_lang,
         target_lang=target_lang,
-        mlm_model=mlm_model,
         wav_path=wav_path,
         old_str=old_str,
         new_str=new_str,
...
examples/ernie_sat/local/inference_new.py
0 → 100644

# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import random
from typing import Dict
from typing import List

import librosa
import numpy as np
import paddle
import soundfile as sf
import yaml
from align import alignment
from align import alignment_zh
from align import words2phns
from align import words2phns_zh
from paddle import nn
from sedit_arg_parser import parse_args
from utils import eval_durs
from utils import get_voc_out
from utils import is_chinese
from utils import load_num_sequence_text
from utils import read_2col_text
from yacs.config import CfgNode

from paddlespeech.t2s.datasets.am_batch_fn import build_mlm_collate_fn
from paddlespeech.t2s.models.ernie_sat.ernie_sat import ErnieSAT

random.seed(0)
np.random.seed(0)


def get_wav(wav_path: str,
            source_lang: str='english',
            target_lang: str='english',
            model_name: str="paddle_checkpoint_en",
            old_str: str="",
            new_str: str="",
            non_autoreg: bool=True):
    wav_org, output_feat, old_span_bdy, new_span_bdy, fs, hop_length = get_mlm_output(
        source_lang=source_lang,
        target_lang=target_lang,
        model_name=model_name,
        wav_path=wav_path,
        old_str=old_str,
        new_str=new_str,
        use_teacher_forcing=non_autoreg)

    masked_feat = output_feat[new_span_bdy[0]:new_span_bdy[1]]

    alt_wav = get_voc_out(masked_feat)

    old_time_bdy = [hop_length * x for x in old_span_bdy]

    wav_replaced = np.concatenate(
        [wav_org[:old_time_bdy[0]], alt_wav, wav_org[old_time_bdy[1]:]])

    data_dict = {"origin": wav_org, "output": wav_replaced}

    return data_dict


def load_model(model_name: str="paddle_checkpoint_en"):
    config_path = './pretrained_model/{}/default.yaml'.format(model_name)
    model_path = './pretrained_model/{}/model.pdparams'.format(model_name)
    with open(config_path) as f:
        conf = CfgNode(yaml.safe_load(f))
    token_list = list(conf.token_list)
    vocab_size = len(token_list)
    odim = conf.n_mels
    mlm_model = ErnieSAT(idim=vocab_size, odim=odim, **conf["model"])
    state_dict = paddle.load(model_path)
    new_state_dict = {}
    for key, value in state_dict.items():
        new_key = "model." + key
        new_state_dict[new_key] = value
    mlm_model.set_state_dict(new_state_dict)
    mlm_model.eval()
    return mlm_model, conf


def read_data(uid: str, prefix: os.PathLike):
    # get the text corresponding to uid
    mfa_text = read_2col_text(prefix + '/text')[uid]
    # get the audio path corresponding to uid
    mfa_wav_path = read_2col_text(prefix + '/wav.scp')[uid]
    if not os.path.isabs(mfa_wav_path):
        mfa_wav_path = prefix + mfa_wav_path
    return mfa_text, mfa_wav_path


def get_align_data(uid: str, prefix: os.PathLike):
    mfa_path = prefix + "mfa_"
    mfa_text = read_2col_text(mfa_path + 'text')[uid]
    mfa_start = load_num_sequence_text(
        mfa_path + 'start', loader_type='text_float')[uid]
    mfa_end = load_num_sequence_text(
        mfa_path + 'end', loader_type='text_float')[uid]
    mfa_wav_path = read_2col_text(mfa_path + 'wav.scp')[uid]
    return mfa_text, mfa_start, mfa_end, mfa_wav_path


# get the range of mel frames to be masked
def get_masked_mel_bdy(mfa_start: List[float],
                       mfa_end: List[float],
                       fs: int,
                       hop_length: int,
                       span_to_repl: List[List[int]]):
    align_start = np.array(mfa_start)
    align_end = np.array(mfa_end)
    align_start = np.floor(fs * align_start / hop_length).astype('int')
    align_end = np.floor(fs * align_end / hop_length).astype('int')
    if span_to_repl[0] >= len(mfa_start):
        span_bdy = [align_end[-1], align_end[-1]]
    else:
        span_bdy = [
            align_start[span_to_repl[0]], align_end[span_to_repl[1] - 1]
        ]
    return span_bdy, align_start, align_end


def recover_dict(word2phns: Dict[str, str], tp_word2phns: Dict[str, str]):
    dic = {}
    keys_to_del = []
    exist_idx = []
    sp_count = 0
    add_sp_count = 0
    for key in word2phns.keys():
        idx, wrd = key.split('_')
        if wrd == 'sp':
            sp_count += 1
            exist_idx.append(int(idx))
        else:
            keys_to_del.append(key)

    for key in keys_to_del:
        del word2phns[key]

    cur_id = 0
    for key in tp_word2phns.keys():
        if cur_id in exist_idx:
            dic[str(cur_id) + "_sp"] = 'sp'
            cur_id += 1
            add_sp_count += 1
        idx, wrd = key.split('_')
        dic[str(cur_id) + "_" + wrd] = tp_word2phns[key]
        cur_id += 1

    if add_sp_count + 1 == sp_count:
        dic[str(cur_id) + "_sp"] = 'sp'
        add_sp_count += 1

    assert add_sp_count == sp_count, "sp are not added in dic"
    return dic


def get_max_idx(dic):
    return sorted([int(key.split('_')[0]) for key in dic.keys()])[-1]


def get_phns_and_spans(wav_path: str,
                       old_str: str="",
                       new_str: str="",
                       source_lang: str="english",
                       target_lang: str="english"):
    is_append = (old_str == new_str[:len(old_str)])
    old_phns, mfa_start, mfa_end = [], [], []
    # source
    if source_lang == "english":
        intervals, word2phns = alignment(wav_path, old_str)
    elif source_lang == "chinese":
        intervals, word2phns = alignment_zh(wav_path, old_str)
        _, tp_word2phns = words2phns_zh(old_str)

        for key, value in tp_word2phns.items():
            idx, wrd = key.split('_')
            cur_val = " ".join(value)
            tp_word2phns[key] = cur_val

        word2phns = recover_dict(word2phns, tp_word2phns)
    else:
        assert source_lang == "chinese" or source_lang == "english", \
            "source_lang is wrong..."

    for item in intervals:
        old_phns.append(item[0])
        mfa_start.append(float(item[1]))
        mfa_end.append(float(item[2]))
    # target
    if is_append and (source_lang != target_lang):
        cross_lingual_clone = True
    else:
        cross_lingual_clone = False

    if cross_lingual_clone:
        str_origin = new_str[:len(old_str)]
        str_append = new_str[len(old_str):]

        if target_lang == "chinese":
            phns_origin, origin_word2phns = words2phns(str_origin)
            phns_append, append_word2phns_tmp = words2phns_zh(str_append)

        elif target_lang == "english":
            # original sentence
            phns_origin, origin_word2phns = words2phns_zh(str_origin)
            # sentence to clone
            phns_append, append_word2phns_tmp = words2phns(str_append)
        else:
            assert target_lang == "chinese" or target_lang == "english", \
                "cloning is not support for this language, please check it."

        new_phns = phns_origin + phns_append

        append_word2phns = {}
        length = len(origin_word2phns)
        for key, value in append_word2phns_tmp.items():
            idx, wrd = key.split('_')
            append_word2phns[str(int(idx) + length) + '_' + wrd] = value
        new_word2phns = origin_word2phns.copy()
        new_word2phns.update(append_word2phns)

    else:
        if source_lang == target_lang and target_lang == "english":
            new_phns, new_word2phns = words2phns(new_str)
        elif source_lang == target_lang and target_lang == "chinese":
            new_phns, new_word2phns = words2phns_zh(new_str)
        else:
            assert source_lang == target_lang, \
                "source language is not same with target language..."

    span_to_repl = [0, len(old_phns) - 1]
    span_to_add = [0, len(new_phns) - 1]
    left_idx = 0
    new_phns_left = []
    sp_count = 0
    # find the left different index
    for key in word2phns.keys():
        idx, wrd = key.split('_')
        if wrd == 'sp':
            sp_count += 1
            new_phns_left.append('sp')
        else:
            idx = str(int(idx) - sp_count)
            if idx + '_' + wrd in new_word2phns:
                left_idx += len(new_word2phns[idx + '_' + wrd])
                new_phns_left.extend(word2phns[key].split())
            else:
                span_to_repl[0] = len(new_phns_left)
                span_to_add[0] = len(new_phns_left)
                break

    # reverse word2phns and new_word2phns
    right_idx = 0
    new_phns_right = []
    sp_count = 0
    word2phns_max_idx = get_max_idx(word2phns)
    new_word2phns_max_idx = get_max_idx(new_word2phns)
    new_phns_mid = []
    if is_append:
        new_phns_right = []
        new_phns_mid = new_phns[left_idx:]
        span_to_repl[0] = len(new_phns_left)
        span_to_add[0] = len(new_phns_left)
        span_to_add[1] = len(new_phns_left) + len(new_phns_mid)
        span_to_repl[1] = len(old_phns) - len(new_phns_right)
    # speech edit
    else:
        for key in list(word2phns.keys())[::-1]:
            idx, wrd = key.split('_')
            if wrd == 'sp':
                sp_count += 1
                new_phns_right = ['sp'] + new_phns_right
            else:
                idx = str(new_word2phns_max_idx - (word2phns_max_idx - int(idx)
                                                   - sp_count))
                if idx + '_' + wrd in new_word2phns:
                    right_idx -= len(new_word2phns[idx + '_' + wrd])
                    new_phns_right = word2phns[key].split() + new_phns_right
                else:
                    span_to_repl[1] = len(old_phns) - len(new_phns_right)
                    new_phns_mid = new_phns[left_idx:right_idx]
                    span_to_add[1] = len(new_phns_left) + len(new_phns_mid)
                    if len(new_phns_mid) == 0:
                        span_to_add[1] = min(span_to_add[1] + 1, len(new_phns))
                        span_to_add[0] = max(0, span_to_add[0] - 1)
                        span_to_repl[0] = max(0, span_to_repl[0] - 1)
                        span_to_repl[1] = min(span_to_repl[1] + 1,
                                              len(old_phns))
                    break
    new_phns = new_phns_left + new_phns_mid + new_phns_right
    '''
    For that reason cover should not be given.
    For that reason cover is impossible to be given.
    span_to_repl: [17, 23] "should not"
    span_to_add: [17, 30] "is impossible to"
    '''
    return mfa_start, mfa_end, old_phns, new_phns, span_to_repl, span_to_add


# the durations from MFA and the durations from fs2's duration_predictor may differ;
# compute a scaling factor here between the predicted and ground-truth values
def get_dur_adj_factor(orig_dur: List[int],
                       pred_dur: List[int],
                       phns: List[str]):
    length = 0
    factor_list = []
    for orig, pred, phn in zip(orig_dur, pred_dur, phns):
        if pred == 0 or phn == 'sp':
            continue
        else:
            factor_list.append(orig / pred)
    factor_list = np.array(factor_list)
    factor_list.sort()
    if len(factor_list) < 5:
        return 1
    length = 2
    avg = np.average(factor_list[length:-length])
    return avg


def prep_feats_with_dur(wav_path: str,
                        source_lang: str="English",
                        target_lang: str="English",
                        old_str: str="",
                        new_str: str="",
                        mask_reconstruct: bool=False,
                        duration_adjust: bool=True,
                        start_end_sp: bool=False,
                        fs: int=24000,
                        hop_length: int=300):
    '''
    Returns:
        np.ndarray: new wav, replace the part to be edited in original wav with 0
        List[str]: new phones
        List[float]: mfa start of new wav
        List[float]: mfa end of new wav
        List[int]: masked mel boundary of original wav
        List[int]: masked mel boundary of new wav
    '''
    wav_org, _ = librosa.load(wav_path, sr=fs)

    mfa_start, mfa_end, old_phns, new_phns, span_to_repl, span_to_add = get_phns_and_spans(
        wav_path=wav_path,
        old_str=old_str,
        new_str=new_str,
        source_lang=source_lang,
        target_lang=target_lang)

    if start_end_sp:
        if new_phns[-1] != 'sp':
            new_phns = new_phns + ['sp']
    # Chinese phns are not guaranteed to be in the fastspeech2 dict; use sp instead
    if target_lang == "english" or target_lang == "chinese":
        old_durs = eval_durs(old_phns, target_lang=source_lang)
    else:
        assert target_lang == "chinese" or target_lang == "english", \
            "calculate duration_predict is not support for this language..."

    orig_old_durs = [e - s for e, s in zip(mfa_end, mfa_start)]

    if '[MASK]' in new_str:
        new_phns = old_phns
        span_to_add = span_to_repl
        d_factor_left = get_dur_adj_factor(
            orig_dur=orig_old_durs[:span_to_repl[0]],
            pred_dur=old_durs[:span_to_repl[0]],
            phns=old_phns[:span_to_repl[0]])
        d_factor_right = get_dur_adj_factor(
            orig_dur=orig_old_durs[span_to_repl[1]:],
            pred_dur=old_durs[span_to_repl[1]:],
            phns=old_phns[span_to_repl[1]:])
        d_factor = (d_factor_left + d_factor_right) / 2
        new_durs_adjusted = [d_factor * i for i in old_durs]
    else:
        if duration_adjust:
            d_factor = get_dur_adj_factor(
                orig_dur=orig_old_durs, pred_dur=old_durs, phns=old_phns)
            d_factor = d_factor * 1.25
        else:
            d_factor = 1

        if target_lang == "english" or target_lang == "chinese":
            new_durs = eval_durs(new_phns, target_lang=target_lang)
        else:
            assert target_lang == "chinese" or target_lang == "english", \
                "calculate duration_predict is not support for this language..."

        new_durs_adjusted = [d_factor * i for i in new_durs]

    new_span_dur_sum = sum(new_durs_adjusted[span_to_add[0]:span_to_add[1]])
    old_span_dur_sum = sum(orig_old_durs[span_to_repl[0]:span_to_repl[1]])
    dur_offset = new_span_dur_sum - old_span_dur_sum
    new_mfa_start = mfa_start[:span_to_repl[0]]
    new_mfa_end = mfa_end[:span_to_repl[0]]

    for i in new_durs_adjusted[span_to_add[0]:span_to_add[1]]:
        if len(new_mfa_end) == 0:
            new_mfa_start.append(0)
            new_mfa_end.append(i)
        else:
            new_mfa_start.append(new_mfa_end[-1])
            new_mfa_end.append(new_mfa_end[-1] + i)

    new_mfa_start += [i + dur_offset for i in mfa_start[span_to_repl[1]:]]
    new_mfa_end += [i + dur_offset for i in mfa_end[span_to_repl[1]:]]

    # 3. get new wav
    # append after the original sentence
    if span_to_repl[0] >= len(mfa_start):
        left_idx = len(wav_org)
        right_idx = left_idx
    # replace inside the original sentence
    else:
        left_idx = int(np.floor(mfa_start[span_to_repl[0]] * fs))
        right_idx = int(np.ceil(mfa_end[span_to_repl[1] - 1] * fs))

    blank_wav = np.zeros(
        (int(np.ceil(new_span_dur_sum * fs)), ), dtype=wav_org.dtype)
    # in the original audio, replace the span to be edited with silence
    # whose length is decided by fs2's duration_predictor
    new_wav = np.concatenate(
        [wav_org[:left_idx], blank_wav, wav_org[right_idx:]])

    # 4. get old and new mel span to be mask
    # [92, 92]
    old_span_bdy, mfa_start, mfa_end = get_masked_mel_bdy(
        mfa_start=mfa_start,
        mfa_end=mfa_end,
        fs=fs,
        hop_length=hop_length,
        span_to_repl=span_to_repl)
    # [92, 174]
    # new_mfa_start, new_mfa_end: second-level start/end times -> frame level
    new_span_bdy, new_mfa_start, new_mfa_end = get_masked_mel_bdy(
        mfa_start=new_mfa_start,
        mfa_end=new_mfa_end,
        fs=fs,
        hop_length=hop_length,
        span_to_repl=span_to_add)

    # old_span_bdy and new_span_bdy are frame-level ranges
    return new_wav, new_phns, new_mfa_start, new_mfa_end, old_span_bdy, new_span_bdy


def prep_feats(wav_path: str,
               source_lang: str="english",
               target_lang: str="english",
               old_str: str="",
               new_str: str="",
               duration_adjust: bool=True,
               start_end_sp: bool=False,
               mask_reconstruct: bool=False,
               fs: int=24000,
               hop_length: int=300,
               token_list: List[str]=[]):
    wav, phns, mfa_start, mfa_end, old_span_bdy, new_span_bdy = prep_feats_with_dur(
        source_lang=source_lang,
        target_lang=target_lang,
        old_str=old_str,
        new_str=new_str,
        wav_path=wav_path,
        duration_adjust=duration_adjust,
        start_end_sp=start_end_sp,
        mask_reconstruct=mask_reconstruct,
        fs=fs,
        hop_length=hop_length)

    token_to_id = {item: i for i, item in enumerate(token_list)}
    text = np.array(
        list(map(lambda x: token_to_id.get(x, token_to_id['<unk>']), phns)))
    span_bdy = np.array(new_span_bdy)

    batch = [('1', {
        "speech": wav,
        "align_start": mfa_start,
        "align_end": mfa_end,
        "text": text,
        "span_bdy": span_bdy
    })]

    return batch, old_span_bdy, new_span_bdy


def decode_with_model(mlm_model: nn.Layer,
                      collate_fn,
                      wav_path: str,
                      source_lang: str="english",
                      target_lang: str="english",
                      old_str: str="",
                      new_str: str="",
                      use_teacher_forcing: bool=False,
                      duration_adjust: bool=True,
                      start_end_sp: bool=False,
                      fs: int=24000,
                      hop_length: int=300,
                      token_list: List[str]=[]):
    batch, old_span_bdy, new_span_bdy = prep_feats(
        source_lang=source_lang,
        target_lang=target_lang,
        wav_path=wav_path,
        old_str=old_str,
        new_str=new_str,
        duration_adjust=duration_adjust,
        start_end_sp=start_end_sp,
        fs=fs,
        hop_length=hop_length,
        token_list=token_list)

    feats = collate_fn(batch)[1]

    if 'text_masked_pos' in feats.keys():
        feats.pop('text_masked_pos')

    output = mlm_model.inference(
        text=feats['text'],
        speech=feats['speech'],
        masked_pos=feats['masked_pos'],
        speech_mask=feats['speech_mask'],
        text_mask=feats['text_mask'],
        speech_seg_pos=feats['speech_seg_pos'],
        text_seg_pos=feats['text_seg_pos'],
        span_bdy=new_span_bdy,
        use_teacher_forcing=use_teacher_forcing)

    # concatenate the output features
    output_feat = paddle.concat(x=output, axis=0)
    wav_org, _ = librosa.load(wav_path, sr=fs)
    return wav_org, output_feat, old_span_bdy, new_span_bdy, fs, hop_length


def get_mlm_output(wav_path: str,
                   model_name: str="paddle_checkpoint_en",
                   source_lang: str="english",
                   target_lang: str="english",
                   old_str: str="",
                   new_str: str="",
                   use_teacher_forcing: bool=False,
                   duration_adjust: bool=True,
                   start_end_sp: bool=False):
    mlm_model, train_conf = load_model(model_name)

    collate_fn = build_mlm_collate_fn(
        sr=train_conf.fs,
        n_fft=train_conf.n_fft,
        hop_length=train_conf.n_shift,
        win_length=train_conf.win_length,
        n_mels=train_conf.n_mels,
        fmin=train_conf.fmin,
        fmax=train_conf.fmax,
        mlm_prob=train_conf.mlm_prob,
        mean_phn_span=train_conf.mean_phn_span,
        seg_emb=train_conf.model['enc_input_layer'] == 'sega_mlm')

    return decode_with_model(
        source_lang=source_lang,
        target_lang=target_lang,
        mlm_model=mlm_model,
        collate_fn=collate_fn,
        wav_path=wav_path,
        old_str=old_str,
        new_str=new_str,
        use_teacher_forcing=use_teacher_forcing,
        duration_adjust=duration_adjust,
        start_end_sp=start_end_sp,
        fs=train_conf.fs,
        hop_length=train_conf.n_shift,
        token_list=train_conf.token_list)


def evaluate(uid: str,
             source_lang: str="english",
             target_lang: str="english",
             prefix: os.PathLike="./prompt/dev/",
             model_name: str="paddle_checkpoint_en",
             new_str: str="",
             prompt_decoding: bool=False,
             task_name: str=None):

    # get origin text and path of origin wav
    old_str, wav_path = read_data(uid=uid, prefix=prefix)

    if task_name == 'edit':
        new_str = new_str
    elif task_name == 'synthesize':
        new_str = old_str + new_str
    else:
        new_str = old_str + ' '.join([ch for ch in new_str if is_chinese(ch)])

    print('new_str is ', new_str)

    results_dict = get_wav(
        source_lang=source_lang,
        target_lang=target_lang,
        model_name=model_name,
        wav_path=wav_path,
        old_str=old_str,
        new_str=new_str)
    return results_dict


if __name__ == "__main__":
    # parse config and args
    args = parse_args()

    data_dict = evaluate(
        uid=args.uid,
        source_lang=args.source_lang,
        target_lang=args.target_lang,
        prefix=args.prefix,
        model_name=args.model_name,
        new_str=args.new_str,
        task_name=args.task_name)
    sf.write(args.output_name, data_dict['output'], samplerate=24000)
    print("finished...")
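get_masked_mel_bdy above converts second-level MFA timestamps to mel-frame indices with floor(fs * t / hop_length); at fs = 24000 and hop_length = 300 (the values in these configs) that is 80 frames per second. A small worked check of that conversion, with illustrative timestamps rather than a real alignment:

    import numpy as np

    fs, hop_length = 24000, 300          # 12.5 ms hop, from the ERNIE-SAT configs
    mfa_end = np.array([0.30, 1.15])     # hypothetical phone end times in seconds

    frames = np.floor(fs * mfa_end / hop_length).astype('int')
    print(frames)                        # [24 92] -> 80 frames per second of audio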
examples/ernie_sat/local/sedit_arg_parser.py

# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
...
examples/ernie_sat/local/utils.py

# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from pathlib import Path
from typing import Dict
from typing import List
...
examples/ernie_sat/run_clone_en_to_zh_new.sh
0 → 100755

#!/bin/bash

set -e
source path.sh

# en --> zh speech synthesis (cross-lingual voice cloning)
# Uses Prompt_003_new as the prompt speech ("This was not the show for me.")
# to synthesize: '今天天气很好'
# Note: the input new_str must be Chinese characters; otherwise preprocessing keeps
# only the Chinese characters, i.e. the preprocessed Chinese text is what gets synthesized.
python local/inference_new.py \
    --task_name=cross-lingual_clone \
    --model_name=paddle_checkpoint_dual_mask_enzh \
    --uid=Prompt_003_new \
    --new_str='今天天气很好.' \
    --prefix='./prompt/dev/' \
    --source_lang=english \
    --target_lang=chinese \
    --output_name=pred_clone.wav \
    --voc=pwgan_aishell3 \
    --voc_config=download/pwg_aishell3_ckpt_0.5/default.yaml \
    --voc_ckpt=download/pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \
    --voc_stat=download/pwg_aishell3_ckpt_0.5/feats_stats.npy \
    --am=fastspeech2_csmsc \
    --am_config=download/fastspeech2_conformer_baker_ckpt_0.5/conformer.yaml \
    --am_ckpt=download/fastspeech2_conformer_baker_ckpt_0.5/snapshot_iter_76000.pdz \
    --am_stat=download/fastspeech2_conformer_baker_ckpt_0.5/speech_stats.npy \
    --phones_dict=download/fastspeech2_conformer_baker_ckpt_0.5/phone_id_map.txt
examples/ernie_sat/run_gen_en_new.sh
0 → 100755

#!/bin/bash

set -e
source path.sh

# English-only speech synthesis
# Example: use the speech of p299_096 as the prompt ("This was not the show for me.")
# to synthesize: 'I enjoy my life.'
python local/inference_new.py \
    --task_name=synthesize \
    --model_name=paddle_checkpoint_en \
    --uid=p299_096 \
    --new_str='I enjoy my life, do you?' \
    --prefix='./prompt/dev/' \
    --source_lang=english \
    --target_lang=english \
    --output_name=pred_gen.wav \
    --voc=pwgan_aishell3 \
    --voc_config=download/pwg_aishell3_ckpt_0.5/default.yaml \
    --voc_ckpt=download/pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \
    --voc_stat=download/pwg_aishell3_ckpt_0.5/feats_stats.npy \
    --am=fastspeech2_ljspeech \
    --am_config=download/fastspeech2_nosil_ljspeech_ckpt_0.5/default.yaml \
    --am_ckpt=download/fastspeech2_nosil_ljspeech_ckpt_0.5/snapshot_iter_100000.pdz \
    --am_stat=download/fastspeech2_nosil_ljspeech_ckpt_0.5/speech_stats.npy \
    --phones_dict=download/fastspeech2_nosil_ljspeech_ckpt_0.5/phone_id_map.txt
examples/ernie_sat/run_sedit_en_new.sh
0 → 100755

#!/bin/bash

set -e
source path.sh

# English-only speech editing
# Example: edit the original speech of p243_new ("For that reason cover should not be given.")
# into the speech for 'for that reason cover is impossible to be given.'
# NOTE: speech editing currently supports replacing or inserting text at only one position in a sentence
python local/inference_new.py \
    --task_name=edit \
    --model_name=paddle_checkpoint_en \
    --uid=p243_new \
    --new_str='for that reason cover is impossible to be given.' \
    --prefix='./prompt/dev/' \
    --source_lang=english \
    --target_lang=english \
    --output_name=pred_edit.wav \
    --voc=pwgan_aishell3 \
    --voc_config=download/pwg_aishell3_ckpt_0.5/default.yaml \
    --voc_ckpt=download/pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \
    --voc_stat=download/pwg_aishell3_ckpt_0.5/feats_stats.npy \
    --am=fastspeech2_ljspeech \
    --am_config=download/fastspeech2_nosil_ljspeech_ckpt_0.5/default.yaml \
    --am_ckpt=download/fastspeech2_nosil_ljspeech_ckpt_0.5/snapshot_iter_100000.pdz \
    --am_stat=download/fastspeech2_nosil_ljspeech_ckpt_0.5/speech_stats.npy \
    --phones_dict=download/fastspeech2_nosil_ljspeech_ckpt_0.5/phone_id_map.txt
examples/ernie_sat/test_run_new.sh
0 → 100755

#!/bin/bash

rm -rf *.wav
./run_sedit_en_new.sh          # speech editing task (English)
./run_gen_en_new.sh            # personalized speech synthesis task (English)
./run_clone_en_to_zh_new.sh    # cross-lingual speech synthesis task (English-to-Chinese voice cloning)
\ No newline at end of file
examples/vctk/ernie_sat/conf/default.yaml
0 → 100644

###########################################################
#                FEATURE EXTRACTION SETTING               #
###########################################################
fs: 24000          # sr
n_fft: 2048        # FFT size (samples).
n_shift: 300       # Hop size (samples). 12.5ms
win_length: 1200   # Window length (samples). 50ms
                   # If set to null, it will be the same as fft_size.
window: "hann"     # Window function.
# Only used for feats_type != raw
fmin: 80           # Minimum frequency of Mel basis.
fmax: 7600         # Maximum frequency of Mel basis.
n_mels: 80         # The number of mel basis.
mean_phn_span: 8
mlm_prob: 0.8

###########################################################
#                       DATA SETTING                      #
###########################################################
batch_size: 20
num_workers: 2

###########################################################
#                      MODEL SETTING                      #
###########################################################
model:
    text_masking: false
    postnet_layers: 5
    postnet_filts: 5
    postnet_chans: 256
    encoder_type: conformer
    decoder_type: conformer
    enc_input_layer: sega_mlm
    enc_pre_speech_layer: 0
    enc_cnn_module_kernel: 7
    enc_attention_dim: 384
    enc_attention_heads: 2
    enc_linear_units: 1536
    enc_num_blocks: 4
    enc_dropout_rate: 0.2
    enc_positional_dropout_rate: 0.2
    enc_attention_dropout_rate: 0.2
    enc_normalize_before: true
    enc_macaron_style: true
    enc_use_cnn_module: true
    enc_selfattention_layer_type: legacy_rel_selfattn
    enc_activation_type: swish
    enc_pos_enc_layer_type: legacy_rel_pos
    enc_positionwise_layer_type: conv1d
    enc_positionwise_conv_kernel_size: 3
    dec_cnn_module_kernel: 31
    dec_attention_dim: 384
    dec_attention_heads: 2
    dec_linear_units: 1536
    dec_num_blocks: 4
    dec_dropout_rate: 0.2
    dec_positional_dropout_rate: 0.2
    dec_attention_dropout_rate: 0.2
    dec_macaron_style: true
    dec_use_cnn_module: true
    dec_selfattention_layer_type: legacy_rel_selfattn
    dec_activation_type: swish
    dec_pos_enc_layer_type: legacy_rel_pos
    dec_positionwise_layer_type: conv1d
    dec_positionwise_conv_kernel_size: 3

###########################################################
#                    OPTIMIZER SETTING                    #
###########################################################
optimizer:
    optim: adam            # optimizer type
    learning_rate: 0.001   # learning rate

###########################################################
#                     TRAINING SETTING                    #
###########################################################
max_epoch: 200
num_snapshots: 5

###########################################################
#                      OTHER SETTING                      #
###########################################################
seed: 10086

token_list:
- <blank>
- <unk>
- AH0
- T
- N
- sp
- D
- S
- R
- L
- IH1
- DH
- AE1
- M
- EH1
- K
- Z
- W
- HH
- ER0
- AH1
- IY1
- P
- V
- F
- B
- AY1
- IY0
- EY1
- AA1
- AO1
- UW1
- IH0
- OW1
- NG
- G
- SH
- ER1
- Y
- TH
- AW1
- CH
- UH1
- IH2
- JH
- OW0
- EH2
- OY1
- AY2
- EH0
- EY2
- UW0
- AE2
- AA2
- OW2
- AH2
- ZH
- AO2
- IY2
- AE0
- UW2
- AY0
- AA0
- AO0
- AW2
- EY0
- UH2
- ER2
- OY2
- UH0
- AW0
- OY0
- <sos/eos>
\ No newline at end of file
examples/vctk/ernie_sat/local/preprocess.sh
0 → 100755

#!/bin/bash

stage=0
stop_stage=100

config_path=$1

if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    # get durations from MFA's result
    echo "Generate durations.txt from MFA results ..."
    python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \
        --inputdir=./vctk_alignment \
        --output durations.txt \
        --config=${config_path}
fi

if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    # extract features
    echo "Extract features ..."
    python3 ${BIN_DIR}/preprocess.py \
        --dataset=vctk \
        --rootdir=~/datasets/VCTK-Corpus-0.92/ \
        --dumpdir=dump \
        --dur-file=durations.txt \
        --config=${config_path} \
        --num-cpu=20 \
        --cut-sil=True
fi

if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    # get features' stats (mean and std)
    echo "Get features' stats ..."
    python3 ${MAIN_ROOT}/utils/compute_statistics.py \
        --metadata=dump/train/raw/metadata.jsonl \
        --field-name="speech"
fi

if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    # normalize and convert phone/speaker to id; dev and test should use train's stats
    echo "Normalize ..."
    python3 ${BIN_DIR}/normalize.py \
        --metadata=dump/train/raw/metadata.jsonl \
        --dumpdir=dump/train/norm \
        --speech-stats=dump/train/speech_stats.npy \
        --phones-dict=dump/phone_id_map.txt \
        --speaker-dict=dump/speaker_id_map.txt

    python3 ${BIN_DIR}/normalize.py \
        --metadata=dump/dev/raw/metadata.jsonl \
        --dumpdir=dump/dev/norm \
        --speech-stats=dump/train/speech_stats.npy \
        --phones-dict=dump/phone_id_map.txt \
        --speaker-dict=dump/speaker_id_map.txt

    python3 ${BIN_DIR}/normalize.py \
        --metadata=dump/test/raw/metadata.jsonl \
        --dumpdir=dump/test/norm \
        --speech-stats=dump/train/speech_stats.npy \
        --phones-dict=dump/phone_id_map.txt \
        --speaker-dict=dump/speaker_id_map.txt
fi
examples/vctk/ernie_sat/local/synthesize.sh
0 → 100755

#!/bin/bash
\ No newline at end of file
examples/vctk/ernie_sat/local/train.sh
0 → 100755

#!/bin/bash

config_path=$1
train_output_path=$2

python3 ${BIN_DIR}/train.py \
    --train-metadata=dump/train/norm/metadata.jsonl \
    --dev-metadata=dump/dev/norm/metadata.jsonl \
    --config=${config_path} \
    --output-dir=${train_output_path} \
    --ngpu=1 \
    --phones-dict=dump/phone_id_map.txt
\ No newline at end of file
examples/vctk/ernie_sat/path.sh
0 → 100755

#!/bin/bash
export MAIN_ROOT=`realpath ${PWD}/../../../`

export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
export LC_ALL=C

export PYTHONDONTWRITEBYTECODE=1
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}

MODEL=ernie_sat
export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL}
\ No newline at end of file
examples/vctk/ernie_sat/run.sh
0 → 100755

#!/bin/bash

set -e
source path.sh

gpus=0,1
stage=0
stop_stage=100

conf_path=conf/default.yaml
train_output_path=exp/default
ckpt_name=snapshot_iter_153.pdz

# with the following command, you can choose the stage range you want to run
# such as `./run.sh --stage 0 --stop-stage 0`
# this cannot be mixed with positional arguments `$1`, `$2` ...
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1

if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    # prepare data
    ./local/preprocess.sh ${conf_path} || exit -1
fi

if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    # train model, all `ckpt` under `train_output_path/checkpoints/` dir
    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1
fi

if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    # synthesize, vocoder is pwgan
    CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi
examples/vctk/tts3/conf/default.yaml

...
@@ -24,7 +24,7 @@ f0max: 400 # Maximum f0 for pitch extraction.
 #                       DATA SETTING                      #
 ###########################################################
 batch_size: 64
-num_workers: 4
+num_workers: 2
 ###########################################################
...
paddlespeech/t2s/datasets/am_batch_fn.py
浏览文件 @
94688264
...
@@ -28,6 +28,149 @@ from paddlespeech.t2s.modules.nets_utils import phones_masking
from paddlespeech.t2s.modules.nets_utils import phones_text_masking


# built as a factory because extra parameters need to be passed in
def build_erniesat_collate_fn(mlm_prob: float=0.8,
                              mean_phn_span: int=8,
                              seg_emb: bool=False,
                              text_masking: bool=False,
                              epoch: int=-1):
    if epoch == -1:
        mlm_prob_factor = 1
    else:
        mlm_prob_factor = 0.8

    return ErnieSATCollateFn(
        mlm_prob=mlm_prob * mlm_prob_factor,
        mean_phn_span=mean_phn_span,
        seg_emb=seg_emb,
        text_masking=text_masking)


class ErnieSATCollateFn:
    """Functor class of common_collate_fn()"""

    def __init__(self,
                 mlm_prob: float=0.8,
                 mean_phn_span: int=8,
                 seg_emb: bool=False,
                 text_masking: bool=False):
        self.mlm_prob = mlm_prob
        self.mean_phn_span = mean_phn_span
        self.seg_emb = seg_emb
        self.text_masking = text_masking

    def __call__(self, examples):
        return erniesat_batch_fn(
            examples,
            mlm_prob=self.mlm_prob,
            mean_phn_span=self.mean_phn_span,
            seg_emb=self.seg_emb,
            text_masking=self.text_masking)


def erniesat_batch_fn(examples,
                      mlm_prob: float=0.8,
                      mean_phn_span: int=8,
                      seg_emb: bool=False,
                      text_masking: bool=False):
    # fields = ["text", "text_lengths", "speech", "speech_lengths", "durations", "pitch", "energy"]
    text = [np.array(item["text"], dtype=np.int64) for item in examples]
    speech = [np.array(item["speech"], dtype=np.float32) for item in examples]
    text_lengths = [
        np.array(item["text_lengths"], dtype=np.int64) for item in examples
    ]
    speech_lengths = [
        np.array(item["speech_lengths"], dtype=np.int64) for item in examples
    ]
    align_start = [
        np.array(item["align_start"], dtype=np.int64) for item in examples
    ]
    align_end = [
        np.array(item["align_end"], dtype=np.int64) for item in examples
    ]
    align_start_lengths = [
        np.array(len(item["align_start"]), dtype=np.int64) for item in examples
    ]

    # add_pad
    text = batch_sequences(text)
    speech = batch_sequences(speech)
    align_start = batch_sequences(align_start)
    align_end = batch_sequences(align_end)

    # convert each batch to paddle.Tensor
    text = paddle.to_tensor(text)
    speech = paddle.to_tensor(speech)
    text_lengths = paddle.to_tensor(text_lengths)
    speech_lengths = paddle.to_tensor(speech_lengths)
    align_start_lengths = paddle.to_tensor(align_start_lengths)

    speech_pad = speech
    text_pad = text

    text_mask = make_non_pad_mask(
        text_lengths, text_pad, length_dim=1).unsqueeze(-2)
    speech_mask = make_non_pad_mask(
        speech_lengths, speech_pad[:, :, 0], length_dim=1).unsqueeze(-2)

    # dual masking: when mixing Chinese and English, speech and text are masked at the same time
    # ERNIE-SAT masks both in the cross-lingual setting
    span_bdy = None
    if text_masking:
        masked_pos, text_masked_pos = phones_text_masking(
            xs_pad=speech_pad,
            src_mask=speech_mask,
            text_pad=text_pad,
            text_mask=text_mask,
            align_start=align_start,
            align_end=align_end,
            align_start_lens=align_start_lengths,
            mlm_prob=mlm_prob,
            mean_phn_span=mean_phn_span,
            span_bdy=span_bdy)
    # training on pure Chinese or pure English -> A3T does not mask phonemes, only speech
    # the main difference between A3T and ERNIE-SAT is how masking is done
    else:
        masked_pos = phones_masking(
            xs_pad=speech_pad,
            src_mask=speech_mask,
            align_start=align_start,
            align_end=align_end,
            align_start_lens=align_start_lengths,
            mlm_prob=mlm_prob,
            mean_phn_span=mean_phn_span,
            span_bdy=span_bdy)
        text_masked_pos = paddle.zeros(paddle.shape(text_pad))

    speech_seg_pos, text_seg_pos = get_seg_pos(
        speech_pad=speech_pad,
        text_pad=text_pad,
        align_start=align_start,
        align_end=align_end,
        align_start_lens=align_start_lengths,
        seg_emb=seg_emb)

    batch = {
        "text": text,
        "speech": speech,
        # need to generate
        "masked_pos": masked_pos,
        "speech_mask": speech_mask,
        "text_mask": text_mask,
        "speech_seg_pos": speech_seg_pos,
        "text_seg_pos": text_seg_pos,
        "text_masked_pos": text_masked_pos
    }

    return batch


def tacotron2_single_spk_batch_fn(examples):
    # fields = ["text", "text_lengths", "speech", "speech_lengths"]
    text = [np.array(item["text"], dtype=np.int64) for item in examples]
...
@@ -378,7 +521,6 @@ class MLMCollateFn:
             mean_phn_span=self.mean_phn_span,
             seg_emb=self.seg_emb,
             text_masking=self.text_masking,
             attention_window=self.attention_window,
             not_sequence=self.not_sequence)
...
@@ -389,7 +531,6 @@ def mlm_collate_fn(
         mean_phn_span: int=8,
         seg_emb: bool=False,
         text_masking: bool=False,
         attention_window: int=0,
         pad_value: int=0,
         not_sequence: Collection[str]=(),
 ) -> Tuple[List[str], Dict[str, paddle.Tensor]]:
...
@@ -420,6 +561,7 @@ def mlm_collate_fn(
     feats = feats_extract.get_log_mel_fbank(np.array(output["speech"][0]))
     feats = paddle.to_tensor(feats)
+    print("feats.shape:", feats.shape)
     feats_lens = paddle.shape(feats)[0]
     feats = paddle.unsqueeze(feats, 0)
...
@@ -439,6 +581,7 @@ def mlm_collate_fn(
         text_lens, text_pad, length_dim=1).unsqueeze(-2)
     speech_mask = make_non_pad_mask(
         feats_lens, speech_pad[:, :, 0], length_dim=1).unsqueeze(-2)
+    span_bdy = None
     if 'span_bdy' in output.keys():
         span_bdy = output['span_bdy']
...
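To make the collate pipeline above concrete, here is a minimal sketch of how `build_erniesat_collate_fn` could be exercised on toy utterances. The feature sizes (80-dim mels, a handful of frames) and the random inputs are illustrative assumptions, not values fixed by this commit:

import numpy as np

from paddlespeech.t2s.datasets.am_batch_fn import build_erniesat_collate_fn

def toy_example(n_phones, n_frames, odim=80):
    # frame-level alignment boundaries: align_end[-1] == n_frames
    bounds = np.linspace(0, n_frames, n_phones + 1, dtype=np.int64)
    return {
        "text": np.random.randint(0, 10, size=(n_phones, ), dtype=np.int64),
        "speech": np.random.randn(n_frames, odim).astype(np.float32),
        "text_lengths": n_phones,
        "speech_lengths": n_frames,
        "align_start": bounds[:-1],
        "align_end": bounds[1:],
    }

collate_fn = build_erniesat_collate_fn(
    mlm_prob=0.8, mean_phn_span=8, seg_emb=True, text_masking=False)
batch = collate_fn([toy_example(4, 12), toy_example(6, 20)])
# same keys as the `batch` dict returned by erniesat_batch_fn above
print({k: tuple(v.shape) for k, v in batch.items()})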
paddlespeech/t2s/exps/ernie_sat/__init__.py
0 → 100644
paddlespeech/t2s/exps/ernie_sat/normalize.py
0 → 100644
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Normalize feature files and dump them."""
import argparse
import logging
from operator import itemgetter
from pathlib import Path

import jsonlines
import numpy as np
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

from paddlespeech.t2s.datasets.data_table import DataTable


def main():
    """Run preprocessing process."""
    parser = argparse.ArgumentParser(
        description="Normalize dumped raw features (See detail in parallel_wavegan/bin/normalize.py)."
    )
    parser.add_argument(
        "--metadata",
        type=str,
        required=True,
        help="directory including feature files to be normalized. "
        "you need to specify either *-scp or rootdir.")
    parser.add_argument(
        "--dumpdir",
        type=str,
        required=True,
        help="directory to dump normalized feature files.")
    parser.add_argument(
        "--speech-stats",
        type=str,
        required=True,
        help="speech statistics file.")
    parser.add_argument(
        "--phones-dict", type=str, default=None, help="phone vocabulary file.")
    parser.add_argument(
        "--speaker-dict", type=str, default=None, help="speaker id map file.")

    args = parser.parse_args()

    dumpdir = Path(args.dumpdir).expanduser()
    # use absolute path
    dumpdir = dumpdir.resolve()
    dumpdir.mkdir(parents=True, exist_ok=True)

    # get dataset
    with jsonlines.open(args.metadata, 'r') as reader:
        metadata = list(reader)
    dataset = DataTable(
        metadata, converters={
            "speech": np.load,
        })
    logging.info(f"The number of files = {len(dataset)}.")

    # restore scaler
    speech_scaler = StandardScaler()
    speech_scaler.mean_ = np.load(args.speech_stats)[0]
    speech_scaler.scale_ = np.load(args.speech_stats)[1]
    speech_scaler.n_features_in_ = speech_scaler.mean_.shape[0]

    vocab_phones = {}
    with open(args.phones_dict, 'rt') as f:
        phn_id = [line.strip().split() for line in f.readlines()]
    for phn, id in phn_id:
        vocab_phones[phn] = int(id)

    vocab_speaker = {}
    with open(args.speaker_dict, 'rt') as f:
        spk_id = [line.strip().split() for line in f.readlines()]
    for spk, id in spk_id:
        vocab_speaker[spk] = int(id)

    # process each file
    output_metadata = []

    for item in tqdm(dataset):
        utt_id = item['utt_id']
        speech = item['speech']

        # normalize
        speech = speech_scaler.transform(speech)
        speech_dir = dumpdir / "data_speech"
        speech_dir.mkdir(parents=True, exist_ok=True)
        speech_path = speech_dir / f"{utt_id}_speech.npy"
        np.save(speech_path, speech.astype(np.float32), allow_pickle=False)

        phone_ids = [vocab_phones[p] for p in item['phones']]
        spk_id = vocab_speaker[item["speaker"]]

        record = {
            "utt_id": item['utt_id'],
            "spk_id": spk_id,
            "text": phone_ids,
            "text_lengths": item['text_lengths'],
            "speech_lengths": item['speech_lengths'],
            "durations": item['durations'],
            "speech": str(speech_path),
            "align_start": item['align_start'],
            "align_end": item['align_end'],
        }

        # add spk_emb for voice cloning
        if "spk_emb" in item:
            record["spk_emb"] = str(item["spk_emb"])

        output_metadata.append(record)
    output_metadata.sort(key=itemgetter('utt_id'))
    output_metadata_path = Path(args.dumpdir) / "metadata.jsonl"
    with jsonlines.open(output_metadata_path, 'w') as writer:
        for item in output_metadata:
            writer.write(item)
    logging.info(f"metadata dumped into {output_metadata_path}")


if __name__ == "__main__":
    main()
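The scaler restore above fixes the on-disk layout of `speech_stats.npy`: row 0 holds the per-dimension mean and row 1 the per-dimension scale. A compatible stats file could be produced like this (a sketch with random stand-in features, not the extractor this recipe actually uses):

import numpy as np
from sklearn.preprocessing import StandardScaler

# fit on training mel frames stacked into (num_frames, n_mels)
scaler = StandardScaler()
scaler.partial_fit(np.random.randn(1000, 80))  # stand-in for real features

# row 0 = mean_, row 1 = scale_, exactly what normalize.py reloads
stats = np.stack([scaler.mean_, scaler.scale_], axis=0)
np.save("speech_stats.npy", stats.astype(np.float32))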
paddlespeech/t2s/exps/ernie_sat/preprocess.py
0 → 100644
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
from concurrent.futures import ThreadPoolExecutor
from operator import itemgetter
from pathlib import Path
from typing import Any
from typing import Dict
from typing import List

import jsonlines
import librosa
import numpy as np
import tqdm
import yaml
from yacs.config import CfgNode

from paddlespeech.t2s.datasets.get_feats import LogMelFBank
from paddlespeech.t2s.datasets.preprocess_utils import compare_duration_and_mel_length
from paddlespeech.t2s.datasets.preprocess_utils import get_input_token
from paddlespeech.t2s.datasets.preprocess_utils import get_phn_dur
from paddlespeech.t2s.datasets.preprocess_utils import get_spk_id_map
from paddlespeech.t2s.datasets.preprocess_utils import merge_silence
from paddlespeech.t2s.utils import str2bool


def process_sentence(config: Dict[str, Any],
                     fp: Path,
                     sentences: Dict,
                     output_dir: Path,
                     mel_extractor=None,
                     cut_sil: bool=True,
                     spk_emb_dir: Path=None):
    utt_id = fp.stem
    # for vctk
    if utt_id.endswith("_mic2"):
        utt_id = utt_id[:-5]
    record = None
    if utt_id in sentences:
        # reading, resampling may occur
        wav, _ = librosa.load(str(fp), sr=config.fs)
        if len(wav.shape) != 1:
            return record
        max_value = np.abs(wav).max()
        if max_value > 1.0:
            wav = wav / max_value
        assert len(wav.shape) == 1, f"{utt_id} is not a mono-channel audio."
        assert np.abs(wav).max(
        ) <= 1.0, f"{utt_id} seems to be different from 16 bit PCM."
        phones = sentences[utt_id][0]
        durations = sentences[utt_id][1]
        speaker = sentences[utt_id][2]
        d_cumsum = np.pad(np.array(durations).cumsum(0), (1, 0), 'constant')
        # slightly less precise than using *.TextGrid directly
        times = librosa.frames_to_time(
            d_cumsum, sr=config.fs, hop_length=config.n_shift)
        if cut_sil:
            start = 0
            end = d_cumsum[-1]
            if phones[0] == "sil" and len(durations) > 1:
                start = times[1]
                durations = durations[1:]
                phones = phones[1:]
            if phones[-1] == 'sil' and len(durations) > 1:
                end = times[-2]
                durations = durations[:-1]
                phones = phones[:-1]
            sentences[utt_id][0] = phones
            sentences[utt_id][1] = durations
            start, end = librosa.time_to_samples([start, end], sr=config.fs)
            wav = wav[start:end]
        # extract mel feats
        logmel = mel_extractor.get_log_mel_fbank(wav)
        # change duration according to mel_length
        compare_duration_and_mel_length(sentences, utt_id, logmel)
        # utt_id may be popped in compare_duration_and_mel_length
        if utt_id not in sentences:
            return None
        phones = sentences[utt_id][0]
        durations = sentences[utt_id][1]
        num_frames = logmel.shape[0]
        assert sum(durations) == num_frames

        new_d_cumsum = np.pad(
            np.array(durations).cumsum(0), (1, 0), 'constant')
        align_start = new_d_cumsum[:-1]
        align_end = new_d_cumsum[1:]
        assert len(align_start) == len(align_end) == len(durations)

        mel_dir = output_dir / "data_speech"
        mel_dir.mkdir(parents=True, exist_ok=True)
        mel_path = mel_dir / (utt_id + "_speech.npy")
        np.save(mel_path, logmel)
        # align_start_lengths == text_lengths
        record = {
            "utt_id": utt_id,
            "phones": phones,
            "text_lengths": len(phones),
            "speech_lengths": num_frames,
            "durations": durations,
            "speech": str(mel_path),
            "speaker": speaker,
            "align_start": align_start.tolist(),
            "align_end": align_end.tolist(),
        }
        if spk_emb_dir:
            if speaker in os.listdir(spk_emb_dir):
                embed_name = utt_id + ".npy"
                embed_path = spk_emb_dir / speaker / embed_name
                if embed_path.is_file():
                    record["spk_emb"] = str(embed_path)
                else:
                    return None
    return record


def process_sentences(config,
                      fps: List[Path],
                      sentences: Dict,
                      output_dir: Path,
                      mel_extractor=None,
                      nprocs: int=1,
                      cut_sil: bool=True,
                      spk_emb_dir: Path=None):
    if nprocs == 1:
        results = []
        for fp in tqdm.tqdm(fps, total=len(fps)):
            record = process_sentence(
                config=config,
                fp=fp,
                sentences=sentences,
                output_dir=output_dir,
                mel_extractor=mel_extractor,
                cut_sil=cut_sil,
                spk_emb_dir=spk_emb_dir)
            if record:
                results.append(record)
    else:
        with ThreadPoolExecutor(nprocs) as pool:
            futures = []
            with tqdm.tqdm(total=len(fps)) as progress:
                for fp in fps:
                    future = pool.submit(process_sentence, config, fp,
                                         sentences, output_dir, mel_extractor,
                                         cut_sil, spk_emb_dir)
                    future.add_done_callback(lambda p: progress.update())
                    futures.append(future)

                results = []
                for ft in futures:
                    record = ft.result()
                    if record:
                        results.append(record)

    results.sort(key=itemgetter("utt_id"))
    with jsonlines.open(output_dir / "metadata.jsonl", 'w') as writer:
        for item in results:
            writer.write(item)
    print("Done")


def main():
    # parse config and args
    parser = argparse.ArgumentParser(
        description="Preprocess audio and then extract features.")

    parser.add_argument(
        "--dataset",
        default="baker",
        type=str,
        help="name of dataset, should be in {baker, aishell3, ljspeech, vctk} now"
    )

    parser.add_argument(
        "--rootdir", default=None, type=str, help="directory to dataset.")

    parser.add_argument(
        "--dumpdir",
        type=str,
        required=True,
        help="directory to dump feature files.")
    parser.add_argument(
        "--dur-file", default=None, type=str, help="path to durations.txt.")

    parser.add_argument("--config", type=str, help="fastspeech2 config file.")

    parser.add_argument(
        "--num-cpu", type=int, default=1, help="number of process.")

    parser.add_argument(
        "--cut-sil",
        type=str2bool,
        default=True,
        help="whether cut sil in the edge of audio")

    parser.add_argument(
        "--spk_emb_dir",
        default=None,
        type=str,
        help="directory to speaker embedding files.")
    args = parser.parse_args()

    rootdir = Path(args.rootdir).expanduser()
    dumpdir = Path(args.dumpdir).expanduser()
    # use absolute path
    dumpdir = dumpdir.resolve()
    dumpdir.mkdir(parents=True, exist_ok=True)
    dur_file = Path(args.dur_file).expanduser()

    if args.spk_emb_dir:
        spk_emb_dir = Path(args.spk_emb_dir).expanduser().resolve()
    else:
        spk_emb_dir = None

    assert rootdir.is_dir()
    assert dur_file.is_file()

    with open(args.config, 'rt') as f:
        config = CfgNode(yaml.safe_load(f))

    sentences, speaker_set = get_phn_dur(dur_file)

    merge_silence(sentences)
    phone_id_map_path = dumpdir / "phone_id_map.txt"
    speaker_id_map_path = dumpdir / "speaker_id_map.txt"
    get_input_token(sentences, phone_id_map_path, args.dataset)
    get_spk_id_map(speaker_set, speaker_id_map_path)

    if args.dataset == "baker":
        wav_files = sorted(list((rootdir / "Wave").rglob("*.wav")))
        # split data into 3 sections
        num_train = 9800
        num_dev = 100
        train_wav_files = wav_files[:num_train]
        dev_wav_files = wav_files[num_train:num_train + num_dev]
        test_wav_files = wav_files[num_train + num_dev:]
    elif args.dataset == "aishell3":
        sub_num_dev = 5
        wav_dir = rootdir / "train" / "wav"
        train_wav_files = []
        dev_wav_files = []
        test_wav_files = []
        for speaker in os.listdir(wav_dir):
            wav_files = sorted(list((wav_dir / speaker).rglob("*.wav")))
            if len(wav_files) > 100:
                train_wav_files += wav_files[:-sub_num_dev * 2]
                dev_wav_files += wav_files[-sub_num_dev * 2:-sub_num_dev]
                test_wav_files += wav_files[-sub_num_dev:]
            else:
                train_wav_files += wav_files
    elif args.dataset == "ljspeech":
        wav_files = sorted(list((rootdir / "wavs").rglob("*.wav")))
        # split data into 3 sections
        num_train = 12900
        num_dev = 100
        train_wav_files = wav_files[:num_train]
        dev_wav_files = wav_files[num_train:num_train + num_dev]
        test_wav_files = wav_files[num_train + num_dev:]
    elif args.dataset == "vctk":
        sub_num_dev = 5
        wav_dir = rootdir / "wav48_silence_trimmed"
        train_wav_files = []
        dev_wav_files = []
        test_wav_files = []
        for speaker in os.listdir(wav_dir):
            wav_files = sorted(list((wav_dir / speaker).rglob("*_mic2.flac")))
            if len(wav_files) > 100:
                train_wav_files += wav_files[:-sub_num_dev * 2]
                dev_wav_files += wav_files[-sub_num_dev * 2:-sub_num_dev]
                test_wav_files += wav_files[-sub_num_dev:]
            else:
                train_wav_files += wav_files
    else:
        print("dataset should be in {baker, aishell3, ljspeech, vctk} now!")

    train_dump_dir = dumpdir / "train" / "raw"
    train_dump_dir.mkdir(parents=True, exist_ok=True)
    dev_dump_dir = dumpdir / "dev" / "raw"
    dev_dump_dir.mkdir(parents=True, exist_ok=True)
    test_dump_dir = dumpdir / "test" / "raw"
    test_dump_dir.mkdir(parents=True, exist_ok=True)

    # Extractor
    mel_extractor = LogMelFBank(
        sr=config.fs,
        n_fft=config.n_fft,
        hop_length=config.n_shift,
        win_length=config.win_length,
        window=config.window,
        n_mels=config.n_mels,
        fmin=config.fmin,
        fmax=config.fmax)

    # process for the 3 sections
    if train_wav_files:
        process_sentences(
            config=config,
            fps=train_wav_files,
            sentences=sentences,
            output_dir=train_dump_dir,
            mel_extractor=mel_extractor,
            nprocs=args.num_cpu,
            cut_sil=args.cut_sil,
            spk_emb_dir=spk_emb_dir)
    if dev_wav_files:
        process_sentences(
            config=config,
            fps=dev_wav_files,
            sentences=sentences,
            output_dir=dev_dump_dir,
            mel_extractor=mel_extractor,
            nprocs=args.num_cpu,
            cut_sil=args.cut_sil,
            spk_emb_dir=spk_emb_dir)
    if test_wav_files:
        process_sentences(
            config=config,
            fps=test_wav_files,
            sentences=sentences,
            output_dir=test_dump_dir,
            mel_extractor=mel_extractor,
            nprocs=args.num_cpu,
            cut_sil=args.cut_sil,
            spk_emb_dir=spk_emb_dir)


if __name__ == "__main__":
    main()
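For reference, the flags above combine into an invocation along the lines of `python3 ${BIN_DIR}/preprocess.py --dataset=vctk --rootdir=~/datasets/VCTK-Corpus-0.92 --dumpdir=dump --dur-file=durations.txt --config=conf/default.yaml --num-cpu=8 --cut-sil=True`; the flag names come from the parser above, while the rootdir path and CPU count are illustrative.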
paddlespeech/t2s/exps/ernie_sat/synthesize.py
0 → 100644
paddlespeech/t2s/exps/ernie_sat/train.py
0 → 100644
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import logging
import os
import shutil
from pathlib import Path

import jsonlines
import numpy as np
import paddle
import yaml
from paddle import DataParallel
from paddle import distributed as dist
from paddle.io import DataLoader
from paddle.io import DistributedBatchSampler
from yacs.config import CfgNode

from paddlespeech.t2s.datasets.am_batch_fn import build_erniesat_collate_fn
from paddlespeech.t2s.datasets.data_table import DataTable
from paddlespeech.t2s.models.ernie_sat import ErnieSAT
from paddlespeech.t2s.models.ernie_sat import ErnieSATEvaluator
from paddlespeech.t2s.models.ernie_sat import ErnieSATUpdater
from paddlespeech.t2s.training.extensions.snapshot import Snapshot
from paddlespeech.t2s.training.extensions.visualizer import VisualDL
from paddlespeech.t2s.training.optimizer import build_optimizers
from paddlespeech.t2s.training.seeding import seed_everything
from paddlespeech.t2s.training.trainer import Trainer


def train_sp(args, config):
    # decides device type and whether to run in parallel
    # setup running environment correctly
    if (not paddle.is_compiled_with_cuda()) or args.ngpu == 0:
        paddle.set_device("cpu")
    else:
        paddle.set_device("gpu")
    world_size = paddle.distributed.get_world_size()
    if world_size > 1:
        paddle.distributed.init_parallel_env()

    # set the random seed, it is a must for multiprocess training
    seed_everything(config.seed)

    print(
        f"rank: {dist.get_rank()}, pid: {os.getpid()}, parent_pid: {os.getppid()}",
    )

    fields = [
        "text", "text_lengths", "speech", "speech_lengths", "align_start",
        "align_end"
    ]
    converters = {"speech": np.load}
    spk_num = None

    # dataloader has been too verbose
    logging.getLogger("DataLoader").disabled = True

    # construct dataset for training and validation
    with jsonlines.open(args.train_metadata, 'r') as reader:
        train_metadata = list(reader)
    train_dataset = DataTable(
        data=train_metadata,
        fields=fields,
        converters=converters, )
    with jsonlines.open(args.dev_metadata, 'r') as reader:
        dev_metadata = list(reader)
    dev_dataset = DataTable(
        data=dev_metadata,
        fields=fields,
        converters=converters, )

    # collate function and dataloader
    collate_fn = build_erniesat_collate_fn(
        mlm_prob=config.mlm_prob,
        mean_phn_span=config.mean_phn_span,
        seg_emb=config.model['enc_input_layer'] == 'sega_mlm',
        text_masking=config["model"]["text_masking"],
        epoch=config["max_epoch"])

    train_sampler = DistributedBatchSampler(
        train_dataset,
        batch_size=config.batch_size,
        shuffle=True,
        drop_last=True)

    print("samplers done!")

    train_dataloader = DataLoader(
        train_dataset,
        batch_sampler=train_sampler,
        collate_fn=collate_fn,
        num_workers=config.num_workers)

    dev_dataloader = DataLoader(
        dev_dataset,
        shuffle=False,
        drop_last=False,
        batch_size=config.batch_size,
        collate_fn=collate_fn,
        num_workers=config.num_workers)
    print("dataloaders done!")

    with open(args.phones_dict, "r") as f:
        phn_id = [line.strip().split() for line in f.readlines()]
    vocab_size = len(phn_id)
    print("vocab_size:", vocab_size)

    odim = config.n_mels
    model = ErnieSAT(idim=vocab_size, odim=odim, **config["model"])

    if world_size > 1:
        model = DataParallel(model)
    print("model done!")

    optimizer = build_optimizers(model, **config["optimizer"])
    print("optimizer done!")

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    if dist.get_rank() == 0:
        config_name = args.config.split("/")[-1]
        # copy conf to output_dir
        shutil.copyfile(args.config, output_dir / config_name)

    updater = ErnieSATUpdater(
        model=model,
        optimizer=optimizer,
        dataloader=train_dataloader,
        text_masking=config["model"]["text_masking"],
        odim=odim,
        output_dir=output_dir)

    trainer = Trainer(updater, (config.max_epoch, 'epoch'), output_dir)

    evaluator = ErnieSATEvaluator(
        model=model,
        dataloader=dev_dataloader,
        text_masking=config["model"]["text_masking"],
        odim=odim,
        output_dir=output_dir, )

    if dist.get_rank() == 0:
        trainer.extend(evaluator, trigger=(1, "epoch"))
        trainer.extend(VisualDL(output_dir), trigger=(1, "iteration"))
    trainer.extend(
        Snapshot(max_size=config.num_snapshots), trigger=(1, 'epoch'))

    trainer.run()


def main():
    # parse args and config and redirect to train_sp
    parser = argparse.ArgumentParser(description="Train an ErnieSAT model.")
    parser.add_argument("--config", type=str, help="ErnieSAT config file.")
    parser.add_argument("--train-metadata", type=str, help="training data.")
    parser.add_argument("--dev-metadata", type=str, help="dev data.")
    parser.add_argument("--output-dir", type=str, help="output dir.")
    parser.add_argument(
        "--ngpu", type=int, default=1, help="if ngpu=0, use cpu.")
    parser.add_argument(
        "--phones-dict", type=str, default=None, help="phone vocabulary file.")

    args = parser.parse_args()

    with open(args.config) as f:
        config = CfgNode(yaml.safe_load(f))

    print("========Args========")
    print(yaml.safe_dump(vars(args)))
    print("========Config========")
    print(config)
    print(
        f"master sees the world size: {dist.get_world_size()}, from pid: {os.getpid()}"
    )

    # dispatch
    if args.ngpu > 1:
        dist.spawn(train_sp, (args, config), nprocs=args.ngpu)
    else:
        train_sp(args, config)


if __name__ == "__main__":
    main()
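Note the dispatch at the bottom: with `--ngpu` greater than 1, `dist.spawn` launches one `train_sp` worker per GPU, so a two-GPU run is just the `local/train.sh` invocation with `--ngpu=2` while `CUDA_VISIBLE_DEVICES` lists two devices (matching `gpus=0,1` in `run.sh`).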
paddlespeech/t2s/models/ernie_sat/__init__.py
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
...
@@ -11,4 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from .ernie_sat import *
+from .ernie_sat_updater import *
 from .mlm import *
paddlespeech/t2s/models/ernie_sat/ernie_sat.py
0 → 100644
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Dict
from typing import List
from typing import Optional

import paddle
from paddle import nn

from paddlespeech.t2s.modules.activation import get_activation
from paddlespeech.t2s.modules.conformer.convolution import ConvolutionModule
from paddlespeech.t2s.modules.conformer.encoder_layer import EncoderLayer
from paddlespeech.t2s.modules.layer_norm import LayerNorm
from paddlespeech.t2s.modules.masked_fill import masked_fill
from paddlespeech.t2s.modules.nets_utils import initialize
from paddlespeech.t2s.modules.tacotron2.decoder import Postnet
from paddlespeech.t2s.modules.transformer.attention import LegacyRelPositionMultiHeadedAttention
from paddlespeech.t2s.modules.transformer.attention import MultiHeadedAttention
from paddlespeech.t2s.modules.transformer.attention import RelPositionMultiHeadedAttention
from paddlespeech.t2s.modules.transformer.embedding import LegacyRelPositionalEncoding
from paddlespeech.t2s.modules.transformer.embedding import PositionalEncoding
from paddlespeech.t2s.modules.transformer.embedding import RelPositionalEncoding
from paddlespeech.t2s.modules.transformer.embedding import ScaledPositionalEncoding
from paddlespeech.t2s.modules.transformer.multi_layer_conv import Conv1dLinear
from paddlespeech.t2s.modules.transformer.multi_layer_conv import MultiLayeredConv1d
from paddlespeech.t2s.modules.transformer.positionwise_feed_forward import PositionwiseFeedForward
from paddlespeech.t2s.modules.transformer.repeat import repeat
from paddlespeech.t2s.modules.transformer.subsampling import Conv2dSubsampling


# MLM -> Mask Language Model
class mySequential(nn.Sequential):
    def forward(self, *inputs):
        for module in self._sub_layers.values():
            if type(inputs) == tuple:
                inputs = module(*inputs)
            else:
                inputs = module(inputs)
        return inputs


class MaskInputLayer(nn.Layer):
    def __init__(self, out_features: int) -> None:
        super().__init__()
        self.mask_feature = paddle.create_parameter(
            shape=(1, 1, out_features),
            dtype=paddle.float32,
            default_initializer=paddle.nn.initializer.Assign(
                paddle.normal(shape=(1, 1, out_features))))

    def forward(self, input: paddle.Tensor,
                masked_pos: paddle.Tensor=None) -> paddle.Tensor:
        masked_pos = paddle.expand_as(paddle.unsqueeze(masked_pos, -1), input)
        masked_input = masked_fill(input, masked_pos, 0) + masked_fill(
            paddle.expand_as(self.mask_feature, input), ~masked_pos, 0)
        return masked_input


class MLMEncoder(nn.Layer):
    """Conformer encoder module.

    Args:
        idim (int): Input dimension.
        attention_dim (int): Dimension of attention.
        attention_heads (int): The number of heads of multi head attention.
        linear_units (int): The number of units of position-wise feed forward.
        num_blocks (int): The number of decoder blocks.
        dropout_rate (float): Dropout rate.
        positional_dropout_rate (float): Dropout rate after adding positional encoding.
        attention_dropout_rate (float): Dropout rate in attention.
        input_layer (Union[str, paddle.nn.Layer]): Input layer type.
        normalize_before (bool): Whether to use layer_norm before the first block.
        concat_after (bool): Whether to concat attention layer's input and output.
            if True, additional linear will be applied.
            i.e. x -> x + linear(concat(x, att(x)))
            if False, no additional linear will be applied. i.e. x -> x + att(x)
        positionwise_layer_type (str): "linear", "conv1d", or "conv1d-linear".
        positionwise_conv_kernel_size (int): Kernel size of positionwise conv1d layer.
        macaron_style (bool): Whether to use macaron style for positionwise layer.
        pos_enc_layer_type (str): Encoder positional encoding layer type.
        selfattention_layer_type (str): Encoder attention layer type.
        activation_type (str): Encoder activation function type.
        use_cnn_module (bool): Whether to use convolution module.
        zero_triu (bool): Whether to zero the upper triangular part of attention matrix.
        cnn_module_kernel (int): Kernel size of convolution module.
        padding_idx (int): Padding idx for input_layer=embed.
        stochastic_depth_rate (float): Maximum probability to skip the encoder layer.
    """

    def __init__(self,
                 idim: int,
                 vocab_size: int=0,
                 pre_speech_layer: int=0,
                 attention_dim: int=256,
                 attention_heads: int=4,
                 linear_units: int=2048,
                 num_blocks: int=6,
                 dropout_rate: float=0.1,
                 positional_dropout_rate: float=0.1,
                 attention_dropout_rate: float=0.0,
                 input_layer: str="conv2d",
                 normalize_before: bool=True,
                 concat_after: bool=False,
                 positionwise_layer_type: str="linear",
                 positionwise_conv_kernel_size: int=1,
                 macaron_style: bool=False,
                 pos_enc_layer_type: str="abs_pos",
                 pos_enc_class=None,
                 selfattention_layer_type: str="selfattn",
                 activation_type: str="swish",
                 use_cnn_module: bool=False,
                 zero_triu: bool=False,
                 cnn_module_kernel: int=31,
                 padding_idx: int=-1,
                 stochastic_depth_rate: float=0.0,
                 text_masking: bool=False):
        """Construct an Encoder object."""
        super().__init__()
        self._output_size = attention_dim
        self.text_masking = text_masking
        if self.text_masking:
            self.text_masking_layer = MaskInputLayer(attention_dim)
        activation = get_activation(activation_type)
        if pos_enc_layer_type == "abs_pos":
            pos_enc_class = PositionalEncoding
        elif pos_enc_layer_type == "scaled_abs_pos":
            pos_enc_class = ScaledPositionalEncoding
        elif pos_enc_layer_type == "rel_pos":
            assert selfattention_layer_type == "rel_selfattn"
            pos_enc_class = RelPositionalEncoding
        elif pos_enc_layer_type == "legacy_rel_pos":
            pos_enc_class = LegacyRelPositionalEncoding
            assert selfattention_layer_type == "legacy_rel_selfattn"
        else:
            raise ValueError("unknown pos_enc_layer: " + pos_enc_layer_type)

        self.conv_subsampling_factor = 1
        if input_layer == "linear":
            self.embed = nn.Sequential(
                nn.Linear(idim, attention_dim),
                nn.LayerNorm(attention_dim),
                nn.Dropout(dropout_rate),
                nn.ReLU(),
                pos_enc_class(attention_dim, positional_dropout_rate), )
        elif input_layer == "conv2d":
            self.embed = Conv2dSubsampling(
                idim,
                attention_dim,
                dropout_rate,
                pos_enc_class(attention_dim, positional_dropout_rate), )
            self.conv_subsampling_factor = 4
        elif input_layer == "embed":
            self.embed = nn.Sequential(
                nn.Embedding(idim, attention_dim, padding_idx=padding_idx),
                pos_enc_class(attention_dim, positional_dropout_rate), )
        elif input_layer == "mlm":
            self.segment_emb = None
            self.speech_embed = mySequential(
                MaskInputLayer(idim),
                nn.Linear(idim, attention_dim),
                nn.LayerNorm(attention_dim),
                nn.ReLU(),
                pos_enc_class(attention_dim, positional_dropout_rate))
            self.text_embed = nn.Sequential(
                nn.Embedding(
                    vocab_size, attention_dim, padding_idx=padding_idx),
                pos_enc_class(attention_dim, positional_dropout_rate), )
        elif input_layer == "sega_mlm":
            self.segment_emb = nn.Embedding(
                500, attention_dim, padding_idx=padding_idx)
            self.speech_embed = mySequential(
                MaskInputLayer(idim),
                nn.Linear(idim, attention_dim),
                nn.LayerNorm(attention_dim),
                nn.ReLU(),
                pos_enc_class(attention_dim, positional_dropout_rate))
            self.text_embed = nn.Sequential(
                nn.Embedding(
                    vocab_size, attention_dim, padding_idx=padding_idx),
                pos_enc_class(attention_dim, positional_dropout_rate), )
        elif isinstance(input_layer, nn.Layer):
            self.embed = nn.Sequential(
                input_layer,
                pos_enc_class(attention_dim, positional_dropout_rate), )
        elif input_layer is None:
            self.embed = nn.Sequential(
                pos_enc_class(attention_dim, positional_dropout_rate))
        else:
            raise ValueError("unknown input_layer: " + input_layer)
        self.normalize_before = normalize_before

        # self-attention module definition
        if selfattention_layer_type == "selfattn":
            encoder_selfattn_layer = MultiHeadedAttention
            encoder_selfattn_layer_args = (attention_heads, attention_dim,
                                           attention_dropout_rate, )
        elif selfattention_layer_type == "legacy_rel_selfattn":
            assert pos_enc_layer_type == "legacy_rel_pos"
            encoder_selfattn_layer = LegacyRelPositionMultiHeadedAttention
            encoder_selfattn_layer_args = (attention_heads, attention_dim,
                                           attention_dropout_rate, )
        elif selfattention_layer_type == "rel_selfattn":
            assert pos_enc_layer_type == "rel_pos"
            encoder_selfattn_layer = RelPositionMultiHeadedAttention
            encoder_selfattn_layer_args = (attention_heads, attention_dim,
                                           attention_dropout_rate, zero_triu, )
        else:
            raise ValueError("unknown encoder_attn_layer: " +
                             selfattention_layer_type)

        # feed-forward module definition
        if positionwise_layer_type == "linear":
            positionwise_layer = PositionwiseFeedForward
            positionwise_layer_args = (attention_dim, linear_units,
                                       dropout_rate, activation, )
        elif positionwise_layer_type == "conv1d":
            positionwise_layer = MultiLayeredConv1d
            positionwise_layer_args = (attention_dim, linear_units,
                                       positionwise_conv_kernel_size,
                                       dropout_rate, )
        elif positionwise_layer_type == "conv1d-linear":
            positionwise_layer = Conv1dLinear
            positionwise_layer_args = (attention_dim, linear_units,
                                       positionwise_conv_kernel_size,
                                       dropout_rate, )
        else:
            raise NotImplementedError("Support only linear or conv1d.")

        # convolution module definition
        convolution_layer = ConvolutionModule
        convolution_layer_args = (attention_dim, cnn_module_kernel, activation)

        self.encoders = repeat(
            num_blocks,
            lambda lnum: EncoderLayer(
                attention_dim,
                encoder_selfattn_layer(*encoder_selfattn_layer_args),
                positionwise_layer(*positionwise_layer_args),
                positionwise_layer(*positionwise_layer_args) if macaron_style else None,
                convolution_layer(*convolution_layer_args) if use_cnn_module else None,
                dropout_rate,
                normalize_before,
                concat_after,
                stochastic_depth_rate * float(1 + lnum) / num_blocks, ), )
        self.pre_speech_layer = pre_speech_layer
        self.pre_speech_encoders = repeat(
            self.pre_speech_layer,
            lambda lnum: EncoderLayer(
                attention_dim,
                encoder_selfattn_layer(*encoder_selfattn_layer_args),
                positionwise_layer(*positionwise_layer_args),
                positionwise_layer(*positionwise_layer_args) if macaron_style else None,
                convolution_layer(*convolution_layer_args) if use_cnn_module else None,
                dropout_rate,
                normalize_before,
                concat_after,
                stochastic_depth_rate * float(1 + lnum) / self.pre_speech_layer, ), )
        if self.normalize_before:
            self.after_norm = LayerNorm(attention_dim)

    def forward(self,
                speech: paddle.Tensor,
                text: paddle.Tensor,
                masked_pos: paddle.Tensor,
                speech_mask: paddle.Tensor=None,
                text_mask: paddle.Tensor=None,
                speech_seg_pos: paddle.Tensor=None,
                text_seg_pos: paddle.Tensor=None):
        """Encode input sequence."""
        if masked_pos is not None:
            speech = self.speech_embed(speech, masked_pos)
        else:
            speech = self.speech_embed(speech)
        if text is not None:
            text = self.text_embed(text)
        if speech_seg_pos is not None and text_seg_pos is not None and self.segment_emb:
            speech_seg_emb = self.segment_emb(speech_seg_pos)
            text_seg_emb = self.segment_emb(text_seg_pos)
            text = (text[0] + text_seg_emb, text[1])
            speech = (speech[0] + speech_seg_emb, speech[1])
        if self.pre_speech_encoders:
            speech, _ = self.pre_speech_encoders(speech, speech_mask)

        if text is not None:
            xs = paddle.concat([speech[0], text[0]], axis=1)
            xs_pos_emb = paddle.concat([speech[1], text[1]], axis=1)
            masks = paddle.concat([speech_mask, text_mask], axis=-1)
        else:
            xs = speech[0]
            xs_pos_emb = speech[1]
            masks = speech_mask

        xs, masks = self.encoders((xs, xs_pos_emb), masks)

        if isinstance(xs, tuple):
            xs = xs[0]
        if self.normalize_before:
            xs = self.after_norm(xs)

        return xs, masks


class MLMDecoder(MLMEncoder):
    def forward(self, xs: paddle.Tensor, masks: paddle.Tensor):
        """Encode input sequence.

        Args:
            xs (paddle.Tensor): Input tensor (#batch, time, idim).
            masks (paddle.Tensor): Mask tensor (#batch, time).
        Returns:
            paddle.Tensor: Output tensor (#batch, time, attention_dim).
            paddle.Tensor: Mask tensor (#batch, time).
        """
        xs = self.embed(xs)
        xs, masks = self.encoders(xs, masks)

        if isinstance(xs, tuple):
            xs = xs[0]
        if self.normalize_before:
            xs = self.after_norm(xs)

        return xs, masks


# encoder and decoder is nn.Layer, not str
class MLM(nn.Layer):
    def __init__(self,
                 odim: int,
                 encoder: nn.Layer,
                 decoder: Optional[nn.Layer],
                 postnet_layers: int=0,
                 postnet_chans: int=0,
                 postnet_filts: int=0,
                 text_masking: bool=False):
        super().__init__()
        self.odim = odim
        self.encoder = encoder
        self.decoder = decoder
        self.vocab_size = encoder.text_embed[0]._num_embeddings

        if self.decoder is None or not (hasattr(self.decoder,
                                                'output_layer') and
                                        self.decoder.output_layer is not None):
            self.sfc = nn.Linear(self.encoder._output_size, odim)
        else:
            self.sfc = None
        if text_masking:
            self.text_sfc = nn.Linear(
                self.encoder.text_embed[0]._embedding_dim,
                self.vocab_size,
                weight_attr=self.encoder.text_embed[0]._weight_attr)
        else:
            self.text_sfc = None

        self.postnet = (None if postnet_layers == 0 else Postnet(
            idim=self.encoder._output_size,
            odim=odim,
            n_layers=postnet_layers,
            n_chans=postnet_chans,
            n_filts=postnet_filts,
            use_batch_norm=True,
            dropout_rate=0.5, ))

    def inference(
            self,
            speech: paddle.Tensor,
            text: paddle.Tensor,
            masked_pos: paddle.Tensor,
            speech_mask: paddle.Tensor,
            text_mask: paddle.Tensor,
            speech_seg_pos: paddle.Tensor,
            text_seg_pos: paddle.Tensor,
            span_bdy: List[int],
            use_teacher_forcing: bool=False, ) -> Dict[str, paddle.Tensor]:
        '''
        Args:
            speech (paddle.Tensor): input speech (1, Tmax, D).
            text (paddle.Tensor): input text (1, Tmax2).
            masked_pos (paddle.Tensor): masked position of input speech (1, Tmax)
            speech_mask (paddle.Tensor): mask of speech (1, 1, Tmax).
            text_mask (paddle.Tensor): mask of text (1, 1, Tmax2).
            speech_seg_pos (paddle.Tensor): n-th phone of each mel, 0<=n<=Tmax2 (1, Tmax).
            text_seg_pos (paddle.Tensor): n-th phone of each phone, 0<=n<=Tmax2 (1, Tmax2).
            span_bdy (List[int]): masked mel boundary of input speech (2,)
            use_teacher_forcing (bool): whether to use teacher forcing
        Returns:
            List[Tensor]:
                eg:
                [Tensor(shape=[1, 181, 80]), Tensor(shape=[80, 80]), Tensor(shape=[1, 67, 80])]
        '''
        z_cache = None
        if use_teacher_forcing:
            before_outs, zs, *_ = self.forward(
                speech=speech,
                text=text,
                masked_pos=masked_pos,
                speech_mask=speech_mask,
                text_mask=text_mask,
                speech_seg_pos=speech_seg_pos,
                text_seg_pos=text_seg_pos)
            if zs is None:
                zs = before_outs

            speech = speech.squeeze(0)
            outs = [speech[:span_bdy[0]]]
            outs += [zs[0][span_bdy[0]:span_bdy[1]]]
            outs += [speech[span_bdy[1]:]]
            return outs
        return None


class MLMEncAsDecoder(MLM):
    def forward(self,
                speech: paddle.Tensor,
                text: paddle.Tensor,
                masked_pos: paddle.Tensor,
                speech_mask: paddle.Tensor,
                text_mask: paddle.Tensor,
                speech_seg_pos: paddle.Tensor,
                text_seg_pos: paddle.Tensor):
        # feats: (Batch, Length, Dim)
        # -> encoder_out: (Batch, Length2, Dim2)
        encoder_out, h_masks = self.encoder(
            speech=speech,
            text=text,
            masked_pos=masked_pos,
            speech_mask=speech_mask,
            text_mask=text_mask,
            speech_seg_pos=speech_seg_pos,
            text_seg_pos=text_seg_pos)
        if self.decoder is not None:
            zs, _ = self.decoder(encoder_out, h_masks)
        else:
            zs = encoder_out
        speech_hidden_states = zs[:, :paddle.shape(speech)[1], :]
        if self.sfc is not None:
            before_outs = paddle.reshape(
                self.sfc(speech_hidden_states),
                (paddle.shape(speech_hidden_states)[0], -1, self.odim))
        else:
            before_outs = speech_hidden_states
        if self.postnet is not None:
            after_outs = before_outs + paddle.transpose(
                self.postnet(paddle.transpose(before_outs, [0, 2, 1])),
                [0, 2, 1])
        else:
            after_outs = None
        return before_outs, after_outs, None


class MLMDualMasking(MLM):
    def forward(self,
                speech: paddle.Tensor,
                text: paddle.Tensor,
                masked_pos: paddle.Tensor,
                speech_mask: paddle.Tensor,
                text_mask: paddle.Tensor,
                speech_seg_pos: paddle.Tensor,
                text_seg_pos: paddle.Tensor):
        # feats: (Batch, Length, Dim)
        # -> encoder_out: (Batch, Length2, Dim2)
        encoder_out, h_masks = self.encoder(
            speech=speech,
            text=text,
            masked_pos=masked_pos,
            speech_mask=speech_mask,
            text_mask=text_mask,
            speech_seg_pos=speech_seg_pos,
            text_seg_pos=text_seg_pos)
        if self.decoder is not None:
            zs, _ = self.decoder(encoder_out, h_masks)
        else:
            zs = encoder_out
        speech_hidden_states = zs[:, :paddle.shape(speech)[1], :]
        if self.text_sfc:
            text_hidden_states = zs[:, paddle.shape(speech)[1]:, :]
            text_outs = paddle.reshape(
                self.text_sfc(text_hidden_states),
                (paddle.shape(text_hidden_states)[0], -1, self.vocab_size))
        if self.sfc is not None:
            before_outs = paddle.reshape(
                self.sfc(speech_hidden_states),
                (paddle.shape(speech_hidden_states)[0], -1, self.odim))
        else:
            before_outs = speech_hidden_states
        if self.postnet is not None:
            after_outs = before_outs + paddle.transpose(
                self.postnet(paddle.transpose(before_outs, [0, 2, 1])),
                [0, 2, 1])
        else:
            after_outs = None
        return before_outs, after_outs, text_outs


class ErnieSAT(nn.Layer):
    def __init__(
            self,
            # network structure related
            idim: int,
            odim: int,
            postnet_layers: int=5,
            postnet_filts: int=5,
            postnet_chans: int=256,
            use_scaled_pos_enc: bool=False,
            encoder_type: str='conformer',
            decoder_type: str='conformer',
            enc_input_layer: str='sega_mlm',
            enc_pre_speech_layer: int=0,
            enc_cnn_module_kernel: int=7,
            enc_attention_dim: int=384,
            enc_attention_heads: int=2,
            enc_linear_units: int=1536,
            enc_num_blocks: int=4,
            enc_dropout_rate: float=0.2,
            enc_positional_dropout_rate: float=0.2,
            enc_attention_dropout_rate: float=0.2,
            enc_normalize_before: bool=True,
            enc_macaron_style: bool=True,
            enc_use_cnn_module: bool=True,
            enc_selfattention_layer_type: str='legacy_rel_selfattn',
            enc_activation_type: str='swish',
            enc_pos_enc_layer_type: str='legacy_rel_pos',
            enc_positionwise_layer_type: str='conv1d',
            enc_positionwise_conv_kernel_size: int=3,
            text_masking: bool=False,
            dec_cnn_module_kernel: int=31,
            dec_attention_dim: int=384,
            dec_attention_heads: int=2,
            dec_linear_units: int=1536,
            dec_num_blocks: int=4,
            dec_dropout_rate: float=0.2,
            dec_positional_dropout_rate: float=0.2,
            dec_attention_dropout_rate: float=0.2,
            dec_macaron_style: bool=True,
            dec_use_cnn_module: bool=True,
            dec_selfattention_layer_type: str='legacy_rel_selfattn',
            dec_activation_type: str='swish',
            dec_pos_enc_layer_type: str='legacy_rel_pos',
            dec_positionwise_layer_type: str='conv1d',
            dec_positionwise_conv_kernel_size: int=3,
            init_type: str="xavier_uniform", ):
        super().__init__()
        # store hyperparameters
        self.odim = odim
        self.use_scaled_pos_enc = use_scaled_pos_enc

        # initialize parameters
        initialize(self, init_type)

        # Encoder
        if encoder_type == "conformer":
            encoder = MLMEncoder(
                idim=odim,
                vocab_size=idim,
                pre_speech_layer=enc_pre_speech_layer,
                attention_dim=enc_attention_dim,
                attention_heads=enc_attention_heads,
                linear_units=enc_linear_units,
                num_blocks=enc_num_blocks,
                dropout_rate=enc_dropout_rate,
                positional_dropout_rate=enc_positional_dropout_rate,
                attention_dropout_rate=enc_attention_dropout_rate,
                input_layer=enc_input_layer,
                normalize_before=enc_normalize_before,
                positionwise_layer_type=enc_positionwise_layer_type,
                positionwise_conv_kernel_size=enc_positionwise_conv_kernel_size,
                macaron_style=enc_macaron_style,
                pos_enc_layer_type=enc_pos_enc_layer_type,
                selfattention_layer_type=enc_selfattention_layer_type,
                activation_type=enc_activation_type,
                use_cnn_module=enc_use_cnn_module,
                cnn_module_kernel=enc_cnn_module_kernel,
                text_masking=text_masking)
        else:
            raise ValueError(f"{encoder_type} is not supported.")

        # Decoder
        if decoder_type != 'no_decoder':
            decoder = MLMDecoder(
                idim=0,
                input_layer=None,
                cnn_module_kernel=dec_cnn_module_kernel,
                attention_dim=dec_attention_dim,
                attention_heads=dec_attention_heads,
                linear_units=dec_linear_units,
                num_blocks=dec_num_blocks,
                dropout_rate=dec_dropout_rate,
                positional_dropout_rate=dec_positional_dropout_rate,
                macaron_style=dec_macaron_style,
                use_cnn_module=dec_use_cnn_module,
                selfattention_layer_type=dec_selfattention_layer_type,
                activation_type=dec_activation_type,
                pos_enc_layer_type=dec_pos_enc_layer_type,
                positionwise_layer_type=dec_positionwise_layer_type,
                positionwise_conv_kernel_size=dec_positionwise_conv_kernel_size)
        else:
            decoder = None

        model_class = MLMDualMasking if text_masking else MLMEncAsDecoder

        self.model = model_class(
            odim=odim,
            encoder=encoder,
            decoder=decoder,
            postnet_layers=postnet_layers,
            postnet_filts=postnet_filts,
            postnet_chans=postnet_chans,
            text_masking=text_masking)

        nn.initializer.set_global_initializer(None)

    def forward(self,
                speech: paddle.Tensor,
                text: paddle.Tensor,
                masked_pos: paddle.Tensor,
                speech_mask: paddle.Tensor,
                text_mask: paddle.Tensor,
                speech_seg_pos: paddle.Tensor,
                text_seg_pos: paddle.Tensor):
        return self.model(
            speech=speech,
            text=text,
            masked_pos=masked_pos,
            speech_mask=speech_mask,
            text_mask=text_mask,
            speech_seg_pos=speech_seg_pos,
            text_seg_pos=text_seg_pos)

    def inference(
            self,
            speech: paddle.Tensor,
            text: paddle.Tensor,
            masked_pos: paddle.Tensor,
            speech_mask: paddle.Tensor,
            text_mask: paddle.Tensor,
            speech_seg_pos: paddle.Tensor,
            text_seg_pos: paddle.Tensor,
            span_bdy: List[int],
            use_teacher_forcing: bool=False, ) -> Dict[str, paddle.Tensor]:
        return self.model.inference(
            speech=speech,
            text=text,
            masked_pos=masked_pos,
            speech_mask=speech_mask,
            text_mask=text_mask,
            speech_seg_pos=speech_seg_pos,
            text_seg_pos=text_seg_pos,
            span_bdy=span_bdy,
            use_teacher_forcing=use_teacher_forcing)
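As a quick shape check, the wrapper above can be driven end-to-end with random tensors. This is a minimal sketch assuming the defaults shown (`enc_input_layer='sega_mlm'`, 80-dim mels); the vocabulary size, sequence lengths, and masked span are made up for illustration, and the exact mask dtypes may need adjusting to your Paddle version:

import paddle

from paddlespeech.t2s.models.ernie_sat import ErnieSAT

vocab_size, odim = 100, 80
T_speech, T_text = 20, 6
model = ErnieSAT(idim=vocab_size, odim=odim)

speech = paddle.randn([1, T_speech, odim])
text = paddle.randint(0, vocab_size, [1, T_text])
# mark the middle frames of the utterance as masked
masked_pos = paddle.zeros([1, T_speech], dtype='bool')
masked_pos[:, 8:14] = True
speech_mask = paddle.ones([1, 1, T_speech], dtype='bool')
text_mask = paddle.ones([1, 1, T_text], dtype='bool')
# n-th phone of each mel frame / of each phone, as in MLM.inference's docstring
speech_seg_pos = paddle.randint(0, T_text, [1, T_speech])
text_seg_pos = paddle.arange(T_text).unsqueeze(0)

before_outs, after_outs, text_outs = model(
    speech=speech, text=text, masked_pos=masked_pos,
    speech_mask=speech_mask, text_mask=text_mask,
    speech_seg_pos=speech_seg_pos, text_seg_pos=text_seg_pos)
print(before_outs.shape)  # expected: [1, T_speech, odim]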
paddlespeech/t2s/models/ernie_sat/ernie_sat_updater.py
0 → 100644
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
from pathlib import Path

from paddle import distributed as dist
from paddle.io import DataLoader
from paddle.nn import Layer
from paddle.optimizer import Optimizer

from paddlespeech.t2s.modules.losses import MLMLoss
from paddlespeech.t2s.training.extensions.evaluator import StandardEvaluator
from paddlespeech.t2s.training.reporter import report
from paddlespeech.t2s.training.updaters.standard_updater import StandardUpdater

logging.basicConfig(
    format='%(asctime)s [%(levelname)s] [%(filename)s:%(lineno)d] %(message)s',
    datefmt='[%Y-%m-%d %H:%M:%S]')
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)


class ErnieSATUpdater(StandardUpdater):
    def __init__(self,
                 model: Layer,
                 optimizer: Optimizer,
                 dataloader: DataLoader,
                 init_state=None,
                 text_masking: bool=False,
                 odim: int=80,
                 output_dir: Path=None):
        super().__init__(model, optimizer, dataloader, init_state=None)

        self.criterion = MLMLoss(text_masking=text_masking, odim=odim)

        log_file = output_dir / 'worker_{}.log'.format(dist.get_rank())
        self.filehandler = logging.FileHandler(str(log_file))
        logger.addHandler(self.filehandler)
        self.logger = logger
        self.msg = ""

    def update_core(self, batch):
        self.msg = "Rank: {}, ".format(dist.get_rank())
        losses_dict = {}

        before_outs, after_outs, text_outs = self.model(
            speech=batch["speech"],
            text=batch["text"],
            masked_pos=batch["masked_pos"],
            speech_mask=batch["speech_mask"],
            text_mask=batch["text_mask"],
            speech_seg_pos=batch["speech_seg_pos"],
            text_seg_pos=batch["text_seg_pos"])

        mlm_loss, text_mlm_loss = self.criterion(
            speech=batch["speech"],
            before_outs=before_outs,
            after_outs=after_outs,
            masked_pos=batch["masked_pos"],
            text=batch["text"],
            # maybe None
            text_outs=text_outs,
            # maybe None
            text_masked_pos=batch["text_masked_pos"])

        loss = mlm_loss + text_mlm_loss if text_mlm_loss is not None else mlm_loss

        optimizer = self.optimizer
        optimizer.clear_grad()
        loss.backward()
        optimizer.step()

        report("train/loss", float(loss))
        report("train/mlm_loss", float(mlm_loss))
        if text_mlm_loss is not None:
            report("train/text_mlm_loss", float(text_mlm_loss))
            losses_dict["text_mlm_loss"] = float(text_mlm_loss)
        losses_dict["mlm_loss"] = float(mlm_loss)
        losses_dict["loss"] = float(loss)
        self.msg += ', '.join('{}: {:>.6f}'.format(k, v)
                              for k, v in losses_dict.items())


class ErnieSATEvaluator(StandardEvaluator):
    def __init__(self,
                 model: Layer,
                 dataloader: DataLoader,
                 text_masking: bool=False,
                 odim: int=80,
                 output_dir: Path=None):
        super().__init__(model, dataloader)

        log_file = output_dir / 'worker_{}.log'.format(dist.get_rank())
        self.filehandler = logging.FileHandler(str(log_file))
        logger.addHandler(self.filehandler)
        self.logger = logger
        self.msg = ""

        self.criterion = MLMLoss(text_masking=text_masking, odim=odim)

    def evaluate_core(self, batch):
        self.msg = "Evaluate: "
        losses_dict = {}

        before_outs, after_outs, text_outs = self.model(
            speech=batch["speech"],
            text=batch["text"],
            masked_pos=batch["masked_pos"],
            speech_mask=batch["speech_mask"],
            text_mask=batch["text_mask"],
            speech_seg_pos=batch["speech_seg_pos"],
            text_seg_pos=batch["text_seg_pos"])

        mlm_loss, text_mlm_loss = self.criterion(
            speech=batch["speech"],
            before_outs=before_outs,
            after_outs=after_outs,
            masked_pos=batch["masked_pos"],
            text=batch["text"],
            # maybe None
            text_outs=text_outs,
            # maybe None
            text_masked_pos=batch["text_masked_pos"])

        loss = mlm_loss + text_mlm_loss if text_mlm_loss is not None else mlm_loss

        report("eval/loss", float(loss))
        report("eval/mlm_loss", float(mlm_loss))
        if text_mlm_loss is not None:
            report("eval/text_mlm_loss", float(text_mlm_loss))
            losses_dict["text_mlm_loss"] = float(text_mlm_loss)
        losses_dict["mlm_loss"] = float(mlm_loss)
        losses_dict["loss"] = float(loss)
        self.msg += ', '.join('{}: {:>.6f}'.format(k, v)
                              for k, v in losses_dict.items())
        self.logger.info(self.msg)
paddlespeech/t2s/models/ernie_sat/mlm.py
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
argparse
from
typing
import
Dict
from
typing
import
List
from
typing
import
Optional
from
typing
import
Tuple
from
typing
import
Union
import
paddle
import
yaml
...
...
@@ -109,7 +120,6 @@ class MLMEncoder(nn.Layer):
positionwise_conv_kernel_size
:
int
=
1
,
macaron_style
:
bool
=
False
,
pos_enc_layer_type
:
str
=
"abs_pos"
,
pos_enc_class
=
None
,
selfattention_layer_type
:
str
=
"selfattn"
,
activation_type
:
str
=
"swish"
,
use_cnn_module
:
bool
=
False
,
...
...
@@ -334,7 +344,6 @@ class MLMDecoder(MLMEncoder):
# encoder and decoder is nn.Layer, not str
class
MLM
(
nn
.
Layer
):
def
__init__
(
self
,
token_list
:
Union
[
Tuple
[
str
,
...],
List
[
str
]],
odim
:
int
,
encoder
:
nn
.
Layer
,
decoder
:
Optional
[
nn
.
Layer
],
...
...
@@ -345,7 +354,6 @@ class MLM(nn.Layer):
super
().
__init__
()
self
.
odim
=
odim
self
.
token_list
=
token_list
.
copy
()
self
.
encoder
=
encoder
self
.
decoder
=
decoder
self
.
vocab_size
=
encoder
.
text_embed
[
0
].
_num_embeddings
...
...
@@ -535,32 +543,6 @@ def build_model(args: argparse.Namespace, model_class=MLMEncAsDecoder) -> MLM:
    vocab_size = len(token_list)
    odim = 80
    pos_enc_class = ScaledPositionalEncoding if args.use_scaled_pos_enc else PositionalEncoding
    if "conformer" == args.encoder:
        conformer_self_attn_layer_type = args.encoder_conf[
            'selfattention_layer_type']
        conformer_pos_enc_layer_type = args.encoder_conf['pos_enc_layer_type']
        conformer_rel_pos_type = "legacy"
        if conformer_rel_pos_type == "legacy":
            if conformer_pos_enc_layer_type == "rel_pos":
                conformer_pos_enc_layer_type = "legacy_rel_pos"
            if conformer_self_attn_layer_type == "rel_selfattn":
                conformer_self_attn_layer_type = "legacy_rel_selfattn"
        elif conformer_rel_pos_type == "latest":
            assert conformer_pos_enc_layer_type != "legacy_rel_pos"
            assert conformer_self_attn_layer_type != "legacy_rel_selfattn"
        else:
            raise ValueError(f"Unknown rel_pos_type: {conformer_rel_pos_type}")
        args.encoder_conf['selfattention_layer_type'] = conformer_self_attn_layer_type
        args.encoder_conf['pos_enc_layer_type'] = conformer_pos_enc_layer_type
        if "conformer" == args.decoder:
            args.decoder_conf['selfattention_layer_type'] = conformer_self_attn_layer_type
            args.decoder_conf['pos_enc_layer_type'] = conformer_pos_enc_layer_type

    # Encoder
    encoder_class = MLMEncoder
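The compatibility mapping in this hunk pins `conformer_rel_pos_type` to "legacy" and rewrites the layer-type strings so that configs written against the older names keep selecting the legacy relative-position implementations. Pulled out as a standalone helper (a sketch under the same logic and names, not the project's API):

def normalize_conformer_types(pos_enc_layer_type: str,
                              selfattention_layer_type: str,
                              rel_pos_type: str="legacy"):
    # Map the pre-rename type names onto their "legacy" variants.
    if rel_pos_type == "legacy":
        if pos_enc_layer_type == "rel_pos":
            pos_enc_layer_type = "legacy_rel_pos"
        if selfattention_layer_type == "rel_selfattn":
            selfattention_layer_type = "legacy_rel_selfattn"
    elif rel_pos_type == "latest":
        assert pos_enc_layer_type != "legacy_rel_pos"
        assert selfattention_layer_type != "legacy_rel_selfattn"
    else:
        raise ValueError(f"Unknown rel_pos_type: {rel_pos_type}")
    return pos_enc_layer_type, selfattention_layer_type

print(normalize_conformer_types("rel_pos", "rel_selfattn"))
# ('legacy_rel_pos', 'legacy_rel_selfattn')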
...
...
@@ -571,10 +553,7 @@ def build_model(args: argparse.Namespace, model_class=MLMEncAsDecoder) -> MLM:
     args.encoder_conf['text_masking'] = False
-    encoder = encoder_class(
-        args.input_size,
-        vocab_size=vocab_size,
-        pos_enc_class=pos_enc_class,
-        **args.encoder_conf)
+    encoder = encoder_class(
+        args.input_size, vocab_size=vocab_size, **args.encoder_conf)
     # Decoder
     if args.decoder != 'no_decoder':
...
...
@@ -591,7 +570,6 @@ def build_model(args: argparse.Namespace, model_class=MLMEncAsDecoder) -> MLM:
        odim=odim,
        encoder=encoder,
        decoder=decoder,
        token_list=token_list,
        **args.model_conf, )

    # Initialize
...
...
paddlespeech/t2s/models/fastspeech2/fastspeech2.py
...
...
@@ -212,9 +212,7 @@ class FastSpeech2(nn.Layer):
        super().__init__()

        # store hyperparameters
        self.idim = idim
        self.odim = odim
        self.eos = idim - 1
        self.reduction_factor = reduction_factor
        self.encoder_type = encoder_type
        self.decoder_type = decoder_type
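One small point worth spelling out: `self.eos = idim - 1` follows the convention that the last id of the input vocabulary is reserved for the end-of-sentence symbol. For example (numbers invented for illustration):

idim = 65        # e.g. 64 phones + 1 reserved <eos> slot (illustrative)
eos = idim - 1   # 64: the id appended to every input phone sequence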
...
...
paddlespeech/t2s/modules/losses.py
...
...
@@ -1012,6 +1012,7 @@ class KLDivergenceLoss(nn.Layer):
# loss for ERNIE SAT
class MLMLoss(nn.Layer):
    def __init__(self,
                 odim: int,
                 lsm_weight: float=0.1,
                 ignore_id: int=-1,
                 text_masking: bool=False):
...
...
@@ -1023,12 +1024,15 @@ class MLMLoss(nn.Layer):
        else:
            self.l1_loss_func = nn.L1Loss(reduction='none')
        self.text_masking = text_masking
        self.odim = odim

    def forward(self,
                speech: paddle.Tensor,
                before_outs: paddle.Tensor,
                after_outs: paddle.Tensor,
                masked_pos: paddle.Tensor,
                # for text_loss when text_masking == True
                text: paddle.Tensor=None,
                text_outs: paddle.Tensor=None,
                text_masked_pos: paddle.Tensor=None):
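Making `text`, `text_outs`, and `text_masked_pos` optional keeps a single call site valid whether or not text masking is enabled; the body below then asserts they are present before using them. The same guard pattern in miniature (a hypothetical helper for illustration only):

def guarded_text_loss(text_masking, text=None, text_outs=None,
                      text_masked_pos=None):
    # Mirrors the asserts added below: the text-side inputs may be None
    # only when text masking is disabled.
    if not text_masking:
        return None
    assert text is not None
    assert text_outs is not None
    assert text_masked_pos is not None
    return 0.0  # placeholder for the real cross-entropy reduction

print(guarded_text_loss(False))  # None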
...
...
@@ -1046,16 +1050,19 @@ class MLMLoss(nn.Layer):
             paddle.reshape(after_outs, (-1, self.odim)),
             paddle.reshape(xs_pad, (-1, self.odim))),
            axis=-1)
-        loss_mlm = paddle.sum((loss * paddle.reshape(
+        mlm_loss = paddle.sum((loss * paddle.reshape(
             mlm_loss_pos, [-1]))) / paddle.sum((mlm_loss_pos) + 1e-10)
+        text_mlm_loss = None
         if self.text_masking:
-            loss_text = paddle.sum((self.text_mlm_loss(
+            assert text is not None
+            assert text_outs is not None
+            assert text_masked_pos is not None
+            text_mlm_loss = paddle.sum((self.text_mlm_loss(
                 paddle.reshape(text_outs, (-1, self.vocab_size)),
                 paddle.reshape(text, (-1))) * paddle.reshape(
                     text_masked_pos, (-1)))) / paddle.sum(
                         (text_masked_pos) + 1e-10)
-            return loss_mlm, loss_text
-        return loss_mlm
+        return mlm_loss, text_mlm_loss
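The reduction in this hunk is a masked mean: the per-position loss is weighted by a 0/1 mask and normalized by the mask sum, with `1e-10` guarding against an empty mask. A runnable miniature of the speech-side reduction (toy shapes and random values, plain Paddle; the real code concatenates before/after outputs first):

import paddle
import paddle.nn as nn

# Toy shapes: batch of 2 sequences, 3 frames, 4-dim features.
outs = paddle.rand([2, 3, 4])
target = paddle.rand([2, 3, 4])
masked_pos = paddle.to_tensor([[1, 0, 1], [0, 0, 1]], dtype='float32')

l1 = nn.L1Loss(reduction='none')(outs, target)  # [2, 3, 4]
per_frame = paddle.mean(l1, axis=-1)            # [2, 3]
# Average only over masked frames; 1e-10 avoids division by zero.
mlm_loss = paddle.sum(per_frame * masked_pos) / (paddle.sum(masked_pos) + 1e-10)
print(float(mlm_loss))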
paddlespeech/t2s/modules/nets_utils.py
...
...
@@ -393,7 +393,6 @@ def phones_masking(xs_pad: paddle.Tensor,
            mean_phn_span=mean_phn_span).nonzero()
        masked_start = align_start[idx][masked_phn_idxs].tolist()
        masked_end = align_end[idx][masked_phn_idxs].tolist()
        for s, e in zip(masked_start, masked_end):
            masked_pos[idx, s:e] = 1
    non_eos_mask = paddle.reshape(src_mask, paddle.shape(xs_pad)[:2])
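In `phones_masking`, each sampled phone span is expanded to frame level by writing 1s into `masked_pos` between its alignment start and end. The expansion step in isolation (span indices invented for illustration):

import paddle

# Two masked phone spans in frame indices: [2, 5) and [8, 10).
masked_start = [2, 8]
masked_end = [5, 10]
masked_pos = paddle.zeros([1, 12])
for s, e in zip(masked_start, masked_end):
    masked_pos[0, s:e] = 1
print(masked_pos.numpy())
# [[0. 0. 1. 1. 1. 0. 0. 0. 1. 1. 0. 0.]]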
...
...