diff --git a/ernie-sat/.DS_Store b/ernie-sat/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..d786441fb926dbb0a14d94f455c5814ada534d7f Binary files /dev/null and b/ernie-sat/.DS_Store differ
diff --git a/ernie-sat/README_zh.md b/ernie-sat/README_zh.md
new file mode 100644
index 0000000000000000000000000000000000000000..930f6cbadf7cda33d18abd373292dc5a413eea86
--- /dev/null
+++ b/ernie-sat/README_zh.md
@@ -0,0 +1,87 @@
+
+## Usage
+
+### 1. Install PaddlePaddle
+
+Our code is based on Paddle (version >= 2.0).
+
+### 2. Pretrained Models
+
+The pretrained ERNIE-SAT models are listed below (download links not yet available):
+- [ERNIE-SAT_ZH](http://bj.bcebos.com/wenxin-models/model-ernie-sat-base-zh.tar.gz)
+- [ERNIE-SAT_EN](http://bj.bcebos.com/wenxin-models/model-ernie-sat-base-en.tar.gz)
+- [ERNIE-SAT_ZH_and_EN](http://bj.bcebos.com/wenxin-models/model-ernie-sat-base-en_zh.tar.gz)
+
+### 3. Downloads
+
+1. We use Parallel WaveGAN as the vocoder:
+   - [pwg_aishell3_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_aishell3_ckpt_0.5.zip)
+
+Create a `download` folder, download the pretrained vocoder model above, and unzip it:
+
+```bash
+mkdir download
+cd download
+unzip pwg_aishell3_ckpt_0.5.zip
+```
+
+2. We use [FastSpeech2](https://arxiv.org/abs/2006.04558) as the phoneme duration predictor:
+   - [fastspeech2_conformer_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_conformer_baker_ckpt_0.5.zip) for the Chinese scenario
+   - [fastspeech2_nosil_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_ljspeech_ckpt_0.5.zip) for the English scenario
+
+Download the pretrained FastSpeech2 models above and unzip them:
+
+```bash
+cd download
+unzip fastspeech2_conformer_baker_ckpt_0.5.zip
+unzip fastspeech2_nosil_ljspeech_ckpt_0.5.zip
+```
+
+### 4. Inference
+
+For now we only release the inference code for speech editing, personalized speech synthesis, and cross-lingual speech synthesis; the remaining code will be open-sourced in later updates.
+Note: for English, the vocoder used here differs from the version used at training time (https://github.com/kan-bayashi/ParallelWaveGAN). You may use the training-time version as your vocoder; the model will be upgraded in a future update.
+
+We provide sample audio files together with their corresponding text and phoneme files:
+- prompt_wav: the provided audio files
+- prompt/dev: text and phoneme files corresponding to the audio above
+
+```text
+prompt_wav
+├── p299_096.wav     # sample speech file 1
+├── SSB03540428.wav  # sample speech file 2
+└── ...
+```
+
+```text
+prompt/dev
+├── text       # transcripts of the sample speech
+├── wav.scp    # paths of the sample speech
+├── mfa_text   # phonemes of the sample speech
+├── mfa_start  # start time of each phoneme in the sample speech
+└── mfa_end    # end time of each phoneme in the sample speech
+```
+
+The inference scripts take the following arguments:
+1. `--am` acoustic model, named in the {model_name}_{dataset} format
+2. `--am_config`, `--am_checkpoint`, `--am_stat`, and `--phones_dict` are the acoustic-model parameters, corresponding to the 4 files in the FastSpeech2 pretrained model.
+3. `--voc` vocoder, named in the {model_name}_{dataset} format
+4. `--voc_config`, `--voc_checkpoint`, `--voc_stat` are the vocoder parameters, corresponding to the 3 files in the Parallel WaveGAN pretrained model.
+5. `--lang` language of the model, either `zh` or `en`.
+6. `--ngpu` number of GPUs to use; if ngpu == 0, the CPU is used.
+7. `--model_name` model name
+8. `--uid` id of the specific prompt utterance
+9. `--new_str` input text (fixed to a specific text in this release)
+10. `--prefix` path of the text and phoneme files for the specific audio
+11. `--source_language` source language
+12. `--target_language` target language
+13. `--output_name` name of the synthesized audio
+14. `--task_name` task name: speech editing, personalized speech synthesis, or cross-lingual speech synthesis
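+
+As a rough sketch of how these arguments fit together (the entry-point script name `inference.py`, the checkpoint file names, and the literal `--task_name`/`--model_name` values below are assumptions for illustration only; the released `run_*.sh` scripts set the actual arguments), an English speech-editing run might look like:
+
+```bash
+# Hypothetical example: adjust the script name and checkpoint paths to your download folder.
+python inference.py \
+    --task_name edit \
+    --model_name paddle_checkpoint_en \
+    --uid p299_096 \
+    --prefix ./prompt/dev/ \
+    --new_str "the new sentence to be synthesized" \
+    --source_language english \
+    --target_language english \
+    --output_name pred.wav \
+    --am fastspeech2_ljspeech \
+    --am_config download/fastspeech2_nosil_ljspeech_ckpt_0.5/default.yaml \
+    --am_checkpoint download/fastspeech2_nosil_ljspeech_ckpt_0.5/snapshot_iter_100000.pdz \
+    --am_stat download/fastspeech2_nosil_ljspeech_ckpt_0.5/speech_stats.npy \
+    --phones_dict download/fastspeech2_nosil_ljspeech_ckpt_0.5/phone_id_map.txt \
+    --voc pwgan_aishell3 \
+    --voc_config download/pwg_aishell3_ckpt_0.5/default.yaml \
+    --voc_checkpoint download/pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \
+    --voc_stat download/pwg_aishell3_ckpt_0.5/feats_stats.npy \
+    --lang en \
+    --ngpu 0
+```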
+
+Run the following scripts to carry out the experiments:
+```shell
+sh run_sedit_en.sh          # speech editing (English)
+sh run_gen_en.sh            # personalized speech synthesis (English)
+sh run_clone_en_to_zh.sh    # cross-lingual speech synthesis (English-to-Chinese cloning)
+```
+
diff --git a/ernie-sat/model_paddle.py b/ernie-sat/model_paddle.py
new file mode 100644
index 0000000000000000000000000000000000000000..fc3caf93d4614662cb33989934055ec24624868a
--- /dev/null
+++ b/ernie-sat/model_paddle.py
@@ -0,0 +1,1057 @@
+import argparse
+from pathlib import Path
+from typing import Any, Callable, Dict, Optional
+from typing import List
+from typing import Sequence
+from typing import Tuple
+from typing import Union
+import humanfriendly
+from matplotlib.collections import Collection
+from matplotlib.pyplot import axis
+import librosa
+import soundfile as sf
+
+import numpy as np
+import paddle
+import paddle.nn.functional as F
+from paddle import nn
+from typeguard import check_argument_types
+import logging
+import math
+import yaml
+from abc import ABC, abstractmethod
+import warnings
+from paddle.amp import auto_cast
+
+import sys, os
+pypath = '..'
+for dir_name in os.listdir(pypath):
+    dir_path = os.path.join(pypath, dir_name)
+    if os.path.isdir(dir_path):
+        sys.path.append(dir_path)
+
+from paddlespeech.t2s.modules.nets_utils import initialize
+from paddlespeech.t2s.modules.nets_utils import make_non_pad_mask
+from paddlespeech.t2s.modules.nets_utils import make_pad_mask
+from paddlespeech.t2s.modules.predictor.duration_predictor import DurationPredictor
+from paddlespeech.t2s.modules.predictor.duration_predictor import DurationPredictorLoss
+from paddlespeech.t2s.modules.predictor.length_regulator import LengthRegulator
+from paddlespeech.t2s.modules.predictor.variance_predictor import VariancePredictor
+from paddlespeech.t2s.modules.tacotron2.decoder import Postnet
+from paddlespeech.t2s.modules.transformer.encoder import CNNDecoder
+from paddlespeech.t2s.modules.transformer.encoder import CNNPostnet
+from paddlespeech.t2s.modules.transformer.encoder import ConformerEncoder
+from paddlespeech.t2s.modules.transformer.encoder import TransformerEncoder
+from paddlespeech.t2s.modules.transformer.embedding import PositionalEncoding, ScaledPositionalEncoding, RelPositionalEncoding
+from paddlespeech.t2s.modules.transformer.subsampling import Conv2dSubsampling
+from paddlespeech.t2s.modules.masked_fill import masked_fill
+from paddlespeech.t2s.modules.transformer.attention import MultiHeadedAttention, RelPositionMultiHeadedAttention
+from paddlespeech.t2s.modules.transformer.positionwise_feed_forward import PositionwiseFeedForward
+from paddlespeech.t2s.modules.transformer.multi_layer_conv import Conv1dLinear, MultiLayeredConv1d
+from paddlespeech.t2s.modules.conformer.convolution import ConvolutionModule
+from paddlespeech.t2s.modules.transformer.repeat import repeat
+from paddlespeech.t2s.modules.conformer.encoder_layer import EncoderLayer
+from paddlespeech.t2s.modules.layer_norm import LayerNorm
+from paddlespeech.s2t.utils.error_rate import ErrorCalculator
+from paddlespeech.t2s.datasets.get_feats import LogMelFBank
+
+class Swish(nn.Layer):
+    """Construct a Swish object."""
+
+    def forward(self, x):
+        """Return Swish activation function."""
+        return x * F.sigmoid(x)
+
+
+def get_activation(act):
+    """Return activation function."""
+
+    activation_funcs = {
+        "hardtanh": nn.Hardtanh,
+        "tanh": nn.Tanh,
+        "relu": nn.ReLU,
+        "selu": nn.SELU,
+        "swish": Swish,
+    }
+
+    return activation_funcs[act]()
+
+class LegacyRelPositionalEncoding(PositionalEncoding):
"""Relative positional encoding module (old version). + + Details can be found in https://github.com/espnet/espnet/pull/2816. + + See : Appendix B in https://arxiv.org/abs/1901.02860 + + Args: + d_model (int): Embedding dimension. + dropout_rate (float): Dropout rate. + max_len (int): Maximum input length. + + """ + def __init__(self, d_model: int, dropout_rate: float, max_len: int=5000): + """ + Args: + d_model (int): Embedding dimension. + dropout_rate (float): Dropout rate. + max_len (int, optional): [Maximum input length.]. Defaults to 5000. + """ + super().__init__(d_model, dropout_rate, max_len, reverse=True) + + def extend_pe(self, x): + """Reset the positional encodings.""" + if self.pe is not None: + if paddle.shape(self.pe)[1] >= paddle.shape(x)[1]: + # if self.pe.dtype != x.dtype or self.pe.device != x.device: + # self.pe = self.pe.to(dtype=x.dtype, device=x.device) + return + pe = paddle.zeros((paddle.shape(x)[1], self.d_model)) + if self.reverse: + position = paddle.arange( + paddle.shape(x)[1] - 1, -1, -1.0, dtype=paddle.float32 + ).unsqueeze(1) + else: + position = paddle.arange(0, paddle.shape(x)[1], dtype=paddle.float32).unsqueeze(1) + div_term = paddle.exp( + paddle.arange(0, self.d_model, 2, dtype=paddle.float32) + * -(math.log(10000.0) / self.d_model) + ) + pe[:, 0::2] = paddle.sin(position * div_term) + pe[:, 1::2] = paddle.cos(position * div_term) + pe = pe.unsqueeze(0) + self.pe = pe + + def forward(self, x: paddle.Tensor) -> Tuple[paddle.Tensor, paddle.Tensor]: + """Compute positional encoding. + Args: + x (paddle.Tensor): Input tensor (batch, time, `*`). + Returns: + paddle.Tensor: Encoded tensor (batch, time, `*`). + paddle.Tensor: Positional embedding tensor (1, time, `*`). + """ + self.extend_pe(x) + x = x * self.xscale + pos_emb = self.pe[:, :paddle.shape(x)[1]] + return self.dropout(x), self.dropout(pos_emb) + +def dump_tensor(var, do_trans = False): + wf = open('/mnt/home/xiaoran/PaddleSpeech-develop/tmp_var.out', 'w') + for num in var.shape: + wf.write(str(num) + ' ') + wf.write('\n') + if do_trans: + var = paddle.transpose(var, [1,0]) + if len(var.shape)==1: + for _var in var: + s = ("%.10f"%_var.item()) + wf.write(s+' ') + elif len(var.shape)==2: + for __var in var: + for _var in __var: + s = ("%.10f"%_var.item()) + wf.write(s+' ') + wf.write('\n') + elif len(var.shape)==3: + for ___var in var: + for __var in ___var: + for _var in __var: + s = ("%.10f"%_var.item()) + wf.write(s+' ') + wf.write('\n') + wf.write('\n') + elif len(var.shape)==4: + for ____var in var: + for ___var in ____var: + for __var in ___var: + for _var in __var: + s = ("%.10f"%_var.item()) + wf.write(s+' ') + wf.write('\n') + wf.write('\n') + wf.write('\n') + +class mySequential(nn.Sequential): + def forward(self, *inputs): + for module in self._sub_layers.values(): + if type(inputs) == tuple: + inputs = module(*inputs) + else: + inputs = module(inputs) + return inputs + +class NewMaskInputLayer(nn.Layer): + __constants__ = ['out_features'] + out_features: int + + def __init__(self, out_features: int, + device=None, dtype=None) -> None: + factory_kwargs = {'device': device, 'dtype': dtype} + super(NewMaskInputLayer, self).__init__() + self.mask_feature = paddle.create_parameter( + shape=(1,1,out_features), + dtype=paddle.float32, + default_initializer=paddle.nn.initializer.Assign(paddle.normal(shape=(1,1,out_features)))) + + def forward(self, input: paddle.Tensor, masked_position=None) -> paddle.Tensor: + masked_position = paddle.expand_as(paddle.unsqueeze(masked_position, -1), input) + 
masked_input = masked_fill(input, masked_position, 0) + masked_fill(paddle.expand_as(self.mask_feature, input), ~masked_position, 0) + return masked_input + +class LegacyRelPositionMultiHeadedAttention(MultiHeadedAttention): + """Multi-Head Attention layer with relative position encoding (old version). + Details can be found in https://github.com/espnet/espnet/pull/2816. + Paper: https://arxiv.org/abs/1901.02860 + + Args: + n_head (int): The number of heads. + n_feat (int): The number of features. + dropout_rate (float): Dropout rate. + zero_triu (bool): Whether to zero the upper triangular part of attention matrix. + """ + + def __init__(self, n_head, n_feat, dropout_rate, zero_triu=False): + """Construct an RelPositionMultiHeadedAttention object.""" + super().__init__(n_head, n_feat, dropout_rate) + self.zero_triu = zero_triu + # linear transformation for positional encoding + self.linear_pos = nn.Linear(n_feat, n_feat, bias_attr=False) + # these two learnable bias are used in matrix c and matrix d + # as described in https://arxiv.org/abs/1901.02860 Section 3.3 + + self.pos_bias_u = paddle.create_parameter( + shape=(self.h, self.d_k), + dtype='float32', + default_initializer=paddle.nn.initializer.XavierUniform()) + self.pos_bias_v = paddle.create_parameter( + shape=(self.h, self.d_k), + dtype='float32', + default_initializer=paddle.nn.initializer.XavierUniform()) + + def rel_shift(self, x): + """Compute relative positional encoding. + Args: + x(Tensor): Input tensor (batch, head, time1, time2). + + Returns: + Tensor:Output tensor. + """ + b, h, t1, t2 = paddle.shape(x) + zero_pad = paddle.zeros((b, h, t1, 1)) + x_padded = paddle.concat([zero_pad, x], axis=-1) + x_padded = paddle.reshape(x_padded, [b, h, t2 + 1, t1]) + # only keep the positions from 0 to time2 + x = paddle.reshape(x_padded[:, :, 1:], [b, h, t1, t2]) + + if self.zero_triu: + ones = paddle.ones((t1, t2)) + x = x * paddle.tril(ones, t2 - 1)[None, None, :, :] + + return x + + def forward(self, query, key, value, pos_emb, mask): + """Compute 'Scaled Dot Product Attention' with rel. positional encoding. + + Args: + query(Tensor): Query tensor (#batch, time1, size). + key(Tensor): Key tensor (#batch, time2, size). + value(Tensor): Value tensor (#batch, time2, size). + pos_emb(Tensor): Positional embedding tensor (#batch, time1, size). + mask(Tensor): Mask tensor (#batch, 1, time2) or (#batch, time1, time2). + + Returns: + Tensor: Output tensor (#batch, time1, d_model). 
+ """ + q, k, v = self.forward_qkv(query, key, value) + # (batch, time1, head, d_k) + q = paddle.transpose(q, [0, 2, 1, 3]) + + n_batch_pos = paddle.shape(pos_emb)[0] + p = paddle.reshape(self.linear_pos(pos_emb), [n_batch_pos, -1, self.h, self.d_k]) + # (batch, head, time1, d_k) + p = paddle.transpose(p, [0, 2, 1, 3]) + # (batch, head, time1, d_k) + q_with_bias_u = paddle.transpose((q + self.pos_bias_u), [0, 2, 1, 3]) + # (batch, head, time1, d_k) + q_with_bias_v = paddle.transpose((q + self.pos_bias_v), [0, 2, 1, 3]) + + # compute attention score + # first compute matrix a and matrix c + # as described in https://arxiv.org/abs/1901.02860 Section 3.3 + # (batch, head, time1, time2) + matrix_ac = paddle.matmul(q_with_bias_u, paddle.transpose(k, [0, 1, 3, 2])) + + # compute matrix b and matrix d + # (batch, head, time1, time1) + matrix_bd = paddle.matmul(q_with_bias_v, paddle.transpose(p, [0, 1, 3, 2])) + matrix_bd = self.rel_shift(matrix_bd) + # (batch, head, time1, time2) + scores = (matrix_ac + matrix_bd) / math.sqrt(self.d_k) + + return self.forward_attention(v, scores, mask) + +class MLMEncoder(nn.Layer): + """Conformer encoder module. + + Args: + idim (int): Input dimension. + attention_dim (int): Dimension of attention. + attention_heads (int): The number of heads of multi head attention. + linear_units (int): The number of units of position-wise feed forward. + num_blocks (int): The number of decoder blocks. + dropout_rate (float): Dropout rate. + positional_dropout_rate (float): Dropout rate after adding positional encoding. + attention_dropout_rate (float): Dropout rate in attention. + input_layer (Union[str, paddle.nn.Layer]): Input layer type. + normalize_before (bool): Whether to use layer_norm before the first block. + concat_after (bool): Whether to concat attention layer's input and output. + if True, additional linear will be applied. + i.e. x -> x + linear(concat(x, att(x))) + if False, no additional linear will be applied. i.e. x -> x + att(x) + positionwise_layer_type (str): "linear", "conv1d", or "conv1d-linear". + positionwise_conv_kernel_size (int): Kernel size of positionwise conv1d layer. + macaron_style (bool): Whether to use macaron style for positionwise layer. + pos_enc_layer_type (str): Encoder positional encoding layer type. + selfattention_layer_type (str): Encoder attention layer type. + activation_type (str): Encoder activation function type. + use_cnn_module (bool): Whether to use convolution module. + zero_triu (bool): Whether to zero the upper triangular part of attention matrix. + cnn_module_kernel (int): Kernerl size of convolution module. + padding_idx (int): Padding idx for input_layer=embed. + stochastic_depth_rate (float): Maximum probability to skip the encoder layer. + intermediate_layers (Union[List[int], None]): indices of intermediate CTC layer. + indices start from 1. + if not None, intermediate outputs are returned (which changes return type + signature.) 
+ + """ + def __init__( + self, + idim, + vocab_size=0, + pre_speech_layer: int = 0, + attention_dim=256, + attention_heads=4, + linear_units=2048, + num_blocks=6, + dropout_rate=0.1, + positional_dropout_rate=0.1, + attention_dropout_rate=0.0, + input_layer="conv2d", + normalize_before=True, + concat_after=False, + positionwise_layer_type="linear", + positionwise_conv_kernel_size=1, + macaron_style=False, + pos_enc_layer_type="abs_pos", + pos_enc_class=None, + selfattention_layer_type="selfattn", + activation_type="swish", + use_cnn_module=False, + zero_triu=False, + cnn_module_kernel=31, + padding_idx=-1, + stochastic_depth_rate=0.0, + intermediate_layers=None, + text_masking = False + ): + """Construct an Encoder object.""" + super(MLMEncoder, self).__init__() + self._output_size = attention_dim + self.text_masking=text_masking + if self.text_masking: + self.text_masking_layer = NewMaskInputLayer(attention_dim) + activation = get_activation(activation_type) + if pos_enc_layer_type == "abs_pos": + pos_enc_class = PositionalEncoding + elif pos_enc_layer_type == "scaled_abs_pos": + pos_enc_class = ScaledPositionalEncoding + elif pos_enc_layer_type == "rel_pos": + assert selfattention_layer_type == "rel_selfattn" + pos_enc_class = RelPositionalEncoding + elif pos_enc_layer_type == "legacy_rel_pos": + pos_enc_class = LegacyRelPositionalEncoding + assert selfattention_layer_type == "legacy_rel_selfattn" + else: + raise ValueError("unknown pos_enc_layer: " + pos_enc_layer_type) + + self.conv_subsampling_factor = 1 + if input_layer == "linear": + self.embed = nn.Sequential( + nn.Linear(idim, attention_dim), + nn.LayerNorm(attention_dim), + nn.Dropout(dropout_rate), + nn.ReLU(), + pos_enc_class(attention_dim, positional_dropout_rate), + ) + elif input_layer == "conv2d": + self.embed = Conv2dSubsampling( + idim, + attention_dim, + dropout_rate, + pos_enc_class(attention_dim, positional_dropout_rate), + ) + self.conv_subsampling_factor = 4 + elif input_layer == "embed": + self.embed = nn.Sequential( + nn.Embedding(idim, attention_dim, padding_idx=padding_idx), + pos_enc_class(attention_dim, positional_dropout_rate), + ) + elif input_layer == "mlm": + self.segment_emb = None + self.speech_embed = mySequential( + NewMaskInputLayer(idim), + nn.Linear(idim, attention_dim), + nn.LayerNorm(attention_dim), + nn.ReLU(), + pos_enc_class(attention_dim, positional_dropout_rate) + ) + self.text_embed = nn.Sequential( + nn.Embedding(vocab_size, attention_dim, padding_idx=padding_idx), + pos_enc_class(attention_dim, positional_dropout_rate), + ) + elif input_layer=="sega_mlm": + self.segment_emb = nn.Embedding(500, attention_dim, padding_idx=padding_idx) + self.speech_embed = mySequential( + NewMaskInputLayer(idim), + nn.Linear(idim, attention_dim), + nn.LayerNorm(attention_dim), + nn.ReLU(), + pos_enc_class(attention_dim, positional_dropout_rate) + ) + self.text_embed = nn.Sequential( + nn.Embedding(vocab_size, attention_dim, padding_idx=padding_idx), + pos_enc_class(attention_dim, positional_dropout_rate), + ) + elif isinstance(input_layer, nn.Layer): + self.embed = nn.Sequential( + input_layer, + pos_enc_class(attention_dim, positional_dropout_rate), + ) + elif input_layer is None: + self.embed = nn.Sequential( + pos_enc_class(attention_dim, positional_dropout_rate) + ) + else: + raise ValueError("unknown input_layer: " + input_layer) + self.normalize_before = normalize_before + + # self-attention module definition + if selfattention_layer_type == "selfattn": + logging.info("encoder self-attention layer type 
= self-attention") + encoder_selfattn_layer = MultiHeadedAttention + encoder_selfattn_layer_args = ( + attention_heads, + attention_dim, + attention_dropout_rate, + ) + elif selfattention_layer_type == "legacy_rel_selfattn": + assert pos_enc_layer_type == "legacy_rel_pos" + encoder_selfattn_layer = LegacyRelPositionMultiHeadedAttention + encoder_selfattn_layer_args = ( + attention_heads, + attention_dim, + attention_dropout_rate, + ) + elif selfattention_layer_type == "rel_selfattn": + logging.info("encoder self-attention layer type = relative self-attention") + assert pos_enc_layer_type == "rel_pos" + encoder_selfattn_layer = RelPositionMultiHeadedAttention + encoder_selfattn_layer_args = ( + attention_heads, + attention_dim, + attention_dropout_rate, + zero_triu, + ) + else: + raise ValueError("unknown encoder_attn_layer: " + selfattention_layer_type) + + # feed-forward module definition + if positionwise_layer_type == "linear": + positionwise_layer = PositionwiseFeedForward + positionwise_layer_args = ( + attention_dim, + linear_units, + dropout_rate, + activation, + ) + elif positionwise_layer_type == "conv1d": + positionwise_layer = MultiLayeredConv1d + positionwise_layer_args = ( + attention_dim, + linear_units, + positionwise_conv_kernel_size, + dropout_rate, + ) + elif positionwise_layer_type == "conv1d-linear": + positionwise_layer = Conv1dLinear + positionwise_layer_args = ( + attention_dim, + linear_units, + positionwise_conv_kernel_size, + dropout_rate, + ) + else: + raise NotImplementedError("Support only linear or conv1d.") + + # convolution module definition + convolution_layer = ConvolutionModule + convolution_layer_args = (attention_dim, cnn_module_kernel, activation) + + self.encoders = repeat( + num_blocks, + lambda lnum: EncoderLayer( + attention_dim, + encoder_selfattn_layer(*encoder_selfattn_layer_args), + positionwise_layer(*positionwise_layer_args), + positionwise_layer(*positionwise_layer_args) if macaron_style else None, + convolution_layer(*convolution_layer_args) if use_cnn_module else None, + dropout_rate, + normalize_before, + concat_after, + stochastic_depth_rate * float(1 + lnum) / num_blocks, + ), + ) + self.pre_speech_layer = pre_speech_layer + self.pre_speech_encoders = repeat( + self.pre_speech_layer, + lambda lnum: EncoderLayer( + attention_dim, + encoder_selfattn_layer(*encoder_selfattn_layer_args), + positionwise_layer(*positionwise_layer_args), + positionwise_layer(*positionwise_layer_args) if macaron_style else None, + convolution_layer(*convolution_layer_args) if use_cnn_module else None, + dropout_rate, + normalize_before, + concat_after, + stochastic_depth_rate * float(1 + lnum) / self.pre_speech_layer, + ), + ) + if self.normalize_before: + self.after_norm = LayerNorm(attention_dim) + + self.intermediate_layers = intermediate_layers + + + def forward(self, speech_pad, text_pad, masked_position, speech_mask=None, text_mask=None,speech_segment_pos=None, text_segment_pos=None): + """Encode input sequence. 
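+
+        Args:
+            speech_pad (paddle.Tensor): Padded speech feature tensor (#batch, time, idim).
+            text_pad (paddle.Tensor): Padded text token ids (#batch, text_len).
+            masked_position (paddle.Tensor): Mask of speech frames to be replaced by the
+                learnable mask embedding.
+            speech_mask / text_mask (paddle.Tensor): Attention masks for speech and text.
+            speech_segment_pos / text_segment_pos (paddle.Tensor): Segment indices for the
+                segment embedding (used by the sega_mlm input layer).
+
+        Returns:
+            paddle.Tensor: Encoded tensor over the concatenated speech and text sequence.
+            paddle.Tensor: Combined mask tensor.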
+ + """ + if masked_position is not None: + speech_pad = self.speech_embed(speech_pad, masked_position) + else: + speech_pad = self.speech_embed(speech_pad) + # pure speech input + if -2 in np.array(text_pad): + text_pad = text_pad+3 + text_mask = paddle.unsqueeze(bool(text_pad), 1) + text_segment_pos = paddle.zeros_like(text_pad) + text_pad = self.text_embed(text_pad) + text_pad = (text_pad[0] + self.segment_emb(text_segment_pos), text_pad[1]) + text_segment_pos=None + elif text_pad is not None: + text_pad = self.text_embed(text_pad) + segment_emb = None + if speech_segment_pos is not None and text_segment_pos is not None and self.segment_emb: + speech_segment_emb = self.segment_emb(speech_segment_pos) + text_segment_emb = self.segment_emb(text_segment_pos) + text_pad = (text_pad[0] + text_segment_emb, text_pad[1]) + speech_pad = (speech_pad[0] + speech_segment_emb, speech_pad[1]) + segment_emb = paddle.concat([speech_segment_emb, text_segment_emb],axis=1) + if self.pre_speech_encoders: + speech_pad, _ = self.pre_speech_encoders(speech_pad, speech_mask) + + if text_pad is not None: + xs = paddle.concat([speech_pad[0], text_pad[0]], axis=1) + xs_pos_emb = paddle.concat([speech_pad[1], text_pad[1]], axis=1) + masks = paddle.concat([speech_mask,text_mask],axis=-1) + else: + xs = speech_pad[0] + xs_pos_emb = speech_pad[1] + masks = speech_mask + + xs, masks = self.encoders((xs,xs_pos_emb), masks) + + if isinstance(xs, tuple): + xs = xs[0] + if self.normalize_before: + xs = self.after_norm(xs) + + return xs, masks #, segment_emb + + +class MLMDecoder(MLMEncoder): + + def forward(self, xs, masks, masked_position=None,segment_emb=None): + """Encode input sequence. + + Args: + xs (paddle.Tensor): Input tensor (#batch, time, idim). + masks (paddle.Tensor): Mask tensor (#batch, time). + + Returns: + paddle.Tensor: Output tensor (#batch, time, attention_dim). + paddle.Tensor: Mask tensor (#batch, time). + + """ + emb, mlm_position = None, None + if not self.training: + masked_position = None + # if isinstance(self.embed, (Conv2dSubsampling, VGG2L)): + # xs, masks = self.embed(xs, masks) + # else: + xs = self.embed(xs) + if segment_emb: + xs = (xs[0] + segment_emb, xs[1]) + if self.intermediate_layers is None: + xs, masks = self.encoders(xs, masks) + else: + intermediate_outputs = [] + for layer_idx, encoder_layer in enumerate(self.encoders): + xs, masks = encoder_layer(xs, masks) + + if ( + self.intermediate_layers is not None + and layer_idx + 1 in self.intermediate_layers + ): + encoder_output = xs + # intermediate branches also require normalization. + if self.normalize_before: + encoder_output = self.after_norm(encoder_output) + intermediate_outputs.append(encoder_output) + if isinstance(xs, tuple): + xs = xs[0] + if self.normalize_before: + xs = self.after_norm(xs) + + if self.intermediate_layers is not None: + return xs, masks, intermediate_outputs + return xs, masks + +class AbsESPnetModel(nn.Layer, ABC): + """The common abstract class among each tasks + + "ESPnetModel" is referred to a class which inherits paddle.nn.Layer, + and makes the dnn-models forward as its member field, + a.k.a delegate pattern, + and defines "loss", "stats", and "weight" for the task. + + If you intend to implement new task in ESPNet, + the model must inherit this class. + In other words, the "mediator" objects between + our training system and the your task class are + just only these three values, loss, stats, and weight. 
+ + Example: + >>> from espnet2.tasks.abs_task import AbsTask + >>> class YourESPnetModel(AbsESPnetModel): + ... def forward(self, input, input_lengths): + ... ... + ... return loss, stats, weight + >>> class YourTask(AbsTask): + ... @classmethod + ... def build_model(cls, args: argparse.Namespace) -> YourESPnetModel: + """ + + @abstractmethod + def forward( + self, **batch: paddle.Tensor + ) -> Tuple[paddle.Tensor, Dict[str, paddle.Tensor], paddle.Tensor]: + raise NotImplementedError + + @abstractmethod + def collect_feats(self, **batch: paddle.Tensor) -> Dict[str, paddle.Tensor]: + raise NotImplementedError + +class AbsFeatsExtract(nn.Layer, ABC): + @abstractmethod + def output_size(self) -> int: + raise NotImplementedError + + @abstractmethod + def get_parameters(self) -> Dict[str, Any]: + raise NotImplementedError + + @abstractmethod + def forward( + self, input: paddle.Tensor, input_lengths: paddle.Tensor + ) -> Tuple[paddle.Tensor, paddle.Tensor]: + raise NotImplementedError + +class AbsNormalize(nn.Layer, ABC): + @abstractmethod + def forward( + self, input: paddle.Tensor, input_lengths: paddle.Tensor = None + ) -> Tuple[paddle.Tensor, paddle.Tensor]: + # return output, output_lengths + raise NotImplementedError + + + +def pad_to_longformer_att_window(text, max_len, max_tlen,attention_window): + round = max_len % attention_window + if round != 0: + max_tlen += (attention_window - round) + n_batch = paddle.shape(text)[0] + text_pad = paddle.zeros(shape = (n_batch, max_tlen, *paddle.shape(text[0])[1:]), dtype=text.dtype) + for i in range(n_batch): + text_pad[i, : paddle.shape(text[i])[0]] = text[i] + else: + text_pad = text[:, : max_tlen] + return text_pad, max_tlen + +class ESPnetMLMModel(AbsESPnetModel): + def __init__( + self, + token_list: Union[Tuple[str, ...], List[str]], + odim: int, + feats_extract: Optional[AbsFeatsExtract], + normalize: Optional[AbsNormalize], + encoder: nn.Layer, + decoder: Optional[nn.Layer], + postnet_layers: int = 0, + postnet_chans: int = 0, + postnet_filts: int = 0, + ignore_id: int = -1, + lsm_weight: float = 0.0, + length_normalized_loss: bool = False, + report_cer: bool = True, + report_wer: bool = True, + sym_space: str = "", + sym_blank: str = "", + masking_schema: str = "span", + mean_phn_span: int = 3, + mlm_prob: float = 0.25, + dynamic_mlm_prob = False, + decoder_seg_pos=False, + text_masking=False + ): + + super().__init__() + # note that eos is the same as sos (equivalent ID) + self.odim = odim + self.ignore_id = ignore_id + self.token_list = token_list.copy() + + self.normalize = normalize + self.encoder = encoder + + self.decoder = decoder + self.vocab_size = encoder.text_embed[0]._num_embeddings + if report_cer or report_wer: + self.error_calculator = ErrorCalculator( + token_list, sym_space, sym_blank, report_cer, report_wer + ) + else: + self.error_calculator = None + + self.feats_extract = feats_extract + self.mlm_weight = 1.0 + self.mlm_prob = mlm_prob + self.mlm_layer = 12 + self.finetune_wo_mlm =True + self.max_span = 50 + self.min_span = 4 + self.mean_phn_span = mean_phn_span + self.masking_schema = masking_schema + if self.decoder is None or not (hasattr(self.decoder, 'output_layer') and self.decoder.output_layer is not None): + self.sfc = nn.Linear(self.encoder._output_size, odim) + else: + self.sfc=None + if text_masking: + self.text_sfc = nn.Linear(self.encoder.text_embed[0]._embedding_dim, self.vocab_size, weight_attr = self.encoder.text_embed[0]._weight_attr) + self.text_mlm_loss = nn.CrossEntropyLoss(ignore_index=ignore_id) 
+ else: + self.text_sfc = None + self.text_mlm_loss = None + self.decoder_seg_pos = decoder_seg_pos + if lsm_weight > 50: + self.l1_loss_func = nn.MSELoss(reduce=False) + else: + self.l1_loss_func = nn.L1Loss(reduction='none') + self.postnet = ( + None + if postnet_layers == 0 + else Postnet( + idim=self.encoder._output_size, + odim=odim, + n_layers=postnet_layers, + n_chans=postnet_chans, + n_filts=postnet_filts, + use_batch_norm=True, + dropout_rate=0.5, + ) + ) + + def collect_feats(self, + speech, speech_lengths, text, text_lengths, masked_position, speech_mask, text_mask, speech_segment_pos, text_segment_pos, y_masks=None + ) -> Dict[str, paddle.Tensor]: + return {"feats": speech, "feats_lengths": speech_lengths} + + def _forward(self, batch, speech_segment_pos,y_masks=None): + # feats: (Batch, Length, Dim) + # -> encoder_out: (Batch, Length2, Dim2) + speech_pad_placeholder = batch['speech_pad'] + if self.decoder is not None: + ys_in = self._add_first_frame_and_remove_last_frame(batch['speech_pad']) + encoder_out, h_masks = self.encoder(**batch) + if self.decoder is not None: + zs, _ = self.decoder(ys_in, y_masks, encoder_out, bool(h_masks), self.encoder.segment_emb(speech_segment_pos)) + speech_hidden_states = zs + else: + speech_hidden_states = encoder_out[:,:paddle.shape(batch['speech_pad'])[1], :] + if self.sfc is not None: + before_outs = paddle.reshape(self.sfc(speech_hidden_states), (paddle.shape(speech_hidden_states)[0], -1, self.odim)) + else: + before_outs = speech_hidden_states + if self.postnet is not None: + after_outs = before_outs + paddle.transpose(self.postnet( + paddle.transpose(before_outs, [0, 2, 1]) + ), (0, 2, 1)) + else: + after_outs = None + return before_outs, after_outs, speech_pad_placeholder, batch['masked_position'] + + + + + def inference( + self, + speech, text, masked_position, speech_mask, text_mask, speech_segment_pos, text_segment_pos, + span_boundary, + y_masks=None, + speech_lengths=None, text_lengths=None, + feats: Optional[paddle.Tensor] = None, + spembs: Optional[paddle.Tensor] = None, + sids: Optional[paddle.Tensor] = None, + lids: Optional[paddle.Tensor] = None, + threshold: float = 0.5, + minlenratio: float = 0.0, + maxlenratio: float = 10.0, + use_teacher_forcing: bool = False, + ) -> Dict[str, paddle.Tensor]: + + + batch = dict( + speech_pad=speech, + text_pad=text, + masked_position=masked_position, + speech_mask=speech_mask, + text_mask=text_mask, + speech_segment_pos=speech_segment_pos, + text_segment_pos=text_segment_pos, + ) + + + # # inference with teacher forcing + # hs, h_masks = self.encoder(**batch) + + outs = [batch['speech_pad'][:,:span_boundary[0]]] + z_cache = None + if use_teacher_forcing: + before,zs, _, _ = self._forward( + batch, speech_segment_pos, y_masks=y_masks) + if zs is None: + zs = before + outs+=[zs[0][span_boundary[0]:span_boundary[1]]] + outs+=[batch['speech_pad'][:,span_boundary[1]:]] + return dict(feat_gen=outs) + + # concatenate attention weights -> (#layers, #heads, T_feats, T_text) + att_ws = paddle.stack(att_ws, axis=0) + outs += [batch['speech_pad'][:,span_boundary[1]:]] + return dict(feat_gen=outs, att_w=att_ws) + + + def _add_first_frame_and_remove_last_frame(self, ys: paddle.Tensor) -> paddle.Tensor: + ys_in = paddle.concat( + [paddle.zeros(shape = (paddle.shape(ys)[0], 1, paddle.shape(ys)[2]), dtype = ys.dtype), ys[:, :-1]], axis=1 + ) + return ys_in + + +class ESPnetMLMEncAsDecoderModel(ESPnetMLMModel): + + def _forward(self, batch, speech_segment_pos, y_masks=None): + # feats: (Batch, Length, Dim) + 
# -> encoder_out: (Batch, Length2, Dim2) + speech_pad_placeholder = batch['speech_pad'] + encoder_out, h_masks = self.encoder(**batch) # segment_emb + if self.decoder is not None: + zs, _ = self.decoder(encoder_out, h_masks) + else: + zs = encoder_out + speech_hidden_states = zs[:,:paddle.shape(batch['speech_pad'])[1], :] + if self.sfc is not None: + before_outs = paddle.reshape(self.sfc(speech_hidden_states), (paddle.shape(speech_hidden_states)[0], -1, self.odim)) + else: + before_outs = speech_hidden_states + if self.postnet is not None: + after_outs = before_outs + paddle.transpose(self.postnet( + paddle.transpose(before_outs, [0, 2, 1]) + ), [0, 2, 1]) + else: + after_outs = None + return before_outs, after_outs, speech_pad_placeholder, batch['masked_position'] + +class ESPnetMLMDualMaksingModel(ESPnetMLMModel): + + def _calc_mlm_loss( + self, + before_outs: paddle.Tensor, + after_outs: paddle.Tensor, + text_outs: paddle.Tensor, + batch + ): + xs_pad = batch['speech_pad'] + text_pad = batch['text_pad'] + masked_position = batch['masked_position'] + text_masked_position = batch['text_masked_position'] + mlm_loss_position = masked_position>0 + loss = paddle.sum(self.l1_loss_func(paddle.reshape(before_outs, (-1, self.odim)), + paddle.reshape(xs_pad, (-1, self.odim))), axis=-1) + if after_outs is not None: + loss += paddle.sum(self.l1_loss_func(paddle.reshape(after_outs, (-1, self.odim)), + paddle.reshape(xs_pad, (-1, self.odim))), axis=-1) + loss_mlm = paddle.sum((loss * paddle.reshape(mlm_loss_position, axis=-1).float())) \ + / paddle.sum((mlm_loss_position.float()) + 1e-10) + + loss_text = paddle.sum((self.text_mlm_loss(paddle.reshape(text_outs, (-1,self.vocab_size)), paddle.reshape(text_pad, (-1))) * paddle.reshape(text_masked_position, (-1)).float())) \ + / paddle.sum((text_masked_position.float()) + 1e-10) + return loss_mlm, loss_text + + + def _forward(self, batch, speech_segment_pos, y_masks=None): + # feats: (Batch, Length, Dim) + # -> encoder_out: (Batch, Length2, Dim2) + speech_pad_placeholder = batch['speech_pad'] + encoder_out, h_masks = self.encoder(**batch) # segment_emb + if self.decoder is not None: + zs, _ = self.decoder(encoder_out, h_masks) + else: + zs = encoder_out + speech_hidden_states = zs[:,:paddle.shape(batch['speech_pad'])[1], :] + if self.text_sfc: + text_hiddent_states = zs[:,paddle.shape(batch['speech_pad'])[1]:,:] + text_outs = paddle.reshape(self.text_sfc(text_hiddent_states), (paddle.shape(text_hiddent_states)[0], -1, self.vocab_size)) + if self.sfc is not None: + before_outs = paddle.reshape(self.sfc(speech_hidden_states), + (paddle.shape(speech_hidden_states)[0], -1, self.odim)) + else: + before_outs = speech_hidden_states + if self.postnet is not None: + after_outs = before_outs + paddle.transpose(self.postnet( + paddle.transpose(before_outs, [0,2,1]) + ), [0, 2, 1]) + else: + after_outs = None + return before_outs, after_outs,text_outs, None #, speech_pad_placeholder, batch['masked_position'],batch['text_masked_position'] + +def build_model_from_file(config_file, model_file): + + state_dict = paddle.load(model_file) + model_class = ESPnetMLMDualMaksingModel if 'conformer_combine_vctk_aishell3_dual_masking' in config_file \ + else ESPnetMLMEncAsDecoderModel + + # 构建模型 + args = yaml.safe_load(Path(config_file).open("r", encoding="utf-8")) + args = argparse.Namespace(**args) + + model = build_model(args, model_class) + + model.set_state_dict(state_dict) + return model, args + + +def build_model(args: argparse.Namespace, model_class = 
ESPnetMLMEncAsDecoderModel) -> ESPnetMLMModel: + if isinstance(args.token_list, str): + with open(args.token_list, encoding="utf-8") as f: + token_list = [line.rstrip() for line in f] + + # Overwriting token_list to keep it as "portable". + args.token_list = list(token_list) + elif isinstance(args.token_list, (tuple, list)): + token_list = list(args.token_list) + else: + raise RuntimeError("token_list must be str or list") + vocab_size = len(token_list) + logging.info(f"Vocabulary size: {vocab_size }") + + odim = 80 + + + # Normalization layer + normalize = None + + pos_enc_class = ScaledPositionalEncoding if args.use_scaled_pos_enc else PositionalEncoding + + if "conformer" == args.encoder: + conformer_self_attn_layer_type = args.encoder_conf['selfattention_layer_type'] + conformer_pos_enc_layer_type = args.encoder_conf['pos_enc_layer_type'] + conformer_rel_pos_type = "legacy" + if conformer_rel_pos_type == "legacy": + if conformer_pos_enc_layer_type == "rel_pos": + conformer_pos_enc_layer_type = "legacy_rel_pos" + logging.warning( + "Fallback to conformer_pos_enc_layer_type = 'legacy_rel_pos' " + "due to the compatibility. If you want to use the new one, " + "please use conformer_pos_enc_layer_type = 'latest'." + ) + if conformer_self_attn_layer_type == "rel_selfattn": + conformer_self_attn_layer_type = "legacy_rel_selfattn" + logging.warning( + "Fallback to " + "conformer_self_attn_layer_type = 'legacy_rel_selfattn' " + "due to the compatibility. If you want to use the new one, " + "please use conformer_pos_enc_layer_type = 'latest'." + ) + elif conformer_rel_pos_type == "latest": + assert conformer_pos_enc_layer_type != "legacy_rel_pos" + assert conformer_self_attn_layer_type != "legacy_rel_selfattn" + else: + raise ValueError(f"Unknown rel_pos_type: {conformer_rel_pos_type}") + args.encoder_conf['selfattention_layer_type'] = conformer_self_attn_layer_type + args.encoder_conf['pos_enc_layer_type'] = conformer_pos_enc_layer_type + if "conformer"==args.decoder: + args.decoder_conf['selfattention_layer_type'] = conformer_self_attn_layer_type + args.decoder_conf['pos_enc_layer_type'] = conformer_pos_enc_layer_type + + + # Encoder + encoder_class = MLMEncoder + + if 'text_masking' in args.model_conf.keys() and args.model_conf['text_masking']: + args.encoder_conf['text_masking'] = True + else: + args.encoder_conf['text_masking'] = False + + encoder = encoder_class(args.input_size,vocab_size=vocab_size, pos_enc_class=pos_enc_class, + **args.encoder_conf) + + # Decoder + if args.decoder != 'no_decoder': + decoder_class = MLMDecoder + decoder = decoder_class( + idim=0, + input_layer=None, + **args.decoder_conf, + ) + else: + decoder = None + + # Build model + model = model_class( + feats_extract=None, # maybe should be LogMelFbank + odim=odim, + normalize=normalize, + encoder=encoder, + decoder=decoder, + token_list=token_list, + **args.model_conf, + ) + + + # Initialize + if args.init is not None: + initialize(model, args.init) + + return model diff --git a/ernie-sat/paddlespeech/__init__.py b/ernie-sat/paddlespeech/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b781c4a8e5cc99590e179faf1c4c3989349d4216 --- /dev/null +++ b/ernie-sat/paddlespeech/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import _locale + +_locale._getdefaultlocale = (lambda *args: ['en_US', 'utf8']) diff --git a/ernie-sat/paddlespeech/cli/README.md b/ernie-sat/paddlespeech/cli/README.md new file mode 100644 index 0000000000000000000000000000000000000000..19c822040de6699123781f14b6eac5bcf3ca15a6 --- /dev/null +++ b/ernie-sat/paddlespeech/cli/README.md @@ -0,0 +1,44 @@ +# PaddleSpeech Command Line + +([简体中文](./README_cn.md)|English) + + The simplest approach to use PaddleSpeech models. + + ## Help + ```bash + paddlespeech help + ``` + ## Audio Classification + ```bash + paddlespeech cls --input input.wav + ``` + + ## Speaker Verification + + ```bash + paddlespeech vector --task spk --input input_16k.wav + ``` + + ## Automatic Speech Recognition + ``` + paddlespeech asr --lang zh --input input_16k.wav + ``` + + ## Speech Translation (English to Chinese) + + (not support for Windows now) + ```bash + paddlespeech st --input input_16k.wav + ``` + + ## Text-to-Speech + ```bash + paddlespeech tts --input "你好,欢迎使用百度飞桨深度学习框架!" --output output.wav + ``` + + ## Text Post-precessing + +- Punctuation Restoration + ```bash + paddlespeech text --task punc --input 今天的天气真不错啊你下午有空吗我想约你一起去吃饭 + ``` diff --git a/ernie-sat/paddlespeech/cli/README_cn.md b/ernie-sat/paddlespeech/cli/README_cn.md new file mode 100644 index 0000000000000000000000000000000000000000..4b15d6c7bc68a39075aba7efb37a04e687b5ab35 --- /dev/null +++ b/ernie-sat/paddlespeech/cli/README_cn.md @@ -0,0 +1,45 @@ +# PaddleSpeech 命令行工具 + +(简体中文|[English](./README.md)) + +`paddlespeech.cli` 模块是 PaddleSpeech 的命令行工具,它提供了最简便的方式调用 PaddleSpeech 提供的不同语音应用场景的预训练模型,用一行命令就可以进行模型预测: + + ## 命令行使用帮助 + ```bash + paddlespeech help + ``` + + ## 声音分类 + ```bash + paddlespeech cls --input input.wav + ``` + + ## 声纹识别 + + ```bash + paddlespeech vector --task spk --input input_16k.wav + ``` + + ## 语音识别 + ``` + paddlespeech asr --lang zh --input input_16k.wav + ``` + + ## 语音翻译(英-中) + + (暂不支持Windows系统) + ```bash + paddlespeech st --input input_16k.wav + ``` + + ## 语音合成 + ```bash + paddlespeech tts --input "你好,欢迎使用百度飞桨深度学习框架!" --output output.wav + ``` + + ## 文本后处理 + +- 标点恢复 + ```bash + paddlespeech text --task punc --input 今天的天气真不错啊你下午有空吗我想约你一起去吃饭 + ``` diff --git a/ernie-sat/paddlespeech/cli/__init__.py b/ernie-sat/paddlespeech/cli/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ddf0359bc5fcb7ff80b437a65112869d7faa12eb --- /dev/null +++ b/ernie-sat/paddlespeech/cli/__init__.py @@ -0,0 +1,26 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import _locale + +from .asr import ASRExecutor +from .base_commands import BaseCommand +from .base_commands import HelpCommand +from .cls import CLSExecutor +from .st import STExecutor +from .stats import StatsExecutor +from .text import TextExecutor +from .tts import TTSExecutor +from .vector import VectorExecutor + +_locale._getdefaultlocale = (lambda *args: ['en_US', 'utf8']) diff --git a/ernie-sat/paddlespeech/cli/asr/__init__.py b/ernie-sat/paddlespeech/cli/asr/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8ab0991fcda4c5eb9e5bc0c58de0e417c113f4b4 --- /dev/null +++ b/ernie-sat/paddlespeech/cli/asr/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from .infer import ASRExecutor diff --git a/ernie-sat/paddlespeech/cli/asr/infer.py b/ernie-sat/paddlespeech/cli/asr/infer.py new file mode 100644 index 0000000000000000000000000000000000000000..b12b9f6fce89a44564ed66a4346b10032100a4af --- /dev/null +++ b/ernie-sat/paddlespeech/cli/asr/infer.py @@ -0,0 +1,544 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse +import os +import sys +from collections import OrderedDict +from typing import List +from typing import Optional +from typing import Union + +import librosa +import numpy as np +import paddle +import soundfile +from yacs.config import CfgNode + +from ..download import get_path_from_url +from ..executor import BaseExecutor +from ..log import logger +from ..utils import cli_register +from ..utils import download_and_decompress +from ..utils import MODEL_HOME +from ..utils import stats_wrapper +from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer +from paddlespeech.s2t.transform.transformation import Transformation +from paddlespeech.s2t.utils.dynamic_import import dynamic_import +from paddlespeech.s2t.utils.utility import UpdateConfig + +__all__ = ['ASRExecutor'] + +pretrained_models = { + # The tags for pretrained_models should be "{model_name}[_{dataset}][-{lang}][-...]". + # e.g. "conformer_wenetspeech-zh-16k" and "panns_cnn6-32k". 
+ # Command line and python api use "{model_name}[_{dataset}]" as --model, usage: + # "paddlespeech asr --model conformer_wenetspeech --lang zh --sr 16000 --input ./input.wav" + "conformer_wenetspeech-zh-16k": { + 'url': + 'https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1_conformer_wenetspeech_ckpt_0.1.1.model.tar.gz', + 'md5': + '76cb19ed857e6623856b7cd7ebbfeda4', + 'cfg_path': + 'model.yaml', + 'ckpt_path': + 'exp/conformer/checkpoints/wenetspeech', + }, + "transformer_librispeech-en-16k": { + 'url': + 'https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr1/asr1_transformer_librispeech_ckpt_0.1.1.model.tar.gz', + 'md5': + '2c667da24922aad391eacafe37bc1660', + 'cfg_path': + 'model.yaml', + 'ckpt_path': + 'exp/transformer/checkpoints/avg_10', + }, + "deepspeech2offline_aishell-zh-16k": { + 'url': + 'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_aishell_ckpt_0.1.1.model.tar.gz', + 'md5': + '932c3593d62fe5c741b59b31318aa314', + 'cfg_path': + 'model.yaml', + 'ckpt_path': + 'exp/deepspeech2/checkpoints/avg_1', + 'lm_url': + 'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm', + 'lm_md5': + '29e02312deb2e59b3c8686c7966d4fe3' + }, + "deepspeech2online_aishell-zh-16k": { + 'url': + 'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz', + 'md5': + '23e16c69730a1cb5d735c98c83c21e16', + 'cfg_path': + 'model.yaml', + 'ckpt_path': + 'exp/deepspeech2_online/checkpoints/avg_1', + 'lm_url': + 'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm', + 'lm_md5': + '29e02312deb2e59b3c8686c7966d4fe3' + }, + "deepspeech2offline_librispeech-en-16k": { + 'url': + 'https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr0/asr0_deepspeech2_librispeech_ckpt_0.1.1.model.tar.gz', + 'md5': + 'f5666c81ad015c8de03aac2bc92e5762', + 'cfg_path': + 'model.yaml', + 'ckpt_path': + 'exp/deepspeech2/checkpoints/avg_1', + 'lm_url': + 'https://deepspeech.bj.bcebos.com/en_lm/common_crawl_00.prune01111.trie.klm', + 'lm_md5': + '099a601759d467cd0a8523ff939819c5' + }, +} + +model_alias = { + "deepspeech2offline": + "paddlespeech.s2t.models.ds2:DeepSpeech2Model", + "deepspeech2online": + "paddlespeech.s2t.models.ds2_online:DeepSpeech2ModelOnline", + "conformer": + "paddlespeech.s2t.models.u2:U2Model", + "transformer": + "paddlespeech.s2t.models.u2:U2Model", + "wenetspeech": + "paddlespeech.s2t.models.u2:U2Model", +} + + +@cli_register( + name='paddlespeech.asr', description='Speech to text infer command.') +class ASRExecutor(BaseExecutor): + def __init__(self): + super(ASRExecutor, self).__init__() + + self.parser = argparse.ArgumentParser( + prog='paddlespeech.asr', add_help=True) + self.parser.add_argument( + '--input', type=str, default=None, help='Audio file to recognize.') + self.parser.add_argument( + '--model', + type=str, + default='conformer_wenetspeech', + choices=[tag[:tag.index('-')] for tag in pretrained_models.keys()], + help='Choose model type of asr task.') + self.parser.add_argument( + '--lang', + type=str, + default='zh', + help='Choose model language. zh or en, zh:[conformer_wenetspeech-zh-16k], en:[transformer_librispeech-en-16k]' + ) + self.parser.add_argument( + "--sample_rate", + type=int, + default=16000, + choices=[8000, 16000], + help='Choose the audio sample rate of the model. 8000 or 16000') + self.parser.add_argument( + '--config', + type=str, + default=None, + help='Config of asr task. 
Use deault config when it is None.') + self.parser.add_argument( + '--decode_method', + type=str, + default='attention_rescoring', + choices=[ + 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention', + 'attention_rescoring' + ], + help='only support transformer and conformer model') + self.parser.add_argument( + '--ckpt_path', + type=str, + default=None, + help='Checkpoint file of model.') + self.parser.add_argument( + '--yes', + '-y', + action="store_true", + default=False, + help='No additional parameters required. Once set this parameter, it means accepting the request of the program by default, which includes transforming the audio sample rate' + ) + self.parser.add_argument( + '--device', + type=str, + default=paddle.get_device(), + help='Choose device to execute model inference.') + self.parser.add_argument( + '-d', + '--job_dump_result', + action='store_true', + help='Save job result into file.') + self.parser.add_argument( + '-v', + '--verbose', + action='store_true', + help='Increase logger verbosity of current task.') + + def _get_pretrained_path(self, tag: str) -> os.PathLike: + """ + Download and returns pretrained resources path of current task. + """ + support_models = list(pretrained_models.keys()) + assert tag in pretrained_models, 'The model "{}" you want to use has not been supported, please choose other models.\nThe support models includes:\n\t\t{}\n'.format( + tag, '\n\t\t'.join(support_models)) + + res_path = os.path.join(MODEL_HOME, tag) + decompressed_path = download_and_decompress(pretrained_models[tag], + res_path) + decompressed_path = os.path.abspath(decompressed_path) + logger.info( + 'Use pretrained model stored in: {}'.format(decompressed_path)) + + return decompressed_path + + def _init_from_path(self, + model_type: str='wenetspeech', + lang: str='zh', + sample_rate: int=16000, + cfg_path: Optional[os.PathLike]=None, + decode_method: str='attention_rescoring', + ckpt_path: Optional[os.PathLike]=None): + """ + Init model and other resources from a specific path. + """ + if hasattr(self, 'model'): + logger.info('Model had been initialized.') + return + + if cfg_path is None or ckpt_path is None: + sample_rate_str = '16k' if sample_rate == 16000 else '8k' + tag = model_type + '-' + lang + '-' + sample_rate_str + res_path = self._get_pretrained_path(tag) # wenetspeech_zh + self.res_path = res_path + self.cfg_path = os.path.join(res_path, + pretrained_models[tag]['cfg_path']) + self.ckpt_path = os.path.join( + res_path, pretrained_models[tag]['ckpt_path'] + ".pdparams") + logger.info(res_path) + logger.info(self.cfg_path) + logger.info(self.ckpt_path) + else: + self.cfg_path = os.path.abspath(cfg_path) + self.ckpt_path = os.path.abspath(ckpt_path + ".pdparams") + self.res_path = os.path.dirname( + os.path.dirname(os.path.abspath(self.cfg_path))) + + #Init body. 
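+        # load the model's yaml config (either the one bundled with the pretrained checkpoint or the user-provided --config)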
+ self.config = CfgNode(new_allowed=True) + self.config.merge_from_file(self.cfg_path) + + with UpdateConfig(self.config): + if "deepspeech2online" in model_type or "deepspeech2offline" in model_type: + from paddlespeech.s2t.io.collator import SpeechCollator + self.vocab = self.config.vocab_filepath + self.config.decode.lang_model_path = os.path.join( + MODEL_HOME, 'language_model', + self.config.decode.lang_model_path) + self.collate_fn_test = SpeechCollator.from_config(self.config) + self.text_feature = TextFeaturizer( + unit_type=self.config.unit_type, vocab=self.vocab) + lm_url = pretrained_models[tag]['lm_url'] + lm_md5 = pretrained_models[tag]['lm_md5'] + self.download_lm( + lm_url, + os.path.dirname(self.config.decode.lang_model_path), lm_md5) + + elif "conformer" in model_type or "transformer" in model_type or "wenetspeech" in model_type: + self.config.spm_model_prefix = os.path.join( + self.res_path, self.config.spm_model_prefix) + self.text_feature = TextFeaturizer( + unit_type=self.config.unit_type, + vocab=self.config.vocab_filepath, + spm_model_prefix=self.config.spm_model_prefix) + self.config.decode.decoding_method = decode_method + + else: + raise Exception("wrong type") + model_name = model_type[:model_type.rindex( + '_')] # model_type: {model_name}_{dataset} + model_class = dynamic_import(model_name, model_alias) + model_conf = self.config + model = model_class.from_config(model_conf) + self.model = model + self.model.eval() + + # load model + model_dict = paddle.load(self.ckpt_path) + self.model.set_state_dict(model_dict) + + def preprocess(self, model_type: str, input: Union[str, os.PathLike]): + """ + Input preprocess and return paddle.Tensor stored in self.input. + Input content can be a text(tts), a file(asr, cls) or a streaming(not supported yet). 
+ """ + + audio_file = input + if isinstance(audio_file, (str, os.PathLike)): + logger.info("Preprocess audio_file:" + audio_file) + + # Get the object for feature extraction + if "deepspeech2online" in model_type or "deepspeech2offline" in model_type: + audio, _ = self.collate_fn_test.process_utterance( + audio_file=audio_file, transcript=" ") + audio_len = audio.shape[0] + audio = paddle.to_tensor(audio, dtype='float32') + audio_len = paddle.to_tensor(audio_len) + audio = paddle.unsqueeze(audio, axis=0) + # vocab_list = collate_fn_test.vocab_list + self._inputs["audio"] = audio + self._inputs["audio_len"] = audio_len + logger.info(f"audio feat shape: {audio.shape}") + + elif "conformer" in model_type or "transformer" in model_type or "wenetspeech" in model_type: + logger.info("get the preprocess conf") + preprocess_conf = self.config.preprocess_config + preprocess_args = {"train": False} + preprocessing = Transformation(preprocess_conf) + logger.info("read the audio file") + audio, audio_sample_rate = soundfile.read( + audio_file, dtype="int16", always_2d=True) + + if self.change_format: + if audio.shape[1] >= 2: + audio = audio.mean(axis=1, dtype=np.int16) + else: + audio = audio[:, 0] + # pcm16 -> pcm 32 + audio = self._pcm16to32(audio) + audio = librosa.resample( + audio, + orig_sr=audio_sample_rate, + target_sr=self.sample_rate) + audio_sample_rate = self.sample_rate + # pcm32 -> pcm 16 + audio = self._pcm32to16(audio) + else: + audio = audio[:, 0] + + logger.info(f"audio shape: {audio.shape}") + # fbank + audio = preprocessing(audio, **preprocess_args) + + audio_len = paddle.to_tensor(audio.shape[0]) + audio = paddle.to_tensor(audio, dtype='float32').unsqueeze(axis=0) + + self._inputs["audio"] = audio + self._inputs["audio_len"] = audio_len + logger.info(f"audio feat shape: {audio.shape}") + + else: + raise Exception("wrong type") + + @paddle.no_grad() + def infer(self, model_type: str): + """ + Model inference and result stored in self.output. + """ + + cfg = self.config.decode + audio = self._inputs["audio"] + audio_len = self._inputs["audio_len"] + if "deepspeech2online" in model_type or "deepspeech2offline" in model_type: + decode_batch_size = audio.shape[0] + self.model.decoder.init_decoder( + decode_batch_size, self.text_feature.vocab_list, + cfg.decoding_method, cfg.lang_model_path, cfg.alpha, cfg.beta, + cfg.beam_size, cfg.cutoff_prob, cfg.cutoff_top_n, + cfg.num_proc_bsearch) + + result_transcripts = self.model.decode(audio, audio_len) + self.model.decoder.del_decoder() + self._outputs["result"] = result_transcripts[0] + + elif "conformer" in model_type or "transformer" in model_type: + result_transcripts = self.model.decode( + audio, + audio_len, + text_feature=self.text_feature, + decoding_method=cfg.decoding_method, + beam_size=cfg.beam_size, + ctc_weight=cfg.ctc_weight, + decoding_chunk_size=cfg.decoding_chunk_size, + num_decoding_left_chunks=cfg.num_decoding_left_chunks, + simulate_streaming=cfg.simulate_streaming) + self._outputs["result"] = result_transcripts[0][0] + else: + raise Exception("invalid model name") + + def postprocess(self) -> Union[str, os.PathLike]: + """ + Output postprocess and return human-readable results such as texts and audio files. 
+ """ + return self._outputs["result"] + + def download_lm(self, url, lm_dir, md5sum): + download_path = get_path_from_url( + url=url, + root_dir=lm_dir, + md5sum=md5sum, + decompress=False, ) + + def _pcm16to32(self, audio): + assert (audio.dtype == np.int16) + audio = audio.astype("float32") + bits = np.iinfo(np.int16).bits + audio = audio / (2**(bits - 1)) + return audio + + def _pcm32to16(self, audio): + assert (audio.dtype == np.float32) + bits = np.iinfo(np.int16).bits + audio = audio * (2**(bits - 1)) + audio = np.round(audio).astype("int16") + return audio + + def _check(self, audio_file: str, sample_rate: int, force_yes: bool): + self.sample_rate = sample_rate + if self.sample_rate != 16000 and self.sample_rate != 8000: + logger.error( + "invalid sample rate, please input --sr 8000 or --sr 16000") + return False + + if isinstance(audio_file, (str, os.PathLike)): + if not os.path.isfile(audio_file): + logger.error("Please input the right audio file path") + return False + + logger.info("checking the audio file format......") + try: + audio, audio_sample_rate = soundfile.read( + audio_file, dtype="int16", always_2d=True) + audio_duration = audio.shape[0] / audio_sample_rate + max_duration = 50.0 + if audio_duration >= max_duration: + logger.error("Please input audio file less then 50 seconds.\n") + return + except Exception as e: + logger.exception(e) + logger.error( + "can not open the audio file, please check the audio file format is 'wav'. \n \ + you can try to use sox to change the file format.\n \ + For example: \n \ + sample rate: 16k \n \ + sox input_audio.xx --rate 16k --bits 16 --channels 1 output_audio.wav \n \ + sample rate: 8k \n \ + sox input_audio.xx --rate 8k --bits 16 --channels 1 output_audio.wav \n \ + ") + return False + logger.info("The sample rate is %d" % audio_sample_rate) + if audio_sample_rate != self.sample_rate: + logger.warning("The sample rate of the input file is not {}.\n \ + The program will resample the wav file to {}.\n \ + If the result does not meet your expectations,\n \ + Please input the 16k 16 bit 1 channel wav file. \ + ".format(self.sample_rate, self.sample_rate)) + if force_yes is False: + while (True): + logger.info( + "Whether to change the sample rate and the channel. Y: change the sample. N: exit the prgream." + ) + content = input("Input(Y/N):") + if content.strip() == "Y" or content.strip( + ) == "y" or content.strip() == "yes" or content.strip( + ) == "Yes": + logger.info( + "change the sampele rate, channel to 16k and 1 channel" + ) + break + elif content.strip() == "N" or content.strip( + ) == "n" or content.strip() == "no" or content.strip( + ) == "No": + logger.info("Exit the program") + exit(1) + else: + logger.warning("Not regular input, please input again") + + self.change_format = True + else: + logger.info("The audio file format is right") + self.change_format = False + + return True + + def execute(self, argv: List[str]) -> bool: + """ + Command line entry. 
+ """ + parser_args = self.parser.parse_args(argv) + + model = parser_args.model + lang = parser_args.lang + sample_rate = parser_args.sample_rate + config = parser_args.config + ckpt_path = parser_args.ckpt_path + decode_method = parser_args.decode_method + force_yes = parser_args.yes + device = parser_args.device + + if not parser_args.verbose: + self.disable_task_loggers() + + task_source = self.get_task_source(parser_args.input) + task_results = OrderedDict() + has_exceptions = False + + for id_, input_ in task_source.items(): + try: + res = self(input_, model, lang, sample_rate, config, ckpt_path, + decode_method, force_yes, device) + task_results[id_] = res + except Exception as e: + has_exceptions = True + task_results[id_] = f'{e.__class__.__name__}: {e}' + + self.process_task_results(parser_args.input, task_results, + parser_args.job_dump_result) + + if has_exceptions: + return False + else: + return True + + @stats_wrapper + def __call__(self, + audio_file: os.PathLike, + model: str='conformer_wenetspeech', + lang: str='zh', + sample_rate: int=16000, + config: os.PathLike=None, + ckpt_path: os.PathLike=None, + decode_method: str='attention_rescoring', + force_yes: bool=False, + device=paddle.get_device()): + """ + Python API to call an executor. + """ + audio_file = os.path.abspath(audio_file) + if not self._check(audio_file, sample_rate, force_yes): + sys.exit(-1) + paddle.set_device(device) + self._init_from_path(model, lang, sample_rate, config, decode_method, + ckpt_path) + self.preprocess(model, audio_file) + self.infer(model) + res = self.postprocess() # Retrieve result of asr. + + return res diff --git a/ernie-sat/paddlespeech/cli/base_commands.py b/ernie-sat/paddlespeech/cli/base_commands.py new file mode 100644 index 0000000000000000000000000000000000000000..97d5cd7fa3ad30ee2338b50cfd5123fe4cd99d05 --- /dev/null +++ b/ernie-sat/paddlespeech/cli/base_commands.py @@ -0,0 +1,49 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from typing import List + +from .entry import commands +from .utils import cli_register +from .utils import get_command + +__all__ = [ + 'BaseCommand', + 'HelpCommand', +] + + +@cli_register(name='paddlespeech') +class BaseCommand: + def execute(self, argv: List[str]) -> bool: + help = get_command('paddlespeech.help') + return help().execute(argv) + + +@cli_register(name='paddlespeech.help', description='Show help for commands.') +class HelpCommand: + def execute(self, argv: List[str]) -> bool: + msg = 'Usage:\n' + msg += ' paddlespeech \n\n' + msg += 'Commands:\n' + for command, detail in commands['paddlespeech'].items(): + if command.startswith('_'): + continue + + if '_description' not in detail: + continue + msg += ' {:<15} {}\n'.format(command, + detail['_description']) + + print(msg) + return True diff --git a/ernie-sat/paddlespeech/cli/cls/__init__.py b/ernie-sat/paddlespeech/cli/cls/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..13e316f8f687b34743a92bd0723b944741b74516 --- /dev/null +++ b/ernie-sat/paddlespeech/cli/cls/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from .infer import CLSExecutor diff --git a/ernie-sat/paddlespeech/cli/cls/infer.py b/ernie-sat/paddlespeech/cli/cls/infer.py new file mode 100644 index 0000000000000000000000000000000000000000..f56d8a579c5d85a9376748b482897483e5886115 --- /dev/null +++ b/ernie-sat/paddlespeech/cli/cls/infer.py @@ -0,0 +1,295 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse +import os +from collections import OrderedDict +from typing import List +from typing import Optional +from typing import Union + +import numpy as np +import paddle +import yaml + +from ..executor import BaseExecutor +from ..log import logger +from ..utils import cli_register +from ..utils import download_and_decompress +from ..utils import MODEL_HOME +from ..utils import stats_wrapper +from paddleaudio import load +from paddleaudio.features import LogMelSpectrogram +from paddlespeech.s2t.utils.dynamic_import import dynamic_import + +__all__ = ['CLSExecutor'] + +pretrained_models = { + # The tags for pretrained_models should be "{model_name}[_{dataset}][-{lang}][-...]". + # e.g. "conformer_wenetspeech-zh-16k", "transformer_aishell-zh-16k" and "panns_cnn6-32k". 
+ # Command line and python api use "{model_name}[_{dataset}]" as --model, usage: + # "paddlespeech asr --model conformer_wenetspeech --lang zh --sr 16000 --input ./input.wav" + "panns_cnn6-32k": { + 'url': 'https://paddlespeech.bj.bcebos.com/cls/panns_cnn6.tar.gz', + 'md5': '4cf09194a95df024fd12f84712cf0f9c', + 'cfg_path': 'panns.yaml', + 'ckpt_path': 'cnn6.pdparams', + 'label_file': 'audioset_labels.txt', + }, + "panns_cnn10-32k": { + 'url': 'https://paddlespeech.bj.bcebos.com/cls/panns_cnn10.tar.gz', + 'md5': 'cb8427b22176cc2116367d14847f5413', + 'cfg_path': 'panns.yaml', + 'ckpt_path': 'cnn10.pdparams', + 'label_file': 'audioset_labels.txt', + }, + "panns_cnn14-32k": { + 'url': 'https://paddlespeech.bj.bcebos.com/cls/panns_cnn14.tar.gz', + 'md5': 'e3b9b5614a1595001161d0ab95edee97', + 'cfg_path': 'panns.yaml', + 'ckpt_path': 'cnn14.pdparams', + 'label_file': 'audioset_labels.txt', + }, +} + +model_alias = { + "panns_cnn6": "paddlespeech.cls.models.panns:CNN6", + "panns_cnn10": "paddlespeech.cls.models.panns:CNN10", + "panns_cnn14": "paddlespeech.cls.models.panns:CNN14", +} + + +@cli_register( + name='paddlespeech.cls', description='Audio classification infer command.') +class CLSExecutor(BaseExecutor): + def __init__(self): + super(CLSExecutor, self).__init__() + + self.parser = argparse.ArgumentParser( + prog='paddlespeech.cls', add_help=True) + self.parser.add_argument( + '--input', type=str, default=None, help='Audio file to classify.') + self.parser.add_argument( + '--model', + type=str, + default='panns_cnn14', + choices=[tag[:tag.index('-')] for tag in pretrained_models.keys()], + help='Choose model type of cls task.') + self.parser.add_argument( + '--config', + type=str, + default=None, + help='Config of cls task. Use deault config when it is None.') + self.parser.add_argument( + '--ckpt_path', + type=str, + default=None, + help='Checkpoint file of model.') + self.parser.add_argument( + '--label_file', + type=str, + default=None, + help='Label file of cls task.') + self.parser.add_argument( + '--topk', + type=int, + default=1, + help='Return topk scores of classification result.') + self.parser.add_argument( + '--device', + type=str, + default=paddle.get_device(), + help='Choose device to execute model inference.') + self.parser.add_argument( + '-d', + '--job_dump_result', + action='store_true', + help='Save job result into file.') + self.parser.add_argument( + '-v', + '--verbose', + action='store_true', + help='Increase logger verbosity of current task.') + + def _get_pretrained_path(self, tag: str) -> os.PathLike: + """ + Download and returns pretrained resources path of current task. + """ + support_models = list(pretrained_models.keys()) + assert tag in pretrained_models, 'The model "{}" you want to use has not been supported, please choose other models.\nThe support models includes:\n\t\t{}\n'.format( + tag, '\n\t\t'.join(support_models)) + + res_path = os.path.join(MODEL_HOME, tag) + decompressed_path = download_and_decompress(pretrained_models[tag], + res_path) + decompressed_path = os.path.abspath(decompressed_path) + logger.info( + 'Use pretrained model stored in: {}'.format(decompressed_path)) + + return decompressed_path + + def _init_from_path(self, + model_type: str='panns_cnn14', + cfg_path: Optional[os.PathLike]=None, + ckpt_path: Optional[os.PathLike]=None, + label_file: Optional[os.PathLike]=None): + """ + Init model and other resources from a specific path. 
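+        When ckpt_path or label_file is not given, resources are resolved from the
+        pretrained tag "{model_type}-32k" (e.g. panns_cnn14-32k) and downloaded into
+        MODEL_HOME; otherwise the explicit config, checkpoint and label paths are used.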
+ """ + if hasattr(self, 'model'): + logger.info('Model had been initialized.') + return + + if label_file is None or ckpt_path is None: + tag = model_type + '-' + '32k' # panns_cnn14-32k + self.res_path = self._get_pretrained_path(tag) + self.cfg_path = os.path.join(self.res_path, + pretrained_models[tag]['cfg_path']) + self.label_file = os.path.join(self.res_path, + pretrained_models[tag]['label_file']) + self.ckpt_path = os.path.join(self.res_path, + pretrained_models[tag]['ckpt_path']) + else: + self.cfg_path = os.path.abspath(cfg_path) + self.label_file = os.path.abspath(label_file) + self.ckpt_path = os.path.abspath(ckpt_path) + + # config + with open(self.cfg_path, 'r') as f: + self._conf = yaml.safe_load(f) + + # labels + self._label_list = [] + with open(self.label_file, 'r') as f: + for line in f: + self._label_list.append(line.strip()) + + # model + model_class = dynamic_import(model_type, model_alias) + model_dict = paddle.load(self.ckpt_path) + self.model = model_class(extract_embedding=False) + self.model.set_state_dict(model_dict) + self.model.eval() + + def preprocess(self, audio_file: Union[str, os.PathLike]): + """ + Input preprocess and return paddle.Tensor stored in self.input. + Input content can be a text(tts), a file(asr, cls) or a streaming(not supported yet). + """ + feat_conf = self._conf['feature'] + logger.info(feat_conf) + waveform, _ = load( + file=audio_file, + sr=feat_conf['sample_rate'], + mono=True, + dtype='float32') + if isinstance(audio_file, (str, os.PathLike)): + logger.info("Preprocessing audio_file:" + audio_file) + + # Feature extraction + feature_extractor = LogMelSpectrogram( + sr=feat_conf['sample_rate'], + n_fft=feat_conf['n_fft'], + hop_length=feat_conf['hop_length'], + window=feat_conf['window'], + win_length=feat_conf['window_length'], + f_min=feat_conf['f_min'], + f_max=feat_conf['f_max'], + n_mels=feat_conf['n_mels'], ) + feats = feature_extractor( + paddle.to_tensor(paddle.to_tensor(waveform).unsqueeze(0))) + self._inputs['feats'] = paddle.transpose(feats, [0, 2, 1]).unsqueeze( + 1) # [B, N, T] -> [B, 1, T, N] + + @paddle.no_grad() + def infer(self): + """ + Model inference and result stored in self.output. + """ + self._outputs['logits'] = self.model(self._inputs['feats']) + + def _generate_topk_label(self, result: np.ndarray, topk: int) -> str: + assert topk <= len( + self._label_list), 'Value of topk is larger than number of labels.' + + topk_idx = (-result).argsort()[:topk] + ret = '' + for idx in topk_idx: + label, score = self._label_list[idx], result[idx] + ret += f'{label} {score} ' + return ret + + def postprocess(self, topk: int) -> Union[str, os.PathLike]: + """ + Output postprocess and return human-readable results such as texts and audio files. + """ + return self._generate_topk_label( + result=self._outputs['logits'].squeeze(0).numpy(), topk=topk) + + def execute(self, argv: List[str]) -> bool: + """ + Command line entry. 
+ """ + parser_args = self.parser.parse_args(argv) + + model_type = parser_args.model + label_file = parser_args.label_file + cfg_path = parser_args.config + ckpt_path = parser_args.ckpt_path + topk = parser_args.topk + device = parser_args.device + + if not parser_args.verbose: + self.disable_task_loggers() + + task_source = self.get_task_source(parser_args.input) + task_results = OrderedDict() + has_exceptions = False + + for id_, input_ in task_source.items(): + try: + res = self(input_, model_type, cfg_path, ckpt_path, label_file, + topk, device) + task_results[id_] = res + except Exception as e: + has_exceptions = True + task_results[id_] = f'{e.__class__.__name__}: {e}' + + self.process_task_results(parser_args.input, task_results, + parser_args.job_dump_result) + + if has_exceptions: + return False + else: + return True + + @stats_wrapper + def __call__(self, + audio_file: os.PathLike, + model: str='panns_cnn14', + config: Optional[os.PathLike]=None, + ckpt_path: Optional[os.PathLike]=None, + label_file: Optional[os.PathLike]=None, + topk: int=1, + device: str=paddle.get_device()): + """ + Python API to call an executor. + """ + audio_file = os.path.abspath(os.path.expanduser(audio_file)) + paddle.set_device(device) + self._init_from_path(model, config, ckpt_path, label_file) + self.preprocess(audio_file) + self.infer() + res = self.postprocess(topk) # Retrieve result of cls. + + return res diff --git a/ernie-sat/paddlespeech/cli/download.py b/ernie-sat/paddlespeech/cli/download.py new file mode 100644 index 0000000000000000000000000000000000000000..0f09b6fad000f476f3bc38a851f982501a0232ba --- /dev/null +++ b/ernie-sat/paddlespeech/cli/download.py @@ -0,0 +1,329 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import hashlib +import os +import os.path as osp +import shutil +import subprocess +import tarfile +import time +import zipfile + +import requests +from tqdm import tqdm + +from .log import logger + +__all__ = ['get_path_from_url'] + +DOWNLOAD_RETRY_LIMIT = 3 + + +def _is_url(path): + """ + Whether path is URL. + Args: + path (string): URL string or not. 
+ """ + return path.startswith('http://') or path.startswith('https://') + + +def _map_path(url, root_dir): + # parse path after download under root_dir + fname = osp.split(url)[-1] + fpath = fname + return osp.join(root_dir, fpath) + + +def _get_unique_endpoints(trainer_endpoints): + # Sorting is to avoid different environmental variables for each card + trainer_endpoints.sort() + ips = set() + unique_endpoints = set() + for endpoint in trainer_endpoints: + ip = endpoint.split(":")[0] + if ip in ips: + continue + ips.add(ip) + unique_endpoints.add(endpoint) + logger.info("unique_endpoints {}".format(unique_endpoints)) + return unique_endpoints + + +def get_path_from_url(url, + root_dir, + md5sum=None, + check_exist=True, + decompress=True, + method='get'): + """ Download from given url to root_dir. + if file or directory specified by url is exists under + root_dir, return the path directly, otherwise download + from url and decompress it, return the path. + Args: + url (str): download url + root_dir (str): root dir for downloading, it should be + WEIGHTS_HOME or DATASET_HOME + md5sum (str): md5 sum of download package + decompress (bool): decompress zip or tar file. Default is `True` + method (str): which download method to use. Support `wget` and `get`. Default is `get`. + Returns: + str: a local path to save downloaded models & weights & datasets. + """ + + from paddle.fluid.dygraph.parallel import ParallelEnv + + assert _is_url(url), "downloading from {} not a url".format(url) + # parse path after download to decompress under root_dir + fullpath = _map_path(url, root_dir) + # Mainly used to solve the problem of downloading data from different + # machines in the case of multiple machines. Different ips will download + # data, and the same ip will only download data once. 
+ unique_endpoints = _get_unique_endpoints(ParallelEnv().trainer_endpoints[:]) + if osp.exists(fullpath) and check_exist and _md5check(fullpath, md5sum): + logger.info("Found {}".format(fullpath)) + else: + if ParallelEnv().current_endpoint in unique_endpoints: + fullpath = _download(url, root_dir, md5sum, method=method) + else: + while not os.path.exists(fullpath): + time.sleep(1) + + if ParallelEnv().current_endpoint in unique_endpoints: + if decompress and (tarfile.is_tarfile(fullpath) or + zipfile.is_zipfile(fullpath)): + fullpath = _decompress(fullpath) + + return fullpath + + +def _get_download(url, fullname): + # using requests.get method + fname = osp.basename(fullname) + try: + req = requests.get(url, stream=True) + except Exception as e: # requests.exceptions.ConnectionError + logger.info("Downloading {} from {} failed with exception {}".format( + fname, url, str(e))) + return False + + if req.status_code != 200: + raise RuntimeError("Downloading from {} failed with code " + "{}!".format(url, req.status_code)) + + # For protecting download interupted, download to + # tmp_fullname firstly, move tmp_fullname to fullname + # after download finished + tmp_fullname = fullname + "_tmp" + total_size = req.headers.get('content-length') + with open(tmp_fullname, 'wb') as f: + if total_size: + with tqdm(total=(int(total_size) + 1023) // 1024) as pbar: + for chunk in req.iter_content(chunk_size=1024): + f.write(chunk) + pbar.update(1) + else: + for chunk in req.iter_content(chunk_size=1024): + if chunk: + f.write(chunk) + shutil.move(tmp_fullname, fullname) + + return fullname + + +def _wget_download(url, fullname): + # using wget to download url + tmp_fullname = fullname + "_tmp" + # –user-agent + command = 'wget -O {} -t {} {}'.format(tmp_fullname, DOWNLOAD_RETRY_LIMIT, + url) + subprc = subprocess.Popen( + command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + _ = subprc.communicate() + + if subprc.returncode != 0: + raise RuntimeError( + '{} failed. Please make sure `wget` is installed or {} exists'. + format(command, url)) + + shutil.move(tmp_fullname, fullname) + + return fullname + + +_download_methods = { + 'get': _get_download, + 'wget': _wget_download, +} + + +def _download(url, path, md5sum=None, method='get'): + """ + Download from url, save to path. + url (str): download url + path (str): download to given path + md5sum (str): md5 sum of download package + method (str): which download method to use. Support `wget` and `get`. Default is `get`. + """ + assert method in _download_methods, 'make sure `{}` implemented'.format( + method) + + if not osp.exists(path): + os.makedirs(path) + + fname = osp.split(url)[-1] + fullname = osp.join(path, fname) + retry_cnt = 0 + + logger.info("Downloading {} from {}".format(fname, url)) + while not (osp.exists(fullname) and _md5check(fullname, md5sum)): + if retry_cnt < DOWNLOAD_RETRY_LIMIT: + retry_cnt += 1 + else: + raise RuntimeError("Download from {} failed. 
" + "Retry limit reached".format(url)) + + if not _download_methods[method](url, fullname): + time.sleep(1) + continue + + return fullname + + +def _md5check(fullname, md5sum=None): + if md5sum is None: + return True + + logger.info("File {} md5 checking...".format(fullname)) + md5 = hashlib.md5() + with open(fullname, 'rb') as f: + for chunk in iter(lambda: f.read(4096), b""): + md5.update(chunk) + calc_md5sum = md5.hexdigest() + + if calc_md5sum != md5sum: + logger.info("File {} md5 check failed, {}(calc) != " + "{}(base)".format(fullname, calc_md5sum, md5sum)) + return False + return True + + +def _decompress(fname): + """ + Decompress for zip and tar file + """ + logger.info("Decompressing {}...".format(fname)) + + # For protecting decompressing interupted, + # decompress to fpath_tmp directory firstly, if decompress + # successed, move decompress files to fpath and delete + # fpath_tmp and remove download compress file. + + if tarfile.is_tarfile(fname): + uncompressed_path = _uncompress_file_tar(fname) + elif zipfile.is_zipfile(fname): + uncompressed_path = _uncompress_file_zip(fname) + else: + raise TypeError("Unsupport compress file type {}".format(fname)) + + return uncompressed_path + + +def _uncompress_file_zip(filepath): + files = zipfile.ZipFile(filepath, 'r') + file_list = files.namelist() + + file_dir = os.path.dirname(filepath) + + if _is_a_single_file(file_list): + rootpath = file_list[0] + uncompressed_path = os.path.join(file_dir, rootpath) + + for item in file_list: + files.extract(item, file_dir) + + elif _is_a_single_dir(file_list): + rootpath = os.path.splitext(file_list[0])[0].split(os.sep)[0] + uncompressed_path = os.path.join(file_dir, rootpath) + + for item in file_list: + files.extract(item, file_dir) + + else: + rootpath = os.path.splitext(filepath)[0].split(os.sep)[-1] + uncompressed_path = os.path.join(file_dir, rootpath) + if not os.path.exists(uncompressed_path): + os.makedirs(uncompressed_path) + for item in file_list: + files.extract(item, os.path.join(file_dir, rootpath)) + + files.close() + + return uncompressed_path + + +def _uncompress_file_tar(filepath, mode="r:*"): + files = tarfile.open(filepath, mode) + file_list = files.getnames() + + file_dir = os.path.dirname(filepath) + + if _is_a_single_file(file_list): + rootpath = file_list[0] + uncompressed_path = os.path.join(file_dir, rootpath) + for item in file_list: + files.extract(item, file_dir) + elif _is_a_single_dir(file_list): + rootpath = os.path.splitext(file_list[0])[0].split(os.sep)[-1] + uncompressed_path = os.path.join(file_dir, rootpath) + for item in file_list: + files.extract(item, file_dir) + else: + rootpath = os.path.splitext(filepath)[0].split(os.sep)[-1] + uncompressed_path = os.path.join(file_dir, rootpath) + if not os.path.exists(uncompressed_path): + os.makedirs(uncompressed_path) + + for item in file_list: + files.extract(item, os.path.join(file_dir, rootpath)) + + files.close() + + return uncompressed_path + + +def _is_a_single_file(file_list): + if len(file_list) == 1 and file_list[0].find(os.sep) < -1: + return True + return False + + +def _is_a_single_dir(file_list): + new_file_list = [] + for file_path in file_list: + if '/' in file_path: + file_path = file_path.replace('/', os.sep) + elif '\\' in file_path: + file_path = file_path.replace('\\', os.sep) + new_file_list.append(file_path) + + file_name = new_file_list[0].split(os.sep)[0] + for i in range(1, len(new_file_list)): + if file_name != new_file_list[i].split(os.sep)[0]: + return False + return True diff --git 
a/ernie-sat/paddlespeech/cli/entry.py b/ernie-sat/paddlespeech/cli/entry.py new file mode 100644 index 0000000000000000000000000000000000000000..32123ece750457dac8ca90aff1a8731fea569188 --- /dev/null +++ b/ernie-sat/paddlespeech/cli/entry.py @@ -0,0 +1,41 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import sys +from collections import defaultdict + +__all__ = ['commands'] + + +def _CommandDict(): + return defaultdict(_CommandDict) + + +def _execute(): + com = commands + + idx = 0 + for _argv in (['paddlespeech'] + sys.argv[1:]): + if _argv not in com: + break + idx += 1 + com = com[_argv] + + # The method 'execute' of a command instance returns 'True' for a success + # while 'False' for a failure. Here converts this result into a exit status + # in bash: 0 for a success and 1 for a failure. + status = 0 if com['_entry']().execute(sys.argv[idx:]) else 1 + return status + + +commands = _CommandDict() diff --git a/ernie-sat/paddlespeech/cli/executor.py b/ernie-sat/paddlespeech/cli/executor.py new file mode 100644 index 0000000000000000000000000000000000000000..064939a85da7a87ac0de8d68c8729d78a5c2125c --- /dev/null +++ b/ernie-sat/paddlespeech/cli/executor.py @@ -0,0 +1,229 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import logging +import os +import sys +from abc import ABC +from abc import abstractmethod +from collections import OrderedDict +from typing import Any +from typing import Dict +from typing import List +from typing import Union + +import paddle + +from .log import logger + + +class BaseExecutor(ABC): + """ + An abstract executor of paddlespeech tasks. + """ + + def __init__(self): + self._inputs = OrderedDict() + self._outputs = OrderedDict() + + @abstractmethod + def _get_pretrained_path(self, tag: str) -> os.PathLike: + """ + Download and returns pretrained resources path of current task. + + Args: + tag (str): A tag of pretrained model. + + Returns: + os.PathLike: The path on which resources of pretrained model locate. + """ + pass + + @abstractmethod + def _init_from_path(self, *args, **kwargs): + """ + Init model and other resources from arguments. This method should be called by `__call__()`. + """ + pass + + @abstractmethod + def preprocess(self, input: Any, *args, **kwargs): + """ + Input preprocess and return paddle.Tensor stored in self._inputs. 
+ Input content can be a text(tts), a file(asr, cls), a stream(not supported yet) or anything needed. + + Args: + input (Any): Input text/file/stream or other content. + """ + pass + + @paddle.no_grad() + @abstractmethod + def infer(self, *args, **kwargs): + """ + Model inference and put results into self._outputs. + This method get input tensors from self._inputs, and write output tensors into self._outputs. + """ + pass + + @abstractmethod + def postprocess(self, *args, **kwargs) -> Union[str, os.PathLike]: + """ + Output postprocess and return results. + This method get model output from self._outputs and convert it into human-readable results. + + Returns: + Union[str, os.PathLike]: Human-readable results such as texts and audio files. + """ + pass + + @abstractmethod + def execute(self, argv: List[str]) -> bool: + """ + Command line entry. This method can only be accessed by a command line such as `paddlespeech asr`. + + Args: + argv (List[str]): Arguments from command line. + + Returns: + int: Result of the command execution. `True` for a success and `False` for a failure. + """ + pass + + @abstractmethod + def __call__(self, *arg, **kwargs): + """ + Python API to call an executor. + """ + pass + + def get_task_source(self, input_: Union[str, os.PathLike, None] + ) -> Dict[str, Union[str, os.PathLike]]: + """ + Get task input source from command line input. + + Args: + input_ (Union[str, os.PathLike, None]): Input from command line. + + Returns: + Dict[str, Union[str, os.PathLike]]: A dict with ids and inputs. + """ + if self._is_job_input(input_): + ret = self._get_job_contents(input_) + else: + ret = OrderedDict() + + if input_ is None: # Take input from stdin + for i, line in enumerate(sys.stdin): + line = line.strip() + if len(line.split(' ')) == 1: + ret[str(i + 1)] = line + elif len(line.split(' ')) == 2: + id_, info = line.split(' ') + ret[id_] = info + else: # No valid input info from one line. + continue + else: + ret[1] = input_ + return ret + + def process_task_results(self, + input_: Union[str, os.PathLike, None], + results: Dict[str, os.PathLike], + job_dump_result: bool=False): + """ + Handling task results and redirect stdout if needed. + + Args: + input_ (Union[str, os.PathLike, None]): Input from command line. + results (Dict[str, os.PathLike]): Task outputs. + job_dump_result (bool, optional): if True, dumps job results into file. Defaults to False. + """ + + if not self._is_job_input(input_) and len( + results) == 1: # Only one input sample + raw_text = list(results.values())[0] + else: + raw_text = self._format_task_results(results) + + print(raw_text, end='') # Stdout + + if self._is_job_input( + input_) and job_dump_result: # Dump to *.job.done + try: + job_output_file = os.path.abspath(input_) + '.done' + sys.stdout = open(job_output_file, 'w') + print(raw_text, end='') + logger.info(f'Results had been saved to: {job_output_file}') + finally: + sys.stdout.close() + + def _is_job_input(self, input_: Union[str, os.PathLike]) -> bool: + """ + Check if current input file is a job input or not. + + Args: + input_ (Union[str, os.PathLike]): Input file of current task. + + Returns: + bool: return `True` for job input, `False` otherwise. + """ + return input_ and os.path.isfile(input_) and (input_.endswith('.job') or + input_.endswith('.txt')) + + def _get_job_contents( + self, job_input: os.PathLike) -> Dict[str, Union[str, os.PathLike]]: + """ + Read a job input file and return its contents in a dictionary. + + Args: + job_input (os.PathLike): The job input file. 
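+            Each non-empty line must hold exactly one "<id> <input>" pair separated
+            by a single space, matching the format accepted by get_task_source().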
+ + Returns: + Dict[str, str]: Contents of job input. + """ + job_contents = OrderedDict() + with open(job_input) as f: + for line in f: + line = line.strip() + if not line: + continue + k, v = line.split(' ') + job_contents[k] = v + return job_contents + + def _format_task_results( + self, results: Dict[str, Union[str, os.PathLike]]) -> str: + """ + Convert task results to raw text. + + Args: + results (Dict[str, str]): A dictionary of task results. + + Returns: + str: A string object contains task results. + """ + ret = '' + for k, v in results.items(): + ret += f'{k} {v}\n' + return ret + + def disable_task_loggers(self): + """ + Disable all loggers in current task. + """ + loggers = [ + logging.getLogger(name) for name in logging.root.manager.loggerDict + ] + for l in loggers: + l.disabled = True diff --git a/ernie-sat/paddlespeech/cli/log.py b/ernie-sat/paddlespeech/cli/log.py new file mode 100644 index 0000000000000000000000000000000000000000..8644064c73ef407476e7870e65d1149019762723 --- /dev/null +++ b/ernie-sat/paddlespeech/cli/log.py @@ -0,0 +1,59 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import functools +import logging + +__all__ = [ + 'logger', +] + + +class Logger(object): + def __init__(self, name: str=None): + name = 'PaddleSpeech' if not name else name + self.logger = logging.getLogger(name) + + log_config = { + 'DEBUG': 10, + 'INFO': 20, + 'TRAIN': 21, + 'EVAL': 22, + 'WARNING': 30, + 'ERROR': 40, + 'CRITICAL': 50, + 'EXCEPTION': 100, + } + for key, level in log_config.items(): + logging.addLevelName(level, key) + if key == 'EXCEPTION': + self.__dict__[key.lower()] = self.logger.exception + else: + self.__dict__[key.lower()] = functools.partial(self.__call__, + level) + + self.format = logging.Formatter( + fmt='[%(asctime)-15s] [%(levelname)8s] - %(message)s') + + self.handler = logging.StreamHandler() + self.handler.setFormatter(self.format) + + self.logger.addHandler(self.handler) + self.logger.setLevel(logging.DEBUG) + self.logger.propagate = False + + def __call__(self, log_level: str, msg: str): + self.logger.log(log_level, msg) + + +logger = Logger() diff --git a/ernie-sat/paddlespeech/cli/st/__init__.py b/ernie-sat/paddlespeech/cli/st/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8cdb4e34826d50d9898d79e2ffe09c369026b3e5 --- /dev/null +++ b/ernie-sat/paddlespeech/cli/st/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +from .infer import STExecutor diff --git a/ernie-sat/paddlespeech/cli/st/infer.py b/ernie-sat/paddlespeech/cli/st/infer.py new file mode 100644 index 0000000000000000000000000000000000000000..e64fc57d1bf2574a016e2655aa021a919e59ab98 --- /dev/null +++ b/ernie-sat/paddlespeech/cli/st/infer.py @@ -0,0 +1,380 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse +import os +import subprocess +from collections import OrderedDict +from typing import List +from typing import Optional +from typing import Union + +import kaldiio +import numpy as np +import paddle +import soundfile +from kaldiio import WriteHelper +from yacs.config import CfgNode + +from ..executor import BaseExecutor +from ..log import logger +from ..utils import cli_register +from ..utils import download_and_decompress +from ..utils import MODEL_HOME +from ..utils import stats_wrapper +from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer +from paddlespeech.s2t.utils.dynamic_import import dynamic_import +from paddlespeech.s2t.utils.utility import UpdateConfig + +__all__ = ["STExecutor"] + +pretrained_models = { + "fat_st_ted-en-zh": { + "url": + "https://paddlespeech.bj.bcebos.com/s2t/ted_en_zh/st1/st1_transformer_mtl_noam_ted-en-zh_ckpt_0.1.1.model.tar.gz", + "md5": + "d62063f35a16d91210a71081bd2dd557", + "cfg_path": + "model.yaml", + "ckpt_path": + "exp/transformer_mtl_noam/checkpoints/fat_st_ted-en-zh.pdparams", + } +} + +model_alias = {"fat_st": "paddlespeech.s2t.models.u2_st:U2STModel"} + +kaldi_bins = { + "url": + "https://paddlespeech.bj.bcebos.com/s2t/ted_en_zh/st1/kaldi_bins.tar.gz", + "md5": + "c0682303b3f3393dbf6ed4c4e35a53eb", +} + + +@cli_register( + name="paddlespeech.st", description="Speech translation infer command.") +class STExecutor(BaseExecutor): + def __init__(self): + super(STExecutor, self).__init__() + + self.parser = argparse.ArgumentParser( + prog="paddlespeech.st", add_help=True) + self.parser.add_argument( + "--input", type=str, default=None, help="Audio file to translate.") + self.parser.add_argument( + "--model", + type=str, + default="fat_st_ted", + choices=[tag[:tag.index('-')] for tag in pretrained_models.keys()], + help="Choose model type of st task.") + self.parser.add_argument( + "--src_lang", + type=str, + default="en", + help="Choose model source language.") + self.parser.add_argument( + "--tgt_lang", + type=str, + default="zh", + help="Choose model target language.") + self.parser.add_argument( + "--sample_rate", + type=int, + default=16000, + choices=[16000], + help='Choose the audio sample rate of the model. 8000 or 16000') + self.parser.add_argument( + "--config", + type=str, + default=None, + help="Config of st task. 
Use deault config when it is None.") + self.parser.add_argument( + "--ckpt_path", + type=str, + default=None, + help="Checkpoint file of model.") + self.parser.add_argument( + "--device", + type=str, + default=paddle.get_device(), + help="Choose device to execute model inference.") + self.parser.add_argument( + '-d', + '--job_dump_result', + action='store_true', + help='Save job result into file.') + self.parser.add_argument( + '-v', + '--verbose', + action='store_true', + help='Increase logger verbosity of current task.') + + def _get_pretrained_path(self, tag: str) -> os.PathLike: + """ + Download and returns pretrained resources path of current task. + """ + support_models = list(pretrained_models.keys()) + assert tag in pretrained_models, 'The model "{}" you want to use has not been supported, please choose other models.\nThe support models includes:\n\t\t{}\n'.format( + tag, '\n\t\t'.join(support_models)) + + res_path = os.path.join(MODEL_HOME, tag) + decompressed_path = download_and_decompress(pretrained_models[tag], + res_path) + decompressed_path = os.path.abspath(decompressed_path) + logger.info( + "Use pretrained model stored in: {}".format(decompressed_path)) + + return decompressed_path + + def _set_kaldi_bins(self) -> os.PathLike: + """ + Download and returns kaldi_bins resources path of current task. + """ + decompressed_path = download_and_decompress(kaldi_bins, MODEL_HOME) + decompressed_path = os.path.abspath(decompressed_path) + logger.info("Kaldi_bins stored in: {}".format(decompressed_path)) + if "LD_LIBRARY_PATH" in os.environ: + os.environ["LD_LIBRARY_PATH"] += f":{decompressed_path}" + else: + os.environ["LD_LIBRARY_PATH"] = f"{decompressed_path}" + os.environ["PATH"] += f":{decompressed_path}" + return decompressed_path + + def _init_from_path(self, + model_type: str="fat_st_ted", + src_lang: str="en", + tgt_lang: str="zh", + cfg_path: Optional[os.PathLike]=None, + ckpt_path: Optional[os.PathLike]=None): + """ + Init model and other resources from a specific path. + """ + if hasattr(self, 'model'): + logger.info('Model had been initialized.') + return + + if cfg_path is None or ckpt_path is None: + tag = model_type + "-" + src_lang + "-" + tgt_lang + res_path = self._get_pretrained_path(tag) + self.cfg_path = os.path.join(res_path, + pretrained_models[tag]["cfg_path"]) + self.ckpt_path = os.path.join(res_path, + pretrained_models[tag]["ckpt_path"]) + logger.info(res_path) + logger.info(self.cfg_path) + logger.info(self.ckpt_path) + else: + self.cfg_path = os.path.abspath(cfg_path) + self.ckpt_path = os.path.abspath(ckpt_path) + res_path = os.path.dirname( + os.path.dirname(os.path.abspath(self.cfg_path))) + + #Init body. 
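+        # Load the yacs config, force full-sentence decoding, and resolve the cmvn
+        # and sentencepiece model paths relative to the resource directory.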
+ self.config = CfgNode(new_allowed=True) + self.config.merge_from_file(self.cfg_path) + self.config.decode.decoding_method = "fullsentence" + + with UpdateConfig(self.config): + self.config.cmvn_path = os.path.join(res_path, + self.config.cmvn_path) + self.config.spm_model_prefix = os.path.join( + res_path, self.config.spm_model_prefix) + self.text_feature = TextFeaturizer( + unit_type=self.config.unit_type, + vocab=self.config.vocab_filepath, + spm_model_prefix=self.config.spm_model_prefix) + + model_conf = self.config + model_name = model_type[:model_type.rindex( + '_')] # model_type: {model_name}_{dataset} + model_class = dynamic_import(model_name, model_alias) + self.model = model_class.from_config(model_conf) + self.model.eval() + + # load model + params_path = self.ckpt_path + model_dict = paddle.load(params_path) + self.model.set_state_dict(model_dict) + + # set kaldi bins + self._set_kaldi_bins() + + def _check(self, audio_file: str, sample_rate: int): + _, audio_sample_rate = soundfile.read( + audio_file, dtype="int16", always_2d=True) + if audio_sample_rate != sample_rate: + raise Exception("invalid sample rate") + sys.exit(-1) + + def preprocess(self, wav_file: Union[str, os.PathLike], model_type: str): + """ + Input preprocess and return paddle.Tensor stored in self.input. + Input content can be a file(wav). + """ + audio_file = os.path.abspath(wav_file) + logger.info("Preprocess audio_file:" + audio_file) + + if "fat_st" in model_type: + cmvn = self.config.cmvn_path + utt_name = "_tmp" + + # Get the object for feature extraction + fbank_extract_command = [ + "compute-fbank-feats", "--num-mel-bins=80", "--verbose=2", + "--sample-frequency=16000", "scp:-", "ark:-" + ] + fbank_extract_process = subprocess.Popen( + fbank_extract_command, + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + fbank_extract_process.stdin.write( + f"{utt_name} {wav_file}".encode("utf8")) + fbank_extract_process.stdin.close() + fbank_feat = dict( + kaldiio.load_ark(fbank_extract_process.stdout))[utt_name] + + extract_command = ["compute-kaldi-pitch-feats", "scp:-", "ark:-"] + pitch_extract_process = subprocess.Popen( + extract_command, + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + pitch_extract_process.stdin.write( + f"{utt_name} {wav_file}".encode("utf8")) + process_command = ["process-kaldi-pitch-feats", "ark:", "ark:-"] + pitch_process = subprocess.Popen( + process_command, + stdin=pitch_extract_process.stdout, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + pitch_extract_process.stdin.close() + pitch_feat = dict(kaldiio.load_ark(pitch_process.stdout))[utt_name] + concated_feat = np.concatenate((fbank_feat, pitch_feat), axis=1) + raw_feat = f"{utt_name}.raw" + with WriteHelper( + f"ark,scp:{raw_feat}.ark,{raw_feat}.scp") as writer: + writer(utt_name, concated_feat) + cmvn_command = [ + "apply-cmvn", "--norm-vars=true", cmvn, f"scp:{raw_feat}.scp", + "ark:-" + ] + cmvn_process = subprocess.Popen( + cmvn_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + process_command = [ + "copy-feats", "--compress=true", "ark:-", "ark:-" + ] + process = subprocess.Popen( + process_command, + stdin=cmvn_process.stdout, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + norm_feat = dict(kaldiio.load_ark(process.stdout))[utt_name] + self._inputs["audio"] = paddle.to_tensor(norm_feat).unsqueeze(0) + self._inputs["audio_len"] = paddle.to_tensor( + self._inputs["audio"].shape[1], dtype="int64") + else: + raise ValueError("Wrong model type.") + + 
@paddle.no_grad() + def infer(self, model_type: str): + """ + Model inference and result stored in self.output. + """ + cfg = self.config.decode + audio = self._inputs["audio"] + audio_len = self._inputs["audio_len"] + if model_type == "fat_st_ted": + hyps = self.model.decode( + audio, + audio_len, + text_feature=self.text_feature, + decoding_method=cfg.decoding_method, + beam_size=cfg.beam_size, + word_reward=cfg.word_reward, + decoding_chunk_size=cfg.decoding_chunk_size, + num_decoding_left_chunks=cfg.num_decoding_left_chunks, + simulate_streaming=cfg.simulate_streaming) + self._outputs["result"] = hyps + else: + raise ValueError("Wrong model type.") + + def postprocess(self, model_type: str) -> Union[str, os.PathLike]: + """ + Output postprocess and return human-readable results such as texts and audio files. + """ + if model_type == "fat_st_ted": + return self._outputs["result"] + else: + raise ValueError("Wrong model type.") + + def execute(self, argv: List[str]) -> bool: + """ + Command line entry. + """ + parser_args = self.parser.parse_args(argv) + + model = parser_args.model + src_lang = parser_args.src_lang + tgt_lang = parser_args.tgt_lang + sample_rate = parser_args.sample_rate + config = parser_args.config + ckpt_path = parser_args.ckpt_path + device = parser_args.device + + if not parser_args.verbose: + self.disable_task_loggers() + + task_source = self.get_task_source(parser_args.input) + task_results = OrderedDict() + has_exceptions = False + + for id_, input_ in task_source.items(): + try: + res = self(input_, model, src_lang, tgt_lang, sample_rate, + config, ckpt_path, device) + task_results[id_] = res + except Exception as e: + has_exceptions = True + task_results[id_] = f'{e.__class__.__name__}: {e}' + + self.process_task_results(parser_args.input, task_results, + parser_args.job_dump_result) + + if has_exceptions: + return False + else: + return True + + @stats_wrapper + def __call__(self, + audio_file: os.PathLike, + model: str='fat_st_ted', + src_lang: str='en', + tgt_lang: str='zh', + sample_rate: int=16000, + config: Optional[os.PathLike]=None, + ckpt_path: Optional[os.PathLike]=None, + device: str=paddle.get_device()): + """ + Python API to call an executor. + """ + audio_file = os.path.abspath(audio_file) + self._check(audio_file, sample_rate) + paddle.set_device(device) + self._init_from_path(model, src_lang, tgt_lang, config, ckpt_path) + self.preprocess(audio_file, model) + self.infer(model) + res = self.postprocess(model) + + return res diff --git a/ernie-sat/paddlespeech/cli/stats/__init__.py b/ernie-sat/paddlespeech/cli/stats/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..9fe6c4abaf10de2f24f751ddd62f456768a82475 --- /dev/null +++ b/ernie-sat/paddlespeech/cli/stats/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
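The speech-translation executor defined above follows the same four-step flow. A minimal Python sketch, using the defaults from its `__call__` signature (the wav path is hypothetical; `fat_st_ted` expects 16 kHz English audio):

```python
from paddlespeech.cli.st.infer import STExecutor

st = STExecutor()
# model='fat_st_ted', src_lang='en', tgt_lang='zh', sample_rate=16000 are the defaults.
translation = st(audio_file='./en.wav')  # hypothetical 16 kHz English recording
print(translation)
```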
+from .infer import StatsExecutor diff --git a/ernie-sat/paddlespeech/cli/stats/infer.py b/ernie-sat/paddlespeech/cli/stats/infer.py new file mode 100644 index 0000000000000000000000000000000000000000..4ef50449c37e08c1a3c5f9b8894a5b4141e1c33f --- /dev/null +++ b/ernie-sat/paddlespeech/cli/stats/infer.py @@ -0,0 +1,193 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse +from typing import List + +from prettytable import PrettyTable + +from ..log import logger +from ..utils import cli_register +from ..utils import stats_wrapper + +__all__ = ['StatsExecutor'] + +model_name_format = { + 'asr': 'Model-Language-Sample Rate', + 'cls': 'Model-Sample Rate', + 'st': 'Model-Source language-Target language', + 'text': 'Model-Task-Language', + 'tts': 'Model-Language' +} + + +@cli_register( + name='paddlespeech.stats', + description='Get speech tasks support models list.') +class StatsExecutor(): + def __init__(self): + super(StatsExecutor, self).__init__() + + self.parser = argparse.ArgumentParser( + prog='paddlespeech.stats', add_help=True) + self.parser.add_argument( + '--task', + type=str, + default='asr', + choices=['asr', 'cls', 'st', 'text', 'tts'], + help='Choose speech task.', + required=True) + self.task_choices = ['asr', 'cls', 'st', 'text', 'tts'] + + def show_support_models(self, pretrained_models: dict): + fields = model_name_format[self.task].split("-") + table = PrettyTable(fields) + for key in pretrained_models: + table.add_row(key.split("-")) + print(table) + + def execute(self, argv: List[str]) -> bool: + """ + Command line entry. 
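+        Illustrative invocation: paddlespeech stats --task asr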
+ """ + parser_args = self.parser.parse_args(argv) + self.task = parser_args.task + if self.task not in self.task_choices: + logger.error( + "Please input correct speech task, choices = ['asr', 'cls', 'st', 'text', 'tts']" + ) + return False + + elif self.task == 'asr': + try: + from ..asr.infer import pretrained_models + logger.info( + "Here is the list of ASR pretrained models released by PaddleSpeech that can be used by command line and python API" + ) + self.show_support_models(pretrained_models) + return True + except BaseException: + logger.error("Failed to get the list of ASR pretrained models.") + return False + + elif self.task == 'cls': + try: + from ..cls.infer import pretrained_models + logger.info( + "Here is the list of CLS pretrained models released by PaddleSpeech that can be used by command line and python API" + ) + self.show_support_models(pretrained_models) + return True + except BaseException: + logger.error("Failed to get the list of CLS pretrained models.") + return False + + elif self.task == 'st': + try: + from ..st.infer import pretrained_models + logger.info( + "Here is the list of ST pretrained models released by PaddleSpeech that can be used by command line and python API" + ) + self.show_support_models(pretrained_models) + return True + except BaseException: + logger.error("Failed to get the list of ST pretrained models.") + return False + + elif self.task == 'text': + try: + from ..text.infer import pretrained_models + logger.info( + "Here is the list of TEXT pretrained models released by PaddleSpeech that can be used by command line and python API" + ) + self.show_support_models(pretrained_models) + return True + except BaseException: + logger.error( + "Failed to get the list of TEXT pretrained models.") + return False + + elif self.task == 'tts': + try: + from ..tts.infer import pretrained_models + logger.info( + "Here is the list of TTS pretrained models released by PaddleSpeech that can be used by command line and python API" + ) + self.show_support_models(pretrained_models) + return True + except BaseException: + logger.error("Failed to get the list of TTS pretrained models.") + return False + + @stats_wrapper + def __call__( + self, + task: str=None, ): + """ + Python API to call an executor. 
+ """ + self.task = task + if self.task not in self.task_choices: + print( + "Please input correct speech task, choices = ['asr', 'cls', 'st', 'text', 'tts']" + ) + + elif self.task == 'asr': + try: + from ..asr.infer import pretrained_models + print( + "Here is the list of ASR pretrained models released by PaddleSpeech that can be used by command line and python API" + ) + self.show_support_models(pretrained_models) + except BaseException: + print("Failed to get the list of ASR pretrained models.") + + elif self.task == 'cls': + try: + from ..cls.infer import pretrained_models + print( + "Here is the list of CLS pretrained models released by PaddleSpeech that can be used by command line and python API" + ) + self.show_support_models(pretrained_models) + except BaseException: + print("Failed to get the list of CLS pretrained models.") + + elif self.task == 'st': + try: + from ..st.infer import pretrained_models + print( + "Here is the list of ST pretrained models released by PaddleSpeech that can be used by command line and python API" + ) + self.show_support_models(pretrained_models) + except BaseException: + print("Failed to get the list of ST pretrained models.") + + elif self.task == 'text': + try: + from ..text.infer import pretrained_models + print( + "Here is the list of TEXT pretrained models released by PaddleSpeech that can be used by command line and python API" + ) + self.show_support_models(pretrained_models) + except BaseException: + print("Failed to get the list of TEXT pretrained models.") + + elif self.task == 'tts': + try: + from ..tts.infer import pretrained_models + print( + "Here is the list of TTS pretrained models released by PaddleSpeech that can be used by command line and python API" + ) + self.show_support_models(pretrained_models) + except BaseException: + print("Failed to get the list of TTS pretrained models.") diff --git a/ernie-sat/paddlespeech/cli/text/__init__.py b/ernie-sat/paddlespeech/cli/text/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..f4573fa15c4fee5bc63913cf6a62552c2219bc57 --- /dev/null +++ b/ernie-sat/paddlespeech/cli/text/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from .infer import TextExecutor diff --git a/ernie-sat/paddlespeech/cli/text/infer.py b/ernie-sat/paddlespeech/cli/text/infer.py new file mode 100644 index 0000000000000000000000000000000000000000..dcf306c69f850bbcf13c08b81c1bb906141c71ea --- /dev/null +++ b/ernie-sat/paddlespeech/cli/text/infer.py @@ -0,0 +1,322 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse +import os +import re +from collections import OrderedDict +from typing import List +from typing import Optional +from typing import Union + +import paddle + +from ...s2t.utils.dynamic_import import dynamic_import +from ..executor import BaseExecutor +from ..log import logger +from ..utils import cli_register +from ..utils import download_and_decompress +from ..utils import MODEL_HOME +from ..utils import stats_wrapper + +__all__ = ['TextExecutor'] + +pretrained_models = { + # The tags for pretrained_models should be "{model_name}[_{dataset}][-{lang}][-...]". + # e.g. "conformer_wenetspeech-zh-16k", "transformer_aishell-zh-16k" and "panns_cnn6-32k". + # Command line and python api use "{model_name}[_{dataset}]" as --model, usage: + # "paddlespeech asr --model conformer_wenetspeech --lang zh --sr 16000 --input ./input.wav" + "ernie_linear_p7_wudao-punc-zh": { + 'url': + 'https://paddlespeech.bj.bcebos.com/text/ernie_linear_p7_wudao-punc-zh.tar.gz', + 'md5': + '12283e2ddde1797c5d1e57036b512746', + 'cfg_path': + 'ckpt/model_config.json', + 'ckpt_path': + 'ckpt/model_state.pdparams', + 'vocab_file': + 'punc_vocab.txt', + }, + "ernie_linear_p3_wudao-punc-zh": { + 'url': + 'https://paddlespeech.bj.bcebos.com/text/ernie_linear_p3_wudao-punc-zh.tar.gz', + 'md5': + '448eb2fdf85b6a997e7e652e80c51dd2', + 'cfg_path': + 'ckpt/model_config.json', + 'ckpt_path': + 'ckpt/model_state.pdparams', + 'vocab_file': + 'punc_vocab.txt', + }, +} + +model_alias = { + "ernie_linear_p7": "paddlespeech.text.models:ErnieLinear", + "ernie_linear_p3": "paddlespeech.text.models:ErnieLinear", +} + +tokenizer_alias = { + "ernie_linear_p7": "paddlenlp.transformers:ErnieTokenizer", + "ernie_linear_p3": "paddlenlp.transformers:ErnieTokenizer", +} + + +@cli_register(name='paddlespeech.text', description='Text infer command.') +class TextExecutor(BaseExecutor): + def __init__(self): + super(TextExecutor, self).__init__() + + self.parser = argparse.ArgumentParser( + prog='paddlespeech.text', add_help=True) + self.parser.add_argument( + '--input', type=str, default=None, help='Input text.') + self.parser.add_argument( + '--task', + type=str, + default='punc', + choices=['punc'], + help='Choose text task.') + self.parser.add_argument( + '--model', + type=str, + default='ernie_linear_p7_wudao', + choices=[tag[:tag.index('-')] for tag in pretrained_models.keys()], + help='Choose model type of text task.') + self.parser.add_argument( + '--lang', + type=str, + default='zh', + choices=['zh', 'en'], + help='Choose model language.') + self.parser.add_argument( + '--config', + type=str, + default=None, + help='Config of cls task. 
Use deault config when it is None.') + self.parser.add_argument( + '--ckpt_path', + type=str, + default=None, + help='Checkpoint file of model.') + self.parser.add_argument( + '--punc_vocab', + type=str, + default=None, + help='Vocabulary file of punctuation restoration task.') + self.parser.add_argument( + '--device', + type=str, + default=paddle.get_device(), + help='Choose device to execute model inference.') + self.parser.add_argument( + '-d', + '--job_dump_result', + action='store_true', + help='Save job result into file.') + self.parser.add_argument( + '-v', + '--verbose', + action='store_true', + help='Increase logger verbosity of current task.') + + def _get_pretrained_path(self, tag: str) -> os.PathLike: + """ + Download and returns pretrained resources path of current task. + """ + support_models = list(pretrained_models.keys()) + assert tag in pretrained_models, 'The model "{}" you want to use has not been supported, please choose other models.\nThe support models includes:\n\t\t{}\n'.format( + tag, '\n\t\t'.join(support_models)) + + res_path = os.path.join(MODEL_HOME, tag) + decompressed_path = download_and_decompress(pretrained_models[tag], + res_path) + decompressed_path = os.path.abspath(decompressed_path) + logger.info( + 'Use pretrained model stored in: {}'.format(decompressed_path)) + + return decompressed_path + + def _init_from_path(self, + task: str='punc', + model_type: str='ernie_linear_p7_wudao', + lang: str='zh', + cfg_path: Optional[os.PathLike]=None, + ckpt_path: Optional[os.PathLike]=None, + vocab_file: Optional[os.PathLike]=None): + """ + Init model and other resources from a specific path. + """ + if hasattr(self, 'model'): + logger.info('Model had been initialized.') + return + + self.task = task + + if cfg_path is None or ckpt_path is None or vocab_file is None: + tag = '-'.join([model_type, task, lang]) + self.res_path = self._get_pretrained_path(tag) + self.cfg_path = os.path.join(self.res_path, + pretrained_models[tag]['cfg_path']) + self.ckpt_path = os.path.join(self.res_path, + pretrained_models[tag]['ckpt_path']) + self.vocab_file = os.path.join(self.res_path, + pretrained_models[tag]['vocab_file']) + else: + self.cfg_path = os.path.abspath(cfg_path) + self.ckpt_path = os.path.abspath(ckpt_path) + self.vocab_file = os.path.abspath(vocab_file) + + model_name = model_type[:model_type.rindex('_')] + if self.task == 'punc': + # punc list + self._punc_list = [] + with open(self.vocab_file, 'r') as f: + for line in f: + self._punc_list.append(line.strip()) + + # model + model_class = dynamic_import(model_name, model_alias) + tokenizer_class = dynamic_import(model_name, tokenizer_alias) + self.model = model_class( + cfg_path=self.cfg_path, ckpt_path=self.ckpt_path) + self.tokenizer = tokenizer_class.from_pretrained('ernie-1.0') + else: + raise NotImplementedError + + self.model.eval() + + def _clean_text(self, text): + text = text.lower() + text = re.sub('[^A-Za-z0-9\u4e00-\u9fa5]', '', text) + text = re.sub(f'[{"".join([p for p in self._punc_list][1:])}]', '', + text) + return text + + def preprocess(self, text: Union[str, os.PathLike]): + """ + Input preprocess and return paddle.Tensor stored in self.input. + Input content can be a text(tts), a file(asr, cls) or a streaming(not supported yet). 
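+
+        Example (illustrative; assumes _init_from_path has been called first,
+        and shows the fields filled by the 'punc' branch below):
+
+            >>> self.preprocess('今天天气真好')
+            >>> sorted(self._inputs.keys())
+            ['input_ids', 'seg_ids', 'seq_len']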
+ """ + if self.task == 'punc': + clean_text = self._clean_text(text) + assert len(clean_text) > 0, f'Invalid input string: {text}' + + tokenized_input = self.tokenizer( + list(clean_text), return_length=True, is_split_into_words=True) + + self._inputs['input_ids'] = tokenized_input['input_ids'] + self._inputs['seg_ids'] = tokenized_input['token_type_ids'] + self._inputs['seq_len'] = tokenized_input['seq_len'] + else: + raise NotImplementedError + + @paddle.no_grad() + def infer(self): + """ + Model inference and result stored in self.output. + """ + if self.task == 'punc': + input_ids = paddle.to_tensor(self._inputs['input_ids']).unsqueeze(0) + seg_ids = paddle.to_tensor(self._inputs['seg_ids']).unsqueeze(0) + logits, _ = self.model(input_ids, seg_ids) + preds = paddle.argmax(logits, axis=-1).squeeze(0) + + self._outputs['preds'] = preds + else: + raise NotImplementedError + + def postprocess(self) -> Union[str, os.PathLike]: + """ + Output postprocess and return human-readable results such as texts and audio files. + """ + if self.task == 'punc': + input_ids = self._inputs['input_ids'] + seq_len = self._inputs['seq_len'] + preds = self._outputs['preds'] + + tokens = self.tokenizer.convert_ids_to_tokens( + input_ids[1:seq_len - 1]) + labels = preds[1:seq_len - 1].tolist() + assert len(tokens) == len(labels) + + text = '' + for t, l in zip(tokens, labels): + text += t + if l != 0: # Non punc. + text += self._punc_list[l] + + return text + else: + raise NotImplementedError + + def execute(self, argv: List[str]) -> bool: + """ + Command line entry. + """ + parser_args = self.parser.parse_args(argv) + + task = parser_args.task + model_type = parser_args.model + lang = parser_args.lang + cfg_path = parser_args.config + ckpt_path = parser_args.ckpt_path + punc_vocab = parser_args.punc_vocab + device = parser_args.device + + if not parser_args.verbose: + self.disable_task_loggers() + + task_source = self.get_task_source(parser_args.input) + task_results = OrderedDict() + has_exceptions = False + + for id_, input_ in task_source.items(): + try: + res = self(input_, task, model_type, lang, cfg_path, ckpt_path, + punc_vocab, device) + task_results[id_] = res + except Exception as e: + has_exceptions = True + task_results[id_] = f'{e.__class__.__name__}: {e}' + + self.process_task_results(parser_args.input, task_results, + parser_args.job_dump_result) + + if has_exceptions: + return False + else: + return True + + @stats_wrapper + def __call__( + self, + text: str, + task: str='punc', + model: str='ernie_linear_p7_wudao', + lang: str='zh', + config: Optional[os.PathLike]=None, + ckpt_path: Optional[os.PathLike]=None, + punc_vocab: Optional[os.PathLike]=None, + device: str=paddle.get_device(), ): + """ + Python API to call an executor. + """ + paddle.set_device(device) + self._init_from_path(task, model, lang, config, ckpt_path, punc_vocab) + self.preprocess(text) + self.infer() + res = self.postprocess() # Retrieve result of text task. + + return res diff --git a/ernie-sat/paddlespeech/cli/tts/__init__.py b/ernie-sat/paddlespeech/cli/tts/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..4cc3c42fc3a8107fecd45f9c589b47aec73817f4 --- /dev/null +++ b/ernie-sat/paddlespeech/cli/tts/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from .infer import TTSExecutor diff --git a/ernie-sat/paddlespeech/cli/tts/infer.py b/ernie-sat/paddlespeech/cli/tts/infer.py new file mode 100644 index 0000000000000000000000000000000000000000..c7a1edc93325166f7f3eeb172d577a5353eeb234 --- /dev/null +++ b/ernie-sat/paddlespeech/cli/tts/infer.py @@ -0,0 +1,838 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse +import os +import time +from collections import OrderedDict +from typing import Any +from typing import List +from typing import Optional +from typing import Union + +import numpy as np +import paddle +import soundfile as sf +import yaml +from yacs.config import CfgNode + +from ..executor import BaseExecutor +from ..log import logger +from ..utils import cli_register +from ..utils import download_and_decompress +from ..utils import MODEL_HOME +from ..utils import stats_wrapper +from paddlespeech.s2t.utils.dynamic_import import dynamic_import +from paddlespeech.t2s.frontend import English +from paddlespeech.t2s.frontend.zh_frontend import Frontend +from paddlespeech.t2s.modules.normalizer import ZScore + +__all__ = ['TTSExecutor'] + +pretrained_models = { + # speedyspeech + "speedyspeech_csmsc-zh": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_nosil_baker_ckpt_0.5.zip', + 'md5': + '9edce23b1a87f31b814d9477bf52afbc', + 'config': + 'default.yaml', + 'ckpt': + 'snapshot_iter_11400.pdz', + 'speech_stats': + 'feats_stats.npy', + 'phones_dict': + 'phone_id_map.txt', + 'tones_dict': + 'tone_id_map.txt', + }, + + # fastspeech2 + "fastspeech2_csmsc-zh": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip', + 'md5': + '637d28a5e53aa60275612ba4393d5f22', + 'config': + 'default.yaml', + 'ckpt': + 'snapshot_iter_76000.pdz', + 'speech_stats': + 'speech_stats.npy', + 'phones_dict': + 'phone_id_map.txt', + }, + "fastspeech2_ljspeech-en": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_ljspeech_ckpt_0.5.zip', + 'md5': + 'ffed800c93deaf16ca9b3af89bfcd747', + 'config': + 'default.yaml', + 'ckpt': + 'snapshot_iter_100000.pdz', + 'speech_stats': + 'speech_stats.npy', + 'phones_dict': + 'phone_id_map.txt', + }, + "fastspeech2_aishell3-zh": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_aishell3_ckpt_0.4.zip', + 'md5': + 'f4dd4a5f49a4552b77981f544ab3392e', + 'config': + 'default.yaml', + 'ckpt': + 'snapshot_iter_96400.pdz', + 
'speech_stats': + 'speech_stats.npy', + 'phones_dict': + 'phone_id_map.txt', + 'speaker_dict': + 'speaker_id_map.txt', + }, + "fastspeech2_vctk-en": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_vctk_ckpt_0.5.zip', + 'md5': + '743e5024ca1e17a88c5c271db9779ba4', + 'config': + 'default.yaml', + 'ckpt': + 'snapshot_iter_66200.pdz', + 'speech_stats': + 'speech_stats.npy', + 'phones_dict': + 'phone_id_map.txt', + 'speaker_dict': + 'speaker_id_map.txt', + }, + # tacotron2 + "tacotron2_csmsc-zh": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_csmsc_ckpt_0.2.0.zip', + 'md5': + '0df4b6f0bcbe0d73c5ed6df8867ab91a', + 'config': + 'default.yaml', + 'ckpt': + 'snapshot_iter_30600.pdz', + 'speech_stats': + 'speech_stats.npy', + 'phones_dict': + 'phone_id_map.txt', + }, + "tacotron2_ljspeech-en": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_ljspeech_ckpt_0.2.0.zip', + 'md5': + '6a5eddd81ae0e81d16959b97481135f3', + 'config': + 'default.yaml', + 'ckpt': + 'snapshot_iter_60300.pdz', + 'speech_stats': + 'speech_stats.npy', + 'phones_dict': + 'phone_id_map.txt', + }, + + # pwgan + "pwgan_csmsc-zh": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip', + 'md5': + '2e481633325b5bdf0a3823c714d2c117', + 'config': + 'pwg_default.yaml', + 'ckpt': + 'pwg_snapshot_iter_400000.pdz', + 'speech_stats': + 'pwg_stats.npy', + }, + "pwgan_ljspeech-en": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_ljspeech_ckpt_0.5.zip', + 'md5': + '53610ba9708fd3008ccaf8e99dacbaf0', + 'config': + 'pwg_default.yaml', + 'ckpt': + 'pwg_snapshot_iter_400000.pdz', + 'speech_stats': + 'pwg_stats.npy', + }, + "pwgan_aishell3-zh": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_aishell3_ckpt_0.5.zip', + 'md5': + 'd7598fa41ad362d62f85ffc0f07e3d84', + 'config': + 'default.yaml', + 'ckpt': + 'snapshot_iter_1000000.pdz', + 'speech_stats': + 'feats_stats.npy', + }, + "pwgan_vctk-en": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_vctk_ckpt_0.1.1.zip', + 'md5': + 'b3da1defcde3e578be71eb284cb89f2c', + 'config': + 'default.yaml', + 'ckpt': + 'snapshot_iter_1500000.pdz', + 'speech_stats': + 'feats_stats.npy', + }, + # mb_melgan + "mb_melgan_csmsc-zh": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_ckpt_0.1.1.zip', + 'md5': + 'ee5f0604e20091f0d495b6ec4618b90d', + 'config': + 'default.yaml', + 'ckpt': + 'snapshot_iter_1000000.pdz', + 'speech_stats': + 'feats_stats.npy', + }, + # style_melgan + "style_melgan_csmsc-zh": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/style_melgan/style_melgan_csmsc_ckpt_0.1.1.zip', + 'md5': + '5de2d5348f396de0c966926b8c462755', + 'config': + 'default.yaml', + 'ckpt': + 'snapshot_iter_1500000.pdz', + 'speech_stats': + 'feats_stats.npy', + }, + # hifigan + "hifigan_csmsc-zh": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_ckpt_0.1.1.zip', + 'md5': + 'dd40a3d88dfcf64513fba2f0f961ada6', + 'config': + 'default.yaml', + 'ckpt': + 'snapshot_iter_2500000.pdz', + 'speech_stats': + 'feats_stats.npy', + }, + "hifigan_ljspeech-en": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_ljspeech_ckpt_0.2.0.zip', + 'md5': + '70e9131695decbca06a65fe51ed38a72', + 'config': 
+ 'default.yaml', + 'ckpt': + 'snapshot_iter_2500000.pdz', + 'speech_stats': + 'feats_stats.npy', + }, + "hifigan_aishell3-zh": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_aishell3_ckpt_0.2.0.zip', + 'md5': + '3bb49bc75032ed12f79c00c8cc79a09a', + 'config': + 'default.yaml', + 'ckpt': + 'snapshot_iter_2500000.pdz', + 'speech_stats': + 'feats_stats.npy', + }, + "hifigan_vctk-en": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_vctk_ckpt_0.2.0.zip', + 'md5': + '7da8f88359bca2457e705d924cf27bd4', + 'config': + 'default.yaml', + 'ckpt': + 'snapshot_iter_2500000.pdz', + 'speech_stats': + 'feats_stats.npy', + }, + + # wavernn + "wavernn_csmsc-zh": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/wavernn/wavernn_csmsc_ckpt_0.2.0.zip', + 'md5': + 'ee37b752f09bcba8f2af3b777ca38e13', + 'config': + 'default.yaml', + 'ckpt': + 'snapshot_iter_400000.pdz', + 'speech_stats': + 'feats_stats.npy', + } +} + +model_alias = { + # acoustic model + "speedyspeech": + "paddlespeech.t2s.models.speedyspeech:SpeedySpeech", + "speedyspeech_inference": + "paddlespeech.t2s.models.speedyspeech:SpeedySpeechInference", + "fastspeech2": + "paddlespeech.t2s.models.fastspeech2:FastSpeech2", + "fastspeech2_inference": + "paddlespeech.t2s.models.fastspeech2:FastSpeech2Inference", + "tacotron2": + "paddlespeech.t2s.models.tacotron2:Tacotron2", + "tacotron2_inference": + "paddlespeech.t2s.models.tacotron2:Tacotron2Inference", + # voc + "pwgan": + "paddlespeech.t2s.models.parallel_wavegan:PWGGenerator", + "pwgan_inference": + "paddlespeech.t2s.models.parallel_wavegan:PWGInference", + "mb_melgan": + "paddlespeech.t2s.models.melgan:MelGANGenerator", + "mb_melgan_inference": + "paddlespeech.t2s.models.melgan:MelGANInference", + "style_melgan": + "paddlespeech.t2s.models.melgan:StyleMelGANGenerator", + "style_melgan_inference": + "paddlespeech.t2s.models.melgan:StyleMelGANInference", + "hifigan": + "paddlespeech.t2s.models.hifigan:HiFiGANGenerator", + "hifigan_inference": + "paddlespeech.t2s.models.hifigan:HiFiGANInference", + "wavernn": + "paddlespeech.t2s.models.wavernn:WaveRNN", + "wavernn_inference": + "paddlespeech.t2s.models.wavernn:WaveRNNInference", +} + + +@cli_register( + name='paddlespeech.tts', description='Text to Speech infer command.') +class TTSExecutor(BaseExecutor): + def __init__(self): + super().__init__() + + self.parser = argparse.ArgumentParser( + prog='paddlespeech.tts', add_help=True) + self.parser.add_argument( + '--input', type=str, default=None, help='Input text to generate.') + # acoustic model + self.parser.add_argument( + '--am', + type=str, + default='fastspeech2_csmsc', + choices=[ + 'speedyspeech_csmsc', + 'fastspeech2_csmsc', + 'fastspeech2_ljspeech', + 'fastspeech2_aishell3', + 'fastspeech2_vctk', + 'tacotron2_csmsc', + 'tacotron2_ljspeech', + ], + help='Choose acoustic model type of tts task.') + self.parser.add_argument( + '--am_config', + type=str, + default=None, + help='Config of acoustic model. Use deault config when it is None.') + self.parser.add_argument( + '--am_ckpt', + type=str, + default=None, + help='Checkpoint file of acoustic model.') + self.parser.add_argument( + "--am_stat", + type=str, + default=None, + help="mean and standard deviation used to normalize spectrogram when training acoustic model." 
+ ) + self.parser.add_argument( + "--phones_dict", + type=str, + default=None, + help="phone vocabulary file.") + self.parser.add_argument( + "--tones_dict", + type=str, + default=None, + help="tone vocabulary file.") + self.parser.add_argument( + "--speaker_dict", + type=str, + default=None, + help="speaker id map file.") + self.parser.add_argument( + '--spk_id', + type=int, + default=0, + help='spk id for multi speaker acoustic model') + # vocoder + self.parser.add_argument( + '--voc', + type=str, + default='pwgan_csmsc', + choices=[ + 'pwgan_csmsc', + 'pwgan_ljspeech', + 'pwgan_aishell3', + 'pwgan_vctk', + 'mb_melgan_csmsc', + 'style_melgan_csmsc', + 'hifigan_csmsc', + 'hifigan_ljspeech', + 'hifigan_aishell3', + 'hifigan_vctk', + 'wavernn_csmsc', + ], + help='Choose vocoder type of tts task.') + + self.parser.add_argument( + '--voc_config', + type=str, + default=None, + help='Config of voc. Use deault config when it is None.') + self.parser.add_argument( + '--voc_ckpt', + type=str, + default=None, + help='Checkpoint file of voc.') + self.parser.add_argument( + "--voc_stat", + type=str, + default=None, + help="mean and standard deviation used to normalize spectrogram when training voc." + ) + # other + self.parser.add_argument( + '--lang', + type=str, + default='zh', + help='Choose model language. zh or en') + self.parser.add_argument( + '--device', + type=str, + default=paddle.get_device(), + help='Choose device to execute model inference.') + + self.parser.add_argument( + '--output', type=str, default='output.wav', help='output file name') + self.parser.add_argument( + '-d', + '--job_dump_result', + action='store_true', + help='Save job result into file.') + self.parser.add_argument( + '-v', + '--verbose', + action='store_true', + help='Increase logger verbosity of current task.') + + def _get_pretrained_path(self, tag: str) -> os.PathLike: + """ + Download and returns pretrained resources path of current task. + """ + support_models = list(pretrained_models.keys()) + assert tag in pretrained_models, 'The model "{}" you want to use has not been supported, please choose other models.\nThe support models includes:\n\t\t{}\n'.format( + tag, '\n\t\t'.join(support_models)) + + res_path = os.path.join(MODEL_HOME, tag) + decompressed_path = download_and_decompress(pretrained_models[tag], + res_path) + decompressed_path = os.path.abspath(decompressed_path) + logger.info( + 'Use pretrained model stored in: {}'.format(decompressed_path)) + return decompressed_path + + def _init_from_path( + self, + am: str='fastspeech2_csmsc', + am_config: Optional[os.PathLike]=None, + am_ckpt: Optional[os.PathLike]=None, + am_stat: Optional[os.PathLike]=None, + phones_dict: Optional[os.PathLike]=None, + tones_dict: Optional[os.PathLike]=None, + speaker_dict: Optional[os.PathLike]=None, + voc: str='pwgan_csmsc', + voc_config: Optional[os.PathLike]=None, + voc_ckpt: Optional[os.PathLike]=None, + voc_stat: Optional[os.PathLike]=None, + lang: str='zh', ): + """ + Init model and other resources from a specific path. 
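+
+        Example (illustrative; this method is normally invoked through
+        __call__ rather than directly, and the tags below are simply the
+        argument defaults, which trigger a pretrained-model download):
+
+            >>> self._init_from_path(am='fastspeech2_csmsc', voc='pwgan_csmsc', lang='zh')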
+ """ + if hasattr(self, 'am_inference') and hasattr(self, 'voc_inference'): + logger.info('Models had been initialized.') + return + # am + am_tag = am + '-' + lang + if am_ckpt is None or am_config is None or am_stat is None or phones_dict is None: + am_res_path = self._get_pretrained_path(am_tag) + self.am_res_path = am_res_path + self.am_config = os.path.join(am_res_path, + pretrained_models[am_tag]['config']) + self.am_ckpt = os.path.join(am_res_path, + pretrained_models[am_tag]['ckpt']) + self.am_stat = os.path.join( + am_res_path, pretrained_models[am_tag]['speech_stats']) + # must have phones_dict in acoustic + self.phones_dict = os.path.join( + am_res_path, pretrained_models[am_tag]['phones_dict']) + print("self.phones_dict:", self.phones_dict) + logger.info(am_res_path) + logger.info(self.am_config) + logger.info(self.am_ckpt) + else: + self.am_config = os.path.abspath(am_config) + self.am_ckpt = os.path.abspath(am_ckpt) + self.am_stat = os.path.abspath(am_stat) + self.phones_dict = os.path.abspath(phones_dict) + self.am_res_path = os.path.dirname(os.path.abspath(self.am_config)) + print("self.phones_dict:", self.phones_dict) + + # for speedyspeech + self.tones_dict = None + if 'tones_dict' in pretrained_models[am_tag]: + self.tones_dict = os.path.join( + am_res_path, pretrained_models[am_tag]['tones_dict']) + if tones_dict: + self.tones_dict = tones_dict + + # for multi speaker fastspeech2 + self.speaker_dict = None + if 'speaker_dict' in pretrained_models[am_tag]: + self.speaker_dict = os.path.join( + am_res_path, pretrained_models[am_tag]['speaker_dict']) + if speaker_dict: + self.speaker_dict = speaker_dict + + # voc + voc_tag = voc + '-' + lang + if voc_ckpt is None or voc_config is None or voc_stat is None: + voc_res_path = self._get_pretrained_path(voc_tag) + self.voc_res_path = voc_res_path + self.voc_config = os.path.join(voc_res_path, + pretrained_models[voc_tag]['config']) + self.voc_ckpt = os.path.join(voc_res_path, + pretrained_models[voc_tag]['ckpt']) + self.voc_stat = os.path.join( + voc_res_path, pretrained_models[voc_tag]['speech_stats']) + logger.info(voc_res_path) + logger.info(self.voc_config) + logger.info(self.voc_ckpt) + else: + self.voc_config = os.path.abspath(voc_config) + self.voc_ckpt = os.path.abspath(voc_ckpt) + self.voc_stat = os.path.abspath(voc_stat) + self.voc_res_path = os.path.dirname( + os.path.abspath(self.voc_config)) + + # Init body. 
+ with open(self.am_config) as f: + self.am_config = CfgNode(yaml.safe_load(f)) + with open(self.voc_config) as f: + self.voc_config = CfgNode(yaml.safe_load(f)) + + with open(self.phones_dict, "r") as f: + phn_id = [line.strip().split() for line in f.readlines()] + vocab_size = len(phn_id) + print("vocab_size:", vocab_size) + + tone_size = None + if self.tones_dict: + with open(self.tones_dict, "r") as f: + tone_id = [line.strip().split() for line in f.readlines()] + tone_size = len(tone_id) + print("tone_size:", tone_size) + + spk_num = None + if self.speaker_dict: + with open(self.speaker_dict, 'rt') as f: + spk_id = [line.strip().split() for line in f.readlines()] + spk_num = len(spk_id) + print("spk_num:", spk_num) + + # frontend + if lang == 'zh': + self.frontend = Frontend( + phone_vocab_path=self.phones_dict, + tone_vocab_path=self.tones_dict) + + elif lang == 'en': + self.frontend = English(phone_vocab_path=self.phones_dict) + print("frontend done!") + + # acoustic model + odim = self.am_config.n_mels + # model: {model_name}_{dataset} + am_name = am[:am.rindex('_')] + + am_class = dynamic_import(am_name, model_alias) + am_inference_class = dynamic_import(am_name + '_inference', model_alias) + + if am_name == 'fastspeech2': + am = am_class( + idim=vocab_size, + odim=odim, + spk_num=spk_num, + **self.am_config["model"]) + elif am_name == 'speedyspeech': + am = am_class( + vocab_size=vocab_size, + tone_size=tone_size, + **self.am_config["model"]) + elif am_name == 'tacotron2': + am = am_class(idim=vocab_size, odim=odim, **self.am_config["model"]) + + am.set_state_dict(paddle.load(self.am_ckpt)["main_params"]) + am.eval() + am_mu, am_std = np.load(self.am_stat) + am_mu = paddle.to_tensor(am_mu) + am_std = paddle.to_tensor(am_std) + am_normalizer = ZScore(am_mu, am_std) + self.am_inference = am_inference_class(am_normalizer, am) + self.am_inference.eval() + print("acoustic model done!") + + # vocoder + # model: {model_name}_{dataset} + voc_name = voc[:voc.rindex('_')] + voc_class = dynamic_import(voc_name, model_alias) + voc_inference_class = dynamic_import(voc_name + '_inference', + model_alias) + if voc_name != 'wavernn': + voc = voc_class(**self.voc_config["generator_params"]) + voc.set_state_dict(paddle.load(self.voc_ckpt)["generator_params"]) + voc.remove_weight_norm() + voc.eval() + else: + voc = voc_class(**self.voc_config["model"]) + voc.set_state_dict(paddle.load(self.voc_ckpt)["main_params"]) + voc.eval() + voc_mu, voc_std = np.load(self.voc_stat) + voc_mu = paddle.to_tensor(voc_mu) + voc_std = paddle.to_tensor(voc_std) + voc_normalizer = ZScore(voc_mu, voc_std) + self.voc_inference = voc_inference_class(voc_normalizer, voc) + self.voc_inference.eval() + print("voc done!") + + def preprocess(self, input: Any, *args, **kwargs): + """ + Input preprocess and return paddle.Tensor stored in self._inputs. + Input content can be a text(tts), a file(asr, cls), a stream(not supported yet) or anything needed. + + Args: + input (Any): Input text/file/stream or other content. + """ + pass + + @paddle.no_grad() + def infer(self, + text: str, + lang: str='zh', + am: str='fastspeech2_csmsc', + spk_id: int=0): + """ + Model inference and result stored in self.output. 
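+
+        Example (illustrative; assumes _init_from_path has already been called,
+        the input sentence is an arbitrary placeholder):
+
+            >>> self.infer(text='你好,欢迎使用飞桨。', lang='zh', am='fastspeech2_csmsc')
+            >>> wav = self._outputs['wav']   # paddle.Tensor holding the synthesized waveform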
+ """ + am_name = am[:am.rindex('_')] + am_dataset = am[am.rindex('_') + 1:] + get_tone_ids = False + merge_sentences = False + frontend_st = time.time() + if am_name == 'speedyspeech': + get_tone_ids = True + if lang == 'zh': + input_ids = self.frontend.get_input_ids( + text, + merge_sentences=merge_sentences, + get_tone_ids=get_tone_ids) + phone_ids = input_ids["phone_ids"] + if get_tone_ids: + tone_ids = input_ids["tone_ids"] + elif lang == 'en': + input_ids = self.frontend.get_input_ids( + text, merge_sentences=merge_sentences) + phone_ids = input_ids["phone_ids"] + else: + print("lang should in {'zh', 'en'}!") + self.frontend_time = time.time() - frontend_st + + self.am_time = 0 + self.voc_time = 0 + flags = 0 + for i in range(len(phone_ids)): + am_st = time.time() + part_phone_ids = phone_ids[i] + # am + if am_name == 'speedyspeech': + part_tone_ids = tone_ids[i] + mel = self.am_inference(part_phone_ids, part_tone_ids) + # fastspeech2 + else: + # multi speaker + if am_dataset in {"aishell3", "vctk"}: + mel = self.am_inference( + part_phone_ids, spk_id=paddle.to_tensor(spk_id)) + else: + mel = self.am_inference(part_phone_ids) + self.am_time += (time.time() - am_st) + # voc + voc_st = time.time() + wav = self.voc_inference(mel) + if flags == 0: + wav_all = wav + flags = 1 + else: + wav_all = paddle.concat([wav_all, wav]) + self.voc_time += (time.time() - voc_st) + self._outputs['wav'] = wav_all + + def postprocess(self, output: str='output.wav') -> Union[str, os.PathLike]: + """ + Output postprocess and return results. + This method get model output from self._outputs and convert it into human-readable results. + + Returns: + Union[str, os.PathLike]: Human-readable results such as texts and audio files. + """ + output = os.path.abspath(os.path.expanduser(output)) + sf.write( + output, self._outputs['wav'].numpy(), samplerate=self.am_config.fs) + return output + + def execute(self, argv: List[str]) -> bool: + """ + Command line entry. 
+ """ + + args = self.parser.parse_args(argv) + + am = args.am + am_config = args.am_config + am_ckpt = args.am_ckpt + am_stat = args.am_stat + phones_dict = args.phones_dict + print("phones_dict:", phones_dict) + tones_dict = args.tones_dict + speaker_dict = args.speaker_dict + voc = args.voc + voc_config = args.voc_config + voc_ckpt = args.voc_ckpt + voc_stat = args.voc_stat + lang = args.lang + device = args.device + spk_id = args.spk_id + + if not args.verbose: + self.disable_task_loggers() + + task_source = self.get_task_source(args.input) + task_results = OrderedDict() + has_exceptions = False + + for id_, input_ in task_source.items(): + if len(task_source) > 1: + assert isinstance(args.output, + str) and args.output.endswith('.wav') + output = args.output.replace('.wav', f'_{id_}.wav') + else: + output = args.output + + try: + res = self( + text=input_, + # acoustic model related + am=am, + am_config=am_config, + am_ckpt=am_ckpt, + am_stat=am_stat, + phones_dict=phones_dict, + tones_dict=tones_dict, + speaker_dict=speaker_dict, + spk_id=spk_id, + # vocoder related + voc=voc, + voc_config=voc_config, + voc_ckpt=voc_ckpt, + voc_stat=voc_stat, + # other + lang=lang, + device=device, + output=output) + task_results[id_] = res + except Exception as e: + has_exceptions = True + task_results[id_] = f'{e.__class__.__name__}: {e}' + + self.process_task_results(args.input, task_results, + args.job_dump_result) + + if has_exceptions: + return False + else: + return True + + @stats_wrapper + def __call__(self, + text: str, + am: str='fastspeech2_csmsc', + am_config: Optional[os.PathLike]=None, + am_ckpt: Optional[os.PathLike]=None, + am_stat: Optional[os.PathLike]=None, + spk_id: int=0, + phones_dict: Optional[os.PathLike]=None, + tones_dict: Optional[os.PathLike]=None, + speaker_dict: Optional[os.PathLike]=None, + voc: str='pwgan_csmsc', + voc_config: Optional[os.PathLike]=None, + voc_ckpt: Optional[os.PathLike]=None, + voc_stat: Optional[os.PathLike]=None, + lang: str='zh', + device: str=paddle.get_device(), + output: str='output.wav'): + """ + Python API to call an executor. + """ + paddle.set_device(device) + self._init_from_path( + am=am, + am_config=am_config, + am_ckpt=am_ckpt, + am_stat=am_stat, + phones_dict=phones_dict, + tones_dict=tones_dict, + speaker_dict=speaker_dict, + voc=voc, + voc_config=voc_config, + voc_ckpt=voc_ckpt, + voc_stat=voc_stat, + lang=lang) + + self.infer(text=text, lang=lang, am=am, spk_id=spk_id) + + res = self.postprocess(output=output) + + return res diff --git a/ernie-sat/paddlespeech/cli/utils.py b/ernie-sat/paddlespeech/cli/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..f7d64b9a95e296a57abcf6340d4e6581e3ccbbe6 --- /dev/null +++ b/ernie-sat/paddlespeech/cli/utils.py @@ -0,0 +1,340 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import hashlib +import inspect +import json +import os +import tarfile +import threading +import time +import uuid +import zipfile +from typing import Any +from typing import Dict + +import paddle +import requests +import yaml +from paddle.framework import load + +import paddleaudio +from . import download +from .entry import commands +try: + from .. import __version__ +except ImportError: + __version__ = "0.0.0" # for develop branch + +requests.adapters.DEFAULT_RETRIES = 3 + +__all__ = [ + 'cli_register', + 'get_command', + 'download_and_decompress', + 'load_state_dict_from_url', + 'stats_wrapper', +] + + +def cli_register(name: str, description: str='') -> Any: + def _warpper(command): + items = name.split('.') + + com = commands + for item in items: + com = com[item] + com['_entry'] = command + if description: + com['_description'] = description + return command + + return _warpper + + +def get_command(name: str) -> Any: + items = name.split('.') + com = commands + for item in items: + com = com[item] + + return com['_entry'] + + +def _get_uncompress_path(filepath: os.PathLike) -> os.PathLike: + file_dir = os.path.dirname(filepath) + is_zip_file = False + if tarfile.is_tarfile(filepath): + files = tarfile.open(filepath, "r:*") + file_list = files.getnames() + elif zipfile.is_zipfile(filepath): + files = zipfile.ZipFile(filepath, 'r') + file_list = files.namelist() + is_zip_file = True + else: + return file_dir + + if download._is_a_single_file(file_list): + rootpath = file_list[0] + uncompressed_path = os.path.join(file_dir, rootpath) + elif download._is_a_single_dir(file_list): + if is_zip_file: + rootpath = os.path.splitext(file_list[0])[0].split(os.sep)[0] + else: + rootpath = os.path.splitext(file_list[0])[0].split(os.sep)[-1] + uncompressed_path = os.path.join(file_dir, rootpath) + else: + rootpath = os.path.splitext(filepath)[0].split(os.sep)[-1] + uncompressed_path = os.path.join(file_dir, rootpath) + + files.close() + return uncompressed_path + + +def download_and_decompress(archive: Dict[str, str], path: str) -> os.PathLike: + """ + Download archieves and decompress to specific path. + """ + if not os.path.isdir(path): + os.makedirs(path) + + assert 'url' in archive and 'md5' in archive, \ + 'Dictionary keys of "url" and "md5" are required in the archive, but got: {}'.format(list(archive.keys())) + + filepath = os.path.join(path, os.path.basename(archive['url'])) + if os.path.isfile(filepath) and download._md5check(filepath, + archive['md5']): + uncompress_path = _get_uncompress_path(filepath) + if not os.path.isdir(uncompress_path): + download._decompress(filepath) + else: + StatsWorker( + task='download', + version=__version__, + extra_info={ + 'download_url': archive['url'], + 'paddle_version': paddle.__version__ + }).start() + uncompress_path = download.get_path_from_url(archive['url'], path, + archive['md5']) + + return uncompress_path + + +def load_state_dict_from_url(url: str, path: str, md5: str=None) -> os.PathLike: + """ + Download and load a state dict from url + """ + if not os.path.isdir(path): + os.makedirs(path) + + download.get_path_from_url(url, path, md5) + return load(os.path.join(path, os.path.basename(url))) + + +def _get_user_home(): + return os.path.expanduser('~') + + +def _get_paddlespcceh_home(): + if 'PPSPEECH_HOME' in os.environ: + home_path = os.environ['PPSPEECH_HOME'] + if os.path.exists(home_path): + if os.path.isdir(home_path): + return home_path + else: + raise RuntimeError( + 'The environment variable PPSPEECH_HOME {} is not a directory.'. 
+ format(home_path)) + else: + return home_path + return os.path.join(_get_user_home(), '.paddlespeech') + + +def _get_sub_home(directory): + home = os.path.join(_get_paddlespcceh_home(), directory) + if not os.path.exists(home): + os.makedirs(home) + return home + + +PPSPEECH_HOME = _get_paddlespcceh_home() +MODEL_HOME = _get_sub_home('models') +CONF_HOME = _get_sub_home('conf') + + +def _md5(text: str): + '''Calculate the md5 value of the input text.''' + md5code = hashlib.md5(text.encode()) + return md5code.hexdigest() + + +class ConfigCache: + def __init__(self): + self._data = {} + self._initialize() + self.file = os.path.join(CONF_HOME, 'cache.yaml') + if not os.path.exists(self.file): + self.flush() + return + + with open(self.file, 'r') as file: + try: + cfg = yaml.load(file, Loader=yaml.FullLoader) + self._data.update(cfg) + except Exception as e: + self.flush() + + @property + def cache_info(self): + return self._data['cache_info'] + + def _initialize(self): + # Set default configuration values. + cache_info = _md5(str(uuid.uuid1())[-12:]) + "-" + str(int(time.time())) + self._data['cache_info'] = cache_info + + def flush(self): + '''Flush the current configuration into the configuration file.''' + with open(self.file, 'w') as file: + cfg = json.loads(json.dumps(self._data)) + yaml.dump(cfg, file) + + +stats_api = "http://paddlepaddle.org.cn/paddlehub/stat" +cache_info = ConfigCache().cache_info + + +class StatsWorker(threading.Thread): + def __init__(self, + task="asr", + model=None, + version=__version__, + extra_info={}): + threading.Thread.__init__(self) + self._task = task + self._model = model + self._version = version + self._extra_info = extra_info + + def run(self): + params = { + 'task': self._task, + 'version': self._version, + 'from': 'ppspeech' + } + if self._model: + params['model'] = self._model + + self._extra_info.update({ + 'cache_info': cache_info, + }) + params.update({"extra": json.dumps(self._extra_info)}) + + try: + requests.get(stats_api, params) + except Exception: + pass + + return + + +def _note_one_stat(cls_name, params={}): + task = cls_name.replace('Executor', '').lower() # XXExecutor + extra_info = { + 'paddle_version': paddle.__version__, + } + + if 'model' in params: + model = params['model'] + else: + model = None + + if 'audio_file' in params: + try: + _, sr = paddleaudio.load(params['audio_file']) + except Exception: + sr = -1 + + if task == 'asr': + extra_info.update({ + 'lang': params['lang'], + 'inp_sr': sr, + 'model_sr': params['sample_rate'], + }) + elif task == 'st': + extra_info.update({ + 'lang': + params['src_lang'] + '-' + params['tgt_lang'], + 'inp_sr': + sr, + 'model_sr': + params['sample_rate'], + }) + elif task == 'tts': + model = params['am'] + extra_info.update({ + 'lang': params['lang'], + 'vocoder': params['voc'], + }) + elif task == 'cls': + extra_info.update({ + 'inp_sr': sr, + }) + elif task == 'text': + extra_info.update({ + 'sub_task': params['task'], + 'lang': params['lang'], + }) + else: + return + + StatsWorker( + task=task, + model=model, + version=__version__, + extra_info=extra_info, ).start() + + +def _parse_args(func, *args, **kwargs): + # FullArgSpec(args, varargs, varkw, defaults, kwonlyargs, kwonlydefaults, annotations) + argspec = inspect.getfullargspec(func) + + keys = argspec[0] + if keys[0] == 'self': # Remove self pointer. 
+ keys = keys[1:] + + default_values = argspec[3] + values = [None] * (len(keys) - len(default_values)) + values.extend(list(default_values)) + params = dict(zip(keys, values)) + + for idx, v in enumerate(args): + params[keys[idx]] = v + for k, v in kwargs.items(): + params[k] = v + + return params + + +def stats_wrapper(executor_func): + def _warpper(self, *args, **kwargs): + try: + _note_one_stat( + type(self).__name__, _parse_args(executor_func, *args, + **kwargs)) + except Exception: + pass + return executor_func(self, *args, **kwargs) + + return _warpper diff --git a/ernie-sat/paddlespeech/cli/vector/__init__.py b/ernie-sat/paddlespeech/cli/vector/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..038596af02cc3e74d0446f7d279ef8016b429255 --- /dev/null +++ b/ernie-sat/paddlespeech/cli/vector/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from .infer import VectorExecutor diff --git a/ernie-sat/paddlespeech/cli/vector/infer.py b/ernie-sat/paddlespeech/cli/vector/infer.py new file mode 100644 index 0000000000000000000000000000000000000000..68e832ac74d4dda805a4185ab09a72f2eb7d6413 --- /dev/null +++ b/ernie-sat/paddlespeech/cli/vector/infer.py @@ -0,0 +1,519 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse +import os +import sys +from collections import OrderedDict +from typing import Dict +from typing import List +from typing import Optional +from typing import Union + +import paddle +import soundfile +from yacs.config import CfgNode + +from ..executor import BaseExecutor +from ..log import logger +from ..utils import cli_register +from ..utils import download_and_decompress +from ..utils import MODEL_HOME +from ..utils import stats_wrapper +from paddleaudio.backends import load as load_audio +from paddleaudio.compliance.librosa import melspectrogram +from paddlespeech.s2t.utils.dynamic_import import dynamic_import +from paddlespeech.vector.io.batch import feature_normalize +from paddlespeech.vector.modules.sid_model import SpeakerIdetification + +pretrained_models = { + # The tags for pretrained_models should be "{model_name}[-{dataset}][-{sr}][-...]". + # e.g. "ecapatdnn_voxceleb12-16k". 
+ # Command line and python api use "{model_name}[-{dataset}]" as --model, usage: + # "paddlespeech vector --task spk --model ecapatdnn_voxceleb12-16k --sr 16000 --input ./input.wav" + "ecapatdnn_voxceleb12-16k": { + 'url': + 'https://paddlespeech.bj.bcebos.com/vector/voxceleb/sv0_ecapa_tdnn_voxceleb12_ckpt_0_2_0.tar.gz', + 'md5': + 'cc33023c54ab346cd318408f43fcaf95', + 'cfg_path': + 'conf/model.yaml', # the yaml config path + 'ckpt_path': + 'model/model', # the format is ${dir}/{model_name}, + # so the first 'model' is dir, the second 'model' is the name + # this means we have a model stored as model/model.pdparams + }, +} + +model_alias = { + "ecapatdnn": "paddlespeech.vector.models.ecapa_tdnn:EcapaTdnn", +} + + +@cli_register( + name="paddlespeech.vector", + description="Speech to vector embedding infer command.") +class VectorExecutor(BaseExecutor): + def __init__(self): + super(VectorExecutor, self).__init__() + + self.parser = argparse.ArgumentParser( + prog="paddlespeech.vector", add_help=True) + + self.parser.add_argument( + "--model", + type=str, + default="ecapatdnn_voxceleb12", + choices=["ecapatdnn_voxceleb12"], + help="Choose model type of vector task.") + self.parser.add_argument( + "--task", + type=str, + default="spk", + choices=["spk", "score"], + help="task type in vector domain") + self.parser.add_argument( + "--input", + type=str, + default=None, + help="Audio file to extract embedding.") + self.parser.add_argument( + "--sample_rate", + type=int, + default=16000, + choices=[16000], + help="Choose the audio sample rate of the model. 8000 or 16000") + self.parser.add_argument( + "--ckpt_path", + type=str, + default=None, + help="Checkpoint file of model.") + self.parser.add_argument( + '--config', + type=str, + default=None, + help='Config of asr task. 
Use deault config when it is None.') + self.parser.add_argument( + "--device", + type=str, + default=paddle.get_device(), + help="Choose device to execute model inference.") + self.parser.add_argument( + '-d', + '--job_dump_result', + action='store_true', + help='Save job result into file.') + + self.parser.add_argument( + '-v', + '--verbose', + action='store_true', + help='Increase logger verbosity of current task.') + + def execute(self, argv: List[str]) -> bool: + """Command line entry for vector model + + Args: + argv (List[str]): command line args list + + Returns: + bool: + False: some audio occurs error + True: all audio process success + """ + # stage 0: parse the args and get the required args + parser_args = self.parser.parse_args(argv) + model = parser_args.model + sample_rate = parser_args.sample_rate + config = parser_args.config + ckpt_path = parser_args.ckpt_path + device = parser_args.device + + # stage 1: configurate the verbose flag + if not parser_args.verbose: + self.disable_task_loggers() + + # stage 2: read the input data and store them as a list + task_source = self.get_task_source(parser_args.input) + logger.info(f"task source: {task_source}") + + # stage 3: process the audio one by one + # we do action according the task type + task_result = OrderedDict() + has_exceptions = False + for id_, input_ in task_source.items(): + try: + # extract the speaker audio embedding + if parser_args.task == "spk": + logger.info("do vector spk task") + res = self(input_, model, sample_rate, config, ckpt_path, + device) + task_result[id_] = res + elif parser_args.task == "score": + logger.info("do vector score task") + logger.info(f"input content {input_}") + if len(input_.split()) != 2: + logger.error( + f"vector score task input {input_} wav num is not two," + "that is {len(input_.split())}") + sys.exit(-1) + + # get the enroll and test embedding + enroll_audio, test_audio = input_.split() + logger.info( + f"score task, enroll audio: {enroll_audio}, test audio: {test_audio}" + ) + enroll_embedding = self(enroll_audio, model, sample_rate, + config, ckpt_path, device) + test_embedding = self(test_audio, model, sample_rate, + config, ckpt_path, device) + + # get the score + res = self.get_embeddings_score(enroll_embedding, + test_embedding) + task_result[id_] = res + except Exception as e: + has_exceptions = True + task_result[id_] = f'{e.__class__.__name__}: {e}' + + logger.info("task result as follows: ") + logger.info(f"{task_result}") + + # stage 4: process the all the task results + self.process_task_results(parser_args.input, task_result, + parser_args.job_dump_result) + + # stage 5: return the exception flag + # if return False, somen audio process occurs error + if has_exceptions: + return False + else: + return True + + def _get_job_contents( + self, job_input: os.PathLike) -> Dict[str, Union[str, os.PathLike]]: + """ + Read a job input file and return its contents in a dictionary. + Refactor from the Executor._get_job_contents + + Args: + job_input (os.PathLike): The job input file. + + Returns: + Dict[str, str]: Contents of job input. 
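+
+        Example (illustrative job file; each line is an utterance id followed
+        by its value, the paths are placeholders):
+
+            utt_1 /data/test_1.wav
+            utt_2 /data/enroll_2.wav /data/test_2.wav   # the 'score' task expects two paths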
+ """ + job_contents = OrderedDict() + with open(job_input) as f: + for line in f: + line = line.strip() + if not line: + continue + k = line.split(' ')[0] + v = ' '.join(line.split(' ')[1:]) + job_contents[k] = v + return job_contents + + def get_embeddings_score(self, enroll_embedding, test_embedding): + """get the enroll embedding and test embedding score + + Args: + enroll_embedding (numpy.array): shape: (emb_size), enroll audio embedding + test_embedding (numpy.array): shape: (emb_size), test audio embedding + + Returns: + score: the score between enroll embedding and test embedding + """ + if not hasattr(self, "score_func"): + self.score_func = paddle.nn.CosineSimilarity(axis=0) + logger.info("create the cosine score function ") + + score = self.score_func( + paddle.to_tensor(enroll_embedding), + paddle.to_tensor(test_embedding)) + + return score.item() + + @stats_wrapper + def __call__(self, + audio_file: os.PathLike, + model: str='ecapatdnn_voxceleb12', + sample_rate: int=16000, + config: os.PathLike=None, + ckpt_path: os.PathLike=None, + device=paddle.get_device()): + """Extract the audio embedding + + Args: + audio_file (os.PathLike): audio path, + whose format must be wav and sample rate must be matched the model + model (str, optional): mode type, which is been loaded from the pretrained model list. + Defaults to 'ecapatdnn-voxceleb12'. + sample_rate (int, optional): model sample rate. Defaults to 16000. + config (os.PathLike, optional): yaml config. Defaults to None. + ckpt_path (os.PathLike, optional): pretrained model path. Defaults to None. + device (optional): paddle running host device. Defaults to paddle.get_device(). + + Returns: + dict: return the audio embedding and the embedding shape + """ + # stage 0: check the audio format + audio_file = os.path.abspath(audio_file) + if not self._check(audio_file, sample_rate): + sys.exit(-1) + + # stage 1: set the paddle runtime host device + logger.info(f"device type: {device}") + paddle.device.set_device(device) + + # stage 2: read the specific pretrained model + self._init_from_path(model, sample_rate, config, ckpt_path) + + # stage 3: preprocess the audio and get the audio feat + self.preprocess(model, audio_file) + + # stage 4: infer the model and get the audio embedding + self.infer(model) + + # stage 5: process the result and set them to output dict + res = self.postprocess() + + return res + + def _get_pretrained_path(self, tag: str) -> os.PathLike: + """get the neural network path from the pretrained model list + we stored all the pretained mode in the variable `pretrained_models` + + Args: + tag (str): model tag in the pretrained model list + + Returns: + os.PathLike: the downloaded pretrained model path in the disk + """ + support_models = list(pretrained_models.keys()) + assert tag in pretrained_models, \ + 'The model "{}" you want to use has not been supported,'\ + 'please choose other models.\n' \ + 'The support models includes\n\t\t{}'.format(tag, "\n\t\t".join(support_models)) + + res_path = os.path.join(MODEL_HOME, tag) + decompressed_path = download_and_decompress(pretrained_models[tag], + res_path) + + decompressed_path = os.path.abspath(decompressed_path) + logger.info( + 'Use pretrained model stored in: {}'.format(decompressed_path)) + + return decompressed_path + + def _init_from_path(self, + model_type: str='ecapatdnn_voxceleb12', + sample_rate: int=16000, + cfg_path: Optional[os.PathLike]=None, + ckpt_path: Optional[os.PathLike]=None): + """Init the neural network from the model path + + Args: + model_type 
(str, optional): model tag in the pretrained model list. + Defaults to 'ecapatdnn_voxceleb12'. + sample_rate (int, optional): model sample rate. + Defaults to 16000. + cfg_path (Optional[os.PathLike], optional): yaml config file path. + Defaults to None. + ckpt_path (Optional[os.PathLike], optional): the pretrained model path, which is stored in the disk. + Defaults to None. + """ + # stage 0: avoid to init the mode again + if hasattr(self, "model"): + logger.info("Model has been initialized") + return + + # stage 1: get the model and config path + # if we want init the network from the model stored in the disk, + # we must pass the config path and the ckpt model path + if cfg_path is None or ckpt_path is None: + # get the mode from pretrained list + sample_rate_str = "16k" if sample_rate == 16000 else "8k" + tag = model_type + "-" + sample_rate_str + logger.info(f"load the pretrained model: {tag}") + # get the model from the pretrained list + # we download the pretrained model and store it in the res_path + res_path = self._get_pretrained_path(tag) + self.res_path = res_path + + self.cfg_path = os.path.join(res_path, + pretrained_models[tag]['cfg_path']) + self.ckpt_path = os.path.join( + res_path, pretrained_models[tag]['ckpt_path'] + '.pdparams') + else: + # get the model from disk + self.cfg_path = os.path.abspath(cfg_path) + self.ckpt_path = os.path.abspath(ckpt_path + ".pdparams") + self.res_path = os.path.dirname( + os.path.dirname(os.path.abspath(self.cfg_path))) + + logger.info(f"start to read the ckpt from {self.ckpt_path}") + logger.info(f"read the config from {self.cfg_path}") + logger.info(f"get the res path {self.res_path}") + + # stage 2: read and config and init the model body + self.config = CfgNode(new_allowed=True) + self.config.merge_from_file(self.cfg_path) + + # stage 3: get the model name to instance the model network with dynamic_import + logger.info("start to dynamic import the model class") + model_name = model_type[:model_type.rindex('_')] + logger.info(f"model name {model_name}") + model_class = dynamic_import(model_name, model_alias) + model_conf = self.config.model + backbone = model_class(**model_conf) + model = SpeakerIdetification( + backbone=backbone, num_class=self.config.num_speakers) + self.model = model + self.model.eval() + + # stage 4: load the model parameters + logger.info("start to set the model parameters to model") + model_dict = paddle.load(self.ckpt_path) + self.model.set_state_dict(model_dict) + + logger.info("create the model instance success") + + @paddle.no_grad() + def infer(self, model_type: str): + """Infer the model to get the embedding + + Args: + model_type (str): speaker verification model type + """ + # stage 0: get the feat and length from _inputs + feats = self._inputs["feats"] + lengths = self._inputs["lengths"] + logger.info("start to do backbone network model forward") + logger.info( + f"feats shape:{feats.shape}, lengths shape: {lengths.shape}") + + # stage 1: get the audio embedding + # embedding from (1, emb_size, 1) -> (emb_size) + embedding = self.model.backbone(feats, lengths).squeeze().numpy() + logger.info(f"embedding size: {embedding.shape}") + + # stage 2: put the embedding and dim info to _outputs property + # the embedding type is numpy.array + self._outputs["embedding"] = embedding + + def postprocess(self) -> Union[str, os.PathLike]: + """Return the audio embedding info + + Returns: + Union[str, os.PathLike]: audio embedding info + """ + embedding = self._outputs["embedding"] + return embedding + + def 
preprocess(self, model_type: str, input_file: Union[str, os.PathLike]): + """Extract the audio feat + + Args: + model_type (str): speaker verification model type + input_file (Union[str, os.PathLike]): audio file path + """ + audio_file = input_file + if isinstance(audio_file, (str, os.PathLike)): + logger.info(f"Preprocess audio file: {audio_file}") + + # stage 1: load the audio sample points + # Note: this process must match the training process + waveform, sr = load_audio(audio_file) + logger.info(f"load the audio sample points, shape is: {waveform.shape}") + + # stage 2: get the audio feat + # Note: Now we only support fbank feature + try: + feat = melspectrogram( + x=waveform, + sr=self.config.sr, + n_mels=self.config.n_mels, + window_size=self.config.window_size, + hop_length=self.config.hop_size) + logger.info(f"extract the audio feat, shape is: {feat.shape}") + except Exception as e: + logger.info(f"feat occurs exception {e}") + sys.exit(-1) + + feat = paddle.to_tensor(feat).unsqueeze(0) + # in inference period, the lengths is all one without padding + lengths = paddle.ones([1]) + + # stage 3: we do feature normalize, + # Now we assume that the feat must do normalize + feat = feature_normalize(feat, mean_norm=True, std_norm=False) + + # stage 4: store the feat and length in the _inputs, + # which will be used in other function + logger.info(f"feats shape: {feat.shape}") + self._inputs["feats"] = feat + self._inputs["lengths"] = lengths + + logger.info("audio extract the feat success") + + def _check(self, audio_file: str, sample_rate: int): + """Check if the model sample match the audio sample rate + + Args: + audio_file (str): audio file path, which will be extracted the embedding + sample_rate (int): the desired model sample rate + + Returns: + bool: return if the audio sample rate matches the model sample rate + """ + self.sample_rate = sample_rate + if self.sample_rate != 16000 and self.sample_rate != 8000: + logger.error( + "invalid sample rate, please input --sr 8000 or --sr 16000") + return False + + if isinstance(audio_file, (str, os.PathLike)): + if not os.path.isfile(audio_file): + logger.error("Please input the right audio file path") + return False + + logger.info("checking the aduio file format......") + try: + audio, audio_sample_rate = soundfile.read( + audio_file, dtype="float32", always_2d=True) + except Exception as e: + logger.exception(e) + logger.error( + "can not open the audio file, please check the audio file format is 'wav'. \n \ + you can try to use sox to change the file format.\n \ + For example: \n \ + sample rate: 16k \n \ + sox input_audio.xx --rate 16k --bits 16 --channels 1 output_audio.wav \n \ + sample rate: 8k \n \ + sox input_audio.xx --rate 8k --bits 16 --channels 1 output_audio.wav \n \ + ") + return False + + logger.info(f"The sample rate is {audio_sample_rate}") + + if audio_sample_rate != self.sample_rate: + logger.error("The sample rate of the input file is not {}.\n \ + The program will resample the wav file to {}.\n \ + If the result does not meet your expectations,\n \ + Please input the 16k 16 bit 1 channel wav file. 
\ + ".format(self.sample_rate, self.sample_rate)) + sys.exit(-1) + else: + logger.info("The audio file format is right") + + return True diff --git a/ernie-sat/paddlespeech/cls/__init__.py b/ernie-sat/paddlespeech/cls/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..185a92b8d94d3426d616c0624f0f2ee04339349e --- /dev/null +++ b/ernie-sat/paddlespeech/cls/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/ernie-sat/paddlespeech/cls/exps/__init__.py b/ernie-sat/paddlespeech/cls/exps/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..185a92b8d94d3426d616c0624f0f2ee04339349e --- /dev/null +++ b/ernie-sat/paddlespeech/cls/exps/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/ernie-sat/paddlespeech/cls/exps/panns/__init__.py b/ernie-sat/paddlespeech/cls/exps/panns/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..185a92b8d94d3426d616c0624f0f2ee04339349e --- /dev/null +++ b/ernie-sat/paddlespeech/cls/exps/panns/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/ernie-sat/paddlespeech/cls/exps/panns/deploy/__init__.py b/ernie-sat/paddlespeech/cls/exps/panns/deploy/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..185a92b8d94d3426d616c0624f0f2ee04339349e --- /dev/null +++ b/ernie-sat/paddlespeech/cls/exps/panns/deploy/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/ernie-sat/paddlespeech/cls/exps/panns/deploy/predict.py b/ernie-sat/paddlespeech/cls/exps/panns/deploy/predict.py new file mode 100644 index 0000000000000000000000000000000000000000..d4e5c22fb12b6453ba6ef6e4192f6a9442b960a9 --- /dev/null +++ b/ernie-sat/paddlespeech/cls/exps/panns/deploy/predict.py @@ -0,0 +1,145 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse +import os + +import numpy as np +from paddle import inference +from scipy.special import softmax + +from paddleaudio.backends import load as load_audio +from paddleaudio.datasets import ESC50 +from paddleaudio.features import melspectrogram + +# yapf: disable +parser = argparse.ArgumentParser() +parser.add_argument("--model_dir", type=str, required=True, default="./export", help="The directory to static model.") +parser.add_argument('--device', choices=['cpu', 'gpu', 'xpu'], default="gpu", help="Select which device to train model, defaults to gpu.") +parser.add_argument("--wav", type=str, required=True, help="Audio file to infer.") +parser.add_argument("--batch_size", type=int, default=1, help="Batch size per GPU/CPU for training.") +parser.add_argument('--use_tensorrt', type=eval, default=False, choices=[True, False], help='Enable to use tensorrt to speed up.') +parser.add_argument("--precision", type=str, default="fp32", choices=["fp32", "fp16"], help='The tensorrt precision.') +parser.add_argument('--cpu_threads', type=int, default=10, help='Number of threads to predict when using cpu.') +parser.add_argument('--enable_mkldnn', type=eval, default=False, choices=[True, False], help='Enable to use mkldnn to speed up when using cpu.') +parser.add_argument("--log_dir", type=str, default="./log", help="The path to save log.") +args = parser.parse_args() +# yapf: enable + + +def extract_features(files: str, **kwargs): + waveforms = [] + srs = [] + max_length = float('-inf') + for file in files: + waveform, sr = load_audio(file, sr=None) + max_length = max(max_length, len(waveform)) + waveforms.append(waveform) + srs.append(sr) + + feats = [] + for i in range(len(waveforms)): + # padding + if len(waveforms[i]) < max_length: + pad_width = max_length - len(waveforms[i]) + waveforms[i] = np.pad(waveforms[i], pad_width=(0, pad_width)) + + feat = melspectrogram(waveforms[i], sr, **kwargs).transpose() + feats.append(feat) + + return np.stack(feats, axis=0) + + +class Predictor(object): + def __init__(self, + model_dir, + device="gpu", + batch_size=1, + use_tensorrt=False, + precision="fp32", + cpu_threads=10, + enable_mkldnn=False): + 
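+        # Keep the batch size and build a Paddle Inference predictor configured for
+        # the requested device (GPU / CPU / XPU), with optional TensorRT and MKLDNN
+        # acceleration; the remaining arguments below only tune that engine.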
self.batch_size = batch_size + + model_file = os.path.join(model_dir, "inference.pdmodel") + params_file = os.path.join(model_dir, "inference.pdiparams") + + assert os.path.isfile(model_file) and os.path.isfile( + params_file), 'Please check model and parameter files.' + + config = inference.Config(model_file, params_file) + if device == "gpu": + # set GPU configs accordingly + # such as intialize the gpu memory, enable tensorrt + config.enable_use_gpu(100, 0) + precision_map = { + "fp16": inference.PrecisionType.Half, + "fp32": inference.PrecisionType.Float32, + } + precision_mode = precision_map[precision] + + if use_tensorrt: + config.enable_tensorrt_engine( + max_batch_size=batch_size, + min_subgraph_size=30, + precision_mode=precision_mode) + elif device == "cpu": + # set CPU configs accordingly, + # such as enable_mkldnn, set_cpu_math_library_num_threads + config.disable_gpu() + if enable_mkldnn: + # cache 10 different shapes for mkldnn to avoid memory leak + config.set_mkldnn_cache_capacity(10) + config.enable_mkldnn() + config.set_cpu_math_library_num_threads(cpu_threads) + elif device == "xpu": + # set XPU configs accordingly + config.enable_xpu(100) + + config.switch_use_feed_fetch_ops(False) + self.predictor = inference.create_predictor(config) + self.input_handles = [ + self.predictor.get_input_handle(name) + for name in self.predictor.get_input_names() + ] + self.output_handle = self.predictor.get_output_handle( + self.predictor.get_output_names()[0]) + + def predict(self, wavs): + feats = extract_features(wavs) + + self.input_handles[0].copy_from_cpu(feats) + self.predictor.run() + logits = self.output_handle.copy_to_cpu() + probs = softmax(logits, axis=1) + indices = np.argmax(probs, axis=1) + + return indices + + +if __name__ == "__main__": + # Define predictor to do prediction. + predictor = Predictor(args.model_dir, args.device, args.batch_size, + args.use_tensorrt, args.precision, args.cpu_threads, + args.enable_mkldnn) + + wavs = [args.wav] + + for i in range(len(wavs)): + wavs[i] = os.path.abspath(os.path.expanduser(wavs[i])) + assert os.path.isfile( + wavs[i]), f'Please check input wave file: {wavs[i]}' + + results = predictor.predict(wavs) + for idx, wav in enumerate(wavs): + print(f'Wav: {wav} \t Label: {ESC50.label_list[results[idx]]}') diff --git a/ernie-sat/paddlespeech/cls/exps/panns/export_model.py b/ernie-sat/paddlespeech/cls/exps/panns/export_model.py new file mode 100644 index 0000000000000000000000000000000000000000..c295c6a33838b086480ddc4e681341cbd023d560 --- /dev/null +++ b/ernie-sat/paddlespeech/cls/exps/panns/export_model.py @@ -0,0 +1,45 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
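+# Export a fine-tuned PANNs sound classifier to a static-graph inference model.
+# Example invocation (paths are illustrative, not part of the original script):
+#   python export_model.py \
+#       --checkpoint ./checkpoint/epoch_50/model.pdparams \
+#       --output_dir ./export
+# paddle.jit.save below writes export/inference.pdmodel and export/inference.pdiparams,
+# which deploy/predict.py then loads through the Paddle Inference API.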
+import argparse +import os + +import paddle + +from paddleaudio.datasets import ESC50 +from paddlespeech.cls.models import cnn14 +from paddlespeech.cls.models import SoundClassifier + +# yapf: disable +parser = argparse.ArgumentParser(__doc__) +parser.add_argument("--checkpoint", type=str, required=True, help="Checkpoint of model.") +parser.add_argument("--output_dir", type=str, default='./export', help="Path to save static model and its parameters.") +args = parser.parse_args() +# yapf: enable + +if __name__ == '__main__': + model = SoundClassifier( + backbone=cnn14(pretrained=False, extract_embedding=True), + num_class=len(ESC50.label_list)) + model.set_state_dict(paddle.load(args.checkpoint)) + model.eval() + + model = paddle.jit.to_static( + model, + input_spec=[ + paddle.static.InputSpec( + shape=[None, None, 64], dtype=paddle.float32) + ]) + + # Save in static graph model. + paddle.jit.save(model, os.path.join(args.output_dir, "inference")) diff --git a/ernie-sat/paddlespeech/cls/exps/panns/predict.py b/ernie-sat/paddlespeech/cls/exps/panns/predict.py new file mode 100644 index 0000000000000000000000000000000000000000..ffe42d3904bc9b62c678b623147de320b287c071 --- /dev/null +++ b/ernie-sat/paddlespeech/cls/exps/panns/predict.py @@ -0,0 +1,72 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
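+# Predict tags for a single audio file; everything is driven by a YAML config whose
+# sections match the keys read below:
+#   model      -> backbone import path, resolved via dynamic_import
+#   data       -> dataset import path (only its label_list is used here)
+#   feature    -> kwargs for LogMelSpectrogram (must include 'sr')
+#   predicting -> checkpoint, audio_file and top_k
+# Illustrative launch (config path is an assumption, not shipped with this diff):
+#   python predict.py --cfg_path ./cfg/panns/cnn14.yaml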
+import argparse +import os + +import paddle +import paddle.nn.functional as F +import yaml + +from paddleaudio.backends import load as load_audio +from paddleaudio.features import LogMelSpectrogram +from paddleaudio.utils import logger +from paddlespeech.cls.models import SoundClassifier +from paddlespeech.s2t.utils.dynamic_import import dynamic_import + +# yapf: disable +parser = argparse.ArgumentParser(__doc__) +parser.add_argument("--cfg_path", type=str, required=True) +args = parser.parse_args() +# yapf: enable + + +def extract_features(file: str, **feat_conf) -> paddle.Tensor: + file = os.path.abspath(os.path.expanduser(file)) + waveform, _ = load_audio(file, sr=feat_conf['sr']) + feature_extractor = LogMelSpectrogram(**feat_conf) + feat = feature_extractor(paddle.to_tensor(waveform).unsqueeze(0)) + feat = paddle.transpose(feat, [0, 2, 1]) + return feat + + +if __name__ == '__main__': + + args.cfg_path = os.path.abspath(os.path.expanduser(args.cfg_path)) + with open(args.cfg_path, 'r') as f: + config = yaml.safe_load(f) + + model_conf = config['model'] + data_conf = config['data'] + feat_conf = config['feature'] + predicting_conf = config['predicting'] + + ds_class = dynamic_import(data_conf['dataset']) + backbone_class = dynamic_import(model_conf['backbone']) + + model = SoundClassifier( + backbone=backbone_class(pretrained=False, extract_embedding=True), + num_class=len(ds_class.label_list)) + model.set_state_dict(paddle.load(predicting_conf['checkpoint'])) + model.eval() + + feat = extract_features(predicting_conf['audio_file'], **feat_conf) + logits = model(feat) + probs = F.softmax(logits, axis=1).numpy() + + sorted_indices = (-probs[0]).argsort() + + msg = f"[{predicting_conf['audio_file']}]\n" + for idx in sorted_indices[:predicting_conf['top_k']]: + msg += f'{ds_class.label_list[idx]}: {probs[0][idx]}\n' + logger.info(msg) diff --git a/ernie-sat/paddlespeech/cls/exps/panns/train.py b/ernie-sat/paddlespeech/cls/exps/panns/train.py new file mode 100644 index 0000000000000000000000000000000000000000..7e292214827cdbbaaed51a51c175331cd159b098 --- /dev/null +++ b/ernie-sat/paddlespeech/cls/exps/panns/train.py @@ -0,0 +1,172 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
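+# Fine-tune a PANNs backbone with a SoundClassifier head, driven by the same style
+# of YAML config (model / data / feature / training sections read below).
+# An illustrative multi-card launch (the config path is an assumption):
+#   python -m paddle.distributed.launch --gpus "0,1" train.py --cfg_path ./cfg/panns/cnn14.yaml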
+import argparse +import os + +import paddle +import yaml + +from paddleaudio.features import LogMelSpectrogram +from paddleaudio.utils import logger +from paddleaudio.utils import Timer +from paddlespeech.cls.models import SoundClassifier +from paddlespeech.s2t.utils.dynamic_import import dynamic_import + +# yapf: disable +parser = argparse.ArgumentParser(__doc__) +parser.add_argument("--cfg_path", type=str, required=True) +args = parser.parse_args() +# yapf: enable + +if __name__ == "__main__": + nranks = paddle.distributed.get_world_size() + if paddle.distributed.get_world_size() > 1: + paddle.distributed.init_parallel_env() + local_rank = paddle.distributed.get_rank() + + args.cfg_path = os.path.abspath(os.path.expanduser(args.cfg_path)) + with open(args.cfg_path, 'r') as f: + config = yaml.safe_load(f) + + model_conf = config['model'] + data_conf = config['data'] + feat_conf = config['feature'] + training_conf = config['training'] + + # Dataset + ds_class = dynamic_import(data_conf['dataset']) + train_ds = ds_class(**data_conf['train']) + dev_ds = ds_class(**data_conf['dev']) + train_sampler = paddle.io.DistributedBatchSampler( + train_ds, + batch_size=training_conf['batch_size'], + shuffle=True, + drop_last=False) + train_loader = paddle.io.DataLoader( + train_ds, + batch_sampler=train_sampler, + num_workers=training_conf['num_workers'], + return_list=True, + use_buffer_reader=True, ) + + # Feature + feature_extractor = LogMelSpectrogram(**feat_conf) + + # Model + backbone_class = dynamic_import(model_conf['backbone']) + backbone = backbone_class(pretrained=True, extract_embedding=True) + model = SoundClassifier(backbone, num_class=data_conf['num_classes']) + model = paddle.DataParallel(model) + optimizer = paddle.optimizer.Adam( + learning_rate=training_conf['learning_rate'], + parameters=model.parameters()) + criterion = paddle.nn.loss.CrossEntropyLoss() + + steps_per_epoch = len(train_sampler) + timer = Timer(steps_per_epoch * training_conf['epochs']) + timer.start() + + for epoch in range(1, training_conf['epochs'] + 1): + model.train() + + avg_loss = 0 + num_corrects = 0 + num_samples = 0 + for batch_idx, batch in enumerate(train_loader): + waveforms, labels = batch + feats = feature_extractor( + waveforms + ) # Need a padding when lengths of waveforms differ in a batch. 
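+            # As noted above, feature_extractor expects a [N, T] waveform batch; clips
+            # of unequal length must be padded to a common length (e.g. in the dataset
+            # or a custom collate function) before this call.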
+ feats = paddle.transpose(feats, [0, 2, 1]) # To [N, length, n_mels] + + logits = model(feats) + + loss = criterion(logits, labels) + loss.backward() + optimizer.step() + if isinstance(optimizer._learning_rate, + paddle.optimizer.lr.LRScheduler): + optimizer._learning_rate.step() + optimizer.clear_grad() + + # Calculate loss + avg_loss += loss.numpy()[0] + + # Calculate metrics + preds = paddle.argmax(logits, axis=1) + num_corrects += (preds == labels).numpy().sum() + num_samples += feats.shape[0] + + timer.count() + + if (batch_idx + 1 + ) % training_conf['log_freq'] == 0 and local_rank == 0: + lr = optimizer.get_lr() + avg_loss /= training_conf['log_freq'] + avg_acc = num_corrects / num_samples + + print_msg = 'Epoch={}/{}, Step={}/{}'.format( + epoch, training_conf['epochs'], batch_idx + 1, + steps_per_epoch) + print_msg += ' loss={:.4f}'.format(avg_loss) + print_msg += ' acc={:.4f}'.format(avg_acc) + print_msg += ' lr={:.6f} step/sec={:.2f} | ETA {}'.format( + lr, timer.timing, timer.eta) + logger.train(print_msg) + + avg_loss = 0 + num_corrects = 0 + num_samples = 0 + + if epoch % training_conf[ + 'save_freq'] == 0 and batch_idx + 1 == steps_per_epoch and local_rank == 0: + dev_sampler = paddle.io.BatchSampler( + dev_ds, + batch_size=training_conf['batch_size'], + shuffle=False, + drop_last=False) + dev_loader = paddle.io.DataLoader( + dev_ds, + batch_sampler=dev_sampler, + num_workers=training_conf['num_workers'], + return_list=True, ) + + model.eval() + num_corrects = 0 + num_samples = 0 + with logger.processing('Evaluation on validation dataset'): + for batch_idx, batch in enumerate(dev_loader): + waveforms, labels = batch + feats = feature_extractor(waveforms) + feats = paddle.transpose(feats, [0, 2, 1]) + + logits = model(feats) + + preds = paddle.argmax(logits, axis=1) + num_corrects += (preds == labels).numpy().sum() + num_samples += feats.shape[0] + + print_msg = '[Evaluation result]' + print_msg += ' dev_acc={:.4f}'.format(num_corrects / num_samples) + + logger.eval(print_msg) + + # Save model + save_dir = os.path.join(training_conf['checkpoint_dir'], + 'epoch_{}'.format(epoch)) + logger.info('Saving model checkpoint to {}'.format(save_dir)) + paddle.save(model.state_dict(), + os.path.join(save_dir, 'model.pdparams')) + paddle.save(optimizer.state_dict(), + os.path.join(save_dir, 'model.pdopt')) diff --git a/ernie-sat/paddlespeech/cls/models/__init__.py b/ernie-sat/paddlespeech/cls/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..4bfadda11968aa2262d810941761dca6838a0d79 --- /dev/null +++ b/ernie-sat/paddlespeech/cls/models/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
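+# Re-export the PANNs backbones (cnn6 / cnn10 / cnn14) and the SoundClassifier head,
+# so callers can simply `from paddlespeech.cls.models import cnn14, SoundClassifier`.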
+from .panns import * diff --git a/ernie-sat/paddlespeech/cls/models/panns/__init__.py b/ernie-sat/paddlespeech/cls/models/panns/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..638f772f9e99c4fa910cbcee9333c6772024ed8a --- /dev/null +++ b/ernie-sat/paddlespeech/cls/models/panns/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from .classifier import * +from .panns import * diff --git a/ernie-sat/paddlespeech/cls/models/panns/classifier.py b/ernie-sat/paddlespeech/cls/models/panns/classifier.py new file mode 100644 index 0000000000000000000000000000000000000000..df64158ff0980a9f38017811a4ee9292a5563b00 --- /dev/null +++ b/ernie-sat/paddlespeech/cls/models/panns/classifier.py @@ -0,0 +1,36 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import paddle.nn as nn + + +class SoundClassifier(nn.Layer): + """ + Model for sound classification which uses panns pretrained models to extract + embeddings from audio files. + """ + + def __init__(self, backbone, num_class, dropout=0.1): + super(SoundClassifier, self).__init__() + self.backbone = backbone + self.dropout = nn.Dropout(dropout) + self.fc = nn.Linear(self.backbone.emb_size, num_class) + + def forward(self, x): + # x: (batch_size, num_frames, num_melbins) -> (batch_size, 1, num_frames, num_melbins) + x = x.unsqueeze(1) + x = self.backbone(x) + x = self.dropout(x) + logits = self.fc(x) + + return logits diff --git a/ernie-sat/paddlespeech/cls/models/panns/panns.py b/ernie-sat/paddlespeech/cls/models/panns/panns.py new file mode 100644 index 0000000000000000000000000000000000000000..6d2dac56ac23d9b3322e49703f98e15faf936fd0 --- /dev/null +++ b/ernie-sat/paddlespeech/cls/models/panns/panns.py @@ -0,0 +1,309 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
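+# CNN6 / CNN10 / CNN14 audio backbones from the PANNs paper, with optional loading
+# of AudioSet-pretrained weights. A minimal fine-tuning sketch (mirroring
+# export_model.py; the class count of 50 assumes the ESC-50 dataset):
+#   backbone = cnn14(pretrained=True, extract_embedding=True)
+#   model = SoundClassifier(backbone, num_class=50)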
+import os + +import paddle.nn as nn +import paddle.nn.functional as F + +from paddleaudio.utils.download import load_state_dict_from_url +from paddleaudio.utils.env import MODEL_HOME + +__all__ = ['CNN14', 'CNN10', 'CNN6', 'cnn14', 'cnn10', 'cnn6'] + +pretrained_model_urls = { + 'cnn14': 'https://bj.bcebos.com/paddleaudio/models/panns_cnn14.pdparams', + 'cnn10': 'https://bj.bcebos.com/paddleaudio/models/panns_cnn10.pdparams', + 'cnn6': 'https://bj.bcebos.com/paddleaudio/models/panns_cnn6.pdparams', +} + + +class ConvBlock(nn.Layer): + def __init__(self, in_channels, out_channels): + super(ConvBlock, self).__init__() + + self.conv1 = nn.Conv2D( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=(3, 3), + stride=(1, 1), + padding=(1, 1), + bias_attr=False) + self.conv2 = nn.Conv2D( + in_channels=out_channels, + out_channels=out_channels, + kernel_size=(3, 3), + stride=(1, 1), + padding=(1, 1), + bias_attr=False) + self.bn1 = nn.BatchNorm2D(out_channels) + self.bn2 = nn.BatchNorm2D(out_channels) + + def forward(self, x, pool_size=(2, 2), pool_type='avg'): + x = self.conv1(x) + x = self.bn1(x) + x = F.relu(x) + + x = self.conv2(x) + x = self.bn2(x) + x = F.relu(x) + + if pool_type == 'max': + x = F.max_pool2d(x, kernel_size=pool_size) + elif pool_type == 'avg': + x = F.avg_pool2d(x, kernel_size=pool_size) + elif pool_type == 'avg+max': + x = F.avg_pool2d( + x, kernel_size=pool_size) + F.max_pool2d( + x, kernel_size=pool_size) + else: + raise Exception( + f'Pooling type of {pool_type} is not supported. It must be one of "max", "avg" and "avg+max".' + ) + return x + + +class ConvBlock5x5(nn.Layer): + def __init__(self, in_channels, out_channels): + super(ConvBlock5x5, self).__init__() + + self.conv1 = nn.Conv2D( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=(5, 5), + stride=(1, 1), + padding=(2, 2), + bias_attr=False) + self.bn1 = nn.BatchNorm2D(out_channels) + + def forward(self, x, pool_size=(2, 2), pool_type='avg'): + x = self.conv1(x) + x = self.bn1(x) + x = F.relu(x) + + if pool_type == 'max': + x = F.max_pool2d(x, kernel_size=pool_size) + elif pool_type == 'avg': + x = F.avg_pool2d(x, kernel_size=pool_size) + elif pool_type == 'avg+max': + x = F.avg_pool2d( + x, kernel_size=pool_size) + F.max_pool2d( + x, kernel_size=pool_size) + else: + raise Exception( + f'Pooling type of {pool_type} is not supported. It must be one of "max", "avg" and "avg+max".' + ) + return x + + +class CNN14(nn.Layer): + """ + The CNN14(14-layer CNNs) mainly consist of 6 convolutional blocks while each convolutional + block consists of 2 convolutional layers with a kernel size of 3 × 3. 
+ + Reference: + PANNs: Large-Scale Pretrained Audio Neural Networks for Audio Pattern Recognition + https://arxiv.org/pdf/1912.10211.pdf + """ + emb_size = 2048 + + def __init__(self, extract_embedding: bool=True): + + super(CNN14, self).__init__() + self.bn0 = nn.BatchNorm2D(64) + self.conv_block1 = ConvBlock(in_channels=1, out_channels=64) + self.conv_block2 = ConvBlock(in_channels=64, out_channels=128) + self.conv_block3 = ConvBlock(in_channels=128, out_channels=256) + self.conv_block4 = ConvBlock(in_channels=256, out_channels=512) + self.conv_block5 = ConvBlock(in_channels=512, out_channels=1024) + self.conv_block6 = ConvBlock(in_channels=1024, out_channels=2048) + + self.fc1 = nn.Linear(2048, self.emb_size) + self.fc_audioset = nn.Linear(self.emb_size, 527) + self.extract_embedding = extract_embedding + + def forward(self, x): + x.stop_gradient = False + x = x.transpose([0, 3, 2, 1]) + x = self.bn0(x) + x = x.transpose([0, 3, 2, 1]) + + x = self.conv_block1(x, pool_size=(2, 2), pool_type='avg') + x = F.dropout(x, p=0.2, training=self.training) + + x = self.conv_block2(x, pool_size=(2, 2), pool_type='avg') + x = F.dropout(x, p=0.2, training=self.training) + + x = self.conv_block3(x, pool_size=(2, 2), pool_type='avg') + x = F.dropout(x, p=0.2, training=self.training) + + x = self.conv_block4(x, pool_size=(2, 2), pool_type='avg') + x = F.dropout(x, p=0.2, training=self.training) + + x = self.conv_block5(x, pool_size=(2, 2), pool_type='avg') + x = F.dropout(x, p=0.2, training=self.training) + + x = self.conv_block6(x, pool_size=(1, 1), pool_type='avg') + x = F.dropout(x, p=0.2, training=self.training) + + x = x.mean(axis=3) + x = x.max(axis=2) + x.mean(axis=2) + + x = F.dropout(x, p=0.5, training=self.training) + x = F.relu(self.fc1(x)) + + if self.extract_embedding: + output = F.dropout(x, p=0.5, training=self.training) + else: + output = F.sigmoid(self.fc_audioset(x)) + return output + + +class CNN10(nn.Layer): + """ + The CNN10(14-layer CNNs) mainly consist of 4 convolutional blocks while each convolutional + block consists of 2 convolutional layers with a kernel size of 3 × 3. 
+ + Reference: + PANNs: Large-Scale Pretrained Audio Neural Networks for Audio Pattern Recognition + https://arxiv.org/pdf/1912.10211.pdf + """ + emb_size = 512 + + def __init__(self, extract_embedding: bool=True): + + super(CNN10, self).__init__() + self.bn0 = nn.BatchNorm2D(64) + self.conv_block1 = ConvBlock(in_channels=1, out_channels=64) + self.conv_block2 = ConvBlock(in_channels=64, out_channels=128) + self.conv_block3 = ConvBlock(in_channels=128, out_channels=256) + self.conv_block4 = ConvBlock(in_channels=256, out_channels=512) + + self.fc1 = nn.Linear(512, self.emb_size) + self.fc_audioset = nn.Linear(self.emb_size, 527) + self.extract_embedding = extract_embedding + + def forward(self, x): + x.stop_gradient = False + x = x.transpose([0, 3, 2, 1]) + x = self.bn0(x) + x = x.transpose([0, 3, 2, 1]) + + x = self.conv_block1(x, pool_size=(2, 2), pool_type='avg') + x = F.dropout(x, p=0.2, training=self.training) + + x = self.conv_block2(x, pool_size=(2, 2), pool_type='avg') + x = F.dropout(x, p=0.2, training=self.training) + + x = self.conv_block3(x, pool_size=(2, 2), pool_type='avg') + x = F.dropout(x, p=0.2, training=self.training) + + x = self.conv_block4(x, pool_size=(2, 2), pool_type='avg') + x = F.dropout(x, p=0.2, training=self.training) + + x = x.mean(axis=3) + x = x.max(axis=2) + x.mean(axis=2) + + x = F.dropout(x, p=0.5, training=self.training) + x = F.relu(self.fc1(x)) + + if self.extract_embedding: + output = F.dropout(x, p=0.5, training=self.training) + else: + output = F.sigmoid(self.fc_audioset(x)) + return output + + +class CNN6(nn.Layer): + """ + The CNN14(14-layer CNNs) mainly consist of 4 convolutional blocks while each convolutional + block consists of 1 convolutional layers with a kernel size of 5 × 5. + + Reference: + PANNs: Large-Scale Pretrained Audio Neural Networks for Audio Pattern Recognition + https://arxiv.org/pdf/1912.10211.pdf + """ + emb_size = 512 + + def __init__(self, extract_embedding: bool=True): + + super(CNN6, self).__init__() + self.bn0 = nn.BatchNorm2D(64) + self.conv_block1 = ConvBlock5x5(in_channels=1, out_channels=64) + self.conv_block2 = ConvBlock5x5(in_channels=64, out_channels=128) + self.conv_block3 = ConvBlock5x5(in_channels=128, out_channels=256) + self.conv_block4 = ConvBlock5x5(in_channels=256, out_channels=512) + + self.fc1 = nn.Linear(512, self.emb_size) + self.fc_audioset = nn.Linear(self.emb_size, 527) + self.extract_embedding = extract_embedding + + def forward(self, x): + x.stop_gradient = False + x = x.transpose([0, 3, 2, 1]) + x = self.bn0(x) + x = x.transpose([0, 3, 2, 1]) + + x = self.conv_block1(x, pool_size=(2, 2), pool_type='avg') + x = F.dropout(x, p=0.2, training=self.training) + + x = self.conv_block2(x, pool_size=(2, 2), pool_type='avg') + x = F.dropout(x, p=0.2, training=self.training) + + x = self.conv_block3(x, pool_size=(2, 2), pool_type='avg') + x = F.dropout(x, p=0.2, training=self.training) + + x = self.conv_block4(x, pool_size=(2, 2), pool_type='avg') + x = F.dropout(x, p=0.2, training=self.training) + + x = x.mean(axis=3) + x = x.max(axis=2) + x.mean(axis=2) + + x = F.dropout(x, p=0.5, training=self.training) + x = F.relu(self.fc1(x)) + + if self.extract_embedding: + output = F.dropout(x, p=0.5, training=self.training) + else: + output = F.sigmoid(self.fc_audioset(x)) + return output + + +def cnn14(pretrained: bool=False, extract_embedding: bool=True) -> CNN14: + model = CNN14(extract_embedding=extract_embedding) + if pretrained: + state_dict = load_state_dict_from_url( + url=pretrained_model_urls['cnn14'], + 
path=os.path.join(MODEL_HOME, 'panns')) + model.set_state_dict(state_dict) + return model + + +def cnn10(pretrained: bool=False, extract_embedding: bool=True) -> CNN10: + model = CNN10(extract_embedding=extract_embedding) + if pretrained: + state_dict = load_state_dict_from_url( + url=pretrained_model_urls['cnn10'], + path=os.path.join(MODEL_HOME, 'panns')) + model.set_state_dict(state_dict) + return model + + +def cnn6(pretrained: bool=False, extract_embedding: bool=True) -> CNN6: + model = CNN6(extract_embedding=extract_embedding) + if pretrained: + state_dict = load_state_dict_from_url( + url=pretrained_model_urls['cnn6'], + path=os.path.join(MODEL_HOME, 'panns')) + model.set_state_dict(state_dict) + return model diff --git a/ernie-sat/paddlespeech/s2t/__init__.py b/ernie-sat/paddlespeech/s2t/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..855ceef96f5fced0bb2f1299bc011fe1fa663ec3 --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/__init__.py @@ -0,0 +1,507 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Any +from typing import List +from typing import Tuple +from typing import Union + +import paddle +from paddle import nn +from paddle.fluid import core +from paddle.nn import functional as F + +from paddlespeech.s2t.utils.log import Log + +#TODO(Hui Zhang): remove fluid import +logger = Log(__name__).getlog() + +########### hcak logging ############# +logger.warn = logger.warning + +########### hcak paddle ############# +paddle.half = 'float16' +paddle.float = 'float32' +paddle.double = 'float64' +paddle.short = 'int16' +paddle.int = 'int32' +paddle.long = 'int64' +paddle.uint16 = 'uint16' +paddle.cdouble = 'complex128' + + +def convert_dtype_to_string(tensor_dtype): + """ + Convert the data type in numpy to the data type in Paddle + Args: + tensor_dtype(core.VarDesc.VarType): the data type in numpy. + Returns: + core.VarDesc.VarType: the data type in Paddle. 
+ """ + dtype = tensor_dtype + if dtype == core.VarDesc.VarType.FP32: + return paddle.float32 + elif dtype == core.VarDesc.VarType.FP64: + return paddle.float64 + elif dtype == core.VarDesc.VarType.FP16: + return paddle.float16 + elif dtype == core.VarDesc.VarType.INT32: + return paddle.int32 + elif dtype == core.VarDesc.VarType.INT16: + return paddle.int16 + elif dtype == core.VarDesc.VarType.INT64: + return paddle.int64 + elif dtype == core.VarDesc.VarType.BOOL: + return paddle.bool + elif dtype == core.VarDesc.VarType.BF16: + # since there is still no support for bfloat16 in NumPy, + # uint16 is used for casting bfloat16 + return paddle.uint16 + elif dtype == core.VarDesc.VarType.UINT8: + return paddle.uint8 + elif dtype == core.VarDesc.VarType.INT8: + return paddle.int8 + elif dtype == core.VarDesc.VarType.COMPLEX64: + return paddle.complex64 + elif dtype == core.VarDesc.VarType.COMPLEX128: + return paddle.complex128 + else: + raise ValueError("Not supported tensor dtype %s" % dtype) + + +if not hasattr(paddle, 'softmax'): + logger.debug("register user softmax to paddle, remove this when fixed!") + setattr(paddle, 'softmax', paddle.nn.functional.softmax) + +if not hasattr(paddle, 'log_softmax'): + logger.debug("register user log_softmax to paddle, remove this when fixed!") + setattr(paddle, 'log_softmax', paddle.nn.functional.log_softmax) + +if not hasattr(paddle, 'sigmoid'): + logger.debug("register user sigmoid to paddle, remove this when fixed!") + setattr(paddle, 'sigmoid', paddle.nn.functional.sigmoid) + +if not hasattr(paddle, 'log_sigmoid'): + logger.debug("register user log_sigmoid to paddle, remove this when fixed!") + setattr(paddle, 'log_sigmoid', paddle.nn.functional.log_sigmoid) + +if not hasattr(paddle, 'relu'): + logger.debug("register user relu to paddle, remove this when fixed!") + setattr(paddle, 'relu', paddle.nn.functional.relu) + + +def cat(xs, dim=0): + return paddle.concat(xs, axis=dim) + + +if not hasattr(paddle, 'cat'): + logger.debug( + "override cat of paddle if exists or register, remove this when fixed!") + paddle.cat = cat + + +########### hcak paddle.Tensor ############# +def item(x: paddle.Tensor): + return x.numpy().item() + + +if not hasattr(paddle.Tensor, 'item'): + logger.debug( + "override item of paddle.Tensor if exists or register, remove this when fixed!" + ) + paddle.Tensor.item = item + + +def func_long(x: paddle.Tensor): + return paddle.cast(x, paddle.long) + + +if not hasattr(paddle.Tensor, 'long'): + logger.debug( + "override long of paddle.Tensor if exists or register, remove this when fixed!" + ) + paddle.Tensor.long = func_long + +if not hasattr(paddle.Tensor, 'numel'): + logger.debug( + "override numel of paddle.Tensor if exists or register, remove this when fixed!" + ) + paddle.Tensor.numel = paddle.numel + + +def new_full(x: paddle.Tensor, + size: Union[List[int], Tuple[int], paddle.Tensor], + fill_value: Union[float, int, bool, paddle.Tensor], + dtype=None): + return paddle.full(size, fill_value, dtype=x.dtype) + + +if not hasattr(paddle.Tensor, 'new_full'): + logger.debug( + "override new_full of paddle.Tensor if exists or register, remove this when fixed!" 
+ ) + paddle.Tensor.new_full = new_full + + +def eq(xs: paddle.Tensor, ys: Union[paddle.Tensor, float]) -> paddle.Tensor: + if convert_dtype_to_string(xs.dtype) == paddle.bool: + xs = xs.astype(paddle.int) + return xs.equal( + paddle.to_tensor( + ys, dtype=convert_dtype_to_string(xs.dtype), place=xs.place)) + + +if not hasattr(paddle.Tensor, 'eq'): + logger.debug( + "override eq of paddle.Tensor if exists or register, remove this when fixed!" + ) + paddle.Tensor.eq = eq + +if not hasattr(paddle, 'eq'): + logger.debug( + "override eq of paddle if exists or register, remove this when fixed!") + paddle.eq = eq + + +def contiguous(xs: paddle.Tensor) -> paddle.Tensor: + return xs + + +if not hasattr(paddle.Tensor, 'contiguous'): + logger.debug( + "override contiguous of paddle.Tensor if exists or register, remove this when fixed!" + ) + paddle.Tensor.contiguous = contiguous + + +def size(xs: paddle.Tensor, *args: int) -> paddle.Tensor: + nargs = len(args) + assert (nargs <= 1) + s = paddle.shape(xs) + if nargs == 1: + return s[args[0]] + else: + return s + + +#`to_static` do not process `size` property, maybe some `paddle` api dependent on it. +logger.debug( + "override size of paddle.Tensor " + "(`to_static` do not process `size` property, maybe some `paddle` api dependent on it), remove this when fixed!" +) +paddle.Tensor.size = size + + +def view(xs: paddle.Tensor, *args: int) -> paddle.Tensor: + return xs.reshape(args) + + +if not hasattr(paddle.Tensor, 'view'): + logger.debug("register user view to paddle.Tensor, remove this when fixed!") + paddle.Tensor.view = view + + +def view_as(xs: paddle.Tensor, ys: paddle.Tensor) -> paddle.Tensor: + return xs.reshape(ys.size()) + + +if not hasattr(paddle.Tensor, 'view_as'): + logger.debug( + "register user view_as to paddle.Tensor, remove this when fixed!") + paddle.Tensor.view_as = view_as + + +def is_broadcastable(shp1, shp2): + for a, b in zip(shp1[::-1], shp2[::-1]): + if a == 1 or b == 1 or a == b: + pass + else: + return False + return True + + +def masked_fill(xs: paddle.Tensor, + mask: paddle.Tensor, + value: Union[float, int]): + assert is_broadcastable(xs.shape, mask.shape) is True, (xs.shape, + mask.shape) + bshape = paddle.broadcast_shape(xs.shape, mask.shape) + mask = mask.broadcast_to(bshape) + trues = paddle.ones_like(xs) * value + xs = paddle.where(mask, trues, xs) + return xs + + +if not hasattr(paddle.Tensor, 'masked_fill'): + logger.debug( + "register user masked_fill to paddle.Tensor, remove this when fixed!") + paddle.Tensor.masked_fill = masked_fill + + +def masked_fill_(xs: paddle.Tensor, + mask: paddle.Tensor, + value: Union[float, int]) -> paddle.Tensor: + assert is_broadcastable(xs.shape, mask.shape) is True + bshape = paddle.broadcast_shape(xs.shape, mask.shape) + mask = mask.broadcast_to(bshape) + trues = paddle.ones_like(xs) * value + ret = paddle.where(mask, trues, xs) + paddle.assign(ret.detach(), output=xs) + return xs + + +if not hasattr(paddle.Tensor, 'masked_fill_'): + logger.debug( + "register user masked_fill_ to paddle.Tensor, remove this when fixed!") + paddle.Tensor.masked_fill_ = masked_fill_ + + +def fill_(xs: paddle.Tensor, value: Union[float, int]) -> paddle.Tensor: + val = paddle.full_like(xs, value) + paddle.assign(val.detach(), output=xs) + return xs + + +if not hasattr(paddle.Tensor, 'fill_'): + logger.debug( + "register user fill_ to paddle.Tensor, remove this when fixed!") + paddle.Tensor.fill_ = fill_ + + +def repeat(xs: paddle.Tensor, *size: Any) -> paddle.Tensor: + return paddle.tile(xs, size) + + 
+if not hasattr(paddle.Tensor, 'repeat'): + logger.debug( + "register user repeat to paddle.Tensor, remove this when fixed!") + paddle.Tensor.repeat = repeat + +if not hasattr(paddle.Tensor, 'softmax'): + logger.debug( + "register user softmax to paddle.Tensor, remove this when fixed!") + setattr(paddle.Tensor, 'softmax', paddle.nn.functional.softmax) + +if not hasattr(paddle.Tensor, 'sigmoid'): + logger.debug( + "register user sigmoid to paddle.Tensor, remove this when fixed!") + setattr(paddle.Tensor, 'sigmoid', paddle.nn.functional.sigmoid) + +if not hasattr(paddle.Tensor, 'relu'): + logger.debug("register user relu to paddle.Tensor, remove this when fixed!") + setattr(paddle.Tensor, 'relu', paddle.nn.functional.relu) + + +def type_as(x: paddle.Tensor, other: paddle.Tensor) -> paddle.Tensor: + return x.astype(other.dtype) + + +if not hasattr(paddle.Tensor, 'type_as'): + logger.debug( + "register user type_as to paddle.Tensor, remove this when fixed!") + setattr(paddle.Tensor, 'type_as', type_as) + + +def to(x: paddle.Tensor, *args, **kwargs) -> paddle.Tensor: + assert len(args) == 1 + if isinstance(args[0], str): # dtype + return x.astype(args[0]) + elif isinstance(args[0], paddle.Tensor): # Tensor + return x.astype(args[0].dtype) + else: # Device + return x + + +if not hasattr(paddle.Tensor, 'to'): + logger.debug("register user to to paddle.Tensor, remove this when fixed!") + setattr(paddle.Tensor, 'to', to) + + +def func_float(x: paddle.Tensor) -> paddle.Tensor: + return x.astype(paddle.float) + + +if not hasattr(paddle.Tensor, 'float'): + logger.debug( + "register user float to paddle.Tensor, remove this when fixed!") + setattr(paddle.Tensor, 'float', func_float) + + +def func_int(x: paddle.Tensor) -> paddle.Tensor: + return x.astype(paddle.int) + + +if not hasattr(paddle.Tensor, 'int'): + logger.debug("register user int to paddle.Tensor, remove this when fixed!") + setattr(paddle.Tensor, 'int', func_int) + + +def tolist(x: paddle.Tensor) -> List[Any]: + return x.numpy().tolist() + + +if not hasattr(paddle.Tensor, 'tolist'): + logger.debug( + "register user tolist to paddle.Tensor, remove this when fixed!") + setattr(paddle.Tensor, 'tolist', tolist) + +########### hack paddle.nn ############# +from paddle.nn import Layer +from typing import Optional +from typing import Mapping +from typing import Iterable +from typing import Tuple +from typing import Iterator +from collections import OrderedDict, abc as container_abcs + + +class LayerDict(paddle.nn.Layer): + r"""Holds submodules in a dictionary. + + :class:`~paddle.nn.LayerDict` can be indexed like a regular Python dictionary, + but modules it contains are properly registered, and will be visible by all + :class:`~paddle.nn.Layer` methods. + + :class:`~paddle.nn.LayerDict` is an **ordered** dictionary that respects + + * the order of insertion, and + + * in :meth:`~paddle.nn.LayerDict.update`, the order of the merged + ``OrderedDict``, ``dict`` (started from Python 3.6) or another + :class:`~paddle.nn.LayerDict` (the argument to + :meth:`~paddle.nn.LayerDict.update`). + + Note that :meth:`~paddle.nn.LayerDict.update` with other unordered mapping + types (e.g., Python's plain ``dict`` before Python version 3.6) does not + preserve the order of the merged mapping. 
+ + Args: + modules (iterable, optional): a mapping (dictionary) of (string: module) + or an iterable of key-value pairs of type (string, module) + + Example:: + + class MyModule(nn.Layer): + def __init__(self): + super(MyModule, self).__init__() + self.choices = nn.LayerDict({ + 'conv': nn.Conv2d(10, 10, 3), + 'pool': nn.MaxPool2d(3) + }) + self.activations = nn.LayerDict([ + ['lrelu', nn.LeakyReLU()], + ['prelu', nn.PReLU()] + ]) + + def forward(self, x, choice, act): + x = self.choices[choice](x) + x = self.activations[act](x) + return x + """ + + def __init__(self, modules: Optional[Mapping[str, Layer]]=None) -> None: + super(LayerDict, self).__init__() + if modules is not None: + self.update(modules) + + def __getitem__(self, key: str) -> Layer: + return self._modules[key] + + def __setitem__(self, key: str, module: Layer) -> None: + self.add_module(key, module) + + def __delitem__(self, key: str) -> None: + del self._modules[key] + + def __len__(self) -> int: + return len(self._modules) + + def __iter__(self) -> Iterator[str]: + return iter(self._modules) + + def __contains__(self, key: str) -> bool: + return key in self._modules + + def clear(self) -> None: + """Remove all items from the LayerDict. + """ + self._modules.clear() + + def pop(self, key: str) -> Layer: + r"""Remove key from the LayerDict and return its module. + + Args: + key (string): key to pop from the LayerDict + """ + v = self[key] + del self[key] + return v + + def keys(self) -> Iterable[str]: + r"""Return an iterable of the LayerDict keys. + """ + return self._modules.keys() + + def items(self) -> Iterable[Tuple[str, Layer]]: + r"""Return an iterable of the LayerDict key/value pairs. + """ + return self._modules.items() + + def values(self) -> Iterable[Layer]: + r"""Return an iterable of the LayerDict values. + """ + return self._modules.values() + + def update(self, modules: Mapping[str, Layer]) -> None: + r"""Update the :class:`~paddle.nn.LayerDict` with the key-value pairs from a + mapping or an iterable, overwriting existing keys. + + .. note:: + If :attr:`modules` is an ``OrderedDict``, a :class:`~paddle.nn.LayerDict`, or + an iterable of key-value pairs, the order of new elements in it is preserved. 
+ + Args: + modules (iterable): a mapping (dictionary) from string to :class:`~paddle.nn.Layer`, + or an iterable of key-value pairs of type (string, :class:`~paddle.nn.Layer`) + """ + if not isinstance(modules, container_abcs.Iterable): + raise TypeError("LayerDict.update should be called with an " + "iterable of key/value pairs, but got " + type( + modules).__name__) + + if isinstance(modules, + (OrderedDict, LayerDict, container_abcs.Mapping)): + for key, module in modules.items(): + self[key] = module + else: + # modules here can be a list with two items + for j, m in enumerate(modules): + if not isinstance(m, container_abcs.Iterable): + raise TypeError("LayerDict update sequence element " + "#" + str(j) + " should be Iterable; is" + + type(m).__name__) + if not len(m) == 2: + raise ValueError("LayerDict update sequence element " + "#" + str(j) + " has length " + str( + len(m)) + "; 2 is required") + # modules can be Mapping (what it's typed at), or a list: [(name1, module1), (name2, module2)] + # that's too cumbersome to type correctly with overloads, so we add an ignore here + self[m[0]] = m[1] # type: ignore[assignment] + + # remove forward alltogether to fallback on Module's _forward_unimplemented + + +if not hasattr(paddle.nn, 'LayerDict'): + logger.debug( + "register user LayerDict to paddle.nn, remove this when fixed!") + setattr(paddle.nn, 'LayerDict', LayerDict) diff --git a/ernie-sat/paddlespeech/s2t/decoders/README.md b/ernie-sat/paddlespeech/s2t/decoders/README.md new file mode 100644 index 0000000000000000000000000000000000000000..0b91ddd1b1b9a61dbb7421c89ca853cb62a1444f --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/decoders/README.md @@ -0,0 +1,14 @@ +# Decoders +we borrow a lot of code from Espnet Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +## Reference +### CTC Prefix Beam Search +* [Sequence Modeling With CTC](https://distill.pub/2017/ctc/) +* [First-Pass Large Vocabulary Continuous Speech Recognition using Bi-Directional Recurrent DNNs](https://arxiv.org/pdf/1408.2873.pdf) + +### CTC Prefix Score & Join CTC/ATT One-passing Decoding +* [Hybrid CTC/Attention Architecture for End-to-End Speech Recognition](http://www.ifp.illinois.edu/speech/speech_web_lg/slides/2019/watanabe_hybridCTCAttention_2017.pdf) +* [Vectorized Beam Search for CTC-Attention-based Speech Recognition](https://www.isca-speech.org/archive/pdfs/interspeech_2019/seki19b_interspeech.pdf) + +### Streaming Join CTC/ATT Beam Search +* [STREAMING TRANSFORMER ASR WITH BLOCKWISE SYNCHRONOUS BEAM SEARCH](https://arxiv.org/abs/2006.14941) diff --git a/ernie-sat/paddlespeech/s2t/decoders/__init__.py b/ernie-sat/paddlespeech/s2t/decoders/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..185a92b8d94d3426d616c0624f0f2ee04339349e --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/decoders/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
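+# Decoder utilities (beam search and scorer interfaces) adapted from ESPnet; see the
+# README.md in this directory for the license note and references.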
diff --git a/ernie-sat/paddlespeech/s2t/decoders/beam_search/__init__.py b/ernie-sat/paddlespeech/s2t/decoders/beam_search/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..79a1e9d30fe2277dceb3b762fcb1e29d21ebddce --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/decoders/beam_search/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from .batch_beam_search import BatchBeamSearch +from .beam_search import beam_search +from .beam_search import BeamSearch +from .beam_search import Hypothesis diff --git a/ernie-sat/paddlespeech/s2t/decoders/beam_search/batch_beam_search.py b/ernie-sat/paddlespeech/s2t/decoders/beam_search/batch_beam_search.py new file mode 100644 index 0000000000000000000000000000000000000000..ed9790cdff7ba8f4566d90995ea0e8992fccb11e --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/decoders/beam_search/batch_beam_search.py @@ -0,0 +1,18 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Reference espnet Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + + +class BatchBeamSearch(): + pass diff --git a/ernie-sat/paddlespeech/s2t/decoders/beam_search/beam_search.py b/ernie-sat/paddlespeech/s2t/decoders/beam_search/beam_search.py new file mode 100644 index 0000000000000000000000000000000000000000..f331cb1c93e1331aa25600e6b5b819212ed6f096 --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/decoders/beam_search/beam_search.py @@ -0,0 +1,531 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# Modified from espnet(https://github.com/espnet/espnet) +"""Beam search module.""" +from itertools import chain +from typing import Any +from typing import Dict +from typing import List +from typing import NamedTuple +from typing import Tuple +from typing import Union + +import paddle + +from ..scorers.scorer_interface import PartialScorerInterface +from ..scorers.scorer_interface import ScorerInterface +from ..utils import end_detect +from paddlespeech.s2t.utils.log import Log + +logger = Log(__name__).getlog() + + +class Hypothesis(NamedTuple): + """Hypothesis data type.""" + + yseq: paddle.Tensor # (T,) + score: Union[float, paddle.Tensor] = 0 + scores: Dict[str, Union[float, paddle.Tensor]] = dict() + states: Dict[str, Any] = dict() + + def asdict(self) -> dict: + """Convert data to JSON-friendly dict.""" + return self._replace( + yseq=self.yseq.tolist(), + score=float(self.score), + scores={k: float(v) + for k, v in self.scores.items()}, )._asdict() + + +class BeamSearch(paddle.nn.Layer): + """Beam search implementation.""" + + def __init__( + self, + scorers: Dict[str, ScorerInterface], + weights: Dict[str, float], + beam_size: int, + vocab_size: int, + sos: int, + eos: int, + token_list: List[str]=None, + pre_beam_ratio: float=1.5, + pre_beam_score_key: str=None, ): + """Initialize beam search. + + Args: + scorers (dict[str, ScorerInterface]): Dict of decoder modules + e.g., Decoder, CTCPrefixScorer, LM + The scorer will be ignored if it is `None` + weights (dict[str, float]): Dict of weights for each scorers + The scorer will be ignored if its weight is 0 + beam_size (int): The number of hypotheses kept during search + vocab_size (int): The number of vocabulary + sos (int): Start of sequence id + eos (int): End of sequence id + token_list (list[str]): List of tokens for debug log + pre_beam_score_key (str): key of scores to perform pre-beam search + pre_beam_ratio (float): beam size in the pre-beam search + will be `int(pre_beam_ratio * beam_size)` + + """ + super().__init__() + # set scorers + self.weights = weights + self.scorers = dict() # all = full + partial + self.full_scorers = dict() # full tokens + self.part_scorers = dict() # partial tokens + # this module dict is required for recursive cast + # `self.to(device, dtype)` in `recog.py` + self.nn_dict = paddle.nn.LayerDict() # nn.Layer + for k, v in scorers.items(): + w = weights.get(k, 0) + if w == 0 or v is None: + continue + assert isinstance( + v, ScorerInterface + ), f"{k} ({type(v)}) does not implement ScorerInterface" + self.scorers[k] = v + if isinstance(v, PartialScorerInterface): + self.part_scorers[k] = v + else: + self.full_scorers[k] = v + if isinstance(v, paddle.nn.Layer): + self.nn_dict[k] = v + + # set configurations + self.sos = sos + self.eos = eos + self.token_list = token_list + # pre_beam_size > beam_size + self.pre_beam_size = int(pre_beam_ratio * beam_size) + self.beam_size = beam_size + self.n_vocab = vocab_size + if (pre_beam_score_key is not None and pre_beam_score_key != "full" and + pre_beam_score_key not in self.full_scorers): + raise KeyError( + f"{pre_beam_score_key} is not found in {self.full_scorers}") + # selected `key` scorer to do pre beam search + self.pre_beam_score_key = pre_beam_score_key + # do_pre_beam when need, valid and has part_scorers + self.do_pre_beam = (self.pre_beam_score_key is not None and + self.pre_beam_size < self.n_vocab and + len(self.part_scorers) > 0) + + def init_hyp(self, x: paddle.Tensor) -> List[Hypothesis]: + """Get an initial hypothesis data. 
+ + Args: + x (paddle.Tensor): The encoder output feature, (T, D) + + Returns: + Hypothesis: The initial hypothesis. + + """ + init_states = dict() + init_scores = dict() + for k, d in self.scorers.items(): + init_states[k] = d.init_state(x) + init_scores[k] = 0.0 + return [ + Hypothesis( + yseq=paddle.to_tensor([self.sos], place=x.place), + score=0.0, + scores=init_scores, + states=init_states, ) + ] + + @staticmethod + def append_token(xs: paddle.Tensor, + x: Union[int, paddle.Tensor]) -> paddle.Tensor: + """Append new token to prefix tokens. + + Args: + xs (paddle.Tensor): The prefix token, (T,) + x (int): The new token to append + + Returns: + paddle.Tensor: (T+1,), New tensor contains: xs + [x] with xs.dtype and xs.device + + """ + x = paddle.to_tensor([x], dtype=xs.dtype) if isinstance(x, int) else x + return paddle.concat((xs, x)) + + def score_full(self, hyp: Hypothesis, x: paddle.Tensor + ) -> Tuple[Dict[str, paddle.Tensor], Dict[str, Any]]: + """Score new hypothesis by `self.full_scorers`. + + Args: + hyp (Hypothesis): Hypothesis with prefix tokens to score + x (paddle.Tensor): Corresponding input feature, (T, D) + + Returns: + Tuple[Dict[str, paddle.Tensor], Dict[str, Any]]: Tuple of + score dict of `hyp` that has string keys of `self.full_scorers` + and tensor score values of shape: `(self.n_vocab,)`, + and state dict that has string keys + and state values of `self.full_scorers` + + """ + scores = dict() + states = dict() + for k, d in self.full_scorers.items(): + # scores[k] shape (self.n_vocab,) + scores[k], states[k] = d.score(hyp.yseq, hyp.states[k], x) + return scores, states + + def score_partial(self, + hyp: Hypothesis, + ids: paddle.Tensor, + x: paddle.Tensor + ) -> Tuple[Dict[str, paddle.Tensor], Dict[str, Any]]: + """Score new hypothesis by `self.part_scorers`. + + Args: + hyp (Hypothesis): Hypothesis with prefix tokens to score + ids (paddle.Tensor): 1D tensor of new partial tokens to score, + len(ids) < n_vocab + x (paddle.Tensor): Corresponding input feature, (T, D) + + Returns: + Tuple[Dict[str, paddle.Tensor], Dict[str, Any]]: Tuple of + score dict of `hyp` that has string keys of `self.part_scorers` + and tensor score values of shape: `(len(ids),)`, + and state dict that has string keys + and state values of `self.part_scorers` + + """ + scores = dict() + states = dict() + for k, d in self.part_scorers.items(): + # scores[k] shape (len(ids),) + scores[k], states[k] = d.score_partial(hyp.yseq, ids, hyp.states[k], + x) + return scores, states + + def beam(self, weighted_scores: paddle.Tensor, + ids: paddle.Tensor) -> Tuple[paddle.Tensor, paddle.Tensor]: + """Compute topk full token ids and partial token ids. + + Args: + weighted_scores (paddle.Tensor): The weighted sum scores for each tokens. + Its shape is `(self.n_vocab,)`. + ids (paddle.Tensor): The partial token ids(Global) to compute topk. + + Returns: + Tuple[paddle.Tensor, paddle.Tensor]: + The topk full token ids and partial token ids. + Their shapes are `(self.beam_size,)`. + i.e. (global ids, global relative local ids). 
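In `beam` above, pre-beam pruning works by copying the candidate scores aside, filling the whole score vector with `-inf`, and restoring only the candidate positions before taking the top-k, which yields both global vocabulary ids and ids local to the pre-beam subset. A NumPy sketch of that masking trick (NumPy stands in for paddle here purely for illustration):

```python
import numpy as np

# Sketch of BeamSearch.beam(): prune to pre-beam candidates, then top-k.
def beam(weighted_scores, ids, beam_size):
    if len(weighted_scores) == len(ids):            # no pre-beam was performed
        top_ids = np.argsort(-weighted_scores)[:beam_size]
        return top_ids, top_ids
    kept = weighted_scores[ids]                     # scores of pre-beam candidates
    masked = np.full_like(weighted_scores, -np.inf)
    masked[ids] = kept                              # everything else is pruned
    top_ids = np.argsort(-masked)[:beam_size]       # indices in the full vocab
    local_ids = np.argsort(-kept)[:beam_size]       # indices within `ids`
    return top_ids, local_ids

scores = np.array([0.1, 2.0, 0.3, 1.5, 0.2])
ids = np.array([1, 3, 4])                           # pre-beam candidates
print(beam(scores, ids, beam_size=2))               # (array([1, 3]), array([0, 1]))
```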
+ + """ + # no pre beam performed, `ids` equal to `weighted_scores` + if weighted_scores.size(0) == ids.size(0): + top_ids = weighted_scores.topk( + self.beam_size)[1] # index in n_vocab + return top_ids, top_ids + + # mask pruned in pre-beam not to select in topk + tmp = weighted_scores[ids] + weighted_scores[:] = -float("inf") + weighted_scores[ids] = tmp + # top_ids no equal to local_ids, since ids shape not same + top_ids = weighted_scores.topk(self.beam_size)[1] # index in n_vocab + local_ids = weighted_scores[ids].topk( + self.beam_size)[1] # index in len(ids) + return top_ids, local_ids + + @staticmethod + def merge_scores( + prev_scores: Dict[str, float], + next_full_scores: Dict[str, paddle.Tensor], + full_idx: int, + next_part_scores: Dict[str, paddle.Tensor], + part_idx: int, ) -> Dict[str, paddle.Tensor]: + """Merge scores for new hypothesis. + + Args: + prev_scores (Dict[str, float]): + The previous hypothesis scores by `self.scorers` + next_full_scores (Dict[str, paddle.Tensor]): scores by `self.full_scorers` + full_idx (int): The next token id for `next_full_scores` + next_part_scores (Dict[str, paddle.Tensor]): + scores of partial tokens by `self.part_scorers` + part_idx (int): The new token id for `next_part_scores` + + Returns: + Dict[str, paddle.Tensor]: The new score dict. + Its keys are names of `self.full_scorers` and `self.part_scorers`. + Its values are scalar tensors by the scorers. + + """ + new_scores = dict() + for k, v in next_full_scores.items(): + new_scores[k] = prev_scores[k] + v[full_idx] + for k, v in next_part_scores.items(): + new_scores[k] = prev_scores[k] + v[part_idx] + return new_scores + + def merge_states(self, states: Any, part_states: Any, part_idx: int) -> Any: + """Merge states for new hypothesis. + + Args: + states: states of `self.full_scorers` + part_states: states of `self.part_scorers` + part_idx (int): The new token id for `part_scores` + + Returns: + Dict[str, paddle.Tensor]: The new score dict. + Its keys are names of `self.full_scorers` and `self.part_scorers`. + Its values are states of the scorers. + + """ + new_states = dict() + for k, v in states.items(): + new_states[k] = v + for k, d in self.part_scorers.items(): + new_states[k] = d.select_state(part_states[k], part_idx) + return new_states + + def search(self, running_hyps: List[Hypothesis], + x: paddle.Tensor) -> List[Hypothesis]: + """Search new tokens for running hypotheses and encoded speech x. 
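`merge_scores` above keeps one running total per scorer: the parent hypothesis total plus the increment that scorer assigned to the chosen token, indexed globally for full scorers and locally for partial scorers. A toy numeric sketch with plain dicts and lists:

```python
# Sketch of BeamSearch.merge_scores with toy numbers.
def merge_scores(prev_scores, next_full_scores, full_idx,
                 next_part_scores, part_idx):
    new_scores = {}
    for name, vec in next_full_scores.items():   # indexed over the full vocabulary
        new_scores[name] = prev_scores[name] + vec[full_idx]
    for name, vec in next_part_scores.items():   # indexed over the pre-beam subset
        new_scores[name] = prev_scores[name] + vec[part_idx]
    return new_scores

prev = {"decoder": -1.2, "ctc": -0.8}
full = {"decoder": [-0.5, -2.0, -0.1]}           # decoder log-probs, one per token
part = {"ctc": [-0.3, -1.1]}                     # ctc scores for the pre-beam ids only
print(merge_scores(prev, full, 2, part, 0))      # {'decoder': -1.3, 'ctc': -1.1} (approx.)
```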
+ + Args: + running_hyps (List[Hypothesis]): Running hypotheses on beam + x (paddle.Tensor): Encoded speech feature (T, D) + + Returns: + List[Hypotheses]: Best sorted hypotheses + + """ + best_hyps = [] + part_ids = paddle.arange(self.n_vocab) # no pre-beam + for hyp in running_hyps: + # scoring + weighted_scores = paddle.zeros([self.n_vocab], dtype=x.dtype) + scores, states = self.score_full(hyp, x) + for k in self.full_scorers: + weighted_scores += self.weights[k] * scores[k] + # partial scoring + if self.do_pre_beam: + pre_beam_scores = (weighted_scores + if self.pre_beam_score_key == "full" else + scores[self.pre_beam_score_key]) + part_ids = paddle.topk(pre_beam_scores, self.pre_beam_size)[1] + part_scores, part_states = self.score_partial(hyp, part_ids, x) + for k in self.part_scorers: + weighted_scores[part_ids] += self.weights[k] * part_scores[k] + # add previous hyp score + weighted_scores += hyp.score + + # update hyps + for j, part_j in zip(*self.beam(weighted_scores, part_ids)): + # `part_j` is `j` relative id in `part_scores` + # will be (2 x beam at most) + best_hyps.append( + Hypothesis( + score=weighted_scores[j], + yseq=self.append_token(hyp.yseq, j), + scores=self.merge_scores(hyp.scores, scores, j, + part_scores, part_j), + states=self.merge_states(states, part_states, part_j), + )) + + # sort and prune 2 x beam -> beam + best_hyps = sorted( + best_hyps, key=lambda x: x.score, + reverse=True)[:min(len(best_hyps), self.beam_size)] + return best_hyps + + def forward(self, + x: paddle.Tensor, + maxlenratio: float=0.0, + minlenratio: float=0.0) -> List[Hypothesis]: + """Perform beam search. + + Args: + x (paddle.Tensor): Encoded speech feature (T, D) + maxlenratio (float): Input length ratio to obtain max output length. + If maxlenratio=0.0 (default), it uses a end-detect function + to automatically find maximum hypothesis lengths + If maxlenratio<0.0, its absolute value is interpreted + as a constant max output length. + minlenratio (float): Input length ratio to obtain min output length. + + Returns: + list[Hypothesis]: N-best decoding results + + """ + # set length bounds + if maxlenratio == 0: + maxlen = x.shape[0] + elif maxlenratio < 0: + maxlen = -1 * int(maxlenratio) + else: + maxlen = max(1, int(maxlenratio * x.size(0))) + minlen = int(minlenratio * x.size(0)) + logger.info("decoder input length: " + str(x.shape[0])) + logger.info("max output length: " + str(maxlen)) + logger.info("min output length: " + str(minlen)) + + # main loop of prefix search + running_hyps = self.init_hyp(x) + ended_hyps = [] + for i in range(maxlen): + logger.debug("position " + str(i)) + best = self.search(running_hyps, x) + # post process of one iteration + running_hyps = self.post_process(i, maxlen, maxlenratio, best, + ended_hyps) + # end detection + if maxlenratio == 0.0 and end_detect( + [h.asdict() for h in ended_hyps], i): + logger.info(f"end detected at {i}") + break + if len(running_hyps) == 0: + logger.info("no hypothesis. 
Finish decoding.") + break + else: + logger.debug(f"remained hypotheses: {len(running_hyps)}") + + nbest_hyps = sorted(ended_hyps, key=lambda x: x.score, reverse=True) + # check the number of hypotheses reaching to eos + if len(nbest_hyps) == 0: + logger.warning("there is no N-best results, perform recognition " + "again with smaller minlenratio.") + return ([] if minlenratio < 0.1 else + self.forward(x, maxlenratio, max(0.0, minlenratio - 0.1))) + + # report the best result + best = nbest_hyps[0] + for k, v in best.scores.items(): + logger.info( + f"{float(v):6.2f} * {self.weights[k]:3} = {float(v) * self.weights[k]:6.2f} for {k}" + ) + logger.info(f"total log probability: {float(best.score):.2f}") + logger.info( + f"normalized log probability: {float(best.score) / len(best.yseq):.2f}" + ) + logger.info(f"total number of ended hypotheses: {len(nbest_hyps)}") + if self.token_list is not None: + # logger.info( + # "best hypo: " + # + "".join([self.token_list[x] for x in best.yseq[1:-1]]) + # + "\n" + # ) + logger.info("best hypo: " + "".join( + [self.token_list[x] for x in best.yseq[1:]]) + "\n") + return nbest_hyps + + def post_process( + self, + i: int, + maxlen: int, + maxlenratio: float, + running_hyps: List[Hypothesis], + ended_hyps: List[Hypothesis], ) -> List[Hypothesis]: + """Perform post-processing of beam search iterations. + + Args: + i (int): The length of hypothesis tokens. + maxlen (int): The maximum length of tokens in beam search. + maxlenratio (int): The maximum length ratio in beam search. + running_hyps (List[Hypothesis]): The running hypotheses in beam search. + ended_hyps (List[Hypothesis]): The ended hypotheses in beam search. + + Returns: + List[Hypothesis]: The new running hypotheses. + + """ + logger.debug(f"the number of running hypotheses: {len(running_hyps)}") + if self.token_list is not None: + logger.debug("best hypo: " + "".join( + [self.token_list[x] for x in running_hyps[0].yseq[1:]])) + # add eos in the final loop to avoid that there are no ended hyps + if i == maxlen - 1: + logger.info("adding in the last position in the loop") + running_hyps = [ + h._replace(yseq=self.append_token(h.yseq, self.eos)) + for h in running_hyps + ] + + # add ended hypotheses to a final list, and removed them from current hypotheses + # (this will be a problem, number of hyps < beam) + remained_hyps = [] + for hyp in running_hyps: + if hyp.yseq[-1] == self.eos: + # e.g., Word LM needs to add final score + for k, d in chain(self.full_scorers.items(), + self.part_scorers.items()): + s = d.final_score(hyp.states[k]) + hyp.scores[k] += s + hyp = hyp._replace(score=hyp.score + self.weights[k] * s) + ended_hyps.append(hyp) + else: + remained_hyps.append(hyp) + return remained_hyps + + +def beam_search( + x: paddle.Tensor, + sos: int, + eos: int, + beam_size: int, + vocab_size: int, + scorers: Dict[str, ScorerInterface], + weights: Dict[str, float], + token_list: List[str]=None, + maxlenratio: float=0.0, + minlenratio: float=0.0, + pre_beam_ratio: float=1.5, + pre_beam_score_key: str="full", ) -> list: + """Perform beam search with scorers. 
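The length bounds that drive the decoding loop in `forward` above come directly from the two ratios. A self-contained sketch of that arithmetic, mirroring the branches in the code:

```python
# Sketch of how forward() turns maxlenratio / minlenratio into length bounds.
def length_bounds(enc_len, maxlenratio, minlenratio):
    if maxlenratio == 0:
        maxlen = enc_len                  # bound by input length, rely on end detection
    elif maxlenratio < 0:
        maxlen = -1 * int(maxlenratio)    # negative ratio = constant max output length
    else:
        maxlen = max(1, int(maxlenratio * enc_len))
    minlen = int(minlenratio * enc_len)
    return maxlen, minlen

print(length_bounds(100, 0.0, 0.0))   # (100, 0)
print(length_bounds(100, -25, 0.1))   # (25, 10)
print(length_bounds(100, 0.5, 0.1))   # (50, 10)
```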
+ + Args: + x (paddle.Tensor): Encoded speech feature (T, D) + sos (int): Start of sequence id + eos (int): End of sequence id + beam_size (int): The number of hypotheses kept during search + vocab_size (int): The number of vocabulary + scorers (dict[str, ScorerInterface]): Dict of decoder modules + e.g., Decoder, CTCPrefixScorer, LM + The scorer will be ignored if it is `None` + weights (dict[str, float]): Dict of weights for each scorers + The scorer will be ignored if its weight is 0 + token_list (list[str]): List of tokens for debug log + maxlenratio (float): Input length ratio to obtain max output length. + If maxlenratio=0.0 (default), it uses a end-detect function + to automatically find maximum hypothesis lengths + minlenratio (float): Input length ratio to obtain min output length. + pre_beam_score_key (str): key of scores to perform pre-beam search + pre_beam_ratio (float): beam size in the pre-beam search + will be `int(pre_beam_ratio * beam_size)` + + Returns: + List[Dict]: N-best decoding results + + """ + ret = BeamSearch( + scorers, + weights, + beam_size=beam_size, + vocab_size=vocab_size, + pre_beam_ratio=pre_beam_ratio, + pre_beam_score_key=pre_beam_score_key, + sos=sos, + eos=eos, + token_list=token_list, ).forward( + x=x, maxlenratio=maxlenratio, minlenratio=minlenratio) + return [h.asdict() for h in ret] diff --git a/ernie-sat/paddlespeech/s2t/decoders/ctcdecoder/__init__.py b/ernie-sat/paddlespeech/s2t/decoders/ctcdecoder/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..37ceae6e5f8c3016713c4417ea167dec9e3fdc42 --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/decoders/ctcdecoder/__init__.py @@ -0,0 +1,18 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from .swig_wrapper import ctc_beam_search_decoding +from .swig_wrapper import ctc_beam_search_decoding_batch +from .swig_wrapper import ctc_greedy_decoding +from .swig_wrapper import CTCBeamSearchDecoder +from .swig_wrapper import Scorer diff --git a/ernie-sat/paddlespeech/s2t/decoders/ctcdecoder/decoders_deprecated.py b/ernie-sat/paddlespeech/s2t/decoders/ctcdecoder/decoders_deprecated.py new file mode 100644 index 0000000000000000000000000000000000000000..fef08807e928d7b63b147384c86cb45c553623a9 --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/decoders/ctcdecoder/decoders_deprecated.py @@ -0,0 +1,248 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Contains various CTC decoders.""" +import multiprocessing +from itertools import groupby +from math import log + +import numpy as np + + +def ctc_greedy_decoder(probs_seq, vocabulary): + """CTC greedy (best path) decoder. + + Path consisting of the most probable tokens are further post-processed to + remove consecutive repetitions and all blanks. + + :param probs_seq: 2-D list of probabilities over the vocabulary for each + character. Each element is a list of float probabilities + for one character. + :type probs_seq: list + :param vocabulary: Vocabulary list. + :type vocabulary: list + :return: Decoding result string. + :rtype: baseline + """ + # dimension verification + for probs in probs_seq: + if not len(probs) == len(vocabulary) + 1: + raise ValueError("probs_seq dimension mismatchedd with vocabulary") + # argmax to get the best index for each time step + max_index_list = list(np.array(probs_seq).argmax(axis=1)) + # remove consecutive duplicate indexes + index_list = [index_group[0] for index_group in groupby(max_index_list)] + # remove blank indexes + blank_index = len(vocabulary) + index_list = [index for index in index_list if index != blank_index] + # convert index list to string + return ''.join([vocabulary[index] for index in index_list]) + + +def ctc_beam_search_decoder(probs_seq, + beam_size, + vocabulary, + cutoff_prob=1.0, + cutoff_top_n=40, + ext_scoring_func=None, + nproc=False): + """CTC Beam search decoder. + + It utilizes beam search to approximately select top best decoding + labels and returning results in the descending order. + The implementation is based on Prefix Beam Search + (https://arxiv.org/abs/1408.2873), and the unclear part is + redesigned. Two important modifications: 1) in the iterative computation + of probabilities, the assignment operation is changed to accumulation for + one prefix may comes from different paths; 2) the if condition "if l^+ not + in A_prev then" after probabilities' computation is deprecated for it is + hard to understand and seems unnecessary. + + :param probs_seq: 2-D list of probability distributions over each time + step, with each element being a list of normalized + probabilities over vocabulary and blank. + :type probs_seq: 2-D list + :param beam_size: Width for beam search. + :type beam_size: int + :param vocabulary: Vocabulary list. + :type vocabulary: list + :param cutoff_prob: Cutoff probability in pruning, + default 1.0, no pruning. + :type cutoff_prob: float + :param ext_scoring_func: External scoring function for + partially decoded sentence, e.g. word count + or language model. + :type external_scoring_func: callable + :param nproc: Whether the decoder used in multiprocesses. + :type nproc: bool + :return: List of tuples of log probability and sentence as decoding + results, in descending order of the probability. + :rtype: list + """ + # dimension check + for prob_list in probs_seq: + if not len(prob_list) == len(vocabulary) + 1: + raise ValueError("The shape of prob_seq does not match with the " + "shape of the vocabulary.") + + # blank_id assign + blank_id = len(vocabulary) + + # If the decoder called in the multiprocesses, then use the global scorer + # instantiated in ctc_beam_search_decoder_batch(). 
+ if nproc is True: + global ext_nproc_scorer + ext_scoring_func = ext_nproc_scorer + + # initialize + # prefix_set_prev: the set containing selected prefixes + # probs_b_prev: prefixes' probability ending with blank in previous step + # probs_nb_prev: prefixes' probability ending with non-blank in previous step + prefix_set_prev = {'\t': 1.0} + probs_b_prev, probs_nb_prev = {'\t': 1.0}, {'\t': 0.0} + + # extend prefix in loop + for time_step in range(len(probs_seq)): + # prefix_set_next: the set containing candidate prefixes + # probs_b_cur: prefixes' probability ending with blank in current step + # probs_nb_cur: prefixes' probability ending with non-blank in current step + prefix_set_next, probs_b_cur, probs_nb_cur = {}, {}, {} + + prob_idx = list(enumerate(probs_seq[time_step])) + cutoff_len = len(prob_idx) + # If pruning is enabled + if cutoff_prob < 1.0 or cutoff_top_n < cutoff_len: + prob_idx = sorted(prob_idx, key=lambda asd: asd[1], reverse=True) + cutoff_len, cum_prob = 0, 0.0 + for i in range(len(prob_idx)): + cum_prob += prob_idx[i][1] + cutoff_len += 1 + if cum_prob >= cutoff_prob: + break + cutoff_len = min(cutoff_len, cutoff_top_n) + prob_idx = prob_idx[0:cutoff_len] + + for l in prefix_set_prev: + if l not in prefix_set_next: + probs_b_cur[l], probs_nb_cur[l] = 0.0, 0.0 + + # extend prefix by travering prob_idx + for index in range(cutoff_len): + c, prob_c = prob_idx[index][0], prob_idx[index][1] + + if c == blank_id: + probs_b_cur[l] += prob_c * ( + probs_b_prev[l] + probs_nb_prev[l]) + else: + last_char = l[-1] + new_char = vocabulary[c] + l_plus = l + new_char + if l_plus not in prefix_set_next: + probs_b_cur[l_plus], probs_nb_cur[l_plus] = 0.0, 0.0 + + if new_char == last_char: + probs_nb_cur[l_plus] += prob_c * probs_b_prev[l] + probs_nb_cur[l] += prob_c * probs_nb_prev[l] + elif new_char == ' ': + if (ext_scoring_func is None) or (len(l) == 1): + score = 1.0 + else: + prefix = l[1:] + score = ext_scoring_func(prefix) + probs_nb_cur[l_plus] += score * prob_c * ( + probs_b_prev[l] + probs_nb_prev[l]) + else: + probs_nb_cur[l_plus] += prob_c * ( + probs_b_prev[l] + probs_nb_prev[l]) + # add l_plus into prefix_set_next + prefix_set_next[l_plus] = probs_nb_cur[ + l_plus] + probs_b_cur[l_plus] + # add l into prefix_set_next + prefix_set_next[l] = probs_b_cur[l] + probs_nb_cur[l] + # update probs + probs_b_prev, probs_nb_prev = probs_b_cur, probs_nb_cur + + # store top beam_size prefixes + prefix_set_prev = sorted( + prefix_set_next.items(), key=lambda asd: asd[1], reverse=True) + if beam_size < len(prefix_set_prev): + prefix_set_prev = prefix_set_prev[:beam_size] + prefix_set_prev = dict(prefix_set_prev) + + beam_result = [] + for seq, prob in prefix_set_prev.items(): + if prob > 0.0 and len(seq) > 1: + result = seq[1:] + # score last word by external scorer + if (ext_scoring_func is not None) and (result[-1] != ' '): + prob = prob * ext_scoring_func(result) + log_prob = log(prob) + beam_result.append((log_prob, result)) + else: + beam_result.append((float('-inf'), '')) + + # output top beam_size decoding results + beam_result = sorted(beam_result, key=lambda asd: asd[0], reverse=True) + return beam_result + + +def ctc_beam_search_decoder_batch(probs_split, + beam_size, + vocabulary, + num_processes, + cutoff_prob=1.0, + cutoff_top_n=40, + ext_scoring_func=None): + """CTC beam search decoder using multiple processes. + + :param probs_seq: 3-D list with each element as an instance of 2-D list + of probabilities used by ctc_beam_search_decoder(). 
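The heart of the prefix beam search above is the pair of bookkeeping dictionaries: for every prefix, one probability of the path ending in blank and one of it ending in a non-blank. A toy single-step illustration of the three update rules, ignoring the space/external-scorer branch and the pruning:

```python
# One time step of the prefix-probability bookkeeping in ctc_beam_search_decoder.
p_b_prev, p_nb_prev = {'a': 0.4}, {'a': 0.2}       # prefix 'a' after t-1 frames
pr_blank, pr_a, pr_b = 0.5, 0.3, 0.2               # frame-t posteriors for blank, 'a', 'b'

p_b_cur, p_nb_cur = {}, {}
# a blank keeps the prefix unchanged
p_b_cur['a'] = pr_blank * (p_b_prev['a'] + p_nb_prev['a'])        # 0.5 * 0.6 = 0.30
# repeating the last char only extends the prefix if a blank separated the two
p_nb_cur['aa'] = pr_a * p_b_prev['a']                             # 0.3 * 0.4 = 0.12
p_nb_cur['a'] = pr_a * p_nb_prev['a']                             # 0.3 * 0.2 = 0.06
# a different char always extends the prefix
p_nb_cur['ab'] = pr_b * (p_b_prev['a'] + p_nb_prev['a'])          # 0.2 * 0.6 = 0.12

for prefix in ('a', 'aa', 'ab'):
    total = p_b_cur.get(prefix, 0.0) + p_nb_cur.get(prefix, 0.0)
    print(prefix, round(total, 4))    # a 0.36, aa 0.12, ab 0.12
```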
+ :type probs_seq: 3-D list + :param beam_size: Width for beam search. + :type beam_size: int + :param vocabulary: Vocabulary list. + :type vocabulary: list + :param num_processes: Number of parallel processes. + :type num_processes: int + :param cutoff_prob: Cutoff probability in pruning, + default 1.0, no pruning. + :type cutoff_prob: float + :param num_processes: Number of parallel processes. + :type num_processes: int + :param ext_scoring_func: External scoring function for + partially decoded sentence, e.g. word count + or language model. + :type external_scoring_function: callable + :return: List of tuples of log probability and sentence as decoding + results, in descending order of the probability. + :rtype: list + """ + if not num_processes > 0: + raise ValueError("Number of processes must be positive!") + + # use global variable to pass the externnal scorer to beam search decoder + global ext_nproc_scorer + ext_nproc_scorer = ext_scoring_func + nproc = True + + pool = multiprocessing.Pool(processes=num_processes) + results = [] + for i, probs_list in enumerate(probs_split): + args = (probs_list, beam_size, vocabulary, cutoff_prob, cutoff_top_n, + None, nproc) + results.append(pool.apply_async(ctc_beam_search_decoder, args)) + + pool.close() + pool.join() + beam_search_results = [result.get() for result in results] + return beam_search_results diff --git a/ernie-sat/paddlespeech/s2t/decoders/ctcdecoder/scorer_deprecated.py b/ernie-sat/paddlespeech/s2t/decoders/ctcdecoder/scorer_deprecated.py new file mode 100644 index 0000000000000000000000000000000000000000..362098fe65ec34106926e1804dfbb5abb273d97d --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/decoders/ctcdecoder/scorer_deprecated.py @@ -0,0 +1,78 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""External Scorer for Beam Search Decoder.""" +import os + +import kenlm +import numpy as np + + +class Scorer(object): + """External scorer to evaluate a prefix or whole sentence in + beam search decoding, including the score from n-gram language + model and word count. + + :param alpha: Parameter associated with language model. Don't use + language model when alpha = 0. + :type alpha: float + :param beta: Parameter associated with word count. Don't use word + count when beta = 0. + :type beta: float + :model_path: Path to load language model. 
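`ctc_beam_search_decoder_batch` above fans utterances out with `apply_async` and collects the results in submission order. A generic sketch of that pattern, with a trivial stand-in task instead of the actual decoder:

```python
import multiprocessing

def decode_one(probs):                  # stand-in for ctc_beam_search_decoder(...)
    return max(probs)

if __name__ == "__main__":
    probs_split = [[0.1, 0.9], [0.7, 0.3], [0.4, 0.6]]
    pool = multiprocessing.Pool(processes=2)
    async_results = [pool.apply_async(decode_one, (p,)) for p in probs_split]
    pool.close()
    pool.join()
    print([r.get() for r in async_results])   # [0.9, 0.7, 0.6], in submission order
```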
+ :type model_path: str + """ + + def __init__(self, alpha, beta, model_path): + self._alpha = alpha + self._beta = beta + if not os.path.isfile(model_path): + raise IOError("Invaid language model path: %s" % model_path) + self._language_model = kenlm.LanguageModel(model_path) + + # n-gram language model scoring + def _language_model_score(self, sentence): + #log10 prob of last word + log_cond_prob = list( + self._language_model.full_scores(sentence, eos=False))[-1][0] + return np.power(10, log_cond_prob) + + # word insertion term + def _word_count(self, sentence): + words = sentence.strip().split(' ') + return len(words) + + # reset alpha and beta + def reset_params(self, alpha, beta): + self._alpha = alpha + self._beta = beta + + # execute evaluation + def __call__(self, sentence, log=False): + """Evaluation function, gathering all the different scores + and return the final one. + + :param sentence: The input sentence for evaluation + :type sentence: str + :param log: Whether return the score in log representation. + :type log: bool + :return: Evaluation score, in the decimal or log. + :rtype: float + """ + lm = self._language_model_score(sentence) + word_cnt = self._word_count(sentence) + if log is False: + score = np.power(lm, self._alpha) * np.power(word_cnt, self._beta) + else: + score = self._alpha * np.log(lm) + self._beta * np.log(word_cnt) + return score diff --git a/ernie-sat/paddlespeech/s2t/decoders/ctcdecoder/swig_wrapper.py b/ernie-sat/paddlespeech/s2t/decoders/ctcdecoder/swig_wrapper.py new file mode 100644 index 0000000000000000000000000000000000000000..9e2a850646a0449f248baf99d90c2eff82fb025d --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/decoders/ctcdecoder/swig_wrapper.py @@ -0,0 +1,159 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Wrapper for various CTC decoders in SWIG.""" +import paddlespeech_ctcdecoders + + +class Scorer(paddlespeech_ctcdecoders.Scorer): + """Wrapper for Scorer. + + :param alpha: Parameter associated with language model. Don't use + language model when alpha = 0. + :type alpha: float + :param beta: Parameter associated with word count. Don't use word + count when beta = 0. + :type beta: float + :model_path: Path to load language model. + :type model_path: str + :param vocabulary: Vocabulary list. + :type vocabulary: list + """ + + def __init__(self, alpha, beta, model_path, vocabulary): + paddlespeech_ctcdecoders.Scorer.__init__(self, alpha, beta, model_path, + vocabulary) + + +def ctc_greedy_decoding(probs_seq, vocabulary, blank_id): + """Wrapper for ctc best path decodeing function in swig. + + :param probs_seq: 2-D list of probability distributions over each time + step, with each element being a list of normalized + probabilities over vocabulary and blank. + :type probs_seq: 2-D list + :param vocabulary: Vocabulary list. + :type vocabulary: list + :return: Decoding result string. 
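The decimal and log branches in `Scorer.__call__` above compute the same quantity, lm^alpha * word_count^beta, in two representations. A quick numeric check:

```python
import numpy as np

alpha, beta = 2.0, 1.5
lm_score, word_cnt = 0.01, 4           # toy LM probability and word count

decimal = np.power(lm_score, alpha) * np.power(word_cnt, beta)
log_form = alpha * np.log(lm_score) + beta * np.log(word_cnt)
print(decimal)                          # 0.0008
print(np.exp(log_form))                 # same value, ~0.0008
```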
+ :rtype: str + """ + result = paddlespeech_ctcdecoders.ctc_greedy_decoding(probs_seq.tolist(), + vocabulary, blank_id) + return result + + +def ctc_beam_search_decoding(probs_seq, + vocabulary, + beam_size, + cutoff_prob=1.0, + cutoff_top_n=40, + ext_scoring_func=None, + blank_id=0): + """Wrapper for the CTC Beam Search Decoding function. + + :param probs_seq: 2-D list of probability distributions over each time + step, with each element being a list of normalized + probabilities over vocabulary and blank. + :type probs_seq: 2-D list + :param vocabulary: Vocabulary list. + :type vocabulary: list + :param beam_size: Width for beam search. + :type beam_size: int + :param cutoff_prob: Cutoff probability in pruning, + default 1.0, no pruning. + :type cutoff_prob: float + :param cutoff_top_n: Cutoff number in pruning, only top cutoff_top_n + characters with highest probs in vocabulary will be + used in beam search, default 40. + :type cutoff_top_n: int + :param ext_scoring_func: External scoring function for + partially decoded sentence, e.g. word count + or language model. + :type external_scoring_func: callable + :return: List of tuples of log probability and sentence as decoding + results, in descending order of the probability. + :rtype: list + """ + beam_results = paddlespeech_ctcdecoders.ctc_beam_search_decoding( + probs_seq.tolist(), vocabulary, beam_size, cutoff_prob, cutoff_top_n, + ext_scoring_func, blank_id) + beam_results = [(res[0], res[1].decode('utf-8')) for res in beam_results] + return beam_results + + +def ctc_beam_search_decoding_batch(probs_split, + vocabulary, + beam_size, + num_processes, + cutoff_prob=1.0, + cutoff_top_n=40, + ext_scoring_func=None, + blank_id=0): + """Wrapper for the batched CTC beam search decodeing batch function. + + :param probs_seq: 3-D list with each element as an instance of 2-D list + of probabilities used by ctc_beam_search_decoder(). + :type probs_seq: 3-D list + :param vocabulary: Vocabulary list. + :type vocabulary: list + :param beam_size: Width for beam search. + :type beam_size: int + :param num_processes: Number of parallel processes. + :type num_processes: int + :param cutoff_prob: Cutoff probability in vocabulary pruning, + default 1.0, no pruning. + :type cutoff_prob: float + :param cutoff_top_n: Cutoff number in pruning, only top cutoff_top_n + characters with highest probs in vocabulary will be + used in beam search, default 40. + :type cutoff_top_n: int + :param num_processes: Number of parallel processes. + :type num_processes: int + :param ext_scoring_func: External scoring function for + partially decoded sentence, e.g. word count + or language model. + :type external_scoring_function: callable + :return: List of tuples of log probability and sentence as decoding + results, in descending order of the probability. + :rtype: list + """ + probs_split = [probs_seq.tolist() for probs_seq in probs_split] + + batch_beam_results = paddlespeech_ctcdecoders.ctc_beam_search_decoding_batch( + probs_split, vocabulary, beam_size, num_processes, cutoff_prob, + cutoff_top_n, ext_scoring_func, blank_id) + batch_beam_results = [[(res[0], res[1]) for res in beam_results] + for beam_results in batch_beam_results] + return batch_beam_results + + +class CTCBeamSearchDecoder(paddlespeech_ctcdecoders.CtcBeamSearchDecoderBatch): + """Wrapper for CtcBeamSearchDecoderBatch. + Args: + vocab_list (list): Vocabulary list. + beam_size (int): Width for beam search. + num_processes (int): Number of parallel processes. 
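A hedged usage sketch for the SWIG wrappers above. It assumes the compiled `paddlespeech_ctcdecoders` extension is installed and that the blank occupies the last column of the posterior matrix, matching the vocabulary-plus-blank layout used by the deprecated pure-Python decoders in this patch; the exact layout expected by the native decoder should be verified against your build.

```python
import numpy as np
from paddlespeech.s2t.decoders.ctcdecoder import (ctc_beam_search_decoding,
                                                  ctc_greedy_decoding)

vocabulary = ['a', 'b', 'c']
blank_id = len(vocabulary)                        # assumption: blank is the last column
# random (T, len(vocabulary)+1) posteriors, each row summing to 1
probs_seq = np.random.dirichlet(np.ones(len(vocabulary) + 1), size=20)

print(ctc_greedy_decoding(probs_seq, vocabulary, blank_id))
results = ctc_beam_search_decoding(
    probs_seq, vocabulary, beam_size=10,
    cutoff_prob=0.99, cutoff_top_n=40, blank_id=blank_id)
print(results[0])                                 # (log probability, decoded text)
```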
+ param cutoff_prob (float): Cutoff probability in vocabulary pruning, + default 1.0, no pruning. + cutoff_top_n (int): Cutoff number in pruning, only top cutoff_top_n + characters with highest probs in vocabulary will be + used in beam search, default 40. + param ext_scorer (Scorer): External scorer for partially decoded sentence, e.g. word count + or language model. + """ + + def __init__(self, vocab_list, batch_size, beam_size, num_processes, + cutoff_prob, cutoff_top_n, _ext_scorer, blank_id): + paddlespeech_ctcdecoders.CtcBeamSearchDecoderBatch.__init__( + self, vocab_list, batch_size, beam_size, num_processes, cutoff_prob, + cutoff_top_n, _ext_scorer, blank_id) diff --git a/ernie-sat/paddlespeech/s2t/decoders/ctcdecoder/tests/test_decoders.py b/ernie-sat/paddlespeech/s2t/decoders/ctcdecoder/tests/test_decoders.py new file mode 100644 index 0000000000000000000000000000000000000000..a284890d6a45728efed01a0915a1069ca68154b1 --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/decoders/ctcdecoder/tests/test_decoders.py @@ -0,0 +1,100 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Test decoders.""" +import unittest + +from paddlespeech.s2t.decoders import decoders_deprecated as decoder + + +class TestDecoders(unittest.TestCase): + def setUp(self): + self.vocab_list = ["\'", ' ', 'a', 'b', 'c', 'd'] + self.beam_size = 20 + self.probs_seq1 = [[ + 0.06390443, 0.21124858, 0.27323887, 0.06870235, 0.0361254, + 0.18184413, 0.16493624 + ], [ + 0.03309247, 0.22866108, 0.24390638, 0.09699597, 0.31895462, + 0.0094893, 0.06890021 + ], [ + 0.218104, 0.19992557, 0.18245131, 0.08503348, 0.14903535, + 0.08424043, 0.08120984 + ], [ + 0.12094152, 0.19162472, 0.01473646, 0.28045061, 0.24246305, + 0.05206269, 0.09772094 + ], [ + 0.1333387, 0.00550838, 0.00301669, 0.21745861, 0.20803985, + 0.41317442, 0.01946335 + ], [ + 0.16468227, 0.1980699, 0.1906545, 0.18963251, 0.19860937, + 0.04377724, 0.01457421 + ]] + self.probs_seq2 = [[ + 0.08034842, 0.22671944, 0.05799633, 0.36814645, 0.11307441, + 0.04468023, 0.10903471 + ], [ + 0.09742457, 0.12959763, 0.09435383, 0.21889204, 0.15113123, + 0.10219457, 0.20640612 + ], [ + 0.45033529, 0.09091417, 0.15333208, 0.07939558, 0.08649316, + 0.12298585, 0.01654384 + ], [ + 0.02512238, 0.22079203, 0.19664364, 0.11906379, 0.07816055, + 0.22538587, 0.13483174 + ], [ + 0.17928453, 0.06065261, 0.41153005, 0.1172041, 0.11880313, + 0.07113197, 0.04139363 + ], [ + 0.15882358, 0.1235788, 0.23376776, 0.20510435, 0.00279306, + 0.05294827, 0.22298418 + ]] + self.greedy_result = ["ac'bdc", "b'da"] + self.beam_search_result = ['acdc', "b'a"] + + def test_greedy_decoder_1(self): + bst_result = decoder.ctc_greedy_decoder(self.probs_seq1, + self.vocab_list) + self.assertEqual(bst_result, self.greedy_result[0]) + + def test_greedy_decoder_2(self): + bst_result = decoder.ctc_greedy_decoder(self.probs_seq2, + self.vocab_list) + self.assertEqual(bst_result, self.greedy_result[1]) + + def test_beam_search_decoder_1(self): + 
beam_result = decoder.ctc_beam_search_decoder( + probs_seq=self.probs_seq1, + beam_size=self.beam_size, + vocabulary=self.vocab_list) + self.assertEqual(beam_result[0][1], self.beam_search_result[0]) + + def test_beam_search_decoder_2(self): + beam_result = decoder.ctc_beam_search_decoder( + probs_seq=self.probs_seq2, + beam_size=self.beam_size, + vocabulary=self.vocab_list) + self.assertEqual(beam_result[0][1], self.beam_search_result[1]) + + def test_beam_search_decoder_batch(self): + beam_results = decoder.ctc_beam_search_decoder_batch( + probs_split=[self.probs_seq1, self.probs_seq2], + beam_size=self.beam_size, + vocabulary=self.vocab_list, + num_processes=24) + self.assertEqual(beam_results[0][0][1], self.beam_search_result[0]) + self.assertEqual(beam_results[1][0][1], self.beam_search_result[1]) + + +if __name__ == '__main__': + unittest.main() diff --git a/ernie-sat/paddlespeech/s2t/decoders/recog.py b/ernie-sat/paddlespeech/s2t/decoders/recog.py new file mode 100644 index 0000000000000000000000000000000000000000..2d2aa2109191d80bc1c85f0153d0f5ec80e421aa --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/decoders/recog.py @@ -0,0 +1,196 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Reference espnet Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +# Modified from espnet(https://github.com/espnet/espnet) +"""V2 backend for `asr_recog.py` using py:class:`decoders.beam_search.BeamSearch`.""" +import jsonlines +import paddle +from yacs.config import CfgNode + +from .beam_search import BatchBeamSearch +from .beam_search import BeamSearch +from .scorers.length_bonus import LengthBonus +from .scorers.scorer_interface import BatchScorerInterface +from .utils import add_results_to_json +from paddlespeech.s2t.exps import dynamic_import_tester +from paddlespeech.s2t.io.reader import LoadInputsAndTargets +from paddlespeech.s2t.models.asr_interface import ASRInterface +from paddlespeech.s2t.models.lm_interface import dynamic_import_lm +from paddlespeech.s2t.utils.log import Log + +logger = Log(__name__).getlog() + +# NOTE: you need this func to generate our sphinx doc + + +def get_config(config_path): + confs = CfgNode(new_allowed=True) + confs.merge_from_file(config_path) + return confs + + +def load_trained_model(args): + confs = get_config(args.model_conf) + class_obj = dynamic_import_tester(args.model_name) + exp = class_obj(confs, args) + with exp.eval(): + exp.setup() + exp.restore() + char_list = exp.args.char_list + model = exp.model + return model, char_list, exp, confs + + +def load_trained_lm(args): + lm_args = get_config(args.rnnlm_conf) + lm_model_module = lm_args.model_module + lm_class = dynamic_import_lm(lm_model_module) + lm = lm_class(**lm_args.model) + model_dict = paddle.load(args.rnnlm) + lm.set_state_dict(model_dict) + return lm + + +def recog_v2(args): + """Decode with custom models that implements ScorerInterface. + + Args: + args (namespace): The program arguments. 
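`get_config` above is a thin wrapper around a yacs `CfgNode` that merges a YAML file. A minimal self-contained sketch of the same pattern; the config keys used here are made up for illustration and are not the real model config schema:

```python
import yaml
from yacs.config import CfgNode

# Write a tiny YAML file, then load it the way get_config() does.
with open("demo_conf.yaml", "w") as f:
    yaml.safe_dump({"beam_size": 10, "ctc_weight": 0.3}, f)

confs = CfgNode(new_allowed=True)          # accept keys that were not predeclared
confs.merge_from_file("demo_conf.yaml")
print(confs.beam_size, confs.ctc_weight)   # 10 0.3
```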
+ See py:func:`bin.asr_recog.get_parser` for details + + """ + logger.warning("experimental API for custom LMs is selected by --api v2") + if args.batchsize > 1: + raise NotImplementedError("multi-utt batch decoding is not implemented") + if args.streaming_mode is not None: + raise NotImplementedError("streaming mode is not implemented") + if args.word_rnnlm: + raise NotImplementedError("word LM is not implemented") + + # set_deterministic(args) + model, char_list, exp, confs = load_trained_model(args) + assert isinstance(model, ASRInterface) + + load_inputs_and_targets = LoadInputsAndTargets( + mode="asr", + load_output=False, + sort_in_input_length=False, + preprocess_conf=confs.preprocess_config + if args.preprocess_conf is None else args.preprocess_conf, + preprocess_args={"train": False}, ) + + if args.rnnlm: + lm = load_trained_lm(args) + lm.eval() + else: + lm = None + + if args.ngram_model: + from .scorers.ngram import NgramFullScorer + from .scorers.ngram import NgramPartScorer + + if args.ngram_scorer == "full": + ngram = NgramFullScorer(args.ngram_model, char_list) + else: + ngram = NgramPartScorer(args.ngram_model, char_list) + else: + ngram = None + + scorers = model.scorers() # decoder + scorers["lm"] = lm + scorers["ngram"] = ngram + scorers["length_bonus"] = LengthBonus(len(char_list)) + weights = dict( + decoder=1.0 - args.ctc_weight, + ctc=args.ctc_weight, + lm=args.lm_weight, + ngram=args.ngram_weight, + length_bonus=args.penalty, ) + beam_search = BeamSearch( + beam_size=args.beam_size, + vocab_size=len(char_list), + weights=weights, + scorers=scorers, + sos=model.sos, + eos=model.eos, + token_list=char_list, + pre_beam_score_key=None if args.ctc_weight == 1.0 else "full", ) + + # TODO(karita): make all scorers batchfied + if args.batchsize == 1: + non_batch = [ + k for k, v in beam_search.full_scorers.items() + if not isinstance(v, BatchScorerInterface) + ] + if len(non_batch) == 0: + beam_search.__class__ = BatchBeamSearch + logger.info("BatchBeamSearch implementation is selected.") + else: + logger.warning(f"As non-batch scorers {non_batch} are found, " + f"fall back to non-batch implementation.") + + if args.ngpu > 1: + raise NotImplementedError("only single GPU decoding is supported") + if args.ngpu == 1: + device = "gpu:0" + else: + device = "cpu" + paddle.set_device(device) + dtype = getattr(paddle, args.dtype) + logger.info(f"Decoding device={device}, dtype={dtype}") + model.to(device=device, dtype=dtype) + model.eval() + beam_search.to(device=device, dtype=dtype) + beam_search.eval() + + # read json data + js = [] + with jsonlines.open(args.recog_json, "r") as reader: + for item in reader: + js.append(item) + # jsonlines to dict, key by 'utt', value by jsonline + js = {item['utt']: item for item in js} + + new_js = {} + with paddle.no_grad(): + with jsonlines.open(args.result_label, "w") as f: + for idx, name in enumerate(js.keys(), 1): + logger.info(f"({idx}/{len(js.keys())}) decoding " + name) + batch = [(name, js[name])] + feat = load_inputs_and_targets(batch)[0][0] + logger.info(f'feat: {feat.shape}') + enc = model.encode(paddle.to_tensor(feat).to(dtype)) + logger.info(f'eout: {enc.shape}') + nbest_hyps = beam_search( + x=enc, + maxlenratio=args.maxlenratio, + minlenratio=args.minlenratio) + nbest_hyps = [ + h.asdict() + for h in nbest_hyps[:min(len(nbest_hyps), args.nbest)] + ] + new_js[name] = add_results_to_json(js[name], nbest_hyps, + char_list) + + item = new_js[name]['output'][0] # 1-best + ref = item['text'] + rec_text = item['rec_text'].replace('▁', 
' ').replace( + '', '').strip() + rec_tokenid = list(map(int, item['rec_tokenid'].split())) + f.write({ + "utt": name, + "refs": [ref], + "hyps": [rec_text], + "hyps_tokenid": [rec_tokenid], + }) diff --git a/ernie-sat/paddlespeech/s2t/decoders/recog_bin.py b/ernie-sat/paddlespeech/s2t/decoders/recog_bin.py new file mode 100644 index 0000000000000000000000000000000000000000..37b49f3a059fa29d177d3db0f9be973450660430 --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/decoders/recog_bin.py @@ -0,0 +1,376 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Reference espnet Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +# Modified from espnet(https://github.com/espnet/espnet) +"""End-to-end speech recognition model decoding script.""" +import logging +import os +import random +import sys + +import configargparse +import numpy as np +from distutils.util import strtobool + + +def get_parser(): + """Get default arguments.""" + parser = configargparse.ArgumentParser( + description="Transcribe text from speech using " + "a speech recognition model on one CPU or GPU", + config_file_parser_class=configargparse.YAMLConfigFileParser, + formatter_class=configargparse.ArgumentDefaultsHelpFormatter, ) + parser.add( + '--model-name', + type=str, + default='u2_kaldi', + help='model name, e.g: deepspeech2, u2, u2_kaldi, u2_st') + # general configuration + parser.add("--config", is_config_file=True, help="Config file path") + parser.add( + "--config2", + is_config_file=True, + help="Second config file path that overwrites the settings in `--config`", + ) + parser.add( + "--config3", + is_config_file=True, + help="Third config file path that overwrites the settings " + "in `--config` and `--config2`", ) + + parser.add_argument("--ngpu", type=int, default=0, help="Number of GPUs") + parser.add_argument( + "--dtype", + choices=("float16", "float32", "float64"), + default="float32", + help="Float precision (only available in --api v2)", ) + parser.add_argument("--debugmode", type=int, default=1, help="Debugmode") + parser.add_argument("--seed", type=int, default=1, help="Random seed") + parser.add_argument( + "--verbose", "-V", type=int, default=2, help="Verbose option") + parser.add_argument( + "--batchsize", + type=int, + default=1, + help="Batch size for beam search (0: means no batch processing)", ) + parser.add_argument( + "--preprocess-conf", + type=str, + default=None, + help="The configuration file for the pre-processing", ) + parser.add_argument( + "--api", + default="v2", + choices=["v2"], + help="Beam search APIs " + "v2: Experimental API. 
It supports any models that implements ScorerInterface.", + ) + # task related + parser.add_argument( + "--recog-json", type=str, help="Filename of recognition data (json)") + parser.add_argument( + "--result-label", + type=str, + required=True, + help="Filename of result label data (json)", ) + # model (parameter) related + parser.add_argument( + "--model", + type=str, + required=True, + help="Model file parameters to read") + parser.add_argument( + "--model-conf", type=str, default=None, help="Model config file") + parser.add_argument( + "--num-spkrs", + type=int, + default=1, + choices=[1, 2], + help="Number of speakers in the speech", ) + parser.add_argument( + "--num-encs", + default=1, + type=int, + help="Number of encoders in the model.") + # search related + parser.add_argument( + "--nbest", type=int, default=1, help="Output N-best hypotheses") + parser.add_argument("--beam-size", type=int, default=1, help="Beam size") + parser.add_argument( + "--penalty", type=float, default=0.0, help="Incertion penalty") + parser.add_argument( + "--maxlenratio", + type=float, + default=0.0, + help="""Input length ratio to obtain max output length. + If maxlenratio=0.0 (default), it uses a end-detect function + to automatically find maximum hypothesis lengths. + If maxlenratio<0.0, its absolute value is interpreted + as a constant max output length""", ) + parser.add_argument( + "--minlenratio", + type=float, + default=0.0, + help="Input length ratio to obtain min output length", ) + parser.add_argument( + "--ctc-weight", + type=float, + default=0.0, + help="CTC weight in joint decoding") + parser.add_argument( + "--weights-ctc-dec", + type=float, + action="append", + help="ctc weight assigned to each encoder during decoding." + "[in multi-encoder mode only]", ) + parser.add_argument( + "--ctc-window-margin", + type=int, + default=0, + help="""Use CTC window with margin parameter to accelerate + CTC/attention decoding especially on GPU. Smaller magin + makes decoding faster, but may increase search errors. + If margin=0 (default), this function is disabled""", ) + # transducer related + parser.add_argument( + "--search-type", + type=str, + default="default", + choices=["default", "nsc", "tsd", "alsd", "maes"], + help="""Type of beam search implementation to use during inference. 
+ Can be either: default beam search ("default"), + N-Step Constrained beam search ("nsc"), Time-Synchronous Decoding ("tsd"), + Alignment-Length Synchronous Decoding ("alsd") or + modified Adaptive Expansion Search ("maes").""", ) + parser.add_argument( + "--nstep", + type=int, + default=1, + help="""Number of expansion steps allowed in NSC beam search or mAES + (nstep > 0 for NSC and nstep > 1 for mAES).""", ) + parser.add_argument( + "--prefix-alpha", + type=int, + default=2, + help="Length prefix difference allowed in NSC beam search or mAES.", ) + parser.add_argument( + "--max-sym-exp", + type=int, + default=2, + help="Number of symbol expansions allowed in TSD.", ) + parser.add_argument( + "--u-max", + type=int, + default=400, + help="Length prefix difference allowed in ALSD.", ) + parser.add_argument( + "--expansion-gamma", + type=float, + default=2.3, + help="Allowed logp difference for prune-by-value method in mAES.", ) + parser.add_argument( + "--expansion-beta", + type=int, + default=2, + help="""Number of additional candidates for expanded hypotheses + selection in mAES.""", ) + parser.add_argument( + "--score-norm", + type=strtobool, + nargs="?", + default=True, + help="Normalize final hypotheses' score by length", ) + parser.add_argument( + "--softmax-temperature", + type=float, + default=1.0, + help="Penalization term for softmax function.", ) + # rnnlm related + parser.add_argument( + "--rnnlm", type=str, default=None, help="RNNLM model file to read") + parser.add_argument( + "--rnnlm-conf", + type=str, + default=None, + help="RNNLM model config file to read") + parser.add_argument( + "--word-rnnlm", + type=str, + default=None, + help="Word RNNLM model file to read") + parser.add_argument( + "--word-rnnlm-conf", + type=str, + default=None, + help="Word RNNLM model config file to read", ) + parser.add_argument( + "--word-dict", type=str, default=None, help="Word list to read") + parser.add_argument( + "--lm-weight", type=float, default=0.1, help="RNNLM weight") + # ngram related + parser.add_argument( + "--ngram-model", + type=str, + default=None, + help="ngram model file to read") + parser.add_argument( + "--ngram-weight", type=float, default=0.1, help="ngram weight") + parser.add_argument( + "--ngram-scorer", + type=str, + default="part", + choices=("full", "part"), + help="""if the ngram is set as a part scorer, similar with CTC scorer, + ngram scorer only scores topK hypethesis. + if the ngram is set as full scorer, ngram scorer scores all hypthesis + the decoding speed of part scorer is musch faster than full one""", + ) + # streaming related + parser.add_argument( + "--streaming-mode", + type=str, + default=None, + choices=["window", "segment"], + help="""Use streaming recognizer for inference. + `--batchsize` must be set to 0 to enable this mode""", ) + parser.add_argument( + "--streaming-window", type=int, default=10, help="Window size") + parser.add_argument( + "--streaming-min-blank-dur", + type=int, + default=10, + help="Minimum blank duration threshold", ) + parser.add_argument( + "--streaming-onset-margin", type=int, default=1, help="Onset margin") + parser.add_argument( + "--streaming-offset-margin", type=int, default=1, help="Offset margin") + # non-autoregressive related + # Mask CTC related. See https://arxiv.org/abs/2005.08700 for the detail. + parser.add_argument( + "--maskctc-n-iterations", + type=int, + default=10, + help="Number of decoding iterations." 
+ "For Mask CTC, set 0 to predict 1 mask/iter.", ) + parser.add_argument( + "--maskctc-probability-threshold", + type=float, + default=0.999, + help="Threshold probability for CTC output", ) + # quantize model related + parser.add_argument( + "--quantize-config", + nargs="*", + help="Quantize config list. E.g.: --quantize-config=[Linear,LSTM,GRU]", + ) + parser.add_argument( + "--quantize-dtype", + type=str, + default="qint8", + help="Dtype dynamic quantize") + parser.add_argument( + "--quantize-asr-model", + type=bool, + default=False, + help="Quantize asr model", ) + parser.add_argument( + "--quantize-lm-model", + type=bool, + default=False, + help="Quantize lm model", ) + return parser + + +def main(args): + """Run the main decoding function.""" + parser = get_parser() + parser.add_argument( + "--output", metavar="CKPT_DIR", help="path to save checkpoint.") + parser.add_argument( + "--checkpoint_path", type=str, help="path to load checkpoint") + parser.add_argument("--dict-path", type=str, help="path to load checkpoint") + args = parser.parse_args(args) + + if args.ngpu == 0 and args.dtype == "float16": + raise ValueError( + f"--dtype {args.dtype} does not support the CPU backend.") + + # logging info + if args.verbose == 1: + logging.basicConfig( + level=logging.INFO, + format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s", + ) + elif args.verbose == 2: + logging.basicConfig( + level=logging.DEBUG, + format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s", + ) + else: + logging.basicConfig( + level=logging.WARN, + format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s", + ) + logging.warning("Skip DEBUG/INFO messages") + logging.info(args) + + # check CUDA_VISIBLE_DEVICES + if args.ngpu > 0: + cvd = os.environ.get("CUDA_VISIBLE_DEVICES") + if cvd is None: + logging.warning("CUDA_VISIBLE_DEVICES is not set.") + elif args.ngpu != len(cvd.split(",")): + logging.error("#gpus is not matched with CUDA_VISIBLE_DEVICES.") + sys.exit(1) + + # TODO(mn5k): support of multiple GPUs + if args.ngpu > 1: + logging.error("The program only supports ngpu=1.") + sys.exit(1) + + # display PYTHONPATH + logging.info("python path = " + os.environ.get("PYTHONPATH", "(None)")) + + # seed setting + random.seed(args.seed) + np.random.seed(args.seed) + logging.info("set random seed = %d" % args.seed) + + # validate rnn options + if args.rnnlm is not None and args.word_rnnlm is not None: + logging.error( + "It seems that both --rnnlm and --word-rnnlm are specified. " + "Please use either option.") + sys.exit(1) + + # recog + if args.num_spkrs == 1: + if args.num_encs == 1: + # Experimental API that supports custom LMs + if args.api == "v2": + from paddlespeech.s2t.decoders.recog import recog_v2 + recog_v2(args) + else: + raise ValueError("Only support --api v2") + else: + if args.api == "v2": + raise NotImplementedError( + f"--num-encs {args.num_encs} > 1 is not supported in --api v2" + ) + elif args.num_spkrs == 2: + raise ValueError("asr_mix not supported.") + + +if __name__ == "__main__": + main(sys.argv[1:]) diff --git a/ernie-sat/paddlespeech/s2t/decoders/scorers/__init__.py b/ernie-sat/paddlespeech/s2t/decoders/scorers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..185a92b8d94d3426d616c0624f0f2ee04339349e --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/decoders/scorers/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/ernie-sat/paddlespeech/s2t/decoders/scorers/ctc.py b/ernie-sat/paddlespeech/s2t/decoders/scorers/ctc.py new file mode 100644 index 0000000000000000000000000000000000000000..81d8b078392eb0282d59cfbefbb72a2583647aae --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/decoders/scorers/ctc.py @@ -0,0 +1,164 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from espnet(https://github.com/espnet/espnet) +"""ScorerInterface implementation for CTC.""" +import numpy as np +import paddle + +from .ctc_prefix_score import CTCPrefixScore +from .ctc_prefix_score import CTCPrefixScorePD +from .scorer_interface import BatchPartialScorerInterface + + +class CTCPrefixScorer(BatchPartialScorerInterface): + """Decoder interface wrapper for CTCPrefixScore.""" + + def __init__(self, ctc: paddle.nn.Layer, eos: int): + """Initialize class. + + Args: + ctc (paddle.nn.Layer): The CTC implementation. + For example, :class:`paddlespeech.s2t.modules.ctc.CTC` + eos (int): The end-of-sequence id. + + """ + self.ctc = ctc + self.eos = eos + self.impl = None + + def init_state(self, x: paddle.Tensor): + """Get an initial state for decoding. + + Args: + x (paddle.Tensor): The encoded feature tensor + + Returns: initial state + + """ + logp = self.ctc.log_softmax(x.unsqueeze(0)).squeeze(0).numpy() + # TODO(karita): use CTCPrefixScorePD + self.impl = CTCPrefixScore(logp, 0, self.eos, np) + return 0, self.impl.initial_state() + + def select_state(self, state, i, new_id=None): + """Select state with relative ids in the main beam search. + + Args: + state: Decoder state for prefix tokens + i (int): Index to select a state in the main beam search + new_id (int): New label id to select a state if necessary + + Returns: + state: pruned state + + """ + if type(state) == tuple: + if len(state) == 2: # for CTCPrefixScore + sc, st = state + return sc[i], st[i] + else: # for CTCPrefixScorePD (need new_id > 0) + r, log_psi, f_min, f_max, scoring_idmap = state + s = log_psi[i, new_id].expand(log_psi.size(1)) + if scoring_idmap is not None: + return r[:, :, i, scoring_idmap[i, new_id]], s, f_min, f_max + else: + return r[:, :, i, new_id], s, f_min, f_max + return None if state is None else state[i] + + def score_partial(self, y, ids, state, x): + """Score new token. 
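`init_state` above hands `CTCPrefixScore` the per-frame log-softmax of the CTC output. A numerically stable log-softmax sketch in NumPy, standing in for the `self.ctc.log_softmax` call that the real code makes on a paddle tensor:

```python
import numpy as np

def log_softmax(x, axis=-1):
    x = x - x.max(axis=axis, keepdims=True)            # subtract max for stability
    return x - np.log(np.exp(x).sum(axis=axis, keepdims=True))

logits = np.random.randn(7, 5)                          # (T, odim) CTC logits
logp = log_softmax(logits)
print(np.allclose(np.exp(logp).sum(axis=-1), 1.0))      # True: each frame is a distribution
```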
+ + Args: + y (paddle.Tensor): 1D prefix token + next_tokens (paddle.Tensor): paddle.int64 next token to score + state: decoder state for prefix tokens + x (paddle.Tensor): 2D encoder feature that generates ys + + Returns: + tuple[paddle.Tensor, Any]: + Tuple of a score tensor for y that has a shape `(len(next_tokens),)` + and next state for ys + + """ + prev_score, state = state + presub_score, new_st = self.impl(y.cpu(), ids.cpu(), state) + tscore = paddle.to_tensor( + presub_score - prev_score, place=x.place, dtype=x.dtype) + return tscore, (presub_score, new_st) + + def batch_init_state(self, x: paddle.Tensor): + """Get an initial state for decoding. + + Args: + x (paddle.Tensor): The encoded feature tensor + + Returns: initial state + + """ + logp = self.ctc.log_softmax(x.unsqueeze(0)) # assuming batch_size = 1 + xlen = paddle.to_tensor([logp.size(1)]) + self.impl = CTCPrefixScorePD(logp, xlen, 0, self.eos) + return None + + def batch_score_partial(self, y, ids, state, x): + """Score new token. + + Args: + y (paddle.Tensor): 1D prefix token + ids (paddle.Tensor): paddle.int64 next token to score + state: decoder state for prefix tokens + x (paddle.Tensor): 2D encoder feature that generates ys + + Returns: + tuple[paddle.Tensor, Any]: + Tuple of a score tensor for y that has a shape `(len(next_tokens),)` + and next state for ys + + """ + batch_state = ( + (paddle.stack([s[0] for s in state], axis=2), + paddle.stack([s[1] for s in state]), state[0][2], state[0][3], ) + if state[0] is not None else None) + return self.impl(y, batch_state, ids) + + def extend_prob(self, x: paddle.Tensor): + """Extend probs for decoding. + + This extension is for streaming decoding + as in Eq (14) in https://arxiv.org/abs/2006.14941 + + Args: + x (paddle.Tensor): The encoded feature tensor + + """ + logp = self.ctc.log_softmax(x.unsqueeze(0)) + self.impl.extend_prob(logp) + + def extend_state(self, state): + """Extend state for decoding. + + This extension is for streaming decoding + as in Eq (14) in https://arxiv.org/abs/2006.14941 + + Args: + state: The states of hyps + + Returns: extended state + + """ + new_state = [] + for s in state: + new_state.append(self.impl.extend_state(s)) + + return new_state diff --git a/ernie-sat/paddlespeech/s2t/decoders/scorers/ctc_prefix_score.py b/ernie-sat/paddlespeech/s2t/decoders/scorers/ctc_prefix_score.py new file mode 100644 index 0000000000000000000000000000000000000000..78b8fe36c8c0383d642740cab252ba7c89ba2ec0 --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/decoders/scorers/ctc_prefix_score.py @@ -0,0 +1,370 @@ +#!/usr/bin/env python3 +# Copyright 2018 Mitsubishi Electric Research Labs (Takaaki Hori) +# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +import numpy as np +import paddle +import six + + +class CTCPrefixScorePD(): + """Batch processing of CTCPrefixScore + + which is based on Algorithm 2 in WATANABE et al. + "HYBRID CTC/ATTENTION ARCHITECTURE FOR END-TO-END SPEECH RECOGNITION," + but extended to efficiently compute the label probabilities for multiple + hypotheses simultaneously + See also Seki et al. "Vectorized Beam Search for CTC-Attention-Based + Speech Recognition," In INTERSPEECH (pp. 3825-3829), 2019. 
+ """ + + def __init__(self, x, xlens, blank, eos, margin=0): + """Construct CTC prefix scorer + + `margin` is M in eq.(22,23) + + :param paddle.Tensor x: input label posterior sequences (B, T, O) + :param paddle.Tensor xlens: input lengths (B,) + :param int blank: blank label id + :param int eos: end-of-sequence id + :param int margin: margin parameter for windowing (0 means no windowing) + """ + # In the comment lines, + # we assume T: input_length, B: batch size, W: beam width, O: output dim. + self.logzero = -10000000000.0 + self.blank = blank + self.eos = eos + self.batch = x.size(0) + self.input_length = x.size(1) + self.odim = x.size(2) + self.dtype = x.dtype + + # Pad the rest of posteriors in the batch + # TODO(takaaki-hori): need a better way without for-loops + for i, l in enumerate(xlens): + if l < self.input_length: + x[i, l:, :] = self.logzero + x[i, l:, blank] = 0 + # Reshape input x + xn = x.transpose([1, 0, 2]) # (B, T, O) -> (T, B, O) + xb = xn[:, :, self.blank].unsqueeze(2).expand(-1, -1, + self.odim) # (T,B,O) + self.x = paddle.stack([xn, xb]) # (2, T, B, O) + self.end_frames = paddle.to_tensor(xlens) - 1 # (B,) + + # Setup CTC windowing + self.margin = margin + if margin > 0: + self.frame_ids = paddle.arange(self.input_length, dtype=self.dtype) + # Base indices for index conversion + # B idx, hyp idx. shape (B*W, 1) + self.idx_bh = None + # B idx. shape (B,) + self.idx_b = paddle.arange(self.batch) + # B idx, O idx. shape (B, 1) + self.idx_bo = (self.idx_b * self.odim).unsqueeze(1) + + def __call__(self, y, state, scoring_ids=None, att_w=None): + """Compute CTC prefix scores for next labels + + :param list y: prefix label sequences + :param tuple state: previous CTC state + :param paddle.Tensor scoring_ids: selected next ids to score (BW, O'), O' <= O + :param paddle.Tensor att_w: attention weights to decide CTC window + :return new_state, ctc_local_scores (BW, O) + """ + output_length = len(y[0]) - 1 # ignore sos + last_ids = [yi[-1] for yi in y] # last output label ids + n_bh = len(last_ids) # batch * hyps + n_hyps = n_bh // self.batch # assuming each utterance has the same # of hyps + self.scoring_num = scoring_ids.size( + -1) if scoring_ids is not None else 0 + # prepare state info + if state is None: + r_prev = paddle.full( + (self.input_length, 2, self.batch, n_hyps), + self.logzero, + dtype=self.dtype, ) # (T, 2, B, W) + r_prev[:, 1] = paddle.cumsum(self.x[0, :, :, self.blank], + 0).unsqueeze(2) + r_prev = r_prev.view(-1, 2, n_bh) # (T, 2, BW) + s_prev = 0.0 # score + f_min_prev = 0 # eq. 22-23 + f_max_prev = 1 # eq. 
22-23 + else: + r_prev, s_prev, f_min_prev, f_max_prev = state + + # select input dimensions for scoring + if self.scoring_num > 0: + # (BW, O) + scoring_idmap = paddle.full( + (n_bh, self.odim), -1, dtype=paddle.long) + snum = self.scoring_num + if self.idx_bh is None or n_bh > len(self.idx_bh): + self.idx_bh = paddle.arange(n_bh).view(-1, 1) # (BW, 1) + scoring_idmap[self.idx_bh[:n_bh], scoring_ids] = paddle.arange(snum) + scoring_idx = ( + scoring_ids + self.idx_bo.repeat(1, n_hyps).view(-1, + 1) # (BW,1) + ).view(-1) # (BWO) + # x_ shape (2, T, B*W, O) + x_ = paddle.index_select( + self.x.view(2, -1, self.batch * self.odim), scoring_idx, + 2).view(2, -1, n_bh, snum) + else: + scoring_ids = None + scoring_idmap = None + snum = self.odim + # x_ shape (2, T, B*W, O) + x_ = self.x.unsqueeze(3).repeat(1, 1, 1, n_hyps, 1).view(2, -1, + n_bh, snum) + + # new CTC forward probs are prepared as a (T x 2 x BW x S) tensor + # that corresponds to r_t^n(h) and r_t^b(h) in a batch. + r = paddle.full( + (self.input_length, 2, n_bh, snum), + self.logzero, + dtype=self.dtype, ) + if output_length == 0: + r[0, 0] = x_[0, 0] + + r_sum = paddle.logsumexp(r_prev, 1) #(T,BW) + log_phi = r_sum.unsqueeze(2).repeat(1, 1, snum) # (T, BW, O) + if scoring_ids is not None: + for idx in range(n_bh): + pos = scoring_idmap[idx, last_ids[idx]] + if pos >= 0: + log_phi[:, idx, pos] = r_prev[:, 1, idx] + else: + for idx in range(n_bh): + log_phi[:, idx, last_ids[idx]] = r_prev[:, 1, idx] + + # decide start and end frames based on attention weights + if att_w is not None and self.margin > 0: + f_arg = paddle.matmul(att_w, self.frame_ids) + f_min = max(int(f_arg.min().cpu()), f_min_prev) + f_max = max(int(f_arg.max().cpu()), f_max_prev) + start = min(f_max_prev, max(f_min - self.margin, output_length, 1)) + end = min(f_max + self.margin, self.input_length) + else: + f_min = f_max = 0 + # if one frame one out, the output_length is the eating frame num now. 
+ start = max(output_length, 1) + end = self.input_length + + # compute forward probabilities log(r_t^n(h)) and log(r_t^b(h)) + for t in range(start, end): + rp = r[t - 1] # (2 x BW x O') + rr = paddle.stack([rp[0], log_phi[t - 1], rp[0], rp[1]]).view( + 2, 2, n_bh, snum) # (2,2,BW,O') + r[t] = paddle.logsumexp(rr, 1) + x_[:, t] + + # compute log prefix probabilities log(psi) + log_phi_x = paddle.concat( + (log_phi[0].unsqueeze(0), log_phi[:-1]), axis=0) + x_[0] + if scoring_ids is not None: + log_psi = paddle.full( + (n_bh, self.odim), self.logzero, dtype=self.dtype) + log_psi_ = paddle.logsumexp( + paddle.concat( + (log_phi_x[start:end], r[start - 1, 0].unsqueeze(0)), + axis=0), + axis=0, ) + for si in range(n_bh): + log_psi[si, scoring_ids[si]] = log_psi_[si] + else: + log_psi = paddle.logsumexp( + paddle.concat( + (log_phi_x[start:end], r[start - 1, 0].unsqueeze(0)), + axis=0), + axis=0, ) + + for si in range(n_bh): + log_psi[si, self.eos] = r_sum[self.end_frames[si // n_hyps], si] + + # exclude blank probs + log_psi[:, self.blank] = self.logzero + + return (log_psi - s_prev), (r, log_psi, f_min, f_max, scoring_idmap) + + def index_select_state(self, state, best_ids): + """Select CTC states according to best ids + + :param state : CTC state + :param best_ids : index numbers selected by beam pruning (B, W) + :return selected_state + """ + r, s, f_min, f_max, scoring_idmap = state + # convert ids to BHO space + n_bh = len(s) + n_hyps = n_bh // self.batch + vidx = (best_ids + (self.idx_b * + (n_hyps * self.odim)).view(-1, 1)).view(-1) + # select hypothesis scores + s_new = paddle.index_select(s.view(-1), vidx, 0) + s_new = s_new.view(-1, 1).repeat(1, self.odim).view(n_bh, self.odim) + # convert ids to BHS space (S: scoring_num) + if scoring_idmap is not None: + snum = self.scoring_num + hyp_idx = (best_ids // self.odim + + (self.idx_b * n_hyps).view(-1, 1)).view(-1) + label_ids = paddle.fmod(best_ids, self.odim).view(-1) + score_idx = scoring_idmap[hyp_idx, label_ids] + score_idx[score_idx == -1] = 0 + vidx = score_idx + hyp_idx * snum + else: + snum = self.odim + # select forward probabilities + r_new = paddle.index_select(r.view(-1, 2, n_bh * snum), vidx, 2).view( + -1, 2, n_bh) + return r_new, s_new, f_min, f_max + + def extend_prob(self, x): + """Extend CTC prob. + + :param paddle.Tensor x: input label posterior sequences (B, T, O) + """ + + if self.x.shape[1] < x.shape[1]: # self.x (2,T,B,O); x (B,T,O) + # Pad the rest of posteriors in the batch + # TODO(takaaki-hori): need a better way without for-loops + xlens = [x.size(1)] + for i, l in enumerate(xlens): + if l < self.input_length: + x[i, l:, :] = self.logzero + x[i, l:, self.blank] = 0 + tmp_x = self.x + xn = x.transpose([1, 0, 2]) # (B, T, O) -> (T, B, O) + xb = xn[:, :, self.blank].unsqueeze(2).expand(-1, -1, self.odim) + self.x = paddle.stack([xn, xb]) # (2, T, B, O) + self.x[:, :tmp_x.shape[1], :, :] = tmp_x + self.input_length = x.size(1) + self.end_frames = paddle.to_tensor(xlens) - 1 + + def extend_state(self, state): + """Compute CTC prefix state. 
+ + + :param state : CTC state + :return ctc_state + """ + + if state is None: + # nothing to do + return state + else: + r_prev, s_prev, f_min_prev, f_max_prev = state + + r_prev_new = paddle.full( + (self.input_length, 2), + self.logzero, + dtype=self.dtype, ) + start = max(r_prev.shape[0], 1) + r_prev_new[0:start] = r_prev + for t in range(start, self.input_length): + r_prev_new[t, 1] = r_prev_new[t - 1, 1] + self.x[0, t, :, + self.blank] + + return (r_prev_new, s_prev, f_min_prev, f_max_prev) + + +class CTCPrefixScore(): + """Compute CTC label sequence scores + + which is based on Algorithm 2 in WATANABE et al. + "HYBRID CTC/ATTENTION ARCHITECTURE FOR END-TO-END SPEECH RECOGNITION," + but extended to efficiently compute the probabilities of multiple labels + simultaneously + """ + + def __init__(self, x, blank, eos, xp): + self.xp = xp + self.logzero = -10000000000.0 + self.blank = blank + self.eos = eos + self.input_length = len(x) + self.x = x # (T, O) + + def initial_state(self): + """Obtain an initial CTC state + + :return: CTC state + """ + # initial CTC state is made of a frame x 2 tensor that corresponds to + # r_t^n() and r_t^b(), where 0 and 1 of axis=1 represent + # superscripts n and b (non-blank and blank), respectively. + # r shape (T, 2) + r = self.xp.full((self.input_length, 2), self.logzero, dtype=np.float32) + r[0, 1] = self.x[0, self.blank] + for i in six.moves.range(1, self.input_length): + r[i, 1] = r[i - 1, 1] + self.x[i, self.blank] + return r + + def __call__(self, y, cs, r_prev): + """Compute CTC prefix scores for next labels + + :param y : prefix label sequence + :param cs : array of next labels + :param r_prev: previous CTC state + :return ctc_scores, ctc_states + """ + # initialize CTC states + output_length = len(y) - 1 # ignore sos + # new CTC states are prepared as a frame x (n or b) x n_labels tensor + # that corresponds to r_t^n(h) and r_t^b(h). + # r shape (T, 2, n_labels) + r = self.xp.ndarray((self.input_length, 2, len(cs)), dtype=np.float32) + xs = self.x[:, cs] + if output_length == 0: + r[0, 0] = xs[0] + r[0, 1] = self.logzero + else: + # Although the code does not exactly follow Algorithm 2, + # we don't have to change it because we can assume + # r_t(h)=0 for t < |h| in CTC forward computation + # (Note: we assume here that index t starts with 0). + # The purpose of this difference is to reduce the number of for-loops. + # https://github.com/espnet/espnet/pull/3655 + # where we start to accumulate r_t(h) from t=|h| + # and iterate r_t(h) = (r_{t-1}(h) + ...) to T-1, + # avoiding accumulating zeros for t=1~|h|-1. + # Thus, we need to set r_{|h|-1}(h) = 0, + # i.e., r[output_length-1] = logzero, for initialization. + # This is just for reducing the computation. 
+ r[output_length - 1] = self.logzero + + # prepare forward probabilities for the last label + r_sum = self.xp.logaddexp(r_prev[:, 0], + r_prev[:, 1]) # log(r_t^n(g) + r_t^b(g)) + last = y[-1] + if output_length > 0 and last in cs: + log_phi = self.xp.ndarray( + (self.input_length, len(cs)), dtype=np.float32) + for i in six.moves.range(len(cs)): + log_phi[:, i] = r_sum if cs[i] != last else r_prev[:, 1] + else: + log_phi = r_sum + + # compute forward probabilities log(r_t^n(h)), log(r_t^b(h)), + # and log prefix probabilities log(psi) + start = max(output_length, 1) + log_psi = r[start - 1, 0] + for t in six.moves.range(start, self.input_length): + r[t, 0] = self.xp.logaddexp(r[t - 1, 0], log_phi[t - 1]) + xs[t] + r[t, 1] = (self.xp.logaddexp(r[t - 1, 0], r[t - 1, 1]) + + self.x[t, self.blank]) + log_psi = self.xp.logaddexp(log_psi, log_phi[t - 1] + xs[t]) + + # get P(...eos|X) that ends with the prefix itself + eos_pos = self.xp.where(cs == self.eos)[0] + if len(eos_pos) > 0: + log_psi[eos_pos] = r_sum[-1] # log(r_T^n(g) + r_T^b(g)) + + # exclude blank probs + blank_pos = self.xp.where(cs == self.blank)[0] + if len(blank_pos) > 0: + log_psi[blank_pos] = self.logzero + + # return the log prefix probability and CTC states, where the label axis + # of the CTC states is moved to the first axis to slice it easily + # log_psi shape (n_labels,), state shape (n_labels, T, 2) + return log_psi, self.xp.rollaxis(r, 2) diff --git a/ernie-sat/paddlespeech/s2t/decoders/scorers/length_bonus.py b/ernie-sat/paddlespeech/s2t/decoders/scorers/length_bonus.py new file mode 100644 index 0000000000000000000000000000000000000000..c5a76db886788cccde19b6d02e8c9611be78ebde --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/decoders/scorers/length_bonus.py @@ -0,0 +1,73 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from espnet(https://github.com/espnet/espnet) +"""Length bonus module.""" +from typing import Any +from typing import List +from typing import Tuple + +import paddle + +from .scorer_interface import BatchScorerInterface + + +class LengthBonus(BatchScorerInterface): + """Length bonus in beam search.""" + + def __init__(self, n_vocab: int): + """Initialize class. + + Args: + n_vocab (int): The number of tokens in vocabulary for beam search + + """ + self.n = n_vocab + + def score(self, y, state, x): + """Score new token. + + Args: + y (paddle.Tensor): 1D paddle.int64 prefix tokens. + state: Scorer state for prefix tokens + x (paddle.Tensor): 2D encoder feature that generates ys. + + Returns: + tuple[paddle.Tensor, Any]: Tuple of + paddle.float32 scores for next token (n_vocab) + and None + + """ + return paddle.to_tensor( + [1.0], place=x.place, dtype=x.dtype).expand(self.n), None + + def batch_score(self, + ys: paddle.Tensor, + states: List[Any], + xs: paddle.Tensor) -> Tuple[paddle.Tensor, List[Any]]: + """Score new token batch. + + Args: + ys (paddle.Tensor): paddle.int64 prefix tokens (n_batch, ylen). 
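
As a point of reference, here is a minimal sketch of driving the numpy-based `CTCPrefixScore` defined above on toy data. The log-posterior matrix, vocabulary size, and token ids are fabricated for illustration; only the class and its call signature come from this file, and the import path assumes the vendored `paddlespeech` package added by this diff is importable.

```python
import numpy as np

from paddlespeech.s2t.decoders.scorers.ctc_prefix_score import CTCPrefixScore

# Toy CTC log-posteriors: 10 frames over a 5-token vocabulary
# (0 = blank, 4 = sos/eos); the values are illustrative only.
rng = np.random.default_rng(0)
logits = rng.normal(size=(10, 5)).astype(np.float32)
logp = logits - np.log(np.exp(logits).sum(axis=1, keepdims=True))  # row-wise log-softmax

scorer = CTCPrefixScore(logp, blank=0, eos=4, xp=np)
r = scorer.initial_state()        # (T, 2) CTC forward variables for the initial prefix

y = [4, 2]                        # prefix: <sos> (id 4) followed by token 2
cs = np.array([1, 2, 3, 4])       # candidate next tokens to score
log_psi, states = scorer(y, cs, r)
print(log_psi.shape, states.shape)  # (4,) and (4, 10, 2)
```

In beam search, `log_psi` is the quantity that gets interpolated with the attention-decoder scores, while `states` (one `(T, 2)` slice per candidate label) is carried forward as the CTC state of each extended prefix.
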
+ states (List[Any]): Scorer states for prefix tokens. + xs (paddle.Tensor): + The encoder feature that generates ys (n_batch, xlen, n_feat). + + Returns: + tuple[paddle.Tensor, List[Any]]: Tuple of + batchfied scores for next token with shape of `(n_batch, n_vocab)` + and next state list for ys. + + """ + return (paddle.to_tensor([1.0], place=xs.place, dtype=xs.dtype).expand( + ys.shape[0], self.n), None, ) diff --git a/ernie-sat/paddlespeech/s2t/decoders/scorers/ngram.py b/ernie-sat/paddlespeech/s2t/decoders/scorers/ngram.py new file mode 100644 index 0000000000000000000000000000000000000000..f2600828dd2d0a89679c0b44421ea5fbc04e289e --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/decoders/scorers/ngram.py @@ -0,0 +1,116 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from espnet(https://github.com/espnet/espnet) +"""Ngram lm implement.""" +from abc import ABC + +import kenlm +import paddle + +from .scorer_interface import BatchScorerInterface +from .scorer_interface import PartialScorerInterface + + +class Ngrambase(ABC): + """Ngram base implemented through ScorerInterface.""" + + def __init__(self, ngram_model, token_list): + """Initialize Ngrambase. + + Args: + ngram_model: ngram model path + token_list: token list from dict or model.json + + """ + self.chardict = [x if x != "" else "" for x in token_list] + self.charlen = len(self.chardict) + self.lm = kenlm.LanguageModel(ngram_model) + self.tmpkenlmstate = kenlm.State() + + def init_state(self, x): + """Initialize tmp state.""" + state = kenlm.State() + self.lm.NullContextWrite(state) + return state + + def score_partial_(self, y, next_token, state, x): + """Score interface for both full and partial scorer. + + Args: + y: previous char + next_token: next token need to be score + state: previous state + x: encoded feature + + Returns: + tuple[paddle.Tensor, List[Any]]: Tuple of + batchfied scores for next token with shape of `(n_batch, n_vocab)` + and next state list for ys. + + """ + out_state = kenlm.State() + ys = self.chardict[y[-1]] if y.shape[0] > 1 else "" + self.lm.BaseScore(state, ys, out_state) + scores = paddle.empty_like(next_token, dtype=x.dtype) + for i, j in enumerate(next_token): + scores[i] = self.lm.BaseScore(out_state, self.chardict[j], + self.tmpkenlmstate) + return scores, out_state + + +class NgramFullScorer(Ngrambase, BatchScorerInterface): + """Fullscorer for ngram.""" + + def score(self, y, state, x): + """Score interface for both full and partial scorer. + + Args: + y: previous char + state: previous state + x: encoded feature + + Returns: + tuple[paddle.Tensor, List[Any]]: Tuple of + batchfied scores for next token with shape of `(n_batch, n_vocab)` + and next state list for ys. 
+ + """ + return self.score_partial_(y, + paddle.to_tensor(range(self.charlen)), state, + x) + + +class NgramPartScorer(Ngrambase, PartialScorerInterface): + """Partialscorer for ngram.""" + + def score_partial(self, y, next_token, state, x): + """Score interface for both full and partial scorer. + + Args: + y: previous char + next_token: next token need to be score + state: previous state + x: encoded feature + + Returns: + tuple[paddle.Tensor, List[Any]]: Tuple of + batchfied scores for next token with shape of `(n_batch, n_vocab)` + and next state list for ys. + + """ + return self.score_partial_(y, next_token, state, x) + + def select_state(self, state, i): + """Empty select state for scorer interface.""" + return state diff --git a/ernie-sat/paddlespeech/s2t/decoders/scorers/scorer_interface.py b/ernie-sat/paddlespeech/s2t/decoders/scorers/scorer_interface.py new file mode 100644 index 0000000000000000000000000000000000000000..3272e6b7aa0422a3ebbf45e3b6d2931a70fab784 --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/decoders/scorers/scorer_interface.py @@ -0,0 +1,202 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from espnet(https://github.com/espnet/espnet) +"""Scorer interface module.""" +import warnings +from typing import Any +from typing import List +from typing import Tuple + +import paddle + + +class ScorerInterface: + """Scorer interface for beam search. + + The scorer performs scoring of the all tokens in vocabulary. + + Examples: + * Search heuristics + * :class:`scorers.length_bonus.LengthBonus` + * Decoder networks of the sequence-to-sequence models + * :class:`transformer.decoder.Decoder` + * :class:`rnn.decoders.Decoder` + * Neural language models + * :class:`lm.transformer.TransformerLM` + * :class:`lm.default.DefaultRNNLM` + * :class:`lm.seq_rnn.SequentialRNNLM` + + """ + + def init_state(self, x: paddle.Tensor) -> Any: + """Get an initial state for decoding (optional). + + Args: + x (paddle.Tensor): The encoded feature tensor + + Returns: initial state + + """ + return None + + def select_state(self, state: Any, i: int, new_id: int=None) -> Any: + """Select state with relative ids in the main beam search. + + Args: + state: Decoder state for prefix tokens + i (int): Index to select a state in the main beam search + new_id (int): New label index to select a state if necessary + + Returns: + state: pruned state + + """ + return None if state is None else state[i] + + def score(self, y: paddle.Tensor, state: Any, + x: paddle.Tensor) -> Tuple[paddle.Tensor, Any]: + """Score new token (required). + + Args: + y (paddle.Tensor): 1D paddle.int64 prefix tokens. + state: Scorer state for prefix tokens + x (paddle.Tensor): The encoder feature that generates ys. + + Returns: + tuple[paddle.Tensor, Any]: Tuple of + scores for next token that has a shape of `(n_vocab)` + and next state for ys + + """ + raise NotImplementedError + + def final_score(self, state: Any) -> float: + """Score eos (optional). 
+ + Args: + state: Scorer state for prefix tokens + + Returns: + float: final score + + """ + return 0.0 + + +class BatchScorerInterface(ScorerInterface): + """Batch scorer interface.""" + + def batch_init_state(self, x: paddle.Tensor) -> Any: + """Get an initial state for decoding (optional). + + Args: + x (paddle.Tensor): The encoded feature tensor + + Returns: initial state + + """ + return self.init_state(x) + + def batch_score(self, + ys: paddle.Tensor, + states: List[Any], + xs: paddle.Tensor) -> Tuple[paddle.Tensor, List[Any]]: + """Score new token batch (required). + + Args: + ys (paddle.Tensor): paddle.int64 prefix tokens (n_batch, ylen). + states (List[Any]): Scorer states for prefix tokens. + xs (paddle.Tensor): + The encoder feature that generates ys (n_batch, xlen, n_feat). + + Returns: + tuple[paddle.Tensor, List[Any]]: Tuple of + batchfied scores for next token with shape of `(n_batch, n_vocab)` + and next state list for ys. + + """ + warnings.warn( + "{} batch score is implemented through for loop not parallelized". + format(self.__class__.__name__)) + scores = list() + outstates = list() + for i, (y, state, x) in enumerate(zip(ys, states, xs)): + score, outstate = self.score(y, state, x) + outstates.append(outstate) + scores.append(score) + scores = paddle.cat(scores, 0).view(ys.shape[0], -1) + return scores, outstates + + +class PartialScorerInterface(ScorerInterface): + """Partial scorer interface for beam search. + + The partial scorer performs scoring when non-partial scorer finished scoring, + and receives pre-pruned next tokens to score because it is too heavy to score + all the tokens. + + Score sub-set of tokens, not all. + + Examples: + * Prefix search for connectionist-temporal-classification models + * :class:`decoders.scorers.ctc.CTCPrefixScorer` + + """ + + def score_partial(self, + y: paddle.Tensor, + next_tokens: paddle.Tensor, + state: Any, + x: paddle.Tensor) -> Tuple[paddle.Tensor, Any]: + """Score new token (required). + + Args: + y (paddle.Tensor): 1D prefix token + next_tokens (paddle.Tensor): paddle.int64 next token to score + state: decoder state for prefix tokens + x (paddle.Tensor): The encoder feature that generates ys + + Returns: + tuple[paddle.Tensor, Any]: + Tuple of a score tensor for y that has a shape `(len(next_tokens),)` + and next state for ys + + """ + raise NotImplementedError + + +class BatchPartialScorerInterface(BatchScorerInterface, PartialScorerInterface): + """Batch partial scorer interface for beam search.""" + + def batch_score_partial( + self, + ys: paddle.Tensor, + next_tokens: paddle.Tensor, + states: List[Any], + xs: paddle.Tensor, ) -> Tuple[paddle.Tensor, Any]: + """Score new token (required). + + Args: + ys (paddle.Tensor): paddle.int64 prefix tokens (n_batch, ylen). + next_tokens (paddle.Tensor): paddle.int64 tokens to score (n_batch, n_token). + states (List[Any]): Scorer states for prefix tokens. + xs (paddle.Tensor): + The encoder feature that generates ys (n_batch, xlen, n_feat). + + Returns: + tuple[paddle.Tensor, Any]: + Tuple of a score tensor for ys that has a shape `(n_batch, n_vocab)` + and next states for ys + """ + raise NotImplementedError diff --git a/ernie-sat/paddlespeech/s2t/decoders/utils.py b/ernie-sat/paddlespeech/s2t/decoders/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..a609f1c6330dda817cd287fdaf6f6da7d94e26f6 --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/decoders/utils.py @@ -0,0 +1,130 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
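
To make the scorer contract above concrete, the following is a hedged sketch of a custom full scorer built on these interfaces. The class name, vocabulary size, and dummy tensors are invented for illustration; a real scorer (a decoder network or a language model) would return token-dependent scores and meaningful states.

```python
import paddle

from paddlespeech.s2t.decoders.scorers.scorer_interface import BatchScorerInterface


class FlatScorer(BatchScorerInterface):
    """Toy scorer that gives every vocabulary entry the same score (illustrative only)."""

    def __init__(self, n_vocab: int):
        self.n_vocab = n_vocab

    def score(self, y, state, x):
        # `y` is the 1D prefix token tensor and `x` the encoder feature;
        # this toy scorer ignores both and keeps no state.
        return paddle.zeros([self.n_vocab], dtype=x.dtype), None


scorer = FlatScorer(n_vocab=10)
x = paddle.randn([20, 256])                     # dummy encoder output (xlen, n_feat)
y = paddle.to_tensor([1, 2, 3], dtype="int64")  # dummy prefix tokens
state = scorer.init_state(x)                    # defaults to None
scores, state = scorer.score(y, state, x)       # shape [10], uniform scores
```

Full scorers like this are queried for the whole vocabulary at each step; `PartialScorerInterface` implementations (such as the CTC prefix scorer) are only asked about the pre-pruned candidate tokens.
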
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from espnet(https://github.com/espnet/espnet) +import numpy as np + +from paddlespeech.s2t.utils.log import Log +logger = Log(__name__).getlog() + +__all__ = ["end_detect", "parse_hypothesis", "add_results_to_json"] + + +def end_detect(ended_hyps, i, M=3, D_end=np.log(1 * np.exp(-10))): + """End detection. + + described in Eq. (50) of S. Watanabe et al + "Hybrid CTC/Attention Architecture for End-to-End Speech Recognition" + + :param ended_hyps: dict + :param i: int + :param M: int + :param D_end: float + :return: bool + """ + if len(ended_hyps) == 0: + return False + count = 0 + best_hyp = sorted(ended_hyps, key=lambda x: x["score"], reverse=True)[0] + for m in range(M): + # get ended_hyps with their length is i - m + hyp_length = i - m + hyps_same_length = [ + x for x in ended_hyps if len(x["yseq"]) == hyp_length + ] + if len(hyps_same_length) > 0: + best_hyp_same_length = sorted( + hyps_same_length, key=lambda x: x["score"], reverse=True)[0] + if best_hyp_same_length["score"] - best_hyp["score"] < D_end: + count += 1 + + if count == M: + return True + else: + return False + + +# * ------------------ recognition related ------------------ * +def parse_hypothesis(hyp, char_list): + """Parse hypothesis. + + Args: + hyp (list[dict[str, Any]]): Recognition hypothesis. + char_list (list[str]): List of characters. + + Returns: + tuple(str, str, str, float) + + """ + # remove sos and get results + tokenid_as_list = list(map(int, hyp["yseq"][1:])) + token_as_list = [char_list[idx] for idx in tokenid_as_list] + score = float(hyp["score"]) + + # convert to string + tokenid = " ".join([str(idx) for idx in tokenid_as_list]) + token = " ".join(token_as_list) + text = "".join(token_as_list).replace("", " ") + + return text, token, tokenid, score + + +def add_results_to_json(js, nbest_hyps, char_list): + """Add N-best results to json. + + Args: + js (dict[str, Any]): Groundtruth utterance dict. + nbest_hyps_sd (list[dict[str, Any]]): + List of hypothesis for multi_speakers: nutts x nspkrs. + char_list (list[str]): List of characters. + + Returns: + dict[str, Any]: N-best results added utterance dict. 
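
A small, self-contained illustration of `end_detect` may help: the hypothesis dicts and scores below are fabricated, and the import path assumes the vendored package layout introduced by this diff.

```python
from paddlespeech.s2t.decoders.utils import end_detect

# Fabricated ended hypotheses: "yseq" holds token ids (with sos/eos), "score" the log-probability.
ended_hyps = [
    {"yseq": [4, 2, 4], "score": -1.0},            # best hypothesis so far (short)
    {"yseq": [4, 2, 3, 4], "score": -52.0},        # length 4, far below the best
    {"yseq": [4, 2, 3, 3, 4], "score": -55.0},     # length 5
    {"yseq": [4, 2, 3, 3, 3, 4], "score": -60.0},  # length 6
]

# At step i=6, every hypothesis that ended in the last M=3 steps scores more
# than |D_end| = 10 below the best one, so decoding can stop early.
print(end_detect(ended_hyps, i=6))   # True

# With only one ended hypothesis and nothing ending at recent lengths,
# the detector keeps decoding.
print(end_detect([{"yseq": [4, 2, 4], "score": -1.0}], i=2))  # False
```
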
+ + """ + # copy old json info + new_js = dict() + new_js["utt2spk"] = js["utt2spk"] + new_js["output"] = [] + + for n, hyp in enumerate(nbest_hyps, 1): + # parse hypothesis + rec_text, rec_token, rec_tokenid, score = parse_hypothesis(hyp, + char_list) + + # copy ground-truth + if len(js["output"]) > 0: + out_dic = dict(js["output"][0].items()) + else: + # for no reference case (e.g., speech translation) + out_dic = {"name": ""} + + # update name + out_dic["name"] += "[%d]" % n + + # add recognition results + out_dic["rec_text"] = rec_text + out_dic["rec_token"] = rec_token + out_dic["rec_tokenid"] = rec_tokenid + out_dic["score"] = score + + # add to list of N-best result dicts + new_js["output"].append(out_dic) + + # show 1-best result + if n == 1: + if "text" in out_dic.keys(): + logger.info("groundtruth: %s" % out_dic["text"]) + logger.info("prediction : %s" % out_dic["rec_text"]) + + return new_js diff --git a/ernie-sat/paddlespeech/s2t/exps/__init__.py b/ernie-sat/paddlespeech/s2t/exps/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b4d03066fb14bc00a02e6c1fa89d44165ba32145 --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/exps/__init__.py @@ -0,0 +1,62 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from paddlespeech.s2t.training.trainer import Trainer +from paddlespeech.s2t.utils.dynamic_import import dynamic_import + +model_trainer_alias = { + "ds2": "paddlespeech.s2t.exp.deepspeech2.model:DeepSpeech2Trainer", + "u2": "paddlespeech.s2t.exps.u2.model:U2Trainer", + "u2_kaldi": "paddlespeech.s2t.exps.u2_kaldi.model:U2Trainer", + "u2_st": "paddlespeech.s2t.exps.u2_st.model:U2STTrainer", +} + + +def dynamic_import_trainer(module): + """Import Trainer dynamically. + + Args: + module (str): trainer name. e.g., ds2, u2, u2_kaldi + + Returns: + type: Trainer class + + """ + model_class = dynamic_import(module, model_trainer_alias) + assert issubclass(model_class, + Trainer), f"{module} does not implement Trainer" + return model_class + + +model_tester_alias = { + "ds2": "paddlespeech.s2t.exp.deepspeech2.model:DeepSpeech2Tester", + "u2": "paddlespeech.s2t.exps.u2.model:U2Tester", + "u2_kaldi": "paddlespeech.s2t.exps.u2_kaldi.model:U2Tester", + "u2_st": "paddlespeech.s2t.exps.u2_st.model:U2STTester", +} + + +def dynamic_import_tester(module): + """Import Tester dynamically. + + Args: + module (str): tester name. 
e.g., ds2, u2, u2_kaldi + + Returns: + type: Tester class + + """ + model_class = dynamic_import(module, model_tester_alias) + assert issubclass(model_class, + Trainer), f"{module} does not implement Tester" + return model_class diff --git a/ernie-sat/paddlespeech/s2t/exps/deepspeech2/__init__.py b/ernie-sat/paddlespeech/s2t/exps/deepspeech2/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..185a92b8d94d3426d616c0624f0f2ee04339349e --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/exps/deepspeech2/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/ernie-sat/paddlespeech/s2t/exps/deepspeech2/bin/__init__.py b/ernie-sat/paddlespeech/s2t/exps/deepspeech2/bin/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..185a92b8d94d3426d616c0624f0f2ee04339349e --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/exps/deepspeech2/bin/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/ernie-sat/paddlespeech/s2t/exps/deepspeech2/bin/deploy/__init__.py b/ernie-sat/paddlespeech/s2t/exps/deepspeech2/bin/deploy/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..185a92b8d94d3426d616c0624f0f2ee04339349e --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/exps/deepspeech2/bin/deploy/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/ernie-sat/paddlespeech/s2t/exps/deepspeech2/bin/deploy/client.py b/ernie-sat/paddlespeech/s2t/exps/deepspeech2/bin/deploy/client.py new file mode 100644 index 0000000000000000000000000000000000000000..f7ed8429b20b8dd717bf1c741a709168d0d5c2c7 --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/exps/deepspeech2/bin/deploy/client.py @@ -0,0 +1,95 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
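
A hedged usage sketch of the trainer/tester registry above: it assumes the referenced experiment modules (e.g. `paddlespeech.s2t.exps.u2.model`) are present in the vendored package, which this diff does not show.

```python
from paddlespeech.s2t.exps import dynamic_import_tester, dynamic_import_trainer

# Resolve the "u2" aliases to their Trainer/Tester classes; the asserts inside
# the helpers guarantee the resolved classes subclass Trainer.
trainer_cls = dynamic_import_trainer("u2")  # -> paddlespeech.s2t.exps.u2.model:U2Trainer
tester_cls = dynamic_import_tester("u2")    # -> paddlespeech.s2t.exps.u2.model:U2Tester

# A concrete experiment would then be driven roughly as:
#   exp = trainer_cls(config, args)
#   exp.setup()
#   exp.run()
# where `config` / `args` come from the usual CfgNode + argparse bootstrap
# shown in the bin scripts below.
```
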
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Client-end for the ASR demo.""" +import argparse +import sys + +import keyboard +import pyaudio + +from paddlespeech.s2t.utils.socket_server import socket_send + +parser = argparse.ArgumentParser(description=__doc__) +parser.add_argument( + "--host_ip", + default="localhost", + type=str, + help="Server IP address. (default: %(default)s)") +parser.add_argument( + "--host_port", + default=8086, + type=int, + help="Server Port. (default: %(default)s)") +args = parser.parse_args() + +is_recording = False +enable_trigger_record = True + + +def on_press_release(x): + """Keyboard callback function.""" + global is_recording, enable_trigger_record + press = keyboard.KeyboardEvent('down', 28, 'space') + release = keyboard.KeyboardEvent('up', 28, 'space') + if x.event_type == 'down' and x.name == press.name: + if (not is_recording) and enable_trigger_record: + sys.stdout.write("Start Recording ... ") + sys.stdout.flush() + is_recording = True + if x.event_type == 'up' and x.name == release.name: + if is_recording: + is_recording = False + + +data_list = [] + + +def callback(in_data, frame_count, time_info, status): + """Audio recorder's stream callback function.""" + global data_list, is_recording, enable_trigger_record + if is_recording: + data_list.append(in_data) + enable_trigger_record = False + elif len(data_list) > 0: + socket_send(args.host_ip, args.host_port, ''.join(data_list)) + data_list = [] + enable_trigger_record = True + return (in_data, pyaudio.paContinue) + + +def main(): + # prepare audio recorder + p = pyaudio.PyAudio() + stream = p.open( + format=pyaudio.paInt16, + channels=1, + rate=16000, + input=True, + stream_callback=callback) + stream.start_stream() + + # prepare keyboard listener + while (1): + keyboard.hook(on_press_release) + if keyboard.record('esc'): + break + + # close up + stream.stop_stream() + stream.close() + p.terminate() + + +if __name__ == "__main__": + main() diff --git a/ernie-sat/paddlespeech/s2t/exps/deepspeech2/bin/deploy/record.py b/ernie-sat/paddlespeech/s2t/exps/deepspeech2/bin/deploy/record.py new file mode 100644 index 0000000000000000000000000000000000000000..94ad0f1cb36dc3360bebda27c8e5de8d16f9862f --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/exps/deepspeech2/bin/deploy/record.py @@ -0,0 +1,55 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Record wav from Microphone""" +# http://people.csail.mit.edu/hubert/pyaudio/ +import wave + +import pyaudio + +CHUNK = 1024 +FORMAT = pyaudio.paInt16 +CHANNELS = 1 +RATE = 16000 +RECORD_SECONDS = 5 +WAVE_OUTPUT_FILENAME = "output.wav" + +p = pyaudio.PyAudio() + +stream = p.open( + format=FORMAT, + channels=CHANNELS, + rate=RATE, + input=True, + frames_per_buffer=CHUNK) + +print("* recording") + +frames = [] + +for i in range(0, int(RATE / CHUNK * RECORD_SECONDS)): + data = stream.read(CHUNK) + frames.append(data) + +print("* done recording") + +stream.stop_stream() +stream.close() +p.terminate() + +wf = wave.open(WAVE_OUTPUT_FILENAME, 'wb') +wf.setnchannels(CHANNELS) +wf.setsampwidth(p.get_sample_size(FORMAT)) +wf.setframerate(RATE) +wf.writeframes(b''.join(frames)) +wf.close() diff --git a/ernie-sat/paddlespeech/s2t/exps/deepspeech2/bin/deploy/runtime.py b/ernie-sat/paddlespeech/s2t/exps/deepspeech2/bin/deploy/runtime.py new file mode 100644 index 0000000000000000000000000000000000000000..5755a5f101c18927aff975262d627f81a74fb783 --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/exps/deepspeech2/bin/deploy/runtime.py @@ -0,0 +1,198 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Server-end for the ASR demo.""" +import functools + +import numpy as np +import paddle +from paddle.inference import Config +from paddle.inference import create_predictor +from paddle.io import DataLoader +from yacs.config import CfgNode + +from paddlespeech.s2t.io.collator import SpeechCollator +from paddlespeech.s2t.io.dataset import ManifestDataset +from paddlespeech.s2t.models.ds2 import DeepSpeech2Model +from paddlespeech.s2t.training.cli import default_argument_parser +from paddlespeech.s2t.utils.socket_server import AsrRequestHandler +from paddlespeech.s2t.utils.socket_server import AsrTCPServer +from paddlespeech.s2t.utils.socket_server import warm_up_test +from paddlespeech.s2t.utils.utility import add_arguments +from paddlespeech.s2t.utils.utility import print_arguments + + +def init_predictor(args): + if args.model_dir is not None: + config = Config(args.model_dir) + else: + config = Config(args.model_file, args.params_file) + + config.enable_memory_optim() + if args.use_gpu: + config.enable_use_gpu(memory_pool_init_size_mb=1000, device_id=0) + else: + # If not specific mkldnn, you can set the blas thread. + # The thread num should not be greater than the number of cores in the CPU. 
+ config.set_cpu_math_library_num_threads(4) + config.enable_mkldnn() + + predictor = create_predictor(config) + return predictor + + +def run(predictor, img): + # copy img data to input tensor + input_names = predictor.get_input_names() + for i, name in enumerate(input_names): + input_tensor = predictor.get_input_handle(name) + #input_tensor.reshape(img[i].shape) + #input_tensor.copy_from_cpu(img[i].copy()) + + # do the inference + predictor.run() + + results = [] + # get out data from output tensor + output_names = predictor.get_output_names() + for i, name in enumerate(output_names): + output_tensor = predictor.get_output_handle(name) + output_data = output_tensor.copy_to_cpu() + results.append(output_data) + + return results + + +def inference(config, args): + predictor = init_predictor(args) + + +def start_server(config, args): + """Start the ASR server""" + config.defrost() + config.manifest = config.test_manifest + dataset = ManifestDataset.from_config(config) + + config.augmentation_config = "" + config.keep_transcription_text = True + config.batch_size = 1 + config.num_workers = 0 + collate_fn = SpeechCollator.from_config(config) + test_loader = DataLoader(dataset, collate_fn=collate_fn, num_workers=0) + + model = DeepSpeech2Model.from_pretrained(test_loader, config, + args.checkpoint_path) + model.eval() + + # prepare ASR inference handler + def file_to_transcript(filename): + feature = test_loader.collate_fn.process_utterance(filename, "") + audio = np.array([feature[0]]).astype('float32') #[1, T, D] + audio_len = feature[0].shape[0] + audio_len = np.array([audio_len]).astype('int64') # [1] + + result_transcript = model.decode( + paddle.to_tensor(audio), + paddle.to_tensor(audio_len), + vocab_list=test_loader.collate_fn.vocab_list, + decoding_method=config.decode.decoding_method, + lang_model_path=config.decode.lang_model_path, + beam_alpha=config.decode.alpha, + beam_beta=config.decode.beta, + beam_size=config.decode.beam_size, + cutoff_prob=config.decode.cutoff_prob, + cutoff_top_n=config.decode.cutoff_top_n, + num_processes=config.decode.num_proc_bsearch) + return result_transcript[0] + + # warming up with utterrances sampled from Librispeech + print('-----------------------------------------------------------') + print('Warming up ...') + warm_up_test( + audio_process_handler=file_to_transcript, + manifest_path=args.warmup_manifest, + num_test_cases=3) + print('-----------------------------------------------------------') + + # start the server + server = AsrTCPServer( + server_address=(args.host_ip, args.host_port), + RequestHandlerClass=AsrRequestHandler, + speech_save_dir=args.speech_save_dir, + audio_process_handler=file_to_transcript) + print("ASR Server Started.") + server.serve_forever() + + +def main(config, args): + start_server(config, args) + + +if __name__ == "__main__": + parser = default_argument_parser() + add_arg = functools.partial(add_arguments, argparser=parser) + # yapf: disable + add_arg('host_ip', str, + 'localhost', + "Server's IP address.") + add_arg('host_port', int, 8089, "Server's IP port.") + add_arg('speech_save_dir', str, + 'demo_cache', + "Directory to save demo audios.") + add_arg('warmup_manifest', str, None, "Filepath of manifest to warm up.") + add_arg( + "--model_file", + type=str, + default="", + help="Model filename, Specify this when your model is a combined model." + ) + add_arg( + "--params_file", + type=str, + default="", + help="Parameter filename, Specify this when your model is a combined model." 
+ ) + add_arg( + "--model_dir", + type=str, + default=None, + help="Model dir, If you load a non-combined model, specify the directory of the model." + ) + add_arg("--use_gpu", + type=bool, + default=False, + help="Whether use gpu.") + args = parser.parse_args() + print_arguments(args, globals()) + + # https://yaml.org/type/float.html + config = CfgNode(new_allowed=True) + if args.config: + config.merge_from_file(args.config) + if args.decode_cfg: + decode_confs = CfgNode(new_allowed=True) + decode_confs.merge_from_file(args.decode_cfg) + config.decode = decode_confs + if args.opts: + config.merge_from_list(args.opts) + config.freeze() + print(config) + + args.warmup_manifest = config.test_manifest + print_arguments(args, globals()) + + if args.dump_config: + with open(args.dump_config, 'w') as f: + print(config, file=f) + + main(config, args) diff --git a/ernie-sat/paddlespeech/s2t/exps/deepspeech2/bin/deploy/send.py b/ernie-sat/paddlespeech/s2t/exps/deepspeech2/bin/deploy/send.py new file mode 100644 index 0000000000000000000000000000000000000000..596e701027c03276f153babdf710b361eb34b882 --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/exps/deepspeech2/bin/deploy/send.py @@ -0,0 +1,50 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Socket client to send wav to ASR server.""" +import argparse +import wave + +from paddlespeech.s2t.utils.socket_server import socket_send + +parser = argparse.ArgumentParser(description=__doc__) +parser.add_argument( + "--host_ip", + default="localhost", + type=str, + help="Server IP address. (default: %(default)s)") +parser.add_argument( + "--host_port", + default=8086, + type=int, + help="Server Port. (default: %(default)s)") +args = parser.parse_args() + +WAVE_OUTPUT_FILENAME = "output.wav" + + +def main(): + wf = wave.open(WAVE_OUTPUT_FILENAME, 'rb') + nframe = wf.getnframes() + data = wf.readframes(nframe) + print(f"Wave: {WAVE_OUTPUT_FILENAME}") + print(f"Wave samples: {nframe}") + print(f"Wave channels: {wf.getnchannels()}") + print(f"Wave sample rate: {wf.getframerate()}") + print(f"Wave sample width: {wf.getsampwidth()}") + assert isinstance(data, bytes) + socket_send(args.host_ip, args.host_port, data) + + +if __name__ == "__main__": + main() diff --git a/ernie-sat/paddlespeech/s2t/exps/deepspeech2/bin/deploy/server.py b/ernie-sat/paddlespeech/s2t/exps/deepspeech2/bin/deploy/server.py new file mode 100644 index 0000000000000000000000000000000000000000..0d0b4f2197c05383285bbf590f53f334d134c969 --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/exps/deepspeech2/bin/deploy/server.py @@ -0,0 +1,133 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
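
Since `run()` in `runtime.py` above leaves the input-feeding calls commented out, here is a hedged sketch of the complete feed/fetch cycle against the standard Paddle Inference API. The exported model paths and the dummy feature shapes are placeholders, not artifacts of this PR.

```python
import numpy as np
from paddle.inference import Config, create_predictor


def run_predictor(predictor, inputs):
    """Feed numpy arrays in input-name order, run, and fetch all outputs."""
    for name, array in zip(predictor.get_input_names(), inputs):
        handle = predictor.get_input_handle(name)
        handle.reshape(list(array.shape))   # the step run() above leaves commented out
        handle.copy_from_cpu(array)
    predictor.run()
    return [
        predictor.get_output_handle(name).copy_to_cpu()
        for name in predictor.get_output_names()
    ]


# Placeholder paths to an exported static-graph ASR model (see export.py below).
config = Config("exp/export/asr.pdmodel", "exp/export/asr.pdiparams")
config.enable_memory_optim()
config.set_cpu_math_library_num_threads(4)
predictor = create_predictor(config)

audio = np.random.randn(1, 200, 161).astype("float32")  # dummy (B, T, feat) features
audio_len = np.array([200], dtype="int64")
outputs = run_predictor(predictor, [audio, audio_len])
```
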
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Server-end for the ASR demo.""" +import functools + +import numpy as np +import paddle +from paddle.io import DataLoader +from yacs.config import CfgNode + +from paddlespeech.s2t.io.collator import SpeechCollator +from paddlespeech.s2t.io.dataset import ManifestDataset +from paddlespeech.s2t.models.ds2 import DeepSpeech2Model +from paddlespeech.s2t.training.cli import default_argument_parser +from paddlespeech.s2t.utils.socket_server import AsrRequestHandler +from paddlespeech.s2t.utils.socket_server import AsrTCPServer +from paddlespeech.s2t.utils.socket_server import warm_up_test +from paddlespeech.s2t.utils.utility import add_arguments +from paddlespeech.s2t.utils.utility import print_arguments + + +def start_server(config, args): + """Start the ASR server""" + config.defrost() + config.manifest = config.test_manifest + dataset = ManifestDataset.from_config(config) + + config.augmentation_config = "" + config.keep_transcription_text = True + config.batch_size = 1 + config.num_workers = 0 + collate_fn = SpeechCollator.from_config(config) + test_loader = DataLoader(dataset, collate_fn=collate_fn, num_workers=0) + + model = DeepSpeech2Model.from_pretrained(test_loader, config, + args.checkpoint_path) + model.eval() + + # prepare ASR inference handler + def file_to_transcript(filename): + feature = test_loader.collate_fn.process_utterance(filename, "") + audio = np.array([feature[0]]).astype('float32') #[1, T, D] + # audio = audio.swapaxes(1,2) + print('---file_to_transcript feature----') + print(audio.shape) + audio_len = feature[0].shape[0] + print(audio_len) + audio_len = np.array([audio_len]).astype('int64') # [1] + + result_transcript = model.decode( + paddle.to_tensor(audio), + paddle.to_tensor(audio_len), + vocab_list=test_loader.collate_fn.vocab_list, + decoding_method=config.decode.decoding_method, + lang_model_path=config.decode.lang_model_path, + beam_alpha=config.decode.alpha, + beam_beta=config.decode.beta, + beam_size=config.decode.beam_size, + cutoff_prob=config.decode.cutoff_prob, + cutoff_top_n=config.decode.cutoff_top_n, + num_processes=config.decode.num_proc_bsearch) + return result_transcript[0] + + # warming up with utterrances sampled from Librispeech + print('-----------------------------------------------------------') + print('Warming up ...') + warm_up_test( + audio_process_handler=file_to_transcript, + manifest_path=args.warmup_manifest, + num_test_cases=3) + print('-----------------------------------------------------------') + + # start the server + server = AsrTCPServer( + server_address=(args.host_ip, args.host_port), + RequestHandlerClass=AsrRequestHandler, + speech_save_dir=args.speech_save_dir, + audio_process_handler=file_to_transcript) + print("ASR Server Started.") + server.serve_forever() + + +def main(config, args): + start_server(config, args) + + +if __name__ == "__main__": + parser = default_argument_parser() + add_arg = functools.partial(add_arguments, argparser=parser) + # yapf: disable + add_arg('host_ip', str, + 'localhost', + "Server's IP address.") + add_arg('host_port', int, 8088, "Server's IP port.") + add_arg('speech_save_dir', 
str, + 'demo_cache', + "Directory to save demo audios.") + add_arg('warmup_manifest', str, None, "Filepath of manifest to warm up.") + args = parser.parse_args() + print_arguments(args, globals()) + + # https://yaml.org/type/float.html + config = CfgNode(new_allowed=True) + if args.config: + config.merge_from_file(args.config) + if args.decode_cfg: + decode_confs = CfgNode(new_allowed=True) + decode_confs.merge_from_file(args.decode_cfg) + config.decode = decode_confs + if args.opts: + config.merge_from_list(args.opts) + config.freeze() + print(config) + + args.warmup_manifest = config.test_manifest + print_arguments(args, globals()) + + if args.dump_config: + with open(args.dump_config, 'w') as f: + print(config, file=f) + + main(config, args) diff --git a/ernie-sat/paddlespeech/s2t/exps/deepspeech2/bin/export.py b/ernie-sat/paddlespeech/s2t/exps/deepspeech2/bin/export.py new file mode 100644 index 0000000000000000000000000000000000000000..ee013d79e6ed3d39516ee65d5c4df5ec30a24b42 --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/exps/deepspeech2/bin/export.py @@ -0,0 +1,56 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Export for DeepSpeech2 model.""" +from yacs.config import CfgNode + +from paddlespeech.s2t.exps.deepspeech2.model import DeepSpeech2Tester as Tester +from paddlespeech.s2t.training.cli import default_argument_parser +from paddlespeech.s2t.utils.utility import print_arguments + + +def main_sp(config, args): + exp = Tester(config, args) + with exp.eval(): + exp.setup() + exp.run_export() + + +def main(config, args): + main_sp(config, args) + + +if __name__ == "__main__": + parser = default_argument_parser() + # save jit model to + parser.add_argument( + "--export_path", type=str, help="path of the jit model to save") + parser.add_argument( + "--model_type", type=str, default='offline', help="offline/online") + args = parser.parse_args() + print("model_type:{}".format(args.model_type)) + print_arguments(args) + + # https://yaml.org/type/float.html + config = CfgNode(new_allowed=True) + if args.config: + config.merge_from_file(args.config) + if args.opts: + config.merge_from_list(args.opts) + config.freeze() + print(config) + if args.dump_config: + with open(args.dump_config, 'w') as f: + print(config, file=f) + + main(config, args) diff --git a/ernie-sat/paddlespeech/s2t/exps/deepspeech2/bin/test.py b/ernie-sat/paddlespeech/s2t/exps/deepspeech2/bin/test.py new file mode 100644 index 0000000000000000000000000000000000000000..388b380d1c78aeb45970486091285f4c1248eb55 --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/exps/deepspeech2/bin/test.py @@ -0,0 +1,60 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Evaluation for DeepSpeech2 model.""" +from yacs.config import CfgNode + +from paddlespeech.s2t.exps.deepspeech2.model import DeepSpeech2Tester as Tester +from paddlespeech.s2t.training.cli import default_argument_parser +from paddlespeech.s2t.utils.utility import print_arguments + + +def main_sp(config, args): + exp = Tester(config, args) + with exp.eval(): + exp.setup() + exp.run_test() + + +def main(config, args): + main_sp(config, args) + + +if __name__ == "__main__": + parser = default_argument_parser() + parser.add_argument( + "--model_type", type=str, default='offline', help='offline/online') + # save asr result to + parser.add_argument( + "--result_file", type=str, help="path of save the asr result") + args = parser.parse_args() + print_arguments(args, globals()) + print("model_type:{}".format(args.model_type)) + + # https://yaml.org/type/float.html + config = CfgNode(new_allowed=True) + if args.config: + config.merge_from_file(args.config) + if args.decode_cfg: + decode_confs = CfgNode(new_allowed=True) + decode_confs.merge_from_file(args.decode_cfg) + config.decode = decode_confs + if args.opts: + config.merge_from_list(args.opts) + config.freeze() + print(config) + if args.dump_config: + with open(args.dump_config, 'w') as f: + print(config, file=f) + + main(config, args) diff --git a/ernie-sat/paddlespeech/s2t/exps/deepspeech2/bin/test_export.py b/ernie-sat/paddlespeech/s2t/exps/deepspeech2/bin/test_export.py new file mode 100644 index 0000000000000000000000000000000000000000..707eb9e1bc26204fe5b6a9070e02f7ad95d5f334 --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/exps/deepspeech2/bin/test_export.py @@ -0,0 +1,65 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
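Each of these `bin/*.py` entry points assembles its runtime configuration the same way: a base YAML is merged into a yacs `CfgNode`, an optional decode YAML is attached as `config.decode`, `--opts` pairs from the command line override both, and the node is then frozen. A minimal, self-contained sketch of that pattern (the YAML paths and the override key are hypothetical, not files shipped in this diff):

```python
from yacs.config import CfgNode

# Base training/model configuration (hypothetical path).
config = CfgNode(new_allowed=True)
config.merge_from_file("conf/deepspeech2.yaml")

# Decoding options live in their own file and are attached as config.decode.
decode_confs = CfgNode(new_allowed=True)
decode_confs.merge_from_file("conf/tuning/decode.yaml")
config.decode = decode_confs

# --opts style overrides from the command line, then make the node read-only.
config.merge_from_list(["decode.beam_size", "10"])
config.freeze()
print(config)
```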
+"""Evaluation for DeepSpeech2 model.""" +from yacs.config import CfgNode + +from paddlespeech.s2t.exps.deepspeech2.model import DeepSpeech2ExportTester as ExportTester +from paddlespeech.s2t.training.cli import default_argument_parser +from paddlespeech.s2t.utils.utility import print_arguments + + +def main_sp(config, args): + exp = ExportTester(config, args) + with exp.eval(): + exp.setup() + exp.run_test() + + +def main(config, args): + main_sp(config, args) + + +if __name__ == "__main__": + parser = default_argument_parser() + # save asr result to + parser.add_argument( + "--result_file", type=str, help="path of save the asr result") + #load jit model from + parser.add_argument( + "--export_path", type=str, help="path of the jit model to save") + parser.add_argument( + "--model_type", type=str, default='offline', help='offline/online') + parser.add_argument( + "--enable-auto-log", action="store_true", help="use auto log") + args = parser.parse_args() + print_arguments(args, globals()) + print("model_type:{}".format(args.model_type)) + + # https://yaml.org/type/float.html + config = CfgNode(new_allowed=True) + if args.config: + config.merge_from_file(args.config) + if args.decode_cfg: + decode_confs = CfgNode(new_allowed=True) + decode_confs.merge_from_file(args.decode_cfg) + config.decode = decode_confs + if args.opts: + config.merge_from_list(args.opts) + config.freeze() + print(config) + if args.dump_config: + with open(args.dump_config, 'w') as f: + print(config, file=f) + + main(config, args) diff --git a/ernie-sat/paddlespeech/s2t/exps/deepspeech2/bin/test_wav.py b/ernie-sat/paddlespeech/s2t/exps/deepspeech2/bin/test_wav.py new file mode 100644 index 0000000000000000000000000000000000000000..a909dd416a03766dba505129a20edaeb9bee0cd8 --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/exps/deepspeech2/bin/test_wav.py @@ -0,0 +1,205 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Evaluation for DeepSpeech2 model.""" +import os +import sys +from pathlib import Path + +import paddle +import soundfile +from yacs.config import CfgNode + +from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer +from paddlespeech.s2t.io.collator import SpeechCollator +from paddlespeech.s2t.models.ds2 import DeepSpeech2Model +from paddlespeech.s2t.models.ds2_online import DeepSpeech2ModelOnline +from paddlespeech.s2t.training.cli import default_argument_parser +from paddlespeech.s2t.utils import mp_tools +from paddlespeech.s2t.utils.checkpoint import Checkpoint +from paddlespeech.s2t.utils.log import Log +from paddlespeech.s2t.utils.utility import print_arguments +from paddlespeech.s2t.utils.utility import UpdateConfig + +logger = Log(__name__).getlog() + + +class DeepSpeech2Tester_hub(): + def __init__(self, config, args): + self.args = args + self.config = config + self.audio_file = args.audio_file + self.collate_fn_test = SpeechCollator.from_config(config) + self._text_featurizer = TextFeaturizer( + unit_type=config.unit_type, vocab=None) + + def compute_result_transcripts(self, audio, audio_len, vocab_list, cfg): + result_transcripts = self.model.decode( + audio, + audio_len, + vocab_list, + decoding_method=cfg.decoding_method, + lang_model_path=cfg.lang_model_path, + beam_alpha=cfg.alpha, + beam_beta=cfg.beta, + beam_size=cfg.beam_size, + cutoff_prob=cfg.cutoff_prob, + cutoff_top_n=cfg.cutoff_top_n, + num_processes=cfg.num_proc_bsearch) + + return result_transcripts + + @mp_tools.rank_zero_only + @paddle.no_grad() + def test(self): + self.model.eval() + cfg = self.config + audio_file = self.audio_file + collate_fn_test = self.collate_fn_test + audio, _ = collate_fn_test.process_utterance( + audio_file=audio_file, transcript=" ") + audio_len = audio.shape[0] + audio = paddle.to_tensor(audio, dtype='float32') + audio_len = paddle.to_tensor(audio_len) + audio = paddle.unsqueeze(audio, axis=0) + vocab_list = collate_fn_test.vocab_list + result_transcripts = self.compute_result_transcripts( + audio, audio_len, vocab_list, cfg.decode) + logger.info("result_transcripts: " + result_transcripts[0]) + + def run_test(self): + self.resume() + try: + self.test() + except KeyboardInterrupt: + exit(-1) + + def setup(self): + """Setup the experiment. + """ + paddle.set_device('gpu' if self.args.ngpu > 0 else 'cpu') + + self.setup_output_dir() + self.setup_checkpointer() + + self.setup_model() + + def setup_output_dir(self): + """Create a directory used for output. + """ + # output dir + if self.args.output: + output_dir = Path(self.args.output).expanduser() + output_dir.mkdir(parents=True, exist_ok=True) + else: + output_dir = Path( + self.args.checkpoint_path).expanduser().parent.parent + output_dir.mkdir(parents=True, exist_ok=True) + self.output_dir = output_dir + + def setup_model(self): + config = self.config.clone() + with UpdateConfig(config): + config.input_dim = self.collate_fn_test.feature_size + config.output_dim = self.collate_fn_test.vocab_size + + if self.args.model_type == 'offline': + model = DeepSpeech2Model.from_config(config) + elif self.args.model_type == 'online': + model = DeepSpeech2ModelOnline.from_config(config) + else: + raise Exception("wrong model type") + + self.model = model + + def setup_checkpointer(self): + """Create a directory used to save checkpoints into. + + It is "checkpoints" inside the output directory. 
+ """ + # checkpoint dir + checkpoint_dir = self.output_dir / "checkpoints" + checkpoint_dir.mkdir(exist_ok=True) + + self.checkpoint_dir = checkpoint_dir + + self.checkpoint = Checkpoint( + kbest_n=self.config.checkpoint.kbest_n, + latest_n=self.config.checkpoint.latest_n) + + def resume(self): + """Resume from the checkpoint at checkpoints in the output + directory or load a specified checkpoint. + """ + params_path = self.args.checkpoint_path + ".pdparams" + model_dict = paddle.load(params_path) + self.model.set_state_dict(model_dict) + + +def check(audio_file): + logger.info("checking the audio file format......") + try: + sig, sample_rate = soundfile.read(audio_file) + except Exception as e: + logger.error(str(e)) + logger.error( + "can not open the wav file, please check the audio file format") + sys.exit(-1) + logger.info("The sample rate is %d" % sample_rate) + assert (sample_rate == 16000) + logger.info("The audio file format is right") + + +def main_sp(config, args): + exp = DeepSpeech2Tester_hub(config, args) + exp.setup() + exp.run_test() + + +def main(config, args): + main_sp(config, args) + + +if __name__ == "__main__": + parser = default_argument_parser() + parser.add_argument( + "--model_type", type=str, default='offline', help='offline/online') + parser.add_argument("--audio_file", type=str, help='audio file path') + # save asr result to + parser.add_argument( + "--result_file", type=str, help="path of save the asr result") + args = parser.parse_args() + print_arguments(args, globals()) + if not os.path.isfile(args.audio_file): + print("Please input the audio file path") + sys.exit(-1) + check(args.audio_file) + print("model_type:{}".format(args.model_type)) + + # https://yaml.org/type/float.html + config = CfgNode(new_allowed=True) + if args.config: + config.merge_from_file(args.config) + if args.decode_cfg: + decode_confs = CfgNode(new_allowed=True) + decode_confs.merge_from_file(args.decode_cfg) + config.decode = decode_confs + if args.opts: + config.merge_from_list(args.opts) + config.freeze() + print(config) + if args.dump_config: + with open(args.dump_config, 'w') as f: + print(config, file=f) + + main(config, args) diff --git a/ernie-sat/paddlespeech/s2t/exps/deepspeech2/bin/train.py b/ernie-sat/paddlespeech/s2t/exps/deepspeech2/bin/train.py new file mode 100644 index 0000000000000000000000000000000000000000..09e8662f1ce75ff351d7981dc1c4382082f9b61a --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/exps/deepspeech2/bin/train.py @@ -0,0 +1,56 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Trainer for DeepSpeech2 model.""" +from paddle import distributed as dist +from yacs.config import CfgNode + +from paddlespeech.s2t.exps.deepspeech2.model import DeepSpeech2Trainer as Trainer +from paddlespeech.s2t.training.cli import default_argument_parser +from paddlespeech.s2t.utils.utility import print_arguments + + +def main_sp(config, args): + exp = Trainer(config, args) + exp.setup() + exp.run() + + +def main(config, args): + if args.ngpu > 1: + dist.spawn(main_sp, args=(config, args), nprocs=args.ngpu) + else: + main_sp(config, args) + + +if __name__ == "__main__": + parser = default_argument_parser() + parser.add_argument( + "--model_type", type=str, default='offline', help='offline/online') + args = parser.parse_args() + print("model_type:{}".format(args.model_type)) + print_arguments(args, globals()) + + # https://yaml.org/type/float.html + config = CfgNode(new_allowed=True) + if args.config: + config.merge_from_file(args.config) + if args.opts: + config.merge_from_list(args.opts) + config.freeze() + print(config) + if args.dump_config: + with open(args.dump_config, 'w') as f: + print(config, file=f) + + main(config, args) diff --git a/ernie-sat/paddlespeech/s2t/exps/deepspeech2/model.py b/ernie-sat/paddlespeech/s2t/exps/deepspeech2/model.py new file mode 100644 index 0000000000000000000000000000000000000000..3e9ede769c3f84156035d7bb1708cd94c8a3bb66 --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/exps/deepspeech2/model.py @@ -0,0 +1,649 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Contains DeepSpeech2 and DeepSpeech2Online model.""" +import os +import time +from collections import defaultdict +from contextlib import nullcontext + +import jsonlines +import numpy as np +import paddle +from paddle import distributed as dist +from paddle import inference +from paddle.io import DataLoader + +from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer +from paddlespeech.s2t.io.collator import SpeechCollator +from paddlespeech.s2t.io.dataset import ManifestDataset +from paddlespeech.s2t.io.sampler import SortagradBatchSampler +from paddlespeech.s2t.io.sampler import SortagradDistributedBatchSampler +from paddlespeech.s2t.models.ds2 import DeepSpeech2InferModel +from paddlespeech.s2t.models.ds2 import DeepSpeech2Model +from paddlespeech.s2t.models.ds2_online import DeepSpeech2InferModelOnline +from paddlespeech.s2t.models.ds2_online import DeepSpeech2ModelOnline +from paddlespeech.s2t.training.gradclip import ClipGradByGlobalNormWithLog +from paddlespeech.s2t.training.reporter import report +from paddlespeech.s2t.training.timer import Timer +from paddlespeech.s2t.training.trainer import Trainer +from paddlespeech.s2t.utils import error_rate +from paddlespeech.s2t.utils import layer_tools +from paddlespeech.s2t.utils import mp_tools +from paddlespeech.s2t.utils.log import Log +from paddlespeech.s2t.utils.utility import UpdateConfig + +logger = Log(__name__).getlog() + + +class DeepSpeech2Trainer(Trainer): + def __init__(self, config, args): + super().__init__(config, args) + + def train_batch(self, batch_index, batch_data, msg): + batch_size = self.config.batch_size + accum_grad = self.config.accum_grad + + start = time.time() + + # forward + utt, audio, audio_len, text, text_len = batch_data + loss = self.model(audio, audio_len, text, text_len) + losses_np = { + 'train_loss': float(loss), + } + + # loss backward + if (batch_index + 1) % accum_grad != 0: + # Disable gradient synchronizations across DDP processes. + # Within this context, gradients will be accumulated on module + # variables, which will later be synchronized. + context = self.model.no_sync if (hasattr(self.model, "no_sync") and + self.parallel) else nullcontext + else: + # Used for single gpu training and DDP gradient synchronization + # processes. + context = nullcontext + + with context(): + loss.backward() + layer_tools.print_grads(self.model, print_func=None) + + # optimizer step + if (batch_index + 1) % accum_grad == 0: + self.optimizer.step() + self.optimizer.clear_grad() + self.iteration += 1 + + iteration_time = time.time() - start + + for k, v in losses_np.items(): + report(k, v) + report("batch_size", batch_size) + report("accum", accum_grad) + report("step_cost", iteration_time) + + if dist.get_rank() == 0 and self.visualizer: + for k, v in losses_np.items(): + # `step -1` since we update `step` after optimizer.step(). 
+ self.visualizer.add_scalar("train/{}".format(k), v, + self.iteration - 1) + + @paddle.no_grad() + def valid(self): + logger.info(f"Valid Total Examples: {len(self.valid_loader.dataset)}") + self.model.eval() + valid_losses = defaultdict(list) + num_seen_utts = 1 + total_loss = 0.0 + for i, batch in enumerate(self.valid_loader): + utt, audio, audio_len, text, text_len = batch + loss = self.model(audio, audio_len, text, text_len) + if paddle.isfinite(loss): + num_utts = batch[1].shape[0] + num_seen_utts += num_utts + total_loss += float(loss) * num_utts + valid_losses['val_loss'].append(float(loss)) + + if (i + 1) % self.config.log_interval == 0: + valid_dump = {k: np.mean(v) for k, v in valid_losses.items()} + valid_dump['val_history_loss'] = total_loss / num_seen_utts + + # logging + msg = f"Valid: Rank: {dist.get_rank()}, " + msg += "epoch: {}, ".format(self.epoch) + msg += "step: {}, ".format(self.iteration) + msg += "batch : {}/{}, ".format(i + 1, len(self.valid_loader)) + msg += ', '.join('{}: {:>.6f}'.format(k, v) + for k, v in valid_dump.items()) + logger.info(msg) + + logger.info('Rank {} Val info val_loss {}'.format( + dist.get_rank(), total_loss / num_seen_utts)) + return total_loss, num_seen_utts + + def setup_model(self): + config = self.config.clone() + with UpdateConfig(config): + if self.train: + config.input_dim = self.train_loader.collate_fn.feature_size + config.output_dim = self.train_loader.collate_fn.vocab_size + else: + config.input_dim = self.test_loader.collate_fn.feature_size + config.output_dim = self.test_loader.collate_fn.vocab_size + + if self.args.model_type == 'offline': + model = DeepSpeech2Model.from_config(config) + elif self.args.model_type == 'online': + model = DeepSpeech2ModelOnline.from_config(config) + else: + raise Exception("wrong model type") + if self.parallel: + model = paddle.DataParallel(model) + + logger.info(f"{model}") + layer_tools.print_params(model, logger.info) + self.model = model + logger.info("Setup model!") + + if not self.train: + return + + grad_clip = ClipGradByGlobalNormWithLog(config.global_grad_clip) + lr_scheduler = paddle.optimizer.lr.ExponentialDecay( + learning_rate=config.lr, gamma=config.lr_decay, verbose=True) + optimizer = paddle.optimizer.Adam( + learning_rate=lr_scheduler, + parameters=model.parameters(), + weight_decay=paddle.regularizer.L2Decay(config.weight_decay), + grad_clip=grad_clip) + self.optimizer = optimizer + self.lr_scheduler = lr_scheduler + logger.info("Setup optimizer/lr_scheduler!") + + def setup_dataloader(self): + config = self.config.clone() + config.defrost() + if self.train: + # train + config.manifest = config.train_manifest + train_dataset = ManifestDataset.from_config(config) + if self.parallel: + batch_sampler = SortagradDistributedBatchSampler( + train_dataset, + batch_size=config.batch_size, + num_replicas=None, + rank=None, + shuffle=True, + drop_last=True, + sortagrad=config.sortagrad, + shuffle_method=config.shuffle_method) + else: + batch_sampler = SortagradBatchSampler( + train_dataset, + shuffle=True, + batch_size=config.batch_size, + drop_last=True, + sortagrad=config.sortagrad, + shuffle_method=config.shuffle_method) + + config.keep_transcription_text = False + collate_fn_train = SpeechCollator.from_config(config) + self.train_loader = DataLoader( + train_dataset, + batch_sampler=batch_sampler, + collate_fn=collate_fn_train, + num_workers=config.num_workers) + + # dev + config.manifest = config.dev_manifest + dev_dataset = ManifestDataset.from_config(config) + + 
config.augmentation_config = "" + config.keep_transcription_text = False + collate_fn_dev = SpeechCollator.from_config(config) + self.valid_loader = DataLoader( + dev_dataset, + batch_size=int(config.batch_size), + shuffle=False, + drop_last=False, + collate_fn=collate_fn_dev, + num_workers=config.num_workers) + logger.info("Setup train/valid Dataloader!") + else: + # test + config.manifest = config.test_manifest + test_dataset = ManifestDataset.from_config(config) + + config.augmentation_config = "" + config.keep_transcription_text = True + collate_fn_test = SpeechCollator.from_config(config) + decode_batch_size = config.get('decode', dict()).get( + 'decode_batch_size', 1) + self.test_loader = DataLoader( + test_dataset, + batch_size=decode_batch_size, + shuffle=False, + drop_last=False, + collate_fn=collate_fn_test, + num_workers=config.num_workers) + logger.info("Setup test Dataloader!") + + +class DeepSpeech2Tester(DeepSpeech2Trainer): + def __init__(self, config, args): + super().__init__(config, args) + self._text_featurizer = TextFeaturizer( + unit_type=config.unit_type, vocab=None) + + def ordid2token(self, texts, texts_len): + """ ord() id to chr() chr """ + trans = [] + for text, n in zip(texts, texts_len): + n = n.numpy().item() + ids = text[:n] + trans.append(''.join([chr(i) for i in ids])) + return trans + + def compute_metrics(self, + utts, + audio, + audio_len, + texts, + texts_len, + fout=None): + decode_cfg = self.config.decode + errors_sum, len_refs, num_ins = 0.0, 0, 0 + errors_func = error_rate.char_errors if decode_cfg.error_rate_type == 'cer' else error_rate.word_errors + error_rate_func = error_rate.cer if decode_cfg.error_rate_type == 'cer' else error_rate.wer + + target_transcripts = self.ordid2token(texts, texts_len) + + result_transcripts = self.compute_result_transcripts(audio, audio_len) + + for utt, target, result in zip(utts, target_transcripts, + result_transcripts): + errors, len_ref = errors_func(target, result) + errors_sum += errors + len_refs += len_ref + num_ins += 1 + if fout: + fout.write({"utt": utt, "ref": target, "hyp": result}) + logger.info(f"Utt: {utt}") + logger.info(f"Ref: {target}") + logger.info(f"Hyp: {result}") + logger.info( + "Current error rate [%s] = %f" % + (decode_cfg.error_rate_type, error_rate_func(target, result))) + + return dict( + errors_sum=errors_sum, + len_refs=len_refs, + num_ins=num_ins, + error_rate=errors_sum / len_refs, + error_rate_type=decode_cfg.error_rate_type) + + def compute_result_transcripts(self, audio, audio_len): + result_transcripts = self.model.decode(audio, audio_len) + return result_transcripts + + @mp_tools.rank_zero_only + @paddle.no_grad() + def test(self): + logger.info(f"Test Total Examples: {len(self.test_loader.dataset)}") + self.model.eval() + error_rate_type = None + errors_sum, len_refs, num_ins = 0.0, 0, 0 + + # Initialized the decoder in model + decode_cfg = self.config.decode + vocab_list = self.test_loader.collate_fn.vocab_list + decode_batch_size = self.test_loader.batch_size + self.model.decoder.init_decoder( + decode_batch_size, vocab_list, decode_cfg.decoding_method, + decode_cfg.lang_model_path, decode_cfg.alpha, decode_cfg.beta, + decode_cfg.beam_size, decode_cfg.cutoff_prob, + decode_cfg.cutoff_top_n, decode_cfg.num_proc_bsearch) + + with jsonlines.open(self.args.result_file, 'w') as fout: + for i, batch in enumerate(self.test_loader): + utts, audio, audio_len, texts, texts_len = batch + metrics = self.compute_metrics(utts, audio, audio_len, texts, + texts_len, fout) + errors_sum += 
metrics['errors_sum'] + len_refs += metrics['len_refs'] + num_ins += metrics['num_ins'] + error_rate_type = metrics['error_rate_type'] + logger.info("Error rate [%s] (%d/?) = %f" % + (error_rate_type, num_ins, errors_sum / len_refs)) + + # logging + msg = "Test: " + msg += "epoch: {}, ".format(self.epoch) + msg += "step: {}, ".format(self.iteration) + msg += "Final error rate [%s] (%d/%d) = %f" % ( + error_rate_type, num_ins, num_ins, errors_sum / len_refs) + logger.info(msg) + self.model.decoder.del_decoder() + + @paddle.no_grad() + def export(self): + if self.args.model_type == 'offline': + infer_model = DeepSpeech2InferModel.from_pretrained( + self.test_loader, self.config, self.args.checkpoint_path) + elif self.args.model_type == 'online': + infer_model = DeepSpeech2InferModelOnline.from_pretrained( + self.test_loader, self.config, self.args.checkpoint_path) + else: + raise Exception("wrong model type") + + infer_model.eval() + feat_dim = self.test_loader.collate_fn.feature_size + static_model = infer_model.export() + logger.info(f"Export code: {static_model.forward.code}") + paddle.jit.save(static_model, self.args.export_path) + + +class DeepSpeech2ExportTester(DeepSpeech2Tester): + def __init__(self, config, args): + super().__init__(config, args) + self.apply_static = True + self.args = args + + @mp_tools.rank_zero_only + @paddle.no_grad() + def test(self): + logger.info(f"Test Total Examples: {len(self.test_loader.dataset)}") + if self.args.enable_auto_log is True: + from paddlespeech.s2t.utils.log import Autolog + self.autolog = Autolog( + batch_size=self.config.decode.decode_batch_size, + model_name="deepspeech2", + model_precision="fp32").getlog() + self.model.eval() + error_rate_type = None + errors_sum, len_refs, num_ins = 0.0, 0, 0 + + # Initialized the decoder in model + decode_cfg = self.config.decode + vocab_list = self.test_loader.collate_fn.vocab_list + if self.args.model_type == "online": + decode_batch_size = 1 + elif self.args.model_type == "offline": + decode_batch_size = self.test_loader.batch_size + else: + raise Exception("wrong model type") + self.model.decoder.init_decoder( + decode_batch_size, vocab_list, decode_cfg.decoding_method, + decode_cfg.lang_model_path, decode_cfg.alpha, decode_cfg.beta, + decode_cfg.beam_size, decode_cfg.cutoff_prob, + decode_cfg.cutoff_top_n, decode_cfg.num_proc_bsearch) + + with jsonlines.open(self.args.result_file, 'w') as fout: + for i, batch in enumerate(self.test_loader): + utts, audio, audio_len, texts, texts_len = batch + metrics = self.compute_metrics(utts, audio, audio_len, texts, + texts_len, fout) + errors_sum += metrics['errors_sum'] + len_refs += metrics['len_refs'] + num_ins += metrics['num_ins'] + error_rate_type = metrics['error_rate_type'] + logger.info("Error rate [%s] (%d/?) 
= %f" % + (error_rate_type, num_ins, errors_sum / len_refs)) + # logging + msg = "Test: " + msg += "epoch: {}, ".format(self.epoch) + msg += "step: {}, ".format(self.iteration) + msg += "Final error rate [%s] (%d/%d) = %f" % ( + error_rate_type, num_ins, num_ins, errors_sum / len_refs) + logger.info(msg) + if self.args.enable_auto_log is True: + self.autolog.report() + self.model.decoder.del_decoder() + + def compute_result_transcripts(self, audio, audio_len): + if self.args.model_type == "online": + output_probs, output_lens, trans_batch = self.static_forward_online( + audio, audio_len, decoder_chunk_size=1) + result_transcripts = [trans[-1] for trans in trans_batch] + elif self.args.model_type == "offline": + output_probs, output_lens = self.static_forward_offline(audio, + audio_len) + batch_size = output_probs.shape[0] + self.model.decoder.reset_decoder(batch_size=batch_size) + + self.model.decoder.next(output_probs, output_lens) + + trans_best, trans_beam = self.model.decoder.decode() + + result_transcripts = trans_best + + else: + raise Exception("wrong model type") + + self.predictor.clear_intermediate_tensor() + self.predictor.try_shrink_memory() + + #replace the with ' ' + result_transcripts = [ + self._text_featurizer.detokenize(sentence) + for sentence in result_transcripts + ] + + return result_transcripts + + def run_test(self): + """Do Test/Decode""" + try: + with Timer("Test/Decode Done: {}"): + with self.eval(): + self.test() + except KeyboardInterrupt: + exit(-1) + + def static_forward_online(self, audio, audio_len, + decoder_chunk_size: int=1): + """ + Parameters + ---------- + audio (Tensor): shape[B, T, D] + audio_len (Tensor): shape[B] + decoder_chunk_size(int) + Returns + ------- + output_probs(numpy.array): shape[B, T, vocab_size] + output_lens(numpy.array): shape[B] + trans(list(list(str))): shape[B, T] + """ + output_probs_list = [] + output_lens_list = [] + subsampling_rate = self.model.encoder.conv.subsampling_rate + receptive_field_length = self.model.encoder.conv.receptive_field_length + chunk_stride = subsampling_rate * decoder_chunk_size + chunk_size = (decoder_chunk_size - 1 + ) * subsampling_rate + receptive_field_length + + x_batch = audio.numpy() + batch_size, Tmax, x_dim = x_batch.shape + x_len_batch = audio_len.numpy().astype(np.int64) + if (Tmax - chunk_size) % chunk_stride != 0: + # The length of padding for the batch + padding_len_batch = chunk_stride - (Tmax - chunk_size + ) % chunk_stride + else: + padding_len_batch = 0 + x_list = np.split(x_batch, batch_size, axis=0) + x_len_list = np.split(x_len_batch, batch_size, axis=0) + + trans_batch = [] + for x, x_len in zip(x_list, x_len_list): + if self.args.enable_auto_log is True: + self.autolog.times.start() + x_len = x_len[0] + assert (chunk_size <= x_len) + + if (x_len - chunk_size) % chunk_stride != 0: + padding_len_x = chunk_stride - (x_len - chunk_size + ) % chunk_stride + else: + padding_len_x = 0 + + padding = np.zeros( + (x.shape[0], padding_len_x, x.shape[2]), dtype=x.dtype) + padded_x = np.concatenate([x, padding], axis=1) + + num_chunk = (x_len + padding_len_x - chunk_size) / chunk_stride + 1 + num_chunk = int(num_chunk) + + chunk_state_h_box = np.zeros( + (self.config.num_rnn_layers, 1, self.config.rnn_layer_size), + dtype=x.dtype) + chunk_state_c_box = np.zeros( + (self.config.num_rnn_layers, 1, self.config.rnn_layer_size), + dtype=x.dtype) + + input_names = self.predictor.get_input_names() + audio_handle = self.predictor.get_input_handle(input_names[0]) + audio_len_handle = 
self.predictor.get_input_handle(input_names[1]) + h_box_handle = self.predictor.get_input_handle(input_names[2]) + c_box_handle = self.predictor.get_input_handle(input_names[3]) + + trans = [] + probs_chunk_list = [] + probs_chunk_lens_list = [] + if self.args.enable_auto_log is True: + # record the model preprocessing time + self.autolog.times.stamp() + + self.model.decoder.reset_decoder(batch_size=1) + for i in range(0, num_chunk): + start = i * chunk_stride + end = start + chunk_size + x_chunk = padded_x[:, start:end, :] + if x_len < i * chunk_stride: + x_chunk_lens = 0 + else: + x_chunk_lens = min(x_len - i * chunk_stride, chunk_size) + #means the number of input frames in the chunk is not enough for predicting one prob + if (x_chunk_lens < receptive_field_length): + break + x_chunk_lens = np.array([x_chunk_lens]) + audio_handle.reshape(x_chunk.shape) + audio_handle.copy_from_cpu(x_chunk) + + audio_len_handle.reshape(x_chunk_lens.shape) + audio_len_handle.copy_from_cpu(x_chunk_lens) + + h_box_handle.reshape(chunk_state_h_box.shape) + h_box_handle.copy_from_cpu(chunk_state_h_box) + + c_box_handle.reshape(chunk_state_c_box.shape) + c_box_handle.copy_from_cpu(chunk_state_c_box) + + output_names = self.predictor.get_output_names() + output_handle = self.predictor.get_output_handle( + output_names[0]) + output_lens_handle = self.predictor.get_output_handle( + output_names[1]) + output_state_h_handle = self.predictor.get_output_handle( + output_names[2]) + output_state_c_handle = self.predictor.get_output_handle( + output_names[3]) + self.predictor.run() + output_chunk_probs = output_handle.copy_to_cpu() + output_chunk_lens = output_lens_handle.copy_to_cpu() + chunk_state_h_box = output_state_h_handle.copy_to_cpu() + chunk_state_c_box = output_state_c_handle.copy_to_cpu() + self.model.decoder.next(output_chunk_probs, output_chunk_lens) + probs_chunk_list.append(output_chunk_probs) + probs_chunk_lens_list.append(output_chunk_lens) + trans_best, trans_beam = self.model.decoder.decode() + trans.append(trans_best[0]) + trans_batch.append(trans) + output_probs = np.concatenate(probs_chunk_list, axis=1) + output_lens = np.sum(probs_chunk_lens_list, axis=0) + vocab_size = output_probs.shape[2] + output_probs_padding_len = Tmax + padding_len_batch - output_probs.shape[ + 1] + output_probs_padding = np.zeros( + (1, output_probs_padding_len, vocab_size), + dtype=output_probs. 
+ dtype) # The prob padding for a piece of utterance + output_probs = np.concatenate( + [output_probs, output_probs_padding], axis=1) + output_probs_list.append(output_probs) + output_lens_list.append(output_lens) + if self.args.enable_auto_log is True: + # record the model inference time + self.autolog.times.stamp() + # record the post processing time + self.autolog.times.stamp() + self.autolog.times.end() + output_probs = np.concatenate(output_probs_list, axis=0) + output_lens = np.concatenate(output_lens_list, axis=0) + return output_probs, output_lens, trans_batch + + def static_forward_offline(self, audio, audio_len): + """ + Parameters + ---------- + audio (Tensor): shape[B, T, D] + audio_len (Tensor): shape[B] + + Returns + ------- + output_probs(numpy.array): shape[B, T, vocab_size] + output_lens(numpy.array): shape[B] + """ + x = audio.numpy() + x_len = audio_len.numpy().astype(np.int64) + + input_names = self.predictor.get_input_names() + audio_handle = self.predictor.get_input_handle(input_names[0]) + audio_len_handle = self.predictor.get_input_handle(input_names[1]) + + audio_handle.reshape(x.shape) + audio_handle.copy_from_cpu(x) + + audio_len_handle.reshape(x_len.shape) + audio_len_handle.copy_from_cpu(x_len) + + if self.args.enable_auto_log is True: + self.autolog.times.start() + # record the prefix processing time + self.autolog.times.stamp() + self.predictor.run() + if self.args.enable_auto_log is True: + # record the model inference time + self.autolog.times.stamp() + # record the post processing time + self.autolog.times.stamp() + self.autolog.times.end() + + output_names = self.predictor.get_output_names() + output_handle = self.predictor.get_output_handle(output_names[0]) + output_lens_handle = self.predictor.get_output_handle(output_names[1]) + output_probs = output_handle.copy_to_cpu() + output_lens = output_lens_handle.copy_to_cpu() + return output_probs, output_lens + + def setup_model(self): + super().setup_model() + deepspeech_config = inference.Config( + self.args.export_path + ".pdmodel", + self.args.export_path + ".pdiparams") + if (os.environ['CUDA_VISIBLE_DEVICES'].strip() != ''): + deepspeech_config.enable_use_gpu(100, 0) + deepspeech_config.enable_memory_optim() + deepspeech_predictor = inference.create_predictor(deepspeech_config) + self.predictor = deepspeech_predictor diff --git a/ernie-sat/paddlespeech/s2t/exps/lm/transformer/__init__.py b/ernie-sat/paddlespeech/s2t/exps/lm/transformer/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..185a92b8d94d3426d616c0624f0f2ee04339349e --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/exps/lm/transformer/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
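`static_forward_online` above feeds the exported model chunk by chunk: the chunk size and stride are derived from the encoder's convolutional subsampling, and each utterance is zero-padded so the chunks tile it exactly. A small worked sketch of that arithmetic (the subsampling rate, receptive field, and frame count are illustrative values, not read from a real model):

```python
# Illustrative values; in the tester they come from model.encoder.conv.
subsampling_rate = 4
receptive_field_length = 7
decoder_chunk_size = 1  # encoder output frames produced per chunk

# Input frames consumed per step and per chunk.
chunk_stride = subsampling_rate * decoder_chunk_size
chunk_size = (decoder_chunk_size - 1) * subsampling_rate + receptive_field_length

# Pad a hypothetical 30-frame utterance so the chunks tile it exactly.
x_len = 30
padding_len = (chunk_stride - (x_len - chunk_size) % chunk_stride) % chunk_stride
num_chunk = (x_len + padding_len - chunk_size) // chunk_stride + 1

print(chunk_size, chunk_stride, padding_len, num_chunk)  # -> 7 4 1 7
```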
diff --git a/ernie-sat/paddlespeech/s2t/exps/lm/transformer/bin/__init__.py b/ernie-sat/paddlespeech/s2t/exps/lm/transformer/bin/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..185a92b8d94d3426d616c0624f0f2ee04339349e --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/exps/lm/transformer/bin/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/ernie-sat/paddlespeech/s2t/exps/lm/transformer/bin/cacu_perplexity.py b/ernie-sat/paddlespeech/s2t/exps/lm/transformer/bin/cacu_perplexity.py new file mode 100644 index 0000000000000000000000000000000000000000..f3e4d2099f31f30ca30adbd9b37cb93e4f7b965e --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/exps/lm/transformer/bin/cacu_perplexity.py @@ -0,0 +1,82 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import sys + +import configargparse + + +def get_parser(): + """Get default arguments.""" + parser = configargparse.ArgumentParser( + description="The parser for caculating the perplexity of transformer language model ", + config_file_parser_class=configargparse.YAMLConfigFileParser, + formatter_class=configargparse.ArgumentDefaultsHelpFormatter, ) + + parser.add_argument( + "--rnnlm", type=str, default=None, help="RNNLM model file to read") + + parser.add_argument( + "--rnnlm-conf", + type=str, + default=None, + help="RNNLM model config file to read") + + parser.add_argument( + "--vocab_path", + type=str, + default=None, + help="vocab path to for token2id") + + parser.add_argument( + "--bpeprefix", + type=str, + default=None, + help="The path of bpeprefix for loading") + + parser.add_argument( + "--text_path", + type=str, + default=None, + help="The path of text file for testing ") + + parser.add_argument( + "--ngpu", + type=int, + default=0, + help="The number of gpu to use, 0 for using cpu instead") + + parser.add_argument( + "--dtype", + choices=("float16", "float32", "float64"), + default="float32", + help="Float precision (only available in --api v2)", ) + + parser.add_argument( + "--output_dir", + type=str, + default=".", + help="The output directory to store the sentence PPL") + + return parser + + +def main(args): + parser = get_parser() + args = parser.parse_args(args) + from paddlespeech.s2t.exps.lm.transformer.lm_cacu_perplexity import run_get_perplexity + run_get_perplexity(args) + + +if __name__ == "__main__": + main(sys.argv[1:]) diff --git a/ernie-sat/paddlespeech/s2t/exps/lm/transformer/lm_cacu_perplexity.py b/ernie-sat/paddlespeech/s2t/exps/lm/transformer/lm_cacu_perplexity.py new file mode 100644 index 0000000000000000000000000000000000000000..e628f3234fb7a144ebea3989406e0c2a9e0e9898 --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/exps/lm/transformer/lm_cacu_perplexity.py @@ -0,0 +1,132 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# Caculating the PPL of LM model +import os + +import numpy as np +import paddle +from paddle.io import DataLoader +from yacs.config import CfgNode + +from paddlespeech.s2t.models.lm.dataset import TextCollatorSpm +from paddlespeech.s2t.models.lm.dataset import TextDataset +from paddlespeech.s2t.models.lm_interface import dynamic_import_lm +from paddlespeech.s2t.utils.log import Log + +logger = Log(__name__).getlog() + + +def get_config(config_path): + confs = CfgNode(new_allowed=True) + confs.merge_from_file(config_path) + return confs + + +def load_trained_lm(args): + lm_config = get_config(args.rnnlm_conf) + lm_model_module = lm_config.model_module + lm_class = dynamic_import_lm(lm_model_module) + lm = lm_class(**lm_config.model) + model_dict = paddle.load(args.rnnlm) + lm.set_state_dict(model_dict) + return lm, lm_config + + +def write_dict_into_file(ppl_dict, name): + with open(name, "w") as f: + for key in ppl_dict.keys(): + f.write(key + " " + ppl_dict[key] + "\n") + return + + +def cacu_perplexity( + lm_model, + lm_config, + args, + log_base=None, ): + unit_type = lm_config.data.unit_type + batch_size = lm_config.decoding.batch_size + num_workers = lm_config.decoding.num_workers + text_file_path = args.text_path + + total_nll = 0.0 + total_ntokens = 0 + ppl_dict = {} + len_dict = {} + text_dataset = TextDataset.from_file(text_file_path) + collate_fn_text = TextCollatorSpm( + unit_type=unit_type, + vocab_filepath=args.vocab_path, + spm_model_prefix=args.bpeprefix) + train_loader = DataLoader( + text_dataset, + batch_size=batch_size, + collate_fn=collate_fn_text, + num_workers=num_workers) + + logger.info("start caculating PPL......") + for i, (keys, ys_input_pad, ys_output_pad, + y_lens) in enumerate(train_loader()): + + ys_input_pad = paddle.to_tensor(ys_input_pad) + ys_output_pad = paddle.to_tensor(ys_output_pad) + _, unused_logp, unused_count, nll, nll_count = lm_model.forward( + ys_input_pad, ys_output_pad) + nll = nll.numpy() + nll_count = nll_count.numpy() + for key, _nll, ntoken in zip(keys, nll, nll_count): + if log_base is None: + utt_ppl = np.exp(_nll / ntoken) + else: + utt_ppl = log_base**(_nll / ntoken / np.log(log_base)) + + # Write PPL of each utts for debugging or analysis + ppl_dict[key] = str(utt_ppl) + len_dict[key] = str(ntoken) + + total_nll += nll.sum() + total_ntokens += nll_count.sum() + logger.info("Current total nll: " + str(total_nll)) + logger.info("Current total tokens: " + str(total_ntokens)) + write_dict_into_file(ppl_dict, os.path.join(args.output_dir, "uttPPL")) + write_dict_into_file(len_dict, os.path.join(args.output_dir, "uttLEN")) + if log_base is None: + ppl = np.exp(total_nll / total_ntokens) + else: + ppl = log_base**(total_nll / total_ntokens / np.log(log_base)) + + if log_base is None: + log_base = np.e + else: + log_base = log_base + + return ppl, log_base + + +def run_get_perplexity(args): + if args.ngpu > 1: + raise NotImplementedError("only single GPU decoding is supported") + if args.ngpu == 1: + device = "gpu:0" + else: + device = "cpu" + paddle.set_device(device) + dtype = getattr(paddle, args.dtype) + logger.info(f"Decoding device={device}, dtype={dtype}") + lm_model, lm_config = load_trained_lm(args) + lm_model.to(device=device, dtype=dtype) + lm_model.eval() + PPL, log_base = cacu_perplexity(lm_model, lm_config, args, None) + logger.info("Final PPL: " + str(PPL)) + logger.info("The log base is:" + str("%.2f" % log_base)) diff --git a/ernie-sat/paddlespeech/s2t/exps/u2/__init__.py b/ernie-sat/paddlespeech/s2t/exps/u2/__init__.py 
new file mode 100644 index 0000000000000000000000000000000000000000..185a92b8d94d3426d616c0624f0f2ee04339349e --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/exps/u2/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/ernie-sat/paddlespeech/s2t/exps/u2/bin/__init__.py b/ernie-sat/paddlespeech/s2t/exps/u2/bin/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..185a92b8d94d3426d616c0624f0f2ee04339349e --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/exps/u2/bin/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/ernie-sat/paddlespeech/s2t/exps/u2/bin/alignment.py b/ernie-sat/paddlespeech/s2t/exps/u2/bin/alignment.py new file mode 100644 index 0000000000000000000000000000000000000000..e3390feb1af359ede500e9fc8af582a4021060e3 --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/exps/u2/bin/alignment.py @@ -0,0 +1,57 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
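The perplexity computed in `lm_cacu_perplexity.py` above is the exponential of the average negative log-likelihood per token; when a log base is given, the same quantity is simply re-expressed in that base. A tiny worked sketch with made-up numbers:

```python
import numpy as np

total_nll = 693.15    # hypothetical summed negative log-likelihood (natural log)
total_ntokens = 150   # hypothetical number of scored tokens

ppl_e = np.exp(total_nll / total_ntokens)                             # natural-base perplexity
log_base = 10.0
ppl_10 = log_base ** (total_nll / total_ntokens / np.log(log_base))   # same value, base-10 form

assert np.isclose(ppl_e, ppl_10)  # both are ~101.6
```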
+"""Alignment for U2 model.""" +from yacs.config import CfgNode + +from paddlespeech.s2t.exps.u2.model import U2Tester as Tester +from paddlespeech.s2t.training.cli import default_argument_parser +from paddlespeech.s2t.utils.utility import print_arguments + + +def main_sp(config, args): + exp = Tester(config, args) + with exp.eval(): + exp.setup() + exp.run_align() + + +def main(config, args): + main_sp(config, args) + + +if __name__ == "__main__": + parser = default_argument_parser() + # save asr result to + parser.add_argument( + "--result_file", type=str, help="path of save the asr result") + args = parser.parse_args() + print_arguments(args, globals()) + + # https://yaml.org/type/float.html + config = CfgNode(new_allowed=True) + if args.config: + config.merge_from_file(args.config) + if args.decode_cfg: + decode_confs = CfgNode(new_allowed=True) + decode_confs.merge_from_file(args.decode_cfg) + config.decode = decode_confs + if args.opts: + config.merge_from_list(args.opts) + config.freeze() + print(config) + if args.dump_config: + with open(args.dump_config, 'w') as f: + print(config, file=f) + + main(config, args) diff --git a/ernie-sat/paddlespeech/s2t/exps/u2/bin/export.py b/ernie-sat/paddlespeech/s2t/exps/u2/bin/export.py new file mode 100644 index 0000000000000000000000000000000000000000..592b12379be7648377ad3c450f2b608610121fbe --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/exps/u2/bin/export.py @@ -0,0 +1,53 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Export for U2 model.""" +from yacs.config import CfgNode + +from paddlespeech.s2t.exps.u2.model import U2Tester as Tester +from paddlespeech.s2t.training.cli import default_argument_parser +from paddlespeech.s2t.utils.utility import print_arguments + + +def main_sp(config, args): + exp = Tester(config, args) + with exp.eval(): + exp.setup() + exp.run_export() + + +def main(config, args): + main_sp(config, args) + + +if __name__ == "__main__": + parser = default_argument_parser() + # save jit model to + parser.add_argument( + "--export_path", type=str, help="path of the jit model to save") + args = parser.parse_args() + print_arguments(args, globals()) + + # https://yaml.org/type/float.html + config = CfgNode(new_allowed=True) + if args.config: + config.merge_from_file(args.config) + if args.opts: + config.merge_from_list(args.opts) + config.freeze() + print(config) + if args.dump_config: + with open(args.dump_config, 'w') as f: + print(config, file=f) + + main(config, args) diff --git a/ernie-sat/paddlespeech/s2t/exps/u2/bin/test.py b/ernie-sat/paddlespeech/s2t/exps/u2/bin/test.py new file mode 100644 index 0000000000000000000000000000000000000000..f14d804f188a1e6089f4c513974a3e324f3f34f4 --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/exps/u2/bin/test.py @@ -0,0 +1,64 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Evaluation for U2 model.""" +import cProfile + +from yacs.config import CfgNode + +from paddlespeech.s2t.exps.u2.model import U2Tester as Tester +from paddlespeech.s2t.training.cli import default_argument_parser +from paddlespeech.s2t.utils.utility import print_arguments + +# TODO(hui zhang): dynamic load + + +def main_sp(config, args): + exp = Tester(config, args) + with exp.eval(): + exp.setup() + exp.run_test() + + +def main(config, args): + main_sp(config, args) + + +if __name__ == "__main__": + parser = default_argument_parser() + # save asr result to + parser.add_argument( + "--result_file", type=str, help="path of save the asr result") + args = parser.parse_args() + print_arguments(args, globals()) + + # https://yaml.org/type/float.html + config = CfgNode(new_allowed=True) + if args.config: + config.merge_from_file(args.config) + if args.decode_cfg: + decode_confs = CfgNode(new_allowed=True) + decode_confs.merge_from_file(args.decode_cfg) + config.decode = decode_confs + if args.opts: + config.merge_from_list(args.opts) + config.freeze() + print(config) + if args.dump_config: + with open(args.dump_config, 'w') as f: + print(config, file=f) + + # Setting for profiling + pr = cProfile.Profile() + pr.runcall(main, config, args) + pr.dump_stats('test.profile') diff --git a/ernie-sat/paddlespeech/s2t/exps/u2/bin/test_wav.py b/ernie-sat/paddlespeech/s2t/exps/u2/bin/test_wav.py new file mode 100644 index 0000000000000000000000000000000000000000..9904813a581cd4b3ea2dcbe873e5abec8e802491 --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/exps/u2/bin/test_wav.py @@ -0,0 +1,141 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Evaluation for U2 model.""" +import os +import sys +from pathlib import Path + +import paddle +import soundfile +from yacs.config import CfgNode + +from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer +from paddlespeech.s2t.models.u2 import U2Model +from paddlespeech.s2t.training.cli import default_argument_parser +from paddlespeech.s2t.transform.transformation import Transformation +from paddlespeech.s2t.utils.log import Log +from paddlespeech.s2t.utils.utility import UpdateConfig +logger = Log(__name__).getlog() + +# TODO(hui zhang): dynamic load + + +class U2Infer(): + def __init__(self, config, args): + self.args = args + self.config = config + self.audio_file = args.audio_file + + self.preprocess_conf = config.preprocess_config + self.preprocess_args = {"train": False} + self.preprocessing = Transformation(self.preprocess_conf) + + self.text_feature = TextFeaturizer( + unit_type=config.unit_type, + vocab=config.vocab_filepath, + spm_model_prefix=config.spm_model_prefix) + + paddle.set_device('gpu' if self.args.ngpu > 0 else 'cpu') + + # model + model_conf = config + with UpdateConfig(model_conf): + model_conf.input_dim = config.feat_dim + model_conf.output_dim = self.text_feature.vocab_size + model = U2Model.from_config(model_conf) + self.model = model + self.model.eval() + + # load model + params_path = self.args.checkpoint_path + ".pdparams" + model_dict = paddle.load(params_path) + self.model.set_state_dict(model_dict) + + def run(self): + check(args.audio_file) + + with paddle.no_grad(): + # read + audio, sample_rate = soundfile.read( + self.audio_file, dtype="int16", always_2d=True) + + audio = audio[:, 0] + logger.info(f"audio shape: {audio.shape}") + + # fbank + feat = self.preprocessing(audio, **self.preprocess_args) + logger.info(f"feat shape: {feat.shape}") + + ilen = paddle.to_tensor(feat.shape[0]) + xs = paddle.to_tensor(feat, dtype='float32').unsqueeze(axis=0) + + decode_config = self.config.decode + result_transcripts = self.model.decode( + xs, + ilen, + text_feature=self.text_feature, + decoding_method=decode_config.decoding_method, + beam_size=decode_config.beam_size, + ctc_weight=decode_config.ctc_weight, + decoding_chunk_size=decode_config.decoding_chunk_size, + num_decoding_left_chunks=decode_config.num_decoding_left_chunks, + simulate_streaming=decode_config.simulate_streaming) + rsl = result_transcripts[0][0] + utt = Path(self.audio_file).name + logger.info(f"hyp: {utt} {result_transcripts[0][0]}") + return rsl + + +def check(audio_file): + if not os.path.isfile(audio_file): + print("Please input the right audio file path") + sys.exit(-1) + + logger.info("checking the audio file format......") + try: + sig, sample_rate = soundfile.read(audio_file) + except Exception as e: + logger.error(str(e)) + logger.error( + "can not open the wav file, please check the audio file format") + sys.exit(-1) + logger.info("The sample rate is %d" % sample_rate) + assert (sample_rate == 16000) + logger.info("The audio file format is right") + + +def main(config, args): + U2Infer(config, args).run() + + +if __name__ == "__main__": + parser = default_argument_parser() + # save asr result to + parser.add_argument( + "--result_file", type=str, help="path of save the asr result") + parser.add_argument( + "--audio_file", type=str, help="path of the input audio file") + args = parser.parse_args() + + config = CfgNode(new_allowed=True) + if args.config: + config.merge_from_file(args.config) + if args.decode_cfg: + decode_confs = CfgNode(new_allowed=True) + 
decode_confs.merge_from_file(args.decode_cfg) + config.decode = decode_confs + if args.opts: + config.merge_from_list(args.opts) + config.freeze() + main(config, args) diff --git a/ernie-sat/paddlespeech/s2t/exps/u2/bin/train.py b/ernie-sat/paddlespeech/s2t/exps/u2/bin/train.py new file mode 100644 index 0000000000000000000000000000000000000000..53c223283f180397e69b6a5290a4572e5a76cc41 --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/exps/u2/bin/train.py @@ -0,0 +1,61 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Trainer for U2 model.""" +import cProfile +import os + +from paddle import distributed as dist +from yacs.config import CfgNode + +from paddlespeech.s2t.exps.u2.model import U2Trainer as Trainer +from paddlespeech.s2t.training.cli import default_argument_parser +from paddlespeech.s2t.utils.utility import print_arguments + +# from paddlespeech.s2t.exps.u2.trainer import U2Trainer as Trainer + + +def main_sp(config, args): + exp = Trainer(config, args) + exp.setup() + exp.run() + + +def main(config, args): + if args.ngpu > 1: + dist.spawn(main_sp, args=(config, args), nprocs=args.ngpu) + else: + main_sp(config, args) + + +if __name__ == "__main__": + parser = default_argument_parser() + args = parser.parse_args() + print_arguments(args, globals()) + + # https://yaml.org/type/float.html + config = CfgNode(new_allowed=True) + if args.config: + config.merge_from_file(args.config) + if args.opts: + config.merge_from_list(args.opts) + config.freeze() + print(config) + if args.dump_config: + with open(args.dump_config, 'w') as f: + print(config, file=f) + + # Setting for profiling + pr = cProfile.Profile() + pr.runcall(main, config, args) + pr.dump_stats(os.path.join(args.output, 'train.profile')) diff --git a/ernie-sat/paddlespeech/s2t/exps/u2/model.py b/ernie-sat/paddlespeech/s2t/exps/u2/model.py new file mode 100644 index 0000000000000000000000000000000000000000..efcc9629fdbf63981cfdc4cc5b91693e5f3a85ee --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/exps/u2/model.py @@ -0,0 +1,546 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
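The U2 `train.py` and `test.py` entry points above wrap `main` in `cProfile` and dump the statistics to a `.profile` file. A short sketch of how such a dump can be inspected afterwards with the standard library (the file name is whatever the script wrote, e.g. `test.profile`):

```python
import pstats

stats = pstats.Stats("test.profile")           # file produced by pr.dump_stats(...)
stats.strip_dirs().sort_stats("cumulative")    # order by cumulative time
stats.print_stats(20)                          # show the 20 most expensive entries
```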
+"""Contains U2 model.""" +import json +import os +import time +from collections import defaultdict +from collections import OrderedDict +from contextlib import nullcontext + +import jsonlines +import numpy as np +import paddle +from paddle import distributed as dist + +from paddlespeech.s2t.frontend.featurizer import TextFeaturizer +from paddlespeech.s2t.io.dataloader import BatchDataLoader +from paddlespeech.s2t.models.u2 import U2Model +from paddlespeech.s2t.training.optimizer import OptimizerFactory +from paddlespeech.s2t.training.reporter import ObsScope +from paddlespeech.s2t.training.reporter import report +from paddlespeech.s2t.training.scheduler import LRSchedulerFactory +from paddlespeech.s2t.training.timer import Timer +from paddlespeech.s2t.training.trainer import Trainer +from paddlespeech.s2t.utils import ctc_utils +from paddlespeech.s2t.utils import error_rate +from paddlespeech.s2t.utils import layer_tools +from paddlespeech.s2t.utils import mp_tools +from paddlespeech.s2t.utils.log import Log +from paddlespeech.s2t.utils.utility import UpdateConfig + +logger = Log(__name__).getlog() + + +class U2Trainer(Trainer): + def __init__(self, config, args): + super().__init__(config, args) + + def train_batch(self, batch_index, batch_data, msg): + train_conf = self.config + start = time.time() + + # forward + utt, audio, audio_len, text, text_len = batch_data + loss, attention_loss, ctc_loss = self.model(audio, audio_len, text, + text_len) + + # loss div by `batch_size * accum_grad` + loss /= train_conf.accum_grad + losses_np = {'loss': float(loss) * train_conf.accum_grad} + if attention_loss: + losses_np['att_loss'] = float(attention_loss) + if ctc_loss: + losses_np['ctc_loss'] = float(ctc_loss) + + # loss backward + if (batch_index + 1) % train_conf.accum_grad != 0: + # Disable gradient synchronizations across DDP processes. + # Within this context, gradients will be accumulated on module + # variables, which will later be synchronized. + # When using cpu w/o DDP, model does not have `no_sync` + context = self.model.no_sync if (hasattr(self.model, "no_sync") and + self.parallel) else nullcontext + else: + # Used for single gpu training and DDP gradient synchronization + # processes. 
+ context = nullcontext + with context(): + loss.backward() + layer_tools.print_grads(self.model, print_func=None) + + # optimizer step + if (batch_index + 1) % train_conf.accum_grad == 0: + self.optimizer.step() + self.optimizer.clear_grad() + self.lr_scheduler.step() + self.iteration += 1 + + iteration_time = time.time() - start + + for k, v in losses_np.items(): + report(k, v) + report("batch_size", self.config.batch_size) + report("accum", train_conf.accum_grad) + report("step_cost", iteration_time) + + if (batch_index + 1) % train_conf.accum_grad == 0: + if dist.get_rank() == 0 and self.visualizer: + losses_np_v = losses_np.copy() + losses_np_v.update({"lr": self.lr_scheduler()}) + for key, val in losses_np_v.items(): + self.visualizer.add_scalar( + tag='train/' + key, value=val, step=self.iteration - 1) + + @paddle.no_grad() + def valid(self): + self.model.eval() + logger.info(f"Valid Total Examples: {len(self.valid_loader.dataset)}") + valid_losses = defaultdict(list) + num_seen_utts = 1 + total_loss = 0.0 + for i, batch in enumerate(self.valid_loader): + utt, audio, audio_len, text, text_len = batch + loss, attention_loss, ctc_loss = self.model(audio, audio_len, text, + text_len) + if paddle.isfinite(loss): + num_utts = batch[1].shape[0] + num_seen_utts += num_utts + total_loss += float(loss) * num_utts + valid_losses['val_loss'].append(float(loss)) + if attention_loss: + valid_losses['val_att_loss'].append(float(attention_loss)) + if ctc_loss: + valid_losses['val_ctc_loss'].append(float(ctc_loss)) + + if (i + 1) % self.config.log_interval == 0: + valid_dump = {k: np.mean(v) for k, v in valid_losses.items()} + valid_dump['val_history_loss'] = total_loss / num_seen_utts + + # logging + msg = f"Valid: Rank: {dist.get_rank()}, " + msg += "epoch: {}, ".format(self.epoch) + msg += "step: {}, ".format(self.iteration) + msg += "batch: {}/{}, ".format(i + 1, len(self.valid_loader)) + msg += ', '.join('{}: {:>.6f}'.format(k, v) + for k, v in valid_dump.items()) + logger.info(msg) + + logger.info('Rank {} Val info val_loss {}'.format( + dist.get_rank(), total_loss / num_seen_utts)) + return total_loss, num_seen_utts + + def do_train(self): + """The training process control by step.""" + # !!!IMPORTANT!!! 
+ # Try to export the model by script, if fails, we should refine + # the code to satisfy the script export requirements + # script_model = paddle.jit.to_static(self.model) + # script_model_path = str(self.checkpoint_dir / 'init') + # paddle.jit.save(script_model, script_model_path) + + self.before_train() + + logger.info(f"Train Total Examples: {len(self.train_loader.dataset)}") + while self.epoch < self.config.n_epoch: + with Timer("Epoch-Train Time Cost: {}"): + self.model.train() + try: + data_start_time = time.time() + for batch_index, batch in enumerate(self.train_loader): + dataload_time = time.time() - data_start_time + msg = "Train:" + observation = OrderedDict() + with ObsScope(observation): + report("Rank", dist.get_rank()) + report("epoch", self.epoch) + report('step', self.iteration) + report("lr", self.lr_scheduler()) + self.train_batch(batch_index, batch, msg) + self.after_train_batch() + report('iter', batch_index + 1) + report('total', len(self.train_loader)) + report('reader_cost', dataload_time) + observation['batch_cost'] = observation[ + 'reader_cost'] + observation['step_cost'] + observation['samples'] = observation['batch_size'] + observation['ips,samples/s'] = observation[ + 'batch_size'] / observation['batch_cost'] + for k, v in observation.items(): + msg += f" {k.split(',')[0]}: " + msg += f"{v:>.8f}" if isinstance(v, + float) else f"{v}" + msg += f" {k.split(',')[1]}" if len( + k.split(',')) == 2 else "" + msg += "," + msg = msg[:-1] # remove the last "," + if (batch_index + 1) % self.config.log_interval == 0: + logger.info(msg) + data_start_time = time.time() + except Exception as e: + logger.error(e) + raise e + + with Timer("Eval Time Cost: {}"): + total_loss, num_seen_utts = self.valid() + if dist.get_world_size() > 1: + num_seen_utts = paddle.to_tensor(num_seen_utts) + # the default operator in all_reduce function is sum. 
+ dist.all_reduce(num_seen_utts) + total_loss = paddle.to_tensor(total_loss) + dist.all_reduce(total_loss) + cv_loss = total_loss / num_seen_utts + cv_loss = float(cv_loss) + else: + cv_loss = total_loss / num_seen_utts + + logger.info( + 'Epoch {} Val info val_loss {}'.format(self.epoch, cv_loss)) + if self.visualizer: + self.visualizer.add_scalar( + tag='eval/cv_loss', value=cv_loss, step=self.epoch) + self.visualizer.add_scalar( + tag='eval/lr', value=self.lr_scheduler(), step=self.epoch) + + self.save(tag=self.epoch, infos={'val_loss': cv_loss}) + self.new_epoch() + + def setup_dataloader(self): + config = self.config.clone() + + if self.train: + # train/valid dataset, return token ids + self.train_loader = BatchDataLoader( + json_file=config.train_manifest, + train_mode=True, + sortagrad=config.sortagrad, + batch_size=config.batch_size, + maxlen_in=config.maxlen_in, + maxlen_out=config.maxlen_out, + minibatches=config.minibatches, + mini_batch_size=self.args.ngpu, + batch_count=config.batch_count, + batch_bins=config.batch_bins, + batch_frames_in=config.batch_frames_in, + batch_frames_out=config.batch_frames_out, + batch_frames_inout=config.batch_frames_inout, + preprocess_conf=config.preprocess_config, + n_iter_processes=config.num_workers, + subsampling_factor=1, + num_encs=1, + dist_sampler=config.get('dist_sampler', False), + shortest_first=False) + + self.valid_loader = BatchDataLoader( + json_file=config.dev_manifest, + train_mode=False, + sortagrad=False, + batch_size=config.batch_size, + maxlen_in=float('inf'), + maxlen_out=float('inf'), + minibatches=0, + mini_batch_size=self.args.ngpu, + batch_count='auto', + batch_bins=0, + batch_frames_in=0, + batch_frames_out=0, + batch_frames_inout=0, + preprocess_conf=config.preprocess_config, + n_iter_processes=config.num_workers, + subsampling_factor=1, + num_encs=1, + dist_sampler=config.get('dist_sampler', False), + shortest_first=False) + logger.info("Setup train/valid Dataloader!") + else: + decode_batch_size = config.get('decode', dict()).get( + 'decode_batch_size', 1) + # test dataset, return raw text + self.test_loader = BatchDataLoader( + json_file=config.test_manifest, + train_mode=False, + sortagrad=False, + batch_size=decode_batch_size, + maxlen_in=float('inf'), + maxlen_out=float('inf'), + minibatches=0, + mini_batch_size=1, + batch_count='auto', + batch_bins=0, + batch_frames_in=0, + batch_frames_out=0, + batch_frames_inout=0, + preprocess_conf=config.preprocess_config, + n_iter_processes=1, + subsampling_factor=1, + num_encs=1) + + self.align_loader = BatchDataLoader( + json_file=config.test_manifest, + train_mode=False, + sortagrad=False, + batch_size=decode_batch_size, + maxlen_in=float('inf'), + maxlen_out=float('inf'), + minibatches=0, + mini_batch_size=1, + batch_count='auto', + batch_bins=0, + batch_frames_in=0, + batch_frames_out=0, + batch_frames_inout=0, + preprocess_conf=config.preprocess_config, + n_iter_processes=1, + subsampling_factor=1, + num_encs=1) + logger.info("Setup test/align Dataloader!") + + def setup_model(self): + config = self.config + model_conf = config + + with UpdateConfig(model_conf): + if self.train: + model_conf.input_dim = self.train_loader.feat_dim + model_conf.output_dim = self.train_loader.vocab_size + else: + model_conf.input_dim = self.test_loader.feat_dim + model_conf.output_dim = self.test_loader.vocab_size + + model = U2Model.from_config(model_conf) + + if self.parallel: + model = paddle.DataParallel(model) + + logger.info(f"{model}") + layer_tools.print_params(model, 
logger.info) + self.model = model + logger.info("Setup model!") + + if not self.train: + return + + train_config = config + optim_type = train_config.optim + optim_conf = train_config.optim_conf + scheduler_type = train_config.scheduler + scheduler_conf = train_config.scheduler_conf + + scheduler_args = { + "learning_rate": optim_conf.lr, + "verbose": False, + "warmup_steps": scheduler_conf.warmup_steps, + "gamma": scheduler_conf.lr_decay, + "d_model": model_conf.encoder_conf.output_size, + } + lr_scheduler = LRSchedulerFactory.from_args(scheduler_type, + scheduler_args) + + def optimizer_args( + config, + parameters, + lr_scheduler=None, ): + train_config = config + optim_type = train_config.optim + optim_conf = train_config.optim_conf + scheduler_type = train_config.scheduler + scheduler_conf = train_config.scheduler_conf + return { + "grad_clip": train_config.global_grad_clip, + "weight_decay": optim_conf.weight_decay, + "learning_rate": lr_scheduler + if lr_scheduler else optim_conf.lr, + "parameters": parameters, + "epsilon": 1e-9 if optim_type == 'noam' else None, + "beta1": 0.9 if optim_type == 'noam' else None, + "beat2": 0.98 if optim_type == 'noam' else None, + } + + optimzer_args = optimizer_args(config, model.parameters(), lr_scheduler) + optimizer = OptimizerFactory.from_args(optim_type, optimzer_args) + + self.optimizer = optimizer + self.lr_scheduler = lr_scheduler + logger.info("Setup optimizer/lr_scheduler!") + + +class U2Tester(U2Trainer): + def __init__(self, config, args): + super().__init__(config, args) + self.text_feature = TextFeaturizer( + unit_type=self.config.unit_type, + vocab=self.config.vocab_filepath, + spm_model_prefix=self.config.spm_model_prefix) + self.vocab_list = self.text_feature.vocab_list + + def id2token(self, texts, texts_len, text_feature): + """ ord() id to chr() chr """ + trans = [] + for text, n in zip(texts, texts_len): + n = n.numpy().item() + ids = text[:n] + trans.append(text_feature.defeaturize(ids.numpy().tolist())) + return trans + + def compute_metrics(self, + utts, + audio, + audio_len, + texts, + texts_len, + fout=None): + decode_config = self.config.decode + errors_sum, len_refs, num_ins = 0.0, 0, 0 + errors_func = error_rate.char_errors if decode_config.error_rate_type == 'cer' else error_rate.word_errors + error_rate_func = error_rate.cer if decode_config.error_rate_type == 'cer' else error_rate.wer + + start_time = time.time() + target_transcripts = self.id2token(texts, texts_len, self.text_feature) + result_transcripts, result_tokenids = self.model.decode( + audio, + audio_len, + text_feature=self.text_feature, + decoding_method=decode_config.decoding_method, + beam_size=decode_config.beam_size, + ctc_weight=decode_config.ctc_weight, + decoding_chunk_size=decode_config.decoding_chunk_size, + num_decoding_left_chunks=decode_config.num_decoding_left_chunks, + simulate_streaming=decode_config.simulate_streaming) + decode_time = time.time() - start_time + + for utt, target, result, rec_tids in zip( + utts, target_transcripts, result_transcripts, result_tokenids): + errors, len_ref = errors_func(target, result) + errors_sum += errors + len_refs += len_ref + num_ins += 1 + if fout: + fout.write({ + "utt": utt, + "refs": [target], + "hyps": [result], + "hyps_tokenid": [rec_tids], + }) + logger.info(f"Utt: {utt}") + logger.info(f"Ref: {target}") + logger.info(f"Hyp: {result}") + logger.info("One example error rate [%s] = %f" % ( + decode_config.error_rate_type, error_rate_func(target, result))) + + return dict( + errors_sum=errors_sum, + 
len_refs=len_refs, + num_ins=num_ins, # num examples + error_rate=errors_sum / len_refs, + error_rate_type=decode_config.error_rate_type, + num_frames=audio_len.sum().numpy().item(), + decode_time=decode_time) + + @mp_tools.rank_zero_only + @paddle.no_grad() + def test(self): + assert self.args.result_file + self.model.eval() + logger.info(f"Test Total Examples: {len(self.test_loader.dataset)}") + + stride_ms = self.config.stride_ms + error_rate_type = None + errors_sum, len_refs, num_ins = 0.0, 0, 0 + num_frames = 0.0 + num_time = 0.0 + with jsonlines.open(self.args.result_file, 'w') as fout: + for i, batch in enumerate(self.test_loader): + metrics = self.compute_metrics(*batch, fout=fout) + num_frames += metrics['num_frames'] + num_time += metrics["decode_time"] + errors_sum += metrics['errors_sum'] + len_refs += metrics['len_refs'] + num_ins += metrics['num_ins'] + error_rate_type = metrics['error_rate_type'] + rtf = num_time / (num_frames * stride_ms) + logger.info( + "RTF: %f, Error rate [%s] (%d/?) = %f" % + (rtf, error_rate_type, num_ins, errors_sum / len_refs)) + + rtf = num_time / (num_frames * stride_ms) + msg = "Test: " + msg += "epoch: {}, ".format(self.epoch) + msg += "step: {}, ".format(self.iteration) + msg += "RTF: {}, ".format(rtf) + msg += "Final error rate [%s] (%d/%d) = %f" % ( + error_rate_type, num_ins, num_ins, errors_sum / len_refs) + logger.info(msg) + + # test meta results + err_meta_path = os.path.splitext(self.args.result_file)[0] + '.err' + err_type_str = "{}".format(error_rate_type) + with open(err_meta_path, 'w') as f: + data = json.dumps({ + "epoch": + self.epoch, + "step": + self.iteration, + "rtf": + rtf, + error_rate_type: + errors_sum / len_refs, + "dataset_hour": (num_frames * stride_ms) / 1000.0 / 3600.0, + "process_hour": + num_time / 1000.0 / 3600.0, + "num_examples": + num_ins, + "err_sum": + errors_sum, + "ref_len": + len_refs, + "decode_method": + self.config.decode.decoding_method, + }) + f.write(data + '\n') + + @paddle.no_grad() + def align(self): + ctc_utils.ctc_align(self.config, self.model, self.align_loader, + self.config.decode.decode_batch_size, + self.config.stride_ms, self.vocab_list, + self.args.result_file) + + def load_inferspec(self): + """infer model and input spec. + + Returns: + nn.Layer: inference model + List[paddle.static.InputSpec]: input spec. + """ + from paddlespeech.s2t.models.u2 import U2InferModel + infer_model = U2InferModel.from_pretrained(self.test_loader, + self.config.clone(), + self.args.checkpoint_path) + feat_dim = self.test_loader.feat_dim + input_spec = [ + paddle.static.InputSpec(shape=[1, None, feat_dim], + dtype='float32'), # audio, [B,T,D] + paddle.static.InputSpec(shape=[1], + dtype='int64'), # audio_length, [B] + ] + return infer_model, input_spec + + @paddle.no_grad() + def export(self): + infer_model, input_spec = self.load_inferspec() + assert isinstance(input_spec, list), type(input_spec) + infer_model.eval() + static_model = paddle.jit.to_static(infer_model, input_spec=input_spec) + logger.info(f"Export code: {static_model.forward.code}") + paddle.jit.save(static_model, self.args.export_path) diff --git a/ernie-sat/paddlespeech/s2t/exps/u2/trainer.py b/ernie-sat/paddlespeech/s2t/exps/u2/trainer.py new file mode 100644 index 0000000000000000000000000000000000000000..ab87c30d6cdb83053b53e080094f5c52ef8050f1 --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/exps/u2/trainer.py @@ -0,0 +1,219 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Contains U2 model.""" +import paddle +from paddle import distributed as dist +from paddle.io import DataLoader + +from paddlespeech.s2t.io.collator import SpeechCollator +from paddlespeech.s2t.io.dataset import ManifestDataset +from paddlespeech.s2t.io.sampler import SortagradBatchSampler +from paddlespeech.s2t.io.sampler import SortagradDistributedBatchSampler +from paddlespeech.s2t.models.u2 import U2Evaluator +from paddlespeech.s2t.models.u2 import U2Model +from paddlespeech.s2t.models.u2 import U2Updater +from paddlespeech.s2t.training.extensions.snapshot import Snapshot +from paddlespeech.s2t.training.extensions.visualizer import VisualDL +from paddlespeech.s2t.training.optimizer import OptimizerFactory +from paddlespeech.s2t.training.scheduler import LRSchedulerFactory +from paddlespeech.s2t.training.timer import Timer +from paddlespeech.s2t.training.trainer import Trainer +from paddlespeech.s2t.training.updaters.trainer import Trainer as NewTrainer +from paddlespeech.s2t.utils import layer_tools +from paddlespeech.s2t.utils.log import Log +from paddlespeech.s2t.utils.utility import UpdateConfig + +logger = Log(__name__).getlog() + + +class U2Trainer(Trainer): + def __init__(self, config, args): + super().__init__(config, args) + + def setup_dataloader(self): + config = self.config.clone() + config.defrost() + config.keep_transcription_text = False + + # train/valid dataset, return token ids + config.manifest = config.train_manifest + train_dataset = ManifestDataset.from_config(config) + + config.manifest = config.dev_manifest + dev_dataset = ManifestDataset.from_config(config) + + collate_fn_train = SpeechCollator.from_config(config) + + collate_fn_dev = SpeechCollator.from_config(config) + + if self.parallel: + batch_sampler = SortagradDistributedBatchSampler( + train_dataset, + batch_size=config.batch_size, + num_replicas=None, + rank=None, + shuffle=True, + drop_last=True, + sortagrad=config.sortagrad, + shuffle_method=config.shuffle_method) + else: + batch_sampler = SortagradBatchSampler( + train_dataset, + shuffle=True, + batch_size=config.batch_size, + drop_last=True, + sortagrad=config.sortagrad, + shuffle_method=config.shuffle_method) + self.train_loader = DataLoader( + train_dataset, + batch_sampler=batch_sampler, + collate_fn=collate_fn_train, + num_workers=config.num_workers, ) + self.valid_loader = DataLoader( + dev_dataset, + batch_size=config.batch_size, + shuffle=False, + drop_last=False, + collate_fn=collate_fn_dev, + num_workers=config.num_workers, ) + + # test dataset, return raw text + config.manifest = config.test_manifest + # filter test examples, will cause less examples, but no mismatch with training + # and can use large batch size , save training time, so filter test egs now. 
+ config.min_input_len = 0.0 # second + config.max_input_len = float('inf') # second + config.min_output_len = 0.0 # tokens + config.max_output_len = float('inf') # tokens + config.min_output_input_ratio = 0.00 + config.max_output_input_ratio = float('inf') + + test_dataset = ManifestDataset.from_config(config) + # return text ord id + config.keep_transcription_text = True + self.test_loader = DataLoader( + test_dataset, + batch_size=config.decode.batch_size, + shuffle=False, + drop_last=False, + collate_fn=SpeechCollator.from_config(config)) + # return text token id + config.keep_transcription_text = False + self.align_loader = DataLoader( + test_dataset, + batch_size=config.decode.batch_size, + shuffle=False, + drop_last=False, + collate_fn=SpeechCollator.from_config(config)) + logger.info("Setup train/valid/test/align Dataloader!") + + def setup_model(self): + config = self.config + model_conf = config + with UpdateConfig(model_conf): + model_conf.input_dim = self.train_loader.collate_fn.feature_size + model_conf.output_dim = self.train_loader.collate_fn.vocab_size + + model = U2Model.from_config(model_conf) + + if self.parallel: + model = paddle.DataParallel(model) + + model.train() + logger.info(f"{model}") + layer_tools.print_params(model, logger.info) + + train_config = config + optim_type = train_config.optim + optim_conf = train_config.optim_conf + scheduler_type = train_config.scheduler + scheduler_conf = train_config.scheduler_conf + + scheduler_args = { + "learning_rate": optim_conf.lr, + "verbose": False, + "warmup_steps": scheduler_conf.warmup_steps, + "gamma": scheduler_conf.lr_decay, + "d_model": model_conf.encoder_conf.output_size, + } + lr_scheduler = LRSchedulerFactory.from_args(scheduler_type, + scheduler_args) + + def optimizer_args( + config, + parameters, + lr_scheduler=None, ): + train_config = config + optim_type = train_config.optim + optim_conf = train_config.optim_conf + scheduler_type = train_config.scheduler + scheduler_conf = train_config.scheduler_conf + return { + "grad_clip": train_config.global_grad_clip, + "weight_decay": optim_conf.weight_decay, + "learning_rate": lr_scheduler + if lr_scheduler else optim_conf.lr, + "parameters": parameters, + "epsilon": 1e-9 if optim_type == 'noam' else None, + "beta1": 0.9 if optim_type == 'noam' else None, + "beat2": 0.98 if optim_type == 'noam' else None, + } + + optimzer_args = optimizer_args(config, model.parameters(), lr_scheduler) + optimizer = OptimizerFactory.from_args(optim_type, optimzer_args) + + self.model = model + self.optimizer = optimizer + self.lr_scheduler = lr_scheduler + logger.info("Setup model/optimizer/lr_scheduler!") + + def setup_updater(self): + output_dir = self.output_dir + config = self.config + + updater = U2Updater( + model=self.model, + optimizer=self.optimizer, + scheduler=self.lr_scheduler, + dataloader=self.train_loader, + output_dir=output_dir, + accum_grad=config.accum_grad) + + trainer = NewTrainer(updater, (config.n_epoch, 'epoch'), output_dir) + + evaluator = U2Evaluator(self.model, self.valid_loader) + + trainer.extend(evaluator, trigger=(1, "epoch")) + + if dist.get_rank() == 0: + trainer.extend(VisualDL(output_dir), trigger=(1, "iteration")) + num_snapshots = config.checkpoint.kbest_n + trainer.extend( + Snapshot( + mode='kbest', + max_size=num_snapshots, + indicator='VALID/LOSS', + less_better=True), + trigger=(1, 'epoch')) + # print(trainer.extensions) + # trainer.run() + self.trainer = trainer + + def run(self): + """The routine of the experiment after setup. 
This method is intended + to be used by the user. + """ + self.setup_updater() + with Timer("Training Done: {}"): + self.trainer.run() diff --git a/ernie-sat/paddlespeech/s2t/exps/u2_kaldi/__init__.py b/ernie-sat/paddlespeech/s2t/exps/u2_kaldi/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..185a92b8d94d3426d616c0624f0f2ee04339349e --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/exps/u2_kaldi/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/ernie-sat/paddlespeech/s2t/exps/u2_kaldi/bin/__init__.py b/ernie-sat/paddlespeech/s2t/exps/u2_kaldi/bin/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..185a92b8d94d3426d616c0624f0f2ee04339349e --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/exps/u2_kaldi/bin/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/ernie-sat/paddlespeech/s2t/exps/u2_kaldi/bin/recog.py b/ernie-sat/paddlespeech/s2t/exps/u2_kaldi/bin/recog.py new file mode 100644 index 0000000000000000000000000000000000000000..37ddd229e43b65d6c8e00ff7c9d5a1262ccbd97a --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/exps/u2_kaldi/bin/recog.py @@ -0,0 +1,19 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import sys + +from paddlespeech.s2t.decoders.recog_bin import main + +if __name__ == "__main__": + main(sys.argv[1:]) diff --git a/ernie-sat/paddlespeech/s2t/exps/u2_kaldi/bin/test.py b/ernie-sat/paddlespeech/s2t/exps/u2_kaldi/bin/test.py new file mode 100644 index 0000000000000000000000000000000000000000..422483b9797059f8b836c8abe6ee4546e56619f7 --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/exps/u2_kaldi/bin/test.py @@ -0,0 +1,87 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Evaluation for U2 model.""" +import cProfile + +from yacs.config import CfgNode + +from paddlespeech.s2t.training.cli import default_argument_parser +from paddlespeech.s2t.utils.dynamic_import import dynamic_import +from paddlespeech.s2t.utils.utility import print_arguments + +model_test_alias = { + "u2": "paddlespeech.s2t.exps.u2.model:U2Tester", + "u2_kaldi": "paddlespeech.s2t.exps.u2_kaldi.model:U2Tester", +} + + +def main_sp(config, args): + class_obj = dynamic_import(args.model_name, model_test_alias) + exp = class_obj(config, args) + with exp.eval(): + exp.setup() + if args.run_mode == 'test': + exp.run_test() + elif args.run_mode == 'export': + exp.run_export() + elif args.run_mode == 'align': + exp.run_align() + + +def main(config, args): + main_sp(config, args) + + +if __name__ == "__main__": + parser = default_argument_parser() + parser.add_argument( + '--model-name', + type=str, + default='u2_kaldi', + help='model name, e.g: deepspeech2, u2, u2_kaldi, u2_st') + parser.add_argument( + '--run-mode', + type=str, + default='test', + help='run mode, e.g. test, align, export') + parser.add_argument( + '--dict-path', type=str, default=None, help='dict path.') + # save asr result to + parser.add_argument( + "--result-file", type=str, help="path of save the asr result") + # save jit model to + parser.add_argument( + "--export-path", type=str, help="path of the jit model to save") + args = parser.parse_args() + print_arguments(args, globals()) + + config = CfgNode() + config.set_new_allowed(True) + config.merge_from_file(args.config) + if args.decode_cfg: + decode_confs = CfgNode(new_allowed=True) + decode_confs.merge_from_file(args.decode_cfg) + config.decode = decode_confs + if args.opts: + config.merge_from_list(args.opts) + config.freeze() + print(config) + if args.dump_config: + with open(args.dump_config, 'w') as f: + print(config, file=f) + + # Setting for profiling + pr = cProfile.Profile() + pr.runcall(main, config, args) + pr.dump_stats('test.profile') diff --git a/ernie-sat/paddlespeech/s2t/exps/u2_kaldi/bin/train.py b/ernie-sat/paddlespeech/s2t/exps/u2_kaldi/bin/train.py new file mode 100644 index 0000000000000000000000000000000000000000..fcfc05a8aea553d154ebde5289560d9c818672ff --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/exps/u2_kaldi/bin/train.py @@ -0,0 +1,69 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
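The u2_kaldi entry points pick their Trainer/Tester class via `dynamic_import` and an alias table that maps a short model name to a `"module:Class"` string. The snippet below is a rough, self-contained sketch of that resolution step using `importlib`; it only illustrates the convention and is not the paddlespeech helper itself:

```python
# Sketch of resolving a "module:Class" alias table with importlib.
# Illustrative only; the real helper lives in paddlespeech.s2t.utils.dynamic_import.
import importlib

MODEL_TEST_ALIAS = {
    "u2": "paddlespeech.s2t.exps.u2.model:U2Tester",
    "u2_kaldi": "paddlespeech.s2t.exps.u2_kaldi.model:U2Tester",
}

def resolve(name: str, alias_table: dict):
    target = alias_table.get(name, name)        # also accept a raw "module:Class"
    module_name, class_name = target.split(":")
    module = importlib.import_module(module_name)
    return getattr(module, class_name)

# tester_cls = resolve("u2_kaldi", MODEL_TEST_ALIAS)
# exp = tester_cls(config, args)
```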
+"""Trainer for U2 model.""" +import cProfile +import os + +from paddle import distributed as dist +from yacs.config import CfgNode + +from paddlespeech.s2t.training.cli import default_argument_parser +from paddlespeech.s2t.utils.dynamic_import import dynamic_import +from paddlespeech.s2t.utils.utility import print_arguments + +model_train_alias = { + "u2": "paddlespeech.s2t.exps.u2.model:U2Trainer", + "u2_kaldi": "paddlespeech.s2t.exps.u2_kaldi.model:U2Trainer", +} + + +def main_sp(config, args): + class_obj = dynamic_import(args.model_name, model_train_alias) + exp = class_obj(config, args) + exp.setup() + exp.run() + + +def main(config, args): + if args.ngpu > 1: + dist.spawn(main_sp, args=(config, args), nprocs=args.ngpu) + else: + main_sp(config, args) + + +if __name__ == "__main__": + parser = default_argument_parser() + parser.add_argument( + '--model-name', + type=str, + default='u2_kaldi', + help='model name, e.g: deepspeech2, u2, u2_kaldi, u2_st') + args = parser.parse_args() + print_arguments(args, globals()) + + config = CfgNode() + config.set_new_allowed(True) + config.merge_from_file(args.config) + if args.opts: + config.merge_from_list(args.opts) + config.freeze() + print(config) + if args.dump_config: + with open(args.dump_config, 'w') as f: + print(config, file=f) + + # Setting for profiling + pr = cProfile.Profile() + pr.runcall(main, config, args) + pr.dump_stats(os.path.join(args.output, 'train.profile')) diff --git a/ernie-sat/paddlespeech/s2t/exps/u2_kaldi/model.py b/ernie-sat/paddlespeech/s2t/exps/u2_kaldi/model.py new file mode 100644 index 0000000000000000000000000000000000000000..bc995977ada577770612f99d05387ed0bb87d39e --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/exps/u2_kaldi/model.py @@ -0,0 +1,509 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Contains U2 model.""" +import json +import os +import time +from collections import defaultdict +from contextlib import nullcontext + +import jsonlines +import numpy as np +import paddle +from paddle import distributed as dist + +from paddlespeech.s2t.frontend.featurizer import TextFeaturizer +from paddlespeech.s2t.frontend.utility import load_dict +from paddlespeech.s2t.io.dataloader import BatchDataLoader +from paddlespeech.s2t.models.u2 import U2Model +from paddlespeech.s2t.training.optimizer import OptimizerFactory +from paddlespeech.s2t.training.scheduler import LRSchedulerFactory +from paddlespeech.s2t.training.timer import Timer +from paddlespeech.s2t.training.trainer import Trainer +from paddlespeech.s2t.utils import ctc_utils +from paddlespeech.s2t.utils import error_rate +from paddlespeech.s2t.utils import layer_tools +from paddlespeech.s2t.utils import mp_tools +from paddlespeech.s2t.utils.log import Log +from paddlespeech.s2t.utils.utility import UpdateConfig + +logger = Log(__name__).getlog() + + +class U2Trainer(Trainer): + def __init__(self, config, args): + super().__init__(config, args) + + def train_batch(self, batch_index, batch_data, msg): + train_conf = self.config + start = time.time() + + # forward + utt, audio, audio_len, text, text_len = batch_data + loss, attention_loss, ctc_loss = self.model(audio, audio_len, text, + text_len) + + # loss div by `batch_size * accum_grad` + loss /= train_conf.accum_grad + losses_np = {'loss': float(loss) * train_conf.accum_grad} + if attention_loss: + losses_np['att_loss'] = float(attention_loss) + if ctc_loss: + losses_np['ctc_loss'] = float(ctc_loss) + + # loss backward + if (batch_index + 1) % train_conf.accum_grad != 0: + # Disable gradient synchronizations across DDP processes. + # Within this context, gradients will be accumulated on module + # variables, which will later be synchronized. + context = self.model.no_sync if (hasattr(self.model, "no_sync") and + self.parallel) else nullcontext + else: + # Used for single gpu training and DDP gradient synchronization + # processes. 
+ context = nullcontext + with context(): + loss.backward() + layer_tools.print_grads(self.model, print_func=None) + + # optimizer step + if (batch_index + 1) % train_conf.accum_grad == 0: + self.optimizer.step() + self.optimizer.clear_grad() + self.lr_scheduler.step() + self.iteration += 1 + + iteration_time = time.time() - start + + if (batch_index + 1) % train_conf.log_interval == 0: + msg += "train time: {:>.3f}s, ".format(iteration_time) + msg += "batch size: {}, ".format(self.config.batch_size) + msg += "accum: {}, ".format(train_conf.accum_grad) + msg += ', '.join('{}: {:>.6f}'.format(k, v) + for k, v in losses_np.items()) + logger.info(msg) + + if dist.get_rank() == 0 and self.visualizer: + losses_np_v = losses_np.copy() + losses_np_v.update({"lr": self.lr_scheduler()}) + for key, val in losses_np_v.items(): + self.visualizer.add_scalar( + tag="train/" + key, value=val, step=self.iteration - 1) + + @paddle.no_grad() + def valid(self): + self.model.eval() + logger.info(f"Valid Total Examples: {len(self.valid_loader.dataset)}") + valid_losses = defaultdict(list) + num_seen_utts = 1 + total_loss = 0.0 + + for i, batch in enumerate(self.valid_loader): + utt, audio, audio_len, text, text_len = batch + loss, attention_loss, ctc_loss = self.model(audio, audio_len, text, + text_len) + if paddle.isfinite(loss): + num_utts = batch[1].shape[0] + num_seen_utts += num_utts + total_loss += float(loss) * num_utts + valid_losses['val_loss'].append(float(loss)) + if attention_loss: + valid_losses['val_att_loss'].append(float(attention_loss)) + if ctc_loss: + valid_losses['val_ctc_loss'].append(float(ctc_loss)) + + if (i + 1) % self.config.log_interval == 0: + valid_dump = {k: np.mean(v) for k, v in valid_losses.items()} + valid_dump['val_history_loss'] = total_loss / num_seen_utts + + # logging + msg = f"Valid: Rank: {dist.get_rank()}, " + msg += "epoch: {}, ".format(self.epoch) + msg += "step: {}, ".format(self.iteration) + msg += "batch: {}/{}, ".format(i + 1, len(self.valid_loader)) + msg += ', '.join('{}: {:>.6f}'.format(k, v) + for k, v in valid_dump.items()) + logger.info(msg) + + logger.info('Rank {} Val info val_loss {}'.format( + dist.get_rank(), total_loss / num_seen_utts)) + return total_loss, num_seen_utts + + def do_train(self): + """The training process control by step.""" + # !!!IMPORTANT!!! 
+ # Try to export the model by script, if fails, we should refine + # the code to satisfy the script export requirements + # script_model = paddle.jit.to_static(self.model) + # script_model_path = str(self.checkpoint_dir / 'init') + # paddle.jit.save(script_model, script_model_path) + + self.before_train() + + logger.info(f"Train Total Examples: {len(self.train_loader.dataset)}") + while self.epoch < self.config.n_epoch: + with Timer("Epoch-Train Time Cost: {}"): + self.model.train() + try: + data_start_time = time.time() + for batch_index, batch in enumerate(self.train_loader): + dataload_time = time.time() - data_start_time + msg = "Train: Rank: {}, ".format(dist.get_rank()) + msg += "epoch: {}, ".format(self.epoch) + msg += "step: {}, ".format(self.iteration) + msg += "batch : {}/{}, ".format(batch_index + 1, + len(self.train_loader)) + msg += "lr: {:>.8f}, ".format(self.lr_scheduler()) + msg += "data time: {:>.3f}s, ".format(dataload_time) + self.train_batch(batch_index, batch, msg) + self.after_train_batch() + data_start_time = time.time() + except Exception as e: + logger.error(e) + raise e + + with Timer("Eval Time Cost: {}"): + total_loss, num_seen_utts = self.valid() + if dist.get_world_size() > 1: + num_seen_utts = paddle.to_tensor(num_seen_utts) + # the default operator in all_reduce function is sum. + dist.all_reduce(num_seen_utts) + total_loss = paddle.to_tensor(total_loss) + dist.all_reduce(total_loss) + cv_loss = total_loss / num_seen_utts + cv_loss = float(cv_loss) + else: + cv_loss = total_loss / num_seen_utts + + logger.info( + 'Epoch {} Val info val_loss {}'.format(self.epoch, cv_loss)) + if self.visualizer: + self.visualizer.add_scalar( + tag='eval/cv_loss', value=cv_loss, step=self.epoch) + self.visualizer.add_scalar( + tag='eval/lr', value=self.lr_scheduler(), step=self.epoch) + + self.save(tag=self.epoch, infos={'val_loss': cv_loss}) + self.new_epoch() + + def setup_dataloader(self): + config = self.config.clone() + # train/valid dataset, return token ids + self.train_loader = BatchDataLoader( + json_file=config.train_manifest, + train_mode=True, + sortagrad=False, + batch_size=config.batch_size, + maxlen_in=float('inf'), + maxlen_out=float('inf'), + minibatches=0, + mini_batch_size=self.args.ngpu, + batch_count='auto', + batch_bins=0, + batch_frames_in=0, + batch_frames_out=0, + batch_frames_inout=0, + preprocess_conf=config.preprocess_config, + n_iter_processes=config.num_workers, + subsampling_factor=1, + num_encs=1) + + self.valid_loader = BatchDataLoader( + json_file=config.dev_manifest, + train_mode=False, + sortagrad=False, + batch_size=config.batch_size, + maxlen_in=float('inf'), + maxlen_out=float('inf'), + minibatches=0, + mini_batch_size=self.args.ngpu, + batch_count='auto', + batch_bins=0, + batch_frames_in=0, + batch_frames_out=0, + batch_frames_inout=0, + preprocess_conf=None, + n_iter_processes=config.num_workers, + subsampling_factor=1, + num_encs=1) + + decode_batch_size = config.get('decode', dict()).get( + 'decode_batch_size', 1) + # test dataset, return raw text + self.test_loader = BatchDataLoader( + json_file=config.test_manifest, + train_mode=False, + sortagrad=False, + batch_size=decode_batch_size, + maxlen_in=float('inf'), + maxlen_out=float('inf'), + minibatches=0, + mini_batch_size=1, + batch_count='auto', + batch_bins=0, + batch_frames_in=0, + batch_frames_out=0, + batch_frames_inout=0, + preprocess_conf=None, + n_iter_processes=1, + subsampling_factor=1, + num_encs=1) + + self.align_loader = BatchDataLoader( + 
json_file=config.test_manifest, + train_mode=False, + sortagrad=False, + batch_size=decode_batch_size, + maxlen_in=float('inf'), + maxlen_out=float('inf'), + minibatches=0, + mini_batch_size=1, + batch_count='auto', + batch_bins=0, + batch_frames_in=0, + batch_frames_out=0, + batch_frames_inout=0, + preprocess_conf=None, + n_iter_processes=1, + subsampling_factor=1, + num_encs=1) + logger.info("Setup train/valid/test/align Dataloader!") + + def setup_model(self): + config = self.config + + # model + model_conf = config + with UpdateConfig(model_conf): + model_conf.input_dim = self.train_loader.feat_dim + model_conf.output_dim = self.train_loader.vocab_size + model = U2Model.from_config(model_conf) + if self.parallel: + model = paddle.DataParallel(model) + layer_tools.print_params(model, logger.info) + + # lr + scheduler_conf = config.scheduler_conf + scheduler_args = { + "learning_rate": scheduler_conf.lr, + "warmup_steps": scheduler_conf.warmup_steps, + "gamma": scheduler_conf.lr_decay, + "d_model": model_conf.encoder_conf.output_size, + "verbose": False, + } + lr_scheduler = LRSchedulerFactory.from_args(config.scheduler, + scheduler_args) + + # opt + def optimizer_args( + config, + parameters, + lr_scheduler=None, ): + optim_conf = config.optim_conf + return { + "grad_clip": optim_conf.global_grad_clip, + "weight_decay": optim_conf.weight_decay, + "learning_rate": lr_scheduler, + "parameters": parameters, + } + + optimzer_args = optimizer_args(config, model.parameters(), lr_scheduler) + optimizer = OptimizerFactory.from_args(config.optim, optimzer_args) + + self.model = model + self.lr_scheduler = lr_scheduler + self.optimizer = optimizer + logger.info("Setup model/optimizer/lr_scheduler!") + + +class U2Tester(U2Trainer): + def __init__(self, config, args): + super().__init__(config, args) + self.text_feature = TextFeaturizer( + unit_type=self.config.unit_type, + vocab=self.config.vocab_filepath, + spm_model_prefix=self.config.spm_model_prefix) + self.vocab_list = self.text_feature.vocab_list + + def id2token(self, texts, texts_len, text_feature): + """ ord() id to chr() chr """ + trans = [] + for text, n in zip(texts, texts_len): + n = n.numpy().item() + ids = text[:n] + trans.append(text_feature.defeaturize(ids.numpy().tolist())) + return trans + + def compute_metrics(self, + utts, + audio, + audio_len, + texts, + texts_len, + fout=None): + decode_cfg = self.config.decode + errors_sum, len_refs, num_ins = 0.0, 0, 0 + errors_func = error_rate.char_errors if decode_cfg.error_rate_type == 'cer' else error_rate.word_errors + error_rate_func = error_rate.cer if decode_cfg.error_rate_type == 'cer' else error_rate.wer + + start_time = time.time() + target_transcripts = self.id2token(texts, texts_len, self.text_feature) + result_transcripts, result_tokenids = self.model.decode( + audio, + audio_len, + text_feature=self.text_feature, + decoding_method=decode_cfg.decoding_method, + beam_size=decode_cfg.beam_size, + ctc_weight=decode_cfg.ctc_weight, + decoding_chunk_size=decode_cfg.decoding_chunk_size, + num_decoding_left_chunks=decode_cfg.num_decoding_left_chunks, + simulate_streaming=decode_cfg.simulate_streaming) + decode_time = time.time() - start_time + + for i, (utt, target, result, rec_tids) in enumerate( + zip(utts, target_transcripts, result_transcripts, + result_tokenids)): + errors, len_ref = errors_func(target, result) + errors_sum += errors + len_refs += len_ref + num_ins += 1 + if fout: + fout.write({ + "utt": utt, + "refs": [target], + "hyps": [result], + "hyps_tokenid": 
[rec_tids], + }) + logger.info(f"Utt: {utt}") + logger.info(f"Ref: {target}") + logger.info(f"Hyp: {result}") + logger.info( + "One example error rate [%s] = %f" % + (decode_cfg.error_rate_type, error_rate_func(target, result))) + + return dict( + errors_sum=errors_sum, + len_refs=len_refs, + num_ins=num_ins, # num examples + error_rate=errors_sum / len_refs, + error_rate_type=decode_cfg.error_rate_type, + num_frames=audio_len.sum().numpy().item(), + decode_time=decode_time) + + @mp_tools.rank_zero_only + @paddle.no_grad() + def test(self): + assert self.args.result_file + self.model.eval() + logger.info(f"Test Total Examples: {len(self.test_loader.dataset)}") + + stride_ms = self.config.stride_ms + error_rate_type = None + errors_sum, len_refs, num_ins = 0.0, 0, 0 + num_frames = 0.0 + num_time = 0.0 + with jsonlines.open(self.args.result_file, 'w') as fout: + for i, batch in enumerate(self.test_loader): + metrics = self.compute_metrics(*batch, fout=fout) + num_frames += metrics['num_frames'] + num_time += metrics["decode_time"] + errors_sum += metrics['errors_sum'] + len_refs += metrics['len_refs'] + num_ins += metrics['num_ins'] + error_rate_type = metrics['error_rate_type'] + rtf = num_time / (num_frames * stride_ms) + logger.info( + "RTF: %f, Error rate [%s] (%d/?) = %f" % + (rtf, error_rate_type, num_ins, errors_sum / len_refs)) + + rtf = num_time / (num_frames * stride_ms) + msg = "Test: " + msg += "epoch: {}, ".format(self.epoch) + msg += "step: {}, ".format(self.iteration) + msg += "RTF: {}, ".format(rtf) + msg += "Final error rate [%s] (%d/%d) = %f" % ( + error_rate_type, num_ins, num_ins, errors_sum / len_refs) + logger.info(msg) + + # test meta results + err_meta_path = os.path.splitext(self.args.result_file)[0] + '.err' + err_type_str = "{}".format(error_rate_type) + with open(err_meta_path, 'w') as f: + data = json.dumps({ + "epoch": + self.epoch, + "step": + self.iteration, + "rtf": + rtf, + error_rate_type: + errors_sum / len_refs, + "dataset_hour": (num_frames * stride_ms) / 1000.0 / 3600.0, + "process_hour": + num_time / 1000.0 / 3600.0, + "num_examples": + num_ins, + "err_sum": + errors_sum, + "ref_len": + len_refs, + "decode_method": + self.config.decode.decoding_method, + }) + f.write(data + '\n') + + @paddle.no_grad() + def align(self): + ctc_utils.ctc_align(self.config, self.model, self.align_loader, + self.config.decode.decode_batch_size, + self.config.stride_ms, self.vocab_list, + self.args.result_file) + + def load_inferspec(self): + """infer model and input spec. + + Returns: + nn.Layer: inference model + List[paddle.static.InputSpec]: input spec. 
+ """ + from paddlespeech.s2t.models.u2 import U2InferModel + infer_model = U2InferModel.from_pretrained(self.test_loader, + self.config.clone(), + self.args.checkpoint_path) + feat_dim = self.test_loader.feat_dim + input_spec = [ + paddle.static.InputSpec(shape=[1, None, feat_dim], + dtype='float32'), # audio, [B,T,D] + paddle.static.InputSpec(shape=[1], + dtype='int64'), # audio_length, [B] + ] + return infer_model, input_spec + + @paddle.no_grad() + def export(self): + infer_model, input_spec = self.load_inferspec() + assert isinstance(input_spec, list), type(input_spec) + infer_model.eval() + static_model = paddle.jit.to_static(infer_model, input_spec=input_spec) + logger.info(f"Export code: {static_model.forward.code}") + paddle.jit.save(static_model, self.args.export_path) + + def setup_dict(self): + # load dictionary for debug log + self.args.char_list = load_dict(self.args.dict_path, + "maskctc" in self.args.model_name) + + def setup(self): + super().setup() + self.setup_dict() diff --git a/ernie-sat/paddlespeech/s2t/exps/u2_st/__init__.py b/ernie-sat/paddlespeech/s2t/exps/u2_st/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..185a92b8d94d3426d616c0624f0f2ee04339349e --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/exps/u2_st/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/ernie-sat/paddlespeech/s2t/exps/u2_st/bin/__init__.py b/ernie-sat/paddlespeech/s2t/exps/u2_st/bin/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..185a92b8d94d3426d616c0624f0f2ee04339349e --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/exps/u2_st/bin/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/ernie-sat/paddlespeech/s2t/exps/u2_st/bin/export.py b/ernie-sat/paddlespeech/s2t/exps/u2_st/bin/export.py new file mode 100644 index 0000000000000000000000000000000000000000..c641152fe4bd75b5c72617370b02b29cc79f0435 --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/exps/u2_st/bin/export.py @@ -0,0 +1,53 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Export for U2 model.""" +from yacs.config import CfgNode + +from paddlespeech.s2t.exps.u2_st.model import U2STTester as Tester +from paddlespeech.s2t.training.cli import default_argument_parser +from paddlespeech.s2t.utils.utility import print_arguments + + +def main_sp(config, args): + exp = Tester(config, args) + with exp.eval(): + exp.setup() + exp.run_export() + + +def main(config, args): + main_sp(config, args) + + +if __name__ == "__main__": + parser = default_argument_parser() + # save jit model to + parser.add_argument( + "--export_path", type=str, help="path of the jit model to save") + args = parser.parse_args() + print_arguments(args, globals()) + + # https://yaml.org/type/float.html + config = CfgNode(new_allowed=True) + if args.config: + config.merge_from_file(args.config) + if args.opts: + config.merge_from_list(args.opts) + config.freeze() + print(config) + if args.dump_config: + with open(args.dump_config, 'w') as f: + print(config, file=f) + + main(config, args) diff --git a/ernie-sat/paddlespeech/s2t/exps/u2_st/bin/test.py b/ernie-sat/paddlespeech/s2t/exps/u2_st/bin/test.py new file mode 100644 index 0000000000000000000000000000000000000000..1d70a310347c6eecba7358124a58c9de9ef11c31 --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/exps/u2_st/bin/test.py @@ -0,0 +1,64 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
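The `export` paths in these trainers save a static-graph model with `paddle.jit.save`, using an input spec of a `[1, None, feat_dim]` float32 audio tensor and a `[1]` int64 length tensor. A minimal sketch of loading such an exported model back and running a dummy forward pass; the export prefix and `feat_dim = 83` are assumptions, match them to your own `--export_path` and feature config:

```python
# Load a model exported with paddle.jit.save() and run a dummy forward pass.
# "exp/u2_export" and feat_dim = 83 are placeholders.
import numpy as np
import paddle

model = paddle.jit.load("exp/u2_export")    # reads the .pdmodel / .pdiparams pair
model.eval()

feat_dim = 83
audio = paddle.to_tensor(np.zeros((1, 120, feat_dim), dtype="float32"))  # [B, T, D]
audio_len = paddle.to_tensor(np.array([120], dtype="int64"))             # [B]
outputs = model(audio, audio_len)
```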
+"""Evaluation for U2 model.""" +import cProfile + +from yacs.config import CfgNode + +from paddlespeech.s2t.exps.u2_st.model import U2STTester as Tester +from paddlespeech.s2t.training.cli import default_argument_parser +from paddlespeech.s2t.utils.utility import print_arguments + +# TODO(hui zhang): dynamic load + + +def main_sp(config, args): + exp = Tester(config, args) + with exp.eval(): + exp.setup() + exp.run_test() + + +def main(config, args): + main_sp(config, args) + + +if __name__ == "__main__": + parser = default_argument_parser() + # save asr result to + parser.add_argument( + "--result_file", type=str, help="path of save the asr result") + args = parser.parse_args() + print_arguments(args, globals()) + + # https://yaml.org/type/float.html + config = CfgNode(new_allowed=True) + if args.config: + config.merge_from_file(args.config) + if args.decode_cfg: + decode_conf = CfgNode(new_allowed=True) + decode_conf.merge_from_file(args.decode_cfg) + config.decode = decode_conf + if args.opts: + config.merge_from_list(args.opts) + config.freeze() + print(config) + if args.dump_config: + with open(args.dump_config, 'w') as f: + print(config, file=f) + + # Setting for profiling + pr = cProfile.Profile() + pr.runcall(main, config, args) + pr.dump_stats('test.profile') diff --git a/ernie-sat/paddlespeech/s2t/exps/u2_st/bin/train.py b/ernie-sat/paddlespeech/s2t/exps/u2_st/bin/train.py new file mode 100644 index 0000000000000000000000000000000000000000..4dec9ec8ae5da869f8b78282ea19c7a6964610db --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/exps/u2_st/bin/train.py @@ -0,0 +1,59 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Trainer for U2 model.""" +import cProfile +import os + +from paddle import distributed as dist +from yacs.config import CfgNode + +from paddlespeech.s2t.exps.u2_st.model import U2STTrainer as Trainer +from paddlespeech.s2t.training.cli import default_argument_parser +from paddlespeech.s2t.utils.utility import print_arguments + + +def main_sp(config, args): + exp = Trainer(config, args) + exp.setup() + exp.run() + + +def main(config, args): + if args.ngpu > 1: + dist.spawn(main_sp, args=(config, args), nprocs=args.ngpu) + else: + main_sp(config, args) + + +if __name__ == "__main__": + parser = default_argument_parser() + args = parser.parse_args() + print_arguments(args, globals()) + + # https://yaml.org/type/float.html + config = CfgNode(new_allowed=True) + if args.config: + config.merge_from_file(args.config) + if args.opts: + config.merge_from_list(args.opts) + config.freeze() + print(config) + if args.dump_config: + with open(args.dump_config, 'w') as f: + print(config, file=f) + + # Setting for profiling + pr = cProfile.Profile() + pr.runcall(main, config, args) + pr.dump_stats(os.path.join(args.output, 'train.profile')) diff --git a/ernie-sat/paddlespeech/s2t/exps/u2_st/model.py b/ernie-sat/paddlespeech/s2t/exps/u2_st/model.py new file mode 100644 index 0000000000000000000000000000000000000000..6a32eda7717cc4077a90eb561e0f01ac8a212f51 --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/exps/u2_st/model.py @@ -0,0 +1,552 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Contains U2 model.""" +import json +import os +import time +from collections import defaultdict +from collections import OrderedDict +from contextlib import nullcontext + +import jsonlines +import numpy as np +import paddle +from paddle import distributed as dist + +from paddlespeech.s2t.frontend.featurizer import TextFeaturizer +from paddlespeech.s2t.io.dataloader import BatchDataLoader +from paddlespeech.s2t.models.u2_st import U2STModel +from paddlespeech.s2t.training.optimizer import OptimizerFactory +from paddlespeech.s2t.training.reporter import ObsScope +from paddlespeech.s2t.training.reporter import report +from paddlespeech.s2t.training.scheduler import LRSchedulerFactory +from paddlespeech.s2t.training.timer import Timer +from paddlespeech.s2t.training.trainer import Trainer +from paddlespeech.s2t.utils import bleu_score +from paddlespeech.s2t.utils import layer_tools +from paddlespeech.s2t.utils import mp_tools +from paddlespeech.s2t.utils.log import Log +from paddlespeech.s2t.utils.utility import UpdateConfig + +logger = Log(__name__).getlog() + + +class U2STTrainer(Trainer): + def __init__(self, config, args): + super().__init__(config, args) + + def train_batch(self, batch_index, batch_data, msg): + train_conf = self.config + start = time.time() + # forward + utt, audio, audio_len, text, text_len = batch_data + if isinstance(text, list) and isinstance(text_len, list): + # joint training with ASR. 
Two decoding texts [translation, transcription] + text, text_transcript = text + text_len, text_transcript_len = text_len + loss, st_loss, attention_loss, ctc_loss = self.model( + audio, audio_len, text, text_len, text_transcript, + text_transcript_len) + else: + loss, st_loss, attention_loss, ctc_loss = self.model( + audio, audio_len, text, text_len) + + # loss div by `batch_size * accum_grad` + loss /= train_conf.accum_grad + losses_np = {'loss': float(loss) * train_conf.accum_grad} + if st_loss: + losses_np['st_loss'] = float(st_loss) + if attention_loss: + losses_np['att_loss'] = float(attention_loss) + if ctc_loss: + losses_np['ctc_loss'] = float(ctc_loss) + + # loss backward + if (batch_index + 1) % train_conf.accum_grad != 0: + # Disable gradient synchronizations across DDP processes. + # Within this context, gradients will be accumulated on module + # variables, which will later be synchronized. + context = self.model.no_sync if (hasattr(self.model, "no_sync") and + self.parallel) else nullcontext + else: + # Used for single gpu training and DDP gradient synchronization + # processes. + context = nullcontext + with context(): + loss.backward() + layer_tools.print_grads(self.model, print_func=None) + + # optimizer step + if (batch_index + 1) % train_conf.accum_grad == 0: + self.optimizer.step() + self.optimizer.clear_grad() + self.lr_scheduler.step() + self.iteration += 1 + + iteration_time = time.time() - start + + for k, v in losses_np.items(): + report(k, v) + report("batch_size", self.config.batch_size) + report("accum", train_conf.accum_grad) + report("step_cost", iteration_time) + + if (batch_index + 1) % train_conf.log_interval == 0: + msg += "train time: {:>.3f}s, ".format(iteration_time) + msg += "batch size: {}, ".format(self.config.batch_size) + msg += "accum: {}, ".format(train_conf.accum_grad) + msg += ', '.join('{}: {:>.6f}'.format(k, v) + for k, v in losses_np.items()) + logger.info(msg) + + if dist.get_rank() == 0 and self.visualizer: + losses_np_v = losses_np.copy() + losses_np_v.update({"lr": self.lr_scheduler()}) + for key, val in losses_np_v.items(): + self.visualizer.add_scalar( + tag="train/" + key, value=val, step=self.iteration - 1) + + @paddle.no_grad() + def valid(self): + self.model.eval() + logger.info(f"Valid Total Examples: {len(self.valid_loader.dataset)}") + valid_losses = defaultdict(list) + num_seen_utts = 1 + total_loss = 0.0 + for i, batch in enumerate(self.valid_loader): + utt, audio, audio_len, text, text_len = batch + if isinstance(text, list) and isinstance(text_len, list): + text, text_transcript = text + text_len, text_transcript_len = text_len + loss, st_loss, attention_loss, ctc_loss = self.model( + audio, audio_len, text, text_len, text_transcript, + text_transcript_len) + else: + loss, st_loss, attention_loss, ctc_loss = self.model( + audio, audio_len, text, text_len) + if paddle.isfinite(loss): + num_utts = batch[1].shape[0] + num_seen_utts += num_utts + total_loss += float(st_loss) * num_utts + valid_losses['val_loss'].append(float(st_loss)) + if attention_loss: + valid_losses['val_att_loss'].append(float(attention_loss)) + if ctc_loss: + valid_losses['val_ctc_loss'].append(float(ctc_loss)) + + if (i + 1) % self.config.log_interval == 0: + valid_dump = {k: np.mean(v) for k, v in valid_losses.items()} + valid_dump['val_history_st_loss'] = total_loss / num_seen_utts + + # logging + msg = f"Valid: Rank: {dist.get_rank()}, " + msg += "epoch: {}, ".format(self.epoch) + msg += "step: {}, ".format(self.iteration) + msg += "batch: {}/{}, 
".format(i + 1, len(self.valid_loader)) + msg += ', '.join('{}: {:>.6f}'.format(k, v) + for k, v in valid_dump.items()) + logger.info(msg) + + logger.info('Rank {} Val info st_val_loss {}'.format( + dist.get_rank(), total_loss / num_seen_utts)) + return total_loss, num_seen_utts + + def do_train(self): + """The training process control by step.""" + # !!!IMPORTANT!!! + # Try to export the model by script, if fails, we should refine + # the code to satisfy the script export requirements + # script_model = paddle.jit.to_static(self.model) + # script_model_path = str(self.checkpoint_dir / 'init') + # paddle.jit.save(script_model, script_model_path) + + self.before_train() + + logger.info(f"Train Total Examples: {len(self.train_loader.dataset)}") + while self.epoch < self.config.n_epoch: + with Timer("Epoch-Train Time Cost: {}"): + self.model.train() + try: + data_start_time = time.time() + for batch_index, batch in enumerate(self.train_loader): + dataload_time = time.time() - data_start_time + msg = "Train:" + observation = OrderedDict() + with ObsScope(observation): + report("Rank", dist.get_rank()) + report("epoch", self.epoch) + report('step', self.iteration) + report("lr", self.lr_scheduler()) + self.train_batch(batch_index, batch, msg) + self.after_train_batch() + report('iter', batch_index + 1) + report('total', len(self.train_loader)) + report('reader_cost', dataload_time) + observation['batch_cost'] = observation[ + 'reader_cost'] + observation['step_cost'] + observation['samples'] = observation['batch_size'] + observation['ips,sent./sec'] = observation[ + 'batch_size'] / observation['batch_cost'] + for k, v in observation.items(): + msg += f" {k.split(',')[0]}: " + msg += f"{v:>.8f}" if isinstance(v, + float) else f"{v}" + msg += f" {k.split(',')[1]}" if len( + k.split(',')) == 2 else "" + msg += "," + msg = msg[:-1] # remove the last "," + if (batch_index + 1) % self.config.log_interval == 0: + logger.info(msg) + except Exception as e: + logger.error(e) + raise e + + with Timer("Eval Time Cost: {}"): + total_loss, num_seen_utts = self.valid() + if dist.get_world_size() > 1: + num_seen_utts = paddle.to_tensor(num_seen_utts) + # the default operator in all_reduce function is sum. + dist.all_reduce(num_seen_utts) + total_loss = paddle.to_tensor(total_loss) + dist.all_reduce(total_loss) + cv_loss = total_loss / num_seen_utts + cv_loss = float(cv_loss) + else: + cv_loss = total_loss / num_seen_utts + + logger.info( + 'Epoch {} Val info val_loss {}'.format(self.epoch, cv_loss)) + if self.visualizer: + self.visualizer.add_scalar( + tag='eval/cv_loss', value=cv_loss, step=self.epoch) + self.visualizer.add_scalar( + tag='eval/lr', value=self.lr_scheduler(), step=self.epoch) + + self.save(tag=self.epoch, infos={'val_loss': cv_loss}) + self.new_epoch() + + def setup_dataloader(self): + config = self.config.clone() + + load_transcript = True if config.model_conf.asr_weight > 0 else False + + if self.train: + # train/valid dataset, return token ids + self.train_loader = BatchDataLoader( + json_file=config.train_manifest, + train_mode=True, + sortagrad=False, + batch_size=config.batch_size, + maxlen_in=config.maxlen_in, + maxlen_out=config.maxlen_out, + minibatches=0, + mini_batch_size=1, + batch_count='auto', + batch_bins=0, + batch_frames_in=0, + batch_frames_out=0, + batch_frames_inout=0, + preprocess_conf=config. 
+ preprocess_config, # aug will be off when train_mode=False + n_iter_processes=config.num_workers, + subsampling_factor=1, + load_aux_output=load_transcript, + num_encs=1, + dist_sampler=True) + + self.valid_loader = BatchDataLoader( + json_file=config.dev_manifest, + train_mode=False, + sortagrad=False, + batch_size=config.batch_size, + maxlen_in=float('inf'), + maxlen_out=float('inf'), + minibatches=0, + mini_batch_size=1, + batch_count='auto', + batch_bins=0, + batch_frames_in=0, + batch_frames_out=0, + batch_frames_inout=0, + preprocess_conf=config. + preprocess_config, # aug will be off when train_mode=False + n_iter_processes=config.num_workers, + subsampling_factor=1, + load_aux_output=load_transcript, + num_encs=1, + dist_sampler=False) + logger.info("Setup train/valid Dataloader!") + else: + # test dataset, return raw text + decode_batch_size = config.get('decode', dict()).get( + 'decode_batch_size', 1) + self.test_loader = BatchDataLoader( + json_file=config.test_manifest, + train_mode=False, + sortagrad=False, + batch_size=decode_batch_size, + maxlen_in=float('inf'), + maxlen_out=float('inf'), + minibatches=0, + mini_batch_size=1, + batch_count='auto', + batch_bins=0, + batch_frames_in=0, + batch_frames_out=0, + batch_frames_inout=0, + preprocess_conf=config. + preprocess_config, # aug will be off when train_mode=False + n_iter_processes=config.num_workers, + subsampling_factor=1, + num_encs=1, + dist_sampler=False) + + logger.info("Setup test Dataloader!") + + def setup_model(self): + config = self.config + model_conf = config + with UpdateConfig(model_conf): + if self.train: + model_conf.input_dim = self.train_loader.feat_dim + model_conf.output_dim = self.train_loader.vocab_size + else: + model_conf.input_dim = self.test_loader.feat_dim + model_conf.output_dim = self.test_loader.vocab_size + + model = U2STModel.from_config(model_conf) + + if self.parallel: + model = paddle.DataParallel(model) + + logger.info(f"{model}") + layer_tools.print_params(model, logger.info) + + train_config = config + optim_type = train_config.optim + optim_conf = train_config.optim_conf + scheduler_type = train_config.scheduler + scheduler_conf = train_config.scheduler_conf + + scheduler_args = { + "learning_rate": optim_conf.lr, + "verbose": False, + "warmup_steps": scheduler_conf.warmup_steps, + "gamma": scheduler_conf.lr_decay, + "d_model": model_conf.encoder_conf.output_size, + } + lr_scheduler = LRSchedulerFactory.from_args(scheduler_type, + scheduler_args) + + def optimizer_args( + config, + parameters, + lr_scheduler=None, ): + train_config = config + optim_type = train_config.optim + optim_conf = train_config.optim_conf + scheduler_type = train_config.scheduler + scheduler_conf = train_config.scheduler_conf + return { + "grad_clip": train_config.global_grad_clip, + "weight_decay": optim_conf.weight_decay, + "learning_rate": lr_scheduler + if lr_scheduler else optim_conf.lr, + "parameters": parameters, + "epsilon": 1e-9 if optim_type == 'noam' else None, + "beta1": 0.9 if optim_type == 'noam' else None, + "beat2": 0.98 if optim_type == 'noam' else None, + } + + optimzer_args = optimizer_args(config, model.parameters(), lr_scheduler) + optimizer = OptimizerFactory.from_args(optim_type, optimzer_args) + + self.model = model + self.optimizer = optimizer + self.lr_scheduler = lr_scheduler + logger.info("Setup model/optimizer/lr_scheduler!") + + +class U2STTester(U2STTrainer): + def __init__(self, config, args): + super().__init__(config, args) + self.text_feature = TextFeaturizer( + 
unit_type=self.config.unit_type, + vocab=self.config.vocab_filepath, + spm_model_prefix=self.config.spm_model_prefix) + self.vocab_list = self.text_feature.vocab_list + + def id2token(self, texts, texts_len, text_feature): + """ ord() id to chr() chr """ + trans = [] + for text, n in zip(texts, texts_len): + n = n.numpy().item() + ids = text[:n] + trans.append(text_feature.defeaturize(ids.numpy().tolist())) + return trans + + def translate(self, audio, audio_len): + """"E2E translation from extracted audio feature""" + decode_cfg = self.config.decode + self.model.eval() + + hyps = self.model.decode( + audio, + audio_len, + text_feature=self.text_feature, + decoding_method=decode_cfg.decoding_method, + beam_size=decode_cfg.beam_size, + word_reward=decode_cfg.word_reward, + maxlenratio=decode_cfg.maxlenratio, + decoding_chunk_size=decode_cfg.decoding_chunk_size, + num_decoding_left_chunks=decode_cfg.num_decoding_left_chunks, + simulate_streaming=decode_cfg.simulate_streaming) + return hyps + + def compute_translation_metrics(self, + utts, + audio, + audio_len, + texts, + texts_len, + bleu_func, + fout=None): + decode_cfg = self.config.decode + len_refs, num_ins = 0, 0 + + start_time = time.time() + + refs = self.id2token(texts, texts_len, self.text_feature) + + hyps = self.model.decode( + audio, + audio_len, + text_feature=self.text_feature, + decoding_method=decode_cfg.decoding_method, + beam_size=decode_cfg.beam_size, + word_reward=decode_cfg.word_reward, + maxlenratio=decode_cfg.maxlenratio, + decoding_chunk_size=decode_cfg.decoding_chunk_size, + num_decoding_left_chunks=decode_cfg.num_decoding_left_chunks, + simulate_streaming=decode_cfg.simulate_streaming) + + decode_time = time.time() - start_time + + for utt, target, result in zip(utts, refs, hyps): + len_refs += len(target.split()) + num_ins += 1 + if fout: + fout.write({"utt": utt, "ref": target, "hyp": result}) + logger.info(f"Utt: {utt}") + logger.info(f"Ref: {target}") + logger.info(f"Hyp: {result}") + logger.info("One example BLEU = %s" % + (bleu_func([result], [[target]]).prec_str)) + + return dict( + hyps=hyps, + refs=refs, + bleu=bleu_func(hyps, [refs]).score, + len_refs=len_refs, + num_ins=num_ins, # num examples + num_frames=audio_len.sum().numpy().item(), + decode_time=decode_time) + + @mp_tools.rank_zero_only + @paddle.no_grad() + def test(self): + assert self.args.result_file + self.model.eval() + logger.info(f"Test Total Examples: {len(self.test_loader.dataset)}") + + decode_cfg = self.config.decode + bleu_func = bleu_score.char_bleu if decode_cfg.error_rate_type == 'char-bleu' else bleu_score.bleu + + stride_ms = self.config.stride_ms + hyps, refs = [], [] + len_refs, num_ins = 0, 0 + num_frames = 0.0 + num_time = 0.0 + with jsonlines.open(self.args.result_file, 'w') as fout: + for i, batch in enumerate(self.test_loader): + metrics = self.compute_translation_metrics( + *batch, bleu_func=bleu_func, fout=fout) + hyps += metrics['hyps'] + refs += metrics['refs'] + bleu = metrics['bleu'] + num_frames += metrics['num_frames'] + num_time += metrics["decode_time"] + len_refs += metrics['len_refs'] + num_ins += metrics['num_ins'] + rtf = num_time / (num_frames * stride_ms) + logger.info("RTF: %f, instance (%d), batch BELU = %f" % + (rtf, num_ins, bleu)) + + rtf = num_time / (num_frames * stride_ms) + msg = "Test: " + msg += "epoch: {}, ".format(self.epoch) + msg += "step: {}, ".format(self.iteration) + msg += "RTF: {}, ".format(rtf) + msg += "Test set [%s]: %s" % (len(hyps), str(bleu_func(hyps, [refs]))) + logger.info(msg) + 
bleu_meta_path = os.path.splitext(self.args.result_file)[0] + '.bleu' + err_type_str = "BLEU" + with open(bleu_meta_path, 'w') as f: + data = json.dumps({ + "epoch": + self.epoch, + "step": + self.iteration, + "rtf": + rtf, + err_type_str: + bleu_func(hyps, [refs]).score, + "dataset_hour": (num_frames * stride_ms) / 1000.0 / 3600.0, + "process_hour": + num_time / 1000.0 / 3600.0, + "num_examples": + num_ins, + "decode_method": + self.config.decode.decoding_method, + }) + f.write(data + '\n') + + def load_inferspec(self): + """infer model and input spec. + + Returns: + nn.Layer: inference model + List[paddle.static.InputSpec]: input spec. + """ + from paddlespeech.s2t.models.u2_st import U2STInferModel + infer_model = U2STInferModel.from_pretrained(self.test_loader, + self.config.clone(), + self.args.checkpoint_path) + feat_dim = self.test_loader.feat_dim + input_spec = [ + paddle.static.InputSpec(shape=[1, None, feat_dim], + dtype='float32'), # audio, [B,T,D] + paddle.static.InputSpec(shape=[1], + dtype='int64'), # audio_length, [B] + ] + return infer_model, input_spec + + @paddle.no_grad() + def export(self): + infer_model, input_spec = self.load_inferspec() + assert isinstance(input_spec, list), type(input_spec) + infer_model.eval() + static_model = paddle.jit.to_static(infer_model, input_spec=input_spec) + logger.info(f"Export code: {static_model.forward.code}") + paddle.jit.save(static_model, self.args.export_path) diff --git a/ernie-sat/paddlespeech/s2t/frontend/__init__.py b/ernie-sat/paddlespeech/s2t/frontend/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..185a92b8d94d3426d616c0624f0f2ee04339349e --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/frontend/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/ernie-sat/paddlespeech/s2t/frontend/audio.py b/ernie-sat/paddlespeech/s2t/frontend/audio.py new file mode 100644 index 0000000000000000000000000000000000000000..7f71e5dd947621621d8c02f72984e8269aa1940f --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/frontend/audio.py @@ -0,0 +1,730 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
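`U2STTester.export` above follows the standard Paddle static-export recipe: describe the inputs with `paddle.static.InputSpec` (audio features `[B, T, D]` and their lengths `[B]`), convert the model with `paddle.jit.to_static`, and persist it with `paddle.jit.save`. The following self-contained sketch shows the same recipe on a toy layer; the model, feature dimension, and output prefix are illustrative and not the real `U2STInferModel`.

```python
import paddle
from paddle import nn


class ToyAcousticModel(nn.Layer):
    """Stand-in for U2STInferModel: projects features and pools over time."""

    def __init__(self, feat_dim=80, hidden=32):
        super().__init__()
        self.proj = nn.Linear(feat_dim, hidden)

    def forward(self, audio, audio_len):
        # audio: [B, T, D]; audio_len: [B] (unused in this toy example)
        return self.proj(audio).mean(axis=1)


feat_dim = 80
input_spec = [
    paddle.static.InputSpec(shape=[1, None, feat_dim], dtype='float32'),  # audio, [B, T, D]
    paddle.static.InputSpec(shape=[1], dtype='int64'),                    # audio_length, [B]
]

model = ToyAcousticModel(feat_dim)
model.eval()
static_model = paddle.jit.to_static(model, input_spec=input_spec)
paddle.jit.save(static_model, "toy_u2st_export")
```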
+"""Contains the audio segment class.""" +import copy +import io +import random +import re +import struct + +import numpy as np +import resampy +import soundfile +from scipy import signal + +from .utility import convert_samples_from_float32 +from .utility import convert_samples_to_float32 +from .utility import subfile_from_tar + + +class AudioSegment(): + """Monaural audio segment abstraction. + + :param samples: Audio samples [num_samples x num_channels]. + :type samples: ndarray.float32 + :param sample_rate: Audio sample rate. + :type sample_rate: int + :raises TypeError: If the sample data type is not float or int. + """ + + def __init__(self, samples, sample_rate): + """Create audio segment from samples. + + Samples are convert float32 internally, with int scaled to [-1, 1]. + """ + self._samples = self._convert_samples_to_float32(samples) + self._sample_rate = sample_rate + if self._samples.ndim >= 2: + self._samples = np.mean(self._samples, 1) + + def __eq__(self, other): + """Return whether two objects are equal.""" + if type(other) is not type(self): + return False + if self._sample_rate != other._sample_rate: + return False + if self._samples.shape != other._samples.shape: + return False + if np.any(self.samples != other._samples): + return False + return True + + def __ne__(self, other): + """Return whether two objects are unequal.""" + return not self.__eq__(other) + + def __str__(self): + """Return human-readable representation of segment.""" + return ("%s: num_samples=%d, sample_rate=%d, duration=%.2fsec, " + "rms=%.2fdB" % (type(self), self.num_samples, self.sample_rate, + self.duration, self.rms_db)) + + @classmethod + def from_file(cls, file, infos=None): + """Create audio segment from audio file. + + Args: + filepath (str|file): Filepath or file object to audio file. + infos (TarLocalData, optional): tar2obj and tar2infos. Defaults to None. + + Returns: + AudioSegment: Audio segment instance. + """ + if isinstance(file, str) and re.findall(r".seqbin_\d+$", file): + return cls.from_sequence_file(file) + elif isinstance(file, str) and file.startswith('tar:'): + return cls.from_file(subfile_from_tar(file, infos)) + else: + samples, sample_rate = soundfile.read(file, dtype='float32') + return cls(samples, sample_rate) + + @classmethod + def slice_from_file(cls, file, start=None, end=None): + """Loads a small section of an audio without having to load + the entire file into the memory which can be incredibly wasteful. + + :param file: Input audio filepath or file object. + :type file: str|file + :param start: Start time in seconds. If start is negative, it wraps + around from the end. If not provided, this function + reads from the very beginning. + :type start: float + :param end: End time in seconds. If end is negative, it wraps around + from the end. If not provided, the default behvaior is + to read to the end of the file. + :type end: float + :return: AudioSegment instance of the specified slice of the input + audio file. + :rtype: AudioSegment + :raise ValueError: If start or end is incorrectly set, e.g. out of + bounds in time. + """ + sndfile = soundfile.SoundFile(file) + sample_rate = sndfile.samplerate + duration = float(len(sndfile)) / sample_rate + start = 0. if start is None else start + end = duration if end is None else end + if start < 0.0: + start += duration + if end < 0.0: + end += duration + if start < 0.0: + raise ValueError("The slice start position (%f s) is out of " + "bounds." 
% start) + if end < 0.0: + raise ValueError("The slice end position (%f s) is out of bounds." % + end) + if start > end: + raise ValueError("The slice start position (%f s) is later than " + "the slice end position (%f s)." % (start, end)) + if end > duration: + raise ValueError("The slice end position (%f s) is out of bounds " + "(> %f s)" % (end, duration)) + start_frame = int(start * sample_rate) + end_frame = int(end * sample_rate) + sndfile.seek(start_frame) + data = sndfile.read(frames=end_frame - start_frame, dtype='float32') + return cls(data, sample_rate) + + @classmethod + def from_sequence_file(cls, filepath): + """Create audio segment from sequence file. Sequence file is a binary + file containing a collection of multiple audio files, with several + header bytes in the head indicating the offsets of each audio byte data + chunk. + + The format is: + + 4 bytes (int, version), + 4 bytes (int, num of utterance), + 4 bytes (int, bytes per header), + [bytes_per_header*(num_utterance+1)] bytes (offsets for each audio), + audio_bytes_data_of_1st_utterance, + audio_bytes_data_of_2nd_utterance, + ...... + + Sequence file name must end with ".seqbin". And the filename of the 5th + utterance's audio file in sequence file "xxx.seqbin" must be + "xxx.seqbin_5", with "5" indicating the utterance index within this + sequence file (starting from 1). + + :param filepath: Filepath of sequence file. + :type filepath: str + :return: Audio segment instance. + :rtype: AudioSegment + """ + # parse filepath + matches = re.match(r"(.+\.seqbin)_(\d+)", filepath) + if matches is None: + raise IOError("File type of %s is not supported" % filepath) + filename = matches.group(1) + fileno = int(matches.group(2)) + + # read headers + f = io.open(filename, mode='rb', encoding='utf8') + version = f.read(4) + num_utterances = struct.unpack("i", f.read(4))[0] + bytes_per_header = struct.unpack("i", f.read(4))[0] + header_bytes = f.read(bytes_per_header * (num_utterances + 1)) + header = [ + struct.unpack("i", header_bytes[bytes_per_header * i: + bytes_per_header * (i + 1)])[0] + for i in range(num_utterances + 1) + ] + + # read audio bytes + f.seek(header[fileno - 1]) + audio_bytes = f.read(header[fileno] - header[fileno - 1]) + f.close() + + # create audio segment + try: + return cls.from_bytes(audio_bytes) + except Exception as e: + samples = np.frombuffer(audio_bytes, dtype='int16') + return cls(samples=samples, sample_rate=8000) + + @classmethod + def from_bytes(cls, bytes): + """Create audio segment from a byte string containing audio samples. + + :param bytes: Byte string containing audio samples. + :type bytes: str + :return: Audio segment instance. + :rtype: AudioSegment + """ + samples, sample_rate = soundfile.read( + io.BytesIO(bytes), dtype='float32') + return cls(samples, sample_rate) + + @classmethod + def from_pcm(cls, samples, sample_rate): + """Create audio segment from a byte string containing audio samples. + :param samples: Audio samples [num_samples x num_channels]. + :type samples: numpy.ndarray + :param sample_rate: Audio sample rate. + :type sample_rate: int + :return: Audio segment instance. + :rtype: AudioSegment + """ + return cls(samples, sample_rate) + + @classmethod + def concatenate(cls, *segments): + """Concatenate an arbitrary number of audio segments together. + + :param *segments: Input audio segments to be concatenated. + :type *segments: tuple of AudioSegment + :return: Audio segment instance as concatenating results. 
+ :rtype: AudioSegment + :raises ValueError: If the number of segments is zero, or if the + sample_rate of any segments does not match. + :raises TypeError: If any segment is not AudioSegment instance. + """ + # Perform basic sanity-checks. + if len(segments) == 0: + raise ValueError("No audio segments are given to concatenate.") + sample_rate = segments[0]._sample_rate + for seg in segments: + if sample_rate != seg._sample_rate: + raise ValueError("Can't concatenate segments with " + "different sample rates") + if type(seg) is not cls: + raise TypeError("Only audio segments of the same type " + "can be concatenated.") + samples = np.concatenate([seg.samples for seg in segments]) + return cls(samples, sample_rate) + + @classmethod + def make_silence(cls, duration, sample_rate): + """Creates a silent audio segment of the given duration and sample rate. + + :param duration: Length of silence in seconds. + :type duration: float + :param sample_rate: Sample rate. + :type sample_rate: float + :return: Silent AudioSegment instance of the given duration. + :rtype: AudioSegment + """ + samples = np.zeros(int(duration * sample_rate)) + return cls(samples, sample_rate) + + def to_wav_file(self, filepath, dtype='float32'): + """Save audio segment to disk as wav file. + + :param filepath: WAV filepath or file object to save the + audio segment. + :type filepath: str|file + :param dtype: Subtype for audio file. Options: 'int16', 'int32', + 'float32', 'float64'. Default is 'float32'. + :type dtype: str + :raises TypeError: If dtype is not supported. + """ + samples = self._convert_samples_from_float32(self._samples, dtype) + subtype_map = { + 'int16': 'PCM_16', + 'int32': 'PCM_32', + 'float32': 'FLOAT', + 'float64': 'DOUBLE' + } + soundfile.write( + filepath, + samples, + self._sample_rate, + format='WAV', + subtype=subtype_map[dtype]) + + def superimpose(self, other): + """Add samples from another segment to those of this segment + (sample-wise addition, not segment concatenation). + + Note that this is an in-place transformation. + + :param other: Segment containing samples to be added in. + :type other: AudioSegments + :raise TypeError: If type of two segments don't match. + :raise ValueError: If the sample rates of the two segments are not + equal, or if the lengths of segments don't match. + """ + if isinstance(other, type(self)): + raise TypeError("Cannot add segments of different types: %s " + "and %s." % (type(self), type(other))) + if self._sample_rate != other._sample_rate: + raise ValueError("Sample rates must match to add segments.") + if len(self._samples) != len(other._samples): + raise ValueError("Segment lengths must match to add segments.") + self._samples += other._samples + + def to_bytes(self, dtype='float32'): + """Create a byte string containing the audio content. + + :param dtype: Data type for export samples. Options: 'int16', 'int32', + 'float32', 'float64'. Default is 'float32'. + :type dtype: str + :return: Byte string containing audio content. + :rtype: str + """ + samples = self._convert_samples_from_float32(self._samples, dtype) + return samples.tostring() + + def to(self, dtype='int16'): + """Create a `dtype` audio content. + + :param dtype: Data type for export samples. Options: 'int16', 'int32', + 'float32', 'float64'. Default is 'float32'. + :type dtype: str + :return: np.ndarray containing `dtype` audio content. 
+ :rtype: str + """ + samples = self._convert_samples_from_float32(self._samples, dtype) + return samples + + def gain_db(self, gain): + """Apply gain in decibels to samples. + + Note that this is an in-place transformation. + + :param gain: Gain in decibels to apply to samples. + :type gain: float|1darray + """ + self._samples *= 10.**(gain / 20.) + + def change_speed(self, speed_rate): + """Change the audio speed by linear interpolation. + + Note that this is an in-place transformation. + + :param speed_rate: Rate of speed change: + speed_rate > 1.0, speed up the audio; + speed_rate = 1.0, unchanged; + speed_rate < 1.0, slow down the audio; + speed_rate <= 0.0, not allowed, raise ValueError. + :type speed_rate: float + :raises ValueError: If speed_rate <= 0.0. + """ + if speed_rate == 1.0: + return + if speed_rate <= 0: + raise ValueError("speed_rate should be greater than zero.") + + # numpy + # old_length = self._samples.shape[0] + # new_length = int(old_length / speed_rate) + # old_indices = np.arange(old_length) + # new_indices = np.linspace(start=0, stop=old_length, num=new_length) + # self._samples = np.interp(new_indices, old_indices, self._samples) + + # sox, slow + try: + import soxbindings as sox + except ImportError: + try: + from paddlespeech.s2t.utils import dynamic_pip_install + package = "sox" + dynamic_pip_install.install(package) + package = "soxbindings" + dynamic_pip_install.install(package) + import soxbindings as sox + except Exception: + raise RuntimeError( + "Can not install soxbindings on your system.") + + tfm = sox.Transformer() + tfm.set_globals(multithread=False) + tfm.speed(speed_rate) + self._samples = tfm.build_array( + input_array=self._samples, + sample_rate_in=self._sample_rate).squeeze(-1).astype( + np.float32).copy() + + def normalize(self, target_db=-20, max_gain_db=300.0): + """Normalize audio to be of the desired RMS value in decibels. + + Note that this is an in-place transformation. + + :param target_db: Target RMS value in decibels. This value should be + less than 0.0 as 0.0 is full-scale audio. + :type target_db: float + :param max_gain_db: Max amount of gain in dB that can be applied for + normalization. This is to prevent nans when + attempting to normalize a signal consisting of + all zeros. + :type max_gain_db: float + :raises ValueError: If the required gain to normalize the segment to + the target_db value exceeds max_gain_db. + """ + gain = target_db - self.rms_db + if gain > max_gain_db: + raise ValueError( + "Unable to normalize segment to %f dB because the " + "the probable gain have exceeds max_gain_db (%f dB)" % + (target_db, max_gain_db)) + self.gain_db(min(max_gain_db, target_db - self.rms_db)) + + def normalize_online_bayesian(self, + target_db, + prior_db, + prior_samples, + startup_delay=0.0): + """Normalize audio using a production-compatible online/causal + algorithm. This uses an exponential likelihood and gamma prior to + make online estimates of the RMS even when there are very few samples. + + Note that this is an in-place transformation. + + :param target_db: Target RMS value in decibels. + :type target_bd: float + :param prior_db: Prior RMS estimate in decibels. + :type prior_db: float + :param prior_samples: Prior strength in number of samples. + :type prior_samples: float + :param startup_delay: Default 0.0s. If provided, this function will + accrue statistics for the first startup_delay + seconds before applying online normalization. + :type startup_delay: float + """ + # Estimate total RMS online. 
+ startup_sample_idx = min(self.num_samples - 1, + int(self.sample_rate * startup_delay)) + prior_mean_squared = 10.**(prior_db / 10.) + prior_sum_of_squares = prior_mean_squared * prior_samples + cumsum_of_squares = np.cumsum(self.samples**2) + sample_count = np.arange(self.num_samples) + 1 + if startup_sample_idx > 0: + cumsum_of_squares[:startup_sample_idx] = \ + cumsum_of_squares[startup_sample_idx] + sample_count[:startup_sample_idx] = \ + sample_count[startup_sample_idx] + mean_squared_estimate = ((cumsum_of_squares + prior_sum_of_squares) / + (sample_count + prior_samples)) + rms_estimate_db = 10 * np.log10(mean_squared_estimate) + # Compute required time-varying gain. + gain_db = target_db - rms_estimate_db + self.gain_db(gain_db) + + def resample(self, target_sample_rate, filter='kaiser_best'): + """Resample the audio to a target sample rate. + + Note that this is an in-place transformation. + + :param target_sample_rate: Target sample rate. + :type target_sample_rate: int + :param filter: The resampling filter to use one of {'kaiser_best', + 'kaiser_fast'}. + :type filter: str + """ + self._samples = resampy.resample( + self.samples, self.sample_rate, target_sample_rate, filter=filter) + self._sample_rate = target_sample_rate + + def pad_silence(self, duration, sides='both'): + """Pad this audio sample with a period of silence. + + Note that this is an in-place transformation. + + :param duration: Length of silence in seconds to pad. + :type duration: float + :param sides: Position for padding: + 'beginning' - adds silence in the beginning; + 'end' - adds silence in the end; + 'both' - adds silence in both the beginning and the end. + :type sides: str + :raises ValueError: If sides is not supported. + """ + if duration == 0.0: + return self + cls = type(self) + silence = self.make_silence(duration, self._sample_rate) + if sides == "beginning": + padded = cls.concatenate(silence, self) + elif sides == "end": + padded = cls.concatenate(self, silence) + elif sides == "both": + padded = cls.concatenate(silence, self, silence) + else: + raise ValueError("Unknown value for the sides %s" % sides) + self._samples = padded._samples + + def shift(self, shift_ms): + """Shift the audio in time. If `shift_ms` is positive, shift with time + advance; if negative, shift with time delay. Silence are padded to + keep the duration unchanged. + + Note that this is an in-place transformation. + + :param shift_ms: Shift time in millseconds. If positive, shift with + time advance; if negative; shift with time delay. + :type shift_ms: float + :raises ValueError: If shift_ms is longer than audio duration. + """ + if abs(shift_ms) / 1000.0 > self.duration: + raise ValueError("Absolute value of shift_ms should be smaller " + "than audio duration.") + shift_samples = int(shift_ms * self._sample_rate / 1000) + if shift_samples > 0: + # time advance + self._samples[:-shift_samples] = self._samples[shift_samples:] + self._samples[-shift_samples:] = 0 + elif shift_samples < 0: + # time delay + self._samples[-shift_samples:] = self._samples[:shift_samples] + self._samples[:-shift_samples] = 0 + + def subsegment(self, start_sec=None, end_sec=None): + """Cut the AudioSegment between given boundaries. + + Note that this is an in-place transformation. + + :param start_sec: Beginning of subsegment in seconds. + :type start_sec: float + :param end_sec: End of subsegment in seconds. + :type end_sec: float + :raise ValueError: If start_sec or end_sec is incorrectly set, e.g. out + of bounds in time. 
+ """ + start_sec = 0.0 if start_sec is None else start_sec + end_sec = self.duration if end_sec is None else end_sec + if start_sec < 0.0: + start_sec = self.duration + start_sec + if end_sec < 0.0: + end_sec = self.duration + end_sec + if start_sec < 0.0: + raise ValueError("The slice start position (%f s) is out of " + "bounds." % start_sec) + if end_sec < 0.0: + raise ValueError("The slice end position (%f s) is out of bounds." % + end_sec) + if start_sec > end_sec: + raise ValueError("The slice start position (%f s) is later than " + "the end position (%f s)." % (start_sec, end_sec)) + if end_sec > self.duration: + raise ValueError("The slice end position (%f s) is out of bounds " + "(> %f s)" % (end_sec, self.duration)) + start_sample = int(round(start_sec * self._sample_rate)) + end_sample = int(round(end_sec * self._sample_rate)) + self._samples = self._samples[start_sample:end_sample] + + def random_subsegment(self, subsegment_length, rng=None): + """Cut the specified length of the audiosegment randomly. + + Note that this is an in-place transformation. + + :param subsegment_length: Subsegment length in seconds. + :type subsegment_length: float + :param rng: Random number generator state. + :type rng: random.Random + :raises ValueError: If the length of subsegment is greater than + the origineal segemnt. + """ + rng = random.Random() if rng is None else rng + if subsegment_length > self.duration: + raise ValueError("Length of subsegment must not be greater " + "than original segment.") + start_time = rng.uniform(0.0, self.duration - subsegment_length) + self.subsegment(start_time, start_time + subsegment_length) + + def convolve(self, impulse_segment, allow_resample=False): + """Convolve this audio segment with the given impulse segment. + + Note that this is an in-place transformation. + + :param impulse_segment: Impulse response segments. + :type impulse_segment: AudioSegment + :param allow_resample: Indicates whether resampling is allowed when + the impulse_segment has a different sample + rate from this signal. + :type allow_resample: bool + :raises ValueError: If the sample rate is not match between two + audio segments when resample is not allowed. + """ + if allow_resample and self.sample_rate != impulse_segment.sample_rate: + impulse_segment.resample(self.sample_rate) + if self.sample_rate != impulse_segment.sample_rate: + raise ValueError("Impulse segment's sample rate (%d Hz) is not " + "equal to base signal sample rate (%d Hz)." % + (impulse_segment.sample_rate, self.sample_rate)) + samples = signal.fftconvolve(self.samples, impulse_segment.samples, + "full") + self._samples = samples + + def convolve_and_normalize(self, impulse_segment, allow_resample=False): + """Convolve and normalize the resulting audio segment so that it + has the same average power as the input signal. + + Note that this is an in-place transformation. + + :param impulse_segment: Impulse response segments. + :type impulse_segment: AudioSegment + :param allow_resample: Indicates whether resampling is allowed when + the impulse_segment has a different sample + rate from this signal. + :type allow_resample: bool + """ + target_db = self.rms_db + self.convolve(impulse_segment, allow_resample=allow_resample) + self.normalize(target_db) + + def add_noise(self, + noise, + snr_dB, + allow_downsampling=False, + max_gain_db=300.0, + rng=None): + """Add the given noise segment at a specific signal-to-noise ratio. 
+ If the noise segment is longer than this segment, a random subsegment + of matching length is sampled from it and used instead. + + Note that this is an in-place transformation. + + :param noise: Noise signal to add. + :type noise: AudioSegment + :param snr_dB: Signal-to-Noise Ratio, in decibels. + :type snr_dB: float + :param allow_downsampling: Whether to allow the noise signal to be + downsampled to match the base signal sample + rate. + :type allow_downsampling: bool + :param max_gain_db: Maximum amount of gain to apply to noise signal + before adding it in. This is to prevent attempting + to apply infinite gain to a zero signal. + :type max_gain_db: float + :param rng: Random number generator state. + :type rng: None|random.Random + :raises ValueError: If the sample rate does not match between the two + audio segments when downsampling is not allowed, or + if the duration of noise segments is shorter than + original audio segments. + """ + rng = random.Random() if rng is None else rng + if allow_downsampling and noise.sample_rate > self.sample_rate: + noise = noise.resample(self.sample_rate) + if noise.sample_rate != self.sample_rate: + raise ValueError("Noise sample rate (%d Hz) is not equal to base " + "signal sample rate (%d Hz)." % (noise.sample_rate, + self.sample_rate)) + if noise.duration < self.duration: + raise ValueError("Noise signal (%f sec) must be at least as long as" + " base signal (%f sec)." % + (noise.duration, self.duration)) + noise_gain_db = min(self.rms_db - noise.rms_db - snr_dB, max_gain_db) + noise_new = copy.deepcopy(noise) + noise_new.random_subsegment(self.duration, rng=rng) + noise_new.gain_db(noise_gain_db) + self.superimpose(noise_new) + + @property + def samples(self): + """Return audio samples. + + :return: Audio samples. + :rtype: ndarray + """ + return self._samples.copy() + + @property + def sample_rate(self): + """Return audio sample rate. + + :return: Audio sample rate. + :rtype: int + """ + return self._sample_rate + + @property + def num_samples(self): + """Return number of samples. + + :return: Number of samples. + :rtype: int + """ + return self._samples.shape[0] + + @property + def duration(self): + """Return audio duration. + + :return: Audio duration in seconds. + :rtype: float + """ + return self._samples.shape[0] / float(self._sample_rate) + + @property + def rms_db(self): + """Return root mean square energy of the audio in decibels. + + :return: Root mean square energy in decibels. + :rtype: float + """ + # square root => multiply by 10 instead of 20 for dBs + mean_square = np.mean(self._samples**2) + return 10 * np.log10(mean_square) + + def _convert_samples_to_float32(self, samples): + """Convert sample type to float32. + + Audio sample type is usually integer or float-point. + Integers will be scaled to [-1, 1] in float32. + """ + return convert_samples_to_float32(samples) + + def _convert_samples_from_float32(self, samples, dtype): + """Convert sample type from float32 to dtype. + + Audio sample type is usually integer or float-point. For integer + type, float32 will be rescaled from [-1, 1] to the maximum range + supported by the integer type. + + This is for writing a audio file. 
+ """ + return convert_samples_from_float32(samples, dtype) diff --git a/ernie-sat/paddlespeech/s2t/frontend/augmentor/__init__.py b/ernie-sat/paddlespeech/s2t/frontend/augmentor/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..185a92b8d94d3426d616c0624f0f2ee04339349e --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/frontend/augmentor/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/ernie-sat/paddlespeech/s2t/frontend/augmentor/augmentation.py b/ernie-sat/paddlespeech/s2t/frontend/augmentor/augmentation.py new file mode 100644 index 0000000000000000000000000000000000000000..4c5ca4fe630bcccbe4f41b869b9039f219857e89 --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/frontend/augmentor/augmentation.py @@ -0,0 +1,230 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Contains the data augmentation pipeline.""" +import json +import os +from collections.abc import Sequence +from inspect import signature +from pprint import pformat + +import numpy as np + +from paddlespeech.s2t.frontend.augmentor.base import AugmentorBase +from paddlespeech.s2t.utils.dynamic_import import dynamic_import +from paddlespeech.s2t.utils.log import Log + +logger = Log(__name__).getlog() + +__all__ = ["AugmentationPipeline"] + +import_alias = dict( + volume="paddlespeech.s2t.frontend.augmentor.impulse_response:VolumePerturbAugmentor", + shift="paddlespeech.s2t.frontend.augmentor.shift_perturb:ShiftPerturbAugmentor", + speed="paddlespeech.s2t.frontend.augmentor.speed_perturb:SpeedPerturbAugmentor", + resample="paddlespeech.s2t.frontend.augmentor.resample:ResampleAugmentor", + bayesian_normal="paddlespeech.s2t.frontend.augmentor.online_bayesian_normalization:OnlineBayesianNormalizationAugmentor", + noise="paddlespeech.s2t.frontend.augmentor.noise_perturb:NoisePerturbAugmentor", + impulse="paddlespeech.s2t.frontend.augmentor.impulse_response:ImpulseResponseAugmentor", + specaug="paddlespeech.s2t.frontend.augmentor.spec_augment:SpecAugmentor", ) + + +class AugmentationPipeline(): + """Build a pre-processing pipeline with various augmentation models.Such a + data augmentation pipeline is oftern leveraged to augment the training + samples to make the model invariant to certain types of perturbations in the + real world, improving model's generalization ability. 
+ + The pipeline is built according the the augmentation configuration in json + string, e.g. + + .. code-block:: + + [ { + "type": "noise", + "params": {"min_snr_dB": 10, + "max_snr_dB": 20, + "noise_manifest_path": "datasets/manifest.noise"}, + "prob": 0.0 + }, + { + "type": "speed", + "params": {"min_speed_rate": 0.9, + "max_speed_rate": 1.1}, + "prob": 1.0 + }, + { + "type": "shift", + "params": {"min_shift_ms": -5, + "max_shift_ms": 5}, + "prob": 1.0 + }, + { + "type": "volume", + "params": {"min_gain_dBFS": -10, + "max_gain_dBFS": 10}, + "prob": 0.0 + }, + { + "type": "bayesian_normal", + "params": {"target_db": -20, + "prior_db": -20, + "prior_samples": 100}, + "prob": 0.0 + } + ] + + This augmentation configuration inserts two augmentation models + into the pipeline, with one is VolumePerturbAugmentor and the other + SpeedPerturbAugmentor. "prob" indicates the probability of the current + augmentor to take effect. If "prob" is zero, the augmentor does not take + effect. + + Params: + preprocess_conf(str): Augmentation configuration in `json file` or `json string`. + random_seed(int): Random seed. + + Raises: + ValueError: If the augmentation json config is in incorrect format". + """ + + SPEC_TYPES = {'specaug'} + + def __init__(self, preprocess_conf: str, random_seed: int=0): + self._rng = np.random.RandomState(random_seed) + self.conf = {'mode': 'sequential', 'process': []} + if preprocess_conf: + if os.path.isfile(preprocess_conf): + # json file + with open(preprocess_conf, 'r') as fin: + json_string = fin.read() + else: + # json string + json_string = preprocess_conf + process = json.loads(json_string) + self.conf['process'] += process + + self._augmentors, self._rates = self._parse_pipeline_from('all') + self._audio_augmentors, self._audio_rates = self._parse_pipeline_from( + 'audio') + self._spec_augmentors, self._spec_rates = self._parse_pipeline_from( + 'feature') + logger.info( + f"Augmentation: {pformat(list(zip(self._augmentors, self._rates)))}") + + def __call__(self, xs, uttid_list=None, **kwargs): + if not isinstance(xs, Sequence): + is_batch = False + xs = [xs] + else: + is_batch = True + + if isinstance(uttid_list, str): + uttid_list = [uttid_list for _ in range(len(xs))] + + if self.conf.get("mode", "sequential") == "sequential": + for idx, (func, rate) in enumerate( + zip(self._augmentors, self._rates), 0): + if self._rng.uniform(0., 1.) >= rate: + continue + + # Derive only the args which the func has + try: + param = signature(func).parameters + except ValueError: + # Some function, e.g. built-in function, are failed + param = {} + _kwargs = {k: v for k, v in kwargs.items() if k in param} + + try: + if uttid_list is not None and "uttid" in param: + xs = [ + func(x, u, **_kwargs) + for x, u in zip(xs, uttid_list) + ] + else: + xs = [func(x, **_kwargs) for x in xs] + except Exception: + logger.fatal("Catch a exception from {}th func: {}".format( + idx, func)) + raise + else: + raise NotImplementedError( + "Not supporting mode={}".format(self.conf["mode"])) + + if is_batch: + return xs + else: + return xs[0] + + def transform_audio(self, audio_segment): + """Run the pre-processing pipeline for data augmentation. + + Note that this is an in-place transformation. + + :param audio_segment: Audio segment to process. + :type audio_segment: AudioSegmenet|SpeechSegment + """ + for augmentor, rate in zip(self._audio_augmentors, self._audio_rates): + if self._rng.uniform(0., 1.) 
< rate: + augmentor.transform_audio(audio_segment) + + def transform_feature(self, spec_segment): + """spectrogram augmentation. + + Args: + spec_segment (np.ndarray): audio feature, (D, T). + """ + for augmentor, rate in zip(self._spec_augmentors, self._spec_rates): + if self._rng.uniform(0., 1.) < rate: + spec_segment = augmentor.transform_feature(spec_segment) + return spec_segment + + def _parse_pipeline_from(self, aug_type='all'): + """Parse the config json to build a augmentation pipelien.""" + assert aug_type in ('audio', 'feature', 'all'), aug_type + audio_confs = [] + feature_confs = [] + all_confs = [] + for config in self.conf['process']: + all_confs.append(config) + if config["type"] in self.SPEC_TYPES: + feature_confs.append(config) + else: + audio_confs.append(config) + + if aug_type == 'audio': + aug_confs = audio_confs + elif aug_type == 'feature': + aug_confs = feature_confs + elif aug_type == 'all': + aug_confs = all_confs + else: + raise ValueError(f"Not support: {aug_type}") + + augmentors = [ + self._get_augmentor(config["type"], config["params"]) + for config in aug_confs + ] + rates = [config["prob"] for config in aug_confs] + return augmentors, rates + + def _get_augmentor(self, augmentor_type, params): + """Return an augmentation model by the type name, and pass in params.""" + class_obj = dynamic_import(augmentor_type, import_alias) + assert issubclass(class_obj, AugmentorBase) + try: + obj = class_obj(self._rng, **params) + except Exception: + raise ValueError("Unknown augmentor type [%s]." % augmentor_type) + return obj diff --git a/ernie-sat/paddlespeech/s2t/frontend/augmentor/base.py b/ernie-sat/paddlespeech/s2t/frontend/augmentor/base.py new file mode 100644 index 0000000000000000000000000000000000000000..18d003c0b125c76a6016e830227d4ee3f5ddc19e --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/frontend/augmentor/base.py @@ -0,0 +1,59 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Contains the abstract base class for augmentation models.""" +from abc import ABCMeta +from abc import abstractmethod + + +class AugmentorBase(): + """Abstract base class for augmentation model (augmentor) class. + All augmentor classes should inherit from this class, and implement the + following abstract methods. + """ + + __metaclass__ = ABCMeta + + @abstractmethod + def __init__(self): + pass + + @abstractmethod + def __call__(self, xs): + raise NotImplementedError("AugmentorBase: Not impl __call__") + + @abstractmethod + def transform_audio(self, audio_segment): + """Adds various effects to the input audio segment. Such effects + will augment the training data to make the model invariant to certain + types of perturbations in the real world, improving model's + generalization ability. + + Note that this is an in-place transformation. + + :param audio_segment: Audio segment to add effects to. 
+ :type audio_segment: AudioSegmenet|SpeechSegment + """ + raise NotImplementedError("AugmentorBase: Not impl transform_audio") + + @abstractmethod + def transform_feature(self, spec_segment): + """Adds various effects to the input audo feature segment. Such effects + will augment the training data to make the model invariant to certain + types of time_mask or freq_mask in the real world, improving model's + generalization ability. + + Args: + spec_segment (Spectrogram): Spectrogram segment to add effects to. + """ + raise NotImplementedError("AugmentorBase: Not impl transform_feature") diff --git a/ernie-sat/paddlespeech/s2t/frontend/augmentor/impulse_response.py b/ernie-sat/paddlespeech/s2t/frontend/augmentor/impulse_response.py new file mode 100644 index 0000000000000000000000000000000000000000..5ba45bb20c2e3816358b34209fa3cb03142b285b --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/frontend/augmentor/impulse_response.py @@ -0,0 +1,52 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Contains the impulse response augmentation model.""" +import jsonlines + +from paddlespeech.s2t.frontend.audio import AudioSegment +from paddlespeech.s2t.frontend.augmentor.base import AugmentorBase + + +class ImpulseResponseAugmentor(AugmentorBase): + """Augmentation model for adding impulse response effect. + + :param rng: Random generator object. + :type rng: random.Random + :param impulse_manifest_path: Manifest path for impulse audio data. + :type impulse_manifest_path: str + """ + + def __init__(self, rng, impulse_manifest_path): + self._rng = rng + with jsonlines.open(impulse_manifest_path, 'r') as reader: + self._impulse_manifest = list(reader) + + def __call__(self, x, uttid=None, train=True): + if not train: + return x + self.transform_audio(x) + return x + + def transform_audio(self, audio_segment): + """Add impulse response effect. + + Note that this is an in-place transformation. + + :param audio_segment: Audio segment to add effects to. + :type audio_segment: AudioSegmenet|SpeechSegment + """ + impulse_json = self._rng.choice( + self._impulse_manifest, 1, replace=False)[0] + impulse_segment = AudioSegment.from_file(impulse_json['audio_filepath']) + audio_segment.convolve(impulse_segment, allow_resample=True) diff --git a/ernie-sat/paddlespeech/s2t/frontend/augmentor/noise_perturb.py b/ernie-sat/paddlespeech/s2t/frontend/augmentor/noise_perturb.py new file mode 100644 index 0000000000000000000000000000000000000000..71165dac893a526963b0e870ccbf99cf979aac42 --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/frontend/augmentor/noise_perturb.py @@ -0,0 +1,66 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Contains the noise perturb augmentation model.""" +import jsonlines + +from paddlespeech.s2t.frontend.audio import AudioSegment +from paddlespeech.s2t.frontend.augmentor.base import AugmentorBase + + +class NoisePerturbAugmentor(AugmentorBase): + """Augmentation model for adding background noise. + + :param rng: Random generator object. + :type rng: random.Random + :param min_snr_dB: Minimal signal noise ratio, in decibels. + :type min_snr_dB: float + :param max_snr_dB: Maximal signal noise ratio, in decibels. + :type max_snr_dB: float + :param noise_manifest_path: Manifest path for noise audio data. + :type noise_manifest_path: str + """ + + def __init__(self, rng, min_snr_dB, max_snr_dB, noise_manifest_path): + self._min_snr_dB = min_snr_dB + self._max_snr_dB = max_snr_dB + self._rng = rng + with jsonlines.open(noise_manifest_path, 'r') as reader: + self._noise_manifest = list(reader) + + def __call__(self, x, uttid=None, train=True): + if not train: + return x + self.transform_audio(x) + return x + + def transform_audio(self, audio_segment): + """Add background noise audio. + + Note that this is an in-place transformation. + + :param audio_segment: Audio segment to add effects to. + :type audio_segment: AudioSegmenet|SpeechSegment + """ + noise_json = self._rng.choice(self._noise_manifest, 1, replace=False)[0] + if noise_json['duration'] < audio_segment.duration: + raise RuntimeError("The duration of sampled noise audio is smaller " + "than the audio segment to add effects to.") + diff_duration = noise_json['duration'] - audio_segment.duration + start = self._rng.uniform(0, diff_duration) + end = start + audio_segment.duration + noise_segment = AudioSegment.slice_from_file( + noise_json['audio_filepath'], start=start, end=end) + snr_dB = self._rng.uniform(self._min_snr_dB, self._max_snr_dB) + audio_segment.add_noise( + noise_segment, snr_dB, allow_downsampling=True, rng=self._rng) diff --git a/ernie-sat/paddlespeech/s2t/frontend/augmentor/online_bayesian_normalization.py b/ernie-sat/paddlespeech/s2t/frontend/augmentor/online_bayesian_normalization.py new file mode 100644 index 0000000000000000000000000000000000000000..f9d1530f0a3b5d90bc5c57c494d9c4ef1849c7b6 --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/frontend/augmentor/online_bayesian_normalization.py @@ -0,0 +1,63 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
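`NoisePerturbAugmentor` above draws a random SNR in `[min_snr_dB, max_snr_dB]`, slices a noise clip of matching duration, and hands it to `AudioSegment.add_noise`, which gains the noise by `rms_db(signal) - rms_db(noise) - snr_dB` (capped at `max_gain_db`) before superimposing it. A small numeric sketch of that gain rule, using synthetic NumPy signals rather than real manifests:

```python
import numpy as np


def rms_db(samples):
    # Root mean square energy in decibels, as in AudioSegment.rms_db.
    return 10 * np.log10(np.mean(samples ** 2))


rng = np.random.default_rng(0)
sr = 16000
speech = 0.1 * np.sin(2 * np.pi * 220 * np.arange(sr) / sr)  # toy "speech", 1 s
noise = 0.05 * rng.standard_normal(sr)                       # toy noise, 1 s

snr_dB = 10.0
max_gain_db = 300.0
# Gain (in dB) that puts the speech RMS exactly snr_dB above the noise RMS.
noise_gain_db = min(rms_db(speech) - rms_db(noise) - snr_dB, max_gain_db)
scaled_noise = noise * 10.0 ** (noise_gain_db / 20.0)  # dB gain -> amplitude factor
noisy = speech + scaled_noise

print("achieved SNR: %.1f dB" % (rms_db(speech) - rms_db(scaled_noise)))  # ~10.0
```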
+"""Contain the online bayesian normalization augmentation model.""" +from paddlespeech.s2t.frontend.augmentor.base import AugmentorBase + + +class OnlineBayesianNormalizationAugmentor(AugmentorBase): + """Augmentation model for adding online bayesian normalization. + + :param rng: Random generator object. + :type rng: random.Random + :param target_db: Target RMS value in decibels. + :type target_db: float + :param prior_db: Prior RMS estimate in decibels. + :type prior_db: float + :param prior_samples: Prior strength in number of samples. + :type prior_samples: int + :param startup_delay: Default 0.0s. If provided, this function will + accrue statistics for the first startup_delay + seconds before applying online normalization. + :type starup_delay: float. + """ + + def __init__(self, + rng, + target_db, + prior_db, + prior_samples, + startup_delay=0.0): + self._target_db = target_db + self._prior_db = prior_db + self._prior_samples = prior_samples + self._rng = rng + self._startup_delay = startup_delay + + def __call__(self, x, uttid=None, train=True): + if not train: + return x + self.transform_audio(x) + return x + + def transform_audio(self, audio_segment): + """Normalizes the input audio using the online Bayesian approach. + + Note that this is an in-place transformation. + + :param audio_segment: Audio segment to add effects to. + :type audio_segment: AudioSegment|SpeechSegment + """ + audio_segment.normalize_online_bayesian(self._target_db, self._prior_db, + self._prior_samples, + self._startup_delay) diff --git a/ernie-sat/paddlespeech/s2t/frontend/augmentor/resample.py b/ernie-sat/paddlespeech/s2t/frontend/augmentor/resample.py new file mode 100644 index 0000000000000000000000000000000000000000..4e6402ff71da982400a482e8de479878d8eb8a46 --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/frontend/augmentor/resample.py @@ -0,0 +1,48 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Contain the resample augmentation model.""" +from paddlespeech.s2t.frontend.augmentor.base import AugmentorBase + + +class ResampleAugmentor(AugmentorBase): + """Augmentation model for resampling. + + See more info here: + https://ccrma.stanford.edu/~jos/resample/index.html + + :param rng: Random generator object. + :type rng: random.Random + :param new_sample_rate: New sample rate in Hz. + :type new_sample_rate: int + """ + + def __init__(self, rng, new_sample_rate): + self._new_sample_rate = new_sample_rate + self._rng = rng + + def __call__(self, x, uttid=None, train=True): + if not train: + return x + self.transform_audio(x) + return x + + def transform_audio(self, audio_segment): + """Resamples the input audio to a target sample rate. + + Note that this is an in-place transformation. + + :param audio: Audio segment to add effects to. 
+ :type audio: AudioSegment|SpeechSegment + """ + audio_segment.resample(self._new_sample_rate) diff --git a/ernie-sat/paddlespeech/s2t/frontend/augmentor/shift_perturb.py b/ernie-sat/paddlespeech/s2t/frontend/augmentor/shift_perturb.py new file mode 100644 index 0000000000000000000000000000000000000000..ed6f162b99010e8b8e38522ea3a89e57ecd3dced --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/frontend/augmentor/shift_perturb.py @@ -0,0 +1,49 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Contains the volume perturb augmentation model.""" +from paddlespeech.s2t.frontend.augmentor.base import AugmentorBase + + +class ShiftPerturbAugmentor(AugmentorBase): + """Augmentation model for adding random shift perturbation. + + :param rng: Random generator object. + :type rng: random.Random + :param min_shift_ms: Minimal shift in milliseconds. + :type min_shift_ms: float + :param max_shift_ms: Maximal shift in milliseconds. + :type max_shift_ms: float + """ + + def __init__(self, rng, min_shift_ms, max_shift_ms): + self._min_shift_ms = min_shift_ms + self._max_shift_ms = max_shift_ms + self._rng = rng + + def __call__(self, x, uttid=None, train=True): + if not train: + return x + self.transform_audio(x) + return x + + def transform_audio(self, audio_segment): + """Shift audio. + + Note that this is an in-place transformation. + + :param audio_segment: Audio segment to add effects to. + :type audio_segment: AudioSegmenet|SpeechSegment + """ + shift_ms = self._rng.uniform(self._min_shift_ms, self._max_shift_ms) + audio_segment.shift(shift_ms) diff --git a/ernie-sat/paddlespeech/s2t/frontend/augmentor/spec_augment.py b/ernie-sat/paddlespeech/s2t/frontend/augmentor/spec_augment.py new file mode 100644 index 0000000000000000000000000000000000000000..e91cfdce42b621934fa25b69cc629ad03c7fec34 --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/frontend/augmentor/spec_augment.py @@ -0,0 +1,256 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Contains the volume perturb augmentation model.""" +import random + +import numpy as np +from PIL import Image +from PIL.Image import BICUBIC + +from paddlespeech.s2t.frontend.augmentor.base import AugmentorBase +from paddlespeech.s2t.utils.log import Log + +logger = Log(__name__).getlog() + + +class SpecAugmentor(AugmentorBase): + """Augmentation model for Time warping, Frequency masking, Time masking. 
+ + SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition + https://arxiv.org/abs/1904.08779 + + SpecAugment on Large Scale Datasets + https://arxiv.org/abs/1912.05533 + + """ + + def __init__(self, + rng, + F, + T, + n_freq_masks, + n_time_masks, + p=1.0, + W=40, + adaptive_number_ratio=0, + adaptive_size_ratio=0, + max_n_time_masks=20, + replace_with_zero=True, + warp_mode='PIL'): + """SpecAugment class. + Args: + rng (random.Random): random generator object. + F (int): parameter for frequency masking + T (int): parameter for time masking + n_freq_masks (int): number of frequency masks + n_time_masks (int): number of time masks + p (float): parameter for upperbound of the time mask + W (int): parameter for time warping + adaptive_number_ratio (float): adaptive multiplicity ratio for time masking + adaptive_size_ratio (float): adaptive size ratio for time masking + max_n_time_masks (int): maximum number of time masking + replace_with_zero (bool): pad zero on mask if true else use mean + warp_mode (str): "PIL" (default, fast, not differentiable) + or "sparse_image_warp" (slow, differentiable) + """ + super().__init__() + self._rng = rng + self.inplace = True + self.replace_with_zero = replace_with_zero + + self.mode = warp_mode + self.W = W + self.F = F + self.T = T + self.n_freq_masks = n_freq_masks + self.n_time_masks = n_time_masks + self.p = p + + # adaptive SpecAugment + self.adaptive_number_ratio = adaptive_number_ratio + self.adaptive_size_ratio = adaptive_size_ratio + self.max_n_time_masks = max_n_time_masks + + if adaptive_number_ratio > 0: + self.n_time_masks = 0 + logger.info('n_time_masks is set ot zero for adaptive SpecAugment.') + if adaptive_size_ratio > 0: + self.T = 0 + logger.info('T is set to zero for adaptive SpecAugment.') + + self._freq_mask = None + self._time_mask = None + + def librispeech_basic(self): + self.W = 80 + self.F = 27 + self.T = 100 + self.n_freq_masks = 1 + self.n_time_masks = 1 + self.p = 1.0 + + def librispeech_double(self): + self.W = 80 + self.F = 27 + self.T = 100 + self.n_freq_masks = 2 + self.n_time_masks = 2 + self.p = 1.0 + + def switchboard_mild(self): + self.W = 40 + self.F = 15 + self.T = 70 + self.n_freq_masks = 2 + self.n_time_masks = 2 + self.p = 0.2 + + def switchboard_strong(self): + self.W = 40 + self.F = 27 + self.T = 70 + self.n_freq_masks = 2 + self.n_time_masks = 2 + self.p = 0.2 + + @property + def freq_mask(self): + return self._freq_mask + + @property + def time_mask(self): + return self._time_mask + + def __repr__(self): + return f"specaug: F-{self.F}, T-{self.T}, F-n-{self.n_freq_masks}, T-n-{self.n_time_masks}" + + def time_warp(self, x, mode='PIL'): + """time warp for spec augment + move random center frame by the random width ~ uniform(-window, window) + + Args: + x (np.ndarray): spectrogram (time, freq) + mode (str): PIL or sparse_image_warp + + Raises: + NotImplementedError: [description] + NotImplementedError: [description] + + Returns: + np.ndarray: time warped spectrogram (time, freq) + """ + window = max_time_warp = self.W + if window == 0: + return x + + if mode == "PIL": + t = x.shape[0] + if t - window <= window: + return x + # NOTE: randrange(a, b) emits a, a + 1, ..., b - 1 + center = random.randrange(window, t - window) + warped = random.randrange(center - window, center + + window) + 1 # 1 ... 
t - 1 + + left = Image.fromarray(x[:center]).resize((x.shape[1], warped), + BICUBIC) + right = Image.fromarray(x[center:]).resize((x.shape[1], t - warped), + BICUBIC) + if self.inplace: + x[:warped] = left + x[warped:] = right + return x + return np.concatenate((left, right), 0) + elif mode == "sparse_image_warp": + raise NotImplementedError('sparse_image_warp') + else: + raise NotImplementedError( + "unknown resize mode: " + mode + + ", choose one from (PIL, sparse_image_warp).") + + def mask_freq(self, x, replace_with_zero=False): + """freq mask + + Args: + x (np.ndarray): spectrogram (time, freq) + replace_with_zero (bool, optional): Defaults to False. + + Returns: + np.ndarray: freq mask spectrogram (time, freq) + """ + n_bins = x.shape[1] + for i in range(0, self.n_freq_masks): + f = int(self._rng.uniform(low=0, high=self.F)) + f_0 = int(self._rng.uniform(low=0, high=n_bins - f)) + assert f_0 <= f_0 + f + if replace_with_zero: + x[:, f_0:f_0 + f] = 0 + else: + x[:, f_0:f_0 + f] = x.mean() + self._freq_mask = (f_0, f_0 + f) + return x + + def mask_time(self, x, replace_with_zero=False): + """time mask + + Args: + x (np.ndarray): spectrogram (time, freq) + replace_with_zero (bool, optional): Defaults to False. + + Returns: + np.ndarray: time mask spectrogram (time, freq) + """ + n_frames = x.shape[0] + + if self.adaptive_number_ratio > 0: + n_masks = int(n_frames * self.adaptive_number_ratio) + n_masks = min(n_masks, self.max_n_time_masks) + else: + n_masks = self.n_time_masks + + if self.adaptive_size_ratio > 0: + T = self.adaptive_size_ratio * n_frames + else: + T = self.T + + for i in range(n_masks): + t = int(self._rng.uniform(low=0, high=T)) + t = min(t, int(n_frames * self.p)) + t_0 = int(self._rng.uniform(low=0, high=n_frames - t)) + assert t_0 <= t_0 + t + if replace_with_zero: + x[t_0:t_0 + t, :] = 0 + else: + x[t_0:t_0 + t, :] = x.mean() + self._time_mask = (t_0, t_0 + t) + return x + + def __call__(self, x, train=True): + if not train: + return x + return self.transform_feature(x) + + def transform_feature(self, x: np.ndarray): + """ + Args: + x (np.ndarray): `[T, F]` + Returns: + x (np.ndarray): `[T, F]` + """ + assert isinstance(x, np.ndarray) + assert x.ndim == 2 + x = self.time_warp(x, self.mode) + x = self.mask_freq(x, self.replace_with_zero) + x = self.mask_time(x, self.replace_with_zero) + return x diff --git a/ernie-sat/paddlespeech/s2t/frontend/augmentor/speed_perturb.py b/ernie-sat/paddlespeech/s2t/frontend/augmentor/speed_perturb.py new file mode 100644 index 0000000000000000000000000000000000000000..af0b23ee40ae810fc52710702ac50c5803b77b57 --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/frontend/augmentor/speed_perturb.py @@ -0,0 +1,106 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
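+
+# --- Editor's sketch (illustration only, not part of the original paddlespeech sources) ---
+# A compact stand-alone version of the frequency/time masking that SpecAugmentor
+# above performs, applied to a dummy (time, freq) spectrogram. Mask sizes follow
+# the LibriSpeech-double preset (F=27, T=100, two masks each); the feature shape
+# is an arbitrary illustration value.
+import numpy as np
+
+_rng = np.random.RandomState(0)
+_spec = _rng.randn(200, 80).astype(np.float32)        # (T, F) log-mel-like feature
+_F, _T = 27, 100
+for _ in range(2):                                    # two frequency masks
+    _f = int(_rng.uniform(0, _F))
+    _f0 = int(_rng.uniform(0, _spec.shape[1] - _f))
+    _spec[:, _f0:_f0 + _f] = _spec.mean()             # replace_with_zero=False path
+for _ in range(2):                                    # two time masks
+    _t = int(_rng.uniform(0, _T))
+    _t = min(_t, int(_spec.shape[0] * 1.0))           # cap by p * n_frames (p = 1.0)
+    _t0 = int(_rng.uniform(0, _spec.shape[0] - _t))
+    _spec[_t0:_t0 + _t, :] = _spec.mean()
+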
+"""Contain the speech perturbation augmentation model.""" +import numpy as np + +from paddlespeech.s2t.frontend.augmentor.base import AugmentorBase + + +class SpeedPerturbAugmentor(AugmentorBase): + """Augmentation model for adding speed perturbation.""" + + def __init__(self, rng, min_speed_rate=0.9, max_speed_rate=1.1, + num_rates=3): + """speed perturbation. + + The speed perturbation in kaldi uses sox-speed instead of sox-tempo, + and sox-speed just to resample the input, + i.e pitch and tempo are changed both. + + "Why use speed option instead of tempo -s in SoX for speed perturbation" + https://groups.google.com/forum/#!topic/kaldi-help/8OOG7eE4sZ8 + + Sox speed: + https://pysox.readthedocs.io/en/latest/api.html#sox.transform.Transformer + + See reference paper here: + http://www.danielpovey.com/files/2015_interspeech_augmentation.pdf + + Espnet: + https://espnet.github.io/espnet/_modules/espnet/transform/perturb.html + + Nemo: + https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/asr/parts/perturb.py#L92 + + Args: + rng (random.Random): Random generator object. + min_speed_rate (float): Lower bound of new speed rate to sample and should + not be smaller than 0.9. + max_speed_rate (float): Upper bound of new speed rate to sample and should + not be larger than 1.1. + num_rates (int, optional): Number of discrete rates to allow. + Can be a positive or negative integer. Defaults to 3. + If a positive integer greater than 0 is provided, the range of + speed rates will be discretized into `num_rates` values. + If a negative integer or 0 is provided, the full range of speed rates + will be sampled uniformly. + Note: If a positive integer is provided and the resultant discretized + range of rates contains the value '1.0', then those samples with rate=1.0, + will not be augmented at all and simply skipped. This is to unnecessary + augmentation and increase computation time. Effective augmentation chance + in such a case is = `prob * (num_rates - 1 / num_rates) * 100`% chance + where `prob` is the global probability of a sample being augmented. + + Raises: + ValueError: when speed_rate error + """ + if min_speed_rate < 0.9: + raise ValueError( + "Sampling speed below 0.9 can cause unnatural effects") + if max_speed_rate > 1.1: + raise ValueError( + "Sampling speed above 1.1 can cause unnatural effects") + self._min_rate = min_speed_rate + self._max_rate = max_speed_rate + self._rng = rng + self._num_rates = num_rates + if num_rates > 0: + self._rates = np.linspace( + self._min_rate, self._max_rate, self._num_rates, endpoint=True) + + def __call__(self, x, uttid=None, train=True): + if not train: + return x + self.transform_audio(x) + return x + + def transform_audio(self, audio_segment): + """Sample a new speed rate from the given range and + changes the speed of the given audio clip. + + Note that this is an in-place transformation. + + :param audio_segment: Audio segment to add effects to. 
+ :type audio_segment: AudioSegment|SpeechSegment + """ + if self._num_rates < 0: + speed_rate = self._rng.uniform(self._min_rate, self._max_rate) + else: + speed_rate = self._rng.choice(self._rates) + + # Skip perturbation in case of identity speed rate + if speed_rate == 1.0: + return + + audio_segment.change_speed(speed_rate) diff --git a/ernie-sat/paddlespeech/s2t/frontend/augmentor/volume_perturb.py b/ernie-sat/paddlespeech/s2t/frontend/augmentor/volume_perturb.py new file mode 100644 index 0000000000000000000000000000000000000000..8cd2dc0a81f6f3be843efd7b2711c78c3a441487 --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/frontend/augmentor/volume_perturb.py @@ -0,0 +1,55 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Contains the volume perturb augmentation model.""" +from paddlespeech.s2t.frontend.augmentor.base import AugmentorBase + + +class VolumePerturbAugmentor(AugmentorBase): + """Augmentation model for adding random volume perturbation. + + This is used for multi-loudness training of PCEN. See + + https://arxiv.org/pdf/1607.05666v1.pdf + + for more details. + + :param rng: Random generator object. + :type rng: random.Random + :param min_gain_dBFS: Minimal gain in dBFS. + :type min_gain_dBFS: float + :param max_gain_dBFS: Maximal gain in dBFS. + :type max_gain_dBFS: float + """ + + def __init__(self, rng, min_gain_dBFS, max_gain_dBFS): + self._min_gain_dBFS = min_gain_dBFS + self._max_gain_dBFS = max_gain_dBFS + self._rng = rng + + def __call__(self, x, uttid=None, train=True): + if not train: + return x + self.transform_audio(x) + return x + + def transform_audio(self, audio_segment): + """Change audio loadness. + + Note that this is an in-place transformation. + + :param audio_segment: Audio segment to add effects to. + :type audio_segment: AudioSegmenet|SpeechSegment + """ + gain = self._rng.uniform(self._min_gain_dBFS, self._max_gain_dBFS) + audio_segment.gain_db(gain) diff --git a/ernie-sat/paddlespeech/s2t/frontend/featurizer/__init__.py b/ernie-sat/paddlespeech/s2t/frontend/featurizer/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..6992700d9a4f8e46ca1a6b6b48ebe66f8e92e9b2 --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/frontend/featurizer/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
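+
+# --- Editor's sketch (illustration only, not part of the original paddlespeech sources) ---
+# The arithmetic behind VolumePerturbAugmentor above: draw a gain in dB from
+# [min_gain_dBFS, max_gain_dBFS] and scale the samples by the linear ratio
+# 10 ** (gain_db / 20).
+import numpy as np
+
+_rng = np.random.RandomState(0)
+_samples = _rng.randn(16000).astype(np.float32)
+_gain_db = _rng.uniform(-15.0, 15.0)                  # sampled gain in dB
+_samples = _samples * (10.0 ** (_gain_db / 20.0))     # the effect of audio_segment.gain_db(gain)
+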
+from .audio_featurizer import AudioFeaturizer #noqa: F401 +from .speech_featurizer import SpeechFeaturizer +from .text_featurizer import TextFeaturizer diff --git a/ernie-sat/paddlespeech/s2t/frontend/featurizer/audio_featurizer.py b/ernie-sat/paddlespeech/s2t/frontend/featurizer/audio_featurizer.py new file mode 100644 index 0000000000000000000000000000000000000000..6f3b646c5ac5c0e19bdddc54d9ed398fbf14a263 --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/frontend/featurizer/audio_featurizer.py @@ -0,0 +1,363 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Contains the audio featurizer class.""" +import numpy as np +from python_speech_features import delta +from python_speech_features import logfbank +from python_speech_features import mfcc + + +class AudioFeaturizer(): + """Audio featurizer, for extracting features from audio contents of + AudioSegment or SpeechSegment. + + Currently, it supports feature types of linear spectrogram and mfcc. + + :param spectrum_type: Specgram feature type. Options: 'linear'. + :type spectrum_type: str + :param stride_ms: Striding size (in milliseconds) for generating frames. + :type stride_ms: float + :param window_ms: Window size (in milliseconds) for generating frames. + :type window_ms: float + :param max_freq: When spectrum_type is 'linear', only FFT bins + corresponding to frequencies between [0, max_freq] are + returned; when spectrum_type is 'mfcc', max_feq is the + highest band edge of mel filters. + :types max_freq: None|float + :param target_sample_rate: Audio are resampled (if upsampling or + downsampling is allowed) to this before + extracting spectrogram features. + :type target_sample_rate: float + :param use_dB_normalization: Whether to normalize the audio to a certain + decibels before extracting the features. + :type use_dB_normalization: bool + :param target_dB: Target audio decibels for normalization. + :type target_dB: float + """ + + def __init__(self, + spectrum_type: str='linear', + feat_dim: int=None, + delta_delta: bool=False, + stride_ms=10.0, + window_ms=20.0, + n_fft=None, + max_freq=None, + target_sample_rate=16000, + use_dB_normalization=True, + target_dB=-20, + dither=1.0): + self._spectrum_type = spectrum_type + # mfcc and fbank using `feat_dim` + self._feat_dim = feat_dim + # mfcc and fbank using `delta-delta` + self._delta_delta = delta_delta + self._stride_ms = stride_ms + self._window_ms = window_ms + self._max_freq = max_freq + self._target_sample_rate = target_sample_rate + self._use_dB_normalization = use_dB_normalization + self._target_dB = target_dB + self._fft_point = n_fft + self._dither = dither + + def featurize(self, + audio_segment, + allow_downsampling=True, + allow_upsampling=True): + """Extract audio features from AudioSegment or SpeechSegment. + + :param audio_segment: Audio/speech segment to extract features from. 
+ :type audio_segment: AudioSegment|SpeechSegment + :param allow_downsampling: Whether to allow audio downsampling before + featurizing. + :type allow_downsampling: bool + :param allow_upsampling: Whether to allow audio upsampling before + featurizing. + :type allow_upsampling: bool + :return: Spectrogram audio feature in 2darray. + :rtype: ndarray + :raises ValueError: If audio sample rate is not supported. + """ + # upsampling or downsampling + if ((audio_segment.sample_rate > self._target_sample_rate and + allow_downsampling) or + (audio_segment.sample_rate < self._target_sample_rate and + allow_upsampling)): + audio_segment.resample(self._target_sample_rate) + if audio_segment.sample_rate != self._target_sample_rate: + raise ValueError("Audio sample rate is not supported. " + "Turn allow_downsampling or allow up_sampling on.") + # decibel normalization + if self._use_dB_normalization: + audio_segment.normalize(target_db=self._target_dB) + # extract spectrogram + return self._compute_specgram(audio_segment) + + @property + def stride_ms(self): + return self._stride_ms + + @property + def feature_size(self): + """audio feature size""" + feat_dim = 0 + if self._spectrum_type == 'linear': + fft_point = self._window_ms if self._fft_point is None else self._fft_point + feat_dim = int(fft_point * (self._target_sample_rate / 1000) / 2 + + 1) + elif self._spectrum_type == 'mfcc': + # mfcc, delta, delta-delta + feat_dim = int(self._feat_dim * + 3) if self._delta_delta else int(self._feat_dim) + elif self._spectrum_type == 'fbank': + # fbank, delta, delta-delta + feat_dim = int(self._feat_dim * + 3) if self._delta_delta else int(self._feat_dim) + else: + raise ValueError("Unknown spectrum_type %s. " + "Supported values: linear." % self._spectrum_type) + return feat_dim + + def _compute_specgram(self, audio_segment): + """Extract various audio features.""" + sample_rate = audio_segment.sample_rate + if self._spectrum_type == 'linear': + samples = audio_segment.samples + return self._compute_linear_specgram( + samples, + sample_rate, + stride_ms=self._stride_ms, + window_ms=self._window_ms, + max_freq=self._max_freq) + elif self._spectrum_type == 'mfcc': + samples = audio_segment.to('int16') + return self._compute_mfcc( + samples, + sample_rate, + feat_dim=self._feat_dim, + stride_ms=self._stride_ms, + window_ms=self._window_ms, + max_freq=self._max_freq, + dither=self._dither, + delta_delta=self._delta_delta) + elif self._spectrum_type == 'fbank': + samples = audio_segment.to('int16') + return self._compute_fbank( + samples, + sample_rate, + feat_dim=self._feat_dim, + stride_ms=self._stride_ms, + window_ms=self._window_ms, + max_freq=self._max_freq, + dither=self._dither, + delta_delta=self._delta_delta) + else: + raise ValueError("Unknown spectrum_type %s. " + "Supported values: linear." 
% self._spectrum_type) + + def _specgram_real(self, samples, window_size, stride_size, sample_rate): + """Compute the spectrogram for samples from a real signal.""" + # extract strided windows + truncate_size = (len(samples) - window_size) % stride_size + samples = samples[:len(samples) - truncate_size] + nshape = (window_size, (len(samples) - window_size) // stride_size + 1) + nstrides = (samples.strides[0], samples.strides[0] * stride_size) + windows = np.lib.stride_tricks.as_strided( + samples, shape=nshape, strides=nstrides) + assert np.all( + windows[:, 1] == samples[stride_size:(stride_size + window_size)]) + # window weighting, squared Fast Fourier Transform (fft), scaling + weighting = np.hanning(window_size)[:, None] + # https://numpy.org/doc/stable/reference/generated/numpy.fft.rfft.html + fft = np.fft.rfft(windows * weighting, n=None, axis=0) + fft = np.absolute(fft) + fft = fft**2 + scale = np.sum(weighting**2) * sample_rate + fft[1:-1, :] *= (2.0 / scale) + fft[(0, -1), :] /= scale + # prepare fft frequency list + freqs = float(sample_rate) / window_size * np.arange(fft.shape[0]) + return fft, freqs + + def _compute_linear_specgram(self, + samples, + sample_rate, + stride_ms=10.0, + window_ms=20.0, + max_freq=None, + eps=1e-14): + """Compute the linear spectrogram from FFT energy. + + Args: + samples ([type]): [description] + sample_rate ([type]): [description] + stride_ms (float, optional): [description]. Defaults to 10.0. + window_ms (float, optional): [description]. Defaults to 20.0. + max_freq ([type], optional): [description]. Defaults to None. + eps ([type], optional): [description]. Defaults to 1e-14. + + Raises: + ValueError: [description] + ValueError: [description] + + Returns: + np.ndarray: log spectrogram, (time, freq) + """ + if max_freq is None: + max_freq = sample_rate / 2 + if max_freq > sample_rate / 2: + raise ValueError("max_freq must not be greater than half of " + "sample rate.") + if stride_ms > window_ms: + raise ValueError("Stride size must not be greater than " + "window size.") + stride_size = int(0.001 * sample_rate * stride_ms) + window_size = int(0.001 * sample_rate * window_ms) + specgram, freqs = self._specgram_real( + samples, + window_size=window_size, + stride_size=stride_size, + sample_rate=sample_rate) + ind = np.where(freqs <= max_freq)[0][-1] + 1 + # (freq, time) + spec = np.log(specgram[:ind, :] + eps) + return np.transpose(spec) + + def _concat_delta_delta(self, feat): + """append delat, delta-delta feature. + + Args: + feat (np.ndarray): (T, D) + + Returns: + np.ndarray: feat with delta-delta, (T, 3*D) + """ + # Deltas + d_feat = delta(feat, 2) + # Deltas-Deltas + dd_feat = delta(feat, 2) + # concat above three features + concat_feat = np.concatenate((feat, d_feat, dd_feat), axis=1) + return concat_feat + + def _compute_mfcc(self, + samples, + sample_rate, + feat_dim=13, + stride_ms=10.0, + window_ms=25.0, + max_freq=None, + dither=1.0, + delta_delta=True): + """Compute mfcc from samples. + + Args: + samples (np.ndarray, np.int16): the audio signal from which to compute features. + sample_rate (float): the sample rate of the signal we are working with, in Hz. + feat_dim (int): the number of cepstrum to return, default 13. + stride_ms (float, optional): stride length in ms. Defaults to 10.0. + window_ms (float, optional): window length in ms. Defaults to 25.0. + max_freq ([type], optional): highest band edge of mel filters. In Hz, default is samplerate/2. Defaults to None. + delta_delta (bool, optional): Whether with delta delta. 
Defaults to False. + + Raises: + ValueError: max_freq > samplerate/2 + ValueError: stride_ms > window_ms + + Returns: + np.ndarray: mfcc feature, (D, T). + """ + if max_freq is None: + max_freq = sample_rate / 2 + if max_freq > sample_rate / 2: + raise ValueError("max_freq must not be greater than half of " + "sample rate.") + if stride_ms > window_ms: + raise ValueError("Stride size must not be greater than " + "window size.") + # compute the 13 cepstral coefficients, and the first one is replaced + # by log(frame energy), (T, D) + mfcc_feat = mfcc( + signal=samples, + samplerate=sample_rate, + winlen=0.001 * window_ms, + winstep=0.001 * stride_ms, + numcep=feat_dim, + nfilt=23, + nfft=512, + lowfreq=20, + highfreq=max_freq, + dither=dither, + remove_dc_offset=True, + preemph=0.97, + ceplifter=22, + useEnergy=True, + winfunc='povey') + if delta_delta: + mfcc_feat = self._concat_delta_delta(mfcc_feat) + return mfcc_feat + + def _compute_fbank(self, + samples, + sample_rate, + feat_dim=40, + stride_ms=10.0, + window_ms=25.0, + max_freq=None, + dither=1.0, + delta_delta=False): + """Compute logfbank from samples. + + Args: + samples (np.ndarray, np.int16): the audio signal from which to compute features. Should be an N*1 array + sample_rate (float): the sample rate of the signal we are working with, in Hz. + feat_dim (int): the number of cepstrum to return, default 13. + stride_ms (float, optional): stride length in ms. Defaults to 10.0. + window_ms (float, optional): window length in ms. Defaults to 20.0. + max_freq (float, optional): highest band edge of mel filters. In Hz, default is samplerate/2. Defaults to None. + delta_delta (bool, optional): Whether with delta delta. Defaults to False. + + Raises: + ValueError: max_freq > samplerate/2 + ValueError: stride_ms > window_ms + + Returns: + np.ndarray: mfcc feature, (D, T). + """ + if max_freq is None: + max_freq = sample_rate / 2 + if max_freq > sample_rate / 2: + raise ValueError("max_freq must not be greater than half of " + "sample rate.") + if stride_ms > window_ms: + raise ValueError("Stride size must not be greater than " + "window size.") + # (T, D) + fbank_feat = logfbank( + signal=samples, + samplerate=sample_rate, + winlen=0.001 * window_ms, + winstep=0.001 * stride_ms, + nfilt=feat_dim, + nfft=512, + lowfreq=20, + highfreq=max_freq, + dither=dither, + remove_dc_offset=True, + preemph=0.97, + wintype='povey') + if delta_delta: + fbank_feat = self._concat_delta_delta(fbank_feat) + return fbank_feat diff --git a/ernie-sat/paddlespeech/s2t/frontend/featurizer/speech_featurizer.py b/ernie-sat/paddlespeech/s2t/frontend/featurizer/speech_featurizer.py new file mode 100644 index 0000000000000000000000000000000000000000..9dc86829ae2aa9068393462e27f4701efe965585 --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/frontend/featurizer/speech_featurizer.py @@ -0,0 +1,106 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
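+
+# --- Editor's sketch (illustration only, not part of the original paddlespeech sources) ---
+# A stand-alone version of the linear-spectrogram path in AudioFeaturizer above:
+# slice the waveform into overlapping frames, apply a Hann window, take the
+# magnitude-squared rFFT and log-compress. The 20 ms window / 10 ms stride
+# mirrors the class defaults; the input is a synthetic signal.
+import numpy as np
+
+_sr = 16000
+_samples = np.random.RandomState(0).randn(_sr).astype(np.float32)
+_win, _hop = int(0.020 * _sr), int(0.010 * _sr)       # 320 / 160 samples
+_n_frames = 1 + (len(_samples) - _win) // _hop
+_frames = np.stack(
+    [_samples[i * _hop:i * _hop + _win] for i in range(_n_frames)])
+_power = np.abs(np.fft.rfft(_frames * np.hanning(_win), axis=1)) ** 2
+_log_spec = np.log(_power + 1e-14)                    # (time, freq), cf. eps in the class
+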
+"""Contains the speech featurizer class.""" +from paddlespeech.s2t.frontend.featurizer.audio_featurizer import AudioFeaturizer +from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer + + +class SpeechFeaturizer(): + """Speech and Text feature extraction. + """ + + def __init__(self, + unit_type, + vocab_filepath, + spm_model_prefix=None, + spectrum_type='linear', + feat_dim=None, + delta_delta=False, + stride_ms=10.0, + window_ms=20.0, + n_fft=None, + max_freq=None, + target_sample_rate=16000, + use_dB_normalization=True, + target_dB=-20, + dither=1.0, + maskctc=False): + self.stride_ms = stride_ms + self.window_ms = window_ms + + self.audio_feature = AudioFeaturizer( + spectrum_type=spectrum_type, + feat_dim=feat_dim, + delta_delta=delta_delta, + stride_ms=stride_ms, + window_ms=window_ms, + n_fft=n_fft, + max_freq=max_freq, + target_sample_rate=target_sample_rate, + use_dB_normalization=use_dB_normalization, + target_dB=target_dB, + dither=dither) + self.feature_size = self.audio_feature.feature_size + + self.text_feature = TextFeaturizer( + unit_type=unit_type, + vocab=vocab_filepath, + spm_model_prefix=spm_model_prefix, + maskctc=maskctc) + self.vocab_size = self.text_feature.vocab_size + + def featurize(self, speech_segment, keep_transcription_text): + """Extract features for speech segment. + + 1. For audio parts, extract the audio features. + 2. For transcript parts, keep the original text or convert text string + to a list of token indices in char-level. + + Args: + speech_segment (SpeechSegment): Speech segment to extract features from. + keep_transcription_text (bool): True, keep transcript text, False, token ids + + Returns: + tuple: 1) spectrogram audio feature in 2darray, 2) list oftoken indices. + """ + spec_feature = self.audio_feature.featurize(speech_segment) + + if keep_transcription_text: + return spec_feature, speech_segment.transcript + + if speech_segment.has_token: + text_ids = speech_segment.token_ids + else: + text_ids = self.text_feature.featurize(speech_segment.transcript) + return spec_feature, text_ids + + def text_featurize(self, text, keep_transcription_text): + """Extract features for speech segment. + + 1. For audio parts, extract the audio features. + 2. For transcript parts, keep the original text or convert text string + to a list of token indices in char-level. + + Args: + text (str): text. + keep_transcription_text (bool): True, keep transcript text, False, token ids + + Returns: + (str|List[int]): text, or list of token indices. + """ + if keep_transcription_text: + return text + + text_ids = self.text_feature.featurize(text) + return text_ids diff --git a/ernie-sat/paddlespeech/s2t/frontend/featurizer/text_featurizer.py b/ernie-sat/paddlespeech/s2t/frontend/featurizer/text_featurizer.py new file mode 100644 index 0000000000000000000000000000000000000000..0c0fa5e2f63b05387cd6ce9af6fb0331c400cfb8 --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/frontend/featurizer/text_featurizer.py @@ -0,0 +1,235 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +"""Contains the text featurizer class.""" +from pprint import pformat +from typing import Union + +import sentencepiece as spm + +from ..utility import BLANK +from ..utility import EOS +from ..utility import load_dict +from ..utility import MASKCTC +from ..utility import SOS +from ..utility import SPACE +from ..utility import UNK +from paddlespeech.s2t.utils.log import Log + +logger = Log(__name__).getlog() + +__all__ = ["TextFeaturizer"] + + +class TextFeaturizer(): + def __init__(self, unit_type, vocab, spm_model_prefix=None, maskctc=False): + """Text featurizer, for processing or extracting features from text. + + Currently, it supports char/word/sentence-piece level tokenizing and conversion into + a list of token indices. Note that the token indexing order follows the + given vocabulary file. + + Args: + unit_type (str): unit type, e.g. char, word, spm + vocab Option[str, list]: Filepath to load vocabulary for token indices conversion, or vocab list. + spm_model_prefix (str, optional): spm model prefix. Defaults to None. + """ + assert unit_type in ('char', 'spm', 'word') + self.unit_type = unit_type + self.unk = UNK + self.maskctc = maskctc + + if vocab: + self.vocab_dict, self._id2token, self.vocab_list, self.unk_id, self.eos_id, self.blank_id = self._load_vocabulary_from_file( + vocab, maskctc) + self.vocab_size = len(self.vocab_list) + else: + logger.warning("TextFeaturizer: not have vocab file or vocab list.") + + if unit_type == 'spm': + spm_model = spm_model_prefix + '.model' + self.sp = spm.SentencePieceProcessor() + self.sp.Load(spm_model) + + def tokenize(self, text, replace_space=True): + if self.unit_type == 'char': + tokens = self.char_tokenize(text, replace_space) + elif self.unit_type == 'word': + tokens = self.word_tokenize(text) + else: # spm + tokens = self.spm_tokenize(text) + return tokens + + def detokenize(self, tokens): + if self.unit_type == 'char': + text = self.char_detokenize(tokens) + elif self.unit_type == 'word': + text = self.word_detokenize(tokens) + else: # spm + text = self.spm_detokenize(tokens) + return text + + def featurize(self, text): + """Convert text string to a list of token indices. + + Args: + text (str): Text to process. + + Returns: + List[int]: List of token indices. + """ + tokens = self.tokenize(text) + ids = [] + for token in tokens: + if token not in self.vocab_dict: + logger.debug(f"Text Token: {token} -> {self.unk}") + token = self.unk + ids.append(self.vocab_dict[token]) + return ids + + def defeaturize(self, idxs): + """Convert a list of token indices to text string, + ignore index after eos_id. + + Args: + idxs (List[int]): List of token indices. + + Returns: + str: Text. + """ + tokens = [] + for idx in idxs: + if idx == self.eos_id: + break + tokens.append(self._id2token[idx]) + text = self.detokenize(tokens) + return text + + def char_tokenize(self, text, replace_space=True): + """Character tokenizer. + + Args: + text (str): text string. + replace_space (bool): False only used by build_vocab.py. + + Returns: + List[str]: tokens. + """ + text = text.strip() + if replace_space: + text_list = [SPACE if item == " " else item for item in list(text)] + else: + text_list = list(text) + return text_list + + def char_detokenize(self, tokens): + """Character detokenizer. + + Args: + tokens (List[str]): tokens. + + Returns: + str: text string. 
+ """ + tokens = [t.replace(SPACE, " ") for t in tokens] + return "".join(tokens) + + def word_tokenize(self, text): + """Word tokenizer, separate by .""" + return text.strip().split() + + def word_detokenize(self, tokens): + """Word detokenizer, separate by .""" + return " ".join(tokens) + + def spm_tokenize(self, text): + """spm tokenize. + + Args: + text (str): text string. + + Returns: + List[str]: sentence pieces str code + """ + stats = {"num_empty": 0, "num_filtered": 0} + + def valid(line): + return True + + def encode(l): + return self.sp.EncodeAsPieces(l) + + def encode_line(line): + line = line.strip() + if len(line) > 0: + line = encode(line) + if valid(line): + return line + else: + stats["num_filtered"] += 1 + else: + stats["num_empty"] += 1 + return None + + enc_line = encode_line(text) + return enc_line + + def spm_detokenize(self, tokens, input_format='piece'): + """spm detokenize. + + Args: + ids (List[str]): tokens. + + Returns: + str: text + """ + if input_format == "piece": + + def decode(l): + return "".join(self.sp.DecodePieces(l)) + elif input_format == "id": + + def decode(l): + return "".join(self.sp.DecodeIds(l)) + + return decode(tokens) + + def _load_vocabulary_from_file(self, vocab: Union[str, list], + maskctc: bool): + """Load vocabulary from file.""" + if isinstance(vocab, list): + vocab_list = vocab + else: + vocab_list = load_dict(vocab, maskctc) + assert vocab_list is not None + logger.debug(f"Vocab: {pformat(vocab_list)}") + + id2token = dict( + [(idx, token) for (idx, token) in enumerate(vocab_list)]) + token2id = dict( + [(token, idx) for (idx, token) in enumerate(vocab_list)]) + + blank_id = vocab_list.index(BLANK) if BLANK in vocab_list else -1 + maskctc_id = vocab_list.index(MASKCTC) if MASKCTC in vocab_list else -1 + unk_id = vocab_list.index(UNK) if UNK in vocab_list else -1 + eos_id = vocab_list.index(EOS) if EOS in vocab_list else -1 + sos_id = vocab_list.index(SOS) if SOS in vocab_list else -1 + space_id = vocab_list.index(SPACE) if SPACE in vocab_list else -1 + + logger.info(f"BLANK id: {blank_id}") + logger.info(f"UNK id: {unk_id}") + logger.info(f"EOS id: {eos_id}") + logger.info(f"SOS id: {sos_id}") + logger.info(f"SPACE id: {space_id}") + logger.info(f"MASKCTC id: {maskctc_id}") + return token2id, id2token, vocab_list, unk_id, eos_id, blank_id diff --git a/ernie-sat/paddlespeech/s2t/frontend/normalizer.py b/ernie-sat/paddlespeech/s2t/frontend/normalizer.py new file mode 100644 index 0000000000000000000000000000000000000000..b596b2ab09ffc9dc60fd52bfb43ebaa1be7a730b --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/frontend/normalizer.py @@ -0,0 +1,200 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Contains feature normalizers.""" +import json + +import jsonlines +import numpy as np +import paddle +from paddle.io import DataLoader +from paddle.io import Dataset + +from paddlespeech.s2t.frontend.audio import AudioSegment +from paddlespeech.s2t.frontend.utility import load_cmvn +from paddlespeech.s2t.utils.log import Log + +__all__ = ["FeatureNormalizer"] + +logger = Log(__name__).getlog() + + +# https://github.com/PaddlePaddle/Paddle/pull/31481 +class CollateFunc(object): + def __init__(self, feature_func): + self.feature_func = feature_func + + def __call__(self, batch): + mean_stat = None + var_stat = None + number = 0 + for item in batch: + audioseg = AudioSegment.from_file(item['feat']) + feat = self.feature_func(audioseg) #(T, D) + + sums = np.sum(feat, axis=0) + if mean_stat is None: + mean_stat = sums + else: + mean_stat += sums + + square_sums = np.sum(np.square(feat), axis=0) + if var_stat is None: + var_stat = square_sums + else: + var_stat += square_sums + + number += feat.shape[0] + return number, mean_stat, var_stat + + +class AudioDataset(Dataset): + def __init__(self, manifest_path, num_samples=-1, rng=None, random_seed=0): + self._rng = rng if rng else np.random.RandomState(random_seed) + + with jsonlines.open(manifest_path, 'r') as reader: + manifest = list(reader) + + if num_samples == -1: + sampled_manifest = manifest + else: + sampled_manifest = self._rng.choice( + manifest, num_samples, replace=False) + self.items = sampled_manifest + + def __len__(self): + return len(self.items) + + def __getitem__(self, idx): + return self.items[idx] + + +class FeatureNormalizer(object): + """Feature normalizer. Normalize features to be of zero mean and unit + stddev. + + if mean_std_filepath is provided (not None), the normalizer will directly + initilize from the file. Otherwise, both manifest_path and featurize_func + should be given for on-the-fly mean and stddev computing. + + :param mean_std_filepath: File containing the pre-computed mean and stddev. + :type mean_std_filepath: None|str + :param manifest_path: Manifest of instances for computing mean and stddev. + :type meanifest_path: None|str + :param featurize_func: Function to extract features. It should be callable + with ``featurize_func(audio_segment)``. + :type featurize_func: None|callable + :param num_samples: Number of random samples for computing mean and stddev. + :type num_samples: int + :param random_seed: Random seed for sampling instances. + :type random_seed: int + :raises ValueError: If both mean_std_filepath and manifest_path + (or both mean_std_filepath and featurize_func) are None. + """ + + def __init__(self, + mean_std_filepath, + manifest_path=None, + featurize_func=None, + num_samples=500, + num_workers=0, + random_seed=0): + if not mean_std_filepath: + if not (manifest_path and featurize_func): + raise ValueError("If mean_std_filepath is None, meanifest_path " + "and featurize_func should not be None.") + self._rng = np.random.RandomState(random_seed) + self._compute_mean_std(manifest_path, featurize_func, num_samples, + num_workers) + else: + mean_std = mean_std_filepath + self._read_mean_std_from_file(mean_std) + + def apply(self, features): + """Normalize features to be of zero mean and unit stddev. + + :param features: Input features to be normalized. + :type features: ndarray, shape (T, D) + :param eps: added to stddev to provide numerical stablibity. + :type eps: float + :return: Normalized features. 
+ :rtype: ndarray + """ + return (features - self._mean) * self._istd + + def _read_mean_std_from_file(self, mean_std, eps=1e-20): + """Load mean and std from file.""" + if isinstance(mean_std, list): + mean = mean_std[0]['cmvn_stats']['mean'] + istd = mean_std[0]['cmvn_stats']['istd'] + else: + filetype = mean_std.split(".")[-1] + mean, istd = load_cmvn(mean_std, filetype=filetype) + self._mean = np.expand_dims(mean, axis=0) + self._istd = np.expand_dims(istd, axis=0) + + def write_to_file(self, filepath): + """Write the mean and stddev to the file. + + :param filepath: File to write mean and stddev. + :type filepath: str + """ + with open(filepath, 'w') as fout: + fout.write(json.dumps(self.cmvn_info)) + + def _compute_mean_std(self, + manifest_path, + featurize_func, + num_samples, + num_workers, + batch_size=64, + eps=1e-20): + """Compute mean and std from randomly sampled instances.""" + paddle.set_device('cpu') + + collate_func = CollateFunc(featurize_func) + dataset = AudioDataset(manifest_path, num_samples, self._rng) + data_loader = DataLoader( + dataset, + batch_size=batch_size, + shuffle=False, + num_workers=num_workers, + collate_fn=collate_func) + + with paddle.no_grad(): + all_mean_stat = None + all_var_stat = None + all_number = 0 + wav_number = 0 + for i, batch in enumerate(data_loader): + number, mean_stat, var_stat = batch + if i == 0: + all_mean_stat = mean_stat + all_var_stat = var_stat + else: + all_mean_stat += mean_stat + all_var_stat += var_stat + all_number += number + wav_number += batch_size + + if wav_number % 1000 == 0: + logger.info( + f'process {wav_number} wavs,{all_number} frames.') + + self.cmvn_info = { + 'mean_stat': list(all_mean_stat.tolist()), + 'var_stat': list(all_var_stat.tolist()), + 'frame_num': all_number, + } + + return self.cmvn_info diff --git a/ernie-sat/paddlespeech/s2t/frontend/speech.py b/ernie-sat/paddlespeech/s2t/frontend/speech.py new file mode 100644 index 0000000000000000000000000000000000000000..96997104741ec4e36390674f8e45086b2db588a2 --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/frontend/speech.py @@ -0,0 +1,243 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Contains the speech segment class.""" +import numpy as np + +from paddlespeech.s2t.frontend.audio import AudioSegment + + +class SpeechSegment(AudioSegment): + """Speech Segment with Text + + Args: + AudioSegment (AudioSegment): Audio Segment + """ + + def __init__(self, + samples, + sample_rate, + transcript, + tokens=None, + token_ids=None): + """Speech segment abstraction, a subclass of AudioSegment, + with an additional transcript. + + Args: + samples (ndarray.float32): Audio samples [num_samples x num_channels]. + sample_rate (int): Audio sample rate. + transcript (str): Transcript text for the speech. + tokens (List[str], optinal): Transcript tokens for the speech. + token_ids (List[int], optional): Transcript token ids for the speech. 
+ """ + AudioSegment.__init__(self, samples, sample_rate) + self._transcript = transcript + # must init `tokens` with `token_ids` at the same time + self._tokens = tokens + self._token_ids = token_ids + + def __eq__(self, other): + """Return whether two objects are equal. + + Returns: + bool: True, when equal to other + """ + if not AudioSegment.__eq__(self, other): + return False + if self._transcript != other._transcript: + return False + if self.has_token and other.has_token: + if self._tokens != other._tokens: + return False + if self._token_ids != other._token_ids: + return False + return True + + def __ne__(self, other): + """Return whether two objects are unequal.""" + return not self.__eq__(other) + + @classmethod + def from_file(cls, + filepath, + transcript, + tokens=None, + token_ids=None, + infos=None): + """Create speech segment from audio file and corresponding transcript. + + Args: + filepath (str|file): Filepath or file object to audio file. + transcript (str): Transcript text for the speech. + tokens (List[str], optional): text tokens. Defaults to None. + token_ids (List[int], optional): text token ids. Defaults to None. + infos (TarLocalData, optional): tar2obj and tar2infos. Defaults to None. + + Returns: + SpeechSegment: Speech segment instance. + """ + audio = AudioSegment.from_file(filepath, infos) + return cls(audio.samples, audio.sample_rate, transcript, tokens, + token_ids) + + @classmethod + def from_bytes(cls, bytes, transcript, tokens=None, token_ids=None): + """Create speech segment from a byte string and corresponding + + Args: + filepath (str|file): Filepath or file object to audio file. + transcript (str): Transcript text for the speech. + tokens (List[str], optional): text tokens. Defaults to None. + token_ids (List[int], optional): text token ids. Defaults to None. + + Returns: + SpeechSegment: Speech segment instance. + """ + audio = AudioSegment.from_bytes(bytes) + return cls(audio.samples, audio.sample_rate, transcript, tokens, + token_ids) + + @classmethod + def from_pcm(cls, + samples, + sample_rate, + transcript, + tokens=None, + token_ids=None): + """Create speech segment from pcm on online mode + Args: + samples (numpy.ndarray): Audio samples [num_samples x num_channels]. + sample_rate (int): Audio sample rate. + transcript (str): Transcript text for the speech. + tokens (List[str], optional): text tokens. Defaults to None. + token_ids (List[int], optional): text token ids. Defaults to None. + Returns: + SpeechSegment: Speech segment instance. + """ + audio = AudioSegment.from_pcm(samples, sample_rate) + return cls(audio.samples, audio.sample_rate, transcript, tokens, + token_ids) + + @classmethod + def concatenate(cls, *segments): + """Concatenate an arbitrary number of speech segments together, both + audio and transcript will be concatenated. + + :param *segments: Input speech segments to be concatenated. + :type *segments: tuple of SpeechSegment + :return: Speech segment instance. + :rtype: SpeechSegment + :raises ValueError: If the number of segments is zero, or if the + sample_rate of any two segments does not match. + :raises TypeError: If any segment is not SpeechSegment instance. 
+ """ + if len(segments) == 0: + raise ValueError("No speech segments are given to concatenate.") + sample_rate = segments[0]._sample_rate + transcripts = "" + tokens = [] + token_ids = [] + for seg in segments: + if sample_rate != seg._sample_rate: + raise ValueError("Can't concatenate segments with " + "different sample rates") + if type(seg) is not cls: + raise TypeError("Only speech segments of the same type " + "instance can be concatenated.") + transcripts += seg._transcript + if self.has_token: + tokens += seg._tokens + token_ids += seg._token_ids + samples = np.concatenate([seg.samples for seg in segments]) + return cls(samples, sample_rate, transcripts, tokens, token_ids) + + @classmethod + def slice_from_file(cls, + filepath, + transcript, + tokens=None, + token_ids=None, + start=None, + end=None): + """Loads a small section of an speech without having to load + the entire file into the memory which can be incredibly wasteful. + + :param filepath: Filepath or file object to audio file. + :type filepath: str|file + :param start: Start time in seconds. If start is negative, it wraps + around from the end. If not provided, this function + reads from the very beginning. + :type start: float + :param end: End time in seconds. If end is negative, it wraps around + from the end. If not provided, the default behvaior is + to read to the end of the file. + :type end: float + :param transcript: Transcript text for the speech. if not provided, + the defaults is an empty string. + :type transript: str + :return: SpeechSegment instance of the specified slice of the input + speech file. + :rtype: SpeechSegment + """ + audio = AudioSegment.slice_from_file(filepath, start, end) + return cls(audio.samples, audio.sample_rate, transcript, tokens, + token_ids) + + @classmethod + def make_silence(cls, duration, sample_rate): + """Creates a silent speech segment of the given duration and + sample rate, transcript will be an empty string. + + Args: + duration (float): Length of silence in seconds. + sample_rate (float): Sample rate. + + Returns: + SpeechSegment: Silence of the given duration. + """ + audio = AudioSegment.make_silence(duration, sample_rate) + return cls(audio.samples, audio.sample_rate, "") + + @property + def has_token(self): + if self._tokens and self._token_ids: + return True + return False + + @property + def transcript(self): + """Return the transcript text. + + Returns: + str: Transcript text for the speech. + """ + + return self._transcript + + @property + def tokens(self): + """Return the transcript text tokens. + + Returns: + List[str]: text tokens. + """ + return self._tokens + + @property + def token_ids(self): + """Return the transcript text token ids. + + Returns: + List[int]: text token ids. + """ + return self._token_ids diff --git a/ernie-sat/paddlespeech/s2t/frontend/utility.py b/ernie-sat/paddlespeech/s2t/frontend/utility.py new file mode 100644 index 0000000000000000000000000000000000000000..d35785db6825761e8bc26aada4c2c4d9d8066b0c --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/frontend/utility.py @@ -0,0 +1,393 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Contains data helper functions.""" +import json +import math +import tarfile +from collections import namedtuple +from typing import List +from typing import Optional +from typing import Text + +import jsonlines +import numpy as np + +from paddlespeech.s2t.utils.log import Log + +logger = Log(__name__).getlog() + +__all__ = [ + "load_dict", "load_cmvn", "read_manifest", "rms_to_db", "rms_to_dbfs", + "max_dbfs", "mean_dbfs", "gain_db_to_ratio", "normalize_audio", "SOS", + "EOS", "UNK", "BLANK", "MASKCTC", "SPACE", "convert_samples_to_float32", + "convert_samples_from_float32" +] + +IGNORE_ID = -1 +# `sos` and `eos` using same token +SOS = "" +EOS = SOS +UNK = "" +BLANK = "" +MASKCTC = "" +SPACE = "" + + +def load_dict(dict_path: Optional[Text], maskctc=False) -> Optional[List[Text]]: + if dict_path is None: + return None + + with open(dict_path, "r") as f: + dictionary = f.readlines() + # first token is `` + # multi line: ` 0\n` + # one line: `` + # space is relpace with + char_list = [entry[:-1].split(" ")[0] for entry in dictionary] + if BLANK not in char_list: + char_list.insert(0, BLANK) + if EOS not in char_list: + char_list.append(EOS) + # for non-autoregressive maskctc model + if maskctc and MASKCTC not in char_list: + char_list.append(MASKCTC) + return char_list + + +def read_manifest( + manifest_path, + max_input_len=float('inf'), + min_input_len=0.0, + max_output_len=float('inf'), + min_output_len=0.0, + max_output_input_ratio=float('inf'), + min_output_input_ratio=0.0, ): + """Load and parse manifest file. + + Args: + manifest_path ([type]): Manifest file to load and parse. + max_input_len ([type], optional): maximum output seq length, + in seconds for raw wav, in frame numbers for feature data. + Defaults to float('inf'). + min_input_len (float, optional): minimum input seq length, + in seconds for raw wav, in frame numbers for feature data. + Defaults to 0.0. + max_output_len (float, optional): maximum input seq length, + in modeling units. Defaults to 500.0. + min_output_len (float, optional): minimum input seq length, + in modeling units. Defaults to 0.0. + max_output_input_ratio (float, optional): + maximum output seq length/output seq length ratio. Defaults to 10.0. + min_output_input_ratio (float, optional): + minimum output seq length/output seq length ratio. Defaults to 0.05. + + Raises: + IOError: If failed to parse the manifest. + + Returns: + List[dict]: Manifest parsing results. 
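+
+ Example entry (illustrative only; field names follow the manifest format
+ used in this package, values are hypothetical):
+ {"utt": "utt1",
+ "input": [{"feat": "feats.ark:123", "name": "input1", "shape": [498, 80]}],
+ "output": [{"name": "target1", "text": "i love you", "shape": [4, 5002]}]}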
+ """ + manifest = [] + with jsonlines.open(manifest_path, 'r') as reader: + for json_data in reader: + feat_len = json_data["input"][0]["shape"][ + 0] if "input" in json_data and "shape" in json_data["input"][ + 0] else 1.0 + token_len = json_data["output"][0]["shape"][ + 0] if "output" in json_data and "shape" in json_data["output"][ + 0] else 1.0 + conditions = [ + feat_len >= min_input_len, + feat_len <= max_input_len, + token_len >= min_output_len, + token_len <= max_output_len, + token_len / feat_len >= min_output_input_ratio, + token_len / feat_len <= max_output_input_ratio, + ] + if all(conditions): + manifest.append(json_data) + return manifest + + +# Tar File read +TarLocalData = namedtuple('TarLocalData', ['tar2info', 'tar2object']) + + +def parse_tar(file): + """Parse a tar file to get a tarfile object + and a map containing tarinfoes + """ + result = {} + f = tarfile.open(file) + for tarinfo in f.getmembers(): + result[tarinfo.name] = tarinfo + return f, result + + +def subfile_from_tar(file, local_data=None): + """Get subfile object from tar. + + tar:tarpath#filename + + It will return a subfile object from tar file + and cached tar file info for next reading request. + """ + tarpath, filename = file.split(':', 1)[1].split('#', 1) + + if local_data is None: + local_data = TarLocalData(tar2info={}, tar2object={}) + + assert isinstance(local_data, TarLocalData) + + if 'tar2info' not in local_data.__dict__: + local_data.tar2info = {} + if 'tar2object' not in local_data.__dict__: + local_data.tar2object = {} + + if tarpath not in local_data.tar2info: + fobj, infos = parse_tar(tarpath) + local_data.tar2info[tarpath] = infos + local_data.tar2object[tarpath] = fobj + else: + fobj = local_data.tar2object[tarpath] + infos = local_data.tar2info[tarpath] + return fobj.extractfile(infos[filename]) + + +def rms_to_db(rms: float): + """Root Mean Square to dB. + + Args: + rms ([float]): root mean square + + Returns: + float: dB + """ + return 20.0 * math.log10(max(1e-16, rms)) + + +def rms_to_dbfs(rms: float): + """Root Mean Square to dBFS. + https://fireattack.wordpress.com/2017/02/06/replaygain-loudness-normalization-and-applications/ + Audio is mix of sine wave, so 1 amp sine wave's Full scale is 0.7071, equal to -3.0103dB. + + dB = dBFS + 3.0103 + dBFS = db - 3.0103 + e.g. 0 dB = -3.0103 dBFS + + Args: + rms ([float]): root mean square + + Returns: + float: dBFS + """ + return rms_to_db(rms) - 3.0103 + + +def max_dbfs(sample_data: np.ndarray): + """Peak dBFS based on the maximum energy sample. + + Args: + sample_data ([np.ndarray]): float array, [-1, 1]. + + Returns: + float: dBFS + """ + # Peak dBFS based on the maximum energy sample. Will prevent overdrive if used for normalization. + return rms_to_dbfs(max(abs(np.min(sample_data)), abs(np.max(sample_data)))) + + +def mean_dbfs(sample_data): + """Peak dBFS based on the RMS energy. + + Args: + sample_data ([np.ndarray]): float array, [-1, 1]. + + Returns: + float: dBFS + """ + return rms_to_dbfs( + math.sqrt(np.mean(np.square(sample_data, dtype=np.float64)))) + + +def gain_db_to_ratio(gain_db: float): + """dB to ratio + + Args: + gain_db (float): gain in dB + + Returns: + float: scale in amp + """ + return math.pow(10.0, gain_db / 20.0) + + +def normalize_audio(sample_data: np.ndarray, dbfs: float=-3.0103): + """Nomalize audio to dBFS. + + Args: + sample_data (np.ndarray): input wave samples, [-1, 1]. + dbfs (float, optional): target dBFS. Defaults to -3.0103. 
+ + Returns: + np.ndarray: normalized wave + """ + return np.maximum( + np.minimum(sample_data * gain_db_to_ratio(dbfs - max_dbfs(sample_data)), + 1.0), -1.0) + + +def _load_json_cmvn(json_cmvn_file): + """ Load the json format cmvn stats file and calculate cmvn + + Args: + json_cmvn_file: cmvn stats file in json format + + Returns: + a numpy array of [means, vars] + """ + with open(json_cmvn_file) as f: + cmvn_stats = json.load(f) + + means = cmvn_stats['mean_stat'] + variance = cmvn_stats['var_stat'] + count = cmvn_stats['frame_num'] + for i in range(len(means)): + means[i] /= count + variance[i] = variance[i] / count - means[i] * means[i] + if variance[i] < 1.0e-20: + variance[i] = 1.0e-20 + variance[i] = 1.0 / math.sqrt(variance[i]) + cmvn = np.array([means, variance]) + return cmvn + + +def _load_kaldi_cmvn(kaldi_cmvn_file): + """ Load the kaldi format cmvn stats file and calculate cmvn + + Args: + kaldi_cmvn_file: kaldi text style global cmvn file, which + is generated by: + compute-cmvn-stats --binary=false scp:feats.scp global_cmvn + + Returns: + a numpy array of [means, vars] + """ + means = [] + variance = [] + with open(kaldi_cmvn_file, 'r') as fid: + # kaldi binary file start with '\0B' + if fid.read(2) == '\0B': + logger.error('kaldi cmvn binary file is not supported, please ' + 'recompute it by: compute-cmvn-stats --binary=false ' + ' scp:feats.scp global_cmvn') + sys.exit(1) + fid.seek(0) + arr = fid.read().split() + assert (arr[0] == '[') + assert (arr[-2] == '0') + assert (arr[-1] == ']') + feat_dim = int((len(arr) - 2 - 2) / 2) + for i in range(1, feat_dim + 1): + means.append(float(arr[i])) + count = float(arr[feat_dim + 1]) + for i in range(feat_dim + 2, 2 * feat_dim + 2): + variance.append(float(arr[i])) + + for i in range(len(means)): + means[i] /= count + variance[i] = variance[i] / count - means[i] * means[i] + if variance[i] < 1.0e-20: + variance[i] = 1.0e-20 + variance[i] = 1.0 / math.sqrt(variance[i]) + cmvn = np.array([means, variance]) + return cmvn + + +def load_cmvn(cmvn_file: str, filetype: str): + """load cmvn from file. + + Args: + cmvn_file (str): cmvn path. + filetype (str): file type, optional[npz, json, kaldi]. + + Raises: + ValueError: file type not support. + + Returns: + Tuple[np.ndarray, np.ndarray]: mean, istd + """ + assert filetype in ['npz', 'json', 'kaldi'], filetype + filetype = filetype.lower() + if filetype == "json": + cmvn = _load_json_cmvn(cmvn_file) + elif filetype == "kaldi": + cmvn = _load_kaldi_cmvn(cmvn_file) + elif filetype == "npz": + eps = 1e-14 + npzfile = np.load(cmvn_file) + mean = np.squeeze(npzfile["mean"]) + std = np.squeeze(npzfile["std"]) + istd = 1 / (std + eps) + cmvn = [mean, istd] + else: + raise ValueError(f"cmvn file type no support: {filetype}") + return cmvn[0], cmvn[1] + + +def convert_samples_to_float32(samples): + """Convert sample type to float32. + + Audio sample type is usually integer or float-point. + Integers will be scaled to [-1, 1] in float32. + + PCM16 -> PCM32 + """ + float32_samples = samples.astype('float32') + if samples.dtype in np.sctypes['int']: + bits = np.iinfo(samples.dtype).bits + float32_samples *= (1. / 2**(bits - 1)) + elif samples.dtype in np.sctypes['float']: + pass + else: + raise TypeError("Unsupported sample type: %s." % samples.dtype) + return float32_samples + + +def convert_samples_from_float32(samples, dtype): + """Convert sample type from float32 to dtype. + + Audio sample type is usually integer or float-point. 
For integer + type, float32 will be rescaled from [-1, 1] to the maximum range + supported by the integer type. + + PCM32 -> PCM16 + """ + dtype = np.dtype(dtype) + output_samples = samples.copy() + if dtype in np.sctypes['int']: + bits = np.iinfo(dtype).bits + output_samples *= (2**(bits - 1) / 1.) + min_val = np.iinfo(dtype).min + max_val = np.iinfo(dtype).max + output_samples[output_samples > max_val] = max_val + output_samples[output_samples < min_val] = min_val + elif samples.dtype in np.sctypes['float']: + min_val = np.finfo(dtype).min + max_val = np.finfo(dtype).max + output_samples[output_samples > max_val] = max_val + output_samples[output_samples < min_val] = min_val + else: + raise TypeError("Unsupported sample type: %s." % samples.dtype) + return output_samples.astype(dtype) diff --git a/ernie-sat/paddlespeech/s2t/io/__init__.py b/ernie-sat/paddlespeech/s2t/io/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..185a92b8d94d3426d616c0624f0f2ee04339349e --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/io/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/ernie-sat/paddlespeech/s2t/io/batchfy.py b/ernie-sat/paddlespeech/s2t/io/batchfy.py new file mode 100644 index 0000000000000000000000000000000000000000..f3630f2e38bd647a1213f3f515f101fa554d7f59 --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/io/batchfy.py @@ -0,0 +1,470 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# Modified from espnet(https://github.com/espnet/espnet) +import itertools + +import numpy as np + +from paddlespeech.s2t.utils.log import Log + +__all__ = ["make_batchset"] + +logger = Log(__name__).getlog() + + +def batchfy_by_seq( + sorted_data, + batch_size, + max_length_in, + max_length_out, + min_batch_size=1, + shortest_first=False, + ikey="input", + iaxis=0, + okey="output", + oaxis=0, ): + """Make batch set from json dictionary + + :param List[(str, Dict[str, Any])] sorted_data: dictionary loaded from data.json + :param int batch_size: batch size + :param int max_length_in: maximum length of input to decide adaptive batch size + :param int max_length_out: maximum length of output to decide adaptive batch size + :param int min_batch_size: mininum batch size (for multi-gpu) + :param bool shortest_first: Sort from batch with shortest samples + to longest if true, otherwise reverse + :param str ikey: key to access input + (for ASR ikey="input", for TTS, MT ikey="output".) + :param int iaxis: dimension to access input + (for ASR, TTS iaxis=0, for MT iaxis="1".) + :param str okey: key to access output + (for ASR, MT okey="output". for TTS okey="input".) + :param int oaxis: dimension to access output + (for ASR, TTS, MT oaxis=0, reserved for future research, -1 means all axis.) + :return: List[List[Tuple[str, dict]]] list of batches + """ + if batch_size <= 0: + raise ValueError(f"Invalid batch_size={batch_size}") + + # check #utts is more than min_batch_size + if len(sorted_data) < min_batch_size: + raise ValueError( + f"#utts({len(sorted_data)}) is less than min_batch_size({min_batch_size})." + ) + + # make list of minibatches + minibatches = [] + start = 0 + while True: + _, info = sorted_data[start] + ilen = int(info[ikey][iaxis]["shape"][0]) + olen = (int(info[okey][oaxis]["shape"][0]) if oaxis >= 0 else + max(map(lambda x: int(x["shape"][0]), info[okey]))) + factor = max(int(ilen / max_length_in), int(olen / max_length_out)) + # change batchsize depending on the input and output length + # if ilen = 1000 and max_length_in = 800 + # then b = batchsize / 2 + # and max(min_batches, .) avoids batchsize = 0 + bs = max(min_batch_size, int(batch_size / (1 + factor))) + end = min(len(sorted_data), start + bs) + minibatch = sorted_data[start:end] + if shortest_first: + minibatch.reverse() + + # check each batch is more than minimum batchsize + if len(minibatch) < min_batch_size: + mod = min_batch_size - len(minibatch) % min_batch_size + additional_minibatch = [ + sorted_data[i] for i in np.random.randint(0, start, mod) + ] + if shortest_first: + additional_minibatch.reverse() + minibatch.extend(additional_minibatch) + minibatches.append(minibatch) + + if end == len(sorted_data): + break + start = end + + # batch: List[List[Tuple[str, dict]]] + return minibatches + + +def batchfy_by_bin( + sorted_data, + batch_bins, + num_batches=0, + min_batch_size=1, + shortest_first=False, + ikey="input", + okey="output", ): + """Make variably sized batch set, which maximizes + + the number of bins up to `batch_bins`. 
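+ Here one "bin" is one frame of one feature dimension; the batch keeps
+ growing while (longest output so far + current input, both counted in
+ bins) * batch size stays within `batch_bins`.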
+ + :param List[(str, Dict[str, Any])] sorted_data: dictionary loaded from data.json + :param int batch_bins: Maximum frames of a batch + :param int num_batches: # number of batches to use (for debug) + :param int min_batch_size: minimum batch size (for multi-gpu) + :param int test: Return only every `test` batches + :param bool shortest_first: Sort from batch with shortest samples + to longest if true, otherwise reverse + + :param str ikey: key to access input (for ASR ikey="input", for TTS ikey="output".) + :param str okey: key to access output (for ASR okey="output". for TTS okey="input".) + + :return: List[Tuple[str, Dict[str, List[Dict[str, Any]]]] list of batches + """ + if batch_bins <= 0: + raise ValueError(f"invalid batch_bins={batch_bins}") + length = len(sorted_data) + idim = int(sorted_data[0][1][ikey][0]["shape"][1]) + odim = int(sorted_data[0][1][okey][0]["shape"][1]) + logger.info("# utts: " + str(len(sorted_data))) + minibatches = [] + start = 0 + n = 0 + while True: + # Dynamic batch size depending on size of samples + b = 0 + next_size = 0 + max_olen = 0 + while next_size < batch_bins and (start + b) < length: + ilen = int(sorted_data[start + b][1][ikey][0]["shape"][0]) * idim + olen = int(sorted_data[start + b][1][okey][0]["shape"][0]) * odim + if olen > max_olen: + max_olen = olen + next_size = (max_olen + ilen) * (b + 1) + if next_size <= batch_bins: + b += 1 + elif next_size == 0: + raise ValueError( + f"Can't fit one sample in batch_bins ({batch_bins}): " + f"Please increase the value") + end = min(length, start + max(min_batch_size, b)) + batch = sorted_data[start:end] + if shortest_first: + batch.reverse() + minibatches.append(batch) + # Check for min_batch_size and fixes the batches if needed + i = -1 + while len(minibatches[i]) < min_batch_size: + missing = min_batch_size - len(minibatches[i]) + if -i == len(minibatches): + minibatches[i + 1].extend(minibatches[i]) + minibatches = minibatches[1:] + break + else: + minibatches[i].extend(minibatches[i - 1][:missing]) + minibatches[i - 1] = minibatches[i - 1][missing:] + i -= 1 + if end == length: + break + start = end + n += 1 + if num_batches > 0: + minibatches = minibatches[:num_batches] + lengths = [len(x) for x in minibatches] + logger.info( + str(len(minibatches)) + " batches containing from " + str(min(lengths)) + + " to " + str(max(lengths)) + " samples " + "(avg " + str( + int(np.mean(lengths))) + " samples).") + return minibatches + + +def batchfy_by_frame( + sorted_data, + max_frames_in, + max_frames_out, + max_frames_inout, + num_batches=0, + min_batch_size=1, + shortest_first=False, + ikey="input", + okey="output", ): + """Make variable batch set, which maximizes the number of frames to max_batch_frame. + + :param List[(str, Dict[str, Any])] sorteddata: dictionary loaded from data.json + :param int max_frames_in: Maximum input frames of a batch + :param int max_frames_out: Maximum output frames of a batch + :param int max_frames_inout: Maximum input+output frames of a batch + :param int num_batches: # number of batches to use (for debug) + :param int min_batch_size: minimum batch size (for multi-gpu) + :param int test: Return only every `test` batches + :param bool shortest_first: Sort from batch with shortest samples + to longest if true, otherwise reverse + + :param str ikey: key to access input (for ASR ikey="input", for TTS ikey="output".) + :param str okey: key to access output (for ASR okey="output". for TTS okey="input".) 
+ + :return: List[Tuple[str, Dict[str, List[Dict[str, Any]]]] list of batches + """ + if max_frames_in <= 0 and max_frames_out <= 0 and max_frames_inout <= 0: + raise ValueError( + "At least, one of `--batch-frames-in`, `--batch-frames-out` or " + "`--batch-frames-inout` should be > 0") + length = len(sorted_data) + minibatches = [] + start = 0 + end = 0 + while end != length: + # Dynamic batch size depending on size of samples + b = 0 + max_olen = 0 + max_ilen = 0 + while (start + b) < length: + ilen = int(sorted_data[start + b][1][ikey][0]["shape"][0]) + if ilen > max_frames_in and max_frames_in != 0: + raise ValueError( + f"Can't fit one sample in --batch-frames-in ({max_frames_in}): " + f"Please increase the value") + olen = int(sorted_data[start + b][1][okey][0]["shape"][0]) + if olen > max_frames_out and max_frames_out != 0: + raise ValueError( + f"Can't fit one sample in --batch-frames-out ({max_frames_out}): " + f"Please increase the value") + if ilen + olen > max_frames_inout and max_frames_inout != 0: + raise ValueError( + f"Can't fit one sample in --batch-frames-out ({max_frames_inout}): " + f"Please increase the value") + max_olen = max(max_olen, olen) + max_ilen = max(max_ilen, ilen) + in_ok = max_ilen * (b + 1) <= max_frames_in or max_frames_in == 0 + out_ok = max_olen * (b + 1) <= max_frames_out or max_frames_out == 0 + inout_ok = (max_ilen + max_olen) * ( + b + 1) <= max_frames_inout or max_frames_inout == 0 + if in_ok and out_ok and inout_ok: + # add more seq in the minibatch + b += 1 + else: + # no more seq in the minibatch + break + end = min(length, start + b) + batch = sorted_data[start:end] + if shortest_first: + batch.reverse() + minibatches.append(batch) + # Check for min_batch_size and fixes the batches if needed + i = -1 + while len(minibatches[i]) < min_batch_size: + missing = min_batch_size - len(minibatches[i]) + if -i == len(minibatches): + minibatches[i + 1].extend(minibatches[i]) + minibatches = minibatches[1:] + break + else: + minibatches[i].extend(minibatches[i - 1][:missing]) + minibatches[i - 1] = minibatches[i - 1][missing:] + i -= 1 + start = end + if num_batches > 0: + minibatches = minibatches[:num_batches] + lengths = [len(x) for x in minibatches] + logger.info( + str(len(minibatches)) + " batches containing from " + str(min(lengths)) + + " to " + str(max(lengths)) + " samples" + "(avg " + str( + int(np.mean(lengths))) + " samples).") + + return minibatches + + +def batchfy_shuffle(data, batch_size, min_batch_size, num_batches, + shortest_first): + import random + + logger.info("use shuffled batch.") + sorted_data = random.sample(data.items(), len(data.items())) + logger.info("# utts: " + str(len(sorted_data))) + # make list of minibatches + minibatches = [] + start = 0 + while True: + end = min(len(sorted_data), start + batch_size) + # check each batch is more than minimum batchsize + minibatch = sorted_data[start:end] + if shortest_first: + minibatch.reverse() + if len(minibatch) < min_batch_size: + mod = min_batch_size - len(minibatch) % min_batch_size + additional_minibatch = [ + sorted_data[i] for i in np.random.randint(0, start, mod) + ] + if shortest_first: + additional_minibatch.reverse() + minibatch.extend(additional_minibatch) + minibatches.append(minibatch) + if end == len(sorted_data): + break + start = end + + # for debugging + if num_batches > 0: + minibatches = minibatches[:num_batches] + logger.info("# minibatches: " + str(len(minibatches))) + return minibatches + + +BATCH_COUNT_CHOICES = ["auto", "seq", "bin", "frame"] 
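+# When count="auto", make_batchset() below picks the strategy from whichever
+# limit is set: batch_size -> "seq", batch_bins -> "bin",
+# batch_frames_in/out/inout -> "frame".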
+BATCH_SORT_KEY_CHOICES = ["input", "output", "shuffle"] + + +def make_batchset( + data, + batch_size=0, + max_length_in=float("inf"), + max_length_out=float("inf"), + num_batches=0, + min_batch_size=1, + shortest_first=False, + batch_sort_key="input", + count="auto", + batch_bins=0, + batch_frames_in=0, + batch_frames_out=0, + batch_frames_inout=0, + iaxis=0, + oaxis=0, ): + """Make batch set from json dictionary + + if utts have "category" value, + + >>> data = [{'category': 'A', 'input': ..., 'utt':'utt1'}, + ... {'category': 'B', 'input': ..., 'utt':'utt2'}, + ... {'category': 'B', 'input': ..., 'utt':'utt3'}, + ... {'category': 'A', 'input': ..., 'utt':'utt4'}] + >>> make_batchset(data, batchsize=2, ...) + [[('utt1', ...), ('utt4', ...)], [('utt2', ...), ('utt3': ...)]] + + Note that if any utts doesn't have "category", + perform as same as batchfy_by_{count} + + :param List[Dict[str, Any]] data: dictionary loaded from data.json + :param int batch_size: maximum number of sequences in a minibatch. + :param int batch_bins: maximum number of bins (frames x dim) in a minibatch. + :param int batch_frames_in: maximum number of input frames in a minibatch. + :param int batch_frames_out: maximum number of output frames in a minibatch. + :param int batch_frames_out: maximum number of input+output frames in a minibatch. + :param str count: strategy to count maximum size of batch. + For choices, see io.batchfy.BATCH_COUNT_CHOICES + + :param int max_length_in: maximum length of input to decide adaptive batch size + :param int max_length_out: maximum length of output to decide adaptive batch size + :param int num_batches: # number of batches to use (for debug) + :param int min_batch_size: minimum batch size (for multi-gpu) + :param bool shortest_first: Sort from batch with shortest samples + to longest if true, otherwise reverse + :param str batch_sort_key: how to sort data before creating minibatches + ["input", "output", "shuffle"] + :param bool swap_io: if True, use "input" as output and "output" + as input in `data` dict + :param bool mt: if True, use 0-axis of "output" as output and 1-axis of "output" + as input in `data` dict + :param int iaxis: dimension to access input + (for ASR, TTS iaxis=0, for MT iaxis="1".) + :param int oaxis: dimension to access output (for ASR, TTS, MT oaxis=0, + reserved for future research, -1 means all axis.) 
+ :return: List[List[Tuple[str, dict]]] list of batches + """ + # check args + if count not in BATCH_COUNT_CHOICES: + raise ValueError( + f"arg 'count' ({count}) should be one of {BATCH_COUNT_CHOICES}") + if batch_sort_key not in BATCH_SORT_KEY_CHOICES: + raise ValueError(f"arg 'batch_sort_key' ({batch_sort_key}) should be " + f"one of {BATCH_SORT_KEY_CHOICES}") + + ikey = "input" + okey = "output" + batch_sort_axis = 0 # index of list + if count == "auto": + if batch_size != 0: + count = "seq" + elif batch_bins != 0: + count = "bin" + elif batch_frames_in != 0 or batch_frames_out != 0 or batch_frames_inout != 0: + count = "frame" + else: + raise ValueError( + f"cannot detect `count` manually set one of {BATCH_COUNT_CHOICES}" + ) + logger.info(f"count is auto detected as {count}") + + if count != "seq" and batch_sort_key == "shuffle": + raise ValueError( + "batch_sort_key=shuffle is only available if batch_count=seq") + + category2data = {} # Dict[str, dict] + for v in data: + k = v['utt'] + category2data.setdefault(v.get("category"), {})[k] = v + + batches_list = [] # List[List[List[Tuple[str, dict]]]] + for d in category2data.values(): + if batch_sort_key == "shuffle": + batches = batchfy_shuffle(d, batch_size, min_batch_size, + num_batches, shortest_first) + batches_list.append(batches) + continue + + # sort it by input lengths (long to short) + sorted_data = sorted( + d.items(), + key=lambda data: float(data[1][batch_sort_key][batch_sort_axis]["shape"][0]), + reverse=not shortest_first, ) + logger.info("# utts: " + str(len(sorted_data))) + + if count == "seq": + batches = batchfy_by_seq( + sorted_data, + batch_size=batch_size, + max_length_in=max_length_in, + max_length_out=max_length_out, + min_batch_size=min_batch_size, + shortest_first=shortest_first, + ikey=ikey, + iaxis=iaxis, + okey=okey, + oaxis=oaxis, ) + if count == "bin": + batches = batchfy_by_bin( + sorted_data, + batch_bins=batch_bins, + min_batch_size=min_batch_size, + shortest_first=shortest_first, + ikey=ikey, + okey=okey, ) + if count == "frame": + batches = batchfy_by_frame( + sorted_data, + max_frames_in=batch_frames_in, + max_frames_out=batch_frames_out, + max_frames_inout=batch_frames_inout, + min_batch_size=min_batch_size, + shortest_first=shortest_first, + ikey=ikey, + okey=okey, ) + batches_list.append(batches) + + if len(batches_list) == 1: + batches = batches_list[0] + else: + # Concat list. This way is faster than "sum(batch_list, [])" + batches = list(itertools.chain(*batches_list)) + + # for debugging + if num_batches > 0: + batches = batches[:num_batches] + logger.info("# minibatches: " + str(len(batches))) + + # batch: List[List[Tuple[str, dict]]] + return batches diff --git a/ernie-sat/paddlespeech/s2t/io/collator.py b/ernie-sat/paddlespeech/s2t/io/collator.py new file mode 100644 index 0000000000000000000000000000000000000000..b99fc80c023f72e401726fc347066c5bd569b40a --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/io/collator.py @@ -0,0 +1,347 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +import io + +import numpy as np + +from paddlespeech.s2t.frontend.augmentor.augmentation import AugmentationPipeline +from paddlespeech.s2t.frontend.featurizer.speech_featurizer import SpeechFeaturizer +from paddlespeech.s2t.frontend.normalizer import FeatureNormalizer +from paddlespeech.s2t.frontend.speech import SpeechSegment +from paddlespeech.s2t.frontend.utility import IGNORE_ID +from paddlespeech.s2t.frontend.utility import TarLocalData +from paddlespeech.s2t.io.reader import LoadInputsAndTargets +from paddlespeech.s2t.io.utility import pad_list +from paddlespeech.s2t.utils.log import Log + +__all__ = ["SpeechCollator", "TripletSpeechCollator"] + +logger = Log(__name__).getlog() + + +def _tokenids(text, keep_transcription_text): + # for training text is token ids + tokens = text # token ids + + if keep_transcription_text: + # text is string, convert to unicode ord + assert isinstance(text, str), (type(text), text) + tokens = [ord(t) for t in text] + + tokens = np.array(tokens, dtype=np.int64) + return tokens + + +class SpeechCollatorBase(): + def __init__( + self, + aug_file, + mean_std_filepath, + vocab_filepath, + spm_model_prefix, + random_seed=0, + unit_type="char", + spectrum_type='linear', # 'linear', 'mfcc', 'fbank' + feat_dim=0, # 'mfcc', 'fbank' + delta_delta=False, # 'mfcc', 'fbank' + stride_ms=10.0, # ms + window_ms=20.0, # ms + n_fft=None, # fft points + max_freq=None, # None for samplerate/2 + target_sample_rate=16000, # target sample rate + use_dB_normalization=True, + target_dB=-20, + dither=1.0, + keep_transcription_text=True): + """SpeechCollator Collator + + Args: + unit_type(str): token unit type, e.g. char, word, spm + vocab_filepath (str): vocab file path. + mean_std_filepath (str): mean and std file path, which suffix is *.npy + spm_model_prefix (str): spm model prefix, need if `unit_type` is spm. + augmentation_config (str, optional): augmentation json str. Defaults to '{}'. + stride_ms (float, optional): stride size in ms. Defaults to 10.0. + window_ms (float, optional): window size in ms. Defaults to 20.0. + n_fft (int, optional): fft points for rfft. Defaults to None. + max_freq (int, optional): max cut freq. Defaults to None. + target_sample_rate (int, optional): target sample rate which used for training. Defaults to 16000. + spectrum_type (str, optional): 'linear', 'mfcc' or 'fbank'. Defaults to 'linear'. + feat_dim (int, optional): audio feature dim, using by 'mfcc' or 'fbank'. Defaults to None. + delta_delta (bool, optional): audio feature with delta-delta, using by 'fbank' or 'mfcc'. Defaults to False. + use_dB_normalization (bool, optional): do dB normalization. Defaults to True. + target_dB (int, optional): target dB. Defaults to -20. + random_seed (int, optional): for random generator. Defaults to 0. + keep_transcription_text (bool, optional): True, when not in training mode, will not do tokenizer; Defaults to False. + if ``keep_transcription_text`` is False, text is token ids else is raw string. + + Do augmentations + Padding audio features with zeros to make them have the same shape (or + a user-defined shape) within one batch. 
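+
+ In practice a collator is usually built from a config object via
+ SpeechCollator.from_config(config) (defined below) rather than
+ constructed directly.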
+ """ + self.keep_transcription_text = keep_transcription_text + self.train_mode = not keep_transcription_text + + self.stride_ms = stride_ms + self.window_ms = window_ms + self.feat_dim = feat_dim + + self.loader = LoadInputsAndTargets() + + # only for tar filetype + self._local_data = TarLocalData(tar2info={}, tar2object={}) + + self.augmentation = AugmentationPipeline( + preprocess_conf=aug_file.read(), random_seed=random_seed) + + self._normalizer = FeatureNormalizer( + mean_std_filepath) if mean_std_filepath else None + + self._speech_featurizer = SpeechFeaturizer( + unit_type=unit_type, + vocab_filepath=vocab_filepath, + spm_model_prefix=spm_model_prefix, + spectrum_type=spectrum_type, + feat_dim=feat_dim, + delta_delta=delta_delta, + stride_ms=stride_ms, + window_ms=window_ms, + n_fft=n_fft, + max_freq=max_freq, + target_sample_rate=target_sample_rate, + use_dB_normalization=use_dB_normalization, + target_dB=target_dB, + dither=dither) + + self.feature_size = self._speech_featurizer.audio_feature.feature_size + self.text_feature = self._speech_featurizer.text_feature + self.vocab_dict = self.text_feature.vocab_dict + self.vocab_list = self.text_feature.vocab_list + self.vocab_size = self.text_feature.vocab_size + + def process_utterance(self, audio_file, transcript): + """Load, augment, featurize and normalize for speech data. + + :param audio_file: Filepath or file object of audio file. + :type audio_file: str | file + :param transcript: Transcription text. + :type transcript: str + :return: Tuple of audio feature tensor and data of transcription part, + where transcription part could be token ids or text. + :rtype: tuple of (2darray, list) + """ + filetype = self.loader.file_type(audio_file) + + if filetype != 'sound': + spectrum = self.loader._get_from_loader(audio_file, filetype) + feat_dim = spectrum.shape[1] + assert feat_dim == self.feat_dim, f"expect feat dim {self.feat_dim}, but got {feat_dim}" + + if self.keep_transcription_text: + transcript_part = transcript + else: + text_ids = self.text_feature.featurize(transcript) + transcript_part = text_ids + else: + # read audio + speech_segment = SpeechSegment.from_file( + audio_file, transcript, infos=self._local_data) + # audio augment + self.augmentation.transform_audio(speech_segment) + + # extract speech feature + spectrum, transcript_part = self._speech_featurizer.featurize( + speech_segment, self.keep_transcription_text) + # CMVN spectrum + if self._normalizer: + spectrum = self._normalizer.apply(spectrum) + + # spectrum augment + spectrum = self.augmentation.transform_feature(spectrum) + return spectrum, transcript_part + + def __call__(self, batch): + """batch examples + + Args: + batch (List[Dict]): batch is [dict(audio, text, ...)] + audio (np.ndarray) shape (T, D) + text (List[int] or str): shape (U,) + + Returns: + tuple(utts, xs_pad, ilens, ys_pad, olens): batched data. 
+ utts: (B,) + xs_pad : (B, Tmax, D) + ilens: (B,) + ys_pad : (B, Umax) + olens: (B,) + """ + audios = [] + audio_lens = [] + texts = [] + text_lens = [] + utts = [] + tids = [] # tokenids + + for idx, item in enumerate(batch): + utts.append(item['utt']) + + audio = item['input'][0]['feat'] + text = item['output'][0]['text'] + audio, text = self.process_utterance(audio, text) + + audios.append(audio) # [T, D] + audio_lens.append(audio.shape[0]) + + tokens = _tokenids(text, self.keep_transcription_text) + texts.append(tokens) + text_lens.append(tokens.shape[0]) + + #[B, T, D] + xs_pad = pad_list(audios, 0.0).astype(np.float32) + ilens = np.array(audio_lens).astype(np.int64) + ys_pad = pad_list(texts, IGNORE_ID).astype(np.int64) + olens = np.array(text_lens).astype(np.int64) + return utts, xs_pad, ilens, ys_pad, olens + + +class SpeechCollator(SpeechCollatorBase): + @classmethod + def from_config(cls, config): + """Build a SpeechCollator object from a config. + + Args: + config (yacs.config.CfgNode): configs object. + + Returns: + SpeechCollator: collator object. + """ + assert 'augmentation_config' in config + assert 'keep_transcription_text' in config + assert 'mean_std_filepath' in config + assert 'vocab_filepath' in config + assert 'spectrum_type' in config + assert 'n_fft' in config + assert config + + if isinstance(config.augmentation_config, (str, bytes)): + if config.augmentation_config: + aug_file = io.open( + config.augmentation_config, mode='r', encoding='utf8') + else: + aug_file = io.StringIO(initial_value='{}', newline='') + else: + aug_file = config.augmentation_config + assert isinstance(aug_file, io.StringIO) + + speech_collator = cls( + aug_file=aug_file, + random_seed=0, + mean_std_filepath=config.mean_std_filepath, + unit_type=config.unit_type, + vocab_filepath=config.vocab_filepath, + spm_model_prefix=config.spm_model_prefix, + spectrum_type=config.spectrum_type, + feat_dim=config.feat_dim, + delta_delta=config.delta_delta, + stride_ms=config.stride_ms, + window_ms=config.window_ms, + n_fft=config.n_fft, + max_freq=config.max_freq, + target_sample_rate=config.target_sample_rate, + use_dB_normalization=config.use_dB_normalization, + target_dB=config.target_dB, + dither=config.dither, + keep_transcription_text=config.keep_transcription_text) + return speech_collator + + +class TripletSpeechCollator(SpeechCollator): + def process_utterance(self, audio_file, translation, transcript): + """Load, augment, featurize and normalize for speech data. + + :param audio_file: Filepath or file object of audio file. + :type audio_file: str | file + :param translation: translation text. + :type translation: str + :return: Tuple of audio feature tensor and data of translation part, + where translation part could be token ids or text. + :rtype: tuple of (2darray, list) + """ + spectrum, translation_part = super().process_utterance(audio_file, + translation) + transcript_part = self._speech_featurizer.text_featurize( + transcript, self.keep_transcription_text) + return spectrum, translation_part, transcript_part + + def __call__(self, batch): + """batch examples + + Args: + batch (List[Dict]): batch is [dict(audio, text, ...)] + audio (np.ndarray) shape (T, D) + text (List[int] or str): shape (U,) + + Returns: + tuple(utts, xs_pad, ilens, ys_pad, olens): batched data. 
+ utts: (B,) + xs_pad : (B, Tmax, D) + ilens: (B,) + ys_pad : [(B, Umax), (B, Umax)] + olens: [(B,), (B,)] + """ + utts = [] + audios = [] + audio_lens = [] + translation_text = [] + translation_text_lens = [] + transcription_text = [] + transcription_text_lens = [] + + for idx, item in enumerate(batch): + utts.append(item['utt']) + + audio = item['input'][0]['feat'] + translation = item['output'][0]['text'] + transcription = item['output'][1]['text'] + + audio, translation, transcription = self.process_utterance( + audio, translation, transcription) + + audios.append(audio) # [T, D] + audio_lens.append(audio.shape[0]) + + tokens = [[], []] + for idx, text in enumerate([translation, transcription]): + tokens[idx] = _tokenids(text, self.keep_transcription_text) + + translation_text.append(tokens[0]) + translation_text_lens.append(tokens[0].shape[0]) + transcription_text.append(tokens[1]) + transcription_text_lens.append(tokens[1].shape[0]) + + xs_pad = pad_list(audios, 0.0).astype(np.float32) #[B, T, D] + ilens = np.array(audio_lens).astype(np.int64) + + padded_translation = pad_list(translation_text, + IGNORE_ID).astype(np.int64) + translation_lens = np.array(translation_text_lens).astype(np.int64) + + padded_transcription = pad_list(transcription_text, + IGNORE_ID).astype(np.int64) + transcription_lens = np.array(transcription_text_lens).astype(np.int64) + + ys_pad = (padded_translation, padded_transcription) + olens = (translation_lens, transcription_lens) + return utts, xs_pad, ilens, ys_pad, olens diff --git a/ernie-sat/paddlespeech/s2t/io/converter.py b/ernie-sat/paddlespeech/s2t/io/converter.py new file mode 100644 index 0000000000000000000000000000000000000000..a802ac7490ce57bed3529ce78e3eb6112e3dd492 --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/io/converter.py @@ -0,0 +1,107 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from espnet(https://github.com/espnet/espnet) +import numpy as np + +from paddlespeech.s2t.io.utility import pad_list +from paddlespeech.s2t.utils.log import Log + +__all__ = ["CustomConverter"] + +logger = Log(__name__).getlog() + + +class CustomConverter(): + """Custom batch converter. + + Args: + subsampling_factor (int): The subsampling factor. + dtype (np.dtype): Data type to convert. + + """ + + def __init__(self, + subsampling_factor=1, + dtype=np.float32, + load_aux_input=False, + load_aux_output=False): + """Construct a CustomConverter object.""" + self.subsampling_factor = subsampling_factor + self.ignore_id = -1 + self.dtype = dtype + self.load_aux_input = load_aux_input + self.load_aux_output = load_aux_output + + def __call__(self, batch): + """Transform a batch and send it to a device. + + Args: + batch (list): The batch to transform. 
+ + Returns: + tuple(np.ndarray, nn.ndarray, nn.ndarray) + + """ + # batch should be located in list + assert len(batch) == 1 + data, utts = batch[0] + xs_data, ys_data = [], [] + for ud in data: + if ud[0].ndim > 1: + # speech data (input): (speech_len, feat_dim) + xs_data.append(ud) + else: + # text data (output): (text_len, ) + ys_data.append(ud) + + assert xs_data[0][ + 0] is not None, "please check Reader and Augmentation impl." + + xs_pad, ilens = [], [] + for xs in xs_data: + # perform subsampling + if self.subsampling_factor > 1: + xs = [x[::self.subsampling_factor, :] for x in xs] + + # get batch of lengths of input sequences + ilens.append(np.array([x.shape[0] for x in xs])) + + # perform padding and convert to tensor + # currently only support real number + xs_pad.append(pad_list(xs, 0).astype(self.dtype)) + + if not self.load_aux_input: + xs_pad, ilens = xs_pad[0], ilens[0] + break + + # NOTE: this is for multi-output (e.g., speech translation) + ys_pad, olens = [], [] + + for ys in ys_data: + ys_pad.append( + pad_list([ + np.array(y[0][:]) if isinstance(y, tuple) else y for y in ys + ], self.ignore_id)) + + olens.append( + np.array([ + y[0].shape[0] if isinstance(y, tuple) else y.shape[0] + for y in ys + ])) + + if not self.load_aux_output: + ys_pad, olens = ys_pad[0], olens[0] + break + + return utts, xs_pad, ilens, ys_pad, olens diff --git a/ernie-sat/paddlespeech/s2t/io/dataloader.py b/ernie-sat/paddlespeech/s2t/io/dataloader.py new file mode 100644 index 0000000000000000000000000000000000000000..55aa13ff10a1e34b3063a760544fdd33b150f61f --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/io/dataloader.py @@ -0,0 +1,201 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Any +from typing import Dict +from typing import List +from typing import Text + +import jsonlines +import numpy as np +from paddle.io import BatchSampler +from paddle.io import DataLoader +from paddle.io import DistributedBatchSampler + +from paddlespeech.s2t.io.batchfy import make_batchset +from paddlespeech.s2t.io.converter import CustomConverter +from paddlespeech.s2t.io.dataset import TransformDataset +from paddlespeech.s2t.io.reader import LoadInputsAndTargets +from paddlespeech.s2t.utils.log import Log + +__all__ = ["BatchDataLoader"] + +logger = Log(__name__).getlog() + + +def feat_dim_and_vocab_size(data_json: List[Dict[Text, Any]], + mode: Text="asr", + iaxis=0, + oaxis=0): + if mode == 'asr': + feat_dim = data_json[0]['input'][oaxis]['shape'][1] + vocab_size = data_json[0]['output'][oaxis]['shape'][1] + else: + raise ValueError(f"{mode} mode not support!") + return feat_dim, vocab_size + + +def batch_collate(x): + """de-minibatch, since user compose batch. 
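+ The surrounding DataLoader is driven with batch_size=1 over pre-built
+ minibatches, so each "sample" is already a complete batch; this function
+ simply unwraps it.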
+ + Args: + x (List[Tuple]): [(utts, xs, ilens, ys, olens)] + + Returns: + Tuple: (utts, xs, ilens, ys, olens) + """ + return x[0] + + +class BatchDataLoader(): + def __init__(self, + json_file: str, + train_mode: bool, + sortagrad: int=0, + batch_size: int=0, + maxlen_in: float=float('inf'), + maxlen_out: float=float('inf'), + minibatches: int=0, + mini_batch_size: int=1, + batch_count: str='auto', + batch_bins: int=0, + batch_frames_in: int=0, + batch_frames_out: int=0, + batch_frames_inout: int=0, + preprocess_conf=None, + n_iter_processes: int=1, + subsampling_factor: int=1, + load_aux_input: bool=False, + load_aux_output: bool=False, + num_encs: int=1, + dist_sampler: bool=False, + shortest_first: bool=False): + self.json_file = json_file + self.train_mode = train_mode + self.use_sortagrad = sortagrad == -1 or sortagrad > 0 + self.batch_size = batch_size + self.maxlen_in = maxlen_in + self.maxlen_out = maxlen_out + self.batch_count = batch_count + self.batch_bins = batch_bins + self.batch_frames_in = batch_frames_in + self.batch_frames_out = batch_frames_out + self.batch_frames_inout = batch_frames_inout + self.subsampling_factor = subsampling_factor + self.num_encs = num_encs + self.preprocess_conf = preprocess_conf + self.n_iter_processes = n_iter_processes + self.load_aux_input = load_aux_input + self.load_aux_output = load_aux_output + self.dist_sampler = dist_sampler + self.shortest_first = shortest_first + + # read json data + with jsonlines.open(json_file, 'r') as reader: + self.data_json = list(reader) + + self.feat_dim, self.vocab_size = feat_dim_and_vocab_size( + self.data_json, mode='asr') + + # make minibatch list (variable length) + self.minibaches = make_batchset( + self.data_json, + batch_size, + maxlen_in, + maxlen_out, + minibatches, # for debug + min_batch_size=mini_batch_size, + shortest_first=self.shortest_first or self.use_sortagrad, + count=batch_count, + batch_bins=batch_bins, + batch_frames_in=batch_frames_in, + batch_frames_out=batch_frames_out, + batch_frames_inout=batch_frames_inout, + iaxis=0, + oaxis=0, ) + + # data reader + self.reader = LoadInputsAndTargets( + mode="asr", + load_output=True, + preprocess_conf=preprocess_conf, + preprocess_args={"train": + train_mode}, # Switch the mode of preprocessing + ) + + # Setup a converter + if num_encs == 1: + self.converter = CustomConverter( + subsampling_factor=subsampling_factor, + dtype=np.float32, + load_aux_input=load_aux_input, + load_aux_output=load_aux_output) + else: + assert NotImplementedError("not impl CustomConverterMulEnc.") + + # hack to make batchsize argument as 1 + # actual bathsize is included in a list + # default collate function converts numpy array to paddle tensor + # we used an empty collate function instead which returns list + self.dataset = TransformDataset(self.minibaches, self.converter, + self.reader) + + if self.dist_sampler: + self.batch_sampler = DistributedBatchSampler( + dataset=self.dataset, + batch_size=1, + shuffle=not self.use_sortagrad if self.train_mode else False, + drop_last=False, ) + else: + self.batch_sampler = BatchSampler( + dataset=self.dataset, + batch_size=1, + shuffle=not self.use_sortagrad if self.train_mode else False, + drop_last=False, ) + + self.dataloader = DataLoader( + dataset=self.dataset, + batch_sampler=self.batch_sampler, + collate_fn=batch_collate, + num_workers=self.n_iter_processes, ) + + def __len__(self): + return len(self.dataloader) + + def __iter__(self): + return self.dataloader.__iter__() + + def __call__(self): + return self.__iter__() 
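+
+ # Illustrative usage (file path and sizes are placeholders):
+ # loader = BatchDataLoader("data/train.json", train_mode=True,
+ # batch_size=32)
+ # for utts, xs_pad, ilens, ys_pad, olens in loader():
+ # ... # xs_pad: (B, Tmax, D), ys_pad: (B, Umax)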
+ + def __repr__(self): + echo = f"<{self.__class__.__module__}.{self.__class__.__name__} object at {hex(id(self))}> " + echo += f"train_mode: {self.train_mode}, " + echo += f"sortagrad: {self.use_sortagrad}, " + echo += f"batch_size: {self.batch_size}, " + echo += f"maxlen_in: {self.maxlen_in}, " + echo += f"maxlen_out: {self.maxlen_out}, " + echo += f"batch_count: {self.batch_count}, " + echo += f"batch_bins: {self.batch_bins}, " + echo += f"batch_frames_in: {self.batch_frames_in}, " + echo += f"batch_frames_out: {self.batch_frames_out}, " + echo += f"batch_frames_inout: {self.batch_frames_inout}, " + echo += f"subsampling_factor: {self.subsampling_factor}, " + echo += f"num_encs: {self.num_encs}, " + echo += f"num_workers: {self.n_iter_processes}, " + echo += f"load_aux_input: {self.load_aux_input}, " + echo += f"load_aux_output: {self.load_aux_output}, " + echo += f"dist_sampler: {self.dist_sampler}, " + echo += f"shortest_first: {self.shortest_first}, " + echo += f"file: {self.json_file}" + return echo diff --git a/ernie-sat/paddlespeech/s2t/io/dataset.py b/ernie-sat/paddlespeech/s2t/io/dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..0e94f047bce7ad053ecd566f4a8d8c83a1b10a7c --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/io/dataset.py @@ -0,0 +1,231 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from espnet(https://github.com/espnet/espnet) +# Modified from wenet(https://github.com/wenet-e2e/wenet) +import jsonlines +from paddle.io import Dataset + +from paddlespeech.s2t.frontend.utility import read_manifest +from paddlespeech.s2t.utils.log import Log + +__all__ = ["ManifestDataset", "TransformDataset"] + +logger = Log(__name__).getlog() + + +class ManifestDataset(Dataset): + @classmethod + def from_config(cls, config): + """Build a ManifestDataset object from a config. + + Args: + config (yacs.config.CfgNode): configs object. + + Returns: + ManifestDataset: dataet object. + """ + assert 'manifest' in config + assert config.manifest + + dataset = cls( + manifest_path=config.manifest, + max_input_len=config.max_input_len, + min_input_len=config.min_input_len, + max_output_len=config.max_output_len, + min_output_len=config.min_output_len, + max_output_input_ratio=config.max_output_input_ratio, + min_output_input_ratio=config.min_output_input_ratio, ) + return dataset + + def __init__(self, + manifest_path, + max_input_len=float('inf'), + min_input_len=0.0, + max_output_len=float('inf'), + min_output_len=0.0, + max_output_input_ratio=float('inf'), + min_output_input_ratio=0.0): + """Manifest Dataset + + Args: + manifest_path (str): manifest josn file path + max_input_len ([type], optional): maximum output seq length, + in seconds for raw wav, in frame numbers for feature data. Defaults to float('inf'). + min_input_len (float, optional): minimum input seq length, + in seconds for raw wav, in frame numbers for feature data. Defaults to 0.0. 
+ max_output_len (float, optional): maximum input seq length, + in modeling units. Defaults to 500.0. + min_output_len (float, optional): minimum input seq length, + in modeling units. Defaults to 0.0. + max_output_input_ratio (float, optional): maximum output seq length/output seq length ratio. + Defaults to 10.0. + min_output_input_ratio (float, optional): minimum output seq length/output seq length ratio. + Defaults to 0.05. + + """ + super().__init__() + + # read manifest + self._manifest = read_manifest( + manifest_path=manifest_path, + max_input_len=max_input_len, + min_input_len=min_input_len, + max_output_len=max_output_len, + min_output_len=min_output_len, + max_output_input_ratio=max_output_input_ratio, + min_output_input_ratio=min_output_input_ratio) + self._manifest.sort(key=lambda x: x["input"][0]["shape"][0]) + + def __len__(self): + return len(self._manifest) + + def __getitem__(self, idx): + return self._manifest[idx] + + +class TransformDataset(Dataset): + """Transform Dataset. + + Args: + data: list object from make_batchset + converter: batch function + reader: read data + """ + + def __init__(self, data, converter, reader): + """Init function.""" + super().__init__() + self.data = data + self.converter = converter + self.reader = reader + + def __len__(self): + """Len function.""" + return len(self.data) + + def __getitem__(self, idx): + """[] operator.""" + return self.converter([self.reader(self.data[idx], return_uttid=True)]) + + +class AudioDataset(Dataset): + def __init__(self, + data_file, + max_length=10240, + min_length=0, + token_max_length=200, + token_min_length=1, + batch_type='static', + batch_size=1, + max_frames_in_batch=0, + sort=True, + raw_wav=True, + stride_ms=10): + """Dataset for loading audio data. + Attributes:: + data_file: input data file + Plain text data file, each line contains following 7 fields, + which is split by '\t': + utt:utt1 + feat:tmp/data/file1.wav or feat:tmp/data/fbank.ark:30 + feat_shape: 4.95(in seconds) or feat_shape:495,80(495 is in frames) + text:i love you + token: i l o v e y o u + tokenid: int id of this token + token_shape: M,N # M is the number of token, N is vocab size + max_length: drop utterance which is greater than max_length(10ms), unit 10ms. + min_length: drop utterance which is less than min_length(10ms), unit 10ms. + token_max_length: drop utterance which is greater than token_max_length, + especially when use char unit for english modeling + token_min_length: drop utterance which is less than token_max_length + batch_type: static or dynamic, see max_frames_in_batch(dynamic) + batch_size: number of utterances in a batch, + it's for static batch size. + max_frames_in_batch: max feature frames in a batch, + when batch_type is dynamic, it's for dynamic batch size. + Then batch_size is ignored, we will keep filling the + batch until the total frames in batch up to max_frames_in_batch. + sort: whether to sort all data, so the utterance with the same + length could be filled in a same batch. + raw_wav: use raw wave or extracted featute. + if raw wave is used, dynamic waveform-level augmentation could be used + and the feature is extracted by torchaudio. + if extracted featute(e.g. by kaldi) is used, only feature-level + augmentation such as specaug could be used. 
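+ Note: when batch_type is 'dynamic', max_frames_in_batch must be set to a
+ positive value (the constructor asserts this below).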
+ """ + assert batch_type in ['static', 'dynamic'] + # read manifest + with jsonlines.open(data_file, 'r') as reader: + data = list(reader) + if sort: + data = sorted(data, key=lambda x: x["feat_shape"][0]) + if raw_wav: + path_suffix = data[0]['feat'].split(':')[0].splitext()[-1] + assert path_suffix not in ('.ark', '.scp') + # m second to n frame + data = list( + map(lambda x: (float(x['feat_shape'][0]) * 1000 / stride_ms), + data)) + + self.input_dim = data[0]['feat_shape'][1] + self.output_dim = data[0]['token_shape'][1] + + valid_data = [] + for i in range(len(data)): + length = data[i]['feat_shape'][0] + token_length = data[i]['token_shape'][0] + # remove too lang or too short utt for both input and output + # to prevent from out of memory + if length > max_length or length < min_length: + pass + elif token_length > token_max_length or token_length < token_min_length: + pass + else: + valid_data.append(data[i]) + logger.info(f"raw dataset len: {len(data)}") + data = valid_data + num_data = len(data) + logger.info(f"dataset len after filter: {num_data}") + + self.minibatch = [] + # Dynamic batch size + if batch_type == 'dynamic': + assert (max_frames_in_batch > 0) + self.minibatch.append([]) + num_frames_in_batch = 0 + for i in range(num_data): + length = data[i]['feat_shape'][0] + num_frames_in_batch += length + if num_frames_in_batch > max_frames_in_batch: + self.minibatch.append([]) + num_frames_in_batch = length + self.minibatch[-1].append(data[i]) + # Static batch size + else: + cur = 0 + while cur < num_data: + end = min(cur + batch_size, num_data) + item = [] + for i in range(cur, end): + item.append(data[i]) + self.minibatch.append(item) + cur = end + + def __len__(self): + """number of example(batch)""" + return len(self.minibatch) + + def __getitem__(self, idx): + """batch example of idx""" + return self.minibatch[idx] diff --git a/ernie-sat/paddlespeech/s2t/io/reader.py b/ernie-sat/paddlespeech/s2t/io/reader.py new file mode 100644 index 0000000000000000000000000000000000000000..4e136bdce1d9b5490dadf58ab6359e6430121ced --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/io/reader.py @@ -0,0 +1,414 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from espnet(https://github.com/espnet/espnet) +from collections import OrderedDict + +import kaldiio +import numpy as np +import soundfile + +from .utility import feat_type +from paddlespeech.s2t.transform.transformation import Transformation +from paddlespeech.s2t.utils.log import Log +# from paddlespeech.s2t.frontend.augmentor.augmentation import AugmentationPipeline as Transformation + +__all__ = ["LoadInputsAndTargets"] + +logger = Log(__name__).getlog() + + +class LoadInputsAndTargets(): + """Create a mini-batch from a list of dicts + + >>> batch = [('utt1', + ... dict(input=[dict(feat='some.ark:123', + ... filetype='mat', + ... name='input1', + ... shape=[100, 80])], + ... output=[dict(tokenid='1 2 3 4', + ... name='target1', + ... 
shape=[4, 31])]])) + >>> l = LoadInputsAndTargets() + >>> feat, target = l(batch) + + :param: str mode: Specify the task mode, "asr" or "tts" + :param: str preprocess_conf: The path of a json file for pre-processing + :param: bool load_input: If False, not to load the input data + :param: bool load_output: If False, not to load the output data + :param: bool sort_in_input_length: Sort the mini-batch in descending order + of the input length + :param: bool use_speaker_embedding: Used for tts mode only + :param: bool use_second_target: Used for tts mode only + :param: dict preprocess_args: Set some optional arguments for preprocessing + :param: Optional[dict] preprocess_args: Used for tts mode only + """ + + def __init__( + self, + mode="asr", + preprocess_conf=None, + load_input=True, + load_output=True, + sort_in_input_length=True, + preprocess_args=None, + keep_all_data_on_mem=False, ): + self._loaders = {} + + if mode not in ["asr"]: + raise ValueError("Only asr are allowed: mode={}".format(mode)) + + if preprocess_conf: + self.preprocessing = Transformation(preprocess_conf) + logger.warning( + "[Experimental feature] Some preprocessing will be done " + "for the mini-batch creation using {}".format( + self.preprocessing)) + else: + # If conf doesn't exist, this function don't touch anything. + self.preprocessing = None + + self.mode = mode + self.load_output = load_output + self.load_input = load_input + self.sort_in_input_length = sort_in_input_length + if preprocess_args: + assert isinstance(preprocess_args, dict), type(preprocess_args) + self.preprocess_args = dict(preprocess_args) + else: + self.preprocess_args = {} + self.keep_all_data_on_mem = keep_all_data_on_mem + + def __call__(self, batch, return_uttid=False): + """Function to load inputs and targets from list of dicts + + :param List[Tuple[str, dict]] batch: list of dict which is subset of + loaded data.json + :param bool return_uttid: return utterance ID information for visualization + :return: list of input token id sequences [(L_1), (L_2), ..., (L_B)] + :return: list of input feature sequences + [(T_1, D), (T_2, D), ..., (T_B, D)] + :rtype: list of float ndarray + :return: list of target token id sequences [(L_1), (L_2), ..., (L_B)] + :rtype: list of int ndarray + + """ + x_feats_dict = OrderedDict() # OrderedDict[str, List[np.ndarray]] + y_feats_dict = OrderedDict() # OrderedDict[str, List[np.ndarray]] + uttid_list = [] # List[str] + + for uttid, info in batch: + uttid_list.append(uttid) + + if self.load_input: + # Note(kamo): This for-loop is for multiple inputs + for idx, inp in enumerate(info["input"]): + # {"input": + # [{"feat": "some/path.h5:F01_050C0101_PED_REAL", + # "filetype": "hdf5", + # "name": "input1", ...}], ...} + x = self._get_from_loader( + filepath=inp["feat"], + filetype=inp.get("filetype", "mat")) + x_feats_dict.setdefault(inp["name"], []).append(x) + + if self.load_output: + for idx, inp in enumerate(info["output"]): + if "tokenid" in inp: + # ======= Legacy format for output ======= + # {"output": [{"tokenid": "1 2 3 4"}]) + x = np.fromiter( + map(int, inp["tokenid"].split()), dtype=np.int64) + else: + # ======= New format ======= + # {"input": + # [{"feat": "some/path.h5:F01_050C0101_PED_REAL", + # "filetype": "hdf5", + # "name": "target1", ...}], ...} + x = self._get_from_loader( + filepath=inp["feat"], + filetype=inp.get("filetype", "mat")) + + y_feats_dict.setdefault(inp["name"], []).append(x) + + if self.mode == "asr": + return_batch, uttid_list = self._create_batch_asr( + x_feats_dict, 
y_feats_dict, uttid_list) + else: + raise NotImplementedError(self.mode) + + if self.preprocessing is not None: + # Apply pre-processing all input features + for x_name in return_batch.keys(): + if x_name.startswith("input"): + return_batch[x_name] = self.preprocessing( + return_batch[x_name], uttid_list, + **self.preprocess_args) + + if return_uttid: + return tuple(return_batch.values()), uttid_list + + # Doesn't return the names now. + return tuple(return_batch.values()) + + def _create_batch_asr(self, x_feats_dict, y_feats_dict, uttid_list): + """Create a OrderedDict for the mini-batch + + :param OrderedDict x_feats_dict: + e.g. {"input1": [ndarray, ndarray, ...], + "input2": [ndarray, ndarray, ...]} + :param OrderedDict y_feats_dict: + e.g. {"target1": [ndarray, ndarray, ...], + "target2": [ndarray, ndarray, ...]} + :param: List[str] uttid_list: + Give uttid_list to sort in the same order as the mini-batch + :return: batch, uttid_list + :rtype: Tuple[OrderedDict, List[str]] + """ + # handle single-input and multi-input (paralell) asr mode + xs = list(x_feats_dict.values()) + + if self.load_output: + ys = list(y_feats_dict.values()) + assert len(xs[0]) == len(ys[0]), (len(xs[0]), len(ys[0])) + + # get index of non-zero length samples + nonzero_idx = list( + filter(lambda i: len(ys[0][i]) > 0, range(len(ys[0])))) + for n in range(1, len(y_feats_dict)): + nonzero_idx = filter(lambda i: len(ys[n][i]) > 0, nonzero_idx) + else: + # Note(kamo): Be careful not to make nonzero_idx to a generator + nonzero_idx = list(range(len(xs[0]))) + + if self.sort_in_input_length: + # sort in input lengths based on the first input + nonzero_sorted_idx = sorted( + nonzero_idx, key=lambda i: -len(xs[0][i])) + else: + nonzero_sorted_idx = nonzero_idx + + if len(nonzero_sorted_idx) != len(xs[0]): + logger.warning( + "Target sequences include empty tokenid (batch {} -> {}).". + format(len(xs[0]), len(nonzero_sorted_idx))) + + # remove zero-length samples + xs = [[x[i] for i in nonzero_sorted_idx] for x in xs] + uttid_list = [uttid_list[i] for i in nonzero_sorted_idx] + + x_names = list(x_feats_dict.keys()) + if self.load_output: + ys = [[y[i] for i in nonzero_sorted_idx] for y in ys] + y_names = list(y_feats_dict.keys()) + + # Keeping x_name and y_name, e.g. input1, for future extension + return_batch = OrderedDict([ + * [(x_name, x) for x_name, x in zip(x_names, xs)], + * [(y_name, y) for y_name, y in zip(y_names, ys)], + ]) + else: + return_batch = OrderedDict( + [(x_name, x) for x_name, x in zip(x_names, xs)]) + return return_batch, uttid_list + + def _get_from_loader(self, filepath, filetype): + """Return ndarray + + In order to make the fds to be opened only at the first referring, + the loader are stored in self._loaders + + >>> ndarray = loader.get_from_loader( + ... 'some/path.h5:F01_050C0101_PED_REAL', filetype='hdf5') + + :param: str filepath: + :param: str filetype: + :return: + :rtype: np.ndarray + """ + if filetype == "hdf5": + # e.g. + # {"input": [{"feat": "some/path.h5:F01_050C0101_PED_REAL", + # "filetype": "hdf5", + # -> filepath = "some/path.h5", key = "F01_050C0101_PED_REAL" + filepath, key = filepath.split(":", 1) + + loader = self._loaders.get(filepath) + if loader is None: + # To avoid disk access, create loader only for the first time + loader = h5py.File(filepath, "r") + self._loaders[filepath] = loader + return loader[key][()] + elif filetype == "sound.hdf5": + # e.g. 
+ # {"input": [{"feat": "some/path.h5:F01_050C0101_PED_REAL", + # "filetype": "sound.hdf5", + # -> filepath = "some/path.h5", key = "F01_050C0101_PED_REAL" + filepath, key = filepath.split(":", 1) + + loader = self._loaders.get(filepath) + if loader is None: + # To avoid disk access, create loader only for the first time + loader = SoundHDF5File(filepath, "r", dtype="int16") + self._loaders[filepath] = loader + array, rate = loader[key] + return array + elif filetype == "sound": + # e.g. + # {"input": [{"feat": "some/path.wav", + # "filetype": "sound"}, + # Assume PCM16 + if not self.keep_all_data_on_mem: + array, _ = soundfile.read(filepath, dtype="int16") + return array + if filepath not in self._loaders: + array, _ = soundfile.read(filepath, dtype="int16") + self._loaders[filepath] = array + return self._loaders[filepath] + elif filetype == "npz": + # e.g. + # {"input": [{"feat": "some/path.npz:F01_050C0101_PED_REAL", + # "filetype": "npz", + filepath, key = filepath.split(":", 1) + + loader = self._loaders.get(filepath) + if loader is None: + # To avoid disk access, create loader only for the first time + loader = np.load(filepath) + self._loaders[filepath] = loader + return loader[key] + elif filetype == "npy": + # e.g. + # {"input": [{"feat": "some/path.npy", + # "filetype": "npy"}, + if not self.keep_all_data_on_mem: + return np.load(filepath) + if filepath not in self._loaders: + self._loaders[filepath] = np.load(filepath) + return self._loaders[filepath] + elif filetype in ["mat", "vec"]: + # e.g. + # {"input": [{"feat": "some/path.ark:123", + # "filetype": "mat"}]}, + # In this case, "123" indicates the starting points of the matrix + # load_mat can load both matrix and vector + if not self.keep_all_data_on_mem: + return kaldiio.load_mat(filepath) + if filepath not in self._loaders: + self._loaders[filepath] = kaldiio.load_mat(filepath) + return self._loaders[filepath] + elif filetype == "scp": + # e.g. + # {"input": [{"feat": "some/path.scp:F01_050C0101_PED_REAL", + # "filetype": "scp", + filepath, key = filepath.split(":", 1) + loader = self._loaders.get(filepath) + if loader is None: + # To avoid disk access, create loader only for the first time + loader = kaldiio.load_scp(filepath) + self._loaders[filepath] = loader + return loader[key] + else: + raise NotImplementedError( + "Not supported: loader_type={}".format(filetype)) + + def file_type(self, filepath): + return feat_type(filepath) + + +class SoundHDF5File(): + """Collecting sound files to a HDF5 file + + >>> f = SoundHDF5File('a.flac.h5', mode='a') + >>> array = np.random.randint(0, 100, 100, dtype=np.int16) + >>> f['id'] = (array, 16000) + >>> array, rate = f['id'] + + + :param: str filepath: + :param: str mode: + :param: str format: The type used when saving wav. flac, nist, htk, etc. 
+ :param: str dtype: + + """ + + def __init__(self, + filepath, + mode="r+", + format=None, + dtype="int16", + **kwargs): + self.filepath = filepath + self.mode = mode + self.dtype = dtype + + self.file = h5py.File(filepath, mode, **kwargs) + if format is None: + # filepath = a.flac.h5 -> format = flac + second_ext = os.path.splitext(os.path.splitext(filepath)[0])[1] + format = second_ext[1:] + if format.upper() not in soundfile.available_formats(): + # If not found, flac is selected + format = "flac" + + # This format affects only saving + self.format = format + + def __repr__(self): + return ''.format( + self.filepath, self.mode, self.format, self.dtype) + + def create_dataset(self, name, shape=None, data=None, **kwds): + f = io.BytesIO() + array, rate = data + soundfile.write(f, array, rate, format=self.format) + self.file.create_dataset( + name, shape=shape, data=np.void(f.getvalue()), **kwds) + + def __setitem__(self, name, data): + self.create_dataset(name, data=data) + + def __getitem__(self, key): + data = self.file[key][()] + f = io.BytesIO(data.tobytes()) + array, rate = soundfile.read(f, dtype=self.dtype) + return array, rate + + def keys(self): + return self.file.keys() + + def values(self): + for k in self.file: + yield self[k] + + def items(self): + for k in self.file: + yield k, self[k] + + def __iter__(self): + return iter(self.file) + + def __contains__(self, item): + return item in self.file + + def __len__(self, item): + return len(self.file) + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.file.close() + + def close(self): + self.file.close() diff --git a/ernie-sat/paddlespeech/s2t/io/sampler.py b/ernie-sat/paddlespeech/s2t/io/sampler.py new file mode 100644 index 0000000000000000000000000000000000000000..ac55af1236f11d175e9e7717220980cf95c7d79b --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/io/sampler.py @@ -0,0 +1,251 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import math + +import numpy as np +from paddle import distributed as dist +from paddle.io import BatchSampler +from paddle.io import DistributedBatchSampler + +from paddlespeech.s2t.utils.log import Log + +logger = Log(__name__).getlog() + +__all__ = [ + "SortagradDistributedBatchSampler", + "SortagradBatchSampler", +] + + +def _batch_shuffle(indices, batch_size, epoch, clipped=False): + """Put similarly-sized instances into minibatches for better efficiency + and make a batch-wise shuffle. + + 1. Sort the audio clips by duration. + 2. Generate a random number `k`, k in [0, batch_size). + 3. Randomly shift `k` instances in order to create different batches + for different epochs. Create minibatches. + 4. Shuffle the minibatches. + + :param indices: indexes. List of int. + :type indices: list + :param batch_size: Batch size. This size is also used for generate + a random number for batch shuffle. 
+ :type batch_size: int + :param clipped: Whether to clip the heading (small shift) and trailing + (incomplete batch) instances. + :type clipped: bool + :return: Batch shuffled mainifest. + :rtype: list + """ + rng = np.random.RandomState(epoch) + shift_len = rng.randint(0, batch_size - 1) + batch_indices = list(zip(* [iter(indices[shift_len:])] * batch_size)) + rng.shuffle(batch_indices) + batch_indices = [item for batch in batch_indices for item in batch] + assert clipped is False + if not clipped: + res_len = len(indices) - shift_len - len(batch_indices) + # when res_len is 0, will return whole list, len(List[-0:]) = len(List[:]) + if res_len != 0: + batch_indices.extend(indices[-res_len:]) + batch_indices.extend(indices[0:shift_len]) + assert len(indices) == len( + batch_indices + ), f"_batch_shuffle: {len(indices)} : {len(batch_indices)} : {res_len} - {shift_len}" + return batch_indices + + +class SortagradDistributedBatchSampler(DistributedBatchSampler): + def __init__(self, + dataset, + batch_size, + num_replicas=None, + rank=None, + shuffle=False, + drop_last=False, + sortagrad=False, + shuffle_method="batch_shuffle"): + """Sortagrad Sampler for multi gpus. + + Args: + dataset (paddle.io.Dataset): + batch_size (int): batch size for one gpu + num_replicas (int, optional): world size or numbers of gpus. Defaults to None. + rank (int, optional): rank id. Defaults to None. + shuffle (bool, optional): True for do shuffle, or else. Defaults to False. + drop_last (bool, optional): whether drop last batch which is less than batch size. Defaults to False. + sortagrad (bool, optional): True, do sortgrad in first epoch, then shuffle as usual; or else. Defaults to False. + shuffle_method (str, optional): shuffle method, "instance_shuffle" or "batch_shuffle". Defaults to "batch_shuffle". + """ + super().__init__(dataset, batch_size, num_replicas, rank, shuffle, + drop_last) + self._sortagrad = sortagrad + self._shuffle_method = shuffle_method + + def __iter__(self): + num_samples = len(self.dataset) + indices = np.arange(num_samples).tolist() + indices += indices[:(self.total_size - len(indices))] + assert len(indices) == self.total_size + + # sort (by duration) or batch-wise shuffle the manifest + if self.shuffle: + if self.epoch == 0 and self._sortagrad: + logger.info( + f'rank: {dist.get_rank()} dataset sortagrad! epoch {self.epoch}' + ) + else: + logger.info( + f'rank: {dist.get_rank()} dataset shuffle! epoch {self.epoch}' + ) + if self._shuffle_method == "batch_shuffle": + # using `batch_size * nrank`, or will cause instability loss and nan or inf grad, + # since diff batch examlpe length in batches case instability loss in diff rank, + # e.g. rank0 maxlength 20, rank3 maxlength 1000 + indices = _batch_shuffle( + indices, + self.batch_size * self.nranks, + self.epoch, + clipped=False) + elif self._shuffle_method == "instance_shuffle": + np.random.RandomState(self.epoch).shuffle(indices) + else: + raise ValueError("Unknown shuffle method %s." 
% + self._shuffle_method) + assert len( + indices + ) == self.total_size, f"batch shuffle examples error: {len(indices)} : {self.total_size}" + + # slice `self.batch_size` examples by rank id + def _get_indices_by_batch_size(indices): + subsampled_indices = [] + last_batch_size = self.total_size % (self.batch_size * self.nranks) + assert last_batch_size % self.nranks == 0 + last_local_batch_size = last_batch_size // self.nranks + + for i in range(self.local_rank * self.batch_size, + len(indices) - last_batch_size, + self.batch_size * self.nranks): + subsampled_indices.extend(indices[i:i + self.batch_size]) + + indices = indices[len(indices) - last_batch_size:] + subsampled_indices.extend( + indices[self.local_rank * last_local_batch_size:( + self.local_rank + 1) * last_local_batch_size]) + return subsampled_indices + + if self.nranks > 1: + indices = _get_indices_by_batch_size(indices) + + assert len(indices) == self.num_samples + _sample_iter = iter(indices) + + batch_indices = [] + for idx in _sample_iter: + batch_indices.append(idx) + if len(batch_indices) == self.batch_size: + logger.debug( + f"rank: {dist.get_rank()} batch index: {batch_indices} ") + yield batch_indices + batch_indices = [] + if not self.drop_last and len(batch_indices) > 0: + yield batch_indices + + def __len__(self): + num_samples = self.num_samples + num_samples += int(not self.drop_last) * (self.batch_size - 1) + return num_samples // self.batch_size + + +class SortagradBatchSampler(BatchSampler): + def __init__(self, + dataset, + batch_size, + shuffle=False, + drop_last=False, + sortagrad=False, + shuffle_method="batch_shuffle"): + """Sortagrad Sampler for one gpu. + + Args: + dataset (paddle.io.Dataset): + batch_size (int): batch size for one gpu + shuffle (bool, optional): True for do shuffle, or else. Defaults to False. + drop_last (bool, optional): whether drop last batch which is less than batch size. Defaults to False. + sortagrad (bool, optional): True, do sortgrad in first epoch, then shuffle as usual; or else. Defaults to False. + shuffle_method (str, optional): shuffle method, "instance_shuffle" or "batch_shuffle". Defaults to "batch_shuffle". + """ + self.dataset = dataset + + assert isinstance(batch_size, int) and batch_size > 0, \ + "batch_size should be a positive integer" + self.batch_size = batch_size + assert isinstance(shuffle, bool), \ + "shuffle should be a boolean value" + self.shuffle = shuffle + assert isinstance(drop_last, bool), \ + "drop_last should be a boolean number" + + self.drop_last = drop_last + self.epoch = 0 + self.num_samples = int(math.ceil(len(self.dataset) * 1.0)) + self.total_size = self.num_samples + self._sortagrad = sortagrad + self._shuffle_method = shuffle_method + + def __iter__(self): + num_samples = len(self.dataset) + indices = np.arange(num_samples).tolist() + indices += indices[:(self.total_size - len(indices))] + assert len(indices) == self.total_size + + # sort (by duration) or batch-wise shuffle the manifest + if self.shuffle: + if self.epoch == 0 and self._sortagrad: + logger.info(f'dataset sortagrad! epoch {self.epoch}') + else: + logger.info(f'dataset shuffle! epoch {self.epoch}') + if self._shuffle_method == "batch_shuffle": + indices = _batch_shuffle( + indices, self.batch_size, self.epoch, clipped=False) + elif self._shuffle_method == "instance_shuffle": + np.random.RandomState(self.epoch).shuffle(indices) + else: + raise ValueError("Unknown shuffle method %s." 
% + self._shuffle_method) + assert len( + indices + ) == self.total_size, f"batch shuffle examples error: {len(indices)} : {self.total_size}" + + assert len(indices) == self.num_samples + _sample_iter = iter(indices) + + batch_indices = [] + for idx in _sample_iter: + batch_indices.append(idx) + if len(batch_indices) == self.batch_size: + logger.debug( + f"rank: {dist.get_rank()} batch index: {batch_indices} ") + yield batch_indices + batch_indices = [] + if not self.drop_last and len(batch_indices) > 0: + yield batch_indices + + self.epoch += 1 + + def __len__(self): + num_samples = self.num_samples + num_samples += int(not self.drop_last) * (self.batch_size - 1) + return num_samples // self.batch_size diff --git a/ernie-sat/paddlespeech/s2t/io/utility.py b/ernie-sat/paddlespeech/s2t/io/utility.py new file mode 100644 index 0000000000000000000000000000000000000000..c08b5535a6cccb7ddf8ba7df53f6c7703e6bb96e --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/io/utility.py @@ -0,0 +1,109 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from io import BytesIO +from typing import List + +import numpy as np + +from paddlespeech.s2t.utils.log import Log + +__all__ = ["pad_list", "pad_sequence", "feat_type"] + +logger = Log(__name__).getlog() + + +def pad_list(sequences: List[np.ndarray], + padding_value: float=0.0) -> np.ndarray: + return pad_sequence(sequences, True, padding_value) + + +def pad_sequence(sequences: List[np.ndarray], + batch_first: bool=True, + padding_value: float=0.0) -> np.ndarray: + r"""Pad a list of variable length Tensors with ``padding_value`` + + ``pad_sequence`` stacks a list of Tensors along a new dimension, + and pads them to equal length. For example, if the input is list of + sequences with size ``L x *`` and if batch_first is False, and ``T x B x *`` + otherwise. + + `B` is batch size. It is equal to the number of elements in ``sequences``. + `T` is length of the longest sequence. + `L` is length of the sequence. + `*` is any number of trailing dimensions, including none. + + Example: + >>> a = np.ones([25, 300]) + >>> b = np.ones([22, 300]) + >>> c = np.ones([15, 300]) + >>> pad_sequence([a, b, c]).shape + [25, 3, 300] + + Note: + This function returns a np.ndarray of size ``T x B x *`` or ``B x T x *`` + where `T` is the length of the longest sequence. This function assumes + trailing dimensions and type of all the Tensors in sequences are same. + + Args: + sequences (list[np.ndarray]): list of variable length sequences. + batch_first (bool, optional): output will be in ``B x T x *`` if True, or in + ``T x B x *`` otherwise + padding_value (float, optional): value for padded elements. Default: 0. + + Returns: + np.ndarray of size ``T x B x *`` if :attr:`batch_first` is ``False``. 
+ np.ndarray of size ``B x T x *`` otherwise + """ + + # assuming trailing dimensions and type of all the Tensors + # in sequences are same and fetching those from sequences[0] + max_size = sequences[0].shape + trailing_dims = max_size[1:] + max_len = max([s.shape[0] for s in sequences]) + if batch_first: + out_dims = (len(sequences), max_len) + trailing_dims + else: + out_dims = (max_len, len(sequences)) + trailing_dims + + out_tensor = np.full(out_dims, padding_value, dtype=sequences[0].dtype) + for i, tensor in enumerate(sequences): + length = tensor.shape[0] + # use index notation to prevent duplicate references to the tensor + if batch_first: + out_tensor[i, :length, ...] = tensor + else: + out_tensor[:length, i, ...] = tensor + + return out_tensor + + +def feat_type(filepath): + # deal with Byteio type for paddlespeech server + if isinstance(filepath, BytesIO): + return 'sound' + + suffix = filepath.split(":")[0].split('.')[-1].lower() + if suffix == 'ark': + return 'mat' + elif suffix == 'scp': + return 'scp' + elif suffix == 'npy': + return 'npy' + elif suffix == 'npz': + return 'npz' + elif suffix in ['wav', 'flac']: + # PCM16 + return 'sound' + else: + raise ValueError(f"Not support filetype: {suffix}") diff --git a/ernie-sat/paddlespeech/s2t/models/__init__.py b/ernie-sat/paddlespeech/s2t/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..185a92b8d94d3426d616c0624f0f2ee04339349e --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/models/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/ernie-sat/paddlespeech/s2t/models/asr_interface.py b/ernie-sat/paddlespeech/s2t/models/asr_interface.py new file mode 100644 index 0000000000000000000000000000000000000000..8c2db279763dec086358fc544864b4895cf94f29 --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/models/asr_interface.py @@ -0,0 +1,162 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from espnet(https://github.com/espnet/espnet) +"""ASR Interface module.""" +import argparse + +from paddlespeech.s2t.utils.dynamic_import import dynamic_import + + +class ASRInterface: + """ASR Interface model implementation.""" + + @staticmethod + def add_arguments(parser): + """Add arguments to parser.""" + return parser + + @classmethod + def build(cls, idim: int, odim: int, **kwargs): + """Initialize this class with python-level args. 
+ + Args: + idim (int): The number of an input feature dim. + odim (int): The number of output vocab. + + Returns: + ASRinterface: A new instance of ASRInterface. + + """ + args = argparse.Namespace(**kwargs) + return cls(idim, odim, args) + + def forward(self, xs, ilens, ys, olens): + """Compute loss for training. + + :param xs: batch of padded source sequences paddle.Tensor (B, Tmax, idim) + :param ilens: batch of lengths of source sequences (B), paddle.Tensor + :param ys: batch of padded target sequences paddle.Tensor (B, Lmax) + :param olens: batch of lengths of target sequences (B), paddle.Tensor + :return: loss value + :rtype: paddle.Tensor + """ + raise NotImplementedError("forward method is not implemented") + + def recognize(self, x, recog_args, char_list=None, rnnlm=None): + """Recognize x for evaluation. + + :param ndarray x: input acouctic feature (B, T, D) or (T, D) + :param namespace recog_args: argment namespace contraining options + :param list char_list: list of characters + :param paddle.nn.Layer rnnlm: language model module + :return: N-best decoding results + :rtype: list + """ + raise NotImplementedError("recognize method is not implemented") + + def recognize_batch(self, x, recog_args, char_list=None, rnnlm=None): + """Beam search implementation for batch. + + :param paddle.Tensor x: encoder hidden state sequences (B, Tmax, Henc) + :param namespace recog_args: argument namespace containing options + :param list char_list: list of characters + :param paddle.nn.Module rnnlm: language model module + :return: N-best decoding results + :rtype: list + """ + raise NotImplementedError("Batch decoding is not supported yet.") + + def calculate_all_attentions(self, xs, ilens, ys): + """Calculate attention. + + :param list xs: list of padded input sequences [(T1, idim), (T2, idim), ...] + :param ndarray ilens: batch of lengths of input sequences (B) + :param list ys: list of character id sequence tensor [(L1), (L2), (L3), ...] + :return: attention weights (B, Lmax, Tmax) + :rtype: float ndarray + """ + raise NotImplementedError( + "calculate_all_attentions method is not implemented") + + def calculate_all_ctc_probs(self, xs, ilens, ys): + """Calculate CTC probability. + + :param list xs_pad: list of padded input sequences [(T1, idim), (T2, idim), ...] + :param ndarray ilens: batch of lengths of input sequences (B) + :param list ys: list of character id sequence tensor [(L1), (L2), (L3), ...] + :return: CTC probabilities (B, Tmax, vocab) + :rtype: float ndarray + """ + raise NotImplementedError( + "calculate_all_ctc_probs method is not implemented") + + @property + def attention_plot_class(self): + """Get attention plot class.""" + from paddlespeech.s2t.training.extensions.plot import PlotAttentionReport + + return PlotAttentionReport + + @property + def ctc_plot_class(self): + """Get CTC plot class.""" + from paddlespeech.s2t.training.extensions.plot import PlotCTCReport + + return PlotCTCReport + + def get_total_subsampling_factor(self): + """Get total subsampling factor.""" + raise NotImplementedError( + "get_total_subsampling_factor method is not implemented") + + def encode(self, feat): + """Encode feature in `beam_search` (optional). + + Args: + x (numpy.ndarray): input feature (T, D) + Returns: + paddle.Tensor: encoded feature (T, D) + """ + raise NotImplementedError("encode method is not implemented") + + def scorers(self): + """Get scorers for `beam_search` (optional). 
+ + Returns: + dict[str, ScorerInterface]: dict of `ScorerInterface` objects + + """ + raise NotImplementedError("decoders method is not implemented") + + +predefined_asr = { + "transformer": "paddlespeech.s2t.models.u2:U2Model", + "conformer": "paddlespeech.s2t.models.u2:U2Model", +} + + +def dynamic_import_asr(module): + """Import ASR models dynamically. + + Args: + module (str): asr name. e.g., transformer, conformer + + Returns: + type: ASR class + + """ + model_class = dynamic_import(module, predefined_asr) + assert issubclass(model_class, + ASRInterface), f"{module} does not implement ASRInterface" + return model_class diff --git a/ernie-sat/paddlespeech/s2t/models/ds2/__init__.py b/ernie-sat/paddlespeech/s2t/models/ds2/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b32220673e610ea2ba1e907011c92708c2797fb3 --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/models/ds2/__init__.py @@ -0,0 +1,29 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from .deepspeech2 import DeepSpeech2InferModel +from .deepspeech2 import DeepSpeech2Model +from paddlespeech.s2t.utils import dynamic_pip_install + +try: + import paddlespeech_ctcdecoders +except ImportError: + try: + package_name = 'paddlespeech_ctcdecoders' + dynamic_pip_install.install(package_name) + except Exception: + raise RuntimeError( + "Can not install package paddlespeech_ctcdecoders on your system. \ + The DeepSpeech2 model is not supported for your system") + +__all__ = ['DeepSpeech2Model', 'DeepSpeech2InferModel'] diff --git a/ernie-sat/paddlespeech/s2t/models/ds2/conv.py b/ernie-sat/paddlespeech/s2t/models/ds2/conv.py new file mode 100644 index 0000000000000000000000000000000000000000..4e766e793ddf4eb05d355cc8ed79f2c1b2f462d1 --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/models/ds2/conv.py @@ -0,0 +1,171 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
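The `predefined_asr` table and `dynamic_import_asr` helper just above form the model registry for this vendored copy: both the "transformer" and "conformer" aliases resolve to the same `U2Model` class. A minimal sketch of how that lookup is typically used (assuming the `paddlespeech` package bundled in this repo is importable; the alias and the printed class path are illustrative only):

```python
from paddlespeech.s2t.models.asr_interface import ASRInterface, dynamic_import_asr

# Resolve an ASR implementation by its short alias. dynamic_import_asr already
# asserts that the resolved class implements ASRInterface, so the contract is
# checked at lookup time rather than at call time.
model_class = dynamic_import_asr("conformer")
print(model_class)  # e.g. <class 'paddlespeech.s2t.models.u2.U2Model'>
```

Names not listed in `predefined_asr` would have to be given as a full `module:object` path, following the usual `dynamic_import` convention; how the resolved class is then constructed depends on the concrete model's own config handling, so instantiation is not shown here.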
+from paddle import nn +from paddle.nn import functional as F + +from paddlespeech.s2t.modules.activation import brelu +from paddlespeech.s2t.modules.mask import make_non_pad_mask +from paddlespeech.s2t.utils.log import Log + +logger = Log(__name__).getlog() + +__all__ = ['ConvStack', "conv_output_size"] + + +def conv_output_size(I, F, P, S): + # https://stanford.edu/~shervine/teaching/cs-230/cheatsheet-convolutional-neural-networks#hyperparameters + # Output size after Conv: + # By noting I the length of the input volume size, + # F the length of the filter, + # P the amount of zero padding, + # S the stride, + # then the output size O of the feature map along that dimension is given by: + # O = (I - F + Pstart + Pend) // S + 1 + # When Pstart == Pend == P, we can replace Pstart + Pend by 2P. + # When Pstart == Pend == 0 + # O = (I - F - S) // S + # https://iq.opengenus.org/output-size-of-convolution/ + # Output height = (Input height + padding height top + padding height bottom - kernel height) / (stride height) + 1 + # Output width = (Output width + padding width right + padding width left - kernel width) / (stride width) + 1 + return (I - F + 2 * P - S) // S + + +# receptive field calculator +# https://fomoro.com/research/article/receptive-field-calculator +# https://stanford.edu/~shervine/teaching/cs-230/cheatsheet-convolutional-neural-networks#hyperparameters +# https://distill.pub/2019/computing-receptive-fields/ +# Rl-1 = Sl * Rl + (Kl - Sl) + + +class ConvBn(nn.Layer): + """Convolution layer with batch normalization. + + :param kernel_size: The x dimension of a filter kernel. Or input a tuple for + two image dimension. + :type kernel_size: int|tuple|list + :param num_channels_in: Number of input channels. + :type num_channels_in: int + :param num_channels_out: Number of output channels. + :type num_channels_out: int + :param stride: The x dimension of the stride. Or input a tuple for two + image dimension. + :type stride: int|tuple|list + :param padding: The x dimension of the padding. Or input a tuple for two + image dimension. + :type padding: int|tuple|list + :param act: Activation type, relu|brelu + :type act: string + :return: Batch norm layer after convolution layer. + :rtype: Variable + + """ + + def __init__(self, num_channels_in, num_channels_out, kernel_size, stride, + padding, act): + + super().__init__() + assert len(kernel_size) == 2 + assert len(stride) == 2 + assert len(padding) == 2 + self.kernel_size = kernel_size + self.stride = stride + self.padding = padding + + self.conv = nn.Conv2D( + num_channels_in, + num_channels_out, + kernel_size=kernel_size, + stride=stride, + padding=padding, + weight_attr=None, + bias_attr=False, + data_format='NCHW') + + self.bn = nn.BatchNorm2D( + num_channels_out, + weight_attr=None, + bias_attr=None, + data_format='NCHW') + self.act = F.relu if act == 'relu' else brelu + + def forward(self, x, x_len): + """ + x(Tensor): audio, shape [B, C, D, T] + """ + x = self.conv(x) + x = self.bn(x) + x = self.act(x) + + x_len = (x_len - self.kernel_size[1] + 2 * self.padding[1] + ) // self.stride[1] + 1 + + # reset padding part to 0 + masks = make_non_pad_mask(x_len) #[B, T] + masks = masks.unsqueeze(1).unsqueeze(1) # [B, 1, 1, T] + # TODO(Hui Zhang): not support bool multiply + # masks = masks.type_as(x) + masks = masks.astype(x.dtype) + x = x.multiply(masks) + return x, x_len + + +class ConvStack(nn.Layer): + """Convolution group with stacked convolution layers. + + :param feat_size: audio feature dim. 
+ :type feat_size: int + :param num_stacks: Number of stacked convolution layers. + :type num_stacks: int + """ + + def __init__(self, feat_size, num_stacks): + super().__init__() + self.feat_size = feat_size # D + self.num_stacks = num_stacks + + self.conv_in = ConvBn( + num_channels_in=1, + num_channels_out=32, + kernel_size=(41, 11), #[D, T] + stride=(2, 3), + padding=(20, 5), + act='brelu') + + out_channel = 32 + convs = [ + ConvBn( + num_channels_in=32, + num_channels_out=out_channel, + kernel_size=(21, 11), + stride=(2, 1), + padding=(10, 5), + act='brelu') for i in range(num_stacks - 1) + ] + self.conv_stack = nn.LayerList(convs) + + # conv output feat_dim + output_height = (feat_size - 1) // 2 + 1 + for i in range(self.num_stacks - 1): + output_height = (output_height - 1) // 2 + 1 + self.output_height = out_channel * output_height + + def forward(self, x, x_len): + """ + x: shape [B, C, D, T] + x_len : shape [B] + """ + x, x_len = self.conv_in(x, x_len) + for i, conv in enumerate(self.conv_stack): + x, x_len = conv(x, x_len) + return x, x_len diff --git a/ernie-sat/paddlespeech/s2t/models/ds2/deepspeech2.py b/ernie-sat/paddlespeech/s2t/models/ds2/deepspeech2.py new file mode 100644 index 0000000000000000000000000000000000000000..9c6b66c251421db54488f97eeb29acef78d0cdfb --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/models/ds2/deepspeech2.py @@ -0,0 +1,267 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
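`ConvStack` above shrinks the frequency axis with stride-2 convolutions and then flattens the 32 output channels into the per-frame vector handed to the RNN stack; its `output_height` attribute encodes exactly that arithmetic. A standalone sketch of the same computation (not part of the diff; the 161-dim linear-spectrogram input and the 2-layer default are the values referenced elsewhere in this model):

```python
def conv_stack_output_height(feat_size: int, num_stacks: int, out_channel: int = 32) -> int:
    """Mirror ConvStack.output_height: conv_in and every stacked ConvBn halve D."""
    height = (feat_size - 1) // 2 + 1        # conv_in: stride 2 along the feature axis
    for _ in range(num_stacks - 1):          # remaining ConvBn layers, also stride 2 on D
        height = (height - 1) // 2 + 1
    return out_channel * height              # channels are flattened into the frame vector


print(conv_stack_output_height(feat_size=161, num_stacks=2))  # 32 * 41 = 1312
```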
+"""Deepspeech2 ASR Model""" +import paddle +from paddle import nn + +from paddlespeech.s2t.models.ds2.conv import ConvStack +from paddlespeech.s2t.models.ds2.rnn import RNNStack +from paddlespeech.s2t.modules.ctc import CTCDecoder +from paddlespeech.s2t.utils import layer_tools +from paddlespeech.s2t.utils.checkpoint import Checkpoint +from paddlespeech.s2t.utils.log import Log + +logger = Log(__name__).getlog() + +__all__ = ['DeepSpeech2Model', 'DeepSpeech2InferModel'] + + +class CRNNEncoder(nn.Layer): + def __init__(self, + feat_size, + dict_size, + num_conv_layers=2, + num_rnn_layers=3, + rnn_size=1024, + use_gru=False, + share_rnn_weights=True): + super().__init__() + self.rnn_size = rnn_size + self.feat_size = feat_size # 161 for linear + self.dict_size = dict_size + + self.conv = ConvStack(feat_size, num_conv_layers) + + i_size = self.conv.output_height # H after conv stack + self.rnn = RNNStack( + i_size=i_size, + h_size=rnn_size, + num_stacks=num_rnn_layers, + use_gru=use_gru, + share_rnn_weights=share_rnn_weights) + + @property + def output_size(self): + return self.rnn_size * 2 + + def forward(self, audio, audio_len): + """Compute Encoder outputs + + Args: + audio (Tensor): [B, Tmax, D] + text (Tensor): [B, Umax] + audio_len (Tensor): [B] + text_len (Tensor): [B] + Returns: + x (Tensor): encoder outputs, [B, T, D] + x_lens (Tensor): encoder length, [B] + """ + # [B, T, D] -> [B, D, T] + audio = audio.transpose([0, 2, 1]) + # [B, D, T] -> [B, C=1, D, T] + x = audio.unsqueeze(1) + x_lens = audio_len + + # convolution group + x, x_lens = self.conv(x, x_lens) + + # convert data from convolution feature map to sequence of vectors + #B, C, D, T = paddle.shape(x) # not work under jit + x = x.transpose([0, 3, 1, 2]) #[B, T, C, D] + #x = x.reshape([B, T, C * D]) #[B, T, C*D] # not work under jit + x = x.reshape([0, 0, -1]) #[B, T, C*D] + + # remove padding part + x, x_lens = self.rnn(x, x_lens) #[B, T, D] + return x, x_lens + + +class DeepSpeech2Model(nn.Layer): + """The DeepSpeech2 network structure. + + :param audio_data: Audio spectrogram data layer. + :type audio_data: Variable + :param text_data: Transcription text data layer. + :type text_data: Variable + :param audio_len: Valid sequence length data layer. + :type audio_len: Variable + :param masks: Masks data layer to reset padding. + :type masks: Variable + :param dict_size: Dictionary size for tokenized transcription. + :type dict_size: int + :param num_conv_layers: Number of stacking convolution layers. + :type num_conv_layers: int + :param num_rnn_layers: Number of stacking RNN layers. + :type num_rnn_layers: int + :param rnn_size: RNN layer size (dimension of RNN cells). + :type rnn_size: int + :param use_gru: Use gru if set True. Use simple rnn if set False. + :type use_gru: bool + :param share_rnn_weights: Whether to share input-hidden weights between + forward and backward direction RNNs. + It is only available when use_gru=False. + :type share_weights: bool + :return: A tuple of an output unnormalized log probability layer ( + before softmax) and a ctc cost layer. 
+ :rtype: tuple of LayerOutput + """ + + def __init__(self, + feat_size, + dict_size, + num_conv_layers=2, + num_rnn_layers=3, + rnn_size=1024, + use_gru=False, + share_rnn_weights=True, + blank_id=0, + ctc_grad_norm_type=None): + super().__init__() + self.encoder = CRNNEncoder( + feat_size=feat_size, + dict_size=dict_size, + num_conv_layers=num_conv_layers, + num_rnn_layers=num_rnn_layers, + rnn_size=rnn_size, + use_gru=use_gru, + share_rnn_weights=share_rnn_weights) + assert (self.encoder.output_size == rnn_size * 2) + + self.decoder = CTCDecoder( + odim=dict_size, # is in vocab + enc_n_units=self.encoder.output_size, + blank_id=blank_id, + dropout_rate=0.0, + reduction=True, # sum + batch_average=True, # sum / batch_size + grad_norm_type=ctc_grad_norm_type) + + def forward(self, audio, audio_len, text, text_len): + """Compute Model loss + + Args: + audio (Tensors): [B, T, D] + audio_len (Tensor): [B] + text (Tensor): [B, U] + text_len (Tensor): [B] + + Returns: + loss (Tensor): [1] + """ + eouts, eouts_len = self.encoder(audio, audio_len) + loss = self.decoder(eouts, eouts_len, text, text_len) + return loss + + @paddle.no_grad() + def decode(self, audio, audio_len): + # decoders only accept string encoded in utf-8 + + # Make sure the decoder has been initialized + eouts, eouts_len = self.encoder(audio, audio_len) + probs = self.decoder.softmax(eouts) + batch_size = probs.shape[0] + self.decoder.reset_decoder(batch_size=batch_size) + self.decoder.next(probs, eouts_len) + trans_best, trans_beam = self.decoder.decode() + + return trans_best + + @classmethod + def from_pretrained(cls, dataloader, config, checkpoint_path): + """Build a DeepSpeech2Model model from a pretrained model. + Parameters + ---------- + dataloader: paddle.io.DataLoader + + config: yacs.config.CfgNode + model configs + + checkpoint_path: Path or str + the path of pretrained model checkpoint, without extension name + + Returns + ------- + DeepSpeech2Model + The model built from pretrained result. + """ + model = cls( + feat_size=dataloader.collate_fn.feature_size, + dict_size=dataloader.collate_fn.vocab_size, + num_conv_layers=config.num_conv_layers, + num_rnn_layers=config.num_rnn_layers, + rnn_size=config.rnn_layer_size, + use_gru=config.use_gru, + share_rnn_weights=config.share_rnn_weights, + blank_id=config.blank_id, + ctc_grad_norm_type=config.get('ctc_grad_norm_type', None), ) + infos = Checkpoint().load_parameters( + model, checkpoint_path=checkpoint_path) + logger.info(f"checkpoint info: {infos}") + layer_tools.summary(model) + return model + + @classmethod + def from_config(cls, config): + """Build a DeepSpeec2Model from config + Parameters + + config: yacs.config.CfgNode + config + Returns + ------- + DeepSpeech2Model + The model built from config. 
+ """ + model = cls( + feat_size=config.input_dim, + dict_size=config.output_dim, + num_conv_layers=config.num_conv_layers, + num_rnn_layers=config.num_rnn_layers, + rnn_size=config.rnn_layer_size, + use_gru=config.use_gru, + share_rnn_weights=config.share_rnn_weights, + blank_id=config.blank_id, + ctc_grad_norm_type=config.get('ctc_grad_norm_type', None), ) + return model + + +class DeepSpeech2InferModel(DeepSpeech2Model): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def forward(self, audio, audio_len): + """export model function + + Args: + audio (Tensor): [B, T, D] + audio_len (Tensor): [B] + + Returns: + probs: probs after softmax + """ + eouts, eouts_len = self.encoder(audio, audio_len) + probs = self.decoder.softmax(eouts) + return probs, eouts_len + + def export(self): + static_model = paddle.jit.to_static( + self, + input_spec=[ + paddle.static.InputSpec( + shape=[None, None, self.encoder.feat_size], + dtype='float32'), # audio, [B,T,D] + paddle.static.InputSpec(shape=[None], + dtype='int64'), # audio_length, [B] + ]) + return static_model diff --git a/ernie-sat/paddlespeech/s2t/models/ds2/rnn.py b/ernie-sat/paddlespeech/s2t/models/ds2/rnn.py new file mode 100644 index 0000000000000000000000000000000000000000..f655b2d822fdcdc282649cac6354ddcdab021c06 --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/models/ds2/rnn.py @@ -0,0 +1,315 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import math + +import paddle +from paddle import nn +from paddle.nn import functional as F +from paddle.nn import initializer as I + +from paddlespeech.s2t.modules.activation import brelu +from paddlespeech.s2t.modules.mask import make_non_pad_mask +from paddlespeech.s2t.utils.log import Log + +logger = Log(__name__).getlog() + +__all__ = ['RNNStack'] + + +class RNNCell(nn.RNNCellBase): + r""" + Elman RNN (SimpleRNN) cell. Given the inputs and previous states, it + computes the outputs and updates states. + The formula used is as follows: + .. math:: + h_{t} & = act(x_{t} + b_{ih} + W_{hh}h_{t-1} + b_{hh}) + y_{t} & = h_{t} + + where :math:`act` is for :attr:`activation`. 
+ """ + + def __init__(self, + hidden_size: int, + activation="tanh", + weight_ih_attr=None, + weight_hh_attr=None, + bias_ih_attr=None, + bias_hh_attr=None, + name=None): + super().__init__() + std = 1.0 / math.sqrt(hidden_size) + self.weight_hh = self.create_parameter( + (hidden_size, hidden_size), + weight_hh_attr, + default_initializer=I.Uniform(-std, std)) + self.bias_ih = None + self.bias_hh = self.create_parameter( + (hidden_size, ), + bias_hh_attr, + is_bias=True, + default_initializer=I.Uniform(-std, std)) + + self.hidden_size = hidden_size + if activation not in ["tanh", "relu", "brelu"]: + raise ValueError( + "activation for SimpleRNNCell should be tanh or relu, " + "but get {}".format(activation)) + self.activation = activation + self._activation_fn = paddle.tanh \ + if activation == "tanh" \ + else F.relu + if activation == 'brelu': + self._activation_fn = brelu + + def forward(self, inputs, states=None): + if states is None: + states = self.get_initial_states(inputs, self.state_shape) + pre_h = states + i2h = inputs + if self.bias_ih is not None: + i2h += self.bias_ih + h2h = paddle.matmul(pre_h, self.weight_hh, transpose_y=True) + if self.bias_hh is not None: + h2h += self.bias_hh + h = self._activation_fn(i2h + h2h) + return h, h + + @property + def state_shape(self): + return (self.hidden_size, ) + + +class GRUCell(nn.RNNCellBase): + r""" + Gated Recurrent Unit (GRU) RNN cell. Given the inputs and previous states, + it computes the outputs and updates states. + The formula for GRU used is as follows: + .. math:: + r_{t} & = \sigma(W_{ir}x_{t} + b_{ir} + W_{hr}h_{t-1} + b_{hr}) + z_{t} & = \sigma(W_{iz}x_{t} + b_{iz} + W_{hz}h_{t-1} + b_{hz}) + \widetilde{h}_{t} & = \tanh(W_{ic}x_{t} + b_{ic} + r_{t} * (W_{hc}h_{t-1} + b_{hc})) + h_{t} & = z_{t} * h_{t-1} + (1 - z_{t}) * \widetilde{h}_{t} + y_{t} & = h_{t} + + where :math:`\sigma` is the sigmoid fucntion, and * is the elemetwise + multiplication operator. 
+ """ + + def __init__(self, + input_size: int, + hidden_size: int, + weight_ih_attr=None, + weight_hh_attr=None, + bias_ih_attr=None, + bias_hh_attr=None, + name=None): + super().__init__() + std = 1.0 / math.sqrt(hidden_size) + self.weight_hh = self.create_parameter( + (3 * hidden_size, hidden_size), + weight_hh_attr, + default_initializer=I.Uniform(-std, std)) + self.bias_ih = None + self.bias_hh = self.create_parameter( + (3 * hidden_size, ), + bias_hh_attr, + is_bias=True, + default_initializer=I.Uniform(-std, std)) + + self.hidden_size = hidden_size + self.input_size = input_size + self._gate_activation = F.sigmoid + self._activation = paddle.tanh + + def forward(self, inputs, states=None): + if states is None: + states = self.get_initial_states(inputs, self.state_shape) + + pre_hidden = states + x_gates = inputs + if self.bias_ih is not None: + x_gates = x_gates + self.bias_ih + h_gates = paddle.matmul(pre_hidden, self.weight_hh, transpose_y=True) + if self.bias_hh is not None: + h_gates = h_gates + self.bias_hh + + x_r, x_z, x_c = paddle.split(x_gates, num_or_sections=3, axis=1) + h_r, h_z, h_c = paddle.split(h_gates, num_or_sections=3, axis=1) + + r = self._gate_activation(x_r + h_r) + z = self._gate_activation(x_z + h_z) + c = self._activation(x_c + r * h_c) # apply reset gate after mm + h = (pre_hidden - c) * z + c + # https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/fluid/layers/dynamic_gru_cn.html#dynamic-gru + + return h, h + + @property + def state_shape(self): + r""" + The `state_shape` of GRUCell is a shape `[hidden_size]` (-1 for batch + size would be automatically inserted into shape). The shape corresponds + to the shape of :math:`h_{t-1}`. + """ + return (self.hidden_size, ) + + +class BiRNNWithBN(nn.Layer): + """Bidirectonal simple rnn layer with sequence-wise batch normalization. + The batch normalization is only performed on input-state weights. + + :param size: Dimension of RNN cells. + :type size: int + :param share_weights: Whether to share input-hidden weights between + forward and backward directional RNNs. + :type share_weights: bool + :return: Bidirectional simple rnn layer. + :rtype: Variable + """ + + def __init__(self, i_size: int, h_size: int, share_weights: bool): + super().__init__() + self.share_weights = share_weights + if self.share_weights: + #input-hidden weights shared between bi-directional rnn. 
+ self.fw_fc = nn.Linear(i_size, h_size, bias_attr=False) + # batch norm is only performed on input-state projection + self.fw_bn = nn.BatchNorm1D( + h_size, bias_attr=None, data_format='NLC') + self.bw_fc = self.fw_fc + self.bw_bn = self.fw_bn + else: + self.fw_fc = nn.Linear(i_size, h_size, bias_attr=False) + self.fw_bn = nn.BatchNorm1D( + h_size, bias_attr=None, data_format='NLC') + self.bw_fc = nn.Linear(i_size, h_size, bias_attr=False) + self.bw_bn = nn.BatchNorm1D( + h_size, bias_attr=None, data_format='NLC') + + self.fw_cell = RNNCell(hidden_size=h_size, activation='brelu') + self.bw_cell = RNNCell(hidden_size=h_size, activation='brelu') + self.fw_rnn = nn.RNN( + self.fw_cell, is_reverse=False, time_major=False) #[B, T, D] + self.bw_rnn = nn.RNN( + self.fw_cell, is_reverse=True, time_major=False) #[B, T, D] + + def forward(self, x: paddle.Tensor, x_len: paddle.Tensor): + # x, shape [B, T, D] + fw_x = self.fw_bn(self.fw_fc(x)) + bw_x = self.bw_bn(self.bw_fc(x)) + fw_x, _ = self.fw_rnn(inputs=fw_x, sequence_length=x_len) + bw_x, _ = self.bw_rnn(inputs=bw_x, sequence_length=x_len) + x = paddle.concat([fw_x, bw_x], axis=-1) + return x, x_len + + +class BiGRUWithBN(nn.Layer): + """Bidirectonal gru layer with sequence-wise batch normalization. + The batch normalization is only performed on input-state weights. + + :param name: Name of the layer. + :type name: string + :param input: Input layer. + :type input: Variable + :param size: Dimension of GRU cells. + :type size: int + :param act: Activation type. + :type act: string + :return: Bidirectional GRU layer. + :rtype: Variable + """ + + def __init__(self, i_size: int, h_size: int): + super().__init__() + hidden_size = h_size * 3 + + self.fw_fc = nn.Linear(i_size, hidden_size, bias_attr=False) + self.fw_bn = nn.BatchNorm1D( + hidden_size, bias_attr=None, data_format='NLC') + self.bw_fc = nn.Linear(i_size, hidden_size, bias_attr=False) + self.bw_bn = nn.BatchNorm1D( + hidden_size, bias_attr=None, data_format='NLC') + + self.fw_cell = GRUCell(input_size=hidden_size, hidden_size=h_size) + self.bw_cell = GRUCell(input_size=hidden_size, hidden_size=h_size) + self.fw_rnn = nn.RNN( + self.fw_cell, is_reverse=False, time_major=False) #[B, T, D] + self.bw_rnn = nn.RNN( + self.fw_cell, is_reverse=True, time_major=False) #[B, T, D] + + def forward(self, x, x_len): + # x, shape [B, T, D] + fw_x = self.fw_bn(self.fw_fc(x)) + bw_x = self.bw_bn(self.bw_fc(x)) + fw_x, _ = self.fw_rnn(inputs=fw_x, sequence_length=x_len) + bw_x, _ = self.bw_rnn(inputs=bw_x, sequence_length=x_len) + x = paddle.concat([fw_x, bw_x], axis=-1) + return x, x_len + + +class RNNStack(nn.Layer): + """RNN group with stacked bidirectional simple RNN or GRU layers. + + :param input: Input layer. + :type input: Variable + :param size: Dimension of RNN cells in each layer. + :type size: int + :param num_stacks: Number of stacked rnn layers. + :type num_stacks: int + :param use_gru: Use gru if set True. Use simple rnn if set False. + :type use_gru: bool + :param share_rnn_weights: Whether to share input-hidden weights between + forward and backward directional RNNs. + It is only available when use_gru=False. + :type share_weights: bool + :return: Output layer of the RNN group. 
+ :rtype: Variable + """ + + def __init__(self, + i_size: int, + h_size: int, + num_stacks: int, + use_gru: bool, + share_rnn_weights: bool): + super().__init__() + rnn_stacks = [] + for i in range(num_stacks): + if use_gru: + #default:GRU using tanh + rnn_stacks.append(BiGRUWithBN(i_size=i_size, h_size=h_size)) + else: + rnn_stacks.append( + BiRNNWithBN( + i_size=i_size, + h_size=h_size, + share_weights=share_rnn_weights)) + i_size = h_size * 2 + + self.rnn_stacks = nn.LayerList(rnn_stacks) + + def forward(self, x: paddle.Tensor, x_len: paddle.Tensor): + """ + x: shape [B, T, D] + x_len: shpae [B] + """ + for i, rnn in enumerate(self.rnn_stacks): + x, x_len = rnn(x, x_len) + masks = make_non_pad_mask(x_len) #[B, T] + masks = masks.unsqueeze(-1) # [B, T, 1] + # TODO(Hui Zhang): not support bool multiply + masks = masks.astype(x.dtype) + x = x.multiply(masks) + + return x, x_len diff --git a/ernie-sat/paddlespeech/s2t/models/ds2_online/__init__.py b/ernie-sat/paddlespeech/s2t/models/ds2_online/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..c5fdab1bc66aff815ed217d99703c1ff3493975a --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/models/ds2_online/__init__.py @@ -0,0 +1,29 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from .deepspeech2 import DeepSpeech2InferModelOnline +from .deepspeech2 import DeepSpeech2ModelOnline +from paddlespeech.s2t.utils import dynamic_pip_install + +try: + import paddlespeech_ctcdecoders +except ImportError: + try: + package_name = 'paddlespeech_ctcdecoders' + dynamic_pip_install.install(package_name) + except Exception: + raise RuntimeError( + "Can not install package paddlespeech_ctcdecoders on your system. \ + The DeepSpeech2 model is not supported for your system") + +__all__ = ['DeepSpeech2ModelOnline', 'DeepSpeech2InferModelOnline'] diff --git a/ernie-sat/paddlespeech/s2t/models/ds2_online/conv.py b/ernie-sat/paddlespeech/s2t/models/ds2_online/conv.py new file mode 100644 index 0000000000000000000000000000000000000000..25a9715a3029f73bd1e1d508ec8b70a861144f66 --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/models/ds2_online/conv.py @@ -0,0 +1,33 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
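The `RNNStack` defined above in `rnn.py` chains bidirectional layers and doubles the feature size after every layer. A small usage sketch (assuming the vendored `paddlespeech` package and a working Paddle install are on the path; all shapes and hyper-parameters below are illustrative, with 1312 matching the ConvStack output for 161-dim linear features and 1024 the DeepSpeech2 default `rnn_size`):

```python
import paddle

from paddlespeech.s2t.models.ds2.rnn import RNNStack

x = paddle.randn([4, 50, 1312])                            # [B, T, D] padded batch
x_len = paddle.to_tensor([50, 47, 30, 12], dtype='int64')  # valid lengths per utterance

rnn = RNNStack(i_size=1312, h_size=1024, num_stacks=3,
               use_gru=False, share_rnn_weights=True)
y, y_len = rnn(x, x_len)
print(y.shape)  # [4, 50, 2048]: every stacked layer is bidirectional, so 2 * h_size
```

Padded positions are zeroed after every layer via `make_non_pad_mask`, which is why the lengths tensor is threaded through the whole stack.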
+import paddle + +from paddlespeech.s2t.modules.subsampling import Conv2dSubsampling4 + + +class Conv2dSubsampling4Online(Conv2dSubsampling4): + def __init__(self, idim: int, odim: int, dropout_rate: float): + super().__init__(idim, odim, dropout_rate, None) + self.output_dim = ((idim - 1) // 2 - 1) // 2 * odim + self.receptive_field_length = 2 * ( + 3 - 1) + 3 # stride_1 * (kernel_size_2 - 1) + kerel_size_1 + + def forward(self, x: paddle.Tensor, + x_len: paddle.Tensor) -> [paddle.Tensor, paddle.Tensor]: + x = x.unsqueeze(1) # (b, c=1, t, f) + x = self.conv(x) + #b, c, t, f = paddle.shape(x) #not work under jit + x = x.transpose([0, 2, 1, 3]).reshape([0, 0, -1]) + x_len = ((x_len - 1) // 2 - 1) // 2 + return x, x_len diff --git a/ernie-sat/paddlespeech/s2t/models/ds2_online/deepspeech2.py b/ernie-sat/paddlespeech/s2t/models/ds2_online/deepspeech2.py new file mode 100644 index 0000000000000000000000000000000000000000..9574a62bd5f5662aca2eca0b5e8f40d9e71037d0 --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/models/ds2_online/deepspeech2.py @@ -0,0 +1,397 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Deepspeech2 ASR Online Model""" +import paddle +import paddle.nn.functional as F +from paddle import nn + +from paddlespeech.s2t.models.ds2_online.conv import Conv2dSubsampling4Online +from paddlespeech.s2t.modules.ctc import CTCDecoder +from paddlespeech.s2t.utils import layer_tools +from paddlespeech.s2t.utils.checkpoint import Checkpoint +from paddlespeech.s2t.utils.log import Log +logger = Log(__name__).getlog() + +__all__ = ['DeepSpeech2ModelOnline', 'DeepSpeech2InferModelOnline'] + + +class CRNNEncoder(nn.Layer): + def __init__(self, + feat_size, + dict_size, + num_conv_layers=2, + num_rnn_layers=4, + rnn_size=1024, + rnn_direction='forward', + num_fc_layers=2, + fc_layers_size_list=[512, 256], + use_gru=False): + super().__init__() + self.rnn_size = rnn_size + self.feat_size = feat_size # 161 for linear + self.dict_size = dict_size + self.num_rnn_layers = num_rnn_layers + self.num_fc_layers = num_fc_layers + self.rnn_direction = rnn_direction + self.fc_layers_size_list = fc_layers_size_list + self.use_gru = use_gru + self.conv = Conv2dSubsampling4Online(feat_size, 32, dropout_rate=0.0) + + self.output_dim = self.conv.output_dim + + i_size = self.conv.output_dim + self.rnn = nn.LayerList() + self.layernorm_list = nn.LayerList() + self.fc_layers_list = nn.LayerList() + if rnn_direction == 'bidirect' or rnn_direction == 'bidirectional': + layernorm_size = 2 * rnn_size + elif rnn_direction == 'forward': + layernorm_size = rnn_size + else: + raise Exception("Wrong rnn direction") + for i in range(0, num_rnn_layers): + if i == 0: + rnn_input_size = i_size + else: + rnn_input_size = layernorm_size + if use_gru is True: + self.rnn.append( + nn.GRU( + input_size=rnn_input_size, + hidden_size=rnn_size, + num_layers=1, + direction=rnn_direction)) + else: + self.rnn.append( + nn.LSTM( + input_size=rnn_input_size, + 
hidden_size=rnn_size, + num_layers=1, + direction=rnn_direction)) + self.layernorm_list.append(nn.LayerNorm(layernorm_size)) + self.output_dim = layernorm_size + + fc_input_size = layernorm_size + for i in range(self.num_fc_layers): + self.fc_layers_list.append( + nn.Linear(fc_input_size, fc_layers_size_list[i])) + fc_input_size = fc_layers_size_list[i] + self.output_dim = fc_layers_size_list[i] + + @property + def output_size(self): + return self.output_dim + + def forward(self, x, x_lens, init_state_h_box=None, init_state_c_box=None): + """Compute Encoder outputs + + Args: + x (Tensor): [B, T, D] + x_lens (Tensor): [B] + init_state_h_box(Tensor): init_states h for RNN layers: [num_rnn_layers * num_directions, batch_size, hidden_size] + init_state_c_box(Tensor): init_states c for RNN layers: [num_rnn_layers * num_directions, batch_size, hidden_size] + Return: + x (Tensor): encoder outputs, [B, T, D] + x_lens (Tensor): encoder length, [B] + final_state_h_box(Tensor): final_states h for RNN layers: [num_rnn_layers * num_directions, batch_size, hidden_size] + final_state_c_box(Tensor): final_states c for RNN layers: [num_rnn_layers * num_directions, batch_size, hidden_size] + """ + if init_state_h_box is not None: + init_state_list = None + + if self.use_gru is True: + init_state_h_list = paddle.split( + init_state_h_box, self.num_rnn_layers, axis=0) + init_state_list = init_state_h_list + else: + init_state_h_list = paddle.split( + init_state_h_box, self.num_rnn_layers, axis=0) + init_state_c_list = paddle.split( + init_state_c_box, self.num_rnn_layers, axis=0) + init_state_list = [(init_state_h_list[i], init_state_c_list[i]) + for i in range(self.num_rnn_layers)] + else: + init_state_list = [None] * self.num_rnn_layers + + x, x_lens = self.conv(x, x_lens) + final_chunk_state_list = [] + for i in range(0, self.num_rnn_layers): + x, final_state = self.rnn[i](x, init_state_list[i], + x_lens) #[B, T, D] + final_chunk_state_list.append(final_state) + x = self.layernorm_list[i](x) + + for i in range(self.num_fc_layers): + x = self.fc_layers_list[i](x) + x = F.relu(x) + + if self.use_gru is True: + final_chunk_state_h_box = paddle.concat( + final_chunk_state_list, axis=0) + final_chunk_state_c_box = init_state_c_box + else: + final_chunk_state_h_list = [ + final_chunk_state_list[i][0] for i in range(self.num_rnn_layers) + ] + final_chunk_state_c_list = [ + final_chunk_state_list[i][1] for i in range(self.num_rnn_layers) + ] + final_chunk_state_h_box = paddle.concat( + final_chunk_state_h_list, axis=0) + final_chunk_state_c_box = paddle.concat( + final_chunk_state_c_list, axis=0) + + return x, x_lens, final_chunk_state_h_box, final_chunk_state_c_box + + def forward_chunk_by_chunk(self, x, x_lens, decoder_chunk_size=8): + """Compute Encoder outputs + + Args: + x (Tensor): [B, T, D] + x_lens (Tensor): [B] + decoder_chunk_size: The chunk size of decoder + Returns: + eouts_list (List of Tensor): The list of encoder outputs in chunk_size: [B, chunk_size, D] * num_chunks + eouts_lens_list (List of Tensor): The list of encoder length in chunk_size: [B] * num_chunks + final_state_h_box(Tensor): final_states h for RNN layers: [num_rnn_layers * num_directions, batch_size, hidden_size] + final_state_c_box(Tensor): final_states c for RNN layers: [num_rnn_layers * num_directions, batch_size, hidden_size] + """ + subsampling_rate = self.conv.subsampling_rate + receptive_field_length = self.conv.receptive_field_length + chunk_size = (decoder_chunk_size - 1 + ) * subsampling_rate + receptive_field_length + 
chunk_stride = subsampling_rate * decoder_chunk_size + max_len = x.shape[1] + assert (chunk_size <= max_len) + + eouts_chunk_list = [] + eouts_chunk_lens_list = [] + if (max_len - chunk_size) % chunk_stride != 0: + padding_len = chunk_stride - (max_len - chunk_size) % chunk_stride + else: + padding_len = 0 + padding = paddle.zeros((x.shape[0], padding_len, x.shape[2])) + padded_x = paddle.concat([x, padding], axis=1) + num_chunk = (max_len + padding_len - chunk_size) / chunk_stride + 1 + num_chunk = int(num_chunk) + chunk_state_h_box = None + chunk_state_c_box = None + final_state_h_box = None + final_state_c_box = None + for i in range(0, num_chunk): + start = i * chunk_stride + end = start + chunk_size + x_chunk = padded_x[:, start:end, :] + + x_len_left = paddle.where(x_lens - i * chunk_stride < 0, + paddle.zeros_like(x_lens), + x_lens - i * chunk_stride) + x_chunk_len_tmp = paddle.ones_like(x_lens) * chunk_size + x_chunk_lens = paddle.where(x_len_left < x_chunk_len_tmp, + x_len_left, x_chunk_len_tmp) + + eouts_chunk, eouts_chunk_lens, chunk_state_h_box, chunk_state_c_box = self.forward( + x_chunk, x_chunk_lens, chunk_state_h_box, chunk_state_c_box) + + eouts_chunk_list.append(eouts_chunk) + eouts_chunk_lens_list.append(eouts_chunk_lens) + final_state_h_box = chunk_state_h_box + final_state_c_box = chunk_state_c_box + return eouts_chunk_list, eouts_chunk_lens_list, final_state_h_box, final_state_c_box + + +class DeepSpeech2ModelOnline(nn.Layer): + """The DeepSpeech2 network structure for online. + + :param audio: Audio spectrogram data layer. + :type audio: Variable + :param text: Transcription text data layer. + :type text: Variable + :param audio_len: Valid sequence length data layer. + :type audio_len: Variable + :param feat_size: feature size for audio. + :type feat_size: int + :param dict_size: Dictionary size for tokenized transcription. + :type dict_size: int + :param num_conv_layers: Number of stacking convolution layers. + :type num_conv_layers: int + :param num_rnn_layers: Number of stacking RNN layers. + :type num_rnn_layers: int + :param rnn_size: RNN layer size (dimension of RNN cells). + :type rnn_size: int + :param num_fc_layers: Number of stacking FC layers. + :type num_fc_layers: int + :param fc_layers_size_list: The list of FC layer sizes. + :type fc_layers_size_list: [int,] + :param use_gru: Use gru if set True. Use simple rnn if set False. + :type use_gru: bool + :return: A tuple of an output unnormalized log probability layer ( + before softmax) and a ctc cost layer. 
+ :rtype: tuple of LayerOutput + """ + + def __init__( + self, + feat_size, + dict_size, + num_conv_layers=2, + num_rnn_layers=4, + rnn_size=1024, + rnn_direction='forward', + num_fc_layers=2, + fc_layers_size_list=[512, 256], + use_gru=False, + blank_id=0, + ctc_grad_norm_type=None, ): + super().__init__() + self.encoder = CRNNEncoder( + feat_size=feat_size, + dict_size=dict_size, + num_conv_layers=num_conv_layers, + num_rnn_layers=num_rnn_layers, + rnn_direction=rnn_direction, + num_fc_layers=num_fc_layers, + fc_layers_size_list=fc_layers_size_list, + rnn_size=rnn_size, + use_gru=use_gru) + + self.decoder = CTCDecoder( + odim=dict_size, # is in vocab + enc_n_units=self.encoder.output_size, + blank_id=blank_id, + dropout_rate=0.0, + reduction=True, # sum + batch_average=True, # sum / batch_size + grad_norm_type=ctc_grad_norm_type) + + def forward(self, audio, audio_len, text, text_len): + """Compute Model loss + + Args: + audio (Tensor): [B, T, D] + audio_len (Tensor): [B] + text (Tensor): [B, U] + text_len (Tensor): [B] + + Returns: + loss (Tensor): [1] + """ + eouts, eouts_len, final_state_h_box, final_state_c_box = self.encoder( + audio, audio_len, None, None) + loss = self.decoder(eouts, eouts_len, text, text_len) + return loss + + @paddle.no_grad() + def decode(self, audio, audio_len): + # decoders only accept string encoded in utf-8 + # Make sure the decoder has been initialized + eouts, eouts_len, final_state_h_box, final_state_c_box = self.encoder( + audio, audio_len, None, None) + probs = self.decoder.softmax(eouts) + batch_size = probs.shape[0] + self.decoder.reset_decoder(batch_size=batch_size) + self.decoder.next(probs, eouts_len) + trans_best, trans_beam = self.decoder.decode() + return trans_best + + @classmethod + def from_pretrained(cls, dataloader, config, checkpoint_path): + """Build a DeepSpeech2Model model from a pretrained model. + Parameters + ---------- + dataloader: paddle.io.DataLoader + + config: yacs.config.CfgNode + model configs + + checkpoint_path: Path or str + the path of pretrained model checkpoint, without extension name + + Returns + ------- + DeepSpeech2ModelOnline + The model built from pretrained result. + """ + model = cls( + feat_size=dataloader.collate_fn.feature_size, + dict_size=dataloader.collate_fn.vocab_size, + num_conv_layers=config.num_conv_layers, + num_rnn_layers=config.num_rnn_layers, + rnn_size=config.rnn_layer_size, + rnn_direction=config.rnn_direction, + num_fc_layers=config.num_fc_layers, + fc_layers_size_list=config.fc_layers_size_list, + use_gru=config.use_gru, + blank_id=config.blank_id, + ctc_grad_norm_type=config.get('ctc_grad_norm_type', None), ) + infos = Checkpoint().load_parameters( + model, checkpoint_path=checkpoint_path) + logger.info(f"checkpoint info: {infos}") + layer_tools.summary(model) + return model + + @classmethod + def from_config(cls, config): + """Build a DeepSpeec2ModelOnline from config + Parameters + + config: yacs.config.CfgNode + config + Returns + ------- + DeepSpeech2ModelOnline + The model built from config. 
+ """ + model = cls( + feat_size=config.input_dim, + dict_size=config.output_dim, + num_conv_layers=config.num_conv_layers, + num_rnn_layers=config.num_rnn_layers, + rnn_size=config.rnn_layer_size, + rnn_direction=config.rnn_direction, + num_fc_layers=config.num_fc_layers, + fc_layers_size_list=config.fc_layers_size_list, + use_gru=config.use_gru, + blank_id=config.blank_id, + ctc_grad_norm_type=config.get('ctc_grad_norm_type', None), ) + return model + + +class DeepSpeech2InferModelOnline(DeepSpeech2ModelOnline): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def forward(self, audio_chunk, audio_chunk_lens, chunk_state_h_box, + chunk_state_c_box): + eouts_chunk, eouts_chunk_lens, final_state_h_box, final_state_c_box = self.encoder( + audio_chunk, audio_chunk_lens, chunk_state_h_box, chunk_state_c_box) + probs_chunk = self.decoder.softmax(eouts_chunk) + return probs_chunk, eouts_chunk_lens, final_state_h_box, final_state_c_box + + def export(self): + static_model = paddle.jit.to_static( + self, + input_spec=[ + paddle.static.InputSpec( + shape=[None, None, + self.encoder.feat_size], #[B, chunk_size, feat_dim] + dtype='float32'), + paddle.static.InputSpec(shape=[None], + dtype='int64'), # audio_length, [B] + paddle.static.InputSpec( + shape=[None, None, None], dtype='float32'), + paddle.static.InputSpec( + shape=[None, None, None], dtype='float32') + ]) + return static_model diff --git a/ernie-sat/paddlespeech/s2t/models/lm/__init__.py b/ernie-sat/paddlespeech/s2t/models/lm/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..185a92b8d94d3426d616c0624f0f2ee04339349e --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/models/lm/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/ernie-sat/paddlespeech/s2t/models/lm/dataset.py b/ernie-sat/paddlespeech/s2t/models/lm/dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..25a47be60398aec7e311de5703532b37f1dc03c6 --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/models/lm/dataset.py @@ -0,0 +1,74 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import numpy as np +from paddle.io import Dataset + +from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer +from paddlespeech.s2t.io.utility import pad_list + + +class TextDataset(Dataset): + @classmethod + def from_file(cls, file_path): + dataset = cls(file_path) + return dataset + + def __init__(self, file_path): + self._manifest = [] + with open(file_path) as f: + for line in f: + self._manifest.append(line.strip()) + + def __len__(self): + return len(self._manifest) + + def __getitem__(self, idx): + return self._manifest[idx] + + +class TextCollatorSpm(): + def __init__(self, unit_type, vocab_filepath, spm_model_prefix): + assert (vocab_filepath is not None) + self.text_featurizer = TextFeaturizer( + unit_type=unit_type, + vocab=vocab_filepath, + spm_model_prefix=spm_model_prefix) + self.eos_id = self.text_featurizer.eos_id + self.blank_id = self.text_featurizer.blank_id + + def __call__(self, batch): + """ + return type [List, np.array [B, T], np.array [B, T], np.array[B]] + """ + keys = [] + texts = [] + texts_input = [] + texts_output = [] + text_lens = [] + + for idx, item in enumerate(batch): + key = item.split(" ")[0].strip() + text = " ".join(item.split(" ")[1:]) + keys.append(key) + token_ids = self.text_featurizer.featurize(text) + texts_input.append( + np.array([self.eos_id] + token_ids).astype(np.int64)) + texts_output.append( + np.array(token_ids + [self.eos_id]).astype(np.int64)) + text_lens.append(len(token_ids) + 1) + + ys_input_pad = pad_list(texts_input, self.blank_id).astype(np.int64) + ys_output_pad = pad_list(texts_output, self.blank_id).astype(np.int64) + y_lens = np.array(text_lens).astype(np.int64) + return keys, ys_input_pad, ys_output_pad, y_lens diff --git a/ernie-sat/paddlespeech/s2t/models/lm/transformer.py b/ernie-sat/paddlespeech/s2t/models/lm/transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..85bd7c2329fbf416d254bd9eabcaaf181fe7db01 --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/models/lm/transformer.py @@ -0,0 +1,266 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
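A toy run of the `TextCollatorSpm` input/target construction (made-up token ids; in practice `eos_id`/`blank_id` come from the text featurizer, and padding is done by `pad_list`):

```python
import numpy as np

eos_id, blank_id = 2, 0
batch_token_ids = [[5, 7, 9], [4, 6]]

texts_input = [np.array([eos_id] + ids, dtype=np.int64) for ids in batch_token_ids]   # <eos> prepended
texts_output = [np.array(ids + [eos_id], dtype=np.int64) for ids in batch_token_ids]  # <eos> appended
text_lens = [len(ids) + 1 for ids in batch_token_ids]

max_len = max(text_lens)
def pad(seqs, pad_value):
    out = np.full((len(seqs), max_len), pad_value, dtype=np.int64)
    for i, s in enumerate(seqs):
        out[i, :len(s)] = s
    return out

ys_input_pad = pad(texts_input, blank_id)    # [[2 5 7 9], [2 4 6 0]]
ys_output_pad = pad(texts_output, blank_id)  # [[5 7 9 2], [4 6 2 0]]
```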
+# Modified from espnet(https://github.com/espnet/espnet) +from typing import Any +from typing import List +from typing import Tuple + +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from paddlespeech.s2t.decoders.scorers.scorer_interface import BatchScorerInterface +from paddlespeech.s2t.models.lm_interface import LMInterface +from paddlespeech.s2t.modules.encoder import TransformerEncoder +from paddlespeech.s2t.modules.mask import subsequent_mask +from paddlespeech.s2t.utils.log import Log + +logger = Log(__name__).getlog() + + +class TransformerLM(nn.Layer, LMInterface, BatchScorerInterface): + def __init__(self, + n_vocab: int, + pos_enc: str=None, + embed_unit: int=128, + att_unit: int=256, + head: int=2, + unit: int=1024, + layer: int=4, + dropout_rate: float=0.5, + emb_dropout_rate: float=0.0, + att_dropout_rate: float=0.0, + tie_weights: bool=False, + **kwargs): + nn.Layer.__init__(self) + + if pos_enc == "sinusoidal": + pos_enc_layer_type = "abs_pos" + elif pos_enc is None: + pos_enc_layer_type = "no_pos" + else: + raise ValueError(f"unknown pos-enc option: {pos_enc}") + + self.embed = nn.Embedding(n_vocab, embed_unit) + + if emb_dropout_rate == 0.0: + self.embed_drop = None + else: + self.embed_drop = nn.Dropout(emb_dropout_rate) + + self.encoder = TransformerEncoder( + input_size=embed_unit, + output_size=att_unit, + attention_heads=head, + linear_units=unit, + num_blocks=layer, + dropout_rate=dropout_rate, + attention_dropout_rate=att_dropout_rate, + input_layer="linear", + pos_enc_layer_type=pos_enc_layer_type, + concat_after=False, + static_chunk_size=1, + use_dynamic_chunk=False, + use_dynamic_left_chunk=False) + + self.decoder = nn.Linear(att_unit, n_vocab) + + logger.info("Tie weights set to {}".format(tie_weights)) + logger.info("Dropout set to {}".format(dropout_rate)) + logger.info("Emb Dropout set to {}".format(emb_dropout_rate)) + logger.info("Att Dropout set to {}".format(att_dropout_rate)) + + if tie_weights: + assert ( + att_unit == embed_unit + ), "Tie Weights: True need embedding and final dimensions to match" + self.decoder.weight = self.embed.weight + + def _target_mask(self, ys_in_pad): + ys_mask = ys_in_pad != 0 + m = subsequent_mask(ys_mask.size(-1)).unsqueeze(0) + return ys_mask.unsqueeze(-2) & m + + def forward(self, x: paddle.Tensor, t: paddle.Tensor + ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]: + """Compute LM loss value from buffer sequences. + + Args: + x (paddle.Tensor): Input ids. (batch, len) + t (paddle.Tensor): Target ids. 
(batch, len) + + Returns: + tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]: Tuple of + loss to backward (scalar), + negative log-likelihood of t: -log p(t) (scalar) and + the number of elements in x (scalar) + + Notes: + The last two return values are used + in perplexity: p(t)^{-n} = exp(-log p(t) / n) + + """ + batch_size = x.size(0) + xm = x != 0 + xlen = xm.sum(axis=1) + if self.embed_drop is not None: + emb = self.embed_drop(self.embed(x)) + else: + emb = self.embed(x) + h, _ = self.encoder(emb, xlen) + y = self.decoder(h) + loss = F.cross_entropy( + y.view(-1, y.shape[-1]), t.view(-1), reduction="none") + mask = xm.to(loss.dtype) + logp = loss * mask.view(-1) + nll = logp.view(batch_size, -1).sum(-1) + nll_count = mask.sum(-1) + logp = logp.sum() + count = mask.sum() + return logp / count, logp, count, nll, nll_count + + # beam search API (see ScorerInterface) + def score(self, y: paddle.Tensor, state: Any, + x: paddle.Tensor) -> Tuple[paddle.Tensor, Any]: + """Score new token. + + Args: + y (paddle.Tensor): 1D paddle.int64 prefix tokens. + state: Scorer state for prefix tokens + x (paddle.Tensor): encoder feature that generates ys. + + Returns: + tuple[paddle.Tensor, Any]: Tuple of + paddle.float32 scores for next token (n_vocab) + and next state for ys + + """ + y = y.unsqueeze(0) + + if self.embed_drop is not None: + emb = self.embed_drop(self.embed(y)) + else: + emb = self.embed(y) + + h, _, cache = self.encoder.forward_one_step( + emb, self._target_mask(y), cache=state) + h = self.decoder(h[:, -1]) + logp = F.log_softmax(h).squeeze(0) + return logp, cache + + # batch beam search API (see BatchScorerInterface) + def batch_score(self, + ys: paddle.Tensor, + states: List[Any], + xs: paddle.Tensor) -> Tuple[paddle.Tensor, List[Any]]: + """Score new token batch (required). + + Args: + ys (paddle.Tensor): paddle.int64 prefix tokens (n_batch, ylen). + states (List[Any]): Scorer states for prefix tokens. + xs (paddle.Tensor): + The encoder feature that generates ys (n_batch, xlen, n_feat). + + Returns: + tuple[paddle.Tensor, List[Any]]: Tuple of + batchfied scores for next token with shape of `(n_batch, n_vocab)` + and next state list for ys. 
+ + """ + # merge states + n_batch = len(ys) + n_layers = len(self.encoder.encoders) + if states[0] is None: + batch_state = None + else: + # transpose state of [batch, layer] into [layer, batch] + batch_state = [ + paddle.stack([states[b][i] for b in range(n_batch)]) + for i in range(n_layers) + ] + + if self.embed_drop is not None: + emb = self.embed_drop(self.embed(ys)) + else: + emb = self.embed(ys) + + # batch decoding + h, _, states = self.encoder.forward_one_step( + emb, self._target_mask(ys), cache=batch_state) + h = self.decoder(h[:, -1]) + logp = F.log_softmax(h) + + # transpose state of [layer, batch] into [batch, layer] + state_list = [[states[i][b] for i in range(n_layers)] + for b in range(n_batch)] + return logp, state_list + + +if __name__ == "__main__": + tlm = TransformerLM( + n_vocab=5002, + pos_enc=None, + embed_unit=128, + att_unit=512, + head=8, + unit=2048, + layer=16, + dropout_rate=0.5, ) + + # n_vocab: int, + # pos_enc: str=None, + # embed_unit: int=128, + # att_unit: int=256, + # head: int=2, + # unit: int=1024, + # layer: int=4, + # dropout_rate: float=0.5, + # emb_dropout_rate: float = 0.0, + # att_dropout_rate: float = 0.0, + # tie_weights: bool = False,): + paddle.set_device("cpu") + model_dict = paddle.load("transformerLM.pdparams") + tlm.set_state_dict(model_dict) + + tlm.eval() + #Test the score + input2 = np.array([5]) + input2 = paddle.to_tensor(input2) + state = None + output, state = tlm.score(input2, state, None) + + input3 = np.array([5, 10]) + input3 = paddle.to_tensor(input3) + output, state = tlm.score(input3, state, None) + + input4 = np.array([5, 10, 0]) + input4 = paddle.to_tensor(input4) + output, state = tlm.score(input4, state, None) + print("output", output) + """ + #Test the batch score + batch_size = 2 + inp2 = np.array([[5], [10]]) + inp2 = paddle.to_tensor(inp2) + output, states = tlm.batch_score( + inp2, [(None,None,0)] * batch_size) + inp3 = np.array([[100], [30]]) + inp3 = paddle.to_tensor(inp3) + output, states = tlm.batch_score( + inp3, states) + print("output", output) + #print("cache", cache) + #np.save("output_pd.npy", output) + """ diff --git a/ernie-sat/paddlespeech/s2t/models/lm_interface.py b/ernie-sat/paddlespeech/s2t/models/lm_interface.py new file mode 100644 index 0000000000000000000000000000000000000000..c8f3776a9e68afeca01ea27bafa2ae4becdbd44e --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/models/lm_interface.py @@ -0,0 +1,83 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# Modified from espnet(https://github.com/espnet/espnet) +"""Language model interface.""" +import argparse + +from paddlespeech.s2t.decoders.scorers.scorer_interface import ScorerInterface +from paddlespeech.s2t.utils.dynamic_import import dynamic_import + + +class LMInterface(ScorerInterface): + """LM Interface model implementation.""" + + @staticmethod + def add_arguments(parser): + """Add arguments to command line argument parser.""" + return parser + + @classmethod + def build(cls, n_vocab: int, **kwargs): + """Initialize this class with python-level args. + + Args: + idim (int): The number of vocabulary. + + Returns: + LMinterface: A new instance of LMInterface. + + """ + args = argparse.Namespace(**kwargs) + return cls(n_vocab, args) + + def forward(self, x, t): + """Compute LM loss value from buffer sequences. + + Args: + x (torch.Tensor): Input ids. (batch, len) + t (torch.Tensor): Target ids. (batch, len) + + Returns: + tuple[torch.Tensor, torch.Tensor, torch.Tensor]: Tuple of + loss to backward (scalar), + negative log-likelihood of t: -log p(t) (scalar) and + the number of elements in x (scalar) + + Notes: + The last two return values are used + in perplexity: p(t)^{-n} = exp(-log p(t) / n) + + """ + raise NotImplementedError("forward method is not implemented") + + +predefined_lms = { + "transformer": "paddlespeech.s2t.models.lm.transformer:TransformerLM", +} + + +def dynamic_import_lm(module): + """Import LM class dynamically. + + Args: + module (str): module_name:class_name or alias in `predefined_lms` + + Returns: + type: LM class + + """ + model_class = dynamic_import(module, predefined_lms) + assert issubclass(model_class, + LMInterface), f"{module} does not implement LMInterface" + return model_class diff --git a/ernie-sat/paddlespeech/s2t/models/st_interface.py b/ernie-sat/paddlespeech/s2t/models/st_interface.py new file mode 100644 index 0000000000000000000000000000000000000000..4d368590325b8f39baa804574b07dc7f4747eb54 --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/models/st_interface.py @@ -0,0 +1,76 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from espnet(https://github.com/espnet/espnet) +"""ST Interface module.""" +from .asr_interface import ASRInterface +from paddlespeech.s2t.utils.dynamic_import import dynamic_import + + +class STInterface(ASRInterface): + """ST Interface model implementation. + + NOTE: This class is inherited from ASRInterface to enable joint translation + and recognition when performing multi-task learning with the ASR task. + + """ + + def translate(self, + x, + trans_args, + char_list=None, + rnnlm=None, + ensemble_models=[]): + """Recognize x for evaluation. 
+ + :param ndarray x: input acouctic feature (B, T, D) or (T, D) + :param namespace trans_args: argment namespace contraining options + :param list char_list: list of characters + :param paddle.nn.Layer rnnlm: language model module + :return: N-best decoding results + :rtype: list + """ + raise NotImplementedError("translate method is not implemented") + + def translate_batch(self, x, trans_args, char_list=None, rnnlm=None): + """Beam search implementation for batch. + + :param paddle.Tensor x: encoder hidden state sequences (B, Tmax, Henc) + :param namespace trans_args: argument namespace containing options + :param list char_list: list of characters + :param paddle.nn.Layer rnnlm: language model module + :return: N-best decoding results + :rtype: list + """ + raise NotImplementedError("Batch decoding is not supported yet.") + + +predefined_st = { + "transformer": "paddlespeech.s2t.models.u2_st:U2STModel", +} + + +def dynamic_import_st(module): + """Import ST models dynamically. + + Args: + module (str): module_name:class_name or alias in `predefined_st` + + Returns: + type: ST class + + """ + model_class = dynamic_import(module, predefined_st) + assert issubclass(model_class, + STInterface), f"{module} does not implement STInterface" + return model_class diff --git a/ernie-sat/paddlespeech/s2t/models/u2/__init__.py b/ernie-sat/paddlespeech/s2t/models/u2/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a9010f1d09263dc643d16308a8cefbd06744c958 --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/models/u2/__init__.py @@ -0,0 +1,19 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from .u2 import U2InferModel +from .u2 import U2Model +from .updater import U2Evaluator +from .updater import U2Updater + +__all__ = ["U2Model", "U2InferModel", "U2Evaluator", "U2Updater"] diff --git a/ernie-sat/paddlespeech/s2t/models/u2/u2.py b/ernie-sat/paddlespeech/s2t/models/u2/u2.py new file mode 100644 index 0000000000000000000000000000000000000000..6a98607b69721b63b02c833932b74ab77913d078 --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/models/u2/u2.py @@ -0,0 +1,926 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
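Both `dynamic_import_lm` and `dynamic_import_st` resolve a `"module:class"` string (or an alias from the predefined map) to a class. A minimal sketch of that convention using `importlib`; this is not the actual `paddlespeech.s2t.utils.dynamic_import` implementation:

```python
import importlib

predefined = {"transformer": "paddlespeech.s2t.models.lm.transformer:TransformerLM"}

def dynamic_import(name, alias_map):
    target = alias_map.get(name, name)        # alias -> "module:class", or pass the string through
    module_name, class_name = target.split(":")
    return getattr(importlib.import_module(module_name), class_name)

# lm_class = dynamic_import("transformer", predefined)  # -> TransformerLM (needs paddlespeech installed)
```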
+# Modified from wenet(https://github.com/wenet-e2e/wenet) +"""U2 ASR Model +Unified Streaming and Non-streaming Two-pass End-to-end Model for Speech Recognition +(https://arxiv.org/pdf/2012.05481.pdf) +""" +import sys +import time +from collections import defaultdict +from typing import Dict +from typing import List +from typing import Optional +from typing import Tuple + +import paddle +from paddle import jit +from paddle import nn + +from paddlespeech.s2t.decoders.scorers.ctc import CTCPrefixScorer +from paddlespeech.s2t.frontend.utility import IGNORE_ID +from paddlespeech.s2t.frontend.utility import load_cmvn +from paddlespeech.s2t.models.asr_interface import ASRInterface +from paddlespeech.s2t.modules.cmvn import GlobalCMVN +from paddlespeech.s2t.modules.ctc import CTCDecoderBase +from paddlespeech.s2t.modules.decoder import TransformerDecoder +from paddlespeech.s2t.modules.encoder import ConformerEncoder +from paddlespeech.s2t.modules.encoder import TransformerEncoder +from paddlespeech.s2t.modules.initializer import DefaultInitializerContext +from paddlespeech.s2t.modules.loss import LabelSmoothingLoss +from paddlespeech.s2t.modules.mask import make_pad_mask +from paddlespeech.s2t.modules.mask import mask_finished_preds +from paddlespeech.s2t.modules.mask import mask_finished_scores +from paddlespeech.s2t.modules.mask import subsequent_mask +from paddlespeech.s2t.utils import checkpoint +from paddlespeech.s2t.utils import layer_tools +from paddlespeech.s2t.utils.ctc_utils import remove_duplicates_and_blank +from paddlespeech.s2t.utils.log import Log +from paddlespeech.s2t.utils.tensor_utils import add_sos_eos +from paddlespeech.s2t.utils.tensor_utils import pad_sequence +from paddlespeech.s2t.utils.tensor_utils import th_accuracy +from paddlespeech.s2t.utils.utility import log_add +from paddlespeech.s2t.utils.utility import UpdateConfig + +__all__ = ["U2Model", "U2InferModel"] + +logger = Log(__name__).getlog() + + +class U2BaseModel(ASRInterface, nn.Layer): + """CTC-Attention hybrid Encoder-Decoder model""" + + def __init__(self, + vocab_size: int, + encoder: TransformerEncoder, + decoder: TransformerDecoder, + ctc: CTCDecoderBase, + ctc_weight: float=0.5, + ignore_id: int=IGNORE_ID, + lsm_weight: float=0.0, + length_normalized_loss: bool=False, + **kwargs): + assert 0.0 <= ctc_weight <= 1.0, ctc_weight + + nn.Layer.__init__(self) + + # note that eos is the same as sos (equivalent ID) + self.sos = vocab_size - 1 + self.eos = vocab_size - 1 + self.vocab_size = vocab_size + self.ignore_id = ignore_id + self.ctc_weight = ctc_weight + + self.encoder = encoder + self.decoder = decoder + self.ctc = ctc + self.criterion_att = LabelSmoothingLoss( + size=vocab_size, + padding_idx=ignore_id, + smoothing=lsm_weight, + normalize_length=length_normalized_loss, ) + + def forward( + self, + speech: paddle.Tensor, + speech_lengths: paddle.Tensor, + text: paddle.Tensor, + text_lengths: paddle.Tensor, + ) -> Tuple[Optional[paddle.Tensor], Optional[paddle.Tensor], Optional[ + paddle.Tensor]]: + """Frontend + Encoder + Decoder + Calc loss + Args: + speech: (Batch, Length, ...) + speech_lengths: (Batch, ) + text: (Batch, Length) + text_lengths: (Batch,) + Returns: + total_loss, attention_loss, ctc_loss + """ + assert text_lengths.dim() == 1, text_lengths.shape + # Check that batch_size is unified + assert (speech.shape[0] == speech_lengths.shape[0] == text.shape[0] == + text_lengths.shape[0]), (speech.shape, speech_lengths.shape, + text.shape, text_lengths.shape) + # 1. 
Encoder + start = time.time() + encoder_out, encoder_mask = self.encoder(speech, speech_lengths) + encoder_time = time.time() - start + #logger.debug(f"encoder time: {encoder_time}") + #TODO(Hui Zhang): sum not support bool type + #encoder_out_lens = encoder_mask.squeeze(1).sum(1) #[B, 1, T] -> [B] + encoder_out_lens = encoder_mask.squeeze(1).cast(paddle.int64).sum( + 1) #[B, 1, T] -> [B] + + # 2a. Attention-decoder branch + loss_att = None + if self.ctc_weight != 1.0: + start = time.time() + loss_att, acc_att = self._calc_att_loss(encoder_out, encoder_mask, + text, text_lengths) + decoder_time = time.time() - start + #logger.debug(f"decoder time: {decoder_time}") + + # 2b. CTC branch + loss_ctc = None + if self.ctc_weight != 0.0: + start = time.time() + loss_ctc = self.ctc(encoder_out, encoder_out_lens, text, + text_lengths) + ctc_time = time.time() - start + #logger.debug(f"ctc time: {ctc_time}") + + if loss_ctc is None: + loss = loss_att + elif loss_att is None: + loss = loss_ctc + else: + loss = self.ctc_weight * loss_ctc + (1 - self.ctc_weight) * loss_att + return loss, loss_att, loss_ctc + + def _calc_att_loss( + self, + encoder_out: paddle.Tensor, + encoder_mask: paddle.Tensor, + ys_pad: paddle.Tensor, + ys_pad_lens: paddle.Tensor, ) -> Tuple[paddle.Tensor, float]: + """Calc attention loss. + + Args: + encoder_out (paddle.Tensor): [B, Tmax, D] + encoder_mask (paddle.Tensor): [B, 1, Tmax] + ys_pad (paddle.Tensor): [B, Umax] + ys_pad_lens (paddle.Tensor): [B] + + Returns: + Tuple[paddle.Tensor, float]: attention_loss, accuracy rate + """ + ys_in_pad, ys_out_pad = add_sos_eos(ys_pad, self.sos, self.eos, + self.ignore_id) + ys_in_lens = ys_pad_lens + 1 + + # 1. Forward decoder + decoder_out, _ = self.decoder(encoder_out, encoder_mask, ys_in_pad, + ys_in_lens) + + # 2. Compute attention loss + loss_att = self.criterion_att(decoder_out, ys_out_pad) + acc_att = th_accuracy( + decoder_out.view(-1, self.vocab_size), + ys_out_pad, + ignore_label=self.ignore_id, ) + return loss_att, acc_att + + def _forward_encoder( + self, + speech: paddle.Tensor, + speech_lengths: paddle.Tensor, + decoding_chunk_size: int=-1, + num_decoding_left_chunks: int=-1, + simulate_streaming: bool=False, + ) -> Tuple[paddle.Tensor, paddle.Tensor]: + """Encoder pass. + + Args: + speech (paddle.Tensor): [B, Tmax, D] + speech_lengths (paddle.Tensor): [B] + decoding_chunk_size (int, optional): chuck size. Defaults to -1. + num_decoding_left_chunks (int, optional): nums chunks. Defaults to -1. + simulate_streaming (bool, optional): streaming or not. Defaults to False. + + Returns: + Tuple[paddle.Tensor, paddle.Tensor]: + encoder hiddens (B, Tmax, D), + encoder hiddens mask (B, 1, Tmax). + """ + # Let's assume B = batch_size + # 1. 
Encoder + if simulate_streaming and decoding_chunk_size > 0: + encoder_out, encoder_mask = self.encoder.forward_chunk_by_chunk( + speech, + decoding_chunk_size=decoding_chunk_size, + num_decoding_left_chunks=num_decoding_left_chunks + ) # (B, maxlen, encoder_dim) + else: + encoder_out, encoder_mask = self.encoder( + speech, + speech_lengths, + decoding_chunk_size=decoding_chunk_size, + num_decoding_left_chunks=num_decoding_left_chunks + ) # (B, maxlen, encoder_dim) + return encoder_out, encoder_mask + + def recognize( + self, + speech: paddle.Tensor, + speech_lengths: paddle.Tensor, + beam_size: int=10, + decoding_chunk_size: int=-1, + num_decoding_left_chunks: int=-1, + simulate_streaming: bool=False, ) -> paddle.Tensor: + """ Apply beam search on attention decoder + Args: + speech (paddle.Tensor): (batch, max_len, feat_dim) + speech_length (paddle.Tensor): (batch, ) + beam_size (int): beam size for beam search + decoding_chunk_size (int): decoding chunk for dynamic chunk + trained model. + <0: for decoding, use full chunk. + >0: for decoding, use fixed chunk size as set. + 0: used for training, it's prohibited here + simulate_streaming (bool): whether do encoder forward in a + streaming fashion + Returns: + paddle.Tensor: decoding result, (batch, max_result_len) + """ + assert speech.shape[0] == speech_lengths.shape[0] + assert decoding_chunk_size != 0 + device = speech.place + batch_size = speech.shape[0] + + # Let's assume B = batch_size and N = beam_size + # 1. Encoder + encoder_out, encoder_mask = self._forward_encoder( + speech, speech_lengths, decoding_chunk_size, + num_decoding_left_chunks, + simulate_streaming) # (B, maxlen, encoder_dim) + maxlen = encoder_out.shape[1] + encoder_dim = encoder_out.shape[2] + running_size = batch_size * beam_size + encoder_out = encoder_out.unsqueeze(1).repeat(1, beam_size, 1, 1).view( + running_size, maxlen, encoder_dim) # (B*N, maxlen, encoder_dim) + encoder_mask = encoder_mask.unsqueeze(1).repeat( + 1, beam_size, 1, 1).view(running_size, 1, + maxlen) # (B*N, 1, max_len) + + hyps = paddle.ones( + [running_size, 1], dtype=paddle.long).fill_(self.sos) # (B*N, 1) + # log scale score + scores = paddle.to_tensor( + [0.0] + [-float('inf')] * (beam_size - 1), dtype=paddle.float) + scores = scores.to(device).repeat(batch_size).unsqueeze(1).to( + device) # (B*N, 1) + end_flag = paddle.zeros_like(scores, dtype=paddle.bool) # (B*N, 1) + cache: Optional[List[paddle.Tensor]] = None + # 2. Decoder forward step by step + for i in range(1, maxlen + 1): + # Stop if all batch and all beam produce eos + # TODO(Hui Zhang): if end_flag.sum() == running_size: + if end_flag.cast(paddle.int64).sum() == running_size: + break + + # 2.1 Forward decoder step + hyps_mask = subsequent_mask(i).unsqueeze(0).repeat( + running_size, 1, 1).to(device) # (B*N, i, i) + # logp: (B*N, vocab) + logp, cache = self.decoder.forward_one_step( + encoder_out, encoder_mask, hyps, hyps_mask, cache) + + # 2.2 First beam prune: select topk best prob at current time + top_k_logp, top_k_index = logp.topk(beam_size) # (B*N, N) + top_k_logp = mask_finished_scores(top_k_logp, end_flag) + top_k_index = mask_finished_preds(top_k_index, end_flag, self.eos) + + # 2.3 Seconde beam prune: select topk score with history + scores = scores + top_k_logp # (B*N, N), broadcast add + scores = scores.view(batch_size, beam_size * beam_size) # (B, N*N) + scores, offset_k_index = scores.topk(k=beam_size) # (B, N) + scores = scores.view(-1, 1) # (B*N, 1) + + # 2.4. 
Compute base index in top_k_index, + # regard top_k_index as (B*N*N),regard offset_k_index as (B*N), + # then find offset_k_index in top_k_index + base_k_index = paddle.arange(batch_size).view(-1, 1).repeat( + 1, beam_size) # (B, N) + base_k_index = base_k_index * beam_size * beam_size + best_k_index = base_k_index.view(-1) + offset_k_index.view( + -1) # (B*N) + + # 2.5 Update best hyps + best_k_pred = paddle.index_select( + top_k_index.view(-1), index=best_k_index, axis=0) # (B*N) + best_hyps_index = best_k_index // beam_size + last_best_k_hyps = paddle.index_select( + hyps, index=best_hyps_index, axis=0) # (B*N, i) + hyps = paddle.cat( + (last_best_k_hyps, best_k_pred.view(-1, 1)), + dim=1) # (B*N, i+1) + + # 2.6 Update end flag + end_flag = paddle.eq(hyps[:, -1], self.eos).view(-1, 1) + + # 3. Select best of best + scores = scores.view(batch_size, beam_size) + # TODO: length normalization + best_index = paddle.argmax(scores, axis=-1).long() # (B) + best_hyps_index = best_index + paddle.arange( + batch_size, dtype=paddle.long) * beam_size + best_hyps = paddle.index_select(hyps, index=best_hyps_index, axis=0) + best_hyps = best_hyps[:, 1:] + return best_hyps + + def ctc_greedy_search( + self, + speech: paddle.Tensor, + speech_lengths: paddle.Tensor, + decoding_chunk_size: int=-1, + num_decoding_left_chunks: int=-1, + simulate_streaming: bool=False, ) -> List[List[int]]: + """ Apply CTC greedy search + Args: + speech (paddle.Tensor): (batch, max_len, feat_dim) + speech_length (paddle.Tensor): (batch, ) + beam_size (int): beam size for beam search + decoding_chunk_size (int): decoding chunk for dynamic chunk + trained model. + <0: for decoding, use full chunk. + >0: for decoding, use fixed chunk size as set. + 0: used for training, it's prohibited here + simulate_streaming (bool): whether do encoder forward in a + streaming fashion + Returns: + List[List[int]]: best path result + """ + assert speech.shape[0] == speech_lengths.shape[0] + assert decoding_chunk_size != 0 + batch_size = speech.shape[0] + + # Let's assume B = batch_size + # encoder_out: (B, maxlen, encoder_dim) + # encoder_mask: (B, 1, Tmax) + encoder_out, encoder_mask = self._forward_encoder( + speech, speech_lengths, decoding_chunk_size, + num_decoding_left_chunks, simulate_streaming) + maxlen = encoder_out.shape[1] + # (TODO Hui Zhang): bool no support reduce_sum + # encoder_out_lens = encoder_mask.squeeze(1).sum(1) + encoder_out_lens = encoder_mask.squeeze(1).astype(paddle.int).sum(1) + ctc_probs = self.ctc.log_softmax(encoder_out) # (B, maxlen, vocab_size) + + topk_prob, topk_index = ctc_probs.topk(1, axis=2) # (B, maxlen, 1) + topk_index = topk_index.view(batch_size, maxlen) # (B, maxlen) + pad_mask = make_pad_mask(encoder_out_lens) # (B, maxlen) + topk_index = topk_index.masked_fill_(pad_mask, self.eos) # (B, maxlen) + + hyps = [hyp.tolist() for hyp in topk_index] + hyps = [remove_duplicates_and_blank(hyp) for hyp in hyps] + return hyps + + def _ctc_prefix_beam_search( + self, + speech: paddle.Tensor, + speech_lengths: paddle.Tensor, + beam_size: int, + decoding_chunk_size: int=-1, + num_decoding_left_chunks: int=-1, + simulate_streaming: bool=False, + blank_id: int=0, ) -> Tuple[List[Tuple[int, float]], paddle.Tensor]: + """ CTC prefix beam search inner implementation + Args: + speech (paddle.Tensor): (batch, max_len, feat_dim) + speech_length (paddle.Tensor): (batch, ) + beam_size (int): beam size for beam search + decoding_chunk_size (int): decoding chunk for dynamic chunk + trained model. 
+ <0: for decoding, use full chunk. + >0: for decoding, use fixed chunk size as set. + 0: used for training, it's prohibited here + simulate_streaming (bool): whether do encoder forward in a + streaming fashion + Returns: + List[Tuple[int, float]]: nbest results, (N,1), (text, likelihood) + paddle.Tensor: encoder output, (1, max_len, encoder_dim), + it will be used for rescoring in attention rescoring mode + """ + assert speech.shape[0] == speech_lengths.shape[0] + assert decoding_chunk_size != 0 + batch_size = speech.shape[0] + # For CTC prefix beam search, we only support batch_size=1 + assert batch_size == 1 + + # Let's assume B = batch_size and N = beam_size + # 1. Encoder forward and get CTC score + encoder_out, encoder_mask = self._forward_encoder( + speech, speech_lengths, decoding_chunk_size, + num_decoding_left_chunks, + simulate_streaming) # (B, maxlen, encoder_dim) + maxlen = encoder_out.shape[1] + ctc_probs = self.ctc.log_softmax(encoder_out) # (1, maxlen, vocab_size) + ctc_probs = ctc_probs.squeeze(0) + + # cur_hyps: (prefix, (blank_ending_score, none_blank_ending_score)) + # blank_ending_score and none_blank_ending_score in ln domain + cur_hyps = [(tuple(), (0.0, -float('inf')))] + # 2. CTC beam search step by step + for t in range(0, maxlen): + logp = ctc_probs[t] # (vocab_size,) + # key: prefix, value (pb, pnb), default value(-inf, -inf) + next_hyps = defaultdict(lambda: (-float('inf'), -float('inf'))) + # 2.1 First beam prune: select topk best + top_k_logp, top_k_index = logp.topk(beam_size) # (beam_size,) + for s in top_k_index: + s = s.item() + ps = logp[s].item() + for prefix, (pb, pnb) in cur_hyps: + last = prefix[-1] if len(prefix) > 0 else None + if s == blank_id: # blank + n_pb, n_pnb = next_hyps[prefix] + n_pb = log_add([n_pb, pb + ps, pnb + ps]) + next_hyps[prefix] = (n_pb, n_pnb) + elif s == last: + # Update *ss -> *s; + n_pb, n_pnb = next_hyps[prefix] + n_pnb = log_add([n_pnb, pnb + ps]) + next_hyps[prefix] = (n_pb, n_pnb) + # Update *s-s -> *ss, - is for blank + n_prefix = prefix + (s, ) + n_pb, n_pnb = next_hyps[n_prefix] + n_pnb = log_add([n_pnb, pb + ps]) + next_hyps[n_prefix] = (n_pb, n_pnb) + else: + n_prefix = prefix + (s, ) + n_pb, n_pnb = next_hyps[n_prefix] + n_pnb = log_add([n_pnb, pb + ps, pnb + ps]) + next_hyps[n_prefix] = (n_pb, n_pnb) + + # 2.2 Second beam prune + next_hyps = sorted( + next_hyps.items(), + key=lambda x: log_add(list(x[1])), + reverse=True) + cur_hyps = next_hyps[:beam_size] + + hyps = [(y[0], log_add([y[1][0], y[1][1]])) for y in cur_hyps] + return hyps, encoder_out + + def ctc_prefix_beam_search( + self, + speech: paddle.Tensor, + speech_lengths: paddle.Tensor, + beam_size: int, + decoding_chunk_size: int=-1, + num_decoding_left_chunks: int=-1, + simulate_streaming: bool=False, ) -> List[int]: + """ Apply CTC prefix beam search + Args: + speech (paddle.Tensor): (batch, max_len, feat_dim) + speech_length (paddle.Tensor): (batch, ) + beam_size (int): beam size for beam search + decoding_chunk_size (int): decoding chunk for dynamic chunk + trained model. + <0: for decoding, use full chunk. + >0: for decoding, use fixed chunk size as set. 
+ 0: used for training, it's prohibited here + simulate_streaming (bool): whether do encoder forward in a + streaming fashion + Returns: + List[int]: CTC prefix beam search nbest results + """ + hyps, _ = self._ctc_prefix_beam_search( + speech, speech_lengths, beam_size, decoding_chunk_size, + num_decoding_left_chunks, simulate_streaming) + return hyps[0][0] + + def attention_rescoring( + self, + speech: paddle.Tensor, + speech_lengths: paddle.Tensor, + beam_size: int, + decoding_chunk_size: int=-1, + num_decoding_left_chunks: int=-1, + ctc_weight: float=0.0, + simulate_streaming: bool=False, ) -> List[int]: + """ Apply attention rescoring decoding, CTC prefix beam search + is applied first to get nbest, then we resoring the nbest on + attention decoder with corresponding encoder out + Args: + speech (paddle.Tensor): (batch, max_len, feat_dim) + speech_length (paddle.Tensor): (batch, ) + beam_size (int): beam size for beam search + decoding_chunk_size (int): decoding chunk for dynamic chunk + trained model. + <0: for decoding, use full chunk. + >0: for decoding, use fixed chunk size as set. + 0: used for training, it's prohibited here + simulate_streaming (bool): whether do encoder forward in a + streaming fashion + Returns: + List[int]: Attention rescoring result + """ + assert speech.shape[0] == speech_lengths.shape[0] + assert decoding_chunk_size != 0 + device = speech.place + batch_size = speech.shape[0] + # For attention rescoring we only support batch_size=1 + assert batch_size == 1 + + # len(hyps) = beam_size, encoder_out: (1, maxlen, encoder_dim) + hyps, encoder_out = self._ctc_prefix_beam_search( + speech, speech_lengths, beam_size, decoding_chunk_size, + num_decoding_left_chunks, simulate_streaming) + assert len(hyps) == beam_size + + hyp_list = [] + for hyp in hyps: + hyp_content = hyp[0] + # Prevent the hyp is empty + if len(hyp_content) == 0: + hyp_content = (self.ctc.blank_id, ) + hyp_content = paddle.to_tensor( + hyp_content, place=device, dtype=paddle.long) + hyp_list.append(hyp_content) + hyps_pad = pad_sequence(hyp_list, True, self.ignore_id) + hyps_lens = paddle.to_tensor( + [len(hyp[0]) for hyp in hyps], place=device, + dtype=paddle.long) # (beam_size,) + hyps_pad, _ = add_sos_eos(hyps_pad, self.sos, self.eos, self.ignore_id) + hyps_lens = hyps_lens + 1 # Add at begining + + encoder_out = encoder_out.repeat(beam_size, 1, 1) + encoder_mask = paddle.ones( + (beam_size, 1, encoder_out.shape[1]), dtype=paddle.bool) + decoder_out, _ = self.decoder( + encoder_out, encoder_mask, hyps_pad, + hyps_lens) # (beam_size, max_hyps_len, vocab_size) + # ctc score in ln domain + decoder_out = paddle.nn.functional.log_softmax(decoder_out, axis=-1) + decoder_out = decoder_out.numpy() + + # Only use decoder score for rescoring + best_score = -float('inf') + best_index = 0 + # hyps is List[(Text=List[int], Score=float)], len(hyps)=beam_size + for i, hyp in enumerate(hyps): + score = 0.0 + for j, w in enumerate(hyp[0]): + score += decoder_out[i][j][w] + # last decoder output token is `eos`, for laste decoder input token. 
+ score += decoder_out[i][len(hyp[0])][self.eos] + # add ctc score (which in ln domain) + score += hyp[1] * ctc_weight + if score > best_score: + best_score = score + best_index = i + return hyps[best_index][0] + + #@jit.to_static + def subsampling_rate(self) -> int: + """ Export interface for c++ call, return subsampling_rate of the + model + """ + return self.encoder.embed.subsampling_rate + + #@jit.to_static + def right_context(self) -> int: + """ Export interface for c++ call, return right_context of the model + """ + return self.encoder.embed.right_context + + #@jit.to_static + def sos_symbol(self) -> int: + """ Export interface for c++ call, return sos symbol id of the model + """ + return self.sos + + #@jit.to_static + def eos_symbol(self) -> int: + """ Export interface for c++ call, return eos symbol id of the model + """ + return self.eos + + @jit.to_static + def forward_encoder_chunk( + self, + xs: paddle.Tensor, + offset: int, + required_cache_size: int, + subsampling_cache: Optional[paddle.Tensor]=None, + elayers_output_cache: Optional[List[paddle.Tensor]]=None, + conformer_cnn_cache: Optional[List[paddle.Tensor]]=None, + ) -> Tuple[paddle.Tensor, paddle.Tensor, List[paddle.Tensor], List[ + paddle.Tensor]]: + """ Export interface for c++ call, give input chunk xs, and return + output from time 0 to current chunk. + Args: + xs (paddle.Tensor): chunk input + subsampling_cache (Optional[paddle.Tensor]): subsampling cache + elayers_output_cache (Optional[List[paddle.Tensor]]): + transformer/conformer encoder layers output cache + conformer_cnn_cache (Optional[List[paddle.Tensor]]): conformer + cnn cache + Returns: + paddle.Tensor: output, it ranges from time 0 to current chunk. + paddle.Tensor: subsampling cache + List[paddle.Tensor]: attention cache + List[paddle.Tensor]: conformer cnn cache + """ + return self.encoder.forward_chunk( + xs, offset, required_cache_size, subsampling_cache, + elayers_output_cache, conformer_cnn_cache) + + # @jit.to_static + def ctc_activation(self, xs: paddle.Tensor) -> paddle.Tensor: + """ Export interface for c++ call, apply linear transform and log + softmax before ctc + Args: + xs (paddle.Tensor): encoder output, (B, T, D) + Returns: + paddle.Tensor: activation before ctc + """ + return self.ctc.log_softmax(xs) + + @jit.to_static + def forward_attention_decoder( + self, + hyps: paddle.Tensor, + hyps_lens: paddle.Tensor, + encoder_out: paddle.Tensor, ) -> paddle.Tensor: + """ Export interface for c++ call, forward decoder with multiple + hypothesis from ctc prefix beam search and one encoder output + Args: + hyps (paddle.Tensor): hyps from ctc prefix beam search, already + pad sos at the begining, (B, T) + hyps_lens (paddle.Tensor): length of each hyp in hyps, (B) + encoder_out (paddle.Tensor): corresponding encoder output, (B=1, T, D) + Returns: + paddle.Tensor: decoder output, (B, L) + """ + assert encoder_out.shape[0] == 1 + num_hyps = hyps.shape[0] + assert hyps_lens.shape[0] == num_hyps + encoder_out = encoder_out.repeat(num_hyps, 1, 1) + # (B, 1, T) + encoder_mask = paddle.ones( + [num_hyps, 1, encoder_out.shape[1]], dtype=paddle.bool) + # (num_hyps, max_hyps_len, vocab_size) + decoder_out, _ = self.decoder(encoder_out, encoder_mask, hyps, + hyps_lens) + decoder_out = paddle.nn.functional.log_softmax(decoder_out, axis=-1) + return decoder_out + + @paddle.no_grad() + def decode(self, + feats: paddle.Tensor, + feats_lengths: paddle.Tensor, + text_feature: Dict[str, int], + decoding_method: str, + beam_size: int, + ctc_weight: float=0.0, + 
decoding_chunk_size: int=-1, + num_decoding_left_chunks: int=-1, + simulate_streaming: bool=False): + """u2 decoding. + + Args: + feats (Tensor): audio features, (B, T, D) + feats_lengths (Tensor): (B) + text_feature (TextFeaturizer): text feature object. + decoding_method (str): decoding mode, e.g. + 'attention', 'ctc_greedy_search', + 'ctc_prefix_beam_search', 'attention_rescoring' + beam_size (int): beam size for search + ctc_weight (float, optional): ctc weight for attention rescoring decode mode. Defaults to 0.0. + decoding_chunk_size (int, optional): decoding chunk size. Defaults to -1. + <0: for decoding, use full chunk. + >0: for decoding, use fixed chunk size as set. + 0: used for training, it's prohibited here. + num_decoding_left_chunks (int, optional): + number of left chunks for decoding. Defaults to -1. + simulate_streaming (bool, optional): simulate streaming inference. Defaults to False. + + Raises: + ValueError: when not support decoding_method. + + Returns: + List[List[int]]: transcripts. + """ + batch_size = feats.shape[0] + if decoding_method in ['ctc_prefix_beam_search', + 'attention_rescoring'] and batch_size > 1: + logger.fatal( + f'decoding mode {decoding_method} must be running with batch_size == 1' + ) + sys.exit(1) + + if decoding_method == 'attention': + hyps = self.recognize( + feats, + feats_lengths, + beam_size=beam_size, + decoding_chunk_size=decoding_chunk_size, + num_decoding_left_chunks=num_decoding_left_chunks, + simulate_streaming=simulate_streaming) + hyps = [hyp.tolist() for hyp in hyps] + elif decoding_method == 'ctc_greedy_search': + hyps = self.ctc_greedy_search( + feats, + feats_lengths, + decoding_chunk_size=decoding_chunk_size, + num_decoding_left_chunks=num_decoding_left_chunks, + simulate_streaming=simulate_streaming) + # ctc_prefix_beam_search and attention_rescoring only return one + # result in List[int], change it to List[List[int]] for compatible + # with other batch decoding mode + elif decoding_method == 'ctc_prefix_beam_search': + assert feats.shape[0] == 1 + hyp = self.ctc_prefix_beam_search( + feats, + feats_lengths, + beam_size, + decoding_chunk_size=decoding_chunk_size, + num_decoding_left_chunks=num_decoding_left_chunks, + simulate_streaming=simulate_streaming) + hyps = [hyp] + elif decoding_method == 'attention_rescoring': + assert feats.shape[0] == 1 + hyp = self.attention_rescoring( + feats, + feats_lengths, + beam_size, + decoding_chunk_size=decoding_chunk_size, + num_decoding_left_chunks=num_decoding_left_chunks, + ctc_weight=ctc_weight, + simulate_streaming=simulate_streaming) + hyps = [hyp] + else: + raise ValueError(f"Not support decoding method: {decoding_method}") + + res = [text_feature.defeaturize(hyp) for hyp in hyps] + res_tokenids = [hyp for hyp in hyps] + return res, res_tokenids + + +class U2DecodeModel(U2BaseModel): + def scorers(self): + """Scorers.""" + return dict( + decoder=self.decoder, ctc=CTCPrefixScorer(self.ctc, self.eos)) + + def encode(self, x): + """Encode acoustic features. 
+ + :param ndarray x: source acoustic feature (T, D) + :return: encoder outputs + :rtype: paddle.Tensor + """ + self.eval() + x = paddle.to_tensor(x).unsqueeze(0) + ilen = x.size(1) + enc_output, _ = self._forward_encoder(x, ilen) + return enc_output.squeeze(0) + + +class U2Model(U2DecodeModel): + def __init__(self, configs: dict): + model_conf = configs.get('model_conf', dict()) + init_type = model_conf.get("init_type", None) + with DefaultInitializerContext(init_type): + vocab_size, encoder, decoder, ctc = U2Model._init_from_config( + configs) + + super().__init__( + vocab_size=vocab_size, + encoder=encoder, + decoder=decoder, + ctc=ctc, + **model_conf) + + @classmethod + def _init_from_config(cls, configs: dict): + """init sub module for model. + + Args: + configs (dict): config dict. + + Raises: + ValueError: raise when using not support encoder type. + + Returns: + int, nn.Layer, nn.Layer, nn.Layer: vocab size, encoder, decoder, ctc + """ + # cmvn + if 'cmvn_file' in configs and configs['cmvn_file']: + mean, istd = load_cmvn(configs['cmvn_file'], + configs['cmvn_file_type']) + global_cmvn = GlobalCMVN( + paddle.to_tensor(mean, dtype=paddle.float), + paddle.to_tensor(istd, dtype=paddle.float)) + else: + global_cmvn = None + + # input & output dim + input_dim = configs['input_dim'] + vocab_size = configs['output_dim'] + assert input_dim != 0, input_dim + assert vocab_size != 0, vocab_size + + # encoder + encoder_type = configs.get('encoder', 'transformer') + logger.info(f"U2 Encoder type: {encoder_type}") + if encoder_type == 'transformer': + encoder = TransformerEncoder( + input_dim, global_cmvn=global_cmvn, **configs['encoder_conf']) + elif encoder_type == 'conformer': + encoder = ConformerEncoder( + input_dim, global_cmvn=global_cmvn, **configs['encoder_conf']) + else: + raise ValueError(f"not support encoder type:{encoder_type}") + + # decoder + decoder = TransformerDecoder(vocab_size, + encoder.output_size(), + **configs['decoder_conf']) + + # ctc decoder and ctc loss + model_conf = configs.get('model_conf', dict()) + dropout_rate = model_conf.get('ctc_dropout_rate', 0.0) + grad_norm_type = model_conf.get('ctc_grad_norm_type', None) + ctc = CTCDecoderBase( + odim=vocab_size, + enc_n_units=encoder.output_size(), + blank_id=0, + dropout_rate=dropout_rate, + reduction=True, # sum + batch_average=True, # sum / batch_size + grad_norm_type=grad_norm_type) + + return vocab_size, encoder, decoder, ctc + + @classmethod + def from_config(cls, configs: dict): + """init model. + + Args: + configs (dict): config dict. + + Raises: + ValueError: raise when using not support encoder type. + + Returns: + nn.Layer: U2Model + """ + model = cls(configs) + return model + + @classmethod + def from_pretrained(cls, dataloader, config, checkpoint_path): + """Build a DeepSpeech2Model model from a pretrained model. + + Args: + dataloader (paddle.io.DataLoader): not used. + config (yacs.config.CfgNode): model configs + checkpoint_path (Path or str): the path of pretrained model checkpoint, without extension name + + Returns: + DeepSpeech2Model: The model built from pretrained result. 
+ """ + with UpdateConfig(config): + config.input_dim = dataloader.feat_dim + config.output_dim = dataloader.vocab_size + + model = cls.from_config(config) + + if checkpoint_path: + infos = checkpoint.Checkpoint().load_parameters( + model, checkpoint_path=checkpoint_path) + logger.info(f"checkpoint info: {infos}") + layer_tools.summary(model) + return model + + +class U2InferModel(U2Model): + def __init__(self, configs: dict): + super().__init__(configs) + + def forward(self, + feats, + feats_lengths, + decoding_chunk_size=-1, + num_decoding_left_chunks=-1, + simulate_streaming=False): + """export model function + + Args: + feats (Tensor): [B, T, D] + feats_lengths (Tensor): [B] + + Returns: + List[List[int]]: best path result + """ + return self.ctc_greedy_search( + feats, + feats_lengths, + decoding_chunk_size=decoding_chunk_size, + num_decoding_left_chunks=num_decoding_left_chunks, + simulate_streaming=simulate_streaming) diff --git a/ernie-sat/paddlespeech/s2t/models/u2/updater.py b/ernie-sat/paddlespeech/s2t/models/u2/updater.py new file mode 100644 index 0000000000000000000000000000000000000000..c59090a84ee4d416353eff3d6049ff3451cf0dae --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/models/u2/updater.py @@ -0,0 +1,150 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# Modified from wenet(https://github.com/wenet-e2e/wenet) +from contextlib import nullcontext + +import paddle +from paddle import distributed as dist + +from paddlespeech.s2t.training.extensions.evaluator import StandardEvaluator +from paddlespeech.s2t.training.reporter import report +from paddlespeech.s2t.training.timer import Timer +from paddlespeech.s2t.training.updaters.standard_updater import StandardUpdater +from paddlespeech.s2t.utils import layer_tools +from paddlespeech.s2t.utils.log import Log + +logger = Log(__name__).getlog() + + +class U2Evaluator(StandardEvaluator): + def __init__(self, model, dataloader): + super().__init__(model, dataloader) + self.msg = "" + self.num_seen_utts = 0 + self.total_loss = 0.0 + + def evaluate_core(self, batch): + self.msg = "Valid: Rank: {}, ".format(dist.get_rank()) + losses_dict = {} + + loss, attention_loss, ctc_loss = self.model(*batch[1:]) + if paddle.isfinite(loss): + num_utts = batch[1].shape[0] + self.num_seen_utts += num_utts + self.total_loss += float(loss) * num_utts + + losses_dict['loss'] = float(loss) + if attention_loss: + losses_dict['att_loss'] = float(attention_loss) + if ctc_loss: + losses_dict['ctc_loss'] = float(ctc_loss) + + for k, v in losses_dict.items(): + report("eval/" + k, v) + + self.msg += ', '.join('{}: {:>.6f}'.format(k, v) + for k, v in losses_dict.items()) + logger.info(self.msg) + return self.total_loss, self.num_seen_utts + + +class U2Updater(StandardUpdater): + def __init__(self, + model, + optimizer, + scheduler, + dataloader, + init_state=None, + accum_grad=1, + **kwargs): + super().__init__( + model, optimizer, scheduler, dataloader, init_state=init_state) + self.accum_grad = accum_grad + self.forward_count = 0 + self.msg = "" + + def update_core(self, batch): + """One Step + + Args: + batch (List[Object]): utts, xs, xlens, ys, ylens + """ + losses_dict = {} + self.msg = "Rank: {}, ".format(dist.get_rank()) + + # forward + batch_size = batch[1].shape[0] + loss, attention_loss, ctc_loss = self.model(*batch[1:]) + # loss div by `batch_size * accum_grad` + loss /= self.accum_grad + + # loss backward + if (self.forward_count + 1) != self.accum_grad: + # Disable gradient synchronizations across DDP processes. + # Within this context, gradients will be accumulated on module + # variables, which will later be synchronized. + context = self.model.no_sync + else: + # Used for single gpu training and DDP gradient synchronization + # processes. 
+ context = nullcontext + + with context(): + loss.backward() + layer_tools.print_grads(self.model, print_func=None) + + # loss info + losses_dict['loss'] = float(loss) * self.accum_grad + if attention_loss: + losses_dict['att_loss'] = float(attention_loss) + if ctc_loss: + losses_dict['ctc_loss'] = float(ctc_loss) + # report loss + for k, v in losses_dict.items(): + report("train/" + k, v) + # loss msg + self.msg += "batch size: {}, ".format(batch_size) + self.msg += "accum: {}, ".format(self.accum_grad) + self.msg += ', '.join('{}: {:>.6f}'.format(k, v) + for k, v in losses_dict.items()) + + # Truncate the graph + loss.detach() + + # update parameters + self.forward_count += 1 + if self.forward_count != self.accum_grad: + return + self.forward_count = 0 + + self.optimizer.step() + self.optimizer.clear_grad() + self.scheduler.step() + + def update(self): + # model is default in train mode + + # training for a step is implemented here + with Timer("data time cost:{}"): + batch = self.read_batch() + with Timer("step time cost:{}"): + self.update_core(batch) + + # #iterations with accum_grad > 1 + # Ref.: https://github.com/espnet/espnet/issues/777 + if self.forward_count == 0: + self.state.iteration += 1 + if self.updates_per_epoch is not None: + if self.state.iteration % self.updates_per_epoch == 0: + self.state.epoch += 1 diff --git a/ernie-sat/paddlespeech/s2t/models/u2_st/__init__.py b/ernie-sat/paddlespeech/s2t/models/u2_st/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..6b10b0834833ada1a56de61b0d4a72e4269efec5 --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/models/u2_st/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from .u2_st import U2STInferModel +from .u2_st import U2STModel diff --git a/ernie-sat/paddlespeech/s2t/models/u2_st/u2_st.py b/ernie-sat/paddlespeech/s2t/models/u2_st/u2_st.py new file mode 100644 index 0000000000000000000000000000000000000000..6447753c50f0f27bbfc3ed87495ec8cd42d79c59 --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/models/u2_st/u2_st.py @@ -0,0 +1,676 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
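The `u2_st` model file follows. Before it, a note on the pattern implemented by `U2Updater` above: the loss is divided by `accum_grad`, `backward()` runs on every batch so gradients accumulate, and the optimizer and scheduler only step once every `accum_grad` forward passes (with DDP gradient sync suppressed on the non-stepping iterations). A minimal single-process sketch of that pattern, with hypothetical argument names and without the DDP `no_sync` handling:

```python
def train_with_accum(model, optimizer, scheduler, dataloader, accum_grad=4):
    """Gradient accumulation as in U2Updater above, minus the DDP no_sync path."""
    forward_count = 0
    for utts, xs, xlens, ys, ylens in dataloader:
        loss, att_loss, ctc_loss = model(xs, xlens, ys, ylens)
        # Scale so the accumulated gradients match one batch that is
        # accum_grad times larger.
        (loss / accum_grad).backward()

        forward_count += 1
        if forward_count % accum_grad != 0:
            continue  # keep accumulating into the parameter gradients

        optimizer.step()
        optimizer.clear_grad()
        scheduler.step()
```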
+# Modified from wenet(https://github.com/wenet-e2e/wenet) +"""U2 ASR Model +Unified Streaming and Non-streaming Two-pass End-to-end Model for Speech Recognition +(https://arxiv.org/pdf/2012.05481.pdf) +""" +import time +from typing import Dict +from typing import List +from typing import Optional +from typing import Tuple + +import paddle +from paddle import jit +from paddle import nn + +from paddlespeech.s2t.frontend.utility import IGNORE_ID +from paddlespeech.s2t.frontend.utility import load_cmvn +from paddlespeech.s2t.modules.cmvn import GlobalCMVN +from paddlespeech.s2t.modules.ctc import CTCDecoderBase +from paddlespeech.s2t.modules.decoder import TransformerDecoder +from paddlespeech.s2t.modules.encoder import ConformerEncoder +from paddlespeech.s2t.modules.encoder import TransformerEncoder +from paddlespeech.s2t.modules.loss import LabelSmoothingLoss +from paddlespeech.s2t.modules.mask import subsequent_mask +from paddlespeech.s2t.utils import checkpoint +from paddlespeech.s2t.utils import layer_tools +from paddlespeech.s2t.utils.log import Log +from paddlespeech.s2t.utils.tensor_utils import add_sos_eos +from paddlespeech.s2t.utils.tensor_utils import th_accuracy +from paddlespeech.s2t.utils.utility import UpdateConfig + +__all__ = ["U2STModel", "U2STInferModel"] + +logger = Log(__name__).getlog() + + +class U2STBaseModel(nn.Layer): + """CTC-Attention hybrid Encoder-Decoder model""" + + def __init__(self, + vocab_size: int, + encoder: TransformerEncoder, + st_decoder: TransformerDecoder, + decoder: TransformerDecoder=None, + ctc: CTCDecoderBase=None, + ctc_weight: float=0.0, + asr_weight: float=0.0, + ignore_id: int=IGNORE_ID, + lsm_weight: float=0.0, + length_normalized_loss: bool=False, + **kwargs): + assert 0.0 <= ctc_weight <= 1.0, ctc_weight + + super().__init__() + # note that eos is the same as sos (equivalent ID) + self.sos = vocab_size - 1 + self.eos = vocab_size - 1 + self.vocab_size = vocab_size + self.ignore_id = ignore_id + self.ctc_weight = ctc_weight + self.asr_weight = asr_weight + + self.encoder = encoder + self.st_decoder = st_decoder + self.decoder = decoder + self.ctc = ctc + self.criterion_att = LabelSmoothingLoss( + size=vocab_size, + padding_idx=ignore_id, + smoothing=lsm_weight, + normalize_length=length_normalized_loss, ) + + def forward( + self, + speech: paddle.Tensor, + speech_lengths: paddle.Tensor, + text: paddle.Tensor, + text_lengths: paddle.Tensor, + asr_text: paddle.Tensor=None, + asr_text_lengths: paddle.Tensor=None, + ) -> Tuple[Optional[paddle.Tensor], Optional[paddle.Tensor], Optional[ + paddle.Tensor]]: + """Frontend + Encoder + Decoder + Calc loss + Args: + speech: (Batch, Length, ...) + speech_lengths: (Batch, ) + text: (Batch, Length) + text_lengths: (Batch,) + Returns: + total_loss, attention_loss, ctc_loss + """ + assert text_lengths.dim() == 1, text_lengths.shape + # Check that batch_size is unified + assert (speech.shape[0] == speech_lengths.shape[0] == text.shape[0] == + text_lengths.shape[0]), (speech.shape, speech_lengths.shape, + text.shape, text_lengths.shape) + # 1. Encoder + start = time.time() + encoder_out, encoder_mask = self.encoder(speech, speech_lengths) + encoder_time = time.time() - start + #logger.debug(f"encoder time: {encoder_time}") + #TODO(Hui Zhang): sum not support bool type + #encoder_out_lens = encoder_mask.squeeze(1).sum(1) #[B, 1, T] -> [B] + encoder_out_lens = encoder_mask.squeeze(1).cast(paddle.int64).sum( + 1) #[B, 1, T] -> [B] + + # 2a. 
ST-decoder branch + start = time.time() + loss_st, acc_st = self._calc_st_loss(encoder_out, encoder_mask, text, + text_lengths) + decoder_time = time.time() - start + + loss_asr_att = None + loss_asr_ctc = None + # 2b. ASR Attention-decoder branch + if self.asr_weight > 0.: + if self.ctc_weight != 1.0: + start = time.time() + loss_asr_att, acc_att = self._calc_att_loss( + encoder_out, encoder_mask, asr_text, asr_text_lengths) + decoder_time = time.time() - start + + # 2c. CTC branch + if self.ctc_weight != 0.0: + start = time.time() + loss_asr_ctc = self.ctc(encoder_out, encoder_out_lens, asr_text, + asr_text_lengths) + ctc_time = time.time() - start + + if loss_asr_ctc is None: + loss_asr = loss_asr_att + elif loss_asr_att is None: + loss_asr = loss_asr_ctc + else: + loss_asr = self.ctc_weight * loss_asr_ctc + (1 - self.ctc_weight + ) * loss_asr_att + loss = self.asr_weight * loss_asr + (1 - self.asr_weight) * loss_st + else: + loss = loss_st + return loss, loss_st, loss_asr_att, loss_asr_ctc + + def _calc_st_loss( + self, + encoder_out: paddle.Tensor, + encoder_mask: paddle.Tensor, + ys_pad: paddle.Tensor, + ys_pad_lens: paddle.Tensor, ) -> Tuple[paddle.Tensor, float]: + """Calc attention loss. + + Args: + encoder_out (paddle.Tensor): [B, Tmax, D] + encoder_mask (paddle.Tensor): [B, 1, Tmax] + ys_pad (paddle.Tensor): [B, Umax] + ys_pad_lens (paddle.Tensor): [B] + + Returns: + Tuple[paddle.Tensor, float]: attention_loss, accuracy rate + """ + ys_in_pad, ys_out_pad = add_sos_eos(ys_pad, self.sos, self.eos, + self.ignore_id) + ys_in_lens = ys_pad_lens + 1 + + # 1. Forward decoder + decoder_out, _ = self.st_decoder(encoder_out, encoder_mask, ys_in_pad, + ys_in_lens) + + # 2. Compute attention loss + loss_att = self.criterion_att(decoder_out, ys_out_pad) + acc_att = th_accuracy( + decoder_out.view(-1, self.vocab_size), + ys_out_pad, + ignore_label=self.ignore_id, ) + return loss_att, acc_att + + def _calc_att_loss( + self, + encoder_out: paddle.Tensor, + encoder_mask: paddle.Tensor, + ys_pad: paddle.Tensor, + ys_pad_lens: paddle.Tensor, ) -> Tuple[paddle.Tensor, float]: + """Calc attention loss. + + Args: + encoder_out (paddle.Tensor): [B, Tmax, D] + encoder_mask (paddle.Tensor): [B, 1, Tmax] + ys_pad (paddle.Tensor): [B, Umax] + ys_pad_lens (paddle.Tensor): [B] + + Returns: + Tuple[paddle.Tensor, float]: attention_loss, accuracy rate + """ + ys_in_pad, ys_out_pad = add_sos_eos(ys_pad, self.sos, self.eos, + self.ignore_id) + ys_in_lens = ys_pad_lens + 1 + + # 1. Forward decoder + decoder_out, _ = self.decoder(encoder_out, encoder_mask, ys_in_pad, + ys_in_lens) + + # 2. Compute attention loss + loss_att = self.criterion_att(decoder_out, ys_out_pad) + acc_att = th_accuracy( + decoder_out.view(-1, self.vocab_size), + ys_out_pad, + ignore_label=self.ignore_id, ) + return loss_att, acc_att + + def _forward_encoder( + self, + speech: paddle.Tensor, + speech_lengths: paddle.Tensor, + decoding_chunk_size: int=-1, + num_decoding_left_chunks: int=-1, + simulate_streaming: bool=False, + ) -> Tuple[paddle.Tensor, paddle.Tensor]: + """Encoder pass. + + Args: + speech (paddle.Tensor): [B, Tmax, D] + speech_lengths (paddle.Tensor): [B] + decoding_chunk_size (int, optional): chuck size. Defaults to -1. + num_decoding_left_chunks (int, optional): nums chunks. Defaults to -1. + simulate_streaming (bool, optional): streaming or not. Defaults to False. + + Returns: + Tuple[paddle.Tensor, paddle.Tensor]: + encoder hiddens (B, Tmax, D), + encoder hiddens mask (B, 1, Tmax). 
+ """ + # Let's assume B = batch_size + # 1. Encoder + if simulate_streaming and decoding_chunk_size > 0: + encoder_out, encoder_mask = self.encoder.forward_chunk_by_chunk( + speech, + decoding_chunk_size=decoding_chunk_size, + num_decoding_left_chunks=num_decoding_left_chunks + ) # (B, maxlen, encoder_dim) + else: + encoder_out, encoder_mask = self.encoder( + speech, + speech_lengths, + decoding_chunk_size=decoding_chunk_size, + num_decoding_left_chunks=num_decoding_left_chunks + ) # (B, maxlen, encoder_dim) + return encoder_out, encoder_mask + + def translate( + self, + speech: paddle.Tensor, + speech_lengths: paddle.Tensor, + beam_size: int=10, + word_reward: float=0.0, + maxlenratio: float=0.5, + decoding_chunk_size: int=-1, + num_decoding_left_chunks: int=-1, + simulate_streaming: bool=False, ) -> paddle.Tensor: + """ Apply beam search on attention decoder with length penalty + Args: + speech (paddle.Tensor): (batch, max_len, feat_dim) + speech_length (paddle.Tensor): (batch, ) + beam_size (int): beam size for beam search + word_reward (float): word reward used in beam search + maxlenratio (float): max length ratio to bound the length of translated text + decoding_chunk_size (int): decoding chunk for dynamic chunk + trained model. + <0: for decoding, use full chunk. + >0: for decoding, use fixed chunk size as set. + 0: used for training, it's prohibited here + simulate_streaming (bool): whether do encoder forward in a + streaming fashion + Returns: + paddle.Tensor: decoding result, (batch, max_result_len) + """ + assert speech.shape[0] == speech_lengths.shape[0] + assert decoding_chunk_size != 0 + assert speech.shape[0] == 1 + device = speech.place + + # Let's assume B = batch_size and N = beam_size + # 1. Encoder and init hypothesis + encoder_out, encoder_mask = self._forward_encoder( + speech, speech_lengths, decoding_chunk_size, + num_decoding_left_chunks, + simulate_streaming) # (B, maxlen, encoder_dim) + + maxlen = max(int(encoder_out.shape[1] * maxlenratio), 5) + + hyp = {"score": 0.0, "yseq": [self.sos], "cache": None} + hyps = [hyp] + ended_hyps = [] + cur_best_score = -float("inf") + cache = None + + # 2. 
Decoder forward step by step + for i in range(1, maxlen + 1): + ys = paddle.ones((len(hyps), i), dtype=paddle.long) + + if hyps[0]["cache"] is not None: + cache = [ + paddle.ones( + (len(hyps), i - 1, hyp_cache.shape[-1]), + dtype=paddle.float32) for hyp_cache in hyps[0]["cache"] + ] + for j, hyp in enumerate(hyps): + ys[j, :] = paddle.to_tensor(hyp["yseq"]) + if hyps[0]["cache"] is not None: + for k in range(len(cache)): + cache[k][j] = hyps[j]["cache"][k] + ys_mask = subsequent_mask(i).unsqueeze(0).to(device) + + logp, cache = self.st_decoder.forward_one_step( + encoder_out.repeat(len(hyps), 1, 1), + encoder_mask.repeat(len(hyps), 1, 1), ys, ys_mask, cache) + + hyps_best_kept = [] + for j, hyp in enumerate(hyps): + top_k_logp, top_k_index = logp[j:j + 1].topk(beam_size) + + for b in range(beam_size): + new_hyp = {} + new_hyp["score"] = hyp["score"] + float(top_k_logp[0, b]) + new_hyp["yseq"] = [0] * (1 + len(hyp["yseq"])) + new_hyp["yseq"][:len(hyp["yseq"])] = hyp["yseq"] + new_hyp["yseq"][len(hyp["yseq"])] = int(top_k_index[0, b]) + new_hyp["cache"] = [cache_[j] for cache_ in cache] + # will be (2 x beam) hyps at most + hyps_best_kept.append(new_hyp) + + hyps_best_kept = sorted( + hyps_best_kept, key=lambda x: -x["score"])[:beam_size] + + # sort and get nbest + hyps = hyps_best_kept + if i == maxlen: + for hyp in hyps: + hyp["yseq"].append(self.eos) + + # finalize the ended hypotheses with word reward (by length) + remained_hyps = [] + for hyp in hyps: + if hyp["yseq"][-1] == self.eos: + hyp["score"] += (i - 1) * word_reward + cur_best_score = max(cur_best_score, hyp["score"]) + ended_hyps.append(hyp) + else: + # stop while guarantee the optimality + if hyp["score"] + maxlen * word_reward > cur_best_score: + remained_hyps.append(hyp) + + # stop predition when there is no unended hypothesis + if not remained_hyps: + break + hyps = remained_hyps + + # 3. Select best of best + best_hyp = max(ended_hyps, key=lambda x: x["score"]) + + return paddle.to_tensor([best_hyp["yseq"][1:]]) + + # @jit.to_static + def subsampling_rate(self) -> int: + """ Export interface for c++ call, return subsampling_rate of the + model + """ + return self.encoder.embed.subsampling_rate + + # @jit.to_static + def right_context(self) -> int: + """ Export interface for c++ call, return right_context of the model + """ + return self.encoder.embed.right_context + + # @jit.to_static + def sos_symbol(self) -> int: + """ Export interface for c++ call, return sos symbol id of the model + """ + return self.sos + + # @jit.to_static + def eos_symbol(self) -> int: + """ Export interface for c++ call, return eos symbol id of the model + """ + return self.eos + + @jit.to_static + def forward_encoder_chunk( + self, + xs: paddle.Tensor, + offset: int, + required_cache_size: int, + subsampling_cache: Optional[paddle.Tensor]=None, + elayers_output_cache: Optional[List[paddle.Tensor]]=None, + conformer_cnn_cache: Optional[List[paddle.Tensor]]=None, + ) -> Tuple[paddle.Tensor, paddle.Tensor, List[paddle.Tensor], List[ + paddle.Tensor]]: + """ Export interface for c++ call, give input chunk xs, and return + output from time 0 to current chunk. + Args: + xs (paddle.Tensor): chunk input + subsampling_cache (Optional[paddle.Tensor]): subsampling cache + elayers_output_cache (Optional[List[paddle.Tensor]]): + transformer/conformer encoder layers output cache + conformer_cnn_cache (Optional[List[paddle.Tensor]]): conformer + cnn cache + Returns: + paddle.Tensor: output, it ranges from time 0 to current chunk. 
+ paddle.Tensor: subsampling cache + List[paddle.Tensor]: attention cache + List[paddle.Tensor]: conformer cnn cache + """ + return self.encoder.forward_chunk( + xs, offset, required_cache_size, subsampling_cache, + elayers_output_cache, conformer_cnn_cache) + + # @jit.to_static + def ctc_activation(self, xs: paddle.Tensor) -> paddle.Tensor: + """ Export interface for c++ call, apply linear transform and log + softmax before ctc + Args: + xs (paddle.Tensor): encoder output + Returns: + paddle.Tensor: activation before ctc + """ + return self.ctc.log_softmax(xs) + + @jit.to_static + def forward_attention_decoder( + self, + hyps: paddle.Tensor, + hyps_lens: paddle.Tensor, + encoder_out: paddle.Tensor, ) -> paddle.Tensor: + """ Export interface for c++ call, forward decoder with multiple + hypothesis from ctc prefix beam search and one encoder output + Args: + hyps (paddle.Tensor): hyps from ctc prefix beam search, already + pad sos at the begining, (B, T) + hyps_lens (paddle.Tensor): length of each hyp in hyps, (B) + encoder_out (paddle.Tensor): corresponding encoder output, (B=1, T, D) + Returns: + paddle.Tensor: decoder output, (B, L) + """ + assert encoder_out.shape[0] == 1 + num_hyps = hyps.shape[0] + assert hyps_lens.shape[0] == num_hyps + encoder_out = encoder_out.repeat(num_hyps, 1, 1) + # (B, 1, T) + encoder_mask = paddle.ones( + [num_hyps, 1, encoder_out.shape[1]], dtype=paddle.bool) + # (num_hyps, max_hyps_len, vocab_size) + decoder_out, _ = self.decoder(encoder_out, encoder_mask, hyps, + hyps_lens) + decoder_out = paddle.nn.functional.log_softmax(decoder_out, dim=-1) + return decoder_out + + @paddle.no_grad() + def decode(self, + feats: paddle.Tensor, + feats_lengths: paddle.Tensor, + text_feature: Dict[str, int], + decoding_method: str, + beam_size: int, + word_reward: float=0.0, + maxlenratio: float=0.5, + decoding_chunk_size: int=-1, + num_decoding_left_chunks: int=-1, + simulate_streaming: bool=False): + """u2 decoding. + + Args: + feats (Tensor): audio features, (B, T, D) + feats_lengths (Tensor): (B) + text_feature (TextFeaturizer): text feature object. + decoding_method (str): decoding mode, e.g. + 'fullsentence', + 'simultaneous' + beam_size (int): beam size for search + decoding_chunk_size (int, optional): decoding chunk size. Defaults to -1. + <0: for decoding, use full chunk. + >0: for decoding, use fixed chunk size as set. + 0: used for training, it's prohibited here. + num_decoding_left_chunks (int, optional): + number of left chunks for decoding. Defaults to -1. + simulate_streaming (bool, optional): simulate streaming inference. Defaults to False. + + Raises: + ValueError: when not support decoding_method. + + Returns: + List[List[int]]: transcripts. 
+ """ + batch_size = feats.shape[0] + + if decoding_method == 'fullsentence': + hyps = self.translate( + feats, + feats_lengths, + beam_size=beam_size, + word_reward=word_reward, + maxlenratio=maxlenratio, + decoding_chunk_size=decoding_chunk_size, + num_decoding_left_chunks=num_decoding_left_chunks, + simulate_streaming=simulate_streaming) + hyps = [hyp.tolist() for hyp in hyps] + else: + raise ValueError(f"Not support decoding method: {decoding_method}") + + res = [text_feature.defeaturize(hyp) for hyp in hyps] + return res + + +class U2STModel(U2STBaseModel): + def __init__(self, configs: dict): + vocab_size, encoder, decoder = U2STModel._init_from_config(configs) + + if isinstance(decoder, Tuple): + st_decoder, asr_decoder, ctc = decoder + super().__init__( + vocab_size=vocab_size, + encoder=encoder, + st_decoder=st_decoder, + decoder=asr_decoder, + ctc=ctc, + **configs['model_conf']) + else: + super().__init__( + vocab_size=vocab_size, + encoder=encoder, + st_decoder=decoder, + **configs['model_conf']) + + @classmethod + def _init_from_config(cls, configs: dict): + """init sub module for model. + + Args: + configs (dict): config dict. + + Raises: + ValueError: raise when using not support encoder type. + + Returns: + int, nn.Layer, nn.Layer, nn.Layer: vocab size, encoder, decoder, ctc + """ + if configs['cmvn_file'] is not None: + mean, istd = load_cmvn(configs['cmvn_file'], + configs['cmvn_file_type']) + global_cmvn = GlobalCMVN( + paddle.to_tensor(mean, dtype=paddle.float), + paddle.to_tensor(istd, dtype=paddle.float)) + else: + global_cmvn = None + + input_dim = configs['input_dim'] + vocab_size = configs['output_dim'] + assert input_dim != 0, input_dim + assert vocab_size != 0, vocab_size + + encoder_type = configs.get('encoder', 'transformer') + logger.info(f"U2 Encoder type: {encoder_type}") + if encoder_type == 'transformer': + encoder = TransformerEncoder( + input_dim, global_cmvn=global_cmvn, **configs['encoder_conf']) + elif encoder_type == 'conformer': + encoder = ConformerEncoder( + input_dim, global_cmvn=global_cmvn, **configs['encoder_conf']) + else: + raise ValueError(f"not support encoder type:{encoder_type}") + + st_decoder = TransformerDecoder(vocab_size, + encoder.output_size(), + **configs['decoder_conf']) + + asr_weight = configs['model_conf']['asr_weight'] + logger.info(f"ASR Joint Training Weight: {asr_weight}") + + if asr_weight > 0.: + decoder = TransformerDecoder(vocab_size, + encoder.output_size(), + **configs['decoder_conf']) + # ctc decoder and ctc loss + model_conf = configs['model_conf'] + dropout_rate = model_conf.get('ctc_dropout_rate', 0.0) + grad_norm_type = model_conf.get('ctc_grad_norm_type', None) + ctc = CTCDecoderBase( + odim=vocab_size, + enc_n_units=encoder.output_size(), + blank_id=0, + dropout_rate=dropout_rate, + reduction=True, # sum + batch_average=True, # sum / batch_size + grad_norm_type=grad_norm_type) + + return vocab_size, encoder, (st_decoder, decoder, ctc) + else: + return vocab_size, encoder, st_decoder + + @classmethod + def from_config(cls, configs: dict): + """init model. + + Args: + configs (dict): config dict. + + Raises: + ValueError: raise when using not support encoder type. + + Returns: + nn.Layer: U2STModel + """ + model = cls(configs) + return model + + @classmethod + def from_pretrained(cls, dataloader, config, checkpoint_path): + """Build a DeepSpeech2Model model from a pretrained model. + + Args: + dataloader (paddle.io.DataLoader): not used. 
+ config (yacs.config.CfgNode): model configs + checkpoint_path (Path or str): the path of pretrained model checkpoint, without extension name + + Returns: + DeepSpeech2Model: The model built from pretrained result. + """ + with UpdateConfig(config): + config.input_dim = dataloader.collate_fn.feature_size + config.output_dim = dataloader.collate_fn.vocab_size + + model = cls.from_config(config) + + if checkpoint_path: + infos = checkpoint.load_parameters( + model, checkpoint_path=checkpoint_path) + logger.info(f"checkpoint info: {infos}") + layer_tools.summary(model) + return model + + +class U2STInferModel(U2STModel): + def __init__(self, configs: dict): + super().__init__(configs) + + def forward(self, + feats, + feats_lengths, + decoding_chunk_size=-1, + num_decoding_left_chunks=-1, + simulate_streaming=False): + """export model function + + Args: + feats (Tensor): [B, T, D] + feats_lengths (Tensor): [B] + + Returns: + List[List[int]]: best path result + """ + return self.translate( + feats, + feats_lengths, + decoding_chunk_size=decoding_chunk_size, + num_decoding_left_chunks=num_decoding_left_chunks, + simulate_streaming=simulate_streaming) diff --git a/ernie-sat/paddlespeech/s2t/modules/__init__.py b/ernie-sat/paddlespeech/s2t/modules/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..185a92b8d94d3426d616c0624f0f2ee04339349e --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/modules/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/ernie-sat/paddlespeech/s2t/modules/activation.py b/ernie-sat/paddlespeech/s2t/modules/activation.py new file mode 100644 index 0000000000000000000000000000000000000000..2f387b0d99b68ed5d37cb05a13a030ad49aaa381 --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/modules/activation.py @@ -0,0 +1,164 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
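Before the activation module below, a usage sketch for the `U2STModel` defined above. All config values are illustrative assumptions rather than values from this patch; `translate` asserts a single-utterance batch and runs attention beam search with a word reward and a max-length ratio:

```python
import paddle

from paddlespeech.s2t.models.u2_st import U2STModel

# Illustrative config; real values come from the experiment YAML.
configs = {
    'cmvn_file': None,
    'cmvn_file_type': 'json',
    'input_dim': 80,
    'output_dim': 5000,
    'encoder': 'conformer',
    'encoder_conf': {'output_size': 256, 'attention_heads': 4,
                     'linear_units': 2048, 'num_blocks': 12},
    'decoder_conf': {'attention_heads': 4, 'linear_units': 2048,
                     'num_blocks': 6},
    # asr_weight == 0.0 builds the ST decoder only (no ASR decoder / CTC head).
    'model_conf': {'asr_weight': 0.0, 'ctc_weight': 0.0, 'lsm_weight': 0.1},
}

model = U2STModel.from_config(configs)
model.eval()

feats = paddle.randn([1, 200, 80])   # translate() requires batch size 1
feats_lens = paddle.to_tensor([200], dtype='int64')

with paddle.no_grad():
    hyp = model.translate(feats, feats_lens, beam_size=10,
                          word_reward=0.1, maxlenratio=0.5)
print(hyp.shape)  # (1, decoded_length): token ids without the leading <sos>
```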
+from collections import OrderedDict + +import paddle +from paddle import nn +from paddle.nn import functional as F + +from paddlespeech.s2t.modules.align import Conv2D +from paddlespeech.s2t.modules.align import Linear +from paddlespeech.s2t.utils.log import Log + +logger = Log(__name__).getlog() + +__all__ = ["get_activation", "brelu", "LinearGLUBlock", "ConvGLUBlock", "GLU"] + + +def brelu(x, t_min=0.0, t_max=24.0, name=None): + # paddle.to_tensor is dygraph_only can not work under JIT + t_min = paddle.full(shape=[1], fill_value=t_min, dtype='float32') + t_max = paddle.full(shape=[1], fill_value=t_max, dtype='float32') + return x.maximum(t_min).minimum(t_max) + + +class GLU(nn.Layer): + """Gated Linear Units (GLU) Layer""" + + def __init__(self, dim: int=-1): + super().__init__() + self.dim = dim + + def forward(self, xs): + return F.glu(xs, axis=self.dim) + + +class LinearGLUBlock(nn.Layer): + """A linear Gated Linear Units (GLU) block.""" + + def __init__(self, idim: int): + """ GLU. + Args: + idim (int): input and output dimension + """ + super().__init__() + self.fc = Linear(idim, idim * 2) + + def forward(self, xs): + return glu(self.fc(xs), dim=-1) + + +class ConvGLUBlock(nn.Layer): + def __init__(self, kernel_size, in_ch, out_ch, bottlececk_dim=0, + dropout=0.): + """A convolutional Gated Linear Units (GLU) block. + + Args: + kernel_size (int): kernel size + in_ch (int): number of input channels + out_ch (int): number of output channels + bottlececk_dim (int): dimension of the bottleneck layers for computational efficiency. Defaults to 0. + dropout (float): dropout probability. Defaults to 0.. + """ + + super().__init__() + + self.conv_residual = None + if in_ch != out_ch: + self.conv_residual = nn.utils.weight_norm( + Conv2D( + in_channels=in_ch, out_channels=out_ch, kernel_size=(1, 1)), + name='weight', + dim=0) + self.dropout_residual = nn.Dropout(p=dropout) + + self.pad_left = nn.Pad2d((0, 0, kernel_size - 1, 0), 0) + + layers = OrderedDict() + if bottlececk_dim == 0: + layers['conv'] = nn.utils.weight_norm( + Conv2D( + in_channels=in_ch, + out_channels=out_ch * 2, + kernel_size=(kernel_size, 1)), + name='weight', + dim=0) + # TODO(hirofumi0810): padding? + layers['dropout'] = nn.Dropout(p=dropout) + layers['glu'] = GLU() + + elif bottlececk_dim > 0: + layers['conv_in'] = nn.utils.weight_norm( + nn.Conv2D( + in_channels=in_ch, + out_channels=bottlececk_dim, + kernel_size=(1, 1)), + name='weight', + dim=0) + layers['dropout_in'] = nn.Dropout(p=dropout) + layers['conv_bottleneck'] = nn.utils.weight_norm( + Conv2D( + in_channels=bottlececk_dim, + out_channels=bottlececk_dim, + kernel_size=(kernel_size, 1)), + name='weight', + dim=0) + layers['dropout'] = nn.Dropout(p=dropout) + layers['glu'] = GLU() + layers['conv_out'] = nn.utils.weight_norm( + Conv2D( + in_channels=bottlececk_dim, + out_channels=out_ch * 2, + kernel_size=(1, 1)), + name='weight', + dim=0) + layers['dropout_out'] = nn.Dropout(p=dropout) + + self.layers = nn.Sequential(layers) + + def forward(self, xs): + """Forward pass. 
+ Args: + xs (FloatTensor): `[B, in_ch, T, feat_dim]` + Returns: + out (FloatTensor): `[B, out_ch, T, feat_dim]` + """ + residual = xs + if self.conv_residual is not None: + residual = self.dropout_residual(self.conv_residual(residual)) + xs = self.pad_left(xs) # `[B, embed_dim, T+kernel-1, 1]` + xs = self.layers(xs) # `[B, out_ch * 2, T ,1]` + xs = xs + residual + return xs + + +def get_activation(act): + """Return activation function.""" + # Lazy load to avoid unused import + activation_funcs = { + "hardshrink": paddle.nn.Hardshrink, + "hardswish": paddle.nn.Hardswish, + "hardtanh": paddle.nn.Hardtanh, + "tanh": paddle.nn.Tanh, + "relu": paddle.nn.ReLU, + "relu6": paddle.nn.ReLU6, + "leakyrelu": paddle.nn.LeakyReLU, + "selu": paddle.nn.SELU, + "swish": paddle.nn.Swish, + "gelu": paddle.nn.GELU, + "glu": GLU, + "elu": paddle.nn.ELU, + } + + return activation_funcs[act]() diff --git a/ernie-sat/paddlespeech/s2t/modules/align.py b/ernie-sat/paddlespeech/s2t/modules/align.py new file mode 100644 index 0000000000000000000000000000000000000000..f889167936115ccc7267037d9046765f83b403bd --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/modules/align.py @@ -0,0 +1,139 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import paddle +from paddle import nn + +from paddlespeech.s2t.modules.initializer import KaimingUniform +""" + To align the initializer between paddle and torch, + the API below are set defalut initializer with priority higger than global initializer. 
+""" +global_init_type = None + + +class LayerNorm(nn.LayerNorm): + def __init__(self, + normalized_shape, + epsilon=1e-05, + weight_attr=None, + bias_attr=None, + name=None): + if weight_attr is None: + weight_attr = paddle.ParamAttr( + initializer=nn.initializer.Constant(1.0)) + if bias_attr is None: + bias_attr = paddle.ParamAttr( + initializer=nn.initializer.Constant(0.0)) + super(LayerNorm, self).__init__(normalized_shape, epsilon, weight_attr, + bias_attr, name) + + +class BatchNorm1D(nn.BatchNorm1D): + def __init__(self, + num_features, + momentum=0.9, + epsilon=1e-05, + weight_attr=None, + bias_attr=None, + data_format='NCL', + name=None): + if weight_attr is None: + weight_attr = paddle.ParamAttr( + initializer=nn.initializer.Constant(1.0)) + if bias_attr is None: + bias_attr = paddle.ParamAttr( + initializer=nn.initializer.Constant(0.0)) + super(BatchNorm1D, + self).__init__(num_features, momentum, epsilon, weight_attr, + bias_attr, data_format, name) + + +class Embedding(nn.Embedding): + def __init__(self, + num_embeddings, + embedding_dim, + padding_idx=None, + sparse=False, + weight_attr=None, + name=None): + if weight_attr is None: + weight_attr = paddle.ParamAttr(initializer=nn.initializer.Normal()) + super(Embedding, self).__init__(num_embeddings, embedding_dim, + padding_idx, sparse, weight_attr, name) + + +class Linear(nn.Linear): + def __init__(self, + in_features, + out_features, + weight_attr=None, + bias_attr=None, + name=None): + if weight_attr is None: + if global_init_type == "kaiming_uniform": + weight_attr = paddle.ParamAttr(initializer=KaimingUniform()) + if bias_attr is None: + if global_init_type == "kaiming_uniform": + bias_attr = paddle.ParamAttr(initializer=KaimingUniform()) + super(Linear, self).__init__(in_features, out_features, weight_attr, + bias_attr, name) + + +class Conv1D(nn.Conv1D): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + padding_mode='zeros', + weight_attr=None, + bias_attr=None, + data_format='NCL'): + if weight_attr is None: + if global_init_type == "kaiming_uniform": + print("set kaiming_uniform") + weight_attr = paddle.ParamAttr(initializer=KaimingUniform()) + if bias_attr is None: + if global_init_type == "kaiming_uniform": + bias_attr = paddle.ParamAttr(initializer=KaimingUniform()) + super(Conv1D, self).__init__( + in_channels, out_channels, kernel_size, stride, padding, dilation, + groups, padding_mode, weight_attr, bias_attr, data_format) + + +class Conv2D(nn.Conv2D): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + padding_mode='zeros', + weight_attr=None, + bias_attr=None, + data_format='NCHW'): + if weight_attr is None: + if global_init_type == "kaiming_uniform": + weight_attr = paddle.ParamAttr(initializer=KaimingUniform()) + if bias_attr is None: + if global_init_type == "kaiming_uniform": + bias_attr = paddle.ParamAttr(initializer=KaimingUniform()) + super(Conv2D, self).__init__( + in_channels, out_channels, kernel_size, stride, padding, dilation, + groups, padding_mode, weight_attr, bias_attr, data_format) diff --git a/ernie-sat/paddlespeech/s2t/modules/attention.py b/ernie-sat/paddlespeech/s2t/modules/attention.py new file mode 100644 index 0000000000000000000000000000000000000000..438efd2a14151904cb75ff6c72f7be01663bff09 --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/modules/attention.py @@ -0,0 +1,237 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+# Copyright 2019 Mobvoi Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from wenet(https://github.com/wenet-e2e/wenet) +"""Multi-Head Attention layer definition.""" +import math +from typing import Optional +from typing import Tuple + +import paddle +from paddle import nn +from paddle.nn import initializer as I + +from paddlespeech.s2t.modules.align import Linear +from paddlespeech.s2t.utils.log import Log + +logger = Log(__name__).getlog() + +__all__ = ["MultiHeadedAttention", "RelPositionMultiHeadedAttention"] + +# Relative Positional Encodings +# https://www.jianshu.com/p/c0608efcc26f +# https://zhuanlan.zhihu.com/p/344604604 + + +class MultiHeadedAttention(nn.Layer): + """Multi-Head Attention layer.""" + + def __init__(self, n_head: int, n_feat: int, dropout_rate: float): + """Construct an MultiHeadedAttention object. + Args: + n_head (int): The number of heads. + n_feat (int): The number of features. + dropout_rate (float): Dropout rate. + """ + super().__init__() + assert n_feat % n_head == 0 + # We assume d_v always equals d_k + self.d_k = n_feat // n_head + self.h = n_head + self.linear_q = Linear(n_feat, n_feat) + self.linear_k = Linear(n_feat, n_feat) + self.linear_v = Linear(n_feat, n_feat) + self.linear_out = Linear(n_feat, n_feat) + self.dropout = nn.Dropout(p=dropout_rate) + + def forward_qkv(self, + query: paddle.Tensor, + key: paddle.Tensor, + value: paddle.Tensor + ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]: + """Transform query, key and value. + Args: + query (paddle.Tensor): Query tensor (#batch, time1, size). + key (paddle.Tensor): Key tensor (#batch, time2, size). + value (paddle.Tensor): Value tensor (#batch, time2, size). + Returns: + paddle.Tensor: Transformed query tensor, size + (#batch, n_head, time1, d_k). + paddle.Tensor: Transformed key tensor, size + (#batch, n_head, time2, d_k). + paddle.Tensor: Transformed value tensor, size + (#batch, n_head, time2, d_k). + """ + n_batch = query.shape[0] + q = self.linear_q(query).view(n_batch, -1, self.h, self.d_k) + k = self.linear_k(key).view(n_batch, -1, self.h, self.d_k) + v = self.linear_v(value).view(n_batch, -1, self.h, self.d_k) + q = q.transpose([0, 2, 1, 3]) # (batch, head, time1, d_k) + k = k.transpose([0, 2, 1, 3]) # (batch, head, time2, d_k) + v = v.transpose([0, 2, 1, 3]) # (batch, head, time2, d_k) + + return q, k, v + + def forward_attention(self, + value: paddle.Tensor, + scores: paddle.Tensor, + mask: Optional[paddle.Tensor]) -> paddle.Tensor: + """Compute attention context vector. + Args: + value (paddle.Tensor): Transformed value, size + (#batch, n_head, time2, d_k). + scores (paddle.Tensor): Attention score, size + (#batch, n_head, time1, time2). + mask (paddle.Tensor): Mask, size (#batch, 1, time2) or + (#batch, time1, time2). + Returns: + paddle.Tensor: Transformed value weighted + by the attention score, (#batch, time1, d_model). 
+ """ + n_batch = value.shape[0] + if mask is not None: + mask = mask.unsqueeze(1).eq(0) # (batch, 1, *, time2) + scores = scores.masked_fill(mask, -float('inf')) + attn = paddle.softmax( + scores, axis=-1).masked_fill(mask, + 0.0) # (batch, head, time1, time2) + else: + attn = paddle.softmax( + scores, axis=-1) # (batch, head, time1, time2) + + p_attn = self.dropout(attn) + x = paddle.matmul(p_attn, value) # (batch, head, time1, d_k) + x = x.transpose([0, 2, 1, 3]).view(n_batch, -1, self.h * + self.d_k) # (batch, time1, d_model) + + return self.linear_out(x) # (batch, time1, d_model) + + def forward(self, + query: paddle.Tensor, + key: paddle.Tensor, + value: paddle.Tensor, + mask: Optional[paddle.Tensor]) -> paddle.Tensor: + """Compute scaled dot product attention. + Args: + query (torch.Tensor): Query tensor (#batch, time1, size). + key (torch.Tensor): Key tensor (#batch, time2, size). + value (torch.Tensor): Value tensor (#batch, time2, size). + mask (torch.Tensor): Mask tensor (#batch, 1, time2) or + (#batch, time1, time2). + Returns: + torch.Tensor: Output tensor (#batch, time1, d_model). + """ + q, k, v = self.forward_qkv(query, key, value) + scores = paddle.matmul(q, + k.transpose([0, 1, 3, 2])) / math.sqrt(self.d_k) + return self.forward_attention(v, scores, mask) + + +class RelPositionMultiHeadedAttention(MultiHeadedAttention): + """Multi-Head Attention layer with relative position encoding.""" + + def __init__(self, n_head, n_feat, dropout_rate): + """Construct an RelPositionMultiHeadedAttention object. + Paper: https://arxiv.org/abs/1901.02860 + Args: + n_head (int): The number of heads. + n_feat (int): The number of features. + dropout_rate (float): Dropout rate. + """ + super().__init__(n_head, n_feat, dropout_rate) + # linear transformation for positional encoding + self.linear_pos = Linear(n_feat, n_feat, bias_attr=False) + # these two learnable bias are used in matrix c and matrix d + # as described in https://arxiv.org/abs/1901.02860 Section 3.3 + #self.pos_bias_u = nn.Parameter(torch.Tensor(self.h, self.d_k)) + #self.pos_bias_v = nn.Parameter(torch.Tensor(self.h, self.d_k)) + #torch.nn.init.xavier_uniform_(self.pos_bias_u) + #torch.nn.init.xavier_uniform_(self.pos_bias_v) + pos_bias_u = self.create_parameter( + [self.h, self.d_k], default_initializer=I.XavierUniform()) + self.add_parameter('pos_bias_u', pos_bias_u) + pos_bias_v = self.create_parameter( + (self.h, self.d_k), default_initializer=I.XavierUniform()) + self.add_parameter('pos_bias_v', pos_bias_v) + + def rel_shift(self, x, zero_triu: bool=False): + """Compute relative positinal encoding. + Args: + x (paddle.Tensor): Input tensor (batch, head, time1, time1). + zero_triu (bool): If true, return the lower triangular part of + the matrix. + Returns: + paddle.Tensor: Output tensor. (batch, head, time1, time1) + """ + zero_pad = paddle.zeros( + (x.shape[0], x.shape[1], x.shape[2], 1), dtype=x.dtype) + x_padded = paddle.cat([zero_pad, x], dim=-1) + + x_padded = x_padded.view(x.shape[0], x.shape[1], x.shape[3] + 1, + x.shape[2]) + x = x_padded[:, :, 1:].view_as(x) # [B, H, T1, T1] + + if zero_triu: + ones = paddle.ones((x.shape[2], x.shape[3])) + x = x * paddle.tril(ones, x.shape[3] - x.shape[2])[None, None, :, :] + + return x + + def forward(self, + query: paddle.Tensor, + key: paddle.Tensor, + value: paddle.Tensor, + pos_emb: paddle.Tensor, + mask: Optional[paddle.Tensor]): + """Compute 'Scaled Dot Product Attention' with rel. positional encoding. + Args: + query (paddle.Tensor): Query tensor (#batch, time1, size). 
+ key (paddle.Tensor): Key tensor (#batch, time2, size). + value (paddle.Tensor): Value tensor (#batch, time2, size). + pos_emb (paddle.Tensor): Positional embedding tensor + (#batch, time1, size). + mask (paddle.Tensor): Mask tensor (#batch, 1, time2) or + (#batch, time1, time2). + Returns: + paddle.Tensor: Output tensor (#batch, time1, d_model). + """ + q, k, v = self.forward_qkv(query, key, value) + q = q.transpose([0, 2, 1, 3]) # (batch, time1, head, d_k) + + n_batch_pos = pos_emb.shape[0] + p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.h, self.d_k) + p = p.transpose([0, 2, 1, 3]) # (batch, head, time1, d_k) + + # (batch, head, time1, d_k) + q_with_bias_u = (q + self.pos_bias_u).transpose([0, 2, 1, 3]) + # (batch, head, time1, d_k) + q_with_bias_v = (q + self.pos_bias_v).transpose([0, 2, 1, 3]) + + # compute attention score + # first compute matrix a and matrix c + # as described in https://arxiv.org/abs/1901.02860 Section 3.3 + # (batch, head, time1, time2) + matrix_ac = paddle.matmul(q_with_bias_u, k.transpose([0, 1, 3, 2])) + + # compute matrix b and matrix d + # (batch, head, time1, time2) + matrix_bd = paddle.matmul(q_with_bias_v, p.transpose([0, 1, 3, 2])) + # Remove rel_shift since it is useless in speech recognition, + # and it requires special attention for streaming. + # matrix_bd = self.rel_shift(matrix_bd) + + scores = (matrix_ac + matrix_bd) / math.sqrt( + self.d_k) # (batch, head, time1, time2) + + return self.forward_attention(v, scores, mask) diff --git a/ernie-sat/paddlespeech/s2t/modules/cmvn.py b/ernie-sat/paddlespeech/s2t/modules/cmvn.py new file mode 100644 index 0000000000000000000000000000000000000000..67f71b6678e9908613b0fe867a44453fb204297a --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/modules/cmvn.py @@ -0,0 +1,53 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2019 Mobvoi Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
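Before the CMVN module below, a minimal shape check for the `MultiHeadedAttention` layer defined above. The batch, head, and time sizes are arbitrary choices for this sketch, and `mask=None` simply skips padding masking:

```python
import paddle

from paddlespeech.s2t.modules.attention import MultiHeadedAttention

attn = MultiHeadedAttention(n_head=4, n_feat=256, dropout_rate=0.1)
attn.eval()

q = paddle.randn([2, 10, 256])   # queries: (batch, time1, n_feat)
kv = paddle.randn([2, 20, 256])  # keys/values: (batch, time2, n_feat)

# mask=None: every query position may attend to every key position.
out = attn(q, kv, kv, None)
print(out.shape)  # [2, 10, 256]: one d_model-sized context vector per query step
```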
+# Modified from wenet(https://github.com/wenet-e2e/wenet) +import paddle +from paddle import nn + +from paddlespeech.s2t.utils.log import Log + +logger = Log(__name__).getlog() + +__all__ = ['GlobalCMVN'] + + +class GlobalCMVN(nn.Layer): + def __init__(self, + mean: paddle.Tensor, + istd: paddle.Tensor, + norm_var: bool=True): + """ + Args: + mean (paddle.Tensor): mean stats + istd (paddle.Tensor): inverse std, std which is 1.0 / std + """ + super().__init__() + assert mean.shape == istd.shape + self.norm_var = norm_var + # The buffer can be accessed from this module using self.mean + self.register_buffer("mean", mean) + self.register_buffer("istd", istd) + + def forward(self, x: paddle.Tensor): + """ + Args: + x (paddle.Tensor): (batch, max_len, feat_dim) + Returns: + (paddle.Tensor): normalized feature + """ + x = x - self.mean + if self.norm_var: + x = x * self.istd + return x diff --git a/ernie-sat/paddlespeech/s2t/modules/conformer_convolution.py b/ernie-sat/paddlespeech/s2t/modules/conformer_convolution.py new file mode 100644 index 0000000000000000000000000000000000000000..89e6526885a2679b8ab09a4e4e4423a15e51ac08 --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/modules/conformer_convolution.py @@ -0,0 +1,166 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2019 Mobvoi Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from wenet(https://github.com/wenet-e2e/wenet) +"""ConvolutionModule definition.""" +from typing import Optional +from typing import Tuple + +import paddle +from paddle import nn +from typeguard import check_argument_types + +from paddlespeech.s2t.modules.align import BatchNorm1D +from paddlespeech.s2t.modules.align import Conv1D +from paddlespeech.s2t.modules.align import LayerNorm +from paddlespeech.s2t.utils.log import Log + +logger = Log(__name__).getlog() + +__all__ = ['ConvolutionModule'] + + +class ConvolutionModule(nn.Layer): + """ConvolutionModule in Conformer model.""" + + def __init__(self, + channels: int, + kernel_size: int=15, + activation: nn.Layer=nn.ReLU(), + norm: str="batch_norm", + causal: bool=False, + bias: bool=True): + """Construct an ConvolutionModule object. + Args: + channels (int): The number of channels of conv layers. + kernel_size (int): Kernel size of conv layers. + activation (nn.Layer): Activation Layer. + norm (str): Normalization type, 'batch_norm' or 'layer_norm' + causal (bool): Whether use causal convolution or not + bias (bool): Whether Conv with bias or not + """ + assert check_argument_types() + super().__init__() + self.pointwise_conv1 = Conv1D( + channels, + 2 * channels, + kernel_size=1, + stride=1, + padding=0, + bias_attr=None + if bias else False, # None for True, using bias as default config + ) + + # self.lorder is used to distinguish if it's a causal convolution, + # if self.lorder > 0: + # it's a causal convolution, the input will be padded with + # `self.lorder` frames on the left in forward (causal conv impl). 
+ # else: it's a symmetrical convolution + if causal: + padding = 0 + self.lorder = kernel_size - 1 + else: + # kernel_size should be an odd number for none causal convolution + assert (kernel_size - 1) % 2 == 0 + padding = (kernel_size - 1) // 2 + self.lorder = 0 + + self.depthwise_conv = Conv1D( + channels, + channels, + kernel_size, + stride=1, + padding=padding, + groups=channels, + bias_attr=None + if bias else False, # None for True, using bias as default config + ) + + assert norm in ['batch_norm', 'layer_norm'] + if norm == "batch_norm": + self.use_layer_norm = False + self.norm = BatchNorm1D(channels) + else: + self.use_layer_norm = True + self.norm = LayerNorm(channels) + + self.pointwise_conv2 = Conv1D( + channels, + channels, + kernel_size=1, + stride=1, + padding=0, + bias_attr=None + if bias else False, # None for True, using bias as default config + ) + self.activation = activation + + def forward(self, + x: paddle.Tensor, + mask_pad: Optional[paddle.Tensor]=None, + cache: Optional[paddle.Tensor]=None + ) -> Tuple[paddle.Tensor, paddle.Tensor]: + """Compute convolution module. + Args: + x (paddle.Tensor): Input tensor (#batch, time, channels). + mask_pad (paddle.Tensor): used for batch padding, (#batch, channels, time). + cache (paddle.Tensor): left context cache, it is only + used in causal convolution. (#batch, channels, time') + Returns: + paddle.Tensor: Output tensor (#batch, time, channels). + paddle.Tensor: Output cache tensor (#batch, channels, time') + """ + # exchange the temporal dimension and the feature dimension + x = x.transpose([0, 2, 1]) # [B, C, T] + + # mask batch padding + if mask_pad is not None: + x = x.masked_fill(mask_pad, 0.0) + + if self.lorder > 0: + if cache is None: + x = nn.functional.pad( + x, [self.lorder, 0], 'constant', 0.0, data_format='NCL') + else: + assert cache.shape[0] == x.shape[0] # B + assert cache.shape[1] == x.shape[1] # C + x = paddle.concat((cache, x), axis=2) + + assert (x.shape[2] > self.lorder) + new_cache = x[:, :, -self.lorder:] #[B, C, T] + else: + # It's better we just return None if no cache is requried, + # However, for JIT export, here we just fake one tensor instead of + # None. + new_cache = paddle.zeros([1], dtype=x.dtype) + + # GLU mechanism + x = self.pointwise_conv1(x) # (batch, 2*channel, dim) + x = nn.functional.glu(x, axis=1) # (batch, channel, dim) + + # 1D Depthwise Conv + x = self.depthwise_conv(x) + if self.use_layer_norm: + x = x.transpose([0, 2, 1]) # [B, T, C] + x = self.activation(self.norm(x)) + if self.use_layer_norm: + x = x.transpose([0, 2, 1]) # [B, C, T] + x = self.pointwise_conv2(x) + + # mask batch padding + if mask_pad is not None: + x = x.masked_fill(mask_pad, 0.0) + + x = x.transpose([0, 2, 1]) # [B, T, C] + return x, new_cache diff --git a/ernie-sat/paddlespeech/s2t/modules/crf.py b/ernie-sat/paddlespeech/s2t/modules/crf.py new file mode 100644 index 0000000000000000000000000000000000000000..66f6b18065f4171c121dd56494e229556bf47af4 --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/modules/crf.py @@ -0,0 +1,370 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import paddle +from paddle import nn + +from paddlespeech.s2t.utils.log import Log + +logger = Log(__name__).getlog() + +__all__ = ['CRF'] + + +class CRF(nn.Layer): + """ + Linear-chain Conditional Random Field (CRF). + + Args: + nb_labels (int): number of labels in your tagset, including special symbols. + bos_tag_id (int): integer representing the beginning of sentence symbol in + your tagset. + eos_tag_id (int): integer representing the end of sentence symbol in your tagset. + pad_tag_id (int, optional): integer representing the pad symbol in your tagset. + If None, the model will treat the PAD as a normal tag. Otherwise, the model + will apply constraints for PAD transitions. + batch_first (bool): Whether the first dimension represents the batch dimension. + """ + + def __init__(self, + nb_labels: int, + bos_tag_id: int, + eos_tag_id: int, + pad_tag_id: int=None, + batch_first: bool=True): + super().__init__() + + self.nb_labels = nb_labels + self.BOS_TAG_ID = bos_tag_id + self.EOS_TAG_ID = eos_tag_id + self.PAD_TAG_ID = pad_tag_id + self.batch_first = batch_first + + # initialize transitions from a random uniform distribution between -0.1 and 0.1 + self.transitions = self.create_parameter( + [self.nb_labels, self.nb_labels], + default_initializer=nn.initializer.Uniform(-0.1, 0.1)) + self.init_weights() + + def init_weights(self): + # enforce contraints (rows=from, columns=to) with a big negative number + # so exp(-10000) will tend to zero + + # no transitions allowed to the beginning of sentence + self.transitions[:, self.BOS_TAG_ID] = -10000.0 + # no transition alloed from the end of sentence + self.transitions[self.EOS_TAG_ID, :] = -10000.0 + + if self.PAD_TAG_ID is not None: + # no transitions from padding + self.transitions[self.PAD_TAG_ID, :] = -10000.0 + # no transitions to padding + self.transitions[:, self.PAD_TAG_ID] = -10000.0 + # except if the end of sentence is reached + # or we are already in a pad position + self.transitions[self.PAD_TAG_ID, self.EOS_TAG_ID] = 0.0 + self.transitions[self.PAD_TAG_ID, self.PAD_TAG_ID] = 0.0 + + def forward(self, + emissions: paddle.Tensor, + tags: paddle.Tensor, + mask: paddle.Tensor=None) -> paddle.Tensor: + """Compute the negative log-likelihood. See `log_likelihood` method.""" + nll = -self.log_likelihood(emissions, tags, mask=mask) + return nll + + def log_likelihood(self, emissions, tags, mask=None): + """Compute the probability of a sequence of tags given a sequence of + emissions scores. + + Args: + emissions (paddle.Tensor): Sequence of emissions for each label. + Shape of (batch_size, seq_len, nb_labels) if batch_first is True, + (seq_len, batch_size, nb_labels) otherwise. + tags (paddle.LongTensor): Sequence of labels. + Shape of (batch_size, seq_len) if batch_first is True, + (seq_len, batch_size) otherwise. + mask (paddle.FloatTensor, optional): Tensor representing valid positions. + If None, all positions are considered valid. + Shape of (batch_size, seq_len) if batch_first is True, + (seq_len, batch_size) otherwise. + + Returns: + paddle.Tensor: sum of the log-likelihoods for each sequence in the batch. 
+ Shape of () + """ + # fix tensors order by setting batch as the first dimension + if not self.batch_first: + emissions = emissions.transpose(0, 1) + tags = tags.transpose(0, 1) + + if mask is None: + mask = paddle.ones(emissions.shape[:2], dtype=paddle.float) + + scores = self._compute_scores(emissions, tags, mask=mask) + partition = self._compute_log_partition(emissions, mask=mask) + return paddle.sum(scores - partition) + + def decode(self, emissions, mask=None): + """Find the most probable sequence of labels given the emissions using + the Viterbi algorithm. + + Args: + emissions (paddle.Tensor): Sequence of emissions for each label. + Shape (batch_size, seq_len, nb_labels) if batch_first is True, + (seq_len, batch_size, nb_labels) otherwise. + mask (paddle.FloatTensor, optional): Tensor representing valid positions. + If None, all positions are considered valid. + Shape (batch_size, seq_len) if batch_first is True, + (seq_len, batch_size) otherwise. + + Returns: + paddle.Tensor: the viterbi score for the for each batch. + Shape of (batch_size,) + list of lists: the best viterbi sequence of labels for each batch. [B, T] + """ + # fix tensors order by setting batch as the first dimension + if not self.batch_first: + emissions = emissions.transpose(0, 1) + tags = tags.transpose(0, 1) + + if mask is None: + mask = paddle.ones(emissions.shape[:2], dtype=paddle.float) + + scores, sequences = self._viterbi_decode(emissions, mask) + return scores, sequences + + def _compute_scores(self, emissions, tags, mask): + """Compute the scores for a given batch of emissions with their tags. + + Args: + emissions (paddle.Tensor): (batch_size, seq_len, nb_labels) + tags (Paddle.LongTensor): (batch_size, seq_len) + mask (Paddle.FloatTensor): (batch_size, seq_len) + + Returns: + paddle.Tensor: Scores for each batch. + Shape of (batch_size,) + """ + batch_size, seq_length = tags.shape + scores = paddle.zeros([batch_size]) + + # save first and last tags to be used later + first_tags = tags[:, 0] + last_valid_idx = mask.int().sum(1) - 1 + + # TODO(Hui Zhang): not support fancy index. 
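+        # The commented line below shows the intended gather/fancy-index form;
+        # it is emulated here by stacking (batch, position) index pairs and
+        # calling gather_nd, which performs the same per-sample lookup.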
+ # last_tags = tags.gather(last_valid_idx.unsqueeze(1), axis=1).squeeze() + batch_idx = paddle.arange(batch_size, dtype=last_valid_idx.dtype) + gather_last_valid_idx = paddle.stack( + [batch_idx, last_valid_idx], axis=-1) + last_tags = tags.gather_nd(gather_last_valid_idx) + + # add the transition from BOS to the first tags for each batch + # t_scores = self.transitions[self.BOS_TAG_ID, first_tags] + t_scores = self.transitions[self.BOS_TAG_ID].gather(first_tags) + + # add the [unary] emission scores for the first tags for each batch + # for all batches, the first word, see the correspondent emissions + # for the first tags (which is a list of ids): + # emissions[:, 0, [tag_1, tag_2, ..., tag_nblabels]] + # e_scores = emissions[:, 0].gather(1, first_tags.unsqueeze(1)).squeeze() + gather_first_tags_idx = paddle.stack([batch_idx, first_tags], axis=-1) + e_scores = emissions[:, 0].gather_nd(gather_first_tags_idx) + + # the scores for a word is just the sum of both scores + scores += e_scores + t_scores + + # now lets do this for each remaining word + for i in range(1, seq_length): + + # we could: iterate over batches, check if we reached a mask symbol + # and stop the iteration, but vecotrizing is faster due to gpu, + # so instead we perform an element-wise multiplication + is_valid = mask[:, i] + + previous_tags = tags[:, i - 1] + current_tags = tags[:, i] + + # calculate emission and transition scores as we did before + # e_scores = emissions[:, i].gather(1, current_tags.unsqueeze(1)).squeeze() + gather_current_tags_idx = paddle.stack( + [batch_idx, current_tags], axis=-1) + e_scores = emissions[:, i].gather_nd(gather_current_tags_idx) + # t_scores = self.transitions[previous_tags, current_tags] + gather_transitions_idx = paddle.stack( + [previous_tags, current_tags], axis=-1) + t_scores = self.transitions.gather_nd(gather_transitions_idx) + + # apply the mask + e_scores = e_scores * is_valid + t_scores = t_scores * is_valid + + scores += e_scores + t_scores + + # add the transition from the end tag to the EOS tag for each batch + # scores += self.transitions[last_tags, self.EOS_TAG_ID] + scores += self.transitions.gather(last_tags)[:, self.EOS_TAG_ID] + + return scores + + def _compute_log_partition(self, emissions, mask): + """Compute the partition function in log-space using the forward-algorithm. + + Args: + emissions (paddle.Tensor): (batch_size, seq_len, nb_labels) + mask (Paddle.FloatTensor): (batch_size, seq_len) + + Returns: + paddle.Tensor: the partition scores for each batch. 
+ Shape of (batch_size,) + """ + batch_size, seq_length, nb_labels = emissions.shape + + # in the first iteration, BOS will have all the scores + alphas = self.transitions[self.BOS_TAG_ID, :].unsqueeze( + 0) + emissions[:, 0] + + for i in range(1, seq_length): + # (bs, nb_labels) -> (bs, 1, nb_labels) + e_scores = emissions[:, i].unsqueeze(1) + + # (nb_labels, nb_labels) -> (bs, nb_labels, nb_labels) + t_scores = self.transitions.unsqueeze(0) + + # (bs, nb_labels) -> (bs, nb_labels, 1) + a_scores = alphas.unsqueeze(2) + + scores = e_scores + t_scores + a_scores + new_alphas = paddle.logsumexp(scores, axis=1) + + # set alphas if the mask is valid, otherwise keep the current values + is_valid = mask[:, i].unsqueeze(-1) + alphas = is_valid * new_alphas + (1 - is_valid) * alphas + + # add the scores for the final transition + last_transition = self.transitions[:, self.EOS_TAG_ID] + end_scores = alphas + last_transition.unsqueeze(0) + + # return a *log* of sums of exps + return paddle.logsumexp(end_scores, axis=1) + + def _viterbi_decode(self, emissions, mask): + """Compute the viterbi algorithm to find the most probable sequence of labels + given a sequence of emissions. + + Args: + emissions (paddle.Tensor): (batch_size, seq_len, nb_labels) + mask (Paddle.FloatTensor): (batch_size, seq_len) + + Returns: + paddle.Tensor: the viterbi score for the for each batch. + Shape of (batch_size,) + list of lists of ints: the best viterbi sequence of labels for each batch + """ + batch_size, seq_length, nb_labels = emissions.shape + + # in the first iteration, BOS will have all the scores and then, the max + alphas = self.transitions[self.BOS_TAG_ID, :].unsqueeze( + 0) + emissions[:, 0] + + backpointers = [] + + for i in range(1, seq_length): + # (bs, nb_labels) -> (bs, 1, nb_labels) + e_scores = emissions[:, i].unsqueeze(1) + + # (nb_labels, nb_labels) -> (bs, nb_labels, nb_labels) + t_scores = self.transitions.unsqueeze(0) + + # (bs, nb_labels) -> (bs, nb_labels, 1) + a_scores = alphas.unsqueeze(2) + + # combine current scores with previous alphas + scores = e_scores + t_scores + a_scores + + # so far is exactly like the forward algorithm, + # but now, instead of calculating the logsumexp, + # we will find the highest score and the tag associated with it + # max_scores, max_score_tags = paddle.max(scores, axis=1) + max_scores = paddle.max(scores, axis=1) + max_score_tags = paddle.argmax(scores, axis=1) + + # set alphas if the mask is valid, otherwise keep the current values + is_valid = mask[:, i].unsqueeze(-1) + alphas = is_valid * max_scores + (1 - is_valid) * alphas + + # add the max_score_tags for our list of backpointers + # max_scores has shape (batch_size, nb_labels) so we transpose it to + # be compatible with our previous loopy version of viterbi + backpointers.append(max_score_tags.t()) + + # add the scores for the final transition + last_transition = self.transitions[:, self.EOS_TAG_ID] + end_scores = alphas + last_transition.unsqueeze(0) + + # get the final most probable score and the final most probable tag + # max_final_scores, max_final_tags = paddle.max(end_scores, axis=1) + max_final_scores = paddle.max(end_scores, axis=1) + max_final_tags = paddle.argmax(end_scores, axis=1) + + # find the best sequence of labels for each sample in the batch + best_sequences = [] + emission_lengths = mask.int().sum(axis=1) + for i in range(batch_size): + + # recover the original sentence length for the i-th sample in the batch + sample_length = emission_lengths[i].item() + + # recover the max tag for 
the last timestep + sample_final_tag = max_final_tags[i].item() + + # limit the backpointers until the last but one + # since the last corresponds to the sample_final_tag + sample_backpointers = backpointers[:sample_length - 1] + + # follow the backpointers to build the sequence of labels + sample_path = self._find_best_path(i, sample_final_tag, + sample_backpointers) + + # add this path to the list of best sequences + best_sequences.append(sample_path) + + return max_final_scores, best_sequences + + def _find_best_path(self, sample_id, best_tag, backpointers): + """Auxiliary function to find the best path sequence for a specific sample. + + Args: + sample_id (int): sample index in the range [0, batch_size) + best_tag (int): tag which maximizes the final score + backpointers (list of lists of tensors): list of pointers with + shape (seq_len_i-1, nb_labels, batch_size) where seq_len_i + represents the length of the ith sample in the batch + + Returns: + list of ints: a list of tag indexes representing the bast path + """ + # add the final best_tag to our best path + best_path = [best_tag] + + # traverse the backpointers in backwards + for backpointers_t in reversed(backpointers): + + # recover the best_tag at this timestep + best_tag = backpointers_t[best_tag][sample_id].item() + + # append to the beginning of the list so we don't need to reverse it later + best_path.insert(0, best_tag) + + return best_path diff --git a/ernie-sat/paddlespeech/s2t/modules/ctc.py b/ernie-sat/paddlespeech/s2t/modules/ctc.py new file mode 100644 index 0000000000000000000000000000000000000000..33ad472defba0a86bc945582f386acb406e4c35e --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/modules/ctc.py @@ -0,0 +1,470 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
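+"""CTC decoder definition."""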
+from typing import Union + +import paddle +from paddle import nn +from paddle.nn import functional as F +from typeguard import check_argument_types + +from paddlespeech.s2t.modules.align import Linear +from paddlespeech.s2t.modules.loss import CTCLoss +from paddlespeech.s2t.utils import ctc_utils +from paddlespeech.s2t.utils.log import Log + +logger = Log(__name__).getlog() + +try: + from paddlespeech.s2t.decoders.ctcdecoder import ctc_beam_search_decoding_batch # noqa: F401 + from paddlespeech.s2t.decoders.ctcdecoder import ctc_greedy_decoding # noqa: F401 + from paddlespeech.s2t.decoders.ctcdecoder import Scorer # noqa: F401 + from paddlespeech.s2t.decoders.ctcdecoder import CTCBeamSearchDecoder # noqa: F401 +except ImportError: + try: + from paddlespeech.s2t.utils import dynamic_pip_install + package_name = 'paddlespeech_ctcdecoders' + dynamic_pip_install.install(package_name) + from paddlespeech.s2t.decoders.ctcdecoder import ctc_beam_search_decoding_batch # noqa: F401 + from paddlespeech.s2t.decoders.ctcdecoder import ctc_greedy_decoding # noqa: F401 + from paddlespeech.s2t.decoders.ctcdecoder import Scorer # noqa: F401 + from paddlespeech.s2t.decoders.ctcdecoder import CTCBeamSearchDecoder # noqa: F401 + except Exception as e: + logger.info("paddlespeech_ctcdecoders not installed!") + +__all__ = ['CTCDecoder'] + + +class CTCDecoderBase(nn.Layer): + def __init__(self, + odim, + enc_n_units, + blank_id=0, + dropout_rate: float=0.0, + reduction: bool=True, + batch_average: bool=True, + grad_norm_type: Union[str, None]=None): + """CTC decoder + + Args: + odim ([int]): text vocabulary size + enc_n_units ([int]): encoder output dimention + dropout_rate (float): dropout rate (0.0 ~ 1.0) + reduction (bool): reduce the CTC loss into a scalar, True for 'sum' or 'none' + batch_average (bool): do batch dim wise average. + grad_norm_type (str): Default, None. one of 'instance', 'batch', 'frame', None. + """ + assert check_argument_types() + super().__init__() + + self.blank_id = blank_id + self.odim = odim + self.dropout = nn.Dropout(dropout_rate) + self.ctc_lo = Linear(enc_n_units, self.odim) + reduction_type = "sum" if reduction else "none" + self.criterion = CTCLoss( + blank=self.blank_id, + reduction=reduction_type, + batch_average=batch_average, + grad_norm_type=grad_norm_type) + + def forward(self, hs_pad, hlens, ys_pad, ys_lens): + """Calculate CTC loss. + + Args: + hs_pad (Tensor): batch of padded hidden state sequences (B, Tmax, D) + hlens (Tensor): batch of lengths of hidden state sequences (B) + ys_pad (Tensor): batch of padded character id sequence tensor (B, Lmax) + ys_lens (Tensor): batch of lengths of character sequence (B) + Returns: + loss (Tensor): ctc loss value, scalar. + """ + logits = self.ctc_lo(self.dropout(hs_pad)) + loss = self.criterion(logits, ys_pad, hlens, ys_lens) + return loss + + def softmax(self, eouts: paddle.Tensor, temperature: float=1.0): + """Get CTC probabilities. 
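+        The CTC output projection is divided by `temperature` before the
+        softmax is applied over the vocabulary axis.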
+ Args: + eouts (FloatTensor): `[B, T, enc_units]` + Returns: + probs (FloatTensor): `[B, T, odim]` + """ + self.probs = F.softmax(self.ctc_lo(eouts) / temperature, axis=2) + return self.probs + + def log_softmax(self, hs_pad: paddle.Tensor, + temperature: float=1.0) -> paddle.Tensor: + """log_softmax of frame activations + Args: + Tensor hs_pad: 3d tensor (B, Tmax, eprojs) + Returns: + paddle.Tensor: log softmax applied 3d tensor (B, Tmax, odim) + """ + return F.log_softmax(self.ctc_lo(hs_pad) / temperature, axis=2) + + def argmax(self, hs_pad: paddle.Tensor) -> paddle.Tensor: + """argmax of frame activations + Args: + paddle.Tensor hs_pad: 3d tensor (B, Tmax, eprojs) + Returns: + paddle.Tensor: argmax applied 2d tensor (B, Tmax) + """ + return paddle.argmax(self.ctc_lo(hs_pad), dim=2) + + def forced_align(self, + ctc_probs: paddle.Tensor, + y: paddle.Tensor, + blank_id=0) -> list: + """ctc forced alignment. + Args: + ctc_probs (paddle.Tensor): hidden state sequence, 2d tensor (T, D) + y (paddle.Tensor): label id sequence tensor, 1d tensor (L) + blank_id (int): blank symbol index + Returns: + paddle.Tensor: best alignment result, (T). + """ + return ctc_utils.forced_align(ctc_probs, y, blank_id) + + +class CTCDecoder(CTCDecoderBase): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + # CTCDecoder LM Score handle + self._ext_scorer = None + self.beam_search_decoder = None + + def _decode_batch_greedy_offline(self, probs_split, vocab_list): + """This function will be deprecated in future. + Decode by best path for a batch of probs matrix input. + :param probs_split: List of 2-D probability matrix, and each consists + of prob vectors for one speech utterancce. + :param probs_split: List of matrix + :param vocab_list: List of tokens in the vocabulary, for decoding. + :type vocab_list: list + :return: List of transcription texts. + :rtype: List of str + """ + results = [] + for i, probs in enumerate(probs_split): + output_transcription = ctc_greedy_decoding( + probs_seq=probs, vocabulary=vocab_list, blank_id=self.blank_id) + results.append(output_transcription) + return results + + def _init_ext_scorer(self, beam_alpha, beam_beta, language_model_path, + vocab_list): + """Initialize the external scorer. + :param beam_alpha: Parameter associated with language model. + :type beam_alpha: float + :param beam_beta: Parameter associated with word count. + :type beam_beta: float + :param language_model_path: Filepath for language model. If it is + empty, the external scorer will be set to + None, and the decoding method will be pure + beam search without scorer. + :type language_model_path: str|None + :param vocab_list: List of tokens in the vocabulary, for decoding. 
+ :type vocab_list: list + """ + # init once + if self._ext_scorer is not None: + return + + if language_model_path != '': + logger.info("begin to initialize the external scorer " + "for decoding") + self._ext_scorer = Scorer(beam_alpha, beam_beta, + language_model_path, vocab_list) + lm_char_based = self._ext_scorer.is_character_based() + lm_max_order = self._ext_scorer.get_max_order() + lm_dict_size = self._ext_scorer.get_dict_size() + logger.info("language model: " + "is_character_based = %d," % lm_char_based + + " max_order = %d," % lm_max_order + " dict_size = %d" % + lm_dict_size) + logger.info("end initializing scorer") + else: + self._ext_scorer = None + logger.info("no language model provided, " + "decoding by pure beam search without scorer.") + + def _decode_batch_beam_search_offline( + self, probs_split, beam_alpha, beam_beta, beam_size, cutoff_prob, + cutoff_top_n, vocab_list, num_processes): + """ + This function will be deprecated in future. + Decode by beam search for a batch of probs matrix input. + :param probs_split: List of 2-D probability matrix, and each consists + of prob vectors for one speech utterancce. + :param probs_split: List of matrix + :param beam_alpha: Parameter associated with language model. + :type beam_alpha: float + :param beam_beta: Parameter associated with word count. + :type beam_beta: float + :param beam_size: Width for Beam search. + :type beam_size: int + :param cutoff_prob: Cutoff probability in pruning, + default 1.0, no pruning. + :type cutoff_prob: float + :param cutoff_top_n: Cutoff number in pruning, only top cutoff_top_n + characters with highest probs in vocabulary will be + used in beam search, default 40. + :type cutoff_top_n: int + :param vocab_list: List of tokens in the vocabulary, for decoding. + :type vocab_list: list + :param num_processes: Number of processes (CPU) for decoder. + :type num_processes: int + :return: List of transcription texts. + :rtype: List of str + """ + if self._ext_scorer is not None: + self._ext_scorer.reset_params(beam_alpha, beam_beta) + + # beam search decode + num_processes = min(num_processes, len(probs_split)) + beam_search_results = ctc_beam_search_decoding_batch( + probs_split=probs_split, + vocabulary=vocab_list, + beam_size=beam_size, + num_processes=num_processes, + ext_scoring_func=self._ext_scorer, + cutoff_prob=cutoff_prob, + cutoff_top_n=cutoff_top_n, + blank_id=self.blank_id) + + results = [result[0][1] for result in beam_search_results] + return results + + def init_decoder(self, batch_size, vocab_list, decoding_method, + lang_model_path, beam_alpha, beam_beta, beam_size, + cutoff_prob, cutoff_top_n, num_processes): + """ + init ctc decoders + Args: + batch_size(int): Batch size for input data + vocab_list (list): List of tokens in the vocabulary, for decoding + decoding_method (str): ctc_beam_search + lang_model_path (str): language model path + beam_alpha (float): beam_alpha + beam_beta (float): beam_beta + beam_size (int): beam_size + cutoff_prob (float): cutoff probability in beam search + cutoff_top_n (int): cutoff_top_n + num_processes (int): num_processes + + Raises: + ValueError: when decoding_method not support. 
+ + Returns: + CTCBeamSearchDecoder + """ + self.batch_size = batch_size + self.vocab_list = vocab_list + self.decoding_method = decoding_method + self.beam_size = beam_size + self.cutoff_prob = cutoff_prob + self.cutoff_top_n = cutoff_top_n + self.num_processes = num_processes + if decoding_method == "ctc_beam_search": + self._init_ext_scorer(beam_alpha, beam_beta, lang_model_path, + vocab_list) + if self.beam_search_decoder is None: + self.beam_search_decoder = self.get_decoder( + vocab_list, batch_size, beam_alpha, beam_beta, beam_size, + num_processes, cutoff_prob, cutoff_top_n) + return self.beam_search_decoder + elif decoding_method == "ctc_greedy": + self._init_ext_scorer(beam_alpha, beam_beta, lang_model_path, + vocab_list) + else: + raise ValueError(f"Not support: {decoding_method}") + + def decode_probs_offline(self, probs, logits_lens, vocab_list, + decoding_method, lang_model_path, beam_alpha, + beam_beta, beam_size, cutoff_prob, cutoff_top_n, + num_processes): + """ + This function will be deprecated in future. + ctc decoding with probs. + Args: + probs (Tensor): activation after softmax + logits_lens (Tensor): audio output lens + vocab_list (list): List of tokens in the vocabulary, for decoding + decoding_method (str): ctc_beam_search + lang_model_path (str): language model path + beam_alpha (float): beam_alpha + beam_beta (float): beam_beta + beam_size (int): beam_size + cutoff_prob (float): cutoff probability in beam search + cutoff_top_n (int): cutoff_top_n + num_processes (int): num_processes + + Raises: + ValueError: when decoding_method not support. + + Returns: + List[str]: transcripts. + """ + logger.warn( + "This function will be deprecated in future: decode_probs_offline") + probs_split = [probs[i, :l, :] for i, l in enumerate(logits_lens)] + if decoding_method == "ctc_greedy": + result_transcripts = self._decode_batch_greedy_offline( + probs_split=probs_split, vocab_list=vocab_list) + elif decoding_method == "ctc_beam_search": + result_transcripts = self._decode_batch_beam_search_offline( + probs_split=probs_split, + beam_alpha=beam_alpha, + beam_beta=beam_beta, + beam_size=beam_size, + cutoff_prob=cutoff_prob, + cutoff_top_n=cutoff_top_n, + vocab_list=vocab_list, + num_processes=num_processes) + else: + raise ValueError(f"Not support: {decoding_method}") + return result_transcripts + + def get_decoder(self, vocab_list, batch_size, beam_alpha, beam_beta, + beam_size, num_processes, cutoff_prob, cutoff_top_n): + """ + init get ctc decoder + Args: + vocab_list (list): List of tokens in the vocabulary, for decoding. + batch_size(int): Batch size for input data + beam_alpha (float): beam_alpha + beam_beta (float): beam_beta + beam_size (int): beam_size + num_processes (int): num_processes + cutoff_prob (float): cutoff probability in beam search + cutoff_top_n (int): cutoff_top_n + + Raises: + ValueError: when decoding_method not support. 
+ + Returns: + CTCBeamSearchDecoder + """ + num_processes = min(num_processes, batch_size) + if self._ext_scorer is not None: + self._ext_scorer.reset_params(beam_alpha, beam_beta) + if self.decoding_method == "ctc_beam_search": + beam_search_decoder = CTCBeamSearchDecoder( + vocab_list, batch_size, beam_size, num_processes, cutoff_prob, + cutoff_top_n, self._ext_scorer, self.blank_id) + else: + raise ValueError(f"Not support: {decoding_method}") + return beam_search_decoder + + def next(self, probs, logits_lens): + """ + Input probs into ctc decoder + Args: + probs (list(list(float))): probs for a batch of data + logits_lens (list(int)): logits lens for a batch of data + Raises: + Exception: when the ctc decoder is not initialized + ValueError: when decoding_method not support. + """ + + if self.beam_search_decoder is None: + raise Exception( + "You need to initialize the beam_search_decoder firstly") + beam_search_decoder = self.beam_search_decoder + + has_value = (logits_lens > 0).tolist() + has_value = [ + "true" if has_value[i] is True else "false" + for i in range(len(has_value)) + ] + probs_split = [ + probs[i, :l, :].tolist() if has_value[i] else probs[i].tolist() + for i, l in enumerate(logits_lens) + ] + if self.decoding_method == "ctc_beam_search": + beam_search_decoder.next(probs_split, has_value) + else: + raise ValueError(f"Not support: {decoding_method}") + + return + + def decode(self): + """ + Get the decoding result + Raises: + Exception: when the ctc decoder is not initialized + ValueError: when decoding_method not support. + Returns: + results_best (list(str)): The best result for a batch of data + results_beam (list(list(str))): The beam search result for a batch of data + """ + if self.beam_search_decoder is None: + raise Exception( + "You need to initialize the beam_search_decoder firstly") + + beam_search_decoder = self.beam_search_decoder + if self.decoding_method == "ctc_beam_search": + batch_beam_results = beam_search_decoder.decode() + batch_beam_results = [[(res[0], res[1]) for res in beam_results] + for beam_results in batch_beam_results] + results_best = [result[0][1] for result in batch_beam_results] + results_beam = [[trans[1] for trans in result] + for result in batch_beam_results] + + else: + raise ValueError(f"Not support: {decoding_method}") + + return results_best, results_beam + + def reset_decoder(self, + batch_size=-1, + beam_size=-1, + num_processes=-1, + cutoff_prob=-1.0, + cutoff_top_n=-1): + if batch_size > 0: + self.batch_size = batch_size + if beam_size > 0: + self.beam_size = beam_size + if num_processes > 0: + self.num_processes = num_processes + if cutoff_prob > 0: + self.cutoff_prob = cutoff_prob + if cutoff_top_n > 0: + self.cutoff_top_n = cutoff_top_n + """ + Reset the decoder state + Args: + batch_size(int): Batch size for input data + beam_size (int): beam_size + num_processes (int): num_processes + cutoff_prob (float): cutoff probability in beam search + cutoff_top_n (int): cutoff_top_n + Raises: + Exception: when the ctc decoder is not initialized + """ + if self.beam_search_decoder is None: + raise Exception( + "You need to initialize the beam_search_decoder firstly") + self.beam_search_decoder.reset_state( + self.batch_size, self.beam_size, self.num_processes, + self.cutoff_prob, self.cutoff_top_n) + + def del_decoder(self): + """ + Delete the decoder + """ + if self.beam_search_decoder is not None: + del self.beam_search_decoder + self.beam_search_decoder = None diff --git a/ernie-sat/paddlespeech/s2t/modules/decoder.py 
b/ernie-sat/paddlespeech/s2t/modules/decoder.py new file mode 100644 index 0000000000000000000000000000000000000000..3a851ec62c35f633ce07fd0b4380d92b31d67b3b --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/modules/decoder.py @@ -0,0 +1,252 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2019 Mobvoi Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from wenet(https://github.com/wenet-e2e/wenet) +"""Decoder definition.""" +from typing import Any +from typing import List +from typing import Optional +from typing import Tuple + +import paddle +from paddle import nn +from typeguard import check_argument_types + +from paddlespeech.s2t.decoders.scorers.scorer_interface import BatchScorerInterface +from paddlespeech.s2t.modules.align import Embedding +from paddlespeech.s2t.modules.align import LayerNorm +from paddlespeech.s2t.modules.align import Linear +from paddlespeech.s2t.modules.attention import MultiHeadedAttention +from paddlespeech.s2t.modules.decoder_layer import DecoderLayer +from paddlespeech.s2t.modules.embedding import PositionalEncoding +from paddlespeech.s2t.modules.mask import make_non_pad_mask +from paddlespeech.s2t.modules.mask import make_xs_mask +from paddlespeech.s2t.modules.mask import subsequent_mask +from paddlespeech.s2t.modules.positionwise_feed_forward import PositionwiseFeedForward +from paddlespeech.s2t.utils.log import Log + +logger = Log(__name__).getlog() + +__all__ = ["TransformerDecoder"] + + +class TransformerDecoder(BatchScorerInterface, nn.Layer): + """Base class of Transfomer decoder module. + Args: + vocab_size: output dim + encoder_output_size: dimension of attention + attention_heads: the number of heads of multi head attention + linear_units: the hidden units number of position-wise feedforward + num_blocks: the number of decoder blocks + dropout_rate: dropout rate + self_attention_dropout_rate: dropout rate for attention + input_layer: input layer type, `embed` + use_output_layer: whether to use output layer + pos_enc_class: PositionalEncoding module + normalize_before: + True: use layer_norm before each sub-block of a layer. + False: use layer_norm after each sub-block of a layer. 
+ concat_after: whether to concat attention layer's input and output + True: x -> x + linear(concat(x, att(x))) + False: x -> x + att(x) + """ + + def __init__( + self, + vocab_size: int, + encoder_output_size: int, + attention_heads: int=4, + linear_units: int=2048, + num_blocks: int=6, + dropout_rate: float=0.1, + positional_dropout_rate: float=0.1, + self_attention_dropout_rate: float=0.0, + src_attention_dropout_rate: float=0.0, + input_layer: str="embed", + use_output_layer: bool=True, + normalize_before: bool=True, + concat_after: bool=False, ): + + assert check_argument_types() + + nn.Layer.__init__(self) + self.selfattention_layer_type = 'selfattn' + attention_dim = encoder_output_size + + if input_layer == "embed": + self.embed = nn.Sequential( + Embedding(vocab_size, attention_dim), + PositionalEncoding(attention_dim, positional_dropout_rate), ) + else: + raise ValueError(f"only 'embed' is supported: {input_layer}") + + self.normalize_before = normalize_before + self.after_norm = LayerNorm(attention_dim, epsilon=1e-12) + self.use_output_layer = use_output_layer + self.output_layer = Linear(attention_dim, vocab_size) + + self.decoders = nn.LayerList([ + DecoderLayer( + size=attention_dim, + self_attn=MultiHeadedAttention(attention_heads, attention_dim, + self_attention_dropout_rate), + src_attn=MultiHeadedAttention(attention_heads, attention_dim, + src_attention_dropout_rate), + feed_forward=PositionwiseFeedForward( + attention_dim, linear_units, dropout_rate), + dropout_rate=dropout_rate, + normalize_before=normalize_before, + concat_after=concat_after, ) for _ in range(num_blocks) + ]) + + def forward( + self, + memory: paddle.Tensor, + memory_mask: paddle.Tensor, + ys_in_pad: paddle.Tensor, + ys_in_lens: paddle.Tensor, ) -> Tuple[paddle.Tensor, paddle.Tensor]: + """Forward decoder. + Args: + memory: encoded memory, float32 (batch, maxlen_in, feat) + memory_mask: encoder memory mask, (batch, 1, maxlen_in) + ys_in_pad: padded input token ids, int64 (batch, maxlen_out) + ys_in_lens: input lengths of this batch (batch) + Returns: + (tuple): tuple containing: + x: decoded token score before softmax (batch, maxlen_out, vocab_size) + if use_output_layer is True, + olens: (batch, ) + """ + tgt = ys_in_pad + # tgt_mask: (B, 1, L) + tgt_mask = (make_non_pad_mask(ys_in_lens).unsqueeze(1)) + # m: (1, L, L) + m = subsequent_mask(tgt_mask.shape[-1]).unsqueeze(0) + # tgt_mask: (B, L, L) + # TODO(Hui Zhang): not support & for tensor + # tgt_mask = tgt_mask & m + tgt_mask = tgt_mask.logical_and(m) + + x, _ = self.embed(tgt) + for layer in self.decoders: + x, tgt_mask, memory, memory_mask = layer(x, tgt_mask, memory, + memory_mask) + if self.normalize_before: + x = self.after_norm(x) + if self.use_output_layer: + x = self.output_layer(x) + + # TODO(Hui Zhang): reduce_sum not support bool type + # olens = tgt_mask.sum(1) + olens = tgt_mask.astype(paddle.int).sum(1) + return x, olens + + def forward_one_step( + self, + memory: paddle.Tensor, + memory_mask: paddle.Tensor, + tgt: paddle.Tensor, + tgt_mask: paddle.Tensor, + cache: Optional[List[paddle.Tensor]]=None, + ) -> Tuple[paddle.Tensor, List[paddle.Tensor]]: + """Forward one step. + This is only used for decoding. 
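+        Only the score for the last position of `tgt` is returned, together
+        with the per-layer hidden states that can be passed back as `cache`
+        at the next step.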
+ Args: + memory: encoded memory, float32 (batch, maxlen_in, feat) + memory_mask: encoded memory mask, (batch, 1, maxlen_in) + tgt: input token ids, int64 (batch, maxlen_out) + tgt_mask: input token mask, (batch, maxlen_out, maxlen_out) + dtype=paddle.bool + cache: cached output list of (batch, max_time_out-1, size) + Returns: + y, cache: NN output value and cache per `self.decoders`. + y.shape` is (batch, token) + """ + x, _ = self.embed(tgt) + new_cache = [] + for i, decoder in enumerate(self.decoders): + if cache is None: + c = None + else: + c = cache[i] + x, tgt_mask, memory, memory_mask = decoder( + x, tgt_mask, memory, memory_mask, cache=c) + new_cache.append(x) + if self.normalize_before: + y = self.after_norm(x[:, -1]) + else: + y = x[:, -1] + if self.use_output_layer: + y = paddle.log_softmax(self.output_layer(y), axis=-1) + return y, new_cache + + # beam search API (see ScorerInterface) + def score(self, ys, state, x): + """Score. + ys: (ylen,) + x: (xlen, n_feat) + """ + ys_mask = subsequent_mask(len(ys)).unsqueeze(0) # (B,L,L) + x_mask = make_xs_mask(x.unsqueeze(0)).unsqueeze(1) # (B,1,T) + if self.selfattention_layer_type != "selfattn": + # TODO(karita): implement cache + logging.warning( + f"{self.selfattention_layer_type} does not support cached decoding." + ) + state = None + logp, state = self.forward_one_step( + x.unsqueeze(0), x_mask, ys.unsqueeze(0), ys_mask, cache=state) + return logp.squeeze(0), state + + # batch beam search API (see BatchScorerInterface) + def batch_score(self, + ys: paddle.Tensor, + states: List[Any], + xs: paddle.Tensor) -> Tuple[paddle.Tensor, List[Any]]: + """Score new token batch (required). + + Args: + ys (paddle.Tensor): paddle.int64 prefix tokens (n_batch, ylen). + states (List[Any]): Scorer states for prefix tokens. + xs (paddle.Tensor): + The encoder feature that generates ys (n_batch, xlen, n_feat). + + Returns: + tuple[paddle.Tensor, List[Any]]: Tuple of + batchfied scores for next token with shape of `(n_batch, n_vocab)` + and next state list for ys. + + """ + # merge states + n_batch = len(ys) + n_layers = len(self.decoders) + if states[0] is None: + batch_state = None + else: + # transpose state of [batch, layer] into [layer, batch] + batch_state = [ + paddle.stack([states[b][i] for b in range(n_batch)]) + for i in range(n_layers) + ] + + # batch decoding + ys_mask = subsequent_mask(ys.size(-1)).unsqueeze(0) # (B,L,L) + xs_mask = make_xs_mask(xs).unsqueeze(1) # (B,1,T) + logp, states = self.forward_one_step( + xs, xs_mask, ys, ys_mask, cache=batch_state) + + # transpose state of [layer, batch] into [batch, layer] + state_list = [[states[i][b] for i in range(n_layers)] + for b in range(n_batch)] + return logp, state_list diff --git a/ernie-sat/paddlespeech/s2t/modules/decoder_layer.py b/ernie-sat/paddlespeech/s2t/modules/decoder_layer.py new file mode 100644 index 0000000000000000000000000000000000000000..b7f8694c12623ce82eb6849bcd9438483f513502 --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/modules/decoder_layer.py @@ -0,0 +1,155 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2019 Mobvoi Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from wenet(https://github.com/wenet-e2e/wenet) +"""Decoder self-attention layer definition.""" +from typing import Optional +from typing import Tuple + +import paddle +from paddle import nn + +from paddlespeech.s2t.modules.align import LayerNorm +from paddlespeech.s2t.modules.align import Linear +from paddlespeech.s2t.utils.log import Log + +logger = Log(__name__).getlog() + +__all__ = ["DecoderLayer"] + + +class DecoderLayer(nn.Layer): + """Single decoder layer module. + Args: + size (int): Input dimension. + self_attn (nn.Layer): Self-attention module instance. + `MultiHeadedAttention` instance can be used as the argument. + src_attn (nn.Layer): Self-attention module instance. + `MultiHeadedAttention` instance can be used as the argument. + feed_forward (nn.Layer): Feed-forward module instance. + `PositionwiseFeedForward` instance can be used as the argument. + dropout_rate (float): Dropout rate. + normalize_before (bool): + True: use layer_norm before each sub-block. + False: to use layer_norm after each sub-block. + concat_after (bool): Whether to concat attention layer's input + and output. + True: x -> x + linear(concat(x, att(x))) + False: x -> x + att(x) + """ + + def __init__( + self, + size: int, + self_attn: nn.Layer, + src_attn: nn.Layer, + feed_forward: nn.Layer, + dropout_rate: float, + normalize_before: bool=True, + concat_after: bool=False, ): + """Construct an DecoderLayer object.""" + super().__init__() + self.size = size + self.self_attn = self_attn + self.src_attn = src_attn + self.feed_forward = feed_forward + self.norm1 = LayerNorm(size, epsilon=1e-12) + self.norm2 = LayerNorm(size, epsilon=1e-12) + self.norm3 = LayerNorm(size, epsilon=1e-12) + self.dropout = nn.Dropout(dropout_rate) + self.normalize_before = normalize_before + self.concat_after = concat_after + self.concat_linear1 = Linear(size + size, size) + self.concat_linear2 = Linear(size + size, size) + + def forward( + self, + tgt: paddle.Tensor, + tgt_mask: paddle.Tensor, + memory: paddle.Tensor, + memory_mask: paddle.Tensor, + cache: Optional[paddle.Tensor]=None + ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor, paddle.Tensor]: + """Compute decoded features. + Args: + tgt (paddle.Tensor): Input tensor (#batch, maxlen_out, size). + tgt_mask (paddle.Tensor): Mask for input tensor + (#batch, maxlen_out). + memory (paddle.Tensor): Encoded memory + (#batch, maxlen_in, size). + memory_mask (paddle.Tensor): Encoded memory mask + (#batch, maxlen_in). + cache (paddle.Tensor): cached tensors. + (#batch, maxlen_out - 1, size). + Returns: + paddle.Tensor: Output tensor (#batch, maxlen_out, size). + paddle.Tensor: Mask for output tensor (#batch, maxlen_out). + paddle.Tensor: Encoded memory (#batch, maxlen_in, size). + paddle.Tensor: Encoded memory mask (#batch, maxlen_in). 
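+        Notes:
+            When `cache` is given, only the query for the last position is
+            computed, and the new frame is concatenated to `cache` along the
+            time axis before returning.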
+ """ + residual = tgt + if self.normalize_before: + tgt = self.norm1(tgt) + + if cache is None: + tgt_q = tgt + tgt_q_mask = tgt_mask + else: + # compute only the last frame query keeping dim: max_time_out -> 1 + assert cache.shape == [ + tgt.shape[0], + tgt.shape[1] - 1, + self.size, + ], f"{cache.shape} == {[tgt.shape[0], tgt.shape[1] - 1, self.size]}" + tgt_q = tgt[:, -1:, :] + residual = residual[:, -1:, :] + # TODO(Hui Zhang): slice not support bool type + # tgt_q_mask = tgt_mask[:, -1:, :] + tgt_q_mask = tgt_mask.cast(paddle.int64)[:, -1:, :].cast( + paddle.bool) + + if self.concat_after: + tgt_concat = paddle.cat( + (tgt_q, self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)), dim=-1) + x = residual + self.concat_linear1(tgt_concat) + else: + x = residual + self.dropout( + self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)) + if not self.normalize_before: + x = self.norm1(x) + + residual = x + if self.normalize_before: + x = self.norm2(x) + if self.concat_after: + x_concat = paddle.cat( + (x, self.src_attn(x, memory, memory, memory_mask)), dim=-1) + x = residual + self.concat_linear2(x_concat) + else: + x = residual + self.dropout( + self.src_attn(x, memory, memory, memory_mask)) + if not self.normalize_before: + x = self.norm2(x) + + residual = x + if self.normalize_before: + x = self.norm3(x) + x = residual + self.dropout(self.feed_forward(x)) + if not self.normalize_before: + x = self.norm3(x) + + if cache is not None: + x = paddle.cat([cache, x], dim=1) + + return x, tgt_mask, memory, memory_mask diff --git a/ernie-sat/paddlespeech/s2t/modules/embedding.py b/ernie-sat/paddlespeech/s2t/modules/embedding.py new file mode 100644 index 0000000000000000000000000000000000000000..5d4e91753b38129a9c2c71d706787af9d14a903d --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/modules/embedding.py @@ -0,0 +1,165 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2019 Mobvoi Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from wenet(https://github.com/wenet-e2e/wenet) +"""Positonal Encoding Module.""" +import math +from typing import Tuple + +import paddle +from paddle import nn + +from paddlespeech.s2t.utils.log import Log + +logger = Log(__name__).getlog() + +__all__ = [ + "PositionalEncodingInterface", "NoPositionalEncoding", "PositionalEncoding", + "RelPositionalEncoding" +] + + +class PositionalEncodingInterface: + def forward(self, x: paddle.Tensor, + offset: int=0) -> Tuple[paddle.Tensor, paddle.Tensor]: + """Compute positional encoding. + Args: + x (paddle.Tensor): Input tensor (batch, time, `*`). + Returns: + paddle.Tensor: Encoded tensor (batch, time, `*`). + paddle.Tensor: Positional embedding tensor (1, time, `*`). 
+ """ + raise NotImplementedError("forward method is not implemented") + + def position_encoding(self, offset: int, size: int) -> paddle.Tensor: + """ For getting encoding in a streaming fashion + Args: + offset (int): start offset + size (int): requried size of position encoding + Returns: + paddle.Tensor: Corresponding position encoding + """ + raise NotImplementedError("position_encoding method is not implemented") + + +class NoPositionalEncoding(nn.Layer, PositionalEncodingInterface): + def __init__(self, + d_model: int, + dropout_rate: float, + max_len: int=5000, + reverse: bool=False): + nn.Layer.__init__(self) + + def forward(self, x: paddle.Tensor, + offset: int=0) -> Tuple[paddle.Tensor, paddle.Tensor]: + return x, None + + def position_encoding(self, offset: int, size: int) -> paddle.Tensor: + return None + + +class PositionalEncoding(nn.Layer, PositionalEncodingInterface): + def __init__(self, + d_model: int, + dropout_rate: float, + max_len: int=5000, + reverse: bool=False): + """Positional encoding. + PE(pos, 2i) = sin(pos/(10000^(2i/dmodel))) + PE(pos, 2i+1) = cos(pos/(10000^(2i/dmodel))) + Args: + d_model (int): embedding dim. + dropout_rate (float): dropout rate. + max_len (int, optional): maximum input length. Defaults to 5000. + reverse (bool, optional): Not used. Defaults to False. + """ + nn.Layer.__init__(self) + self.d_model = d_model + self.max_len = max_len + self.xscale = paddle.to_tensor(math.sqrt(self.d_model)) + self.dropout = nn.Dropout(p=dropout_rate) + self.pe = paddle.zeros([self.max_len, self.d_model]) #[T,D] + + position = paddle.arange( + 0, self.max_len, dtype=paddle.float32).unsqueeze(1) #[T, 1] + div_term = paddle.exp( + paddle.arange(0, self.d_model, 2, dtype=paddle.float32) * + -(math.log(10000.0) / self.d_model)) + + self.pe[:, 0::2] = paddle.sin(position * div_term) + self.pe[:, 1::2] = paddle.cos(position * div_term) + self.pe = self.pe.unsqueeze(0) #[1, T, D] + + def forward(self, x: paddle.Tensor, + offset: int=0) -> Tuple[paddle.Tensor, paddle.Tensor]: + """Add positional encoding. + Args: + x (paddle.Tensor): Input. Its shape is (batch, time, ...) + offset (int): position offset + Returns: + paddle.Tensor: Encoded tensor. Its shape is (batch, time, ...) + paddle.Tensor: for compatibility to RelPositionalEncoding, (batch=1, time, ...) + """ + T = x.shape[1] + assert offset + x.shape[1] < self.max_len + #TODO(Hui Zhang): using T = x.size(1), __getitem__ not support Tensor + pos_emb = self.pe[:, offset:offset + T] + x = x * self.xscale + pos_emb + return self.dropout(x), self.dropout(pos_emb) + + def position_encoding(self, offset: int, size: int) -> paddle.Tensor: + """ For getting encoding in a streaming fashion + Attention!!!!! + we apply dropout only once at the whole utterance level in a none + streaming way, but will call this function several times with + increasing input size in a streaming scenario, so the dropout will + be applied several times. + Args: + offset (int): start offset + size (int): requried size of position encoding + Returns: + paddle.Tensor: Corresponding position encoding + """ + assert offset + size < self.max_len + return self.dropout(self.pe[:, offset:offset + size]) + + +class RelPositionalEncoding(PositionalEncoding): + """Relative positional encoding module. + See : Appendix B in https://arxiv.org/abs/1901.02860 + """ + + def __init__(self, d_model: int, dropout_rate: float, max_len: int=5000): + """ + Args: + d_model (int): Embedding dimension. + dropout_rate (float): Dropout rate. 
+ max_len (int, optional): [Maximum input length.]. Defaults to 5000. + """ + super().__init__(d_model, dropout_rate, max_len, reverse=True) + + def forward(self, x: paddle.Tensor, + offset: int=0) -> Tuple[paddle.Tensor, paddle.Tensor]: + """Compute positional encoding. + Args: + x (paddle.Tensor): Input tensor (batch, time, `*`). + Returns: + paddle.Tensor: Encoded tensor (batch, time, `*`). + paddle.Tensor: Positional embedding tensor (1, time, `*`). + """ + assert offset + x.shape[1] < self.max_len + x = x * self.xscale + #TODO(Hui Zhang): using x.size(1), __getitem__ not support Tensor + pos_emb = self.pe[:, offset:offset + x.shape[1]] + return self.dropout(x), self.dropout(pos_emb) diff --git a/ernie-sat/paddlespeech/s2t/modules/encoder.py b/ernie-sat/paddlespeech/s2t/modules/encoder.py new file mode 100644 index 0000000000000000000000000000000000000000..c843c0e207054b20a5d3850334198ef6bcb6888c --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/modules/encoder.py @@ -0,0 +1,495 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2019 Mobvoi Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from wenet(https://github.com/wenet-e2e/wenet) +"""Encoder definition.""" +from typing import List +from typing import Optional +from typing import Tuple + +import paddle +from paddle import nn +from typeguard import check_argument_types + +from paddlespeech.s2t.modules.activation import get_activation +from paddlespeech.s2t.modules.align import LayerNorm +from paddlespeech.s2t.modules.attention import MultiHeadedAttention +from paddlespeech.s2t.modules.attention import RelPositionMultiHeadedAttention +from paddlespeech.s2t.modules.conformer_convolution import ConvolutionModule +from paddlespeech.s2t.modules.embedding import NoPositionalEncoding +from paddlespeech.s2t.modules.embedding import PositionalEncoding +from paddlespeech.s2t.modules.embedding import RelPositionalEncoding +from paddlespeech.s2t.modules.encoder_layer import ConformerEncoderLayer +from paddlespeech.s2t.modules.encoder_layer import TransformerEncoderLayer +from paddlespeech.s2t.modules.mask import add_optional_chunk_mask +from paddlespeech.s2t.modules.mask import make_non_pad_mask +from paddlespeech.s2t.modules.positionwise_feed_forward import PositionwiseFeedForward +from paddlespeech.s2t.modules.subsampling import Conv2dSubsampling4 +from paddlespeech.s2t.modules.subsampling import Conv2dSubsampling6 +from paddlespeech.s2t.modules.subsampling import Conv2dSubsampling8 +from paddlespeech.s2t.modules.subsampling import LinearNoSubsampling +from paddlespeech.s2t.utils.log import Log + +logger = Log(__name__).getlog() + +__all__ = ["BaseEncoder", 'TransformerEncoder', "ConformerEncoder"] + + +class BaseEncoder(nn.Layer): + def __init__( + self, + input_size: int, + output_size: int=256, + attention_heads: int=4, + linear_units: int=2048, + num_blocks: int=6, + dropout_rate: float=0.1, + positional_dropout_rate: float=0.1, + attention_dropout_rate: float=0.0, + input_layer: 
str="conv2d", + pos_enc_layer_type: str="abs_pos", + normalize_before: bool=True, + concat_after: bool=False, + static_chunk_size: int=0, + use_dynamic_chunk: bool=False, + global_cmvn: paddle.nn.Layer=None, + use_dynamic_left_chunk: bool=False, ): + """ + Args: + input_size (int): input dim, d_feature + output_size (int): dimension of attention, d_model + attention_heads (int): the number of heads of multi head attention + linear_units (int): the hidden units number of position-wise feed + forward + num_blocks (int): the number of encoder blocks + dropout_rate (float): dropout rate + attention_dropout_rate (float): dropout rate in attention + positional_dropout_rate (float): dropout rate after adding + positional encoding + input_layer (str): input layer type. + optional [linear, conv2d, conv2d6, conv2d8] + pos_enc_layer_type (str): Encoder positional encoding layer type. + opitonal [abs_pos, scaled_abs_pos, rel_pos, no_pos] + normalize_before (bool): + True: use layer_norm before each sub-block of a layer. + False: use layer_norm after each sub-block of a layer. + concat_after (bool): whether to concat attention layer's input + and output. + True: x -> x + linear(concat(x, att(x))) + False: x -> x + att(x) + static_chunk_size (int): chunk size for static chunk training and + decoding + use_dynamic_chunk (bool): whether use dynamic chunk size for + training or not, You can only use fixed chunk(chunk_size > 0) + or dyanmic chunk size(use_dynamic_chunk = True) + global_cmvn (Optional[paddle.nn.Layer]): Optional GlobalCMVN layer + use_dynamic_left_chunk (bool): whether use dynamic left chunk in + dynamic chunk training + """ + assert check_argument_types() + super().__init__() + self._output_size = output_size + + if pos_enc_layer_type == "abs_pos": + pos_enc_class = PositionalEncoding + elif pos_enc_layer_type == "rel_pos": + pos_enc_class = RelPositionalEncoding + elif pos_enc_layer_type == "no_pos": + pos_enc_class = NoPositionalEncoding + else: + raise ValueError("unknown pos_enc_layer: " + pos_enc_layer_type) + + if input_layer == "linear": + subsampling_class = LinearNoSubsampling + elif input_layer == "conv2d": + subsampling_class = Conv2dSubsampling4 + elif input_layer == "conv2d6": + subsampling_class = Conv2dSubsampling6 + elif input_layer == "conv2d8": + subsampling_class = Conv2dSubsampling8 + else: + raise ValueError("unknown input_layer: " + input_layer) + + self.global_cmvn = global_cmvn + self.embed = subsampling_class( + idim=input_size, + odim=output_size, + dropout_rate=dropout_rate, + pos_enc_class=pos_enc_class( + d_model=output_size, dropout_rate=positional_dropout_rate), ) + + self.normalize_before = normalize_before + self.after_norm = LayerNorm(output_size, epsilon=1e-12) + self.static_chunk_size = static_chunk_size + self.use_dynamic_chunk = use_dynamic_chunk + self.use_dynamic_left_chunk = use_dynamic_left_chunk + + def output_size(self) -> int: + return self._output_size + + def forward( + self, + xs: paddle.Tensor, + xs_lens: paddle.Tensor, + decoding_chunk_size: int=0, + num_decoding_left_chunks: int=-1, + ) -> Tuple[paddle.Tensor, paddle.Tensor]: + """Embed positions in tensor. + Args: + xs: padded input tensor (B, L, D) + xs_lens: input length (B) + decoding_chunk_size: decoding chunk size for dynamic chunk + 0: default for training, use random dynamic chunk. + <0: for decoding, use full chunk. + >0: for decoding, use fixed chunk size as set. + num_decoding_left_chunks: number of left chunks, this is for decoding, + the chunk size is decoding_chunk_size. 
+ >=0: use num_decoding_left_chunks + <0: use all left chunks + Returns: + encoder output tensor, lens and mask + """ + masks = make_non_pad_mask(xs_lens).unsqueeze(1) # (B, 1, L) + + if self.global_cmvn is not None: + xs = self.global_cmvn(xs) + #TODO(Hui Zhang): self.embed(xs, masks, offset=0), stride_slice not support bool tensor + xs, pos_emb, masks = self.embed(xs, masks.astype(xs.dtype), offset=0) + #TODO(Hui Zhang): remove mask.astype, stride_slice not support bool tensor + masks = masks.astype(paddle.bool) + #TODO(Hui Zhang): mask_pad = ~masks + mask_pad = masks.logical_not() + chunk_masks = add_optional_chunk_mask( + xs, masks, self.use_dynamic_chunk, self.use_dynamic_left_chunk, + decoding_chunk_size, self.static_chunk_size, + num_decoding_left_chunks) + for layer in self.encoders: + xs, chunk_masks, _ = layer(xs, chunk_masks, pos_emb, mask_pad) + if self.normalize_before: + xs = self.after_norm(xs) + # Here we assume the mask is not changed in encoder layers, so just + # return the masks before encoder layers, and the masks will be used + # for cross attention with decoder later + return xs, masks + + def forward_chunk( + self, + xs: paddle.Tensor, + offset: int, + required_cache_size: int, + subsampling_cache: Optional[paddle.Tensor]=None, + elayers_output_cache: Optional[List[paddle.Tensor]]=None, + conformer_cnn_cache: Optional[List[paddle.Tensor]]=None, + ) -> Tuple[paddle.Tensor, paddle.Tensor, List[paddle.Tensor], List[ + paddle.Tensor]]: + """ Forward just one chunk + Args: + xs (paddle.Tensor): chunk input, [B=1, T, D] + offset (int): current offset in encoder output time stamp + required_cache_size (int): cache size required for next chunk + compuation + >=0: actual cache size + <0: means all history cache is required + subsampling_cache (Optional[paddle.Tensor]): subsampling cache + elayers_output_cache (Optional[List[paddle.Tensor]]): + transformer/conformer encoder layers output cache + conformer_cnn_cache (Optional[List[paddle.Tensor]]): conformer + cnn cache + Returns: + paddle.Tensor: output of current input xs + paddle.Tensor: subsampling cache required for next chunk computation + List[paddle.Tensor]: encoder layers output cache required for next + chunk computation + List[paddle.Tensor]: conformer cnn cache + """ + assert xs.shape[0] == 1 # batch size must be one + # tmp_masks is just for interface compatibility + # TODO(Hui Zhang): stride_slice not support bool tensor + # tmp_masks = paddle.ones([1, xs.size(1)], dtype=paddle.bool) + tmp_masks = paddle.ones([1, xs.shape[1]], dtype=paddle.int32) + tmp_masks = tmp_masks.unsqueeze(1) #[B=1, C=1, T] + + if self.global_cmvn is not None: + xs = self.global_cmvn(xs) + + xs, pos_emb, _ = self.embed( + xs, tmp_masks, offset=offset) #xs=(B, T, D), pos_emb=(B=1, T, D) + + if subsampling_cache is not None: + cache_size = subsampling_cache.shape[1] #T + xs = paddle.cat((subsampling_cache, xs), dim=1) + else: + cache_size = 0 + + # only used when using `RelPositionMultiHeadedAttention` + pos_emb = self.embed.position_encoding( + offset=offset - cache_size, size=xs.shape[1]) + + if required_cache_size < 0: + next_cache_start = 0 + elif required_cache_size == 0: + next_cache_start = xs.shape[1] + else: + next_cache_start = xs.shape[1] - required_cache_size + r_subsampling_cache = xs[:, next_cache_start:, :] + + # Real mask for transformer/conformer layers + masks = paddle.ones([1, xs.shape[1]], dtype=paddle.bool) + masks = masks.unsqueeze(1) #[B=1, L'=1, T] + r_elayers_output_cache = [] + r_conformer_cnn_cache = [] + for i, 
layer in enumerate(self.encoders): + attn_cache = None if elayers_output_cache is None else elayers_output_cache[ + i] + cnn_cache = None if conformer_cnn_cache is None else conformer_cnn_cache[ + i] + xs, _, new_cnn_cache = layer( + xs, + masks, + pos_emb, + output_cache=attn_cache, + cnn_cache=cnn_cache) + r_elayers_output_cache.append(xs[:, next_cache_start:, :]) + r_conformer_cnn_cache.append(new_cnn_cache) + if self.normalize_before: + xs = self.after_norm(xs) + + return (xs[:, cache_size:, :], r_subsampling_cache, + r_elayers_output_cache, r_conformer_cnn_cache) + + def forward_chunk_by_chunk( + self, + xs: paddle.Tensor, + decoding_chunk_size: int, + num_decoding_left_chunks: int=-1, + ) -> Tuple[paddle.Tensor, paddle.Tensor]: + """ Forward input chunk by chunk with chunk_size like a streaming + fashion + Here we should pay special attention to computation cache in the + streaming style forward chunk by chunk. Three things should be taken + into account for computation in the current network: + 1. transformer/conformer encoder layers output cache + 2. convolution in conformer + 3. convolution in subsampling + However, we don't implement subsampling cache for: + 1. We can control subsampling module to output the right result by + overlapping input instead of cache left context, even though it + wastes some computation, but subsampling only takes a very + small fraction of computation in the whole model. + 2. Typically, there are several covolution layers with subsampling + in subsampling module, it is tricky and complicated to do cache + with different convolution layers with different subsampling + rate. + 3. Currently, nn.Sequential is used to stack all the convolution + layers in subsampling, we need to rewrite it to make it work + with cache, which is not prefered. + Args: + xs (paddle.Tensor): (1, max_len, dim) + chunk_size (int): decoding chunk size. + num_left_chunks (int): decoding with num left chunks. 
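+        Returns:
+            paddle.Tensor: Encoder output for all processed chunks,
+                concatenated along the time axis (1, T', dim).
+            paddle.Tensor: All-ones mask over the output frames (1, 1, T'),
+                kept only for API compatibility with `forward`.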
+ """ + assert decoding_chunk_size > 0 + # The model is trained by static or dynamic chunk + assert self.static_chunk_size > 0 or self.use_dynamic_chunk + + # feature stride and window for `subsampling` module + subsampling = self.embed.subsampling_rate + context = self.embed.right_context + 1 # Add current frame + stride = subsampling * decoding_chunk_size + decoding_window = (decoding_chunk_size - 1) * subsampling + context + + num_frames = xs.shape[1] + required_cache_size = decoding_chunk_size * num_decoding_left_chunks + subsampling_cache: Optional[paddle.Tensor] = None + elayers_output_cache: Optional[List[paddle.Tensor]] = None + conformer_cnn_cache: Optional[List[paddle.Tensor]] = None + outputs = [] + offset = 0 + # Feed forward overlap input step by step + for cur in range(0, num_frames - context + 1, stride): + end = min(cur + decoding_window, num_frames) + chunk_xs = xs[:, cur:end, :] + (y, subsampling_cache, elayers_output_cache, + conformer_cnn_cache) = self.forward_chunk( + chunk_xs, offset, required_cache_size, subsampling_cache, + elayers_output_cache, conformer_cnn_cache) + outputs.append(y) + offset += y.shape[1] + ys = paddle.cat(outputs, 1) + # fake mask, just for jit script and compatibility with `forward` api + masks = paddle.ones([1, ys.shape[1]], dtype=paddle.bool) + masks = masks.unsqueeze(1) + return ys, masks + + +class TransformerEncoder(BaseEncoder): + """Transformer encoder module.""" + + def __init__( + self, + input_size: int, + output_size: int=256, + attention_heads: int=4, + linear_units: int=2048, + num_blocks: int=6, + dropout_rate: float=0.1, + positional_dropout_rate: float=0.1, + attention_dropout_rate: float=0.0, + input_layer: str="conv2d", + pos_enc_layer_type: str="abs_pos", + normalize_before: bool=True, + concat_after: bool=False, + static_chunk_size: int=0, + use_dynamic_chunk: bool=False, + global_cmvn: nn.Layer=None, + use_dynamic_left_chunk: bool=False, ): + """ Construct TransformerEncoder + See Encoder for the meaning of each parameter. + """ + assert check_argument_types() + super().__init__(input_size, output_size, attention_heads, linear_units, + num_blocks, dropout_rate, positional_dropout_rate, + attention_dropout_rate, input_layer, + pos_enc_layer_type, normalize_before, concat_after, + static_chunk_size, use_dynamic_chunk, global_cmvn, + use_dynamic_left_chunk) + self.encoders = nn.LayerList([ + TransformerEncoderLayer( + size=output_size, + self_attn=MultiHeadedAttention(attention_heads, output_size, + attention_dropout_rate), + feed_forward=PositionwiseFeedForward(output_size, linear_units, + dropout_rate), + dropout_rate=dropout_rate, + normalize_before=normalize_before, + concat_after=concat_after) for _ in range(num_blocks) + ]) + + def forward_one_step( + self, + xs: paddle.Tensor, + masks: paddle.Tensor, + cache=None, ) -> Tuple[paddle.Tensor, paddle.Tensor]: + """Encode input frame. + + Args: + xs (paddle.Tensor): (Prefix) Input tensor. (B, T, D) + masks (paddle.Tensor): Mask tensor. (B, T, T) + cache (List[paddle.Tensor]): List of cache tensors. + + Returns: + paddle.Tensor: Output tensor. + paddle.Tensor: Mask tensor. + List[paddle.Tensor]: List of new cache tensors. 
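+
+        Note:
+            Each returned cache entry is the corresponding encoder layer's
+            output; passing it back as ``cache`` on the next call lets each
+            layer compute attention queries only for the frames appended
+            after the cached prefix.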
+ """ + if self.global_cmvn is not None: + xs = self.global_cmvn(xs) + + #TODO(Hui Zhang): self.embed(xs, masks, offset=0), stride_slice not support bool tensor + xs, pos_emb, masks = self.embed(xs, masks.astype(xs.dtype), offset=0) + #TODO(Hui Zhang): remove mask.astype, stride_slice not support bool tensor + masks = masks.astype(paddle.bool) + + if cache is None: + cache = [None for _ in range(len(self.encoders))] + new_cache = [] + for c, e in zip(cache, self.encoders): + xs, masks, _ = e(xs, masks, output_cache=c) + new_cache.append(xs) + if self.normalize_before: + xs = self.after_norm(xs) + return xs, masks, new_cache + + +class ConformerEncoder(BaseEncoder): + """Conformer encoder module.""" + + def __init__( + self, + input_size: int, + output_size: int=256, + attention_heads: int=4, + linear_units: int=2048, + num_blocks: int=6, + dropout_rate: float=0.1, + positional_dropout_rate: float=0.1, + attention_dropout_rate: float=0.0, + input_layer: str="conv2d", + pos_enc_layer_type: str="rel_pos", + normalize_before: bool=True, + concat_after: bool=False, + static_chunk_size: int=0, + use_dynamic_chunk: bool=False, + global_cmvn: nn.Layer=None, + use_dynamic_left_chunk: bool=False, + positionwise_conv_kernel_size: int=1, + macaron_style: bool=True, + selfattention_layer_type: str="rel_selfattn", + activation_type: str="swish", + use_cnn_module: bool=True, + cnn_module_kernel: int=15, + causal: bool=False, + cnn_module_norm: str="batch_norm", ): + """Construct ConformerEncoder + Args: + input_size to use_dynamic_chunk, see in BaseEncoder + positionwise_conv_kernel_size (int): Kernel size of positionwise + conv1d layer. + macaron_style (bool): Whether to use macaron style for + positionwise layer. + selfattention_layer_type (str): Encoder attention layer type, + the parameter has no effect now, it's just for configure + compatibility. + activation_type (str): Encoder activation function type. + use_cnn_module (bool): Whether to use convolution module. + cnn_module_kernel (int): Kernel size of convolution module. + causal (bool): whether to use causal convolution or not. 
+ cnn_module_norm (str): cnn conv norm type, Optional['batch_norm','layer_norm'] + """ + assert check_argument_types() + + super().__init__(input_size, output_size, attention_heads, linear_units, + num_blocks, dropout_rate, positional_dropout_rate, + attention_dropout_rate, input_layer, + pos_enc_layer_type, normalize_before, concat_after, + static_chunk_size, use_dynamic_chunk, global_cmvn, + use_dynamic_left_chunk) + activation = get_activation(activation_type) + + # self-attention module definition + encoder_selfattn_layer = RelPositionMultiHeadedAttention + encoder_selfattn_layer_args = (attention_heads, output_size, + attention_dropout_rate) + # feed-forward module definition + positionwise_layer = PositionwiseFeedForward + positionwise_layer_args = (output_size, linear_units, dropout_rate, + activation) + # convolution module definition + convolution_layer = ConvolutionModule + convolution_layer_args = (output_size, cnn_module_kernel, activation, + cnn_module_norm, causal) + + self.encoders = nn.LayerList([ + ConformerEncoderLayer( + size=output_size, + self_attn=encoder_selfattn_layer(*encoder_selfattn_layer_args), + feed_forward=positionwise_layer(*positionwise_layer_args), + feed_forward_macaron=positionwise_layer( + *positionwise_layer_args) if macaron_style else None, + conv_module=convolution_layer(*convolution_layer_args) + if use_cnn_module else None, + dropout_rate=dropout_rate, + normalize_before=normalize_before, + concat_after=concat_after) for _ in range(num_blocks) + ]) diff --git a/ernie-sat/paddlespeech/s2t/modules/encoder_layer.py b/ernie-sat/paddlespeech/s2t/modules/encoder_layer.py new file mode 100644 index 0000000000000000000000000000000000000000..e80a298d621ac87db8ad9f76e48041f05ec18f64 --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/modules/encoder_layer.py @@ -0,0 +1,288 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2019 Mobvoi Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from wenet(https://github.com/wenet-e2e/wenet) +"""Encoder self-attention layer definition.""" +from typing import Optional +from typing import Tuple + +import paddle +from paddle import nn + +from paddlespeech.s2t.modules.align import LayerNorm +from paddlespeech.s2t.modules.align import Linear +from paddlespeech.s2t.utils.log import Log + +logger = Log(__name__).getlog() + +__all__ = ["TransformerEncoderLayer", "ConformerEncoderLayer"] + + +class TransformerEncoderLayer(nn.Layer): + """Encoder layer module.""" + + def __init__( + self, + size: int, + self_attn: nn.Layer, + feed_forward: nn.Layer, + dropout_rate: float, + normalize_before: bool=True, + concat_after: bool=False, ): + """Construct an EncoderLayer object. + + Args: + size (int): Input dimension. + self_attn (nn.Layer): Self-attention module instance. + `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` + instance can be used as the argument. + feed_forward (nn.Layer): Feed-forward module instance. 
+ `PositionwiseFeedForward`, instance can be used as the argument. + dropout_rate (float): Dropout rate. + normalize_before (bool): + True: use layer_norm before each sub-block. + False: to use layer_norm after each sub-block. + concat_after (bool): Whether to concat attention layer's input and + output. + True: x -> x + linear(concat(x, att(x))) + False: x -> x + att(x) + """ + super().__init__() + self.self_attn = self_attn + self.feed_forward = feed_forward + self.norm1 = LayerNorm(size, epsilon=1e-12) + self.norm2 = LayerNorm(size, epsilon=1e-12) + self.dropout = nn.Dropout(dropout_rate) + self.size = size + self.normalize_before = normalize_before + self.concat_after = concat_after + # concat_linear may be not used in forward fuction, + # but will be saved in the *.pt + self.concat_linear = Linear(size + size, size) + + def forward( + self, + x: paddle.Tensor, + mask: paddle.Tensor, + pos_emb: Optional[paddle.Tensor]=None, + mask_pad: Optional[paddle.Tensor]=None, + output_cache: Optional[paddle.Tensor]=None, + cnn_cache: Optional[paddle.Tensor]=None, + ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]: + """Compute encoded features. + Args: + x (paddle.Tensor): Input tensor (#batch, time, size). + mask (paddle.Tensor): Mask tensor for the input (#batch, time). + pos_emb (paddle.Tensor): just for interface compatibility + to ConformerEncoderLayer + mask_pad (paddle.Tensor): not used here, it's for interface + compatibility to ConformerEncoderLayer + output_cache (paddle.Tensor): Cache tensor of the output + (#batch, time2, size), time2 < time in x. + cnn_cache (paddle.Tensor): not used here, it's for interface + compatibility to ConformerEncoderLayer + Returns: + paddle.Tensor: Output tensor (#batch, time, size). + paddle.Tensor: Mask tensor (#batch, time). + paddle.Tensor: Fake cnn cache tensor for api compatibility with Conformer (#batch, channels, time'). + """ + residual = x + if self.normalize_before: + x = self.norm1(x) + + if output_cache is None: + x_q = x + else: + assert output_cache.shape[0] == x.shape[0] + assert output_cache.shape[1] < x.shape[1] + assert output_cache.shape[2] == self.size + chunk = x.shape[1] - output_cache.shape[1] + x_q = x[:, -chunk:, :] + residual = residual[:, -chunk:, :] + mask = mask[:, -chunk:, :] + + if self.concat_after: + x_concat = paddle.concat( + (x, self.self_attn(x_q, x, x, mask)), axis=-1) + x = residual + self.concat_linear(x_concat) + else: + x = residual + self.dropout(self.self_attn(x_q, x, x, mask)) + if not self.normalize_before: + x = self.norm1(x) + + residual = x + if self.normalize_before: + x = self.norm2(x) + x = residual + self.dropout(self.feed_forward(x)) + if not self.normalize_before: + x = self.norm2(x) + + if output_cache is not None: + x = paddle.concat([output_cache, x], axis=1) + + fake_cnn_cache = paddle.zeros([1], dtype=x.dtype) + return x, mask, fake_cnn_cache + + +class ConformerEncoderLayer(nn.Layer): + """Encoder layer module.""" + + def __init__( + self, + size: int, + self_attn: nn.Layer, + feed_forward: Optional[nn.Layer]=None, + feed_forward_macaron: Optional[nn.Layer]=None, + conv_module: Optional[nn.Layer]=None, + dropout_rate: float=0.1, + normalize_before: bool=True, + concat_after: bool=False, ): + """Construct an EncoderLayer object. + + Args: + size (int): Input dimension. + self_attn (nn.Layer): Self-attention module instance. + `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` + instance can be used as the argument. + feed_forward (nn.Layer): Feed-forward module instance. 
+ `PositionwiseFeedForward` instance can be used as the argument. + feed_forward_macaron (nn.Layer): Additional feed-forward module + instance. + `PositionwiseFeedForward` instance can be used as the argument. + conv_module (nn.Layer): Convolution module instance. + `ConvlutionModule` instance can be used as the argument. + dropout_rate (float): Dropout rate. + normalize_before (bool): + True: use layer_norm before each sub-block. + False: use layer_norm after each sub-block. + concat_after (bool): Whether to concat attention layer's input and + output. + True: x -> x + linear(concat(x, att(x))) + False: x -> x + att(x) + """ + super().__init__() + self.self_attn = self_attn + self.feed_forward = feed_forward + self.feed_forward_macaron = feed_forward_macaron + self.conv_module = conv_module + self.norm_ff = LayerNorm(size, epsilon=1e-12) # for the FNN module + self.norm_mha = LayerNorm(size, epsilon=1e-12) # for the MHA module + if feed_forward_macaron is not None: + self.norm_ff_macaron = LayerNorm(size, epsilon=1e-12) + self.ff_scale = 0.5 + else: + self.ff_scale = 1.0 + if self.conv_module is not None: + self.norm_conv = LayerNorm( + size, epsilon=1e-12) # for the CNN module + self.norm_final = LayerNorm( + size, epsilon=1e-12) # for the final output of the block + self.dropout = nn.Dropout(dropout_rate) + self.size = size + self.normalize_before = normalize_before + self.concat_after = concat_after + self.concat_linear = Linear(size + size, size) + + def forward( + self, + x: paddle.Tensor, + mask: paddle.Tensor, + pos_emb: paddle.Tensor, + mask_pad: Optional[paddle.Tensor]=None, + output_cache: Optional[paddle.Tensor]=None, + cnn_cache: Optional[paddle.Tensor]=None, + ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]: + """Compute encoded features. + Args: + x (paddle.Tensor): (#batch, time, size) + mask (paddle.Tensor): Mask tensor for the input (#batch, time,time). + pos_emb (paddle.Tensor): positional encoding, must not be None + for ConformerEncoderLayer. + mask_pad (paddle.Tensor): batch padding mask used for conv module, (B, 1, T). + output_cache (paddle.Tensor): Cache tensor of the encoder output + (#batch, time2, size), time2 < time in x. + cnn_cache (paddle.Tensor): Convolution cache in conformer layer + Returns: + paddle.Tensor: Output tensor (#batch, time, size). + paddle.Tensor: Mask tensor (#batch, time). + paddle.Tensor: New cnn cache tensor (#batch, channels, time'). 
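+
+        Note:
+            Sub-layers are applied in the order: macaron-style feed-forward
+            (when enabled), self-attention, convolution module, and the main
+            feed-forward block, followed by a final layer norm when a
+            convolution module is present.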
+ """ + # whether to use macaron style FFN + if self.feed_forward_macaron is not None: + residual = x + if self.normalize_before: + x = self.norm_ff_macaron(x) + x = residual + self.ff_scale * self.dropout( + self.feed_forward_macaron(x)) + if not self.normalize_before: + x = self.norm_ff_macaron(x) + + # multi-headed self-attention module + residual = x + if self.normalize_before: + x = self.norm_mha(x) + + if output_cache is None: + x_q = x + else: + assert output_cache.shape[0] == x.shape[0] + assert output_cache.shape[1] < x.shape[1] + assert output_cache.shape[2] == self.size + chunk = x.shape[1] - output_cache.shape[1] + x_q = x[:, -chunk:, :] + residual = residual[:, -chunk:, :] + mask = mask[:, -chunk:, :] + + x_att = self.self_attn(x_q, x, x, pos_emb, mask) + + if self.concat_after: + x_concat = paddle.concat((x, x_att), axis=-1) + x = residual + self.concat_linear(x_concat) + else: + x = residual + self.dropout(x_att) + + if not self.normalize_before: + x = self.norm_mha(x) + + # convolution module + # Fake new cnn cache here, and then change it in conv_module + new_cnn_cache = paddle.zeros([1], dtype=x.dtype) + if self.conv_module is not None: + residual = x + if self.normalize_before: + x = self.norm_conv(x) + + x, new_cnn_cache = self.conv_module(x, mask_pad, cnn_cache) + x = residual + self.dropout(x) + + if not self.normalize_before: + x = self.norm_conv(x) + + # feed forward module + residual = x + if self.normalize_before: + x = self.norm_ff(x) + + x = residual + self.ff_scale * self.dropout(self.feed_forward(x)) + + if not self.normalize_before: + x = self.norm_ff(x) + + if self.conv_module is not None: + x = self.norm_final(x) + + if output_cache is not None: + x = paddle.concat([output_cache, x], axis=1) + + return x, mask, new_cnn_cache diff --git a/ernie-sat/paddlespeech/s2t/modules/initializer.py b/ernie-sat/paddlespeech/s2t/modules/initializer.py new file mode 100644 index 0000000000000000000000000000000000000000..30a04e44fb2965d03be8c6346ef16448ed257bbc --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/modules/initializer.py @@ -0,0 +1,172 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import numpy as np +from paddle.fluid import framework +from paddle.fluid import unique_name +from paddle.fluid.core import VarDesc +from paddle.fluid.initializer import MSRAInitializer + +__all__ = ['KaimingUniform'] + + +class KaimingUniform(MSRAInitializer): + r"""Implements the Kaiming Uniform initializer + + This class implements the weight initialization from the paper + `Delving Deep into Rectifiers: Surpassing Human-Level Performance on + ImageNet Classification `_ + by Kaiming He, Xiangyu Zhang, Shaoqing Ren and Jian Sun. This is a + robust initialization method that particularly considers the rectifier + nonlinearities. + + In case of Uniform distribution, the range is [-x, x], where + + .. 
math:: + + x = \sqrt{\frac{1.0}{fan\_in}} + + In case of Normal distribution, the mean is 0 and the standard deviation + is + + .. math:: + + \sqrt{\\frac{2.0}{fan\_in}} + + Args: + fan_in (float32|None): fan_in for Kaiming uniform Initializer. If None, it is\ + inferred from the variable. default is None. + + Note: + It is recommended to set fan_in to None for most cases. + + Examples: + .. code-block:: python + + import paddle + import paddle.nn as nn + + linear = nn.Linear(2, + 4, + weight_attr=nn.initializer.KaimingUniform()) + data = paddle.rand([30, 10, 2], dtype='float32') + res = linear(data) + + """ + + def __init__(self, fan_in=None): + super(KaimingUniform, self).__init__( + uniform=True, fan_in=fan_in, seed=0) + + def __call__(self, var, block=None): + """Initialize the input tensor with MSRA initialization. + + Args: + var(Tensor): Tensor that needs to be initialized. + block(Block, optional): The block in which initialization ops + should be added. Used in static graph only, default None. + + Returns: + The initialization op + """ + block = self._check_block(block) + + assert isinstance(var, framework.Variable) + assert isinstance(block, framework.Block) + f_in, f_out = self._compute_fans(var) + + # If fan_in is passed, use it + fan_in = f_in if self._fan_in is None else self._fan_in + + if self._seed == 0: + self._seed = block.program.random_seed + + # to be compatible of fp16 initalizers + if var.dtype == VarDesc.VarType.FP16 or ( + var.dtype == VarDesc.VarType.BF16 and not self._uniform): + out_dtype = VarDesc.VarType.FP32 + out_var = block.create_var( + name=unique_name.generate( + ".".join(['masra_init', var.name, 'tmp'])), + shape=var.shape, + dtype=out_dtype, + type=VarDesc.VarType.LOD_TENSOR, + persistable=False) + else: + out_dtype = var.dtype + out_var = var + + if self._uniform: + limit = np.sqrt(1.0 / float(fan_in)) + op = block.append_op( + type="uniform_random", + inputs={}, + outputs={"Out": out_var}, + attrs={ + "shape": out_var.shape, + "dtype": int(out_dtype), + "min": -limit, + "max": limit, + "seed": self._seed + }, + stop_gradient=True) + + else: + std = np.sqrt(2.0 / float(fan_in)) + op = block.append_op( + type="gaussian_random", + outputs={"Out": out_var}, + attrs={ + "shape": out_var.shape, + "dtype": int(out_dtype), + "mean": 0.0, + "std": std, + "seed": self._seed + }, + stop_gradient=True) + + if var.dtype == VarDesc.VarType.FP16 or ( + var.dtype == VarDesc.VarType.BF16 and not self._uniform): + block.append_op( + type="cast", + inputs={"X": out_var}, + outputs={"Out": var}, + attrs={"in_dtype": out_var.dtype, + "out_dtype": var.dtype}) + + if not framework.in_dygraph_mode(): + var.op = op + return op + + +class DefaultInitializerContext(object): + """ + egs: + with DefaultInitializerContext("kaiming_uniform"): + code for setup_model + """ + + def __init__(self, init_type=None): + self.init_type = init_type + + def __enter__(self): + if self.init_type is None: + return + else: + from paddlespeech.s2t.modules import align + align.global_init_type = self.init_type + return + + def __exit__(self, exc_type, exc_val, exc_tb): + from paddlespeech.s2t.modules import align + align.global_init_type = None diff --git a/ernie-sat/paddlespeech/s2t/modules/loss.py b/ernie-sat/paddlespeech/s2t/modules/loss.py new file mode 100644 index 0000000000000000000000000000000000000000..c7d9bd45dd2bf005a575098456c435a173678d26 --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/modules/loss.py @@ -0,0 +1,185 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+# Copyright 2019 Mobvoi Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from wenet(https://github.com/wenet-e2e/wenet) +import inspect + +import paddle +from paddle import nn +from paddle.nn import functional as F + +from paddlespeech.s2t.utils.log import Log + +logger = Log(__name__).getlog() + +__all__ = ['CTCLoss', "LabelSmoothingLoss"] + + +class CTCLoss(nn.Layer): + def __init__(self, + blank=0, + reduction='sum', + batch_average=False, + grad_norm_type=None): + super().__init__() + # last token id as blank id + self.loss = nn.CTCLoss(blank=blank, reduction=reduction) + self.batch_average = batch_average + + logger.info( + f"CTCLoss Loss reduction: {reduction}, div-bs: {batch_average}") + logger.info(f"CTCLoss Grad Norm Type: {grad_norm_type}") + + assert grad_norm_type in ('instance', 'batch', 'frame', None) + self.norm_by_times = False + self.norm_by_batchsize = False + self.norm_by_total_logits_len = False + if grad_norm_type is None: + # no grad norm + pass + elif grad_norm_type == 'instance': + self.norm_by_times = True + elif grad_norm_type == 'batch': + self.norm_by_batchsize = True + elif grad_norm_type == 'frame': + self.norm_by_total_logits_len = True + else: + raise ValueError(f"CTCLoss Grad Norm no support {grad_norm_type}") + kwargs = { + "norm_by_times": self.norm_by_times, + "norm_by_batchsize": self.norm_by_batchsize, + "norm_by_total_logits_len": self.norm_by_total_logits_len, + } + + # Derive only the args which the func has + try: + param = inspect.signature(self.loss.forward).parameters + except ValueError: + # Some function, e.g. built-in function, are failed + param = {} + self._kwargs = {k: v for k, v in kwargs.items() if k in param} + _notin = {k: v for k, v in kwargs.items() if k not in param} + logger.info(f"{self.loss} kwargs:{self._kwargs}, not support: {_notin}") + + def forward(self, logits, ys_pad, hlens, ys_lens): + """Compute CTC loss. + + Args: + logits ([paddle.Tensor]): [B, Tmax, D] + ys_pad ([paddle.Tensor]): [B, Tmax] + hlens ([paddle.Tensor]): [B] + ys_lens ([paddle.Tensor]): [B] + + Returns: + [paddle.Tensor]: scalar. If reduction is 'none', then (N), where N = \text{batch size}. + """ + B = paddle.shape(logits)[0] + # warp-ctc need logits, and do softmax on logits by itself + # warp-ctc need activation with shape [T, B, V + 1] + # logits: (B, L, D) -> (L, B, D) + logits = logits.transpose([1, 0, 2]) + ys_pad = ys_pad.astype(paddle.int32) + loss = self.loss(logits, ys_pad, hlens, ys_lens, **self._kwargs) + if self.batch_average: + # Batch-size average + loss = loss / B + return loss + + +class LabelSmoothingLoss(nn.Layer): + """Label-smoothing loss. + In a standard CE loss, the label's data distribution is: + [0,1,2] -> + [ + [1.0, 0.0, 0.0], + [0.0, 1.0, 0.0], + [0.0, 0.0, 1.0], + ] + In the smoothing version CE Loss,some probabilities + are taken from the true label prob (1.0) and are divided + among other labels. + e.g. 
+ smoothing=0.1 + [0,1,2] -> + [ + [0.9, 0.05, 0.05], + [0.05, 0.9, 0.05], + [0.05, 0.05, 0.9], + ] + + """ + + def __init__(self, + size: int, + padding_idx: int, + smoothing: float, + normalize_length: bool=False): + """Label-smoothing loss. + + Args: + size (int): the number of class + padding_idx (int): padding class id which will be ignored for loss + smoothing (float): smoothing rate (0.0 means the conventional CE) + normalize_length (bool): + True, normalize loss by sequence length; + False, normalize loss by batch size. + Defaults to False. + """ + super().__init__() + self.size = size + self.padding_idx = padding_idx + self.smoothing = smoothing + self.confidence = 1.0 - smoothing + self.normalize_length = normalize_length + self.criterion = nn.KLDivLoss(reduction="none") + + def forward(self, x: paddle.Tensor, target: paddle.Tensor) -> paddle.Tensor: + """Compute loss between x and target. + The model outputs and data labels tensors are flatten to + (batch*seqlen, class) shape and a mask is applied to the + padding part which should not be calculated for loss. + + Args: + x (paddle.Tensor): prediction (batch, seqlen, class) + target (paddle.Tensor): + target signal masked with self.padding_id (batch, seqlen) + Returns: + loss (paddle.Tensor) : The KL loss, scalar float value + """ + B, T, D = paddle.shape(x) + assert D == self.size + x = x.reshape((-1, self.size)) + target = target.reshape([-1]) + + # use zeros_like instead of torch.no_grad() for true_dist, + # since no_grad() can not be exported by JIT + true_dist = paddle.full_like(x, self.smoothing / (self.size - 1)) + ignore = target == self.padding_idx # (B,) + + #TODO(Hui Zhang): target = target * (1 - ignore) # avoid -1 index + target = target.masked_fill(ignore, 0) # avoid -1 index + # true_dist.scatter_(1, target.unsqueeze(1), self.confidence) + target_mask = F.one_hot(target, self.size) + true_dist *= (1 - target_mask) + true_dist += target_mask * self.confidence + + kl = self.criterion(F.log_softmax(x, axis=1), true_dist) + + #TODO(Hui Zhang): sum not support bool type + #total = len(target) - int(ignore.sum()) + total = len(target) - int(ignore.type_as(target).sum()) + denom = total if self.normalize_length else B + #numer = (kl * (1 - ignore)).sum() + numer = kl.masked_fill(ignore.unsqueeze(1), 0).sum() + return numer / denom diff --git a/ernie-sat/paddlespeech/s2t/modules/mask.py b/ernie-sat/paddlespeech/s2t/modules/mask.py new file mode 100644 index 0000000000000000000000000000000000000000..1f66c015acb4574bddab8276a7c9c4454206997e --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/modules/mask.py @@ -0,0 +1,277 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2019 Mobvoi Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# Modified from wenet(https://github.com/wenet-e2e/wenet) +import paddle + +from paddlespeech.s2t.utils.log import Log + +logger = Log(__name__).getlog() + +__all__ = [ + "make_xs_mask", "make_pad_mask", "make_non_pad_mask", "subsequent_mask", + "subsequent_chunk_mask", "add_optional_chunk_mask", "mask_finished_scores", + "mask_finished_preds" +] + + +def make_xs_mask(xs: paddle.Tensor, pad_value=0.0) -> paddle.Tensor: + """Maks mask tensor containing indices of non-padded part. + Args: + xs (paddle.Tensor): (B, T, D), zeros for pad. + Returns: + paddle.Tensor: Mask Tensor indices of non-padded part. (B, T) + """ + pad_frame = paddle.full([1, 1, xs.shape[-1]], pad_value, dtype=xs.dtype) + mask = xs != pad_frame + mask = mask.all(axis=-1) + return mask + + +def make_pad_mask(lengths: paddle.Tensor) -> paddle.Tensor: + """Make mask tensor containing indices of padded part. + See description of make_non_pad_mask. + Args: + lengths (paddle.Tensor): Batch of lengths (B,). + Returns: + paddle.Tensor: Mask tensor containing indices of padded part. + (B, T) + Examples: + >>> lengths = [5, 3, 2] + >>> make_pad_mask(lengths) + masks = [[0, 0, 0, 0 ,0], + [0, 0, 0, 1, 1], + [0, 0, 1, 1, 1]] + """ + # (TODO: Hui Zhang): jit not support Tensor.dim() and Tensor.ndim + # assert lengths.dim() == 1 + batch_size = int(lengths.shape[0]) + max_len = int(lengths.max()) + seq_range = paddle.arange(0, max_len, dtype=paddle.int64) + seq_range_expand = seq_range.unsqueeze(0).expand([batch_size, max_len]) + seq_length_expand = lengths.unsqueeze(-1) + mask = seq_range_expand >= seq_length_expand + return mask + + +def make_non_pad_mask(lengths: paddle.Tensor) -> paddle.Tensor: + """Make mask tensor containing indices of non-padded part. + The sequences in a batch may have different lengths. To enable + batch computing, padding is need to make all sequence in same + size. To avoid the padding part pass value to context dependent + block such as attention or convolution , this padding part is + masked. + This pad_mask is used in both encoder and decoder. + 1 for non-padded part and 0 for padded part. + Args: + lengths (paddle.Tensor): Batch of lengths (B,). + Returns: + paddle.Tensor: mask tensor containing indices of padded part. + (B, T) + Examples: + >>> lengths = [5, 3, 2] + >>> make_non_pad_mask(lengths) + masks = [[1, 1, 1, 1 ,1], + [1, 1, 1, 0, 0], + [1, 1, 0, 0, 0]] + """ + #return ~make_pad_mask(lengths) + return make_pad_mask(lengths).logical_not() + + +def subsequent_mask(size: int) -> paddle.Tensor: + """Create mask for subsequent steps (size, size). + This mask is used only in decoder which works in an auto-regressive mode. + This means the current step could only do attention with its left steps. + In encoder, fully attention is used when streaming is not necessary and + the sequence is not long. In this case, no attention mask is needed. + When streaming is need, chunk-based attention is used in encoder. See + subsequent_chunk_mask for the chunk-based attention mask. 
+ Args: + size (int): size of mask + Returns: + paddle.Tensor: mask, [size, size] + Examples: + >>> subsequent_mask(3) + [[1, 0, 0], + [1, 1, 0], + [1, 1, 1]] + """ + ret = paddle.ones([size, size], dtype=paddle.bool) + #TODO(Hui Zhang): tril not support bool + #return paddle.tril(ret) + ret = ret.astype(paddle.float) + ret = paddle.tril(ret) + ret = ret.astype(paddle.bool) + return ret + + +def subsequent_chunk_mask( + size: int, + chunk_size: int, + num_left_chunks: int=-1, ) -> paddle.Tensor: + """Create mask for subsequent steps (size, size) with chunk size, + this is for streaming encoder + Args: + size (int): size of mask + chunk_size (int): size of chunk + num_left_chunks (int): number of left chunks + <0: use full chunk + >=0: use num_left_chunks + Returns: + paddle.Tensor: mask, [size, size] + Examples: + >>> subsequent_chunk_mask(4, 2) + [[1, 1, 0, 0], + [1, 1, 0, 0], + [1, 1, 1, 1], + [1, 1, 1, 1]] + """ + ret = paddle.zeros([size, size], dtype=paddle.bool) + for i in range(size): + if num_left_chunks < 0: + start = 0 + else: + start = max(0, (i // chunk_size - num_left_chunks) * chunk_size) + ending = min(size, (i // chunk_size + 1) * chunk_size) + ret[i, start:ending] = True + return ret + + +def add_optional_chunk_mask(xs: paddle.Tensor, + masks: paddle.Tensor, + use_dynamic_chunk: bool, + use_dynamic_left_chunk: bool, + decoding_chunk_size: int, + static_chunk_size: int, + num_decoding_left_chunks: int): + """ Apply optional mask for encoder. + Args: + xs (paddle.Tensor): padded input, (B, L, D), L for max length + mask (paddle.Tensor): mask for xs, (B, 1, L) + use_dynamic_chunk (bool): whether to use dynamic chunk or not + use_dynamic_left_chunk (bool): whether to use dynamic left chunk for + training. + decoding_chunk_size (int): decoding chunk size for dynamic chunk, it's + 0: default for training, use random dynamic chunk. + <0: for decoding, use full chunk. + >0: for decoding, use fixed chunk size as set. + static_chunk_size (int): chunk size for static chunk training/decoding + if it's greater than 0, if use_dynamic_chunk is true, + this parameter will be ignored + num_decoding_left_chunks (int): number of left chunks, this is for decoding, + the chunk size is decoding_chunk_size. + >=0: use num_decoding_left_chunks + <0: use all left chunks + Returns: + paddle.Tensor: chunk mask of the input xs. + """ + # Whether to use chunk mask or not + if use_dynamic_chunk: + max_len = xs.shape[1] + if decoding_chunk_size < 0: + chunk_size = max_len + num_left_chunks = -1 + elif decoding_chunk_size > 0: + chunk_size = decoding_chunk_size + num_left_chunks = num_decoding_left_chunks + else: + # chunk size is either [1, 25] or full context(max_len). + # Since we use 4 times subsampling and allow up to 1s(100 frames) + # delay, the maximum frame is 100 / 4 = 25. 
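+            # e.g. with max_len = 80: a sampled chunk_size of 60 (> 80 // 2)
+            # falls back to the full context, while a sampled chunk_size of
+            # 30 maps to 30 % 25 + 1 = 6 frames.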
+ chunk_size = int(paddle.randint(1, max_len, (1, ))) + num_left_chunks = -1 + if chunk_size > max_len // 2: + chunk_size = max_len + else: + chunk_size = chunk_size % 25 + 1 + if use_dynamic_left_chunk: + max_left_chunks = (max_len - 1) // chunk_size + num_left_chunks = int( + paddle.randint(0, max_left_chunks, (1, ))) + chunk_masks = subsequent_chunk_mask(xs.shape[1], chunk_size, + num_left_chunks) # (L, L) + chunk_masks = chunk_masks.unsqueeze(0) # (1, L, L) + # chunk_masks = masks & chunk_masks # (B, L, L) + chunk_masks = masks.logical_and(chunk_masks) # (B, L, L) + elif static_chunk_size > 0: + num_left_chunks = num_decoding_left_chunks + chunk_masks = subsequent_chunk_mask(xs.shape[1], static_chunk_size, + num_left_chunks) # (L, L) + chunk_masks = chunk_masks.unsqueeze(0) # (1, L, L) + # chunk_masks = masks & chunk_masks # (B, L, L) + chunk_masks = masks.logical_and(chunk_masks) # (B, L, L) + else: + chunk_masks = masks + return chunk_masks + + +def mask_finished_scores(score: paddle.Tensor, + flag: paddle.Tensor) -> paddle.Tensor: + """ + If a sequence is finished, we only allow one alive branch. This function + aims to give one branch a zero score and the rest -inf score. + Args: + score (paddle.Tensor): A real value array with shape + (batch_size * beam_size, beam_size). + flag (paddle.Tensor): A bool array with shape + (batch_size * beam_size, 1). + Returns: + paddle.Tensor: (batch_size * beam_size, beam_size). + Examples: + flag: tensor([[ True], + [False]]) + score: tensor([[-0.3666, -0.6664, 0.6019], + [-1.1490, -0.2948, 0.7460]]) + unfinished: tensor([[False, True, True], + [False, False, False]]) + finished: tensor([[ True, False, False], + [False, False, False]]) + return: tensor([[ 0.0000, -inf, -inf], + [-1.1490, -0.2948, 0.7460]]) + """ + beam_size = score.shape[-1] + zero_mask = paddle.zeros_like(flag, dtype=paddle.bool) + if beam_size > 1: + unfinished = paddle.concat( + (zero_mask, flag.tile([1, beam_size - 1])), axis=1) + finished = paddle.concat( + (flag, zero_mask.tile([1, beam_size - 1])), axis=1) + else: + unfinished = zero_mask + finished = flag + + # infs = paddle.ones_like(score) * -float('inf') + # score = paddle.where(unfinished, infs, score) + # score = paddle.where(finished, paddle.zeros_like(score), score) + score.masked_fill_(unfinished, -float('inf')) + score.masked_fill_(finished, 0) + return score + + +def mask_finished_preds(pred: paddle.Tensor, flag: paddle.Tensor, + eos: int) -> paddle.Tensor: + """ + If a sequence is finished, all of its branch should be + Args: + pred (paddle.Tensor): A int array with shape + (batch_size * beam_size, beam_size). + flag (paddle.Tensor): A bool array with shape + (batch_size * beam_size, 1). + Returns: + paddle.Tensor: (batch_size * beam_size). + """ + beam_size = pred.shape[-1] + finished = flag.repeat(1, beam_size) + return pred.masked_fill_(finished, eos) diff --git a/ernie-sat/paddlespeech/s2t/modules/positionwise_feed_forward.py b/ernie-sat/paddlespeech/s2t/modules/positionwise_feed_forward.py new file mode 100644 index 0000000000000000000000000000000000000000..c2725dc5cc4aac28d04e44333e185082d7300d44 --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/modules/positionwise_feed_forward.py @@ -0,0 +1,60 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2019 Mobvoi Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Modified from wenet(https://github.com/wenet-e2e/wenet)
+"""Positionwise feed forward layer definition."""
+import paddle
+from paddle import nn
+
+from paddlespeech.s2t.modules.align import Linear
+from paddlespeech.s2t.utils.log import Log
+
+logger = Log(__name__).getlog()
+
+__all__ = ["PositionwiseFeedForward"]
+
+
+class PositionwiseFeedForward(nn.Layer):
+    """Positionwise feed forward layer."""
+
+    def __init__(self,
+                 idim: int,
+                 hidden_units: int,
+                 dropout_rate: float,
+                 activation: nn.Layer=nn.ReLU()):
+        """Construct a PositionwiseFeedForward object.
+
+        The feed-forward layer is applied at each position of the sequence;
+        the output dimension is the same as the input dimension.
+
+        Args:
+            idim (int): Input dimension.
+            hidden_units (int): The number of hidden units.
+            dropout_rate (float): Dropout rate.
+            activation (paddle.nn.Layer): Activation function.
+        """
+        super().__init__()
+        self.w_1 = Linear(idim, hidden_units)
+        self.activation = activation
+        self.dropout = nn.Dropout(dropout_rate)
+        self.w_2 = Linear(hidden_units, idim)
+
+    def forward(self, xs: paddle.Tensor) -> paddle.Tensor:
+        """Forward function.
+        Args:
+            xs: input tensor (B, Lmax, D)
+        Returns:
+            output tensor, (B, Lmax, D)
+        """
+        return self.w_2(self.dropout(self.activation(self.w_1(xs))))
diff --git a/ernie-sat/paddlespeech/s2t/modules/subsampling.py b/ernie-sat/paddlespeech/s2t/modules/subsampling.py
new file mode 100644
index 0000000000000000000000000000000000000000..88451ddd77f6f89f8597238ddb1236acaa1945d7
--- /dev/null
+++ b/ernie-sat/paddlespeech/s2t/modules/subsampling.py
@@ -0,0 +1,250 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+# Copyright 2019 Mobvoi Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Modified from wenet(https://github.com/wenet-e2e/wenet) +"""Subsampling layer definition.""" +from typing import Tuple + +import paddle +from paddle import nn + +from paddlespeech.s2t.modules.align import Conv2D +from paddlespeech.s2t.modules.align import LayerNorm +from paddlespeech.s2t.modules.align import Linear +from paddlespeech.s2t.modules.embedding import PositionalEncoding +from paddlespeech.s2t.utils.log import Log + +logger = Log(__name__).getlog() + +__all__ = [ + "LinearNoSubsampling", "Conv2dSubsampling4", "Conv2dSubsampling6", + "Conv2dSubsampling8" +] + + +class BaseSubsampling(nn.Layer): + def __init__(self, pos_enc_class: nn.Layer=PositionalEncoding): + super().__init__() + self.pos_enc = pos_enc_class + # window size = (1 + right_context) + (chunk_size -1) * subsampling_rate + self.right_context = 0 + # stride = subsampling_rate * chunk_size + self.subsampling_rate = 1 + + def position_encoding(self, offset: int, size: int) -> paddle.Tensor: + return self.pos_enc.position_encoding(offset, size) + + +class LinearNoSubsampling(BaseSubsampling): + """Linear transform the input without subsampling.""" + + def __init__(self, + idim: int, + odim: int, + dropout_rate: float, + pos_enc_class: nn.Layer=PositionalEncoding): + """Construct an linear object. + Args: + idim (int): Input dimension. + odim (int): Output dimension. + dropout_rate (float): Dropout rate. + pos_enc_class (PositionalEncoding): position encoding class + """ + super().__init__(pos_enc_class) + self.out = nn.Sequential( + Linear(idim, odim), + LayerNorm(odim, epsilon=1e-12), + nn.Dropout(dropout_rate), + nn.ReLU(), ) + self.right_context = 0 + self.subsampling_rate = 1 + + def forward(self, x: paddle.Tensor, x_mask: paddle.Tensor, offset: int=0 + ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]: + """Input x. + Args: + x (paddle.Tensor): Input tensor (#batch, time, idim). + x_mask (paddle.Tensor): Input mask (#batch, 1, time). + offset (int): position encoding offset. + Returns: + paddle.Tensor: linear input tensor (#batch, time', odim), + where time' = time . + paddle.Tensor: positional encoding + paddle.Tensor: linear input mask (#batch, 1, time'), + where time' = time . + """ + x = self.out(x) + x, pos_emb = self.pos_enc(x, offset) + return x, pos_emb, x_mask + + +class Conv2dSubsampling(BaseSubsampling): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + +class Conv2dSubsampling4(Conv2dSubsampling): + """Convolutional 2D subsampling (to 1/4 length).""" + + def __init__(self, + idim: int, + odim: int, + dropout_rate: float, + pos_enc_class: nn.Layer=PositionalEncoding): + """Construct an Conv2dSubsampling4 object. + + Args: + idim (int): Input dimension. + odim (int): Output dimension. + dropout_rate (float): Dropout rate. + """ + super().__init__(pos_enc_class) + self.conv = nn.Sequential( + Conv2D(1, odim, 3, 2), + nn.ReLU(), + Conv2D(odim, odim, 3, 2), + nn.ReLU(), ) + self.out = nn.Sequential( + Linear(odim * (((idim - 1) // 2 - 1) // 2), odim)) + self.subsampling_rate = 4 + # The right context for every conv layer is computed by: + # (kernel_size - 1) * frame_rate_of_this_layer + # 6 = (3 - 1) * 1 + (3 - 1) * 2 + self.right_context = 6 + + def forward(self, x: paddle.Tensor, x_mask: paddle.Tensor, offset: int=0 + ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]: + """Subsample x. + Args: + x (paddle.Tensor): Input tensor (#batch, time, idim). + x_mask (paddle.Tensor): Input mask (#batch, 1, time). + offset (int): position encoding offset. 
+ Returns: + paddle.Tensor: Subsampled tensor (#batch, time', odim), + where time' = time // 4. + paddle.Tensor: positional encoding + paddle.Tensor: Subsampled mask (#batch, 1, time'), + where time' = time // 4. + """ + x = x.unsqueeze(1) # (b, c=1, t, f) + x = self.conv(x) + b, c, t, f = paddle.shape(x) + x = self.out(x.transpose([0, 2, 1, 3]).reshape([b, t, c * f])) + x, pos_emb = self.pos_enc(x, offset) + return x, pos_emb, x_mask[:, :, :-2:2][:, :, :-2:2] + + +class Conv2dSubsampling6(Conv2dSubsampling): + """Convolutional 2D subsampling (to 1/6 length).""" + + def __init__(self, + idim: int, + odim: int, + dropout_rate: float, + pos_enc_class: nn.Layer=PositionalEncoding): + """Construct an Conv2dSubsampling6 object. + + Args: + idim (int): Input dimension. + odim (int): Output dimension. + dropout_rate (float): Dropout rate. + pos_enc (PositionalEncoding): Custom position encoding layer. + """ + super().__init__(pos_enc_class) + self.conv = nn.Sequential( + Conv2D(1, odim, 3, 2), + nn.ReLU(), + Conv2D(odim, odim, 5, 3), + nn.ReLU(), ) + # O = (I - F + Pstart + Pend) // S + 1 + # when Padding == 0, O = (I - F - S) // S + self.linear = Linear(odim * (((idim - 1) // 2 - 2) // 3), odim) + # The right context for every conv layer is computed by: + # (kernel_size - 1) * frame_rate_of_this_layer + # 10 = (3 - 1) * 1 + (5 - 1) * 2 + self.subsampling_rate = 6 + self.right_context = 10 + + def forward(self, x: paddle.Tensor, x_mask: paddle.Tensor, offset: int=0 + ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]: + """Subsample x. + Args: + x (paddle.Tensor): Input tensor (#batch, time, idim). + x_mask (paddle.Tensor): Input mask (#batch, 1, time). + offset (int): position encoding offset. + Returns: + paddle.Tensor: Subsampled tensor (#batch, time', odim), + where time' = time // 6. + paddle.Tensor: positional encoding + paddle.Tensor: Subsampled mask (#batch, 1, time'), + where time' = time // 6. + """ + x = x.unsqueeze(1) # (b, c, t, f) + x = self.conv(x) + b, c, t, f = paddle.shape(x) + x = self.linear(x.transpose([0, 2, 1, 3]).reshape([b, t, c * f])) + x, pos_emb = self.pos_enc(x, offset) + return x, pos_emb, x_mask[:, :, :-2:2][:, :, :-4:3] + + +class Conv2dSubsampling8(Conv2dSubsampling): + """Convolutional 2D subsampling (to 1/8 length).""" + + def __init__(self, + idim: int, + odim: int, + dropout_rate: float, + pos_enc_class: nn.Layer=PositionalEncoding): + """Construct an Conv2dSubsampling8 object. + + Args: + idim (int): Input dimension. + odim (int): Output dimension. + dropout_rate (float): Dropout rate. + """ + super().__init__(pos_enc_class) + self.conv = nn.Sequential( + Conv2D(1, odim, 3, 2), + nn.ReLU(), + Conv2D(odim, odim, 3, 2), + nn.ReLU(), + Conv2D(odim, odim, 3, 2), + nn.ReLU(), ) + self.linear = Linear(odim * ((((idim - 1) // 2 - 1) // 2 - 1) // 2), + odim) + self.subsampling_rate = 8 + # The right context for every conv layer is computed by: + # (kernel_size - 1) * frame_rate_of_this_layer + # 14 = (3 - 1) * 1 + (3 - 1) * 2 + (3 - 1) * 4 + self.right_context = 14 + + def forward(self, x: paddle.Tensor, x_mask: paddle.Tensor, offset: int=0 + ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]: + """Subsample x. + Args: + x (paddle.Tensor): Input tensor (#batch, time, idim). + x_mask (paddle.Tensor): Input mask (#batch, 1, time). + offset (int): position encoding offset. + Returns: + paddle.Tensor: Subsampled tensor (#batch, time', odim), + where time' = time // 8. 
+            paddle.Tensor: positional encoding
+            paddle.Tensor: Subsampled mask (#batch, 1, time'),
+                where time' = time // 8.
+        """
+        x = x.unsqueeze(1)  # (b, c=1, t, f)
+        x = self.conv(x)
+        b, c, t, f = paddle.shape(x)
+        x = self.linear(x.transpose([0, 2, 1, 3]).reshape([b, t, c * f]))
+        x, pos_emb = self.pos_enc(x, offset)
+        return x, pos_emb, x_mask[:, :, :-2:2][:, :, :-2:2][:, :, :-2:2]
diff --git a/ernie-sat/paddlespeech/s2t/training/__init__.py b/ernie-sat/paddlespeech/s2t/training/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..185a92b8d94d3426d616c0624f0f2ee04339349e
--- /dev/null
+++ b/ernie-sat/paddlespeech/s2t/training/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/ernie-sat/paddlespeech/s2t/training/cli.py b/ernie-sat/paddlespeech/s2t/training/cli.py
new file mode 100644
index 0000000000000000000000000000000000000000..bb85732a6f7f33ee9c5f2f7febabcd7912b78374
--- /dev/null
+++ b/ernie-sat/paddlespeech/s2t/training/cli.py
@@ -0,0 +1,127 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+
+
+class ExtendAction(argparse.Action):
+    """Backport of the "extend" action, which is available in the stdlib
+    since Python 3.8 (https://docs.python.org/3.8/library/argparse.html#action).
+    If only Python 3.8+ has to be supported, defining it here is no longer
+    required; the stdlib "extend" action is used in exactly the same way.
+    """
+
+    def __call__(self, parser, namespace, values, option_string=None):
+        items = getattr(namespace, self.dest) or []
+        items.extend(values)
+        setattr(namespace, self.dest, items)
+
+
+class LoadFromFile(argparse.Action):
+    def __call__(self, parser, namespace, values, option_string=None):
+        with values as f:
+            # parse arguments in the file and store them in the target namespace
+            parser.parse_args(f.read().split(), namespace)
+
+
+def default_argument_parser(parser=None):
+    r"""A simple yet general argument parser for experiments with t2s.
+
+    This is used in the t2s examples and is intended to be reused by other
+    t2s experiments. It requires a minimal set of command line arguments to
+    start a training script.
+
+    The ``--config`` and ``--opts`` are used to overwrite the default
+    configuration.
+
+    The ``--data`` and ``--output`` specify the data path and the output path.
+ Resuming training from existing progress at the output directory is the + intended default behavior. + + The ``--checkpoint_path`` specifies the checkpoint to load from. + + The ``--ngpu`` specifies how to run the training. + + + See Also + -------- + paddlespeech.t2s.training.experiment + Returns + ------- + argparse.ArgumentParser + the parser + """ + if parser is None: + parser = argparse.ArgumentParser() + + parser.register('action', 'extend', ExtendAction) + parser.add_argument( + '--conf', type=open, action=LoadFromFile, help="config file.") + + train_group = parser.add_argument_group( + title='Train Options', description=None) + train_group.add_argument( + "--seed", + type=int, + default=None, + help="seed to use for paddle, np and random. None or 0 for random, else set seed." + ) + train_group.add_argument( + "--ngpu", + type=int, + default=1, + help="number of parallel processes. 0 for cpu.") + train_group.add_argument( + "--config", metavar="CONFIG_FILE", help="config file.") + train_group.add_argument( + "--output", metavar="CKPT_DIR", help="path to save checkpoint.") + train_group.add_argument( + "--checkpoint_path", type=str, help="path to load checkpoint") + train_group.add_argument( + "--opts", + action='extend', + nargs=2, + metavar=('key', 'val'), + help="overwrite --config field, passing (KEY VALUE) pairs") + train_group.add_argument( + "--dump-config", metavar="FILE", help="dump config to `this` file.") + + test_group = parser.add_argument_group( + title='Test Options', description=None) + + test_group.add_argument( + "--decode_cfg", + metavar="DECODE_CONFIG_FILE", + help="decode config file.") + + profile_group = parser.add_argument_group( + title='Benchmark Options', description=None) + profile_group.add_argument( + '--profiler-options', + type=str, + default=None, + help='The option of profiler, which should be in format \"key1=value1;key2=value2;key3=value3\".' + ) + profile_group.add_argument( + '--benchmark-batch-size', + type=int, + default=None, + help='batch size for benchmark.') + profile_group.add_argument( + '--benchmark-max-step', + type=int, + default=None, + help='max iteration for benchmark.') + + return parser diff --git a/ernie-sat/paddlespeech/s2t/training/extensions/__init__.py b/ernie-sat/paddlespeech/s2t/training/extensions/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..6ad04155931b1071c6fe746c3befaf07bda91051 --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/training/extensions/__init__.py @@ -0,0 +1,41 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Callable + +from .extension import Extension + + +def make_extension(trigger: Callable=None, + default_name: str=None, + priority: int=None, + finalizer: Callable=None, + initializer: Callable=None, + on_error: Callable=None): + """Make an Extension-like object by injecting required attributes to it. 
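+
+    A minimal usage sketch (the decorated function below is illustrative):
+
+        @make_extension(trigger=(1, 'iteration'))
+        def print_progress(trainer):
+            print("one more iteration finished")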
+ """ + if trigger is None: + trigger = Extension.trigger + if priority is None: + priority = Extension.priority + + def decorator(ext): + ext.trigger = trigger + ext.default_name = default_name or ext.__name__ + ext.priority = priority + ext.finalize = finalizer + ext.on_error = on_error + ext.initialize = initializer + return ext + + return decorator diff --git a/ernie-sat/paddlespeech/s2t/training/extensions/evaluator.py b/ernie-sat/paddlespeech/s2t/training/extensions/evaluator.py new file mode 100644 index 0000000000000000000000000000000000000000..b96a4818d18769452ecf92a7f7a640fddfdd8fe1 --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/training/extensions/evaluator.py @@ -0,0 +1,102 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from chainer(https://github.com/chainer/chainer) +from typing import Dict + +import paddle +from paddle import distributed as dist +from paddle.io import DataLoader +from paddle.nn import Layer + +from . import extension +from ..reporter import DictSummary +from ..reporter import ObsScope +from ..reporter import report +from ..timer import Timer +from paddlespeech.s2t.utils.log import Log +logger = Log(__name__).getlog() + + +class StandardEvaluator(extension.Extension): + + trigger = (1, 'epoch') + default_name = 'validation' + priority = extension.PRIORITY_WRITER + + name = None + + def __init__(self, model: Layer, dataloader: DataLoader): + # it is designed to hold multiple models + models = {"main": model} + self.models: Dict[str, Layer] = models + self.model = model + + # dataloaders + self.dataloader = dataloader + + def evaluate_core(self, batch): + # compute + self.model(batch) # you may report here + return + + def evaluate_sync(self, data): + # dist sync `evaluate_core` outputs + if data is None: + return + + numerator, denominator = data + if dist.get_world_size() > 1: + numerator = paddle.to_tensor(numerator) + denominator = paddle.to_tensor(denominator) + # the default operator in all_reduce function is sum. + dist.all_reduce(numerator) + dist.all_reduce(denominator) + value = numerator / denominator + value = float(value) + else: + value = numerator / denominator + # used for `snapshort` to do kbest save. + report("VALID/LOSS", value) + logger.info(f"Valid: all-reduce loss {value}") + + def evaluate(self): + # switch to eval mode + for model in self.models.values(): + model.eval() + + # to average evaluation metrics + summary = DictSummary() + for batch in self.dataloader: + observation = {} + with ObsScope(observation): + # main evaluation computation here. 
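+                # values reported via report() inside this ObsScope are
+                # collected into `observation` and averaged across batches
+                # by DictSummary below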
+ with paddle.no_grad(): + self.evaluate_sync(self.evaluate_core(batch)) + summary.add(observation) + summary = summary.compute_mean() + + # switch to train mode + for model in self.models.values(): + model.train() + return summary + + def __call__(self, trainer=None): + # evaluate and report the averaged metric to current observation + # if it is used to extend a trainer, the metrics is reported to + # to observation of the trainer + # or otherwise, you can use your own observation + with Timer("Eval Time Cost: {}"): + summary = self.evaluate() + for k, v in summary.items(): + report(k, v) diff --git a/ernie-sat/paddlespeech/s2t/training/extensions/extension.py b/ernie-sat/paddlespeech/s2t/training/extensions/extension.py new file mode 100644 index 0000000000000000000000000000000000000000..7493213a05c62e06005ea5aefa6894dcde008a8e --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/training/extensions/extension.py @@ -0,0 +1,53 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from chainer(https://github.com/chainer/chainer) +PRIORITY_WRITER = 300 +PRIORITY_EDITOR = 200 +PRIORITY_READER = 100 + + +class Extension(): + """Extension to customize the behavior of Trainer.""" + trigger = (1, 'iteration') + priority = PRIORITY_READER + name = None + + @property + def default_name(self): + """Default name of the extension, class name by default.""" + return type(self).__name__ + + def __call__(self, trainer): + """Main action of the extention. After each update, it is executed + when the trigger fires.""" + raise NotImplementedError( + 'Extension implementation must override __call__.') + + def initialize(self, trainer): + """Action that is executed once to get the corect trainer state. + It is called before training normally, but if the trainer restores + states with an Snapshot extension, this method should also be called. + """ + pass + + def on_error(self, trainer, exc, tb): + """Handles the error raised during training before finalization. + """ + pass + + def finalize(self, trainer): + """Action that is executed when training is done. + For example, visualizers would need to be closed. + """ + pass diff --git a/ernie-sat/paddlespeech/s2t/training/extensions/plot.py b/ernie-sat/paddlespeech/s2t/training/extensions/plot.py new file mode 100644 index 0000000000000000000000000000000000000000..7782b95b9d17b1530d7682e69b82f5882bb4d5d1 --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/training/extensions/plot.py @@ -0,0 +1,419 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
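For comparison, a hedged sketch of the subclassing route: a custom extension derives from `Extension`, declares its trigger and priority, and does its work in `__call__`, exactly as `StandardEvaluator` does above. The `report` helper is the one defined in the reporter module added later in this patch.

```python
from paddlespeech.s2t.training.reporter import report

class LogIteration(extension.Extension):
    trigger = (10, 'iteration')            # fire every 10 iterations
    priority = extension.PRIORITY_READER

    def __call__(self, trainer):
        report("train/iteration", trainer.updater.state.iteration)
```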
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from chainer(https://github.com/chainer/chainer) +import copy +import os + +import numpy as np + +from . import extension + + +class PlotAttentionReport(extension.Extension): + """Plot attention reporter. + + Args: + att_vis_fn (espnet.nets.*_backend.e2e_asr.E2E.calculate_all_attentions): + Function of attention visualization. + data (list[tuple(str, dict[str, list[Any]])]): List json utt key items. + outdir (str): Directory to save figures. + converter (espnet.asr.*_backend.asr.CustomConverter): + Function to convert data. + device (int | torch.device): Device. + reverse (bool): If True, input and output length are reversed. + ikey (str): Key to access input + (for ASR/ST ikey="input", for MT ikey="output".) + iaxis (int): Dimension to access input + (for ASR/ST iaxis=0, for MT iaxis=1.) + okey (str): Key to access output + (for ASR/ST okey="input", MT okay="output".) + oaxis (int): Dimension to access output + (for ASR/ST oaxis=0, for MT oaxis=0.) + subsampling_factor (int): subsampling factor in encoder + + """ + + def __init__( + self, + att_vis_fn, + data, + outdir, + converter, + transform, + device, + reverse=False, + ikey="input", + iaxis=0, + okey="output", + oaxis=0, + subsampling_factor=1, ): + self.att_vis_fn = att_vis_fn + self.data = copy.deepcopy(data) + self.data_dict = {k: v for k, v in copy.deepcopy(data)} + # key is utterance ID + self.outdir = outdir + self.converter = converter + self.transform = transform + self.device = device + self.reverse = reverse + self.ikey = ikey + self.iaxis = iaxis + self.okey = okey + self.oaxis = oaxis + self.factor = subsampling_factor + if not os.path.exists(self.outdir): + os.makedirs(self.outdir) + + def __call__(self, trainer): + """Plot and save image file of att_ws matrix.""" + att_ws, uttid_list = self.get_attention_weights() + if isinstance(att_ws, list): # multi-encoder case + num_encs = len(att_ws) - 1 + # atts + for i in range(num_encs): + for idx, att_w in enumerate(att_ws[i]): + filename = "%s/%s.ep.{.updater.epoch}.att%d.png" % ( + self.outdir, uttid_list[idx], i + 1, ) + att_w = self.trim_attention_weight(uttid_list[idx], att_w) + np_filename = "%s/%s.ep.{.updater.epoch}.att%d.npy" % ( + self.outdir, uttid_list[idx], i + 1, ) + np.save(np_filename.format(trainer), att_w) + self._plot_and_save_attention(att_w, + filename.format(trainer)) + # han + for idx, att_w in enumerate(att_ws[num_encs]): + filename = "%s/%s.ep.{.updater.epoch}.han.png" % ( + self.outdir, uttid_list[idx], ) + att_w = self.trim_attention_weight(uttid_list[idx], att_w) + np_filename = "%s/%s.ep.{.updater.epoch}.han.npy" % ( + self.outdir, uttid_list[idx], ) + np.save(np_filename.format(trainer), att_w) + self._plot_and_save_attention( + att_w, filename.format(trainer), han_mode=True) + else: + for idx, att_w in enumerate(att_ws): + filename = "%s/%s.ep.{.updater.epoch}.png" % (self.outdir, + uttid_list[idx], ) + att_w = self.trim_attention_weight(uttid_list[idx], att_w) + np_filename = "%s/%s.ep.{.updater.epoch}.npy" % ( + self.outdir, uttid_list[idx], ) + np.save(np_filename.format(trainer), att_w) + 
self._plot_and_save_attention(att_w, filename.format(trainer)) + + def log_attentions(self, logger, step): + """Add image files of att_ws matrix to the tensorboard.""" + att_ws, uttid_list = self.get_attention_weights() + if isinstance(att_ws, list): # multi-encoder case + num_encs = len(att_ws) - 1 + # atts + for i in range(num_encs): + for idx, att_w in enumerate(att_ws[i]): + att_w = self.trim_attention_weight(uttid_list[idx], att_w) + plot = self.draw_attention_plot(att_w) + logger.add_figure( + "%s_att%d" % (uttid_list[idx], i + 1), + plot.gcf(), + step, ) + # han + for idx, att_w in enumerate(att_ws[num_encs]): + att_w = self.trim_attention_weight(uttid_list[idx], att_w) + plot = self.draw_han_plot(att_w) + logger.add_figure( + "%s_han" % (uttid_list[idx]), + plot.gcf(), + step, ) + else: + for idx, att_w in enumerate(att_ws): + att_w = self.trim_attention_weight(uttid_list[idx], att_w) + plot = self.draw_attention_plot(att_w) + logger.add_figure("%s" % (uttid_list[idx]), plot.gcf(), step) + + def get_attention_weights(self): + """Return attention weights. + + Returns: + numpy.ndarray: attention weights. float. Its shape would be + differ from backend. + * pytorch-> 1) multi-head case => (B, H, Lmax, Tmax), 2) + other case => (B, Lmax, Tmax). + * chainer-> (B, Lmax, Tmax) + + """ + return_batch, uttid_list = self.transform(self.data, return_uttid=True) + batch = self.converter([return_batch], self.device) + if isinstance(batch, tuple): + att_ws = self.att_vis_fn(*batch) + else: + att_ws = self.att_vis_fn(**batch) + return att_ws, uttid_list + + def trim_attention_weight(self, uttid, att_w): + """Transform attention matrix with regard to self.reverse.""" + if self.reverse: + enc_key, enc_axis = self.okey, self.oaxis + dec_key, dec_axis = self.ikey, self.iaxis + else: + enc_key, enc_axis = self.ikey, self.iaxis + dec_key, dec_axis = self.okey, self.oaxis + dec_len = int(self.data_dict[uttid][dec_key][dec_axis]["shape"][0]) + enc_len = int(self.data_dict[uttid][enc_key][enc_axis]["shape"][0]) + if self.factor > 1: + enc_len //= self.factor + if len(att_w.shape) == 3: + att_w = att_w[:, :dec_len, :enc_len] + else: + att_w = att_w[:dec_len, :enc_len] + return att_w + + def draw_attention_plot(self, att_w): + """Plot the att_w matrix. + + Returns: + matplotlib.pyplot: pyplot object with attention matrix image. + + """ + import matplotlib + + matplotlib.use("Agg") + import matplotlib.pyplot as plt + + plt.clf() + att_w = att_w.astype(np.float32) + if len(att_w.shape) == 3: + for h, aw in enumerate(att_w, 1): + plt.subplot(1, len(att_w), h) + plt.imshow(aw, aspect="auto") + plt.xlabel("Encoder Index") + plt.ylabel("Decoder Index") + else: + plt.imshow(att_w, aspect="auto") + plt.xlabel("Encoder Index") + plt.ylabel("Decoder Index") + plt.tight_layout() + return plt + + def draw_han_plot(self, att_w): + """Plot the att_w matrix for hierarchical attention. + + Returns: + matplotlib.pyplot: pyplot object with attention matrix image. 
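The file-name templates above rely on `str.format` attribute access: the literal `{.updater.epoch}` placeholder is filled by formatting with the trainer object, which pulls `trainer.updater.epoch` into the path. A toy illustration with stand-in objects:

```python
class _Updater:            # stand-in for trainer.updater
    epoch = 3

class _Trainer:            # stand-in for the trainer passed to __call__
    updater = _Updater()

template = "out/utt1.ep.{.updater.epoch}.png"
print(template.format(_Trainer()))    # -> out/utt1.ep.3.png
```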
+ + """ + import matplotlib + + matplotlib.use("Agg") + import matplotlib.pyplot as plt + + plt.clf() + if len(att_w.shape) == 3: + for h, aw in enumerate(att_w, 1): + legends = [] + plt.subplot(1, len(att_w), h) + for i in range(aw.shape[1]): + plt.plot(aw[:, i]) + legends.append("Att{}".format(i)) + plt.ylim([0, 1.0]) + plt.xlim([0, aw.shape[0]]) + plt.grid(True) + plt.ylabel("Attention Weight") + plt.xlabel("Decoder Index") + plt.legend(legends) + else: + legends = [] + for i in range(att_w.shape[1]): + plt.plot(att_w[:, i]) + legends.append("Att{}".format(i)) + plt.ylim([0, 1.0]) + plt.xlim([0, att_w.shape[0]]) + plt.grid(True) + plt.ylabel("Attention Weight") + plt.xlabel("Decoder Index") + plt.legend(legends) + plt.tight_layout() + return plt + + def _plot_and_save_attention(self, att_w, filename, han_mode=False): + if han_mode: + plt = self.draw_han_plot(att_w) + else: + plt = self.draw_attention_plot(att_w) + plt.savefig(filename) + plt.close() + + +class PlotCTCReport(extension.Extension): + """Plot CTC reporter. + + Args: + ctc_vis_fn (espnet.nets.*_backend.e2e_asr.E2E.calculate_all_ctc_probs): + Function of CTC visualization. + data (list[tuple(str, dict[str, list[Any]])]): List json utt key items. + outdir (str): Directory to save figures. + converter (espnet.asr.*_backend.asr.CustomConverter): + Function to convert data. + device (int | torch.device): Device. + reverse (bool): If True, input and output length are reversed. + ikey (str): Key to access input + (for ASR/ST ikey="input", for MT ikey="output".) + iaxis (int): Dimension to access input + (for ASR/ST iaxis=0, for MT iaxis=1.) + okey (str): Key to access output + (for ASR/ST okey="input", MT okay="output".) + oaxis (int): Dimension to access output + (for ASR/ST oaxis=0, for MT oaxis=0.) 
+ subsampling_factor (int): subsampling factor in encoder + + """ + + def __init__( + self, + ctc_vis_fn, + data, + outdir, + converter, + transform, + device, + reverse=False, + ikey="input", + iaxis=0, + okey="output", + oaxis=0, + subsampling_factor=1, ): + self.ctc_vis_fn = ctc_vis_fn + self.data = copy.deepcopy(data) + self.data_dict = {k: v for k, v in copy.deepcopy(data)} + # key is utterance ID + self.outdir = outdir + self.converter = converter + self.transform = transform + self.device = device + self.reverse = reverse + self.ikey = ikey + self.iaxis = iaxis + self.okey = okey + self.oaxis = oaxis + self.factor = subsampling_factor + if not os.path.exists(self.outdir): + os.makedirs(self.outdir) + + def __call__(self, trainer): + """Plot and save image file of ctc prob.""" + ctc_probs, uttid_list = self.get_ctc_probs() + if isinstance(ctc_probs, list): # multi-encoder case + num_encs = len(ctc_probs) - 1 + for i in range(num_encs): + for idx, ctc_prob in enumerate(ctc_probs[i]): + filename = "%s/%s.ep.{.updater.epoch}.ctc%d.png" % ( + self.outdir, uttid_list[idx], i + 1, ) + ctc_prob = self.trim_ctc_prob(uttid_list[idx], ctc_prob) + np_filename = "%s/%s.ep.{.updater.epoch}.ctc%d.npy" % ( + self.outdir, uttid_list[idx], i + 1, ) + np.save(np_filename.format(trainer), ctc_prob) + self._plot_and_save_ctc(ctc_prob, filename.format(trainer)) + else: + for idx, ctc_prob in enumerate(ctc_probs): + filename = "%s/%s.ep.{.updater.epoch}.png" % (self.outdir, + uttid_list[idx], ) + ctc_prob = self.trim_ctc_prob(uttid_list[idx], ctc_prob) + np_filename = "%s/%s.ep.{.updater.epoch}.npy" % ( + self.outdir, uttid_list[idx], ) + np.save(np_filename.format(trainer), ctc_prob) + self._plot_and_save_ctc(ctc_prob, filename.format(trainer)) + + def log_ctc_probs(self, logger, step): + """Add image files of ctc probs to the tensorboard.""" + ctc_probs, uttid_list = self.get_ctc_probs() + if isinstance(ctc_probs, list): # multi-encoder case + num_encs = len(ctc_probs) - 1 + for i in range(num_encs): + for idx, ctc_prob in enumerate(ctc_probs[i]): + ctc_prob = self.trim_ctc_prob(uttid_list[idx], ctc_prob) + plot = self.draw_ctc_plot(ctc_prob) + logger.add_figure( + "%s_ctc%d" % (uttid_list[idx], i + 1), + plot.gcf(), + step, ) + else: + for idx, ctc_prob in enumerate(ctc_probs): + ctc_prob = self.trim_ctc_prob(uttid_list[idx], ctc_prob) + plot = self.draw_ctc_plot(ctc_prob) + logger.add_figure("%s" % (uttid_list[idx]), plot.gcf(), step) + + def get_ctc_probs(self): + """Return CTC probs. + + Returns: + numpy.ndarray: CTC probs. float. Its shape would be + differ from backend. (B, Tmax, vocab). + + """ + return_batch, uttid_list = self.transform(self.data, return_uttid=True) + batch = self.converter([return_batch], self.device) + if isinstance(batch, tuple): + probs = self.ctc_vis_fn(*batch) + else: + probs = self.ctc_vis_fn(**batch) + return probs, uttid_list + + def trim_ctc_prob(self, uttid, prob): + """Trim CTC posteriors accoding to input lengths.""" + enc_len = int(self.data_dict[uttid][self.ikey][self.iaxis]["shape"][0]) + if self.factor > 1: + enc_len //= self.factor + prob = prob[:enc_len] + return prob + + def draw_ctc_plot(self, ctc_prob): + """Plot the ctc_prob matrix. + + Returns: + matplotlib.pyplot: pyplot object with CTC prob matrix image. 
+ + """ + import matplotlib + + matplotlib.use("Agg") + import matplotlib.pyplot as plt + + ctc_prob = ctc_prob.astype(np.float32) + + plt.clf() + topk_ids = np.argsort(ctc_prob, axis=1) + n_frames, vocab = ctc_prob.shape + times_probs = np.arange(n_frames) + + plt.figure(figsize=(20, 8)) + + # NOTE: index 0 is reserved for blank + for idx in set(topk_ids.reshape(-1).tolist()): + if idx == 0: + plt.plot( + times_probs, + ctc_prob[:, 0], + ":", + label="", + color="grey") + else: + plt.plot(times_probs, ctc_prob[:, idx]) + plt.xlabel(u"Input [frame]", fontsize=12) + plt.ylabel("Posteriors", fontsize=12) + plt.xticks(list(range(0, int(n_frames) + 1, 10))) + plt.yticks(list(range(0, 2, 1))) + plt.tight_layout() + return plt + + def _plot_and_save_ctc(self, ctc_prob, filename): + plt = self.draw_ctc_plot(ctc_prob) + plt.savefig(filename) + plt.close() diff --git a/ernie-sat/paddlespeech/s2t/training/extensions/snapshot.py b/ernie-sat/paddlespeech/s2t/training/extensions/snapshot.py new file mode 100644 index 0000000000000000000000000000000000000000..426bf72cdab9a81e5320a563fba6c6ce228435cd --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/training/extensions/snapshot.py @@ -0,0 +1,134 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from chainer(https://github.com/chainer/chainer) +import os +from datetime import datetime +from pathlib import Path +from typing import Any +from typing import Dict +from typing import List + +import jsonlines + +from . import extension +from ..reporter import get_observations +from ..updaters.trainer import Trainer +from paddlespeech.s2t.utils.log import Log +from paddlespeech.s2t.utils.mp_tools import rank_zero_only + +logger = Log(__name__).getlog() + + +def load_records(records_fp): + """Load record files (json lines.)""" + with jsonlines.open(records_fp, 'r') as reader: + records = list(reader) + return records + + +class Snapshot(extension.Extension): + """An extension to make snapshot of the updater object inside + the trainer. It is done by calling the updater's `save` method. + An Updater save its state_dict by default, which contains the + updater state, (i.e. epoch and iteration) and all the model + parameters and optimizer states. If the updater inside the trainer + subclasses StandardUpdater, everything is good to go. + Parameters + ---------- + checkpoint_dir : Union[str, Path] + The directory to save checkpoints into. 
+ """ + + trigger = (1, 'epoch') + priority = -100 + default_name = "snapshot" + + def __init__(self, + mode='latest', + max_size: int=5, + indicator=None, + less_better=True, + snapshot_on_error: bool=False): + self.records: List[Dict[str, Any]] = [] + assert mode in ('latest', 'kbest'), mode + if mode == 'kbest': + assert indicator is not None + self.mode = mode + self.indicator = indicator + self.less_is_better = less_better + self.max_size = max_size + self._snapshot_on_error = snapshot_on_error + self._save_all = (max_size == -1) + self.checkpoint_dir = None + + def initialize(self, trainer: Trainer): + """Setting up this extention.""" + self.checkpoint_dir = trainer.out / "checkpoints" + + # load existing records + record_path: Path = self.checkpoint_dir / "records.jsonl" + if record_path.exists(): + self.records = load_records(record_path) + ckpt_path = self.records[-1]['path'] + logger.info(f"Loading from an existing checkpoint {ckpt_path}") + trainer.updater.load(ckpt_path) + + def on_error(self, trainer, exc, tb): + if self._snapshot_on_error: + self.save_checkpoint_and_update(trainer, 'latest') + + def __call__(self, trainer: Trainer): + self.save_checkpoint_and_update(trainer, self.mode) + + def full(self): + """Whether the number of snapshots it keeps track of is greater + than the max_size.""" + return (not self._save_all) and len(self.records) > self.max_size + + @rank_zero_only + def save_checkpoint_and_update(self, trainer: Trainer, mode: str): + """Saving new snapshot and remove the oldest snapshot if needed.""" + iteration = trainer.updater.state.iteration + epoch = trainer.updater.state.epoch + num = epoch if self.trigger[1] == 'epoch' else iteration + path = self.checkpoint_dir / f"{num}.np" + + # add the new one + trainer.updater.save(path) + record = { + "time": str(datetime.now()), + 'path': str(path.resolve()), # use absolute path + 'iteration': iteration, + 'epoch': epoch, + 'indicator': get_observations()[self.indicator] + } + self.records.append(record) + + # remove the earist + if self.full(): + if mode == 'kbest': + self.records = sorted( + self.records, + key=lambda record: record['indicator'], + reverse=not self.less_is_better) + eariest_record = self.records[0] + os.remove(eariest_record["path"]) + self.records.pop(0) + + # update the record file + record_path = self.checkpoint_dir / "records.jsonl" + with jsonlines.open(record_path, 'w') as writer: + for record in self.records: + # jsonlines.open may return a Writer or a Reader + writer.write(record) # pylint: disable=no-member diff --git a/ernie-sat/paddlespeech/s2t/training/extensions/visualizer.py b/ernie-sat/paddlespeech/s2t/training/extensions/visualizer.py new file mode 100644 index 0000000000000000000000000000000000000000..e5f456cac4ff2b1fd9623ec1948a9e7337b712f0 --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/training/extensions/visualizer.py @@ -0,0 +1,39 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
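A usage sketch (not part of the patch) of the `Snapshot` extension in k-best mode, assuming the metric name reported by the evaluator above and a chainer-style `trainer.extend()` hook on the updater-based `Trainer`:

```python
# Keep only the 3 checkpoints with the lowest reported "VALID/LOSS".
snapshot = Snapshot(mode='kbest', max_size=3,
                    indicator='VALID/LOSS', less_better=True)
# trainer.extend(snapshot, trigger=(1, 'epoch'))   # assumed registration hook
#
# Each save appends one JSON line to <output>/checkpoints/records.jsonl, e.g.
# {"time": "...", "path": "/abs/path/checkpoints/5.np",
#  "iteration": 1200, "epoch": 5, "indicator": 0.123}
```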
+from visualdl import LogWriter + +from . import extension +from ..updaters.trainer import Trainer + + +class VisualDL(extension.Extension): + """A wrapper of visualdl log writer. It assumes that the metrics to be visualized + are all scalars which are recorded into the `.observation` dictionary of the + trainer object. The dictionary is created for each step, thus the visualdl log + writer uses the iteration from the updater's `iteration` as the global step to + add records. + """ + trigger = (1, 'iteration') + default_name = 'visualdl' + priority = extension.PRIORITY_READER + + def __init__(self, output_dir): + self.writer = LogWriter(str(output_dir)) + + def __call__(self, trainer: Trainer): + for k, v in trainer.observation.items(): + self.writer.add_scalar(k, v, step=trainer.updater.state.iteration) + + def finalize(self, trainer): + self.writer.close() diff --git a/ernie-sat/paddlespeech/s2t/training/gradclip.py b/ernie-sat/paddlespeech/s2t/training/gradclip.py new file mode 100644 index 0000000000000000000000000000000000000000..26ac501e282087b6906bb44833e389abb346db9f --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/training/gradclip.py @@ -0,0 +1,85 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
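A brief sketch of wiring the `VisualDL` wrapper above into an experiment; the `trainer.extend()` registration is assumed from the chainer-style trainer, everything else follows the class as written:

```python
visualdl = VisualDL(output_dir="exp/default/visual")
# trainer.extend(visualdl, trigger=(1, 'iteration'))   # assumed registration hook
# On every call it writes each scalar in trainer.observation at the global step
# trainer.updater.state.iteration, so the curves line up with training iterations.
```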
+import paddle +from paddle.fluid import core +from paddle.fluid import layers +from paddle.fluid.dygraph import base as imperative_base + +from paddlespeech.s2t.utils.log import Log + +__all__ = ["ClipGradByGlobalNormWithLog"] + +logger = Log(__name__).getlog() + + +class ClipGradByGlobalNormWithLog(paddle.nn.ClipGradByGlobalNorm): + def __init__(self, clip_norm): + super().__init__(clip_norm) + + def __repr__(self): + return f"{self.__class__.__name__}(global_clip_norm={self.clip_norm})" + + @imperative_base.no_grad + def _dygraph_clip(self, params_grads): + params_and_grads = [] + sum_square_list = [] + for i, (p, g) in enumerate(params_grads): + if g is None: + continue + if getattr(p, 'need_clip', True) is False: + continue + merge_grad = g + if g.type == core.VarDesc.VarType.SELECTED_ROWS: + merge_grad = layers.merge_selected_rows(g) + merge_grad = layers.get_tensor_from_selected_rows(merge_grad) + square = layers.square(merge_grad) + sum_square = layers.reduce_sum(square) + sum_square_list.append(sum_square) + + # debug log, not dump all since slow down train process + if i < 10: + logger.debug( + f"Grad Before Clip: {p.name}: {float(sum_square.sqrt()) }") + + # all parameters have been filterd out + if len(sum_square_list) == 0: + return params_grads + + global_norm_var = layers.concat(sum_square_list) + global_norm_var = layers.reduce_sum(global_norm_var) + global_norm_var = layers.sqrt(global_norm_var) + # debug log + logger.debug(f"Grad Global Norm: {float(global_norm_var)}!!!!") + + max_global_norm = layers.fill_constant( + shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm) + clip_var = layers.elementwise_div( + x=max_global_norm, + y=layers.elementwise_max(x=global_norm_var, y=max_global_norm)) + for i, (p, g) in enumerate(params_grads): + if g is None: + continue + if getattr(p, 'need_clip', True) is False: + params_and_grads.append((p, g)) + continue + new_grad = layers.elementwise_mul(x=g, y=clip_var) + params_and_grads.append((p, new_grad)) + + # debug log, not dump all since slow down train process + if i < 10: + logger.debug( + f"Grad After Clip: {p.name}: {float(new_grad.square().sum().sqrt())}" + ) + + return params_and_grads diff --git a/ernie-sat/paddlespeech/s2t/training/optimizer.py b/ernie-sat/paddlespeech/s2t/training/optimizer.py new file mode 100644 index 0000000000000000000000000000000000000000..f7f70c5704b0f62eb733995a6c895799f78fe3b1 --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/training/optimizer.py @@ -0,0 +1,122 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
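A minimal sketch of using the logging clipper above: it is a drop-in replacement for `paddle.nn.ClipGradByGlobalNorm` wherever an optimizer accepts `grad_clip`.

```python
import paddle

clip = ClipGradByGlobalNormWithLog(clip_norm=5.0)
optimizer = paddle.optimizer.Adam(
    learning_rate=1e-3,
    parameters=model.parameters(),   # `model` is any paddle.nn.Layer (placeholder)
    grad_clip=clip)                  # per-parameter norms are logged at debug level
```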
+# Modified from espnet(https://github.com/espnet/espnet) +from typing import Any +from typing import Dict +from typing import Text + +import paddle +from paddle.optimizer import Optimizer +from paddle.regularizer import L2Decay + +from paddlespeech.s2t.training.gradclip import ClipGradByGlobalNormWithLog +from paddlespeech.s2t.utils.dynamic_import import dynamic_import +from paddlespeech.s2t.utils.dynamic_import import instance_class +from paddlespeech.s2t.utils.log import Log + +__all__ = ["OptimizerFactory"] + +logger = Log(__name__).getlog() + +OPTIMIZER_DICT = { + "sgd": "paddle.optimizer:SGD", + "momentum": "paddle.optimizer:Momentum", + "adadelta": "paddle.optimizer:Adadelta", + "adam": "paddle.optimizer:Adam", + "adamw": "paddle.optimizer:AdamW", +} + + +def register_optimizer(cls): + """Register optimizer.""" + alias = cls.__name__.lower() + OPTIMIZER_DICT[cls.__name__.lower()] = cls.__module__ + ":" + cls.__name__ + return cls + + +@register_optimizer +class Noam(paddle.optimizer.Adam): + """Seem to: espnet/nets/pytorch_backend/transformer/optimizer.py """ + + def __init__(self, + learning_rate=0, + beta1=0.9, + beta2=0.98, + epsilon=1e-9, + parameters=None, + weight_decay=None, + grad_clip=None, + lazy_mode=False, + multi_precision=False, + name=None): + super().__init__( + learning_rate=learning_rate, + beta1=beta1, + beta2=beta2, + epsilon=epsilon, + parameters=parameters, + weight_decay=weight_decay, + grad_clip=grad_clip, + lazy_mode=lazy_mode, + multi_precision=multi_precision, + name=name) + + def __repr__(self): + echo = f"<{self.__class__.__module__}.{self.__class__.__name__} object at {hex(id(self))}> " + echo += f"learning_rate: {self._learning_rate}, " + echo += f"(beta1: {self._beta1} beta2: {self._beta2}), " + echo += f"epsilon: {self._epsilon}" + + +def dynamic_import_optimizer(module): + """Import Optimizer class dynamically. + + Args: + module (str): module_name:class_name or alias in `OPTIMIZER_DICT` + + Returns: + type: Optimizer class + + """ + module_class = dynamic_import(module, OPTIMIZER_DICT) + assert issubclass(module_class, + Optimizer), f"{module} does not implement Optimizer" + return module_class + + +class OptimizerFactory(): + @classmethod + def from_args(cls, name: str, args: Dict[Text, Any]): + assert "parameters" in args, "parameters not in args." + assert "learning_rate" in args, "learning_rate not in args." + + grad_clip = ClipGradByGlobalNormWithLog( + args['grad_clip']) if "grad_clip" in args else None + weight_decay = L2Decay( + args['weight_decay']) if "weight_decay" in args else None + if weight_decay: + logger.info(f'') + if grad_clip: + logger.info(f'') + + module_class = dynamic_import_optimizer(name.lower()) + args.update({"grad_clip": grad_clip, "weight_decay": weight_decay}) + opt = instance_class(module_class, args) + if "__repr__" in vars(opt): + logger.info(f"{opt}") + else: + logger.info( + f" LR: {args['learning_rate']}" + ) + return opt diff --git a/ernie-sat/paddlespeech/s2t/training/reporter.py b/ernie-sat/paddlespeech/s2t/training/reporter.py new file mode 100644 index 0000000000000000000000000000000000000000..4d8eb2a1dd0a2dc865a9b3b48909e9c914673310 --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/training/reporter.py @@ -0,0 +1,145 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
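A sketch of building an optimizer through the factory above: the name is resolved via `OPTIMIZER_DICT`, and scalar `grad_clip` / `weight_decay` entries are wrapped into `ClipGradByGlobalNormWithLog` and `L2Decay` before the optimizer class is instantiated.

```python
args = {
    "learning_rate": 1e-3,
    "parameters": model.parameters(),   # `model` is any paddle.nn.Layer (placeholder)
    "grad_clip": 5.0,                   # becomes ClipGradByGlobalNormWithLog(5.0)
    "weight_decay": 1e-6,               # becomes L2Decay(1e-6)
}
optimizer = OptimizerFactory.from_args("adam", args)
```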
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from chainer(https://github.com/chainer/chainer) +import contextlib +import math +from collections import defaultdict + +OBSERVATIONS = None + + +@contextlib.contextmanager +def ObsScope(observations): + # make `observation` the target to report to. + # it is basically a dictionary that stores temporary observations + global OBSERVATIONS + old = OBSERVATIONS + OBSERVATIONS = observations + + try: + yield + finally: + OBSERVATIONS = old + + +def get_observations(): + global OBSERVATIONS + return OBSERVATIONS + + +def report(name, value): + # a simple function to report named value + # you can use it everywhere, it will get the default target and writ to it + # you can think of it as std.out + observations = get_observations() + if observations is None: + return + else: + observations[name] = value + + +class Summary(): + """Online summarization of a sequence of scalars. + Summary computes the statistics of given scalars online. + """ + + def __init__(self): + self._x = 0.0 + self._x2 = 0.0 + self._n = 0 + + def add(self, value, weight=1): + """Adds a scalar value. + Args: + value: Scalar value to accumulate. It is either a NumPy scalar or + a zero-dimensional array (on CPU or GPU). + weight: An optional weight for the value. It is a NumPy scalar or + a zero-dimensional array (on CPU or GPU). + Default is 1 (integer). + """ + self._x += weight * value + self._x2 += weight * value * value + self._n += weight + + def compute_mean(self): + """Computes the mean.""" + x, n = self._x, self._n + return x / n + + def make_statistics(self): + """Computes and returns the mean and standard deviation values. + Returns: + tuple: Mean and standard deviation values. + """ + x, n = self._x, self._n + mean = x / n + var = self._x2 / n - mean * mean + std = math.sqrt(var) + return mean, std + + +class DictSummary(): + """Online summarization of a sequence of dictionaries. + ``DictSummary`` computes the statistics of a given set of scalars online. + It only computes the statistics for scalar values and variables of scalar + values in the dictionaries. + """ + + def __init__(self): + self._summaries = defaultdict(Summary) + + def add(self, d): + """Adds a dictionary of scalars. + Args: + d (dict): Dictionary of scalars to accumulate. Only elements of + scalars, zero-dimensional arrays, and variables of + zero-dimensional arrays are accumulated. When the value + is a tuple, the second element is interpreted as a weight. + """ + summaries = self._summaries + for k, v in d.items(): + w = 1 + if isinstance(v, tuple): + v = v[0] + w = v[1] + summaries[k].add(v, weight=w) + + def compute_mean(self): + """Creates a dictionary of mean values. + It returns a single dictionary that holds a mean value for each entry + added to the summary. + Returns: + dict: Dictionary of mean values. + """ + return { + name: summary.compute_mean() + for name, summary in self._summaries.items() + } + + def make_statistics(self): + """Creates a dictionary of statistics. + It returns a single dictionary that holds mean and standard deviation + values for every entry added to the summary. 
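A small self-contained sketch of the reporting flow these classes implement, mirroring how the evaluator above averages per-batch observations: values reported inside an `ObsScope` land in that scope's dictionary, and `DictSummary` averages them across batches.

```python
summary = DictSummary()
for batch_loss in [0.9, 0.7, 0.5]:
    observation = {}
    with ObsScope(observation):
        report("loss", batch_loss)     # writes into `observation`
    summary.add(observation)

print(summary.compute_mean())          # -> {'loss': 0.7} (up to float rounding)
```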
For an entry of name + ``'key'``, these values are added to the dictionary by names ``'key'`` + and ``'key.std'``, respectively. + Returns: + dict: Dictionary of statistics of all entries. + """ + stats = {} + for name, summary in self._summaries.items(): + mean, std = summary.make_statistics() + stats[name] = mean + stats[name + '.std'] = std + + return stats diff --git a/ernie-sat/paddlespeech/s2t/training/scheduler.py b/ernie-sat/paddlespeech/s2t/training/scheduler.py new file mode 100644 index 0000000000000000000000000000000000000000..b22f7ef85081032e0fdb370c5883e775b2c64693 --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/training/scheduler.py @@ -0,0 +1,130 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from espnet(https://github.com/espnet/espnet) +from typing import Any +from typing import Dict +from typing import Text +from typing import Union + +from paddle.optimizer.lr import LRScheduler +from typeguard import check_argument_types + +from paddlespeech.s2t.utils.dynamic_import import dynamic_import +from paddlespeech.s2t.utils.dynamic_import import instance_class +from paddlespeech.s2t.utils.log import Log + +__all__ = ["WarmupLR", "LRSchedulerFactory"] + +logger = Log(__name__).getlog() + +SCHEDULER_DICT = { + "noam": "paddle.optimizer.lr:NoamDecay", + "expdecaylr": "paddle.optimizer.lr:ExponentialDecay", + "piecewisedecay": "paddle.optimizer.lr:PiecewiseDecay", +} + + +def register_scheduler(cls): + """Register scheduler.""" + alias = cls.__name__.lower() + SCHEDULER_DICT[cls.__name__.lower()] = cls.__module__ + ":" + cls.__name__ + return cls + + +@register_scheduler +class WarmupLR(LRScheduler): + """The WarmupLR scheduler + This scheduler is almost same as NoamLR Scheduler except for following + difference: + NoamLR: + lr = optimizer.lr * model_size ** -0.5 + * min(step ** -0.5, step * warmup_step ** -1.5) + WarmupLR: + lr = optimizer.lr * warmup_step ** 0.5 + * min(step ** -0.5, step * warmup_step ** -1.5) + Note that the maximum lr equals to optimizer.lr in this scheduler. + """ + + def __init__(self, + warmup_steps: Union[int, float]=25000, + learning_rate=1.0, + last_epoch=-1, + verbose=False, + **kwargs): + assert check_argument_types() + self.warmup_steps = warmup_steps + super().__init__(learning_rate, last_epoch, verbose) + + def __repr__(self): + return f"{self.__class__.__name__}(warmup_steps={self.warmup_steps}, lr={self.base_lr}, last_epoch={self.last_epoch})" + + def get_lr(self): + # self.last_epoch start from zero + step_num = self.last_epoch + 1 + return self.base_lr * self.warmup_steps**0.5 * min( + step_num**-0.5, step_num * self.warmup_steps**-1.5) + + def set_step(self, step: int=None): + ''' + It will update the learning rate in optimizer according to current ``epoch`` . + The new learning rate will take effect on next ``optimizer.step`` . + + Args: + step (int, None): specify current epoch. Default: None. Auto-increment from last_epoch=-1. 
+ Returns: + None + ''' + self.step(epoch=step) + + +@register_scheduler +class ConstantLR(LRScheduler): + """ + Args: + learning_rate (float): The initial learning rate. It is a python float number. + last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. + verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . + + Returns: + ``ConstantLR`` instance to schedule learning rate. + """ + + def __init__(self, learning_rate, last_epoch=-1, verbose=False): + super().__init__(learning_rate, last_epoch, verbose) + + def get_lr(self): + return self.base_lr + + +def dynamic_import_scheduler(module): + """Import Scheduler class dynamically. + + Args: + module (str): module_name:class_name or alias in `SCHEDULER_DICT` + + Returns: + type: Scheduler class + + """ + module_class = dynamic_import(module, SCHEDULER_DICT) + assert issubclass(module_class, + LRScheduler), f"{module} does not implement LRScheduler" + return module_class + + +class LRSchedulerFactory(): + @classmethod + def from_args(cls, name: str, args: Dict[Text, Any]): + module_class = dynamic_import_scheduler(name.lower()) + return instance_class(module_class, args) diff --git a/ernie-sat/paddlespeech/s2t/training/timer.py b/ernie-sat/paddlespeech/s2t/training/timer.py new file mode 100644 index 0000000000000000000000000000000000000000..271ffff1b24e8038c68754fc217e76a1faa30e5d --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/training/timer.py @@ -0,0 +1,50 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import datetime +import time + +from paddlespeech.s2t.utils.log import Log + +__all__ = ["Timer"] + +logger = Log(__name__).getlog() + + +class Timer(): + """To be used like this: + with Timer("Message") as value: + do some thing + """ + + def __init__(self, message=None): + self.message = message + + def duration(self) -> str: + elapsed_time = time.time() - self.start + time_str = str(datetime.timedelta(seconds=elapsed_time)) + return time_str + + def __enter__(self): + self.start = time.time() + return self + + def __exit__(self, type, value, traceback): + if self.message: + logger.info(self.message.format(self.duration())) + + def __call__(self) -> float: + return time.time() - self.start + + def __str__(self): + return self.duration() diff --git a/ernie-sat/paddlespeech/s2t/training/trainer.py b/ernie-sat/paddlespeech/s2t/training/trainer.py new file mode 100644 index 0000000000000000000000000000000000000000..de90c9ef889c76f5c9733cf7347f4656d2035ca6 --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/training/trainer.py @@ -0,0 +1,492 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
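To make the WarmupLR formula above concrete, a standalone evaluation of the schedule (a sketch, independent of the scheduler object): the learning rate rises to the configured value at `warmup_steps` and then decays as `step ** -0.5`.

```python
def warmup_lr(step, base_lr=1.0, warmup_steps=4000):
    # same expression as WarmupLR.get_lr() above
    return base_lr * warmup_steps ** 0.5 * min(step ** -0.5,
                                               step * warmup_steps ** -1.5)

print(warmup_lr(400))     # 0.1   -> still warming up (linear ramp)
print(warmup_lr(4000))    # 1.0   -> peak equals the configured learning rate
print(warmup_lr(16000))   # 0.5   -> decayed as (warmup_steps / step) ** 0.5
```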
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import sys +import time +from collections import OrderedDict +from contextlib import contextmanager +from pathlib import Path + +import paddle +from paddle import distributed as dist +from visualdl import LogWriter + +from paddlespeech.s2t.training.reporter import ObsScope +from paddlespeech.s2t.training.reporter import report +from paddlespeech.s2t.training.timer import Timer +from paddlespeech.s2t.utils import mp_tools +from paddlespeech.s2t.utils import profiler +from paddlespeech.s2t.utils.checkpoint import Checkpoint +from paddlespeech.s2t.utils.log import Log +from paddlespeech.s2t.utils.utility import all_version +from paddlespeech.s2t.utils.utility import seed_all +from paddlespeech.s2t.utils.utility import UpdateConfig + +__all__ = ["Trainer"] + +logger = Log(__name__).getlog() + + +class Trainer(): + """ + An experiment template in order to structure the training code and take + care of saving, loading, logging, visualization stuffs. It's intended to + be flexible and simple. + + So it only handles output directory (create directory for the output, + create a checkpoint directory, dump the config in use and create + visualizer and logger) in a standard way without enforcing any + input-output protocols to the model and dataloader. It leaves the main + part for the user to implement their own (setup the model, criterion, + optimizer, define a training step, define a validation function and + customize all the text and visual logs). + It does not save too much boilerplate code. The users still have to write + the forward/backward/update mannually, but they are free to add + non-standard behaviors if needed. + We have some conventions to follow. + 1. Experiment should have ``model``, ``optimizer``, ``train_loader`` and + ``valid_loader``, ``config`` and ``args`` attributes. + 2. The config should have a ``training`` field, which has + ``valid_interval``, ``save_interval`` and ``max_iteration`` keys. It is + used as the trigger to invoke validation, checkpointing and stop of the + experiment. + 3. There are four methods, namely ``train_batch``, ``valid``, + ``setup_model`` and ``setup_dataloader`` that should be implemented. + Feel free to add/overwrite other methods and standalone functions if you + need. + + Parameters + ---------- + config: yacs.config.CfgNode + The configuration used for the experiment. + + args: argparse.Namespace + The parsed command line arguments. 
+ Examples + -------- + >>> def main_sp(config, args): + >>> exp = Trainer(config, args) + >>> exp.setup() + >>> exp.run() + >>> + >>> config = get_cfg_defaults() + >>> parser = default_argument_parser() + >>> args = parser.parse_args() + >>> if args.config: + >>> config.merge_from_file(args.config) + >>> if args.opts: + >>> config.merge_from_list(args.opts) + >>> config.freeze() + >>> + >>> if args.ngpu > 1: + >>> dist.spawn(main_sp, args=(config, args), nprocs=args.ngpu) + >>> else: + >>> main_sp(config, args) + """ + + def __init__(self, config, args): + self.config = config + self.args = args + self.optimizer = None + self.visualizer = None + self.output_dir = None + self.checkpoint_dir = None + self.iteration = 0 + self.epoch = 0 + self.rank = dist.get_rank() + self.world_size = dist.get_world_size() + self._train = True + + # print deps version + all_version() + logger.info(f"Rank: {self.rank}/{self.world_size}") + + # set device + paddle.set_device('gpu' if self.args.ngpu > 0 else 'cpu') + if self.parallel: + self.init_parallel() + + self.checkpoint = Checkpoint( + kbest_n=self.config.checkpoint.kbest_n, + latest_n=self.config.checkpoint.latest_n) + + # set random seed if needed + if args.seed: + seed_all(args.seed) + logger.info(f"Set seed {args.seed}") + + # profiler and benchmark options + if hasattr(self.args, + "benchmark_batch_size") and self.args.benchmark_batch_size: + with UpdateConfig(self.config): + self.config.batch_size = self.args.benchmark_batch_size + self.config.log_interval = 1 + logger.info( + f"Benchmark reset batch-size: {self.args.benchmark_batch_size}") + + @property + def train(self): + return self._train + + @contextmanager + def eval(self): + self._train = False + yield + self._train = True + + def setup(self): + """Setup the experiment. + """ + self.setup_output_dir() + self.dump_config() + self.setup_visualizer() + + self.setup_dataloader() + self.setup_model() + + self.iteration = 0 + self.epoch = 0 + + @property + def parallel(self): + """A flag indicating whether the experiment should run with + multiprocessing. + """ + return self.args.ngpu > 1 + + def init_parallel(self): + """Init environment for multiprocess training. + """ + dist.init_parallel_env() + + @mp_tools.rank_zero_only + def save(self, tag=None, infos: dict=None): + """Save checkpoint (model parameters and optimizer states). + + Args: + tag (int or str, optional): None for step, else using tag, e.g epoch. Defaults to None. + infos (dict, optional): meta data to save. Defaults to None. + """ + + infos = infos if infos else dict() + infos.update({ + "step": self.iteration, + "epoch": self.epoch, + "lr": self.optimizer.get_lr() + }) + self.checkpoint.save_parameters(self.checkpoint_dir, self.iteration + if tag is None else tag, self.model, + self.optimizer, infos) + + def resume_or_scratch(self): + """Resume from latest checkpoint at checkpoints in the output + directory or load a specified checkpoint. + + If ``args.checkpoint_path`` is not None, load the checkpoint, else + resume training. 
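A minimal sketch of the subclassing contract the docstring describes: an experiment provides `setup_dataloader`, `setup_model`, `train_batch`, and `valid`. The `build_*` helpers are placeholders, not part of this patch; `batch_size` and `step_cost` are reported because `do_train()` below reads them from the per-step observation.

```python
import time
import paddle
from paddlespeech.s2t.training.reporter import report

class MyExperiment(Trainer):
    def setup_dataloader(self):
        self.train_loader = build_dataloader(self.config, 'train')   # placeholder helper
        self.valid_loader = build_dataloader(self.config, 'valid')

    def setup_model(self):
        self.model = build_model(self.config)                        # placeholder helpers
        self.lr_scheduler = build_lr_scheduler(self.config)
        self.optimizer = build_optimizer(self.config, self.model)

    def train_batch(self, batch_index, batch, msg):
        start = time.time()
        loss = self.model(*batch)
        loss.backward()
        self.optimizer.step()
        self.optimizer.clear_grad()
        self.iteration += 1
        report("train_loss", float(loss))
        report("batch_size", len(batch[0]))          # consumed by do_train()
        report("step_cost", time.time() - start)     # consumed by do_train()

    @paddle.no_grad()
    def valid(self):
        total_loss, num_seen_utts = 0.0, 0
        for batch in self.valid_loader:
            total_loss += float(self.model(*batch))
            num_seen_utts += 1
        return total_loss, num_seen_utts              # shape expected by do_train()
```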
+ """ + scratch = None + infos = self.checkpoint.load_latest_parameters( + self.model, + self.optimizer, + checkpoint_dir=self.checkpoint_dir, + checkpoint_path=self.args.checkpoint_path) + if infos: + # just restore ckpt + # lr will resotre from optimizer ckpt + self.iteration = infos["step"] + self.epoch = infos["epoch"] + scratch = False + logger.info( + f"Restore ckpt: epoch {self.epoch }, step {self.iteration}!") + else: + self.iteration = 0 + self.epoch = 0 + scratch = True + logger.info("Init from scratch!") + return scratch + + def maybe_batch_sampler_step(self): + """ batch_sampler seed by epoch """ + if hasattr(self.train_loader, "batch_sampler"): + batch_sampler = self.train_loader.batch_sampler + if isinstance(batch_sampler, paddle.io.DistributedBatchSampler): + logger.debug( + f"train_loader.batch_sample.set_epoch: {self.epoch}") + batch_sampler.set_epoch(self.epoch) + + def before_train(self): + from_scratch = self.resume_or_scratch() + if from_scratch: + # scratch: save init model, i.e. 0 epoch + self.save(tag='init', infos=None) + else: + # resume: train next_epoch and next_iteration + self.epoch += 1 + self.iteration += 1 + logger.info( + f"Resume train: epoch {self.epoch }, step {self.iteration}!") + + self.maybe_batch_sampler_step() + + def new_epoch(self): + """Reset the train loader seed and increment `epoch`. + """ + # `iteration` increased by train step + self.epoch += 1 + self.maybe_batch_sampler_step() + + def after_train_batch(self): + if self.args.benchmark_max_step: + profiler.add_profiler_step(self.args.profiler_options) + if self.args.benchmark_max_step and self.iteration > self.args.benchmark_max_step: + logger.info( + f"Reach benchmark-max-step: {self.args.benchmark_max_step}") + sys.exit(0) + + def do_train(self): + """The training process control by epoch.""" + self.before_train() + + logger.info(f"Train Total Examples: {len(self.train_loader.dataset)}") + while self.epoch < self.config.n_epoch: + with Timer("Epoch-Train Time Cost: {}"): + self.model.train() + try: + data_start_time = time.time() + for batch_index, batch in enumerate(self.train_loader): + dataload_time = time.time() - data_start_time + msg = "Train:" + observation = OrderedDict() + with ObsScope(observation): + report("Rank", dist.get_rank()) + report("epoch", self.epoch) + report('step', self.iteration) + report("lr", self.lr_scheduler()) + self.train_batch(batch_index, batch, msg) + self.after_train_batch() + report('iter', batch_index + 1) + report('total', len(self.train_loader)) + report('reader_cost', dataload_time) + observation['batch_cost'] = observation[ + 'reader_cost'] + observation['step_cost'] + observation['samples'] = observation['batch_size'] + observation['ips samples/s'] = observation[ + 'batch_size'] / observation['batch_cost'] + for k, v in observation.items(): + msg += f" {k}: " + msg += f"{v:>.8f}" if isinstance(v, + float) else f"{v}" + msg += "," + msg = msg[:-1] # remove the last "," + logger.info(msg) + data_start_time = time.time() + except Exception as e: + logger.error(e) + raise e + + with Timer("Eval Time Cost: {}"): + total_loss, num_seen_utts = self.valid() + if dist.get_world_size() > 1: + num_seen_utts = paddle.to_tensor(num_seen_utts) + # the default operator in all_reduce function is sum. 
+ dist.all_reduce(num_seen_utts) + total_loss = paddle.to_tensor(total_loss) + dist.all_reduce(total_loss) + cv_loss = total_loss / num_seen_utts + cv_loss = float(cv_loss) + else: + cv_loss = total_loss / num_seen_utts + + logger.info( + 'Epoch {} Val info val_loss {}'.format(self.epoch, cv_loss)) + if self.visualizer: + self.visualizer.add_scalar( + tag='eval/cv_loss', value=cv_loss, step=self.epoch) + self.visualizer.add_scalar( + tag='eval/lr', value=self.lr_scheduler(), step=self.epoch) + + # after epoch + self.save(tag=self.epoch, infos={'val_loss': cv_loss}) + # step lr every epoch + self.lr_scheduler.step() + self.new_epoch() + + def run(self): + """The routine of the experiment after setup. This method is intended + to be used by the user. + """ + try: + with Timer("Training Done: {}"): + self.do_train() + except KeyboardInterrupt: + exit(-1) + finally: + self.destory() + + def restore(self): + """Resume from latest checkpoint at checkpoints in the output + directory or load a specified checkpoint. + + If ``args.checkpoint_path`` is not None, load the checkpoint, else + resume training. + """ + assert self.args.checkpoint_path + infos = self.checkpoint.load_latest_parameters( + self.model, checkpoint_path=self.args.checkpoint_path) + return infos + + def run_test(self): + """Do Test/Decode""" + try: + with Timer("Test/Decode Done: {}"): + with self.eval(): + self.restore() + self.test() + except KeyboardInterrupt: + exit(-1) + + def run_export(self): + """Do Model Export""" + try: + with Timer("Export Done: {}"): + with self.eval(): + self.restore() + self.export() + except KeyboardInterrupt: + exit(-1) + + def run_align(self): + """Do CTC alignment""" + try: + with Timer("Align Done: {}"): + with self.eval(): + self.restore() + self.align() + except KeyboardInterrupt: + sys.exit(-1) + + def setup_output_dir(self): + """Create a directory used for output. + """ + if self.args.output: + output_dir = Path(self.args.output).expanduser() + elif self.args.checkpoint_path: + output_dir = Path( + self.args.checkpoint_path).expanduser().parent.parent + elif self.args.export_path: + output_dir = Path(self.args.export_path).expanduser().parent.parent + self.output_dir = output_dir + self.output_dir.mkdir(parents=True, exist_ok=True) + + self.checkpoint_dir = self.output_dir / "checkpoints" + self.checkpoint_dir.mkdir(parents=True, exist_ok=True) + + self.log_dir = output_dir / "log" + self.log_dir.mkdir(parents=True, exist_ok=True) + + self.test_dir = output_dir / "test" + self.test_dir.mkdir(parents=True, exist_ok=True) + + self.decode_dir = output_dir / "decode" + self.decode_dir.mkdir(parents=True, exist_ok=True) + + self.export_dir = output_dir / "export" + self.export_dir.mkdir(parents=True, exist_ok=True) + + self.visual_dir = output_dir / "visual" + self.visual_dir.mkdir(parents=True, exist_ok=True) + + self.config_dir = output_dir / "conf" + self.config_dir.mkdir(parents=True, exist_ok=True) + + @mp_tools.rank_zero_only + def destory(self): + """Close visualizer to avoid hanging after training""" + # https://github.com/pytorch/fairseq/issues/2357 + if self.visualizer: + self.visualizer.close() + + @mp_tools.rank_zero_only + def setup_visualizer(self): + """Initialize a visualizer to log the experiment. + + The visual log is saved in the output directory. + + Notes + ------ + Only the main process has a visualizer with it. Use multiple + visualizers in multiprocess to write to a same log file may cause + unexpected behaviors. 
+ """ + # visualizer + visualizer = LogWriter(logdir=str(self.visual_dir)) + self.visualizer = visualizer + + @mp_tools.rank_zero_only + def dump_config(self): + """Save the configuration used for this experiment. + + It is saved in to ``config.yaml`` in the output directory at the + beginning of the experiment. + """ + config_file = self.config_dir / "config.yaml" + if self.train and config_file.exists(): + time_stamp = time.strftime("%Y_%m_%d_%H_%M_%s", time.gmtime()) + target_path = self.config_dir / ".".join( + [time_stamp, "config.yaml"]) + config_file.rename(target_path) + + with open(config_file, 'wt') as f: + print(self.config, file=f) + + def train_batch(self): + """The training loop. A subclass should implement this method. + """ + raise NotImplementedError("train_batch should be implemented.") + + @paddle.no_grad() + def valid(self): + """The validation. A subclass should implement this method. + """ + raise NotImplementedError("valid should be implemented.") + + @paddle.no_grad() + def test(self): + """The test. A subclass should implement this method in Tester. + """ + raise NotImplementedError("test should be implemented.") + + @paddle.no_grad() + def export(self): + """The test. A subclass should implement this method in Tester. + """ + raise NotImplementedError("export should be implemented.") + + @paddle.no_grad() + def align(self): + """The align. A subclass should implement this method in Tester. + """ + raise NotImplementedError("align should be implemented.") + + def setup_model(self): + """Setup model, criterion and optimizer, etc. A subclass should + implement this method. + """ + raise NotImplementedError("setup_model should be implemented.") + + def setup_dataloader(self): + """Setup training dataloader and validation dataloader. A subclass + should implement this method. + """ + raise NotImplementedError("setup_dataloader should be implemented.") diff --git a/ernie-sat/paddlespeech/s2t/training/triggers/__init__.py b/ernie-sat/paddlespeech/s2t/training/triggers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..185a92b8d94d3426d616c0624f0f2ee04339349e --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/training/triggers/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/ernie-sat/paddlespeech/s2t/training/triggers/compare_value_trigger.py b/ernie-sat/paddlespeech/s2t/training/triggers/compare_value_trigger.py new file mode 100644 index 0000000000000000000000000000000000000000..5c2a2721709e0a9b5376e5d5747f17b691c2c5ac --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/training/triggers/compare_value_trigger.py @@ -0,0 +1,62 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from chainer(https://github.com/chainer/chainer) +from ..reporter import DictSummary +from .utils import get_trigger + + +class CompareValueTrigger(): + """Trigger invoked when key value getting bigger or lower than before. + + Args: + key (str) : Key of value. + compare_fn ((float, float) -> bool) : Function to compare the values. + trigger (tuple(int, str)) : Trigger that decide the comparison interval. + + """ + + def __init__(self, key, compare_fn, trigger=(1, "epoch")): + self._key = key + self._best_value = None + self._interval_trigger = get_trigger(trigger) + self._init_summary() + self._compare_fn = compare_fn + + def __call__(self, trainer): + """Get value related to the key and compare with current value.""" + observation = trainer.observation + summary = self._summary + key = self._key + if key in observation: + summary.add({key: observation[key]}) + + if not self._interval_trigger(trainer): + return False + + stats = summary.compute_mean() + value = float(stats[key]) # copy to CPU + self._init_summary() + + if self._best_value is None: + # initialize best value + self._best_value = value + return False + elif self._compare_fn(self._best_value, value): + return True + else: + self._best_value = value + return False + + def _init_summary(self): + self._summary = DictSummary() diff --git a/ernie-sat/paddlespeech/s2t/training/triggers/interval_trigger.py b/ernie-sat/paddlespeech/s2t/training/triggers/interval_trigger.py new file mode 100644 index 0000000000000000000000000000000000000000..14201d29cfc101263db51febb039a01e36bfbf1b --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/training/triggers/interval_trigger.py @@ -0,0 +1,39 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
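A hedged usage sketch of `CompareValueTrigger`: it fires when the epoch-mean of the watched key compares "worse" than the best value seen so far, which is the usual way to gate learning-rate decay. The metric name follows the evaluator earlier in this patch; the `trainer.extend()` registration is assumed.

```python
not_improved = CompareValueTrigger(
    "VALID/LOSS",
    compare_fn=lambda best, current: current > best,   # True when loss got worse
    trigger=(1, "epoch"))
# trainer.extend(decay_lr_extension, trigger=not_improved)   # assumed hook
```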
+# Reference chainer MIT (https://opensource.org/licenses/MIT) + + +class IntervalTrigger(): + """A Predicate to do something every N cycle.""" + + def __init__(self, period: int, unit: str): + if unit not in ("iteration", "epoch"): + raise ValueError("unit should be 'iteration' or 'epoch'") + if period <= 0: + raise ValueError("period should be a positive integer.") + self.period = period + self.unit = unit + self.last_index = None + + def __call__(self, trainer): + if self.last_index is None: + last_index = getattr(trainer.updater.state, self.unit) + self.last_index = last_index + + last_index = self.last_index + index = getattr(trainer.updater.state, self.unit) + fire = index // self.period != last_index // self.period + + self.last_index = index + return fire diff --git a/ernie-sat/paddlespeech/s2t/training/triggers/limit_trigger.py b/ernie-sat/paddlespeech/s2t/training/triggers/limit_trigger.py new file mode 100644 index 0000000000000000000000000000000000000000..cd96040ef0bbc54c878ed9b194e10936eab4974f --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/training/triggers/limit_trigger.py @@ -0,0 +1,32 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Reference chainer MIT (https://opensource.org/licenses/MIT) + + +class LimitTrigger(): + """A Predicate to decide whether to stop.""" + + def __init__(self, limit: int, unit: str): + if unit not in ("iteration", "epoch"): + raise ValueError("unit should be 'iteration' or 'epoch'") + if limit <= 0: + raise ValueError("limit should be a positive integer.") + self.limit = limit + self.unit = unit + + def __call__(self, trainer): + state = trainer.updater.state + index = getattr(state, self.unit) + fire = index >= self.limit + return fire diff --git a/ernie-sat/paddlespeech/s2t/training/triggers/time_trigger.py b/ernie-sat/paddlespeech/s2t/training/triggers/time_trigger.py new file mode 100644 index 0000000000000000000000000000000000000000..53c398d11ca7547f86f2445c2f6d0228a0ca2ecc --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/training/triggers/time_trigger.py @@ -0,0 +1,42 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Reference chainer MIT (https://opensource.org/licenses/MIT) + + +class TimeTrigger(): + """Trigger based on a fixed time interval. + This trigger accepts iterations with a given interval time. + Args: + period (float): Interval time. It is given in seconds. 
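`IntervalTrigger` fires each time the watched counter crosses a multiple of `period`. A minimal trace with a stub trainer that only carries the field the trigger reads (`trainer.updater.state.<unit>`):

```python
from types import SimpleNamespace

from paddlespeech.s2t.training.triggers.interval_trigger import IntervalTrigger

trigger = IntervalTrigger(period=3, unit="iteration")

for it in range(1, 10):
    trainer = SimpleNamespace(
        updater=SimpleNamespace(state=SimpleNamespace(iteration=it)))
    if trigger(trainer):
        print("fire at iteration", it)   # prints 3, 6, 9
```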
+ """ + + def __init__(self, period): + self._period = period + self._next_time = self._period + + def __call__(self, trainer): + if self._next_time < trainer.elapsed_time: + self._next_time += self._period + return True + else: + return False + + def state_dict(self): + state_dict = { + "next_time": self._next_time, + } + return state_dict + + def set_state_dict(self, state_dict): + self._next_time = state_dict['next_time'] diff --git a/ernie-sat/paddlespeech/s2t/training/triggers/utils.py b/ernie-sat/paddlespeech/s2t/training/triggers/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..1a7c4292e6e1f81e4a34efb517c05f58c5d8f1fe --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/training/triggers/utils.py @@ -0,0 +1,28 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from .interval_trigger import IntervalTrigger + + +def never_fail_trigger(trainer): + return False + + +def get_trigger(trigger): + if trigger is None: + return never_fail_trigger + if callable(trigger): + return trigger + else: + trigger = IntervalTrigger(*trigger) + return trigger diff --git a/ernie-sat/paddlespeech/s2t/training/updaters/__init__.py b/ernie-sat/paddlespeech/s2t/training/updaters/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..185a92b8d94d3426d616c0624f0f2ee04339349e --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/training/updaters/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/ernie-sat/paddlespeech/s2t/training/updaters/standard_updater.py b/ernie-sat/paddlespeech/s2t/training/updaters/standard_updater.py new file mode 100644 index 0000000000000000000000000000000000000000..a320a80d2c724da20bc287f215b71d94fb9c78cc --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/training/updaters/standard_updater.py @@ -0,0 +1,196 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
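`get_trigger` is the small normalization helper used for every trigger argument: `None` becomes a never-firing predicate, a `(period, unit)` tuple becomes an `IntervalTrigger`, and any callable is passed through unchanged. For example:

```python
from paddlespeech.s2t.training.triggers.interval_trigger import IntervalTrigger
from paddlespeech.s2t.training.triggers.utils import get_trigger

never = get_trigger(None)                      # the never_fail_trigger function
every_5k = get_trigger((5000, "iteration"))    # wrapped in an IntervalTrigger
custom = get_trigger(lambda trainer: trainer.updater.state.epoch >= 10)  # passed through

assert isinstance(every_5k, IntervalTrigger)
assert never(None) is False                    # never fires, whatever the trainer is
```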
+# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from chainer(https://github.com/chainer/chainer) +from typing import Dict +from typing import Optional + +import paddle +from paddle.io import DataLoader +from paddle.io import DistributedBatchSampler +from paddle.nn import Layer +from paddle.optimizer import Optimizer +from paddle.optimizer.lr import LRScheduler + +from paddlespeech.s2t.training.reporter import report +from paddlespeech.s2t.training.updaters.updater import UpdaterBase +from paddlespeech.s2t.training.updaters.updater import UpdaterState +from paddlespeech.s2t.utils.log import Log + +__all__ = ["StandardUpdater"] + +logger = Log(__name__).getlog() + + +class StandardUpdater(UpdaterBase): + """An example of over-simplification. Things may not be that simple, but + you can subclass it to fit your need. + """ + + def __init__(self, + model: Layer, + optimizer: Optimizer, + scheduler: LRScheduler, + dataloader: DataLoader, + init_state: Optional[UpdaterState]=None): + super().__init__(init_state) + # it is designed to hold multiple models + models = {"main": model} + self.models: Dict[str, Layer] = models + self.model = model + + # it is designed to hold multiple optimizers + optimizers = {"main": optimizer} + self.optimizer = optimizer + self.optimizers: Dict[str, Optimizer] = optimizers + + # it is designed to hold multiple scheduler + schedulers = {"main": scheduler} + self.scheduler = scheduler + self.schedulers: Dict[str, LRScheduler] = schedulers + + # dataloaders + self.dataloader = dataloader + + self.train_iterator = iter(dataloader) + + def update(self): + # We increase the iteration index after updating and before extension. + # Here are the reasons. + + # 0. Snapshotting(as well as other extensions, like visualizer) is + # executed after a step of updating; + # 1. We decide to increase the iteration index after updating and + # before any all extension is executed. + # 3. We do not increase the iteration after extension because we + # prefer a consistent resume behavior, when load from a + # `snapshot_iter_100.pdz` then the next step to train is `101`, + # naturally. But if iteration is increased increased after + # extension(including snapshot), then, a `snapshot_iter_99` is + # loaded. You would need a extra increasing of the iteration idex + # before training to avoid another iteration `99`, which has been + # done before snapshotting. + # 4. Thus iteration index represrnts "currently how mant epochs has + # been done." + # NOTE: use report to capture the correctly value. If you want to + # report the learning rate used for a step, you must report it before + # the learning rate scheduler's step() has been called. In paddle's + # convention, we do not use an extension to change the learning rate. + # so if you want to report it, do it in the updater. + + # Then here comes the next question. When is the proper time to + # increase the epoch index? Since all extensions are executed after + # updating, it is the time that after updating is the proper time to + # increase epoch index. + # 1. If we increase the epoch index before updating, then an extension + # based ot epoch would miss the correct timing. It could only be + # triggerd after an extra updating. + # 2. Theoretically, when an epoch is done, the epoch index should be + # increased. So it would be increase after updating. + # 3. Thus, eppoch index represents "currently how many epochs has been + # done." So it starts from 0. 
+ + # switch to training mode + for model in self.models.values(): + model.train() + + # training for a step is implemented here + with Timier("data time cost:{}"): + batch = self.read_batch() + with Timier("step time cost:{}"): + self.update_core(batch) + + self.state.iteration += 1 + if self.updates_per_epoch is not None: + if self.state.iteration % self.updates_per_epoch == 0: + self.state.epoch += 1 + + def update_core(self, batch): + """A simple case for a training step. Basic assumptions are: + Single model; + Single optimizer; + Single scheduler, and update learning rate each step; + A batch from the dataloader is just the input of the model; + The model return a single loss, or a dict containing serval losses. + Parameters updates at every batch, no gradient accumulation. + """ + loss = self.model(*batch) + + if isinstance(loss, paddle.Tensor): + loss_dict = {"main": loss} + else: + # Dict[str, Tensor] + loss_dict = loss + if "main" not in loss_dict: + main_loss = 0 + for loss_item in loss.values(): + main_loss += loss_item + loss_dict["main"] = main_loss + + for name, loss_item in loss_dict.items(): + report(name, float(loss_item)) + + self.optimizer.clear_grad() + loss_dict["main"].backward() + self.optimizer.step() + self.scheduler.step() + + @property + def updates_per_epoch(self): + """Number of steps per epoch, + determined by the length of the dataloader.""" + length_of_dataloader = None + try: + length_of_dataloader = len(self.dataloader) + except TypeError: + logger.debug("This dataloader has no __len__.") + finally: + return length_of_dataloader + + def new_epoch(self): + """Start a new epoch.""" + # NOTE: all batch sampler for distributed training should + # subclass DistributedBatchSampler and implement `set_epoch` method + if hasattr(self.dataloader, "batch_sampler"): + batch_sampler = self.dataloader.batch_sampler + if isinstance(batch_sampler, DistributedBatchSampler): + batch_sampler.set_epoch(self.state.epoch) + self.train_iterator = iter(self.dataloader) + + def read_batch(self): + """Read a batch from the data loader, auto renew when data is exhausted.""" + try: + batch = next(self.train_iterator) + except StopIteration: + self.new_epoch() + batch = next(self.train_iterator) + return batch + + def state_dict(self): + """State dict of a Updater, model, optimizers/schedulers + and updater state are included.""" + state_dict = super().state_dict() + for name, model in self.models.items(): + state_dict[f"{name}_params"] = model.state_dict() + for name, optim in self.optimizers.items(): + state_dict[f"{name}_optimizer"] = optim.state_dict() + return state_dict + + def set_state_dict(self, state_dict): + """Set state dict for a Updater. Parameters of models, states for + optimizers/schedulers and UpdaterState are restored.""" + for name, model in self.models.items(): + model.set_state_dict(state_dict[f"{name}_params"]) + for name, optim in self.optimizers.items(): + optim.set_state_dict(state_dict[f"{name}_optimizer"]) + super().set_state_dict(state_dict) diff --git a/ernie-sat/paddlespeech/s2t/training/updaters/trainer.py b/ernie-sat/paddlespeech/s2t/training/updaters/trainer.py new file mode 100644 index 0000000000000000000000000000000000000000..a0698c60a9c1318a417d3b71ee8906fb4f288bde --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/training/updaters/trainer.py @@ -0,0 +1,185 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
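`update_core` documents the single-model, single-optimizer convention: the model may return either one loss tensor or a dict of named losses, and when the dict has no `"main"` entry the parts are summed into one before `backward()`. A toy walk-through follows; note that `update()` as written references an undefined `Timier` helper (presumably `Timer`), so the sketch exercises `update_core()` directly. The model, scheduler and data are illustrative only.

```python
import paddle
from paddle import nn

from paddlespeech.s2t.training.reporter import ObsScope
from paddlespeech.s2t.training.updaters.standard_updater import StandardUpdater


class ToyModel(nn.Layer):
    def __init__(self):
        super().__init__()
        self.proj = nn.Linear(8, 8)

    def forward(self, x, y):
        # a dict without a "main" key: update_core sums the parts into "main"
        pred = self.proj(x)
        return {"l1": paddle.abs(pred - y).mean(), "mse": ((pred - y)**2).mean()}


model = ToyModel()
scheduler = paddle.optimizer.lr.StepDecay(learning_rate=1e-3, step_size=100)
optimizer = paddle.optimizer.Adam(learning_rate=scheduler,
                                  parameters=model.parameters())
batches = [(paddle.randn([4, 8]), paddle.randn([4, 8])) for _ in range(4)]

updater = StandardUpdater(model, optimizer, scheduler, dataloader=batches)

observation = {}
with ObsScope(observation):                    # report() writes into this dict
    updater.update_core(updater.read_batch())  # one forward/backward/step
print(observation)                             # {'l1': ..., 'mse': ..., 'main': ...}
print(updater.state_dict().keys())             # epoch, iteration, main_params, main_optimizer
```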
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from chainer(https://github.com/chainer/chainer) +import sys +import traceback +from collections import OrderedDict +from pathlib import Path +from typing import Callable +from typing import List +from typing import Union + +import six +import tqdm + +from paddlespeech.s2t.training.extensions.extension import Extension +from paddlespeech.s2t.training.extensions.extension import PRIORITY_READER +from paddlespeech.s2t.training.reporter import ObsScope +from paddlespeech.s2t.training.triggers import get_trigger +from paddlespeech.s2t.training.triggers.limit_trigger import LimitTrigger +from paddlespeech.s2t.training.updaters.updater import UpdaterBase + + +class _ExtensionEntry(): + def __init__(self, extension, trigger, priority): + self.extension = extension + self.trigger = trigger + self.priority = priority + + +class Trainer(): + def __init__(self, + updater: UpdaterBase, + stop_trigger: Callable=None, + out: Union[str, Path]='result', + extensions: List[Extension]=None): + self.updater = updater + self.extensions = OrderedDict() + self.stop_trigger = LimitTrigger(*stop_trigger) + self.out = Path(out) + self.observation = None + + self._done = False + if extensions: + for ext in extensions: + self.extend(ext) + + @property + def is_before_training(self): + return self.updater.state.iteration == 0 + + def extend(self, extension, name=None, trigger=None, priority=None): + # get name for the extension + # argument \ + # -> extention's name \ + # -> default_name (class name, when it is an object) \ + # -> function name when it is a function \ + # -> error + + if name is None: + name = getattr(extension, 'name', None) + if name is None: + name = getattr(extension, 'default_name', None) + if name is None: + name = getattr(extension, '__name__', None) + if name is None: + raise ValueError("Name is not given for the extension.") + if name == 'training': + raise ValueError("training is a reserved name.") + + if trigger is None: + trigger = getattr(extension, 'trigger', (1, 'iteration')) + trigger = get_trigger(trigger) + + if priority is None: + priority = getattr(extension, 'priority', PRIORITY_READER) + + # add suffix to avoid nameing conflict + ordinal = 0 + modified_name = name + while modified_name in self.extensions: + ordinal += 1 + modified_name = f"{name}_{ordinal}" + extension.name = modified_name + + self.extensions[modified_name] = _ExtensionEntry(extension, trigger, + priority) + + def get_extension(self, name): + """get extension by name.""" + extensions = self.extensions + if name in extensions: + return extensions[name].extension + else: + raise ValueError(f'extension {name} not found') + + def run(self): + if self._done: + raise RuntimeError("Training is already done!.") + + self.out.mkdir(parents=True, exist_ok=True) + + # sort extensions by priorities once + extension_order = sorted( + self.extensions.keys(), + key=lambda name: self.extensions[name].priority, + reverse=True) + extensions = [(name, self.extensions[name]) 
for name in extension_order] + + # initializing all extensions + for name, entry in extensions: + if hasattr(entry.extension, "initialize"): + entry.extension.initialize(self) + + update = self.updater.update # training step + stop_trigger = self.stop_trigger + + # display only one progress bar + max_iteration = None + if isinstance(stop_trigger, LimitTrigger): + if stop_trigger.unit == 'epoch': + max_epoch = self.stop_trigger.limit + updates_per_epoch = getattr(self.updater, "updates_per_epoch", + None) + max_iteration = max_epoch * updates_per_epoch if updates_per_epoch else None + else: + max_iteration = self.stop_trigger.limit + + p = tqdm.tqdm(initial=self.updater.state.iteration, total=max_iteration) + + try: + while not stop_trigger(self): + self.observation = {} + # set observation as the `report` target + # you can use `report` freely in Updater.update() + + # updating parameters and state + with ObsScope(self.observation): + update() + p.update() + + # execute extension when necessary + for name, entry in extensions: + if entry.trigger(self): + entry.extension(self) + + # print("###", self.observation) + except Exception as e: + f = sys.stderr + f.write(f"Exception in main training loop: {e}\n") + f.write("Traceback (most recent call last):\n") + traceback.print_tb(sys.exc_info()[2]) + f.write( + "Trainer extensions will try to handle the extension. Then all extensions will finalize." + ) + + # capture the exception in the mian training loop + exc_info = sys.exc_info() + + # try to handle it + for name, entry in extensions: + if hasattr(entry.extension, "on_error"): + try: + entry.extension.on_error(self, e, sys.exc_info()[2]) + except Exception as ee: + f.write(f"Exception in error handler: {ee}\n") + f.write('Traceback (most recent call last):\n') + traceback.print_tb(sys.exc_info()[2]) + + # raise exception in main training loop + six.reraise(*exc_info) + finally: + for name, entry in extensions: + if hasattr(entry.extension, "finalize"): + entry.extension.finalize(self) diff --git a/ernie-sat/paddlespeech/s2t/training/updaters/updater.py b/ernie-sat/paddlespeech/s2t/training/updaters/updater.py new file mode 100644 index 0000000000000000000000000000000000000000..6875deb3d2bcf50bd962d66d3b7ee442ae1e4b39 --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/training/updaters/updater.py @@ -0,0 +1,85 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from chainer(https://github.com/chainer/chainer) +from dataclasses import dataclass + +import paddle + +from paddlespeech.s2t.utils.log import Log + +__all__ = ["UpdaterBase", "UpdaterState"] + +logger = Log(__name__).getlog() + + +@dataclass +class UpdaterState: + iteration: int = 0 + epoch: int = 0 + + +class UpdaterBase(): + """An updater is the abstraction of how a model is trained given the + dataloader and the optimizer. 
+ The `update_core` method is a step in the training loop with only necessary + operations (get a batch, forward and backward, update the parameters). + Other stuffs are made extensions. Visualization, saving, loading and + periodical validation and evaluation are not considered here. + But even in such simplist case, things are not that simple. There is an + attempt to standardize this process and requires only the model and + dataset and do all the stuffs automatically. But this may hurt flexibility. + If we assume a batch yield from the dataloader is just the input to the + model, we will find that some model requires more arguments, or just some + keyword arguments. But this prevents us from over-simplifying it. + From another perspective, the batch may includes not just the input, but + also the target. But the model's forward method may just need the input. + We can pass a dict or a super-long tuple to the model and let it pick what + it really needs. But this is an abuse of lazy interface. + After all, we care about how a model is trained. But just how the model is + used for inference. We want to control how a model is trained. We just + don't want to be messed up with other auxiliary code. + So the best practice is to define a model and define a updater for it. + """ + + def __init__(self, init_state=None): + # init state + if init_state is None: + self.state = UpdaterState() + else: + self.state = init_state + + def update(self, batch): + raise NotImplementedError( + "Implement your own `update` method for training a step.") + + def state_dict(self): + state_dict = { + "epoch": self.state.epoch, + "iteration": self.state.iteration, + } + return state_dict + + def set_state_dict(self, state_dict): + self.state.epoch = state_dict["epoch"] + self.state.iteration = state_dict["iteration"] + + def save(self, path): + logger.debug(f"Saving to {path}.") + archive = self.state_dict() + paddle.save(archive, str(path)) + + def load(self, path): + logger.debug(f"Loading from {path}.") + archive = paddle.load(str(path)) + self.set_state_dict(archive) diff --git a/ernie-sat/paddlespeech/s2t/transform/__init__.py b/ernie-sat/paddlespeech/s2t/transform/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..185a92b8d94d3426d616c0624f0f2ee04339349e --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/transform/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/ernie-sat/paddlespeech/s2t/transform/add_deltas.py b/ernie-sat/paddlespeech/s2t/transform/add_deltas.py new file mode 100644 index 0000000000000000000000000000000000000000..1387fe9da643b4dce7ea7887d50d3f1119722661 --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/transform/add_deltas.py @@ -0,0 +1,54 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
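Putting the pieces above together: an `UpdaterBase` subclass supplies `update()`, the `Trainer` drives it until the `LimitTrigger` built from `stop_trigger` fires, and extensions run on their own interval triggers inside the observation scope. A runnable toy, assuming the vendored package is importable; `CountingUpdater` and `log_progress` are hypothetical stand-ins for a real updater and extension.

```python
from paddlespeech.s2t.training.reporter import report
from paddlespeech.s2t.training.updaters.trainer import Trainer
from paddlespeech.s2t.training.updaters.updater import UpdaterBase


class CountingUpdater(UpdaterBase):
    """A toy updater whose 'training step' only bumps the iteration counter."""

    def update(self):
        self.state.iteration += 1
        report("train/iteration", self.state.iteration)


def log_progress(trainer):                       # any callable can act as an extension
    print("observation:", trainer.observation)


trainer = Trainer(CountingUpdater(), stop_trigger=(5, "iteration"), out="exp/toy")
trainer.extend(log_progress, trigger=(2, "iteration"))
trainer.run()                                    # 5 toy steps; the extension fires at 2 and 4
```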
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from espnet(https://github.com/espnet/espnet) +import numpy as np + + +def delta(feat, window): + assert window > 0 + delta_feat = np.zeros_like(feat) + for i in range(1, window + 1): + delta_feat[:-i] += i * feat[i:] + delta_feat[i:] += -i * feat[:-i] + delta_feat[-i:] += i * feat[-1] + delta_feat[:i] += -i * feat[0] + delta_feat /= 2 * sum(i**2 for i in range(1, window + 1)) + return delta_feat + + +def add_deltas(x, window=2, order=2): + """ + Args: + x (np.ndarray): speech feat, (T, D). + + Return: + np.ndarray: (T, (1+order)*D) + """ + feats = [x] + for _ in range(order): + feats.append(delta(feats[-1], window)) + return np.concatenate(feats, axis=1) + + +class AddDeltas(): + def __init__(self, window=2, order=2): + self.window = window + self.order = order + + def __repr__(self): + return "{name}(window={window}, order={order}".format( + name=self.__class__.__name__, window=self.window, order=self.order) + + def __call__(self, x): + return add_deltas(x, window=self.window, order=self.order) diff --git a/ernie-sat/paddlespeech/s2t/transform/channel_selector.py b/ernie-sat/paddlespeech/s2t/transform/channel_selector.py new file mode 100644 index 0000000000000000000000000000000000000000..b078dcf81b9edab3531160b097afbcebd1f807e4 --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/transform/channel_selector.py @@ -0,0 +1,57 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
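`add_deltas` stacks the static features with their first- and second-order deltas along the feature axis, so the output width is `(1 + order) * D`. A quick shape check, assuming the vendored module path added by this patch:

```python
import numpy as np

from paddlespeech.s2t.transform.add_deltas import add_deltas

feat = np.random.randn(100, 40).astype(np.float32)   # (T, D) log-mel style features
out = add_deltas(feat, window=2, order=2)

print(out.shape)                       # (100, 120) == (T, (1 + order) * D)
assert np.allclose(out[:, :40], feat)  # the first block is the unmodified features
```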
+# Modified from espnet(https://github.com/espnet/espnet) +import numpy + + +class ChannelSelector(): + """Select 1ch from multi-channel signal""" + + def __init__(self, train_channel="random", eval_channel=0, axis=1): + self.train_channel = train_channel + self.eval_channel = eval_channel + self.axis = axis + + def __repr__(self): + return ("{name}(train_channel={train_channel}, " + "eval_channel={eval_channel}, axis={axis})".format( + name=self.__class__.__name__, + train_channel=self.train_channel, + eval_channel=self.eval_channel, + axis=self.axis, )) + + def __call__(self, x, train=True): + # Assuming x: [Time, Channel] by default + + if x.ndim <= self.axis: + # If the dimension is insufficient, then unsqueeze + # (e.g [Time] -> [Time, 1]) + ind = tuple( + slice(None) if i < x.ndim else None + for i in range(self.axis + 1)) + x = x[ind] + + if train: + channel = self.train_channel + else: + channel = self.eval_channel + + if channel == "random": + ch = numpy.random.randint(0, x.shape[self.axis]) + else: + ch = channel + + ind = tuple( + slice(None) if i != self.axis else ch for i in range(x.ndim)) + return x[ind] diff --git a/ernie-sat/paddlespeech/s2t/transform/cmvn.py b/ernie-sat/paddlespeech/s2t/transform/cmvn.py new file mode 100644 index 0000000000000000000000000000000000000000..2db0070bf38e690fdeb4c4212e2f683773574703 --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/transform/cmvn.py @@ -0,0 +1,201 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from espnet(https://github.com/espnet/espnet) +import io +import json + +import h5py +import kaldiio +import numpy as np + + +class CMVN(): + "Apply Global/Spk CMVN/iverserCMVN." 
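`ChannelSelector` reduces a multi-channel signal to a single channel: a random one while training, the fixed `eval_channel` otherwise. For example:

```python
import numpy as np

from paddlespeech.s2t.transform.channel_selector import ChannelSelector

x = np.random.randn(16000, 4)                  # 1 s of 4-channel audio at 16 kHz
selector = ChannelSelector(train_channel="random", eval_channel=0, axis=1)

y_train = selector(x, train=True)              # a randomly chosen channel, shape (16000,)
y_eval = selector(x, train=False)              # always channel 0 at eval time
assert np.array_equal(y_eval, x[:, 0])
```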
+ + def __init__( + self, + stats, + norm_means=True, + norm_vars=False, + filetype="mat", + utt2spk=None, + spk2utt=None, + reverse=False, + std_floor=1.0e-20, ): + self.stats_file = stats + self.norm_means = norm_means + self.norm_vars = norm_vars + self.reverse = reverse + + if isinstance(stats, dict): + stats_dict = dict(stats) + else: + # Use for global CMVN + if filetype == "mat": + stats_dict = {None: kaldiio.load_mat(stats)} + # Use for global CMVN + elif filetype == "npy": + stats_dict = {None: np.load(stats)} + # Use for speaker CMVN + elif filetype == "ark": + self.accept_uttid = True + stats_dict = dict(kaldiio.load_ark(stats)) + # Use for speaker CMVN + elif filetype == "hdf5": + self.accept_uttid = True + stats_dict = h5py.File(stats) + else: + raise ValueError("Not supporting filetype={}".format(filetype)) + + if utt2spk is not None: + self.utt2spk = {} + with io.open(utt2spk, "r", encoding="utf-8") as f: + for line in f: + utt, spk = line.rstrip().split(None, 1) + self.utt2spk[utt] = spk + elif spk2utt is not None: + self.utt2spk = {} + with io.open(spk2utt, "r", encoding="utf-8") as f: + for line in f: + spk, utts = line.rstrip().split(None, 1) + for utt in utts.split(): + self.utt2spk[utt] = spk + else: + self.utt2spk = None + + # Kaldi makes a matrix for CMVN which has a shape of (2, feat_dim + 1), + # and the first vector contains the sum of feats and the second is + # the sum of squares. The last value of the first, i.e. stats[0,-1], + # is the number of samples for this statistics. + self.bias = {} + self.scale = {} + for spk, stats in stats_dict.items(): + assert len(stats) == 2, stats.shape + + count = stats[0, -1] + + # If the feature has two or more dimensions + if not (np.isscalar(count) or isinstance(count, (int, float))): + # The first is only used + count = count.flatten()[0] + + mean = stats[0, :-1] / count + # V(x) = E(x^2) - (E(x))^2 + var = stats[1, :-1] / count - mean * mean + std = np.maximum(np.sqrt(var), std_floor) + self.bias[spk] = -mean + self.scale[spk] = 1 / std + + def __repr__(self): + return ("{name}(stats_file={stats_file}, " + "norm_means={norm_means}, norm_vars={norm_vars}, " + "reverse={reverse})".format( + name=self.__class__.__name__, + stats_file=self.stats_file, + norm_means=self.norm_means, + norm_vars=self.norm_vars, + reverse=self.reverse, )) + + def __call__(self, x, uttid=None): + if self.utt2spk is not None: + spk = self.utt2spk[uttid] + else: + spk = uttid + + if not self.reverse: + # apply cmvn + if self.norm_means: + x = np.add(x, self.bias[spk]) + if self.norm_vars: + x = np.multiply(x, self.scale[spk]) + + else: + # apply reverse cmvn + if self.norm_vars: + x = np.divide(x, self.scale[spk]) + if self.norm_means: + x = np.subtract(x, self.bias[spk]) + + return x + + +class UtteranceCMVN(): + "Apply Utterance CMVN" + + def __init__(self, norm_means=True, norm_vars=False, std_floor=1.0e-20): + self.norm_means = norm_means + self.norm_vars = norm_vars + self.std_floor = std_floor + + def __repr__(self): + return "{name}(norm_means={norm_means}, norm_vars={norm_vars})".format( + name=self.__class__.__name__, + norm_means=self.norm_means, + norm_vars=self.norm_vars, ) + + def __call__(self, x, uttid=None): + # x: [Time, Dim] + square_sums = (x**2).sum(axis=0) + mean = x.mean(axis=0) + + if self.norm_means: + x = np.subtract(x, mean) + + if self.norm_vars: + var = square_sums / x.shape[0] - mean**2 + std = np.maximum(np.sqrt(var), self.std_floor) + x = np.divide(x, std) + + return x + + +class GlobalCMVN(): + "Apply Global CMVN" + + 
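The comment in `CMVN.__init__` describes the Kaldi statistics layout: a `(2, feat_dim + 1)` matrix whose first row holds the per-dimension feature sums (with the frame count in the last cell) and whose second row holds the sums of squares. A worked toy example of that layout, feeding the stats in directly as a dict for the global-CMVN case (the module itself imports `kaldiio` and `h5py`, so those must be installed even though they are not used here):

```python
import numpy as np

from paddlespeech.s2t.transform.cmvn import CMVN

feats = np.random.randn(200, 40) * 3.0 + 1.5   # (T, D) features with non-zero mean/var

stats = np.zeros((2, 41))
stats[0, :40] = feats.sum(axis=0)              # row 0: sum of features
stats[1, :40] = (feats**2).sum(axis=0)         # row 1: sum of squared features
stats[0, -1] = feats.shape[0]                  # frame count in the last cell of row 0

cmvn = CMVN(stats={None: stats}, norm_means=True, norm_vars=True)
normalized = cmvn(feats)
print(normalized.mean(), normalized.std())     # approximately 0 and 1
```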
def __init__(self, + cmvn_path, + norm_means=True, + norm_vars=True, + std_floor=1.0e-20): + # cmvn_path: Option[str, dict] + cmvn = cmvn_path + self.cmvn = cmvn + self.norm_means = norm_means + self.norm_vars = norm_vars + self.std_floor = std_floor + if isinstance(cmvn, dict): + cmvn_stats = cmvn + else: + with open(cmvn) as f: + cmvn_stats = json.load(f) + self.count = cmvn_stats['frame_num'] + self.mean = np.array(cmvn_stats['mean_stat']) / self.count + self.square_sums = np.array(cmvn_stats['var_stat']) + self.var = self.square_sums / self.count - self.mean**2 + self.std = np.maximum(np.sqrt(self.var), self.std_floor) + + def __repr__(self): + return f"""{self.__class__.__name__}( + cmvn_path={self.cmvn}, + norm_means={self.norm_means}, + norm_vars={self.norm_vars},)""" + + def __call__(self, x, uttid=None): + # x: [Time, Dim] + if self.norm_means: + x = np.subtract(x, self.mean) + + if self.norm_vars: + x = np.divide(x, self.std) + return x diff --git a/ernie-sat/paddlespeech/s2t/transform/functional.py b/ernie-sat/paddlespeech/s2t/transform/functional.py new file mode 100644 index 0000000000000000000000000000000000000000..ccb500819e171bada581811905737fbfd7af015d --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/transform/functional.py @@ -0,0 +1,86 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from espnet(https://github.com/espnet/espnet) +import inspect + +from paddlespeech.s2t.transform.transform_interface import TransformInterface +from paddlespeech.s2t.utils.check_kwargs import check_kwargs + + +class FuncTrans(TransformInterface): + """Functional Transformation + + WARNING: + Builtin or C/C++ functions may not work properly + because this class heavily depends on the `inspect` module. + + Usage: + + >>> def foo_bar(x, a=1, b=2): + ... '''Foo bar + ... :param x: input + ... :param int a: default 1 + ... :param int b: default 2 + ... ''' + ... return x + a - b + + + >>> class FooBar(FuncTrans): + ... _func = foo_bar + ... __doc__ = foo_bar.__doc__ + """ + + _func = None + + def __init__(self, **kwargs): + self.kwargs = kwargs + check_kwargs(self.func, kwargs) + + def __call__(self, x): + return self.func(x, **self.kwargs) + + @classmethod + def add_arguments(cls, parser): + fname = cls._func.__name__.replace("_", "-") + group = parser.add_argument_group(fname + " transformation setting") + for k, v in cls.default_params().items(): + # TODO(karita): get help and choices from docstring? 
+ attr = k.replace("_", "-") + group.add_argument(f"--{fname}-{attr}", default=v, type=type(v)) + return parser + + @property + def func(self): + return type(self)._func + + @classmethod + def default_params(cls): + try: + d = dict(inspect.signature(cls._func).parameters) + except ValueError: + d = dict() + return { + k: v.default + for k, v in d.items() if v.default != inspect.Parameter.empty + } + + def __repr__(self): + params = self.default_params() + params.update(**self.kwargs) + ret = self.__class__.__name__ + "(" + if len(params) == 0: + return ret + ")" + for k, v in params.items(): + ret += "{}={}, ".format(k, v) + return ret[:-2] + ")" diff --git a/ernie-sat/paddlespeech/s2t/transform/perturb.py b/ernie-sat/paddlespeech/s2t/transform/perturb.py new file mode 100644 index 0000000000000000000000000000000000000000..9e41b824b6ed0261db5acb24fb5e0aff2a4758fa --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/transform/perturb.py @@ -0,0 +1,470 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from espnet(https://github.com/espnet/espnet) +import librosa +import numpy +import scipy +import soundfile + +from paddlespeech.s2t.io.reader import SoundHDF5File + + +class SpeedPerturbation(): + """SpeedPerturbation + + The speed perturbation in kaldi uses sox-speed instead of sox-tempo, + and sox-speed just to resample the input, + i.e pitch and tempo are changed both. + + "Why use speed option instead of tempo -s in SoX for speed perturbation" + https://groups.google.com/forum/#!topic/kaldi-help/8OOG7eE4sZ8 + + Warning: + This function is very slow because of resampling. + I recommmend to apply speed-perturb outside the training using sox. 
+ + """ + + def __init__( + self, + lower=0.9, + upper=1.1, + utt2ratio=None, + keep_length=True, + res_type="kaiser_best", + seed=None, ): + self.res_type = res_type + self.keep_length = keep_length + self.state = numpy.random.RandomState(seed) + + if utt2ratio is not None: + self.utt2ratio = {} + # Use the scheduled ratio for each utterances + self.utt2ratio_file = utt2ratio + self.lower = None + self.upper = None + self.accept_uttid = True + + with open(utt2ratio, "r") as f: + for line in f: + utt, ratio = line.rstrip().split(None, 1) + ratio = float(ratio) + self.utt2ratio[utt] = ratio + else: + self.utt2ratio = None + # The ratio is given on runtime randomly + self.lower = lower + self.upper = upper + + def __repr__(self): + if self.utt2ratio is None: + return "{}(lower={}, upper={}, " "keep_length={}, res_type={})".format( + self.__class__.__name__, + self.lower, + self.upper, + self.keep_length, + self.res_type, ) + else: + return "{}({}, res_type={})".format( + self.__class__.__name__, self.utt2ratio_file, self.res_type) + + def __call__(self, x, uttid=None, train=True): + if not train: + return x + x = x.astype(numpy.float32) + if self.accept_uttid: + ratio = self.utt2ratio[uttid] + else: + ratio = self.state.uniform(self.lower, self.upper) + + # Note1: resample requires the sampling-rate of input and output, + # but actually only the ratio is used. + y = librosa.resample( + x, orig_sr=ratio, target_sr=1, res_type=self.res_type) + + if self.keep_length: + diff = abs(len(x) - len(y)) + if len(y) > len(x): + # Truncate noise + y = y[diff // 2:-((diff + 1) // 2)] + elif len(y) < len(x): + # Assume the time-axis is the first: (Time, Channel) + pad_width = [(diff // 2, (diff + 1) // 2)] + [ + (0, 0) for _ in range(y.ndim - 1) + ] + y = numpy.pad( + y, pad_width=pad_width, constant_values=0, mode="constant") + return y + + +class SpeedPerturbationSox(): + """SpeedPerturbationSox + + The speed perturbation in kaldi uses sox-speed instead of sox-tempo, + and sox-speed just to resample the input, + i.e pitch and tempo are changed both. + + To speed up or slow down the sound of a file, + use speed to modify the pitch and the duration of the file. + This raises the speed and reduces the time. + The default factor is 1.0 which makes no change to the audio. + 2.0 doubles speed, thus time length is cut by a half and pitch is one interval higher. + + "Why use speed option instead of tempo -s in SoX for speed perturbation" + https://groups.google.com/forum/#!topic/kaldi-help/8OOG7eE4sZ8 + + tempo option: + sox -t wav input.wav -t wav output.tempo0.9.wav tempo -s 0.9 + + speed option: + sox -t wav input.wav -t wav output.speed0.9.wav speed 0.9 + + If we use speed option like above, the pitch of audio also will be changed, + but the tempo option does not change the pitch. 
+ """ + + def __init__( + self, + lower=0.9, + upper=1.1, + utt2ratio=None, + keep_length=True, + sr=16000, + seed=None, ): + self.sr = sr + self.keep_length = keep_length + self.state = numpy.random.RandomState(seed) + + try: + import soxbindings as sox + except ImportError: + try: + from paddlespeech.s2t.utils import dynamic_pip_install + package = "sox" + dynamic_pip_install.install(package) + package = "soxbindings" + dynamic_pip_install.install(package) + import soxbindings as sox + except Exception: + raise RuntimeError( + "Can not install soxbindings on your system.") + self.sox = sox + + if utt2ratio is not None: + self.utt2ratio = {} + # Use the scheduled ratio for each utterances + self.utt2ratio_file = utt2ratio + self.lower = None + self.upper = None + self.accept_uttid = True + + with open(utt2ratio, "r") as f: + for line in f: + utt, ratio = line.rstrip().split(None, 1) + ratio = float(ratio) + self.utt2ratio[utt] = ratio + else: + self.utt2ratio = None + # The ratio is given on runtime randomly + self.lower = lower + self.upper = upper + + def __repr__(self): + if self.utt2ratio is None: + return f"""{self.__class__.__name__}( + lower={self.lower}, + upper={self.upper}, + keep_length={self.keep_length}, + sample_rate={self.sr})""" + + else: + return f"""{self.__class__.__name__}( + utt2ratio={self.utt2ratio_file}, + sample_rate={self.sr})""" + + def __call__(self, x, uttid=None, train=True): + if not train: + return x + + x = x.astype(numpy.float32) + if self.accept_uttid: + ratio = self.utt2ratio[uttid] + else: + ratio = self.state.uniform(self.lower, self.upper) + + tfm = self.sox.Transformer() + tfm.set_globals(multithread=False) + tfm.speed(ratio) + y = tfm.build_array(input_array=x, sample_rate_in=self.sr) + + if self.keep_length: + diff = abs(len(x) - len(y)) + if len(y) > len(x): + # Truncate noise + y = y[diff // 2:-((diff + 1) // 2)] + elif len(y) < len(x): + # Assume the time-axis is the first: (Time, Channel) + pad_width = [(diff // 2, (diff + 1) // 2)] + [ + (0, 0) for _ in range(y.ndim - 1) + ] + y = numpy.pad( + y, pad_width=pad_width, constant_values=0, mode="constant") + + if y.ndim == 2 and x.ndim == 1: + # (T, C) -> (T) + y = y.sequence(1) + return y + + +class BandpassPerturbation(): + """BandpassPerturbation + + Randomly dropout along the frequency axis. + + The original idea comes from the following: + "randomly-selected frequency band was cut off under the constraint of + leaving at least 1,000 Hz band within the range of less than 4,000Hz." 
+ (The Hitachi/JHU CHiME-5 system: Advances in speech recognition for + everyday home environments using multiple microphone arrays; + http://spandh.dcs.shef.ac.uk/chime_workshop/papers/CHiME_2018_paper_kanda.pdf) + + """ + + def __init__(self, lower=0.0, upper=0.75, seed=None, axes=(-1, )): + self.lower = lower + self.upper = upper + self.state = numpy.random.RandomState(seed) + # x_stft: (Time, Channel, Freq) + self.axes = axes + + def __repr__(self): + return "{}(lower={}, upper={})".format(self.__class__.__name__, + self.lower, self.upper) + + def __call__(self, x_stft, uttid=None, train=True): + if not train: + return x_stft + + if x_stft.ndim == 1: + raise RuntimeError("Input in time-freq domain: " + "(Time, Channel, Freq) or (Time, Freq)") + + ratio = self.state.uniform(self.lower, self.upper) + axes = [i if i >= 0 else x_stft.ndim - i for i in self.axes] + shape = [s if i in axes else 1 for i, s in enumerate(x_stft.shape)] + + mask = self.state.randn(*shape) > ratio + x_stft *= mask + return x_stft + + +class VolumePerturbation(): + def __init__(self, + lower=-1.6, + upper=1.6, + utt2ratio=None, + dbunit=True, + seed=None): + self.dbunit = dbunit + self.utt2ratio_file = utt2ratio + self.lower = lower + self.upper = upper + self.state = numpy.random.RandomState(seed) + + if utt2ratio is not None: + # Use the scheduled ratio for each utterances + self.utt2ratio = {} + self.lower = None + self.upper = None + self.accept_uttid = True + + with open(utt2ratio, "r") as f: + for line in f: + utt, ratio = line.rstrip().split(None, 1) + ratio = float(ratio) + self.utt2ratio[utt] = ratio + else: + # The ratio is given on runtime randomly + self.utt2ratio = None + + def __repr__(self): + if self.utt2ratio is None: + return "{}(lower={}, upper={}, dbunit={})".format( + self.__class__.__name__, self.lower, self.upper, self.dbunit) + else: + return '{}("{}", dbunit={})'.format( + self.__class__.__name__, self.utt2ratio_file, self.dbunit) + + def __call__(self, x, uttid=None, train=True): + if not train: + return x + + x = x.astype(numpy.float32) + + if self.accept_uttid: + ratio = self.utt2ratio[uttid] + else: + ratio = self.state.uniform(self.lower, self.upper) + if self.dbunit: + ratio = 10**(ratio / 20) + return x * ratio + + +class NoiseInjection(): + """Add isotropic noise""" + + def __init__( + self, + utt2noise=None, + lower=-20, + upper=-5, + utt2ratio=None, + filetype="list", + dbunit=True, + seed=None, ): + self.utt2noise_file = utt2noise + self.utt2ratio_file = utt2ratio + self.filetype = filetype + self.dbunit = dbunit + self.lower = lower + self.upper = upper + self.state = numpy.random.RandomState(seed) + + if utt2ratio is not None: + # Use the scheduled ratio for each utterances + self.utt2ratio = {} + with open(utt2noise, "r") as f: + for line in f: + utt, snr = line.rstrip().split(None, 1) + snr = float(snr) + self.utt2ratio[utt] = snr + else: + # The ratio is given on runtime randomly + self.utt2ratio = None + + if utt2noise is not None: + self.utt2noise = {} + if filetype == "list": + with open(utt2noise, "r") as f: + for line in f: + utt, filename = line.rstrip().split(None, 1) + signal, rate = soundfile.read(filename, dtype="int16") + # Load all files in memory + self.utt2noise[utt] = (signal, rate) + + elif filetype == "sound.hdf5": + self.utt2noise = SoundHDF5File(utt2noise, "r") + else: + raise ValueError(filetype) + else: + self.utt2noise = None + + if utt2noise is not None and utt2ratio is not None: + if set(self.utt2ratio) != set(self.utt2noise): + raise 
RuntimeError("The uttids mismatch between {} and {}". + format(utt2ratio, utt2noise)) + + def __repr__(self): + if self.utt2ratio is None: + return "{}(lower={}, upper={}, dbunit={})".format( + self.__class__.__name__, self.lower, self.upper, self.dbunit) + else: + return '{}("{}", dbunit={})'.format( + self.__class__.__name__, self.utt2ratio_file, self.dbunit) + + def __call__(self, x, uttid=None, train=True): + if not train: + return x + x = x.astype(numpy.float32) + + # 1. Get ratio of noise to signal in sound pressure level + if uttid is not None and self.utt2ratio is not None: + ratio = self.utt2ratio[uttid] + else: + ratio = self.state.uniform(self.lower, self.upper) + + if self.dbunit: + ratio = 10**(ratio / 20) + scale = ratio * numpy.sqrt((x**2).mean()) + + # 2. Get noise + if self.utt2noise is not None: + # Get noise from the external source + if uttid is not None: + noise, rate = self.utt2noise[uttid] + else: + # Randomly select the noise source + noise = self.state.choice(list(self.utt2noise.values())) + # Normalize the level + noise /= numpy.sqrt((noise**2).mean()) + + # Adjust the noise length + diff = abs(len(x) - len(noise)) + offset = self.state.randint(0, diff) + if len(noise) > len(x): + # Truncate noise + noise = noise[offset:-(diff - offset)] + else: + noise = numpy.pad( + noise, pad_width=[offset, diff - offset], mode="wrap") + + else: + # Generate white noise + noise = self.state.normal(0, 1, x.shape) + + # 3. Add noise to signal + return x + noise * scale + + +class RIRConvolve(): + def __init__(self, utt2rir, filetype="list"): + self.utt2rir_file = utt2rir + self.filetype = filetype + + self.utt2rir = {} + if filetype == "list": + with open(utt2rir, "r") as f: + for line in f: + utt, filename = line.rstrip().split(None, 1) + signal, rate = soundfile.read(filename, dtype="int16") + self.utt2rir[utt] = (signal, rate) + + elif filetype == "sound.hdf5": + self.utt2rir = SoundHDF5File(utt2rir, "r") + else: + raise NotImplementedError(filetype) + + def __repr__(self): + return '{}("{}")'.format(self.__class__.__name__, self.utt2rir_file) + + def __call__(self, x, uttid=None, train=True): + if not train: + return x + + x = x.astype(numpy.float32) + + if x.ndim != 1: + # Must be single channel + raise RuntimeError( + "Input x must be one dimensional array, but got {}".format( + x.shape)) + + rir, rate = self.utt2rir[uttid] + if rir.ndim == 2: + # FIXME(kamo): Use chainer.convolution_1d? + # return [Time, Channel] + return numpy.stack( + [scipy.convolve(x, r, mode="same") for r in rir], axis=-1) + else: + return scipy.convolve(x, rir, mode="same") diff --git a/ernie-sat/paddlespeech/s2t/transform/spec_augment.py b/ernie-sat/paddlespeech/s2t/transform/spec_augment.py new file mode 100644 index 0000000000000000000000000000000000000000..5ce950851a4ee6dbaa2bcbe529cbc89ce714a60b --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/transform/spec_augment.py @@ -0,0 +1,214 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
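`NoiseInjection.__call__` converts the dB ratio to a linear gain, scales it by the RMS of the clean signal, and falls back to white Gaussian noise when no external noise sources are configured. A small check of that fallback path, with the gain pinned to -20 dB (assuming the vendored package and its audio dependencies are installed):

```python
import numpy as np

from paddlespeech.s2t.transform.perturb import NoiseInjection

x = np.sin(2 * np.pi * 440 * np.arange(16000) / 16000).astype(np.float32)  # 1 s tone

inject = NoiseInjection(lower=-20, upper=-20, seed=0)   # pin the gain at -20 dB
y = inject(x, train=True)

noise_rms = np.sqrt(((y - x)**2).mean())
signal_rms = np.sqrt((x**2).mean())
print(20 * np.log10(noise_rms / signal_rms))            # close to -20 dB
```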
+# Modified from espnet(https://github.com/espnet/espnet) +"""Spec Augment module for preprocessing i.e., data augmentation""" +import random + +import numpy +from PIL import Image +from PIL.Image import BICUBIC + +from paddlespeech.s2t.transform.functional import FuncTrans + + +def time_warp(x, max_time_warp=80, inplace=False, mode="PIL"): + """time warp for spec augment + + move random center frame by the random width ~ uniform(-window, window) + :param numpy.ndarray x: spectrogram (time, freq) + :param int max_time_warp: maximum time frames to warp + :param bool inplace: overwrite x with the result + :param str mode: "PIL" (default, fast, not differentiable) or "sparse_image_warp" + (slow, differentiable) + :returns numpy.ndarray: time warped spectrogram (time, freq) + """ + window = max_time_warp + if window == 0: + return x + + if mode == "PIL": + t = x.shape[0] + if t - window <= window: + return x + # NOTE: randrange(a, b) emits a, a + 1, ..., b - 1 + center = random.randrange(window, t - window) + warped = random.randrange(center - window, center + + window) + 1 # 1 ... t - 1 + + left = Image.fromarray(x[:center]).resize((x.shape[1], warped), BICUBIC) + right = Image.fromarray(x[center:]).resize((x.shape[1], t - warped), + BICUBIC) + if inplace: + x[:warped] = left + x[warped:] = right + return x + return numpy.concatenate((left, right), 0) + elif mode == "sparse_image_warp": + import paddle + + from espnet.utils import spec_augment + + # TODO(karita): make this differentiable again + return spec_augment.time_warp(paddle.to_tensor(x), window).numpy() + else: + raise NotImplementedError("unknown resize mode: " + mode + + ", choose one from (PIL, sparse_image_warp).") + + +class TimeWarp(FuncTrans): + _func = time_warp + __doc__ = time_warp.__doc__ + + def __call__(self, x, train): + if not train: + return x + return super().__call__(x) + + +def freq_mask(x, F=30, n_mask=2, replace_with_zero=True, inplace=False): + """freq mask for spec agument + + :param numpy.ndarray x: (time, freq) + :param int n_mask: the number of masks + :param bool inplace: overwrite + :param bool replace_with_zero: pad zero on mask if true else use mean + """ + if inplace: + cloned = x + else: + cloned = x.copy() + + num_mel_channels = cloned.shape[1] + fs = numpy.random.randint(0, F, size=(n_mask, 2)) + + for f, mask_end in fs: + f_zero = random.randrange(0, num_mel_channels - f) + mask_end += f_zero + + # avoids randrange error if values are equal and range is empty + if f_zero == f_zero + f: + continue + + if replace_with_zero: + cloned[:, f_zero:mask_end] = 0 + else: + cloned[:, f_zero:mask_end] = cloned.mean() + return cloned + + +class FreqMask(FuncTrans): + _func = freq_mask + __doc__ = freq_mask.__doc__ + + def __call__(self, x, train): + if not train: + return x + return super().__call__(x) + + +def time_mask(spec, T=40, n_mask=2, replace_with_zero=True, inplace=False): + """freq mask for spec agument + + :param numpy.ndarray spec: (time, freq) + :param int n_mask: the number of masks + :param bool inplace: overwrite + :param bool replace_with_zero: pad zero on mask if true else use mean + """ + if inplace: + cloned = spec + else: + cloned = spec.copy() + len_spectro = cloned.shape[0] + ts = numpy.random.randint(0, T, size=(n_mask, 2)) + for t, mask_end in ts: + # avoid randint range error + if len_spectro - t <= 0: + continue + t_zero = random.randrange(0, len_spectro - t) + + # avoids randrange error if values are equal and range is empty + if t_zero == t_zero + t: + continue + + mask_end += t_zero 
+ if replace_with_zero: + cloned[t_zero:mask_end] = 0 + else: + cloned[t_zero:mask_end] = cloned.mean() + return cloned + + +class TimeMask(FuncTrans): + _func = time_mask + __doc__ = time_mask.__doc__ + + def __call__(self, x, train): + if not train: + return x + return super().__call__(x) + + +def spec_augment( + x, + resize_mode="PIL", + max_time_warp=80, + max_freq_width=27, + n_freq_mask=2, + max_time_width=100, + n_time_mask=2, + inplace=True, + replace_with_zero=True, ): + """spec agument + + apply random time warping and time/freq masking + default setting is based on LD (Librispeech double) in Table 2 + https://arxiv.org/pdf/1904.08779.pdf + + :param numpy.ndarray x: (time, freq) + :param str resize_mode: "PIL" (fast, nondifferentiable) or "sparse_image_warp" + (slow, differentiable) + :param int max_time_warp: maximum frames to warp the center frame in spectrogram (W) + :param int freq_mask_width: maximum width of the random freq mask (F) + :param int n_freq_mask: the number of the random freq mask (m_F) + :param int time_mask_width: maximum width of the random time mask (T) + :param int n_time_mask: the number of the random time mask (m_T) + :param bool inplace: overwrite intermediate array + :param bool replace_with_zero: pad zero on mask if true else use mean + """ + assert isinstance(x, numpy.ndarray) + assert x.ndim == 2 + x = time_warp(x, max_time_warp, inplace=inplace, mode=resize_mode) + x = freq_mask( + x, + max_freq_width, + n_freq_mask, + inplace=inplace, + replace_with_zero=replace_with_zero, ) + x = time_mask( + x, + max_time_width, + n_time_mask, + inplace=inplace, + replace_with_zero=replace_with_zero, ) + return x + + +class SpecAugment(FuncTrans): + _func = spec_augment + __doc__ = spec_augment.__doc__ + + def __call__(self, x, train): + if not train: + return x + return super().__call__(x) diff --git a/ernie-sat/paddlespeech/s2t/transform/spectrogram.py b/ernie-sat/paddlespeech/s2t/transform/spectrogram.py new file mode 100644 index 0000000000000000000000000000000000000000..4a65548fe141bb7e23b1b04fa990d998891d922d --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/transform/spectrogram.py @@ -0,0 +1,475 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from espnet(https://github.com/espnet/espnet) +import librosa +import numpy as np +import paddle +from python_speech_features import logfbank + +import paddleaudio.compliance.kaldi as kaldi + + +def stft(x, + n_fft, + n_shift, + win_length=None, + window="hann", + center=True, + pad_mode="reflect"): + # x: [Time, Channel] + if x.ndim == 1: + single_channel = True + # x: [Time] -> [Time, Channel] + x = x[:, None] + else: + single_channel = False + x = x.astype(np.float32) + + # FIXME(kamo): librosa.stft can't use multi-channel? 
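`spec_augment` chains the three transforms above (time warping, frequency masking, time masking) and keeps the spectrogram shape unchanged; its defaults follow the LD policy from the SpecAugment paper. A minimal run on a random log-mel matrix, assuming the module's dependencies (numpy, Pillow) are available:

```python
import numpy as np

from paddlespeech.s2t.transform.spec_augment import spec_augment

logmel = np.random.randn(300, 80).astype(np.float32)   # (time, freq)
augmented = spec_augment(
    logmel.copy(),        # copy because the transform works in place by default
    max_time_warp=5,
    max_freq_width=27,
    n_freq_mask=2,
    max_time_width=40,
    n_time_mask=2)

print(augmented.shape)    # (300, 80): shape preserved, content warped and masked
```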
+ # x: [Time, Channel, Freq] + x = np.stack( + [ + librosa.stft( + y=x[:, ch], + n_fft=n_fft, + hop_length=n_shift, + win_length=win_length, + window=window, + center=center, + pad_mode=pad_mode, ).T for ch in range(x.shape[1]) + ], + axis=1, ) + + if single_channel: + # x: [Time, Channel, Freq] -> [Time, Freq] + x = x[:, 0] + return x + + +def istft(x, n_shift, win_length=None, window="hann", center=True): + # x: [Time, Channel, Freq] + if x.ndim == 2: + single_channel = True + # x: [Time, Freq] -> [Time, Channel, Freq] + x = x[:, None, :] + else: + single_channel = False + + # x: [Time, Channel] + x = np.stack( + [ + librosa.istft( + stft_matrix=x[:, ch].T, # [Time, Freq] -> [Freq, Time] + hop_length=n_shift, + win_length=win_length, + window=window, + center=center, ) for ch in range(x.shape[1]) + ], + axis=1, ) + + if single_channel: + # x: [Time, Channel] -> [Time] + x = x[:, 0] + return x + + +def stft2logmelspectrogram(x_stft, + fs, + n_mels, + n_fft, + fmin=None, + fmax=None, + eps=1e-10): + # x_stft: (Time, Channel, Freq) or (Time, Freq) + fmin = 0 if fmin is None else fmin + fmax = fs / 2 if fmax is None else fmax + + # spc: (Time, Channel, Freq) or (Time, Freq) + spc = np.abs(x_stft) + # mel_basis: (Mel_freq, Freq) + mel_basis = librosa.filters.mel( + sr=fs, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax) + # lmspc: (Time, Channel, Mel_freq) or (Time, Mel_freq) + lmspc = np.log10(np.maximum(eps, np.dot(spc, mel_basis.T))) + + return lmspc + + +def spectrogram(x, n_fft, n_shift, win_length=None, window="hann"): + # x: (Time, Channel) -> spc: (Time, Channel, Freq) + spc = np.abs(stft(x, n_fft, n_shift, win_length, window=window)) + return spc + + +def logmelspectrogram( + x, + fs, + n_mels, + n_fft, + n_shift, + win_length=None, + window="hann", + fmin=None, + fmax=None, + eps=1e-10, + pad_mode="reflect", ): + # stft: (Time, Channel, Freq) or (Time, Freq) + x_stft = stft( + x, + n_fft=n_fft, + n_shift=n_shift, + win_length=win_length, + window=window, + pad_mode=pad_mode, ) + + return stft2logmelspectrogram( + x_stft, + fs=fs, + n_mels=n_mels, + n_fft=n_fft, + fmin=fmin, + fmax=fmax, + eps=eps) + + +class Spectrogram(): + def __init__(self, n_fft, n_shift, win_length=None, window="hann"): + self.n_fft = n_fft + self.n_shift = n_shift + self.win_length = win_length + self.window = window + + def __repr__(self): + return ("{name}(n_fft={n_fft}, n_shift={n_shift}, " + "win_length={win_length}, window={window})".format( + name=self.__class__.__name__, + n_fft=self.n_fft, + n_shift=self.n_shift, + win_length=self.win_length, + window=self.window, )) + + def __call__(self, x): + return spectrogram( + x, + n_fft=self.n_fft, + n_shift=self.n_shift, + win_length=self.win_length, + window=self.window, ) + + +class LogMelSpectrogram(): + def __init__( + self, + fs, + n_mels, + n_fft, + n_shift, + win_length=None, + window="hann", + fmin=None, + fmax=None, + eps=1e-10, ): + self.fs = fs + self.n_mels = n_mels + self.n_fft = n_fft + self.n_shift = n_shift + self.win_length = win_length + self.window = window + self.fmin = fmin + self.fmax = fmax + self.eps = eps + + def __repr__(self): + return ("{name}(fs={fs}, n_mels={n_mels}, n_fft={n_fft}, " + "n_shift={n_shift}, win_length={win_length}, window={window}, " + "fmin={fmin}, fmax={fmax}, eps={eps}))".format( + name=self.__class__.__name__, + fs=self.fs, + n_mels=self.n_mels, + n_fft=self.n_fft, + n_shift=self.n_shift, + win_length=self.win_length, + window=self.window, + fmin=self.fmin, + fmax=self.fmax, + eps=self.eps, )) + + def 
__call__(self, x): + return logmelspectrogram( + x, + fs=self.fs, + n_mels=self.n_mels, + n_fft=self.n_fft, + n_shift=self.n_shift, + win_length=self.win_length, + window=self.window, ) + + +class Stft2LogMelSpectrogram(): + def __init__(self, fs, n_mels, n_fft, fmin=None, fmax=None, eps=1e-10): + self.fs = fs + self.n_mels = n_mels + self.n_fft = n_fft + self.fmin = fmin + self.fmax = fmax + self.eps = eps + + def __repr__(self): + return ("{name}(fs={fs}, n_mels={n_mels}, n_fft={n_fft}, " + "fmin={fmin}, fmax={fmax}, eps={eps}))".format( + name=self.__class__.__name__, + fs=self.fs, + n_mels=self.n_mels, + n_fft=self.n_fft, + fmin=self.fmin, + fmax=self.fmax, + eps=self.eps, )) + + def __call__(self, x): + return stft2logmelspectrogram( + x, + fs=self.fs, + n_mels=self.n_mels, + n_fft=self.n_fft, + fmin=self.fmin, + fmax=self.fmax, ) + + +class Stft(): + def __init__( + self, + n_fft, + n_shift, + win_length=None, + window="hann", + center=True, + pad_mode="reflect", ): + self.n_fft = n_fft + self.n_shift = n_shift + self.win_length = win_length + self.window = window + self.center = center + self.pad_mode = pad_mode + + def __repr__(self): + return ("{name}(n_fft={n_fft}, n_shift={n_shift}, " + "win_length={win_length}, window={window}," + "center={center}, pad_mode={pad_mode})".format( + name=self.__class__.__name__, + n_fft=self.n_fft, + n_shift=self.n_shift, + win_length=self.win_length, + window=self.window, + center=self.center, + pad_mode=self.pad_mode, )) + + def __call__(self, x): + return stft( + x, + self.n_fft, + self.n_shift, + win_length=self.win_length, + window=self.window, + center=self.center, + pad_mode=self.pad_mode, ) + + +class IStft(): + def __init__(self, n_shift, win_length=None, window="hann", center=True): + self.n_shift = n_shift + self.win_length = win_length + self.window = window + self.center = center + + def __repr__(self): + return ("{name}(n_shift={n_shift}, " + "win_length={win_length}, window={window}," + "center={center})".format( + name=self.__class__.__name__, + n_shift=self.n_shift, + win_length=self.win_length, + window=self.window, + center=self.center, )) + + def __call__(self, x): + return istft( + x, + self.n_shift, + win_length=self.win_length, + window=self.window, + center=self.center, ) + + +class LogMelSpectrogramKaldi(): + def __init__( + self, + fs=16000, + n_mels=80, + n_shift=160, # unit:sample, 10ms + win_length=400, # unit:sample, 25ms + energy_floor=0.0, + dither=0.1): + """ + The Kaldi implementation of LogMelSpectrogram + Args: + fs (int): sample rate of the audio + n_mels (int): number of mel filter banks + n_shift (int): number of points in a frame shift + win_length (int): number of points in a frame windows + energy_floor (float): Floor on energy in Spectrogram computation (absolute) + dither (float): Dithering constant + + Returns: + LogMelSpectrogramKaldi + """ + + self.fs = fs + self.n_mels = n_mels + num_point_ms = fs / 1000 + self.n_frame_length = win_length / num_point_ms + self.n_frame_shift = n_shift / num_point_ms + self.energy_floor = energy_floor + self.dither = dither + + def __repr__(self): + return ( + "{name}(fs={fs}, n_mels={n_mels}, " + "n_frame_shift={n_frame_shift}, n_frame_length={n_frame_length}, " + "dither={dither}))".format( + name=self.__class__.__name__, + fs=self.fs, + n_mels=self.n_mels, + n_frame_shift=self.n_frame_shift, + n_frame_length=self.n_frame_length, + dither=self.dither, )) + + def __call__(self, x, train): + """ + Args: + x (np.ndarray): shape (Ti,) + train (bool): True, train mode. 
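+                False, eval mode (dithering is disabled).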
+ + Raises: + ValueError: not support (Ti, C) + + Returns: + np.ndarray: (T, D) + """ + dither = self.dither if train else 0.0 + if x.ndim != 1: + raise ValueError("Not support x: [Time, Channel]") + waveform = paddle.to_tensor(np.expand_dims(x, 0), dtype=paddle.float32) + mat = kaldi.fbank( + waveform, + n_mels=self.n_mels, + frame_length=self.n_frame_length, + frame_shift=self.n_frame_shift, + dither=dither, + energy_floor=self.energy_floor, + sr=self.fs) + mat = np.squeeze(mat.numpy()) + return mat + + +class LogMelSpectrogramKaldi_decay(): + def __init__( + self, + fs=16000, + n_mels=80, + n_fft=512, # fft point + n_shift=160, # unit:sample, 10ms + win_length=400, # unit:sample, 25ms + window="povey", + fmin=20, + fmax=None, + eps=1e-10, + dither=1.0): + self.fs = fs + self.n_mels = n_mels + self.n_fft = n_fft + if n_shift > win_length: + raise ValueError("Stride size must not be greater than " + "window size.") + self.n_shift = n_shift / fs # unit: ms + self.win_length = win_length / fs # unit: ms + + self.window = window + self.fmin = fmin + if fmax is None: + fmax_ = fmax if fmax else self.fs / 2 + elif fmax > int(self.fs / 2): + raise ValueError("fmax must not be greater than half of " + "sample rate.") + self.fmax = fmax_ + + self.eps = eps + self.remove_dc_offset = True + self.preemph = 0.97 + self.dither = dither # only work in train mode + + def __repr__(self): + return ( + "{name}(fs={fs}, n_mels={n_mels}, n_fft={n_fft}, " + "n_shift={n_shift}, win_length={win_length}, preemph={preemph}, window={window}, " + "fmin={fmin}, fmax={fmax}, eps={eps}, dither={dither}))".format( + name=self.__class__.__name__, + fs=self.fs, + n_mels=self.n_mels, + n_fft=self.n_fft, + n_shift=self.n_shift, + preemph=self.preemph, + win_length=self.win_length, + window=self.window, + fmin=self.fmin, + fmax=self.fmax, + eps=self.eps, + dither=self.dither, )) + + def __call__(self, x, train): + """ + + Args: + x (np.ndarray): shape (Ti,) + train (bool): True, train mode. + + Raises: + ValueError: not support (Ti, C) + + Returns: + np.ndarray: (T, D) + """ + dither = self.dither if train else 0.0 + if x.ndim != 1: + raise ValueError("Not support x: [Time, Channel]") + + if x.dtype in np.sctypes['float']: + # PCM32 -> PCM16 + bits = np.iinfo(np.int16).bits + x = x * 2**(bits - 1) + + # logfbank need PCM16 input + y = logfbank( + signal=x, + samplerate=self.fs, + winlen=self.win_length, # unit ms + winstep=self.n_shift, # unit ms + nfilt=self.n_mels, + nfft=self.n_fft, + lowfreq=self.fmin, + highfreq=self.fmax, + dither=dither, + remove_dc_offset=self.remove_dc_offset, + preemph=self.preemph, + wintype=self.window) + return y diff --git a/ernie-sat/paddlespeech/s2t/transform/transform_interface.py b/ernie-sat/paddlespeech/s2t/transform/transform_interface.py new file mode 100644 index 0000000000000000000000000000000000000000..8bc6242090b0f07c08d283401587762dbce08eee --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/transform/transform_interface.py @@ -0,0 +1,35 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from espnet(https://github.com/espnet/espnet) + + +class TransformInterface: + """Transform Interface""" + + def __call__(self, x): + raise NotImplementedError("__call__ method is not implemented") + + @classmethod + def add_arguments(cls, parser): + return parser + + def __repr__(self): + return self.__class__.__name__ + "()" + + +class Identity(TransformInterface): + """Identity Function""" + + def __call__(self, x): + return x diff --git a/ernie-sat/paddlespeech/s2t/transform/transformation.py b/ernie-sat/paddlespeech/s2t/transform/transformation.py new file mode 100644 index 0000000000000000000000000000000000000000..3b433cb0bc50c7c3e3cbf847f2906d0f6b554d99 --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/transform/transformation.py @@ -0,0 +1,158 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from espnet(https://github.com/espnet/espnet) +"""Transformation module.""" +import copy +import io +import logging +from collections import OrderedDict +from collections.abc import Sequence +from inspect import signature + +import yaml + +from paddlespeech.s2t.utils.dynamic_import import dynamic_import + +import_alias = dict( + identity="paddlespeech.s2t.transform.transform_interface:Identity", + time_warp="paddlespeech.s2t.transform.spec_augment:TimeWarp", + time_mask="paddlespeech.s2t.transform.spec_augment:TimeMask", + freq_mask="paddlespeech.s2t.transform.spec_augment:FreqMask", + spec_augment="paddlespeech.s2t.transform.spec_augment:SpecAugment", + speed_perturbation="paddlespeech.s2t.transform.perturb:SpeedPerturbation", + speed_perturbation_sox="paddlespeech.s2t.transform.perturb:SpeedPerturbationSox", + volume_perturbation="paddlespeech.s2t.transform.perturb:VolumePerturbation", + noise_injection="paddlespeech.s2t.transform.perturb:NoiseInjection", + bandpass_perturbation="paddlespeech.s2t.transform.perturb:BandpassPerturbation", + rir_convolve="paddlespeech.s2t.transform.perturb:RIRConvolve", + delta="paddlespeech.s2t.transform.add_deltas:AddDeltas", + cmvn="paddlespeech.s2t.transform.cmvn:CMVN", + utterance_cmvn="paddlespeech.s2t.transform.cmvn:UtteranceCMVN", + fbank="paddlespeech.s2t.transform.spectrogram:LogMelSpectrogram", + spectrogram="paddlespeech.s2t.transform.spectrogram:Spectrogram", + stft="paddlespeech.s2t.transform.spectrogram:Stft", + istft="paddlespeech.s2t.transform.spectrogram:IStft", + stft2fbank="paddlespeech.s2t.transform.spectrogram:Stft2LogMelSpectrogram", + wpe="paddlespeech.s2t.transform.wpe:WPE", + channel_selector="paddlespeech.s2t.transform.channel_selector:ChannelSelector", + fbank_kaldi="paddlespeech.s2t.transform.spectrogram:LogMelSpectrogramKaldi", + cmvn_json="paddlespeech.s2t.transform.cmvn:GlobalCMVN") + + +class Transformation(): + """Apply some functions to the mini-batch + + Examples: + >>> kwargs = {"process": [{"type": "fbank", + ... "n_mels": 80, + ... 
"fs": 16000}, + ... {"type": "cmvn", + ... "stats": "data/train/cmvn.ark", + ... "norm_vars": True}, + ... {"type": "delta", "window": 2, "order": 2}]} + >>> transform = Transformation(kwargs) + >>> bs = 10 + >>> xs = [np.random.randn(100, 80).astype(np.float32) + ... for _ in range(bs)] + >>> xs = transform(xs) + """ + + def __init__(self, conffile=None): + if conffile is not None: + if isinstance(conffile, dict): + self.conf = copy.deepcopy(conffile) + else: + with io.open(conffile, encoding="utf-8") as f: + self.conf = yaml.safe_load(f) + assert isinstance(self.conf, dict), type(self.conf) + else: + self.conf = {"mode": "sequential", "process": []} + + self.functions = OrderedDict() + if self.conf.get("mode", "sequential") == "sequential": + for idx, process in enumerate(self.conf["process"]): + assert isinstance(process, dict), type(process) + opts = dict(process) + process_type = opts.pop("type") + class_obj = dynamic_import(process_type, import_alias) + # TODO(karita): assert issubclass(class_obj, TransformInterface) + try: + self.functions[idx] = class_obj(**opts) + except TypeError: + try: + signa = signature(class_obj) + except ValueError: + # Some function, e.g. built-in function, are failed + pass + else: + logging.error("Expected signature: {}({})".format( + class_obj.__name__, signa)) + raise + else: + raise NotImplementedError( + "Not supporting mode={}".format(self.conf["mode"])) + + def __repr__(self): + rep = "\n" + "\n".join(" {}: {}".format(k, v) + for k, v in self.functions.items()) + return "{}({})".format(self.__class__.__name__, rep) + + def __call__(self, xs, uttid_list=None, **kwargs): + """Return new mini-batch + + :param Union[Sequence[np.ndarray], np.ndarray] xs: + :param Union[Sequence[str], str] uttid_list: + :return: batch: + :rtype: List[np.ndarray] + """ + if not isinstance(xs, Sequence): + is_batch = False + xs = [xs] + else: + is_batch = True + + if isinstance(uttid_list, str): + uttid_list = [uttid_list for _ in range(len(xs))] + + if self.conf.get("mode", "sequential") == "sequential": + for idx in range(len(self.conf["process"])): + func = self.functions[idx] + # TODO(karita): use TrainingTrans and UttTrans to check __call__ args + # Derive only the args which the func has + try: + param = signature(func).parameters + except ValueError: + # Some function, e.g. built-in function, are failed + param = {} + _kwargs = {k: v for k, v in kwargs.items() if k in param} + try: + if uttid_list is not None and "uttid" in param: + xs = [ + func(x, u, **_kwargs) + for x, u in zip(xs, uttid_list) + ] + else: + xs = [func(x, **_kwargs) for x in xs] + except Exception: + logging.fatal("Catch a exception from {}th func: {}".format( + idx, func)) + raise + else: + raise NotImplementedError( + "Not supporting mode={}".format(self.conf["mode"])) + + if is_batch: + return xs + else: + return xs[0] diff --git a/ernie-sat/paddlespeech/s2t/transform/wpe.py b/ernie-sat/paddlespeech/s2t/transform/wpe.py new file mode 100644 index 0000000000000000000000000000000000000000..777379d018b651d9edf7b9179833ab2968b4599d --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/transform/wpe.py @@ -0,0 +1,58 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from espnet(https://github.com/espnet/espnet) +from nara_wpe.wpe import wpe + + +class WPE(object): + def __init__(self, + taps=10, + delay=3, + iterations=3, + psd_context=0, + statistics_mode="full"): + self.taps = taps + self.delay = delay + self.iterations = iterations + self.psd_context = psd_context + self.statistics_mode = statistics_mode + + def __repr__(self): + return ("{name}(taps={taps}, delay={delay}" + "iterations={iterations}, psd_context={psd_context}, " + "statistics_mode={statistics_mode})".format( + name=self.__class__.__name__, + taps=self.taps, + delay=self.delay, + iterations=self.iterations, + psd_context=self.psd_context, + statistics_mode=self.statistics_mode, )) + + def __call__(self, xs): + """Return enhanced + + :param np.ndarray xs: (Time, Channel, Frequency) + :return: enhanced_xs + :rtype: np.ndarray + + """ + # nara_wpe.wpe: (F, C, T) + xs = wpe( + xs.transpose((2, 1, 0)), + taps=self.taps, + delay=self.delay, + iterations=self.iterations, + psd_context=self.psd_context, + statistics_mode=self.statistics_mode, ) + return xs.transpose(2, 1, 0) diff --git a/ernie-sat/paddlespeech/s2t/utils/__init__.py b/ernie-sat/paddlespeech/s2t/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..185a92b8d94d3426d616c0624f0f2ee04339349e --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/utils/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/ernie-sat/paddlespeech/s2t/utils/asr_utils.py b/ernie-sat/paddlespeech/s2t/utils/asr_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..9184fd6af6c4046acbbe2e78346621f6eb25ae8b --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/utils/asr_utils.py @@ -0,0 +1,52 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# Reference espnet Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +import json + +import numpy as np + +__all__ = ["label_smoothing_dist"] + + +def label_smoothing_dist(odim, lsm_type, transcript=None, blank=0): + """Obtain label distribution for loss smoothing. + + :param odim: + :param lsm_type: + :param blank: + :param transcript: + :return: + """ + if transcript is not None: + with open(transcript, "rb") as f: + trans_json = json.load(f)["utts"] + + if lsm_type == "unigram": + assert transcript is not None, ( + "transcript is required for %s label smoothing" % lsm_type) + labelcount = np.zeros(odim) + for k, v in trans_json.items(): + ids = np.array([int(n) for n in v["output"][0]["tokenid"].split()]) + # to avoid an error when there is no text in an uttrance + if len(ids) > 0: + labelcount[ids] += 1 + labelcount[odim - 1] = len(transcript) # count + labelcount[labelcount == 0] = 1 # flooring + labelcount[blank] = 0 # remove counts for blank + labeldist = labelcount.astype(np.float32) / np.sum(labelcount) + else: + logging.error("Error: unexpected label smoothing type: %s" % lsm_type) + sys.exit() + + return labeldist diff --git a/ernie-sat/paddlespeech/s2t/utils/bleu_score.py b/ernie-sat/paddlespeech/s2t/utils/bleu_score.py new file mode 100644 index 0000000000000000000000000000000000000000..d7eb9c7c6878d33a1a52404d2aeb9407e09859b1 --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/utils/bleu_score.py @@ -0,0 +1,118 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from espnet(https://github.com/espnet/espnet) +"""This module provides functions to calculate bleu score in different level. +e.g. wer for word-level, cer for char-level. +""" +import numpy as np +import sacrebleu + +__all__ = ['bleu', 'char_bleu', "ErrorCalculator"] + + +def bleu(hypothesis, reference): + """Calculate BLEU. BLEU compares reference text and + hypothesis text in word-level using scarebleu. + + :param reference: The reference sentences. + :type reference: list[list[str]] + :param hypothesis: The hypothesis sentence. + :type hypothesis: list[str] + :raises ValueError: If the reference length is zero. + """ + + return sacrebleu.corpus_bleu(hypothesis, reference) + + +def char_bleu(hypothesis, reference): + """Calculate BLEU. BLEU compares reference text and + hypothesis text in char-level using scarebleu. + + :param reference: The reference sentences. + :type reference: list[list[str]] + :param hypothesis: The hypothesis sentence. + :type hypothesis: list[str] + :raises ValueError: If the reference number is zero. + """ + hypothesis = [' '.join(list(hyp.replace(' ', ''))) for hyp in hypothesis] + reference = [[' '.join(list(ref_i.replace(' ', ''))) for ref_i in ref] + for ref in reference] + + return sacrebleu.corpus_bleu(hypothesis, reference) + + +class ErrorCalculator(): + """Calculate BLEU for ST and MT models during training. 
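+    Scores are computed at corpus level with sacrebleu.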
+ + :param y_hats: numpy array with predicted text + :param y_pads: numpy array with true (target) text + :param char_list: vocabulary list + :param sym_space: space symbol + :param sym_pad: pad symbol + :param report_bleu: report BLUE score if True + """ + + def __init__(self, char_list, sym_space, sym_pad, report_bleu=False): + """Construct an ErrorCalculator object.""" + super().__init__() + self.char_list = char_list + self.space = sym_space + self.pad = sym_pad + self.report_bleu = report_bleu + if self.space in self.char_list: + self.idx_space = self.char_list.index(self.space) + else: + self.idx_space = None + + def __call__(self, ys_hat, ys_pad): + """Calculate corpus-level BLEU score. + + :param torch.Tensor ys_hat: prediction (batch, seqlen) + :param torch.Tensor ys_pad: reference (batch, seqlen) + :return: corpus-level BLEU score in a mini-batch + :rtype float + """ + bleu = None + if not self.report_bleu: + return bleu + + bleu = self.calculate_corpus_bleu(ys_hat, ys_pad) + return bleu + + def calculate_corpus_bleu(self, ys_hat, ys_pad): + """Calculate corpus-level BLEU score in a mini-batch. + + :param torch.Tensor seqs_hat: prediction (batch, seqlen) + :param torch.Tensor seqs_true: reference (batch, seqlen) + :return: corpus-level BLEU score + :rtype float + """ + seqs_hat, seqs_true = [], [] + for i, y_hat in enumerate(ys_hat): + y_true = ys_pad[i] + eos_true = np.where(y_true == -1)[0] + ymax = eos_true[0] if len(eos_true) > 0 else len(y_true) + # NOTE: padding index (-1) in y_true is used to pad y_hat + # because y_hats is not padded with -1 + seq_hat = [self.char_list[int(idx)] for idx in y_hat[:ymax]] + seq_true = [ + self.char_list[int(idx)] for idx in y_true if int(idx) != -1 + ] + seq_hat_text = "".join(seq_hat).replace(self.space, " ") + seq_hat_text = seq_hat_text.replace(self.pad, "") + seq_true_text = "".join(seq_true).replace(self.space, " ") + seqs_hat.append(seq_hat_text) + seqs_true.append(seq_true_text) + bleu = sacrebleu.corpus_bleu(seqs_hat, [[ref] for ref in seqs_true]) + return bleu.score * 100 diff --git a/ernie-sat/paddlespeech/s2t/utils/check_kwargs.py b/ernie-sat/paddlespeech/s2t/utils/check_kwargs.py new file mode 100644 index 0000000000000000000000000000000000000000..0aa839aca8bf177d39bd174db730413bfc8a3b90 --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/utils/check_kwargs.py @@ -0,0 +1,35 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# Modified from espnet(https://github.com/espnet/espnet) +import inspect + + +def check_kwargs(func, kwargs, name=None): + """check kwargs are valid for func + + If kwargs are invalid, raise TypeError as same as python default + :param function func: function to be validated + :param dict kwargs: keyword arguments for func + :param str name: name used in TypeError (default is func name) + """ + try: + params = inspect.signature(func).parameters + except ValueError: + return + if name is None: + name = func.__name__ + for k in kwargs.keys(): + if k not in params: + raise TypeError( + f"{name}() got an unexpected keyword argument '{k}'") diff --git a/ernie-sat/paddlespeech/s2t/utils/checkpoint.py b/ernie-sat/paddlespeech/s2t/utils/checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..1d24c880add78e13d2d87fc5fd1a4fc67ffd123b --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/utils/checkpoint.py @@ -0,0 +1,298 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import glob +import json +import os +import re +from pathlib import Path +from typing import Text +from typing import Union + +import paddle +from paddle import distributed as dist +from paddle.optimizer import Optimizer + +from paddlespeech.s2t.utils import mp_tools +from paddlespeech.s2t.utils.log import Log + +logger = Log(__name__).getlog() + +__all__ = ["Checkpoint"] + + +class Checkpoint(): + def __init__(self, kbest_n: int=5, latest_n: int=1): + self.best_records: Mapping[Path, float] = {} + self.latest_records = [] + self.kbest_n = kbest_n + self.latest_n = latest_n + self._save_all = (kbest_n == -1) + + def save_parameters(self, + checkpoint_dir, + tag_or_iteration: Union[int, Text], + model: paddle.nn.Layer, + optimizer: Optimizer=None, + infos: dict=None, + metric_type="val_loss"): + """Save checkpoint in best_n and latest_n. + + Args: + checkpoint_dir (str): the directory where checkpoint is saved. + tag_or_iteration (int or str): the latest iteration(step or epoch) number or tag. + model (Layer): model to be checkpointed. + optimizer (Optimizer, optional): optimizer to be checkpointed. + infos (dict or None)): any info you want to save. + metric_type (str, optional): metric type. Defaults to "val_loss". + """ + if (metric_type not in infos.keys()): + self._save_parameters(checkpoint_dir, tag_or_iteration, model, + optimizer, infos) + return + + #save best + if self._should_save_best(infos[metric_type]): + self._save_best_checkpoint_and_update( + infos[metric_type], checkpoint_dir, tag_or_iteration, model, + optimizer, infos) + #save latest + self._save_latest_checkpoint_and_update( + checkpoint_dir, tag_or_iteration, model, optimizer, infos) + + if isinstance(tag_or_iteration, int): + self._save_checkpoint_record(checkpoint_dir, tag_or_iteration) + + def load_parameters(self, + model, + optimizer=None, + checkpoint_dir=None, + checkpoint_path=None, + record_file="checkpoint_latest"): + """Load a last model checkpoint from disk. 
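+        Returns an empty dict if the checkpoint record file is missing.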
+ Args: + model (Layer): model to load parameters. + optimizer (Optimizer, optional): optimizer to load states if needed. + Defaults to None. + checkpoint_dir (str, optional): the directory where checkpoint is saved. + checkpoint_path (str, optional): if specified, load the checkpoint + stored in the checkpoint_path(prefix) and the argument 'checkpoint_dir' will + be ignored. Defaults to None. + record_file "checkpoint_latest" or "checkpoint_best" + Returns: + configs (dict): epoch or step, lr and other meta info should be saved. + """ + configs = {} + + if checkpoint_path: + pass + elif checkpoint_dir is not None and record_file is not None: + # load checkpint from record file + checkpoint_record = os.path.join(checkpoint_dir, record_file) + iteration = self._load_checkpoint_idx(checkpoint_record) + if iteration == -1: + return configs + checkpoint_path = os.path.join(checkpoint_dir, + "{}".format(iteration)) + else: + raise ValueError( + "At least one of 'checkpoint_path' or 'checkpoint_dir' should be specified!" + ) + + rank = dist.get_rank() + + params_path = checkpoint_path + ".pdparams" + model_dict = paddle.load(params_path) + model.set_state_dict(model_dict) + logger.info("Rank {}: Restore model from {}".format(rank, params_path)) + + optimizer_path = checkpoint_path + ".pdopt" + if optimizer and os.path.isfile(optimizer_path): + optimizer_dict = paddle.load(optimizer_path) + optimizer.set_state_dict(optimizer_dict) + logger.info("Rank {}: Restore optimizer state from {}".format( + rank, optimizer_path)) + + info_path = re.sub('.pdparams$', '.json', params_path) + if os.path.exists(info_path): + with open(info_path, 'r') as fin: + configs = json.load(fin) + return configs + + def load_latest_parameters(self, + model, + optimizer=None, + checkpoint_dir=None, + checkpoint_path=None): + """Load a last model checkpoint from disk. + Args: + model (Layer): model to load parameters. + optimizer (Optimizer, optional): optimizer to load states if needed. + Defaults to None. + checkpoint_dir (str, optional): the directory where checkpoint is saved. + checkpoint_path (str, optional): if specified, load the checkpoint + stored in the checkpoint_path(prefix) and the argument 'checkpoint_dir' will + be ignored. Defaults to None. + Returns: + configs (dict): epoch or step, lr and other meta info should be saved. + """ + return self.load_parameters(model, optimizer, checkpoint_dir, + checkpoint_path, "checkpoint_latest") + + def load_best_parameters(self, + model, + optimizer=None, + checkpoint_dir=None, + checkpoint_path=None): + """Load a last model checkpoint from disk. + Args: + model (Layer): model to load parameters. + optimizer (Optimizer, optional): optimizer to load states if needed. + Defaults to None. + checkpoint_dir (str, optional): the directory where checkpoint is saved. + checkpoint_path (str, optional): if specified, load the checkpoint + stored in the checkpoint_path(prefix) and the argument 'checkpoint_dir' will + be ignored. Defaults to None. + Returns: + configs (dict): epoch or step, lr and other meta info should be saved. 
+ """ + return self.load_parameters(model, optimizer, checkpoint_dir, + checkpoint_path, "checkpoint_best") + + def _should_save_best(self, metric: float) -> bool: + if not self._best_full(): + return True + + # already full + worst_record_path = max(self.best_records, key=self.best_records.get) + # worst_record_path = max(self.best_records.iteritems(), key=operator.itemgetter(1))[0] + worst_metric = self.best_records[worst_record_path] + return metric < worst_metric + + def _best_full(self): + return (not self._save_all) and len(self.best_records) == self.kbest_n + + def _latest_full(self): + return len(self.latest_records) == self.latest_n + + def _save_best_checkpoint_and_update(self, metric, checkpoint_dir, + tag_or_iteration, model, optimizer, + infos): + # remove the worst + if self._best_full(): + worst_record_path = max(self.best_records, + key=self.best_records.get) + self.best_records.pop(worst_record_path) + if (worst_record_path not in self.latest_records): + logger.info( + "remove the worst checkpoint: {}".format(worst_record_path)) + self._del_checkpoint(checkpoint_dir, worst_record_path) + + # add the new one + self._save_parameters(checkpoint_dir, tag_or_iteration, model, + optimizer, infos) + self.best_records[tag_or_iteration] = metric + + def _save_latest_checkpoint_and_update( + self, checkpoint_dir, tag_or_iteration, model, optimizer, infos): + # remove the old + if self._latest_full(): + to_del_fn = self.latest_records.pop(0) + if (to_del_fn not in self.best_records.keys()): + logger.info( + "remove the latest checkpoint: {}".format(to_del_fn)) + self._del_checkpoint(checkpoint_dir, to_del_fn) + self.latest_records.append(tag_or_iteration) + + self._save_parameters(checkpoint_dir, tag_or_iteration, model, + optimizer, infos) + + def _del_checkpoint(self, checkpoint_dir, tag_or_iteration): + checkpoint_path = os.path.join(checkpoint_dir, + "{}".format(tag_or_iteration)) + for filename in glob.glob(checkpoint_path + ".*"): + os.remove(filename) + logger.info("delete file: {}".format(filename)) + + def _load_checkpoint_idx(self, checkpoint_record: str) -> int: + """Get the iteration number corresponding to the latest saved checkpoint. + Args: + checkpoint_path (str): the saved path of checkpoint. + Returns: + int: the latest iteration number. -1 for no checkpoint to load. + """ + if not os.path.isfile(checkpoint_record): + return -1 + + # Fetch the latest checkpoint index. + with open(checkpoint_record, "rt") as handle: + latest_checkpoint = handle.readlines()[-1].strip() + iteration = int(latest_checkpoint.split(":")[-1]) + return iteration + + def _save_checkpoint_record(self, checkpoint_dir: str, iteration: int): + """Save the iteration number of the latest model to be checkpoint record. + Args: + checkpoint_dir (str): the directory where checkpoint is saved. + iteration (int): the latest iteration number. 
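+
+        Note:
+            Both the "checkpoint_best" and "checkpoint_latest" record files are rewritten on each call.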
+ Returns: + None + """ + checkpoint_record_latest = os.path.join(checkpoint_dir, + "checkpoint_latest") + checkpoint_record_best = os.path.join(checkpoint_dir, "checkpoint_best") + + with open(checkpoint_record_best, "w") as handle: + for i in self.best_records.keys(): + handle.write("model_checkpoint_path:{}\n".format(i)) + with open(checkpoint_record_latest, "w") as handle: + for i in self.latest_records: + handle.write("model_checkpoint_path:{}\n".format(i)) + + @mp_tools.rank_zero_only + def _save_parameters(self, + checkpoint_dir: str, + tag_or_iteration: Union[int, str], + model: paddle.nn.Layer, + optimizer: Optimizer=None, + infos: dict=None): + """Checkpoint the latest trained model parameters. + Args: + checkpoint_dir (str): the directory where checkpoint is saved. + tag_or_iteration (int or str): the latest iteration(step or epoch) number. + model (Layer): model to be checkpointed. + optimizer (Optimizer, optional): optimizer to be checkpointed. + Defaults to None. + infos (dict or None): any info you want to save. + Returns: + None + """ + checkpoint_path = os.path.join(checkpoint_dir, + "{}".format(tag_or_iteration)) + + model_dict = model.state_dict() + params_path = checkpoint_path + ".pdparams" + paddle.save(model_dict, params_path) + logger.info("Saved model to {}".format(params_path)) + + if optimizer: + opt_dict = optimizer.state_dict() + optimizer_path = checkpoint_path + ".pdopt" + paddle.save(opt_dict, optimizer_path) + logger.info("Saved optimzier state to {}".format(optimizer_path)) + + info_path = re.sub('.pdparams$', '.json', params_path) + infos = {} if infos is None else infos + with open(info_path, 'w') as fout: + data = json.dumps(infos) + fout.write(data) diff --git a/ernie-sat/paddlespeech/s2t/utils/cli_readers.py b/ernie-sat/paddlespeech/s2t/utils/cli_readers.py new file mode 100644 index 0000000000000000000000000000000000000000..735d590dd97c44aa14de8113d9b0e204ac887010 --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/utils/cli_readers.py @@ -0,0 +1,242 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from espnet(https://github.com/espnet/espnet) +import io +import logging +import sys + +import h5py +import kaldiio +import soundfile + +from paddlespeech.s2t.io.reader import SoundHDF5File + + +def file_reader_helper( + rspecifier: str, + filetype: str="mat", + return_shape: bool=False, + segments: str=None, ): + """Read uttid and array in kaldi style + + This function might be a bit confusing as "ark" is used + for HDF5 to imitate "kaldi-rspecifier". + + Args: + rspecifier: Give as "ark:feats.ark" or "scp:feats.scp" + filetype: "mat" is kaldi-martix, "hdf5": HDF5 + return_shape: Return the shape of the matrix, + instead of the matrix. This can reduce IO cost for HDF5. + segments (str): The file format is + " \n" + "e.g. 
call-861225-A-0050-0065 call-861225-A 5.0 6.5\n" + Returns: + Generator[Tuple[str, np.ndarray], None, None]: + + Examples: + Read from kaldi-matrix ark file: + + >>> for u, array in file_reader_helper('ark:feats.ark', 'mat'): + ... array + + Read from HDF5 file: + + >>> for u, array in file_reader_helper('ark:feats.h5', 'hdf5'): + ... array + + """ + if filetype == "mat": + return KaldiReader( + rspecifier, return_shape=return_shape, segments=segments) + elif filetype == "hdf5": + return HDF5Reader(rspecifier, return_shape=return_shape) + elif filetype == "sound.hdf5": + return SoundHDF5Reader(rspecifier, return_shape=return_shape) + elif filetype == "sound": + return SoundReader(rspecifier, return_shape=return_shape) + else: + raise NotImplementedError(f"filetype={filetype}") + + +class KaldiReader: + def __init__(self, rspecifier, return_shape=False, segments=None): + self.rspecifier = rspecifier + self.return_shape = return_shape + self.segments = segments + + def __iter__(self): + with kaldiio.ReadHelper( + self.rspecifier, segments=self.segments) as reader: + for key, array in reader: + if self.return_shape: + array = array.shape + yield key, array + + +class HDF5Reader: + def __init__(self, rspecifier, return_shape=False): + if ":" not in rspecifier: + raise ValueError('Give "rspecifier" such as "ark:some.ark: {}"'. + format(self.rspecifier)) + self.rspecifier = rspecifier + self.ark_or_scp, self.filepath = self.rspecifier.split(":", 1) + if self.ark_or_scp not in ["ark", "scp"]: + raise ValueError(f"Must be scp or ark: {self.ark_or_scp}") + + self.return_shape = return_shape + + def __iter__(self): + if self.ark_or_scp == "scp": + hdf5_dict = {} + with open(self.filepath, "r", encoding="utf-8") as f: + for line in f: + key, value = line.rstrip().split(None, 1) + + if ":" not in value: + raise RuntimeError( + "scp file for hdf5 should be like: " + '"uttid filepath.h5:key": {}({})'.format( + line, self.filepath)) + path, h5_key = value.split(":", 1) + + hdf5_file = hdf5_dict.get(path) + if hdf5_file is None: + try: + hdf5_file = h5py.File(path, "r") + except Exception: + logging.error("Error when loading {}".format(path)) + raise + hdf5_dict[path] = hdf5_file + + try: + data = hdf5_file[h5_key] + except Exception: + logging.error("Error when loading {} with key={}". + format(path, h5_key)) + raise + + if self.return_shape: + yield key, data.shape + else: + yield key, data[()] + + # Closing all files + for k in hdf5_dict: + try: + hdf5_dict[k].close() + except Exception: + pass + + else: + if self.filepath == "-": + # Required h5py>=2.9 + filepath = io.BytesIO(sys.stdin.buffer.read()) + else: + filepath = self.filepath + with h5py.File(filepath, "r") as f: + for key in f: + if self.return_shape: + yield key, f[key].shape + else: + yield key, f[key][()] + + +class SoundHDF5Reader: + def __init__(self, rspecifier, return_shape=False): + if ":" not in rspecifier: + raise ValueError('Give "rspecifier" such as "ark:some.ark: {}"'. 
+ format(rspecifier)) + self.ark_or_scp, self.filepath = rspecifier.split(":", 1) + if self.ark_or_scp not in ["ark", "scp"]: + raise ValueError(f"Must be scp or ark: {self.ark_or_scp}") + self.return_shape = return_shape + + def __iter__(self): + if self.ark_or_scp == "scp": + hdf5_dict = {} + with open(self.filepath, "r", encoding="utf-8") as f: + for line in f: + key, value = line.rstrip().split(None, 1) + + if ":" not in value: + raise RuntimeError( + "scp file for hdf5 should be like: " + '"uttid filepath.h5:key": {}({})'.format( + line, self.filepath)) + path, h5_key = value.split(":", 1) + + hdf5_file = hdf5_dict.get(path) + if hdf5_file is None: + try: + hdf5_file = SoundHDF5File(path, "r") + except Exception: + logging.error("Error when loading {}".format(path)) + raise + hdf5_dict[path] = hdf5_file + + try: + data = hdf5_file[h5_key] + except Exception: + logging.error("Error when loading {} with key={}". + format(path, h5_key)) + raise + + # Change Tuple[ndarray, int] -> Tuple[int, ndarray] + # (soundfile style -> scipy style) + array, rate = data + if self.return_shape: + array = array.shape + yield key, (rate, array) + + # Closing all files + for k in hdf5_dict: + try: + hdf5_dict[k].close() + except Exception: + pass + + else: + if self.filepath == "-": + # Required h5py>=2.9 + filepath = io.BytesIO(sys.stdin.buffer.read()) + else: + filepath = self.filepath + for key, (a, r) in SoundHDF5File(filepath, "r").items(): + if self.return_shape: + a = a.shape + yield key, (r, a) + + +class SoundReader: + def __init__(self, rspecifier, return_shape=False): + if ":" not in rspecifier: + raise ValueError('Give "rspecifier" such as "scp:some.scp: {}"'. + format(rspecifier)) + self.ark_or_scp, self.filepath = rspecifier.split(":", 1) + if self.ark_or_scp != "scp": + raise ValueError('Only supporting "scp" for sound file: {}'.format( + self.ark_or_scp)) + self.return_shape = return_shape + + def __iter__(self): + with open(self.filepath, "r", encoding="utf-8") as f: + for line in f: + key, sound_file_path = line.rstrip().split(None, 1) + # Assume PCM16 + array, rate = soundfile.read(sound_file_path, dtype="int16") + # Change Tuple[ndarray, int] -> Tuple[int, ndarray] + # (soundfile style -> scipy style) + if self.return_shape: + array = array.shape + yield key, (rate, array) diff --git a/ernie-sat/paddlespeech/s2t/utils/cli_utils.py b/ernie-sat/paddlespeech/s2t/utils/cli_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..ccb0d3c97dde3a9bb3b17bdf5ee0b29adee652ba --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/utils/cli_utils.py @@ -0,0 +1,71 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# Modified from espnet(https://github.com/espnet/espnet) +import sys +from collections.abc import Sequence + +import numpy +from distutils.util import strtobool as dist_strtobool + + +def strtobool(x): + # distutils.util.strtobool returns integer, but it's confusing, + return bool(dist_strtobool(x)) + + +def get_commandline_args(): + extra_chars = [ + " ", + ";", + "&", + "(", + ")", + "|", + "^", + "<", + ">", + "?", + "*", + "[", + "]", + "$", + "`", + '"', + "\\", + "!", + "{", + "}", + ] + + # Escape the extra characters for shell + argv = [ + arg.replace("'", "'\\''") if all(char not in arg + for char in extra_chars) else + "'" + arg.replace("'", "'\\''") + "'" for arg in sys.argv + ] + + return sys.executable + " " + " ".join(argv) + + +def is_scipy_wav_style(value): + # If Tuple[int, numpy.ndarray] or not + return (isinstance(value, Sequence) and len(value) == 2 and + isinstance(value[0], int) and isinstance(value[1], numpy.ndarray)) + + +def assert_scipy_wav_style(value): + assert is_scipy_wav_style( + value), "Must be Tuple[int, numpy.ndarray], but got {}".format( + type(value) if not isinstance(value, Sequence) else "{}[{}]".format( + type(value), ", ".join(str(type(v)) for v in value))) diff --git a/ernie-sat/paddlespeech/s2t/utils/cli_writers.py b/ernie-sat/paddlespeech/s2t/utils/cli_writers.py new file mode 100644 index 0000000000000000000000000000000000000000..d3a4c2b85ccf7583c1e9cfd62d841c7f660c6a09 --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/utils/cli_writers.py @@ -0,0 +1,294 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from espnet(https://github.com/espnet/espnet) +from pathlib import Path +from typing import Dict + +import h5py +import kaldiio +import numpy +import soundfile + +from paddlespeech.s2t.io.reader import SoundHDF5File +from paddlespeech.s2t.utils.cli_utils import assert_scipy_wav_style + + +def file_writer_helper( + wspecifier: str, + filetype: str="mat", + write_num_frames: str=None, + compress: bool=False, + compression_method: int=2, + pcm_format: str="wav", ): + """Write matrices in kaldi style + + Args: + wspecifier: e.g. ark,scp:out.ark,out.scp + filetype: "mat" is kaldi-martix, "hdf5": HDF5 + write_num_frames: e.g. 'ark,t:num_frames.txt' + compress: Compress or not + compression_method: Specify compression level + + Write in kaldi-matrix-ark with "kaldi-scp" file: + + >>> with file_writer_helper('ark,scp:out.ark,out.scp') as f: + >>> f['uttid'] = array + + This "scp" has the following format: + + uttidA out.ark:1234 + uttidB out.ark:2222 + + where, 1234 and 2222 points the strating byte address of the matrix. 
+ (For detail, see official documentation of Kaldi) + + Write in HDF5 with "scp" file: + + >>> with file_writer_helper('ark,scp:out.h5,out.scp', 'hdf5') as f: + >>> f['uttid'] = array + + This "scp" file is created as: + + uttidA out.h5:uttidA + uttidB out.h5:uttidB + + HDF5 can be, unlike "kaldi-ark", accessed to any keys, + so originally "scp" is not required for random-reading. + Nevertheless we create "scp" for HDF5 because it is useful + for some use-case. e.g. Concatenation, Splitting. + + """ + if filetype == "mat": + return KaldiWriter( + wspecifier, + write_num_frames=write_num_frames, + compress=compress, + compression_method=compression_method, ) + elif filetype == "hdf5": + return HDF5Writer( + wspecifier, write_num_frames=write_num_frames, compress=compress) + elif filetype == "sound.hdf5": + return SoundHDF5Writer( + wspecifier, + write_num_frames=write_num_frames, + pcm_format=pcm_format) + elif filetype == "sound": + return SoundWriter( + wspecifier, + write_num_frames=write_num_frames, + pcm_format=pcm_format) + else: + raise NotImplementedError(f"filetype={filetype}") + + +class BaseWriter: + def __setitem__(self, key, value): + raise NotImplementedError + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.close() + + def close(self): + try: + self.writer.close() + except Exception: + pass + + if self.writer_scp is not None: + try: + self.writer_scp.close() + except Exception: + pass + + if self.writer_nframe is not None: + try: + self.writer_nframe.close() + except Exception: + pass + + +def get_num_frames_writer(write_num_frames: str): + """get_num_frames_writer + + Examples: + >>> get_num_frames_writer('ark,t:num_frames.txt') + """ + if write_num_frames is not None: + if ":" not in write_num_frames: + raise ValueError('Must include ":", write_num_frames={}'.format( + write_num_frames)) + + nframes_type, nframes_file = write_num_frames.split(":", 1) + if nframes_type != "ark,t": + raise ValueError("Only supporting text mode. " + "e.g. --write-num-frames=ark,t:foo.txt :" + "{}".format(nframes_type)) + + return open(nframes_file, "w", encoding="utf-8") + + +class KaldiWriter(BaseWriter): + def __init__(self, + wspecifier, + write_num_frames=None, + compress=False, + compression_method=2): + if compress: + self.writer = kaldiio.WriteHelper( + wspecifier, compression_method=compression_method) + else: + self.writer = kaldiio.WriteHelper(wspecifier) + self.writer_scp = None + if write_num_frames is not None: + self.writer_nframe = get_num_frames_writer(write_num_frames) + else: + self.writer_nframe = None + + def __setitem__(self, key, value): + self.writer[key] = value + if self.writer_nframe is not None: + self.writer_nframe.write(f"{key} {len(value)}\n") + + +def parse_wspecifier(wspecifier: str) -> Dict[str, str]: + """Parse wspecifier to dict + + Examples: + >>> parse_wspecifier('ark,scp:out.ark,out.scp') + {'ark': 'out.ark', 'scp': 'out.scp'} + + """ + ark_scp, filepath = wspecifier.split(":", 1) + if ark_scp not in ["ark", "scp,ark", "ark,scp"]: + raise ValueError("{} is not allowed: {}".format(ark_scp, wspecifier)) + ark_scps = ark_scp.split(",") + filepaths = filepath.split(",") + if len(ark_scps) != len(filepaths): + raise ValueError("Mismatch: {} and {}".format(ark_scp, filepath)) + spec_dict = dict(zip(ark_scps, filepaths)) + return spec_dict + + +class HDF5Writer(BaseWriter): + """HDF5Writer + + Examples: + >>> with HDF5Writer('ark:out.h5', compress=True) as f: + ... 
f['key'] = array + """ + + def __init__(self, wspecifier, write_num_frames=None, compress=False): + spec_dict = parse_wspecifier(wspecifier) + self.filename = spec_dict["ark"] + + if compress: + self.kwargs = {"compression": "gzip"} + else: + self.kwargs = {} + self.writer = h5py.File(spec_dict["ark"], "w") + if "scp" in spec_dict: + self.writer_scp = open(spec_dict["scp"], "w", encoding="utf-8") + else: + self.writer_scp = None + if write_num_frames is not None: + self.writer_nframe = get_num_frames_writer(write_num_frames) + else: + self.writer_nframe = None + + def __setitem__(self, key, value): + self.writer.create_dataset(key, data=value, **self.kwargs) + + if self.writer_scp is not None: + self.writer_scp.write(f"{key} {self.filename}:{key}\n") + if self.writer_nframe is not None: + self.writer_nframe.write(f"{key} {len(value)}\n") + + +class SoundHDF5Writer(BaseWriter): + """SoundHDF5Writer + + Examples: + >>> fs = 16000 + >>> with SoundHDF5Writer('ark:out.h5') as f: + ... f['key'] = fs, array + """ + + def __init__(self, wspecifier, write_num_frames=None, pcm_format="wav"): + self.pcm_format = pcm_format + spec_dict = parse_wspecifier(wspecifier) + self.filename = spec_dict["ark"] + self.writer = SoundHDF5File( + spec_dict["ark"], "w", format=self.pcm_format) + if "scp" in spec_dict: + self.writer_scp = open(spec_dict["scp"], "w", encoding="utf-8") + else: + self.writer_scp = None + if write_num_frames is not None: + self.writer_nframe = get_num_frames_writer(write_num_frames) + else: + self.writer_nframe = None + + def __setitem__(self, key, value): + assert_scipy_wav_style(value) + # Change Tuple[int, ndarray] -> Tuple[ndarray, int] + # (scipy style -> soundfile style) + value = (value[1], value[0]) + self.writer.create_dataset(key, data=value) + + if self.writer_scp is not None: + self.writer_scp.write(f"{key} {self.filename}:{key}\n") + if self.writer_nframe is not None: + self.writer_nframe.write(f"{key} {len(value[0])}\n") + + +class SoundWriter(BaseWriter): + """SoundWriter + + Examples: + >>> fs = 16000 + >>> with SoundWriter('ark,scp:outdir,out.scp') as f: + ... f['key'] = fs, array + """ + + def __init__(self, wspecifier, write_num_frames=None, pcm_format="wav"): + self.pcm_format = pcm_format + spec_dict = parse_wspecifier(wspecifier) + # e.g. ark,scp:dirname,wav.scp + # -> The wave files are found in dirname/*.wav + self.dirname = spec_dict["ark"] + Path(self.dirname).mkdir(parents=True, exist_ok=True) + self.writer = None + + if "scp" in spec_dict: + self.writer_scp = open(spec_dict["scp"], "w", encoding="utf-8") + else: + self.writer_scp = None + if write_num_frames is not None: + self.writer_nframe = get_num_frames_writer(write_num_frames) + else: + self.writer_nframe = None + + def __setitem__(self, key, value): + assert_scipy_wav_style(value) + rate, signal = value + wavfile = Path(self.dirname) / (key + "." + self.pcm_format) + soundfile.write(wavfile, signal.astype(numpy.int16), rate) + + if self.writer_scp is not None: + self.writer_scp.write(f"{key} {wavfile}\n") + if self.writer_nframe is not None: + self.writer_nframe.write(f"{key} {len(signal)}\n") diff --git a/ernie-sat/paddlespeech/s2t/utils/ctc_utils.py b/ernie-sat/paddlespeech/s2t/utils/ctc_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..886b72033605e9080ebc7ae06e0a32054325be71 --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/utils/ctc_utils.py @@ -0,0 +1,211 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from wenet(https://github.com/wenet-e2e/wenet) +from pathlib import Path +from typing import List + +import numpy as np +import paddle + +from paddlespeech.s2t.utils import text_grid +from paddlespeech.s2t.utils import utility +from paddlespeech.s2t.utils.log import Log + +logger = Log(__name__).getlog() + +__all__ = ["forced_align", "remove_duplicates_and_blank", "insert_blank"] + + +def remove_duplicates_and_blank(hyp: List[int], blank_id=0) -> List[int]: + """ctc alignment to ctc label ids. + + "abaa-acee-" -> "abaace" + + Args: + hyp (List[int]): hypotheses ids, (L) + blank_id (int, optional): blank id. Defaults to 0. + + Returns: + List[int]: remove dupicate ids, then remove blank id. + """ + new_hyp: List[int] = [] + cur = 0 + while cur < len(hyp): + # add non-blank into new_hyp + if hyp[cur] != blank_id: + new_hyp.append(hyp[cur]) + # skip repeat label + prev = cur + while cur < len(hyp) and hyp[cur] == hyp[prev]: + cur += 1 + return new_hyp + + +def insert_blank(label: np.ndarray, blank_id: int=0) -> np.ndarray: + """Insert blank token between every two label token. + + "abcdefg" -> "-a-b-c-d-e-f-g-" + + Args: + label ([np.ndarray]): label ids, List[int], (L). + blank_id (int, optional): blank id. Defaults to 0. + + Returns: + [np.ndarray]: (2L+1). + """ + label = np.expand_dims(label, 1) #[L, 1] + blanks = np.zeros((label.shape[0], 1), dtype=np.int64) + blank_id + label = np.concatenate([blanks, label], axis=1) #[L, 2] + label = label.reshape(-1) #[2L], -l-l-l + label = np.append(label, label[0]) #[2L + 1], -l-l-l- + return label + + +def forced_align(ctc_probs: paddle.Tensor, y: paddle.Tensor, + blank_id=0) -> List[int]: + """ctc forced alignment. + + https://distill.pub/2017/ctc/ + + Args: + ctc_probs (paddle.Tensor): hidden state sequence, 2d tensor (T, D) + y (paddle.Tensor): label id sequence tensor, 1d tensor (L) + blank_id (int): blank symbol index + Returns: + List[int]: best alignment result, (T). 
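+
+    Notes:
+        The best path is found with a Viterbi-style dynamic program over the
+        blank-interleaved label sequence (2L + 1 states) and recovered by
+        backtracking through the recorded state path.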
+ """ + y_insert_blank = insert_blank(y, blank_id) #(2L+1) + + log_alpha = paddle.zeros( + (ctc_probs.shape[0], len(y_insert_blank))) #(T, 2L+1) + log_alpha = log_alpha - float('inf') # log of zero + + # TODO(Hui Zhang): zeros not support paddle.int16 + # self.__setitem_varbase__(item, value) When assign a value to a paddle.Tensor, the data type of the paddle.Tensor not support int16 + state_path = (paddle.zeros( + (ctc_probs.shape[0], len(y_insert_blank)), dtype=paddle.int32) - 1 + ) # state path, Tuple((T, 2L+1)) + + # init start state + # TODO(Hui Zhang): VarBase.__getitem__() not support np.int64 + log_alpha[0, 0] = ctc_probs[0][int(y_insert_blank[0])] # State-b, Sb + log_alpha[0, 1] = ctc_probs[0][int(y_insert_blank[1])] # State-nb, Snb + + for t in range(1, ctc_probs.shape[0]): # T + for s in range(len(y_insert_blank)): # 2L+1 + if y_insert_blank[s] == blank_id or s < 2 or y_insert_blank[ + s] == y_insert_blank[s - 2]: + candidates = paddle.to_tensor( + [log_alpha[t - 1, s], log_alpha[t - 1, s - 1]]) + prev_state = [s, s - 1] + else: + candidates = paddle.to_tensor([ + log_alpha[t - 1, s], + log_alpha[t - 1, s - 1], + log_alpha[t - 1, s - 2], + ]) + prev_state = [s, s - 1, s - 2] + # TODO(Hui Zhang): VarBase.__getitem__() not support np.int64 + log_alpha[t, s] = paddle.max(candidates) + ctc_probs[t][int( + y_insert_blank[s])] + state_path[t, s] = prev_state[paddle.argmax(candidates)] + # TODO(Hui Zhang): zeros not support paddle.int16 + # self.__setitem_varbase__(item, value) When assign a value to a paddle.Tensor, the data type of the paddle.Tensor not support int16 + state_seq = -1 * paddle.ones((ctc_probs.shape[0], 1), dtype=paddle.int32) + + candidates = paddle.to_tensor([ + log_alpha[-1, len(y_insert_blank) - 1], # Sb + log_alpha[-1, len(y_insert_blank) - 2] # Snb + ]) + prev_state = [len(y_insert_blank) - 1, len(y_insert_blank) - 2] + state_seq[-1] = prev_state[paddle.argmax(candidates)] + for t in range(ctc_probs.shape[0] - 2, -1, -1): + state_seq[t] = state_path[t + 1, state_seq[t + 1, 0]] + + output_alignment = [] + for t in range(0, ctc_probs.shape[0]): + output_alignment.append(y_insert_blank[state_seq[t, 0]]) + + return output_alignment + + +def ctc_align(config, model, dataloader, batch_size, stride_ms, token_dict, + result_file): + """ctc alignment. + + Args: + config (cfgNode): config + model (nn.Layer): U2 Model. + dataloader (io.DataLoader): dataloader. + batch_size (int): decoding batchsize. + stride_ms (int): audio feature stride in ms unit. + token_dict (List[str]): vocab list, e.g. ['blank', 'unk', 'a', 'b', '']. + result_file (str): alignment output file, e.g. /path/to/xxx.align. + """ + if batch_size > 1: + logger.fatal('alignment mode must be running with batch_size == 1') + sys.exit(1) + assert result_file and result_file.endswith('.align') + + model.eval() + # conv subsampling rate + subsample = utility.get_subsample(config) + logger.info(f"Align Total Examples: {len(dataloader.dataset)}") + + with open(result_file, 'w') as fout: + # one example in batch + for i, batch in enumerate(dataloader): + key, feat, feats_length, target, target_length = batch + + # 1. Encoder + encoder_out, encoder_mask = model._forward_encoder( + feat, feats_length) # (B, maxlen, encoder_dim) + maxlen = encoder_out.shape[1] + ctc_probs = model.ctc.log_softmax( + encoder_out) # (1, maxlen, vocab_size) + + # 2. 
alignment + ctc_probs = ctc_probs.squeeze(0) + target = target.squeeze(0) + alignment = forced_align(ctc_probs, target) + + logger.info(f"align ids: {key[0]} {alignment}") + fout.write('{} {}\n'.format(key[0], alignment)) + + # 3. gen praat + # segment alignment + align_segs = text_grid.segment_alignment(alignment) + logger.info(f"align tokens: {key[0]}, {align_segs}") + + # IntervalTier, List["start end token\n"] + tierformat = text_grid.align_to_tierformat(align_segs, subsample, + token_dict) + + # write tier + align_output_path = Path(result_file).parent / "align" + align_output_path.mkdir(parents=True, exist_ok=True) + tier_path = align_output_path / (key[0] + ".tier") + with tier_path.open('w') as f: + f.writelines(tierformat) + + # write textgrid + textgrid_path = align_output_path / (key[0] + ".TextGrid") + second_per_frame = 1. / (1000. / + stride_ms) # 25ms window, 10ms stride + second_per_example = ( + len(alignment) + 1) * subsample * second_per_frame + text_grid.generate_textgrid( + maxtime=second_per_example, + intervals=tierformat, + output=str(textgrid_path)) diff --git a/ernie-sat/paddlespeech/s2t/utils/dynamic_import.py b/ernie-sat/paddlespeech/s2t/utils/dynamic_import.py new file mode 100644 index 0000000000000000000000000000000000000000..bd738edf8b8a96759d2fe7466fb9f7f027687c9d --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/utils/dynamic_import.py @@ -0,0 +1,69 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from espnet(https://github.com/espnet/espnet) +import importlib +import inspect +from typing import Any +from typing import Dict +from typing import List +from typing import Text + +from paddlespeech.s2t.utils.log import Log +from paddlespeech.s2t.utils.tensor_utils import has_tensor + +logger = Log(__name__).getlog() + +__all__ = ["dynamic_import", "instance_class"] + + +def dynamic_import(import_path, alias=dict()): + """dynamic import module and class + + :param str import_path: syntax 'module_name:class_name' + e.g., 'paddlespeech.s2t.models.u2:U2Model' + :param dict alias: shortcut for registered class + :return: imported class + """ + if import_path not in alias and ":" not in import_path: + raise ValueError( + "import_path should be one of {} or " + 'include ":", e.g. 
"paddlespeech.s2t.models.u2:U2Model" : ' + "{}".format(set(alias), import_path)) + if ":" not in import_path: + import_path = alias[import_path] + + module_name, objname = import_path.split(":") + m = importlib.import_module(module_name) + return getattr(m, objname) + + +def filter_valid_args(args: Dict[Text, Any], valid_keys: List[Text]): + # filter by `valid_keys` and filter `val` is not None + new_args = { + key: val + for key, val in args.items() if (key in valid_keys and val is not None) + } + return new_args + + +def filter_out_tensor(args: Dict[Text, Any]): + return {key: val for key, val in args.items() if not has_tensor(val)} + + +def instance_class(module_class, args: Dict[Text, Any]): + valid_keys = inspect.signature(module_class).parameters.keys() + new_args = filter_valid_args(args, valid_keys) + logger.info( + f"Instance: {module_class.__name__} {filter_out_tensor(new_args)}.") + return module_class(**new_args) diff --git a/ernie-sat/paddlespeech/s2t/utils/dynamic_pip_install.py b/ernie-sat/paddlespeech/s2t/utils/dynamic_pip_install.py new file mode 100644 index 0000000000000000000000000000000000000000..39e9c35fd2e6335f69ece7345d1db7cabee998e2 --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/utils/dynamic_pip_install.py @@ -0,0 +1,22 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import pip + + +def install(package_name): + if int(pip.__version__.split('.')[0]) > 9: + from pip._internal import main + else: + from pip import main + main(['install', package_name]) diff --git a/ernie-sat/paddlespeech/s2t/utils/error_rate.py b/ernie-sat/paddlespeech/s2t/utils/error_rate.py new file mode 100644 index 0000000000000000000000000000000000000000..548376aa24bc7fcc153b93e57f95810f0e95a62b --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/utils/error_rate.py @@ -0,0 +1,364 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""This module provides functions to calculate error rate in different level. +e.g. wer for word-level, cer for char-level. +""" +from itertools import groupby + +import editdistance +import numpy as np + +__all__ = ['word_errors', 'char_errors', 'wer', 'cer', "ErrorCalculator"] + + +def _levenshtein_distance(ref, hyp): + """Levenshtein distance is a string metric for measuring the difference + between two sequences. 
Informally, the levenshtein disctance is defined as + the minimum number of single-character edits (substitutions, insertions or + deletions) required to change one word into the other. We can naturally + extend the edits to word level when calculate levenshtein disctance for + two sentences. + """ + m = len(ref) + n = len(hyp) + + # special case + if ref == hyp: + return 0 + if m == 0: + return n + if n == 0: + return m + + if m < n: + ref, hyp = hyp, ref + m, n = n, m + + # use O(min(m, n)) space + distance = np.zeros((2, n + 1), dtype=np.int32) + + # initialize distance matrix + for j in range(n + 1): + distance[0][j] = j + + # calculate levenshtein distance + for i in range(1, m + 1): + prev_row_idx = (i - 1) % 2 + cur_row_idx = i % 2 + distance[cur_row_idx][0] = i + for j in range(1, n + 1): + if ref[i - 1] == hyp[j - 1]: + distance[cur_row_idx][j] = distance[prev_row_idx][j - 1] + else: + s_num = distance[prev_row_idx][j - 1] + 1 + i_num = distance[cur_row_idx][j - 1] + 1 + d_num = distance[prev_row_idx][j] + 1 + distance[cur_row_idx][j] = min(s_num, i_num, d_num) + + return distance[m % 2][n] + + +def word_errors(reference, hypothesis, ignore_case=False, delimiter=' '): + """Compute the levenshtein distance between reference sequence and + hypothesis sequence in word-level. + + :param reference: The reference sentence. + :type reference: str + :param hypothesis: The hypothesis sentence. + :type hypothesis: str + :param ignore_case: Whether case-sensitive or not. + :type ignore_case: bool + :param delimiter: Delimiter of input sentences. + :type delimiter: char + :return: Levenshtein distance and word number of reference sentence. + :rtype: list + """ + if ignore_case: + reference = reference.lower() + hypothesis = hypothesis.lower() + + ref_words = list(filter(None, reference.split(delimiter))) + hyp_words = list(filter(None, hypothesis.split(delimiter))) + + edit_distance = _levenshtein_distance(ref_words, hyp_words) + # `editdistance.eavl precision` less than `_levenshtein_distance` + # edit_distance = editdistance.eval(ref_words, hyp_words) + return float(edit_distance), len(ref_words) + + +def char_errors(reference, hypothesis, ignore_case=False, remove_space=False): + """Compute the levenshtein distance between reference sequence and + hypothesis sequence in char-level. + + :param reference: The reference sentence. + :type reference: str + :param hypothesis: The hypothesis sentence. + :type hypothesis: str + :param ignore_case: Whether case-sensitive or not. + :type ignore_case: bool + :param remove_space: Whether remove internal space characters + :type remove_space: bool + :return: Levenshtein distance and length of reference sentence. + :rtype: list + """ + if ignore_case: + reference = reference.lower() + hypothesis = hypothesis.lower() + + join_char = ' ' + if remove_space: + join_char = '' + + reference = join_char.join(list(filter(None, reference.split(' ')))) + hypothesis = join_char.join(list(filter(None, hypothesis.split(' ')))) + + edit_distance = _levenshtein_distance(reference, hypothesis) + # `editdistance.eavl precision` less than `_levenshtein_distance` + # edit_distance = editdistance.eval(reference, hypothesis) + return float(edit_distance), len(reference) + + +def wer(reference, hypothesis, ignore_case=False, delimiter=' '): + """Calculate word error rate (WER). WER compares reference text and + hypothesis text in word-level. WER is defined as: + + .. math:: + WER = (Sw + Dw + Iw) / Nw + + where + + .. 
code-block:: text + + Sw is the number of words subsituted, + Dw is the number of words deleted, + Iw is the number of words inserted, + Nw is the number of words in the reference + + We can use levenshtein distance to calculate WER. Please draw an attention + that empty items will be removed when splitting sentences by delimiter. + + :param reference: The reference sentence. + :type reference: str + :param hypothesis: The hypothesis sentence. + :type hypothesis: str + :param ignore_case: Whether case-sensitive or not. + :type ignore_case: bool + :param delimiter: Delimiter of input sentences. + :type delimiter: char + :return: Word error rate. + :rtype: float + :raises ValueError: If word number of reference is zero. + """ + edit_distance, ref_len = word_errors(reference, hypothesis, ignore_case, + delimiter) + + if ref_len == 0: + raise ValueError("Reference's word number should be greater than 0.") + + wer = float(edit_distance) / ref_len + return wer + + +def cer(reference, hypothesis, ignore_case=False, remove_space=False): + """Calculate charactor error rate (CER). CER compares reference text and + hypothesis text in char-level. CER is defined as: + + .. math:: + CER = (Sc + Dc + Ic) / Nc + + where + + .. code-block:: text + + Sc is the number of characters substituted, + Dc is the number of characters deleted, + Ic is the number of characters inserted + Nc is the number of characters in the reference + + We can use levenshtein distance to calculate CER. Chinese input should be + encoded to unicode. Please draw an attention that the leading and tailing + space characters will be truncated and multiple consecutive space + characters in a sentence will be replaced by one space character. + + :param reference: The reference sentence. + :type reference: str + :param hypothesis: The hypothesis sentence. + :type hypothesis: str + :param ignore_case: Whether case-sensitive or not. + :type ignore_case: bool + :param remove_space: Whether remove internal space characters + :type remove_space: bool + :return: Character error rate. + :rtype: float + :raises ValueError: If the reference length is zero. + """ + edit_distance, ref_len = char_errors(reference, hypothesis, ignore_case, + remove_space) + + if ref_len == 0: + raise ValueError("Length of reference should be greater than 0.") + + cer = float(edit_distance) / ref_len + return cer + + +class ErrorCalculator(): + """Calculate CER and WER for E2E_ASR and CTC models during training. + + :param y_hats: numpy array with predicted text + :param y_pads: numpy array with true (target) text + :param char_list: List[str] + :param sym_space: + :param sym_blank: + :return: + """ + + def __init__(self, + char_list, + sym_space, + sym_blank, + report_cer=False, + report_wer=False): + """Construct an ErrorCalculator object.""" + super().__init__() + + self.report_cer = report_cer + self.report_wer = report_wer + + self.char_list = char_list + self.space = sym_space + self.blank = sym_blank + self.idx_blank = self.char_list.index(self.blank) + if self.space in self.char_list: + self.idx_space = self.char_list.index(self.space) + else: + self.idx_space = None + + def __call__(self, ys_hat, ys_pad, is_ctc=False): + """Calculate sentence-level WER/CER score. 
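+        When ``is_ctc`` is True, only the CTC-based CER is computed and returned
+        as a single value; otherwise a ``(cer, wer)`` tuple is returned, where a
+        metric is ``None`` unless the corresponding report flag is enabled.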
+ + :param paddle.Tensor ys_hat: prediction (batch, seqlen) + :param paddle.Tensor ys_pad: reference (batch, seqlen) + :param bool is_ctc: calculate CER score for CTC + :return: sentence-level WER score + :rtype float + :return: sentence-level CER score + :rtype float + """ + cer, wer = None, None + if is_ctc: + return self.calculate_cer_ctc(ys_hat, ys_pad) + elif not self.report_cer and not self.report_wer: + return cer, wer + + seqs_hat, seqs_true = self.convert_to_char(ys_hat, ys_pad) + if self.report_cer: + cer = self.calculate_cer(seqs_hat, seqs_true) + + if self.report_wer: + wer = self.calculate_wer(seqs_hat, seqs_true) + return cer, wer + + def calculate_cer_ctc(self, ys_hat, ys_pad): + """Calculate sentence-level CER score for CTC. + + :param paddle.Tensor ys_hat: prediction (batch, seqlen) + :param paddle.Tensor ys_pad: reference (batch, seqlen) + :return: average sentence-level CER score + :rtype float + """ + cers, char_ref_lens = [], [] + for i, y in enumerate(ys_hat): + y_hat = [x[0] for x in groupby(y)] + y_true = ys_pad[i] + seq_hat, seq_true = [], [] + for idx in y_hat: + idx = int(idx) + if idx != -1 and idx != self.idx_blank and idx != self.idx_space: + seq_hat.append(self.char_list[int(idx)]) + + for idx in y_true: + idx = int(idx) + if idx != -1 and idx != self.idx_blank and idx != self.idx_space: + seq_true.append(self.char_list[int(idx)]) + + hyp_chars = "".join(seq_hat) + ref_chars = "".join(seq_true) + if len(ref_chars) > 0: + cers.append(editdistance.eval(hyp_chars, ref_chars)) + char_ref_lens.append(len(ref_chars)) + + cer_ctc = float(sum(cers)) / sum(char_ref_lens) if cers else None + return cer_ctc + + def convert_to_char(self, ys_hat, ys_pad): + """Convert index to character. + + :param paddle.Tensor seqs_hat: prediction (batch, seqlen) + :param paddle.Tensor seqs_true: reference (batch, seqlen) + :return: token list of prediction + :rtype list + :return: token list of reference + :rtype list + """ + seqs_hat, seqs_true = [], [] + for i, y_hat in enumerate(ys_hat): + y_true = ys_pad[i] + eos_true = np.where(y_true == -1)[0] + ymax = eos_true[0] if len(eos_true) > 0 else len(y_true) + # NOTE: padding index (-1) in y_true is used to pad y_hat + seq_hat = [self.char_list[int(idx)] for idx in y_hat[:ymax]] + seq_true = [ + self.char_list[int(idx)] for idx in y_true if int(idx) != -1 + ] + seq_hat_text = "".join(seq_hat).replace(self.space, " ") + seq_hat_text = seq_hat_text.replace(self.blank, "") + seq_true_text = "".join(seq_true).replace(self.space, " ") + seqs_hat.append(seq_hat_text) + seqs_true.append(seq_true_text) + return seqs_hat, seqs_true + + def calculate_cer(self, seqs_hat, seqs_true): + """Calculate sentence-level CER score. + + :param list seqs_hat: prediction + :param list seqs_true: reference + :return: average sentence-level CER score + :rtype float + """ + char_eds, char_ref_lens = [], [] + for i, seq_hat_text in enumerate(seqs_hat): + seq_true_text = seqs_true[i] + hyp_chars = seq_hat_text.replace(" ", "") + ref_chars = seq_true_text.replace(" ", "") + char_eds.append(editdistance.eval(hyp_chars, ref_chars)) + char_ref_lens.append(len(ref_chars)) + return float(sum(char_eds)) / sum(char_ref_lens) + + def calculate_wer(self, seqs_hat, seqs_true): + """Calculate sentence-level WER score. 
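+        The score is computed at corpus level: the total word edit distance
+        divided by the total number of reference words across all sentences.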
+ + :param list seqs_hat: prediction + :param list seqs_true: reference + :return: average sentence-level WER score + :rtype float + """ + word_eds, word_ref_lens = [], [] + for i, seq_hat_text in enumerate(seqs_hat): + seq_true_text = seqs_true[i] + hyp_words = seq_hat_text.split() + ref_words = seq_true_text.split() + word_eds.append(editdistance.eval(hyp_words, ref_words)) + word_ref_lens.append(len(ref_words)) + return float(sum(word_eds)) / sum(word_ref_lens) diff --git a/ernie-sat/paddlespeech/s2t/utils/layer_tools.py b/ernie-sat/paddlespeech/s2t/utils/layer_tools.py new file mode 100644 index 0000000000000000000000000000000000000000..fb076c0c716938b85e0b52a4268b71993e51a475 --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/utils/layer_tools.py @@ -0,0 +1,88 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import numpy as np +from paddle import nn + +__all__ = [ + "summary", "gradient_norm", "freeze", "unfreeze", "print_grads", + "print_params" +] + + +def summary(layer: nn.Layer, print_func=print): + if print_func is None: + return + num_params = num_elements = 0 + for name, param in layer.state_dict().items(): + if print_func: + print_func( + "{} | {} | {}".format(name, param.shape, np.prod(param.shape))) + num_elements += np.prod(param.shape) + num_params += 1 + if print_func: + num_elements = num_elements / 1024**2 + print_func( + f"Total parameters: {num_params}, {num_elements:.2f}M elements.") + + +def print_grads(model, print_func=print): + if print_func is None: + return + for n, p in model.named_parameters(): + msg = f"param grad: {n}: shape: {p.shape} grad: {p.grad}" + print_func(msg) + + +def print_params(model, print_func=print): + if print_func is None: + return + total = 0.0 + num_params = 0.0 + for n, p in model.named_parameters(): + msg = f"{n} | {p.shape} | {np.prod(p.shape)} | {not p.stop_gradient}" + total += np.prod(p.shape) + num_params += 1 + if print_func: + print_func(msg) + if print_func: + total = total / 1024**2 + print_func(f"Total parameters: {num_params}, {total:.2f}M elements.") + + +def gradient_norm(layer: nn.Layer): + grad_norm_dict = {} + for name, param in layer.state_dict().items(): + if param.trainable: + grad = param.gradient() # return numpy.ndarray + grad_norm_dict[name] = np.linalg.norm(grad) / grad.size + return grad_norm_dict + + +def recursively_remove_weight_norm(layer: nn.Layer): + for layer in layer.sublayers(): + try: + nn.utils.remove_weight_norm(layer) + except ValueError as e: + # ther is not weight norm hoom in this layer + pass + + +def freeze(layer: nn.Layer): + for param in layer.parameters(): + param.trainable = False + + +def unfreeze(layer: nn.Layer): + for param in layer.parameters(): + param.trainable = True diff --git a/ernie-sat/paddlespeech/s2t/utils/log.py b/ernie-sat/paddlespeech/s2t/utils/log.py new file mode 100644 index 0000000000000000000000000000000000000000..4f51b7f05e5c14b4ec7877b32ee9cc016ee5ba13 --- /dev/null +++ 
b/ernie-sat/paddlespeech/s2t/utils/log.py @@ -0,0 +1,162 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import getpass +import inspect +import os +import socket +import sys + +from loguru import logger +from paddle import inference + + +def find_log_dir(log_dir=None): + """Returns the most suitable directory to put log files into. + Args: + log_dir: str|None, if specified, the logfile(s) will be created in that + directory. Otherwise if the --log_dir command-line flag is provided, + the logfile will be created in that directory. Otherwise the logfile + will be created in a standard location. + Raises: + FileNotFoundError: raised when it cannot find a log directory. + """ + # Get a list of possible log dirs (will try to use them in order). + if log_dir: + # log_dir was explicitly specified as an arg, so use it and it alone. + dirs = [log_dir] + else: + dirs = ['/tmp/', './'] + + # Find the first usable log dir. + for d in dirs: + if os.path.isdir(d) and os.access(d, os.W_OK): + return d + raise FileNotFoundError( + "Can't find a writable directory for logs, tried %s" % dirs) + + +def find_log_dir_and_names(program_name=None, log_dir=None): + """Computes the directory and filename prefix for log file. + Args: + program_name: str|None, the filename part of the path to the program that + is running without its extension. e.g: if your program is called + 'usr/bin/foobar.py' this method should probably be called with + program_name='foobar' However, this is just a convention, you can + pass in any string you want, and it will be used as part of the + log filename. If you don't pass in anything, the default behavior + is as described in the example. In python standard logging mode, + the program_name will be prepended with py_ if it is the program_name + argument is omitted. + log_dir: str|None, the desired log directory. + Returns: + (log_dir, file_prefix, symlink_prefix) + Raises: + FileNotFoundError: raised in Python 3 when it cannot find a log directory. + OSError: raised in Python 2 when it cannot find a log directory. + """ + if not program_name: + # Strip the extension (foobar.par becomes foobar, and + # fubar.py becomes fubar). We do this so that the log + # file names are similar to C++ log file names. + program_name = os.path.splitext(os.path.basename(sys.argv[0]))[0] + + # Prepend py_ to files so that python code gets a unique file, and + # so that C++ libraries do not try to write to the same log files as us. + program_name = 'py_%s' % program_name + + actual_log_dir = find_log_dir(log_dir=log_dir) + + try: + username = getpass.getuser() + except KeyError: + # This can happen, e.g. when running under docker w/o passwd file. 
+ if hasattr(os, 'getuid'): + # Windows doesn't have os.getuid + username = str(os.getuid()) + else: + username = 'unknown' + hostname = socket.gethostname() + file_prefix = '%s.%s.%s.log' % (program_name, hostname, username) + + return actual_log_dir, file_prefix, program_name + + +class Log(): + """Default Logger for all.""" + logger.remove() + + _call_from_cli = False + _frame = inspect.currentframe() + while _frame: + if 'paddlespeech/cli/__init__.py' in _frame.f_code.co_filename or 'paddlespeech/t2s' in _frame.f_code.co_filename: + _call_from_cli = True + break + _frame = _frame.f_back + + if _call_from_cli: + logger.add( + sys.stdout, + level='ERROR', + enqueue=True, + filter=lambda record: record['level'].no >= 20) + else: + logger.add( + sys.stdout, + level='INFO', + enqueue=True, + filter=lambda record: record['level'].no >= 20) + _, file_prefix, _ = find_log_dir_and_names() + sink_prefix = os.path.join("exp/log", file_prefix) + sink_path = sink_prefix[:-3] + "{time}.log" + logger.add(sink_path, level='DEBUG', enqueue=True, rotation="500 MB") + + def __init__(self, name=None): + pass + + def getlog(self): + return logger + + +class Autolog: + """Just used by fullchain project""" + + def __init__(self, + batch_size, + model_name="DeepSpeech", + model_precision="fp32"): + import auto_log + pid = os.getpid() + if os.environ.get('CUDA_VISIBLE_DEVICES', None): + gpu_id = int(os.environ['CUDA_VISIBLE_DEVICES'].split(',')[0]) + infer_config = inference.Config() + infer_config.enable_use_gpu(100, gpu_id) + else: + gpu_id = None + infer_config = inference.Config() + + self.autolog = auto_log.AutoLogger( + model_name=model_name, + model_precision=model_precision, + batch_size=batch_size, + data_shape="dynamic", + save_path="./output/auto_log.lpg", + inference_config=infer_config, + pids=pid, + process_name=None, + gpu_ids=gpu_id, + time_keys=['preprocess_time', 'inference_time', 'postprocess_time'], + warmup=0) + + def getlog(self): + return self.autolog diff --git a/ernie-sat/paddlespeech/s2t/utils/mp_tools.py b/ernie-sat/paddlespeech/s2t/utils/mp_tools.py new file mode 100644 index 0000000000000000000000000000000000000000..d3e25aab68ad597df14f168095db9080e48ee997 --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/utils/mp_tools.py @@ -0,0 +1,30 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from functools import wraps + +from paddle import distributed as dist + +__all__ = ["rank_zero_only"] + + +def rank_zero_only(func): + @wraps(func) + def wrapper(*args, **kwargs): + rank = dist.get_rank() + if rank != 0: + return + result = func(*args, **kwargs) + return result + + return wrapper diff --git a/ernie-sat/paddlespeech/s2t/utils/profiler.py b/ernie-sat/paddlespeech/s2t/utils/profiler.py new file mode 100644 index 0000000000000000000000000000000000000000..3592157dc17eac5991bbaabdc1b757b7198827ef --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/utils/profiler.py @@ -0,0 +1,119 @@ +# copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import sys + +import paddle + +from paddlespeech.s2t.utils.log import Log + +logger = Log(__name__).getlog() + +# A global variable to record the number of calling times for profiler +# functions. It is used to specify the tracing range of training steps. +_profiler_step_id = 0 + +# A global variable to avoid parsing from string every time. +_profiler_options = None + + +class ProfilerOptions(object): + ''' + Use a string to initialize a ProfilerOptions. + The string should be in the format: "key1=value1;key2=value;key3=value3". + For example: + "profile_path=model.profile" + "batch_range=[50, 60]; profile_path=model.profile" + "batch_range=[50, 60]; tracer_option=OpDetail; profile_path=model.profile" + ProfilerOptions supports following key-value pair: + batch_range - a integer list, e.g. [100, 110]. + state - a string, the optional values are 'CPU', 'GPU' or 'All'. + sorted_key - a string, the optional values are 'calls', 'total', + 'max', 'min' or 'ave. + tracer_option - a string, the optional values are 'Default', 'OpDetail', + 'AllOpDetail'. + profile_path - a string, the path to save the serialized profile data, + which can be used to generate a timeline. + exit_on_finished - a boolean. + ''' + + def __init__(self, options_str): + assert isinstance(options_str, str) + + self._options = { + 'batch_range': [10, 20], + 'state': 'All', + 'sorted_key': 'total', + 'tracer_option': 'Default', + 'profile_path': '/tmp/profile', + 'exit_on_finished': True + } + self._parse_from_string(options_str) + + def _parse_from_string(self, options_str): + if not options_str: + return + + for kv in options_str.replace(' ', '').split(';'): + key, value = kv.split('=') + if key == 'batch_range': + value_list = value.replace('[', '').replace(']', '').split(',') + value_list = list(map(int, value_list)) + if len(value_list) >= 2 and value_list[0] >= 0 and value_list[ + 1] > value_list[0]: + self._options[key] = value_list + elif key == 'exit_on_finished': + self._options[key] = value.lower() in ("yes", "true", "t", "1") + elif key in [ + 'state', 'sorted_key', 'tracer_option', 'profile_path' + ]: + self._options[key] = value + + def __getitem__(self, name): + if self._options.get(name, None) is None: + raise ValueError( + "ProfilerOptions does not have an option named %s." % name) + return self._options[name] + + +def add_profiler_step(options_str=None): + ''' + Enable the operator-level timing using PaddlePaddle's profiler. + The profiler uses a independent variable to count the profiler steps. + One call of this function is treated as a profiler step. + + Args: + profiler_options - a string to initialize the ProfilerOptions. + Default is None, and the profiler is disabled. 
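+
+    Example (an illustrative sketch; the option string and the loop names are
+        assumed placeholders):
+        # call once per training step; steps 10-20 would then be profiled on GPU
+        for batch in train_loader:
+            add_profiler_step("batch_range=[10, 20]; state=GPU")
+            run_one_step(batch)
+        # with the default exit_on_finished=True the process exits after step 20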
+ ''' + if options_str is None: + return + + global _profiler_step_id + global _profiler_options + + if _profiler_options is None: + _profiler_options = ProfilerOptions(options_str) + logger.info(f"Profiler: {options_str}") + logger.info(f"Profiler: {_profiler_options._options}") + + if _profiler_step_id == _profiler_options['batch_range'][0]: + paddle.utils.profiler.start_profiler(_profiler_options['state'], + _profiler_options['tracer_option']) + elif _profiler_step_id == _profiler_options['batch_range'][1]: + paddle.utils.profiler.stop_profiler(_profiler_options['sorted_key'], + _profiler_options['profile_path']) + if _profiler_options['exit_on_finished']: + sys.exit(0) + + _profiler_step_id += 1 diff --git a/ernie-sat/paddlespeech/s2t/utils/socket_server.py b/ernie-sat/paddlespeech/s2t/utils/socket_server.py new file mode 100644 index 0000000000000000000000000000000000000000..691ea966821dce4923652b68937dd4a4fbb17ede --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/utils/socket_server.py @@ -0,0 +1,113 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +import random +import socket +import socketserver +import struct +import time +import wave +from time import gmtime +from time import strftime + +import jsonlines + +__all__ = ["socket_send", "warm_up_test", "AsrTCPServer", "AsrRequestHandler"] + + +def socket_send(server_ip: str, server_port: str, data: bytes): + # Connect to server and send data + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + sock.connect((server_ip, server_port)) + sent = data + sock.sendall(struct.pack('>i', len(sent)) + sent) + print('Speech[length=%d] Sent.' 
% len(sent)) + # Receive data from the server and shut down + received = sock.recv(1024) + print("Recognition Results: {}".format(received.decode('utf8'))) + sock.close() + + +def warm_up_test(audio_process_handler, + manifest_path, + num_test_cases, + random_seed=0): + """Warming-up test.""" + with jsonlines.open(manifest_path) as reader: + manifest = list(reader) + rng = random.Random(random_seed) + samples = rng.sample(manifest, num_test_cases) + for idx, sample in enumerate(samples): + print("Warm-up Test Case %d: %s" % (idx, sample['feat'])) + start_time = time.time() + transcript = audio_process_handler(sample['feat']) + finish_time = time.time() + print("Response Time: %f, Transcript: %s" % + (finish_time - start_time, transcript)) + + +class AsrTCPServer(socketserver.TCPServer): + """The ASR TCP Server.""" + + def __init__(self, + server_address, + RequestHandlerClass, + speech_save_dir, + audio_process_handler, + bind_and_activate=True): + self.speech_save_dir = speech_save_dir + self.audio_process_handler = audio_process_handler + socketserver.TCPServer.__init__( + self, server_address, RequestHandlerClass, bind_and_activate=True) + + +class AsrRequestHandler(socketserver.BaseRequestHandler): + """The ASR request handler.""" + + def handle(self): + # receive data through TCP socket + chunk = self.request.recv(1024) + target_len = struct.unpack('>i', chunk[:4])[0] + data = chunk[4:] + while len(data) < target_len: + chunk = self.request.recv(1024) + data += chunk + # write to file + filename = self._write_to_file(data) + + print("Received utterance[length=%d] from %s, saved to %s." % + (len(data), self.client_address[0], filename)) + start_time = time.time() + transcript = self.server.audio_process_handler(filename) + finish_time = time.time() + print("Response Time: %f, Transcript: %s" % + (finish_time - start_time, transcript)) + self.request.sendall(transcript.encode('utf-8')) + + def _write_to_file(self, data): + # prepare save dir and filename + if not os.path.exists(self.server.speech_save_dir): + os.mkdir(self.server.speech_save_dir) + timestamp = strftime("%Y%m%d%H%M%S", gmtime()) + out_filename = os.path.join( + self.server.speech_save_dir, + timestamp + "_" + self.client_address[0] + ".wav") + # write to wav file + file = wave.open(out_filename, 'wb') + file.setnchannels(1) + file.setsampwidth(2) + file.setframerate(16000) + file.writeframes(data) + file.close() + return out_filename diff --git a/ernie-sat/paddlespeech/s2t/utils/spec_augment.py b/ernie-sat/paddlespeech/s2t/utils/spec_augment.py new file mode 100644 index 0000000000000000000000000000000000000000..185a92b8d94d3426d616c0624f0f2ee04339349e --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/utils/spec_augment.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/ernie-sat/paddlespeech/s2t/utils/tensor_utils.py b/ernie-sat/paddlespeech/s2t/utils/tensor_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..0dbaa0b6b77031d4b8e8aa29fcc9246458b8ab99 --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/utils/tensor_utils.py @@ -0,0 +1,195 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Unility functions for Transformer.""" +from typing import List +from typing import Tuple + +import paddle + +from paddlespeech.s2t.utils.log import Log + +__all__ = ["pad_sequence", "add_sos_eos", "th_accuracy", "has_tensor"] + +logger = Log(__name__).getlog() + + +def has_tensor(val): + if isinstance(val, (list, tuple)): + for item in val: + if has_tensor(item): + return True + elif isinstance(val, dict): + for k, v in val.items(): + print(k) + if has_tensor(v): + return True + else: + return paddle.is_tensor(val) + + +def pad_sequence(sequences: List[paddle.Tensor], + batch_first: bool=False, + padding_value: float=0.0) -> paddle.Tensor: + r"""Pad a list of variable length Tensors with ``padding_value`` + + ``pad_sequence`` stacks a list of Tensors along a new dimension, + and pads them to equal length. For example, if the input is list of + sequences with size ``L x *`` and if batch_first is False, and ``T x B x *`` + otherwise. + + `B` is batch size. It is equal to the number of elements in ``sequences``. + `T` is length of the longest sequence. + `L` is length of the sequence. + `*` is any number of trailing dimensions, including none. + + Example: + >>> from paddle.nn.utils.rnn import pad_sequence + >>> a = paddle.ones(25, 300) + >>> b = paddle.ones(22, 300) + >>> c = paddle.ones(15, 300) + >>> pad_sequence([a, b, c]).size() + paddle.Tensor([25, 3, 300]) + + Note: + This function returns a Tensor of size ``T x B x *`` or ``B x T x *`` + where `T` is the length of the longest sequence. This function assumes + trailing dimensions and type of all the Tensors in sequences are same. + + Args: + sequences (list[Tensor]): list of variable length sequences. + batch_first (bool, optional): output will be in ``B x T x *`` if True, or in + ``T x B x *`` otherwise + padding_value (float, optional): value for padded elements. Default: 0. + + Returns: + Tensor of size ``T x B x *`` if :attr:`batch_first` is ``False``. 
+ Tensor of size ``B x T x *`` otherwise + """ + + # assuming trailing dimensions and type of all the Tensors + # in sequences are same and fetching those from sequences[0] + max_size = sequences[0].size() + # (TODO Hui Zhang): slice not supprot `end==start` + # trailing_dims = max_size[1:] + trailing_dims = max_size[1:] if max_size.ndim >= 2 else () + max_len = max([s.shape[0] for s in sequences]) + if batch_first: + out_dims = (len(sequences), max_len) + trailing_dims + else: + out_dims = (max_len, len(sequences)) + trailing_dims + + out_tensor = sequences[0].new_full(out_dims, padding_value) + for i, tensor in enumerate(sequences): + length = tensor.shape[0] + # use index notation to prevent duplicate references to the tensor + logger.info( + f"length {length}, out_tensor {out_tensor.shape}, tensor {tensor.shape}" + ) + if batch_first: + # TODO (Hui Zhang): set_value op not supprot `end==start` + # TODO (Hui Zhang): set_value op not support int16 + # TODO (Hui Zhang): set_varbase 2 rank not support [0,0,...] + # out_tensor[i, :length, ...] = tensor + if length != 0: + out_tensor[i, :length] = tensor + else: + out_tensor[i, length] = tensor + else: + # TODO (Hui Zhang): set_value op not supprot `end==start` + # out_tensor[:length, i, ...] = tensor + if length != 0: + out_tensor[:length, i] = tensor + else: + out_tensor[length, i] = tensor + + return out_tensor + + +def add_sos_eos(ys_pad: paddle.Tensor, sos: int, eos: int, + ignore_id: int) -> Tuple[paddle.Tensor, paddle.Tensor]: + """Add and labels. + Args: + ys_pad (paddle.Tensor): batch of padded target sequences (B, Lmax) + sos (int): index of + eos (int): index of + ignore_id (int): index of padding + Returns: + ys_in (paddle.Tensor) : (B, Lmax + 1) + ys_out (paddle.Tensor) : (B, Lmax + 1) + Examples: + >>> sos_id = 10 + >>> eos_id = 11 + >>> ignore_id = -1 + >>> ys_pad + tensor([[ 1, 2, 3, 4, 5], + [ 4, 5, 6, -1, -1], + [ 7, 8, 9, -1, -1]], dtype=paddle.int32) + >>> ys_in,ys_out=add_sos_eos(ys_pad, sos_id , eos_id, ignore_id) + >>> ys_in + tensor([[10, 1, 2, 3, 4, 5], + [10, 4, 5, 6, 11, 11], + [10, 7, 8, 9, 11, 11]]) + >>> ys_out + tensor([[ 1, 2, 3, 4, 5, 11], + [ 4, 5, 6, 11, -1, -1], + [ 7, 8, 9, 11, -1, -1]]) + """ + # TODO(Hui Zhang): using comment code, + #_sos = paddle.to_tensor( + # [sos], dtype=paddle.long, stop_gradient=True, place=ys_pad.place) + #_eos = paddle.to_tensor( + # [eos], dtype=paddle.long, stop_gradient=True, place=ys_pad.place) + #ys = [y[y != ignore_id] for y in ys_pad] # parse padded ys + #ys_in = [paddle.cat([_sos, y], dim=0) for y in ys] + #ys_out = [paddle.cat([y, _eos], dim=0) for y in ys] + #return pad_sequence(ys_in, padding_value=eos), pad_sequence(ys_out, padding_value=ignore_id) + B = ys_pad.shape[0] + _sos = paddle.ones([B, 1], dtype=ys_pad.dtype) * sos + _eos = paddle.ones([B, 1], dtype=ys_pad.dtype) * eos + ys_in = paddle.cat([_sos, ys_pad], dim=1) + mask_pad = (ys_in == ignore_id) + ys_in = ys_in.masked_fill(mask_pad, eos) + + ys_out = paddle.cat([ys_pad, _eos], dim=1) + ys_out = ys_out.masked_fill(mask_pad, eos) + mask_eos = (ys_out == ignore_id) + ys_out = ys_out.masked_fill(mask_eos, eos) + ys_out = ys_out.masked_fill(mask_pad, ignore_id) + return ys_in, ys_out + + +def th_accuracy(pad_outputs: paddle.Tensor, + pad_targets: paddle.Tensor, + ignore_label: int) -> float: + """Calculate accuracy. + Args: + pad_outputs (Tensor): Prediction tensors (B * Lmax, D). + pad_targets (LongTensor): Target label tensors (B, Lmax, D). + ignore_label (int): Ignore label id. 
+ Returns: + float: Accuracy value (0.0 - 1.0). + """ + pad_pred = pad_outputs.view(pad_targets.shape[0], pad_targets.shape[1], + pad_outputs.shape[1]).argmax(2) + mask = pad_targets != ignore_label + #TODO(Hui Zhang): sum not support bool type + # numerator = paddle.sum( + # pad_pred.masked_select(mask) == pad_targets.masked_select(mask)) + numerator = ( + pad_pred.masked_select(mask) == pad_targets.masked_select(mask)) + numerator = paddle.sum(numerator.type_as(pad_targets)) + #TODO(Hui Zhang): sum not support bool type + # denominator = paddle.sum(mask) + denominator = paddle.sum(mask.type_as(pad_targets)) + return float(numerator) / float(denominator) diff --git a/ernie-sat/paddlespeech/s2t/utils/text_grid.py b/ernie-sat/paddlespeech/s2t/utils/text_grid.py new file mode 100644 index 0000000000000000000000000000000000000000..cbd9856e40d72897cd08d3618178e60f7a34ea0f --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/utils/text_grid.py @@ -0,0 +1,128 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from wenet(https://github.com/wenet-e2e/wenet) +from typing import Dict +from typing import List +from typing import Text + +import textgrid + + +def segment_alignment(alignment: List[int], blank_id=0) -> List[List[int]]: + """segment ctc alignment ids by continuous blank and repeat label. + + Args: + alignment (List[int]): ctc alignment id sequence. + e.g. [0, 0, 0, 1, 1, 1, 2, 0, 0, 3] + blank_id (int, optional): blank id. Defaults to 0. + + Returns: + List[List[int]]: token align, segment aligment id sequence. + e.g. [[0, 0, 0, 1, 1, 1], [2], [0, 0, 3]] + """ + # convert alignment to a praat format, which is a doing phonetics + # by computer and helps analyzing alignment + align_segs = [] + # get frames level duration for each token + start = 0 + end = 0 + while end < len(alignment): + while end < len(alignment) and alignment[end] == blank_id: # blank + end += 1 + if end == len(alignment): + align_segs[-1].extend(alignment[start:]) + break + end += 1 + while end < len(alignment) and alignment[end - 1] == alignment[ + end]: # repeat label + end += 1 + align_segs.append(alignment[start:end]) + start = end + return align_segs + + +def align_to_tierformat(align_segs: List[List[int]], + subsample: int, + token_dict: Dict[int, Text], + blank_id=0) -> List[Text]: + """Generate textgrid.Interval format from alignment segmentations. + + Args: + align_segs (List[List[int]]): segmented ctc alignment ids. + subsample (int): 25ms frame_length, 10ms hop_length, 1/subsample + token_dict (Dict[int, Text]): int -> str map. + + Returns: + List[Text]: list of textgrid.Interval text, str(start, end, text). 
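+
+    Example (an illustrative sketch; the token ids and dict are assumed values):
+        >>> segs = [[0, 0, 1, 1], [2], [0, 3]]
+        >>> token_dict = {0: '<blank>', 1: 'a', 2: 'b', 3: 'c'}
+        >>> tier = align_to_tierformat(segs, subsample=4, token_dict=token_dict)
+        # tier holds one "start end token" line per segment, here:
+        # "0.00 0.16 a", "0.16 0.20 b", "0.20 0.28 c"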
+ """ + hop_length = 10 # ms + second_ms = 1000 # ms + frame_per_second = second_ms / hop_length # 25ms frame_length, 10ms hop_length + second_per_frame = 1.0 / frame_per_second + + begin = 0 + duration = 0 + tierformat = [] + + for idx, tokens in enumerate(align_segs): + token_len = len(tokens) + token = tokens[-1] + # time duration in second + duration = token_len * subsample * second_per_frame + if idx < len(align_segs) - 1: + print(f"{begin:.2f} {begin + duration:.2f} {token_dict[token]}") + tierformat.append( + f"{begin:.2f} {begin + duration:.2f} {token_dict[token]}\n") + else: + for i in tokens: + if i != blank_id: + token = i + break + print(f"{begin:.2f} {begin + duration:.2f} {token_dict[token]}") + tierformat.append( + f"{begin:.2f} {begin + duration:.2f} {token_dict[token]}\n") + begin = begin + duration + + return tierformat + + +def generate_textgrid(maxtime: float, + intervals: List[Text], + output: Text, + name: Text='ali') -> None: + """Create alignment textgrid file. + + Args: + maxtime (float): audio duartion. + intervals (List[Text]): ctc output alignment. e.g. "start-time end-time word" per item. + output (Text): textgrid filepath. + name (Text, optional): tier or layer name. Defaults to 'ali'. + """ + # Download Praat: https://www.fon.hum.uva.nl/praat/ + avg_interval = maxtime / (len(intervals) + 1) + print(f"average second/token: {avg_interval}") + margin = 0.0001 + + tg = textgrid.TextGrid(maxTime=maxtime) + tier = textgrid.IntervalTier(name=name, maxTime=maxtime) + + i = 0 + for dur in intervals: + s, e, text = dur.split() + tier.add(minTime=float(s) + margin, maxTime=float(e), mark=text) + + tg.append(tier) + + tg.write(output) + print("successfully generator textgrid {}.".format(output)) diff --git a/ernie-sat/paddlespeech/s2t/utils/utility.py b/ernie-sat/paddlespeech/s2t/utils/utility.py new file mode 100644 index 0000000000000000000000000000000000000000..fdd8c029232b582f46adfe8d06cae817d89d283d --- /dev/null +++ b/ernie-sat/paddlespeech/s2t/utils/utility.py @@ -0,0 +1,140 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Contains common utility functions.""" +import math +import os +import random +import sys +from contextlib import contextmanager +from pprint import pformat +from typing import List + +import distutils.util +import numpy as np +import paddle +import soundfile + +from paddlespeech.s2t.utils.log import Log + +logger = Log(__name__).getlog() + +__all__ = [ + "all_version", "UpdateConfig", "seed_all", 'print_arguments', + 'add_arguments', "log_add" +] + + +def all_version(): + vers = { + "python": sys.version, + "paddle": paddle.__version__, + "paddle_commit": paddle.version.commit, + "soundfile": soundfile.__version__, + } + logger.info(f"Deps Module Version:{pformat(list(vers.items()))}") + + +@contextmanager +def UpdateConfig(config): + """Update yacs config""" + config.defrost() + yield + config.freeze() + + +def seed_all(seed: int=20210329): + """freeze random generator seed.""" + np.random.seed(seed) + random.seed(seed) + paddle.seed(seed) + + +def print_arguments(args, info=None): + """Print argparse's arguments. + + Usage: + + .. code-block:: python + + parser = argparse.ArgumentParser() + parser.add_argument("name", default="Jonh", type=str, help="User name.") + args = parser.parse_args() + print_arguments(args) + + :param args: Input argparse.Namespace for printing. + :type args: argparse.Namespace + """ + filename = "" + if info: + filename = info["__file__"] + filename = os.path.basename(filename) + print(f"----------- {filename} Arguments -----------") + for arg, value in sorted(vars(args).items()): + print("%s: %s" % (arg, value)) + print("-----------------------------------------------------------") + + +def add_arguments(argname, type, default, help, argparser, **kwargs): + """Add argparse's argument. + + Usage: + + .. code-block:: python + + parser = argparse.ArgumentParser() + add_argument("name", str, "Jonh", "User name.", parser) + args = parser.parse_args() + """ + type = distutils.util.strtobool if type == bool else type + argparser.add_argument( + "--" + argname, + default=default, + type=type, + help=help + ' Default: %(default)s.', + **kwargs) + + +def log_add(args: List[int]) -> float: + """Stable log add + + Args: + args (List[int]): log scores + + Returns: + float: sum of log scores + """ + if all(a == -float('inf') for a in args): + return -float('inf') + a_max = max(args) + lsp = math.log(sum(math.exp(a - a_max) for a in args)) + return a_max + lsp + + +def get_subsample(config): + """Subsample rate from config. + + Args: + config (yacs.config.CfgNode): yaml config + + Returns: + int: subsample rate. + """ + input_layer = config["encoder_conf"]["input_layer"] + assert input_layer in ["conv2d", "conv2d6", "conv2d8"] + if input_layer == "conv2d": + return 4 + elif input_layer == "conv2d6": + return 6 + elif input_layer == "conv2d8": + return 8 diff --git a/ernie-sat/paddlespeech/server/README.md b/ernie-sat/paddlespeech/server/README.md new file mode 100644 index 0000000000000000000000000000000000000000..819fe440d220c1f4b06b2557978c9205ede804e0 --- /dev/null +++ b/ernie-sat/paddlespeech/server/README.md @@ -0,0 +1,37 @@ +# PaddleSpeech Server Command Line + +([简体中文](./README_cn.md)|English) + + The simplest approach to use PaddleSpeech Server including server and client. + + ## PaddleSpeech Server + ### Help + ```bash + paddlespeech_server help + ``` + ### Start the server + First set the service-related configuration parameters, similar to `./conf/application.yaml`. 
Set `engine_list`, which represents the speech tasks included in the service to be started + Then start the service: + ```bash + paddlespeech_server start --config_file ./conf/application.yaml + ``` + + ## PaddleSpeech Client + ### Help + ```bash + paddlespeech_client help + ``` + ### Access speech recognition services + ``` + paddlespeech_client asr --server_ip 127.0.0.1 --port 8090 --input input_16k.wav + ``` + + ### Access text to speech services + ```bash + paddlespeech_client tts --server_ip 127.0.0.1 --port 8090 --input "你好,欢迎使用百度飞桨深度学习框架!" --output output.wav + ``` + + ### Access audio classification services + ```bash + paddlespeech_client cls --server_ip 127.0.0.1 --port 8090 --input input.wav + ``` diff --git a/ernie-sat/paddlespeech/server/README_cn.md b/ernie-sat/paddlespeech/server/README_cn.md new file mode 100644 index 0000000000000000000000000000000000000000..c0a4a7336700c642efc2172dfa14416dff0ef5ec --- /dev/null +++ b/ernie-sat/paddlespeech/server/README_cn.md @@ -0,0 +1,37 @@ +# PaddleSpeech Server 命令行工具 + +(简体中文|[English](./README.md)) + +它提供了最简便的方式调用 PaddleSpeech 语音服务用一行命令就可以轻松启动服务和调用服务。 + + ## 服务端命令行使用 + ### 帮助 + ```bash + paddlespeech_server help + ``` + ### 启动服务 + 首先设置服务相关配置文件,类似于 `./conf/application.yaml`,设置 `engine_list`,该值表示即将启动的服务中包含的语音任务。 + 然后启动服务: + ```bash + paddlespeech_server start --config_file ./conf/application.yaml + ``` + + ## 客户端命令行使用 + ### 帮助 + ```bash + paddlespeech_client help + ``` + ### 访问语音识别服务 + ``` + paddlespeech_client asr --server_ip 127.0.0.1 --port 8090 --input input_16k.wav + ``` + + ### 访问语音合成服务 + ```bash + paddlespeech_client tts --server_ip 127.0.0.1 --port 8090 --input "你好,欢迎使用百度飞桨深度学习框架!" --output output.wav + ``` + + ### 访问音频分类服务 + ```bash + paddlespeech_client cls --server_ip 127.0.0.1 --port 8090 --input input.wav + ``` diff --git a/ernie-sat/paddlespeech/server/__init__.py b/ernie-sat/paddlespeech/server/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..97722c0a0cbb3b1978f182b554ba466f2ce41ea5 --- /dev/null +++ b/ernie-sat/paddlespeech/server/__init__.py @@ -0,0 +1,25 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import _locale + +from .base_commands import ClientBaseCommand +from .base_commands import ClientHelpCommand +from .base_commands import ServerBaseCommand +from .base_commands import ServerHelpCommand +from .bin.paddlespeech_client import ASRClientExecutor +from .bin.paddlespeech_client import CLSClientExecutor +from .bin.paddlespeech_client import TTSClientExecutor +from .bin.paddlespeech_server import ServerExecutor + +_locale._getdefaultlocale = (lambda *args: ['en_US', 'utf8']) diff --git a/ernie-sat/paddlespeech/server/base_commands.py b/ernie-sat/paddlespeech/server/base_commands.py new file mode 100644 index 0000000000000000000000000000000000000000..d1239297d47c88d6169c2622ff89b568a9292c68 --- /dev/null +++ b/ernie-sat/paddlespeech/server/base_commands.py @@ -0,0 +1,82 @@ +# Copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import List + +from .entry import client_commands +from .entry import server_commands +from .util import cli_client_register +from .util import cli_server_register +from .util import get_client_command +from .util import get_server_command + +__all__ = [ + 'ServerBaseCommand', + 'ServerHelpCommand', + 'ClientBaseCommand', + 'ClientHelpCommand', +] + + +@cli_server_register(name='paddlespeech_server') +class ServerBaseCommand: + def execute(self, argv: List[str]) -> bool: + help = get_server_command('paddlespeech_server.help') + return help().execute(argv) + + +@cli_server_register( + name='paddlespeech_server.help', description='Show help for commands.') +class ServerHelpCommand: + def execute(self, argv: List[str]) -> bool: + msg = 'Usage:\n' + msg += ' paddlespeech_server \n\n' + msg += 'Commands:\n' + for command, detail in server_commands['paddlespeech_server'].items(): + if command.startswith('_'): + continue + + if '_description' not in detail: + continue + msg += ' {:<15} {}\n'.format(command, + detail['_description']) + + print(msg) + return True + + +@cli_client_register(name='paddlespeech_client') +class ClientBaseCommand: + def execute(self, argv: List[str]) -> bool: + help = get_client_command('paddlespeech_client.help') + return help().execute(argv) + + +@cli_client_register( + name='paddlespeech_client.help', description='Show help for commands.') +class ClientHelpCommand: + def execute(self, argv: List[str]) -> bool: + msg = 'Usage:\n' + msg += ' paddlespeech_client \n\n' + msg += 'Commands:\n' + for command, detail in client_commands['paddlespeech_client'].items(): + if command.startswith('_'): + continue + + if '_description' not in detail: + continue + msg += ' {:<15} {}\n'.format(command, + detail['_description']) + + print(msg) + return True diff --git a/ernie-sat/paddlespeech/server/bin/__init__.py b/ernie-sat/paddlespeech/server/bin/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..025aab098f2b6d56ced56d499ce619feb190ab2d --- /dev/null +++ b/ernie-sat/paddlespeech/server/bin/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from .paddlespeech_client import ASRClientExecutor +from .paddlespeech_client import TTSClientExecutor +from .paddlespeech_server import ServerExecutor +from .paddlespeech_server import ServerStatsExecutor diff --git a/ernie-sat/paddlespeech/server/bin/main.py b/ernie-sat/paddlespeech/server/bin/main.py new file mode 100644 index 0000000000000000000000000000000000000000..81824c85c46687ff12d2ffd366743eaf237dbd9a --- /dev/null +++ b/ernie-sat/paddlespeech/server/bin/main.py @@ -0,0 +1,77 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse + +import uvicorn +from fastapi import FastAPI + +from paddlespeech.server.engine.engine_pool import init_engine_pool +from paddlespeech.server.restful.api import setup_router as setup_http_router +from paddlespeech.server.utils.config import get_config +from paddlespeech.server.ws.api import setup_router as setup_ws_router + +app = FastAPI( + title="PaddleSpeech Serving API", description="Api", version="0.0.1") + + +def init(config): + """system initialization + + Args: + config (CfgNode): config object + + Returns: + bool: + """ + # init api + api_list = list(engine.split("_")[0] for engine in config.engine_list) + if config.protocol == "websocket": + api_router = setup_ws_router(api_list) + elif config.protocol == "http": + api_router = setup_http_router(api_list) + else: + raise Exception("unsupported protocol") + app.include_router(api_router) + + if not init_engine_pool(config): + return False + + return True + + +def main(args): + """main function""" + + config = get_config(args.config_file) + + if init(config): + uvicorn.run(app, host=config.host, port=config.port, debug=True) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--config_file", + action="store", + help="yaml file of the app", + default="./conf/application.yaml") + + parser.add_argument( + "--log_file", + action="store", + help="log file", + default="./log/paddlespeech.log") + args = parser.parse_args() + + main(args) diff --git a/ernie-sat/paddlespeech/server/bin/paddlespeech_client.py b/ernie-sat/paddlespeech/server/bin/paddlespeech_client.py new file mode 100644 index 0000000000000000000000000000000000000000..413f00872327b1ef364146d12b8cd8540eec421f --- /dev/null +++ b/ernie-sat/paddlespeech/server/bin/paddlespeech_client.py @@ -0,0 +1,289 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
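+# Note: besides the command-line entry points registered below, each client
+# executor also exposes a Python API through __call__, e.g. (illustrative
+# values only):
+#   TTSClientExecutor()(input="你好,欢迎使用百度飞桨深度学习框架!",
+#                       server_ip="127.0.0.1", port=8090, output="output.wav")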
+import argparse +import base64 +import io +import json +import os +import random +import time +from typing import List + +import numpy as np +import requests +import soundfile + +from ..executor import BaseExecutor +from ..util import cli_client_register +from ..util import stats_wrapper +from paddlespeech.cli.log import logger +from paddlespeech.server.utils.audio_process import wav2pcm +from paddlespeech.server.utils.util import wav2base64 + +__all__ = ['TTSClientExecutor', 'ASRClientExecutor', 'CLSClientExecutor'] + + +@cli_client_register( + name='paddlespeech_client.tts', description='visit tts service') +class TTSClientExecutor(BaseExecutor): + def __init__(self): + super(TTSClientExecutor, self).__init__() + self.parser = argparse.ArgumentParser( + prog='paddlespeech_client.tts', add_help=True) + self.parser.add_argument( + '--server_ip', type=str, default='127.0.0.1', help='server ip') + self.parser.add_argument( + '--port', type=int, default=8090, help='server port') + self.parser.add_argument( + '--input', + type=str, + default=None, + help='Text to be synthesized.', + required=True) + self.parser.add_argument( + '--spk_id', type=int, default=0, help='Speaker id') + self.parser.add_argument( + '--speed', + type=float, + default=1.0, + help='Audio speed, the value should be set between 0 and 3') + self.parser.add_argument( + '--volume', + type=float, + default=1.0, + help='Audio volume, the value should be set between 0 and 3') + self.parser.add_argument( + '--sample_rate', + type=int, + default=0, + choices=[0, 8000, 16000], + help='Sampling rate, the default is the same as the model') + self.parser.add_argument( + '--output', type=str, default=None, help='Synthesized audio file') + + def postprocess(self, wav_base64: str, outfile: str) -> float: + audio_data_byte = base64.b64decode(wav_base64) + # from byte + samples, sample_rate = soundfile.read( + io.BytesIO(audio_data_byte), dtype='float32') + + # transform audio + if outfile.endswith(".wav"): + soundfile.write(outfile, samples, sample_rate) + elif outfile.endswith(".pcm"): + temp_wav = str(random.getrandbits(128)) + ".wav" + soundfile.write(temp_wav, samples, sample_rate) + wav2pcm(temp_wav, outfile, data_type=np.int16) + os.system("rm %s" % (temp_wav)) + else: + logger.error("The format for saving audio only supports wav or pcm") + + def execute(self, argv: List[str]) -> bool: + args = self.parser.parse_args(argv) + input_ = args.input + server_ip = args.server_ip + port = args.port + spk_id = args.spk_id + speed = args.speed + volume = args.volume + sample_rate = args.sample_rate + output = args.output + + try: + time_start = time.time() + res = self( + input=input_, + server_ip=server_ip, + port=port, + spk_id=spk_id, + speed=speed, + volume=volume, + sample_rate=sample_rate, + output=output) + time_end = time.time() + time_consume = time_end - time_start + response_dict = res.json() + logger.info(response_dict["message"]) + logger.info("Save synthesized audio successfully on %s." % (output)) + logger.info("Audio duration: %f s." % + (response_dict['result']['duration'])) + logger.info("Response time: %f s." % (time_consume)) + return True + except Exception as e: + logger.error("Failed to synthesized audio.") + return False + + @stats_wrapper + def __call__(self, + input: str, + server_ip: str="127.0.0.1", + port: int=8090, + spk_id: int=0, + speed: float=1.0, + volume: float=1.0, + sample_rate: int=0, + output: str=None): + """ + Python API to call an executor. 
+ """ + + url = 'http://' + server_ip + ":" + str(port) + '/paddlespeech/tts' + request = { + "text": input, + "spk_id": spk_id, + "speed": speed, + "volume": volume, + "sample_rate": sample_rate, + "save_path": output + } + + res = requests.post(url, json.dumps(request)) + response_dict = res.json() + if output is not None: + self.postprocess(response_dict["result"]["audio"], output) + return res + + +@cli_client_register( + name='paddlespeech_client.asr', description='visit asr service') +class ASRClientExecutor(BaseExecutor): + def __init__(self): + super(ASRClientExecutor, self).__init__() + self.parser = argparse.ArgumentParser( + prog='paddlespeech_client.asr', add_help=True) + self.parser.add_argument( + '--server_ip', type=str, default='127.0.0.1', help='server ip') + self.parser.add_argument( + '--port', type=int, default=8090, help='server port') + self.parser.add_argument( + '--input', + type=str, + default=None, + help='Audio file to be recognized', + required=True) + self.parser.add_argument( + '--sample_rate', type=int, default=16000, help='audio sample rate') + self.parser.add_argument( + '--lang', type=str, default="zh_cn", help='language') + self.parser.add_argument( + '--audio_format', type=str, default="wav", help='audio format') + + def execute(self, argv: List[str]) -> bool: + args = self.parser.parse_args(argv) + input_ = args.input + server_ip = args.server_ip + port = args.port + sample_rate = args.sample_rate + lang = args.lang + audio_format = args.audio_format + + try: + time_start = time.time() + res = self( + input=input_, + server_ip=server_ip, + port=port, + sample_rate=sample_rate, + lang=lang, + audio_format=audio_format) + time_end = time.time() + logger.info(res.json()) + logger.info("Response time %f s." % (time_end - time_start)) + return True + except Exception as e: + logger.error("Failed to speech recognition.") + return False + + @stats_wrapper + def __call__(self, + input: str, + server_ip: str="127.0.0.1", + port: int=8090, + sample_rate: int=16000, + lang: str="zh_cn", + audio_format: str="wav"): + """ + Python API to call an executor. + """ + + url = 'http://' + server_ip + ":" + str(port) + '/paddlespeech/asr' + audio = wav2base64(input) + data = { + "audio": audio, + "audio_format": audio_format, + "sample_rate": sample_rate, + "lang": lang, + } + + res = requests.post(url=url, data=json.dumps(data)) + return res + + +@cli_client_register( + name='paddlespeech_client.cls', description='visit cls service') +class CLSClientExecutor(BaseExecutor): + def __init__(self): + super(CLSClientExecutor, self).__init__() + self.parser = argparse.ArgumentParser( + prog='paddlespeech_client.cls', add_help=True) + self.parser.add_argument( + '--server_ip', type=str, default='127.0.0.1', help='server ip') + self.parser.add_argument( + '--port', type=int, default=8090, help='server port') + self.parser.add_argument( + '--input', + type=str, + default=None, + help='Audio file to classify.', + required=True) + self.parser.add_argument( + '--topk', + type=int, + default=1, + help='Return topk scores of classification result.') + + def execute(self, argv: List[str]) -> bool: + args = self.parser.parse_args(argv) + input_ = args.input + server_ip = args.server_ip + port = args.port + topk = args.topk + + try: + time_start = time.time() + res = self(input=input_, server_ip=server_ip, port=port, topk=topk) + time_end = time.time() + logger.info(res.json()) + logger.info("Response time %f s." 
% (time_end - time_start)) + return True + except Exception as e: + logger.error("Failed to speech classification.") + return False + + @stats_wrapper + def __call__(self, + input: str, + server_ip: str="127.0.0.1", + port: int=8090, + topk: int=1): + """ + Python API to call an executor. + """ + + url = 'http://' + server_ip + ":" + str(port) + '/paddlespeech/cls' + audio = wav2base64(input) + data = {"audio": audio, "topk": topk} + + res = requests.post(url=url, data=json.dumps(data)) + return res diff --git a/ernie-sat/paddlespeech/server/bin/paddlespeech_server.py b/ernie-sat/paddlespeech/server/bin/paddlespeech_server.py new file mode 100644 index 0000000000000000000000000000000000000000..f6a7f42955753ecb5d717d06073540a3a2fe6789 --- /dev/null +++ b/ernie-sat/paddlespeech/server/bin/paddlespeech_server.py @@ -0,0 +1,198 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse +from typing import List + +import uvicorn +from fastapi import FastAPI +from prettytable import PrettyTable + +from ..executor import BaseExecutor +from ..util import cli_server_register +from ..util import stats_wrapper +from paddlespeech.cli.log import logger +from paddlespeech.server.engine.engine_pool import init_engine_pool +from paddlespeech.server.restful.api import setup_router +from paddlespeech.server.utils.config import get_config + +__all__ = ['ServerExecutor', 'ServerStatsExecutor'] + +app = FastAPI( + title="PaddleSpeech Serving API", description="Api", version="0.0.1") + + +@cli_server_register( + name='paddlespeech_server.start', description='Start the service') +class ServerExecutor(BaseExecutor): + def __init__(self): + super(ServerExecutor, self).__init__() + self.parser = argparse.ArgumentParser( + prog='paddlespeech_server.start', add_help=True) + self.parser.add_argument( + "--config_file", + action="store", + help="yaml file of the app", + default=None, + required=True) + + self.parser.add_argument( + "--log_file", + action="store", + help="log file", + default="./log/paddlespeech.log") + + def init(self, config) -> bool: + """system initialization + + Args: + config (CfgNode): config object + + Returns: + bool: + """ + # init api + api_list = list(engine.split("_")[0] for engine in config.engine_list) + api_router = setup_router(api_list) + app.include_router(api_router) + + if not init_engine_pool(config): + return False + + return True + + def execute(self, argv: List[str]) -> bool: + args = self.parser.parse_args(argv) + config = get_config(args.config_file) + + if self.init(config): + uvicorn.run(app, host=config.host, port=config.port, debug=True) + + @stats_wrapper + def __call__(self, + config_file: str="./conf/application.yaml", + log_file: str="./log/paddlespeech.log"): + """ + Python API to call an executor. 
+ """ + config = get_config(config_file) + if self.init(config): + uvicorn.run(app, host=config.host, port=config.port, debug=True) + + +@cli_server_register( + name='paddlespeech_server.stats', + description='Get the models supported by each speech task in the service.') +class ServerStatsExecutor(): + def __init__(self): + super(ServerStatsExecutor, self).__init__() + + self.parser = argparse.ArgumentParser( + prog='paddlespeech_server.stats', add_help=True) + self.parser.add_argument( + '--task', + type=str, + default=None, + choices=['asr', 'tts', 'cls'], + help='Choose speech task.', + required=True) + self.task_choices = ['asr', 'tts', 'cls'] + self.model_name_format = { + 'asr': 'Model-Language-Sample Rate', + 'tts': 'Model-Language', + 'cls': 'Model-Sample Rate' + } + + def show_support_models(self, pretrained_models: dict): + fields = self.model_name_format[self.task].split("-") + table = PrettyTable(fields) + for key in pretrained_models: + table.add_row(key.split("-")) + print(table) + + def execute(self, argv: List[str]) -> bool: + """ + Command line entry. + """ + parser_args = self.parser.parse_args(argv) + self.task = parser_args.task + if self.task not in self.task_choices: + logger.error( + "Please input correct speech task, choices = ['asr', 'tts']") + return False + + elif self.task == 'asr': + try: + from paddlespeech.cli.asr.infer import pretrained_models + logger.info( + "Here is the table of ASR pretrained models supported in the service." + ) + self.show_support_models(pretrained_models) + + # show ASR static pretrained model + from paddlespeech.server.engine.asr.paddleinference.asr_engine import pretrained_models + logger.info( + "Here is the table of ASR static pretrained models supported in the service." + ) + self.show_support_models(pretrained_models) + + return True + except BaseException: + logger.error( + "Failed to get the table of ASR pretrained models supported in the service." + ) + return False + + elif self.task == 'tts': + try: + from paddlespeech.cli.tts.infer import pretrained_models + logger.info( + "Here is the table of TTS pretrained models supported in the service." + ) + self.show_support_models(pretrained_models) + + # show TTS static pretrained model + from paddlespeech.server.engine.tts.paddleinference.tts_engine import pretrained_models + logger.info( + "Here is the table of TTS static pretrained models supported in the service." + ) + self.show_support_models(pretrained_models) + + return True + except BaseException: + logger.error( + "Failed to get the table of TTS pretrained models supported in the service." + ) + return False + + elif self.task == 'cls': + try: + from paddlespeech.cli.cls.infer import pretrained_models + logger.info( + "Here is the table of CLS pretrained models supported in the service." + ) + self.show_support_models(pretrained_models) + + # show CLS static pretrained model + from paddlespeech.server.engine.cls.paddleinference.cls_engine import pretrained_models + logger.info( + "Here is the table of CLS static pretrained models supported in the service." + ) + self.show_support_models(pretrained_models) + + return True + except BaseException: + logger.error( + "Failed to get the table of CLS pretrained models supported in the service." 
+ ) + return False diff --git a/ernie-sat/paddlespeech/server/conf/application.yaml b/ernie-sat/paddlespeech/server/conf/application.yaml new file mode 100644 index 0000000000000000000000000000000000000000..849349c2df371a58f754d1fa881ba524ac7df5d7 --- /dev/null +++ b/ernie-sat/paddlespeech/server/conf/application.yaml @@ -0,0 +1,157 @@ +# This is the parameter configuration file for PaddleSpeech Serving. + +################################################################################# +# SERVER SETTING # +################################################################################# +host: 127.0.0.1 +port: 8090 + +# The task format in the engin_list is: _ +# task choices = ['asr_python', 'asr_inference', 'tts_python', 'tts_inference'] +# protocol = ['websocket', 'http'] (only one can be selected). +# http only support offline engine type. +protocol: 'http' +engine_list: ['asr_python', 'tts_python', 'cls_python'] + + +################################################################################# +# ENGINE CONFIG # +################################################################################# + +################################### ASR ######################################### +################### speech task: asr; engine_type: python ####################### +asr_python: + model: 'conformer_wenetspeech' + lang: 'zh' + sample_rate: 16000 + cfg_path: # [optional] + ckpt_path: # [optional] + decode_method: 'attention_rescoring' + force_yes: True + device: # set 'gpu:id' or 'cpu' + + +################### speech task: asr; engine_type: inference ####################### +asr_inference: + # model_type choices=['deepspeech2offline_aishell'] + model_type: 'deepspeech2offline_aishell' + am_model: # the pdmodel file of am static model [optional] + am_params: # the pdiparams file of am static model [optional] + lang: 'zh' + sample_rate: 16000 + cfg_path: + decode_method: + force_yes: True + + am_predictor_conf: + device: # set 'gpu:id' or 'cpu' + switch_ir_optim: True + glog_info: False # True -> print glog + summary: True # False -> do not show predictor config + + +################### speech task: asr; engine_type: online ####################### +asr_online: + model_type: 'deepspeech2online_aishell' + am_model: # the pdmodel file of am static model [optional] + am_params: # the pdiparams file of am static model [optional] + lang: 'zh' + sample_rate: 16000 + cfg_path: + decode_method: + force_yes: True + + am_predictor_conf: + device: # set 'gpu:id' or 'cpu' + switch_ir_optim: True + glog_info: False # True -> print glog + summary: True # False -> do not show predictor config + + +################################### TTS ######################################### +################### speech task: tts; engine_type: python ####################### +tts_python: + # am (acoustic model) choices=['speedyspeech_csmsc', 'fastspeech2_csmsc', + # 'fastspeech2_ljspeech', 'fastspeech2_aishell3', + # 'fastspeech2_vctk'] + am: 'fastspeech2_csmsc' + am_config: + am_ckpt: + am_stat: + phones_dict: + tones_dict: + speaker_dict: + spk_id: 0 + + # voc (vocoder) choices=['pwgan_csmsc', 'pwgan_ljspeech', 'pwgan_aishell3', + # 'pwgan_vctk', 'mb_melgan_csmsc'] + voc: 'pwgan_csmsc' + voc_config: + voc_ckpt: + voc_stat: + + # others + lang: 'zh' + device: # set 'gpu:id' or 'cpu' + + +################### speech task: tts; engine_type: inference ####################### +tts_inference: + # am (acoustic model) choices=['speedyspeech_csmsc', 'fastspeech2_csmsc'] + am: 'fastspeech2_csmsc' + am_model: # the pdmodel file of 
your am static model (XX.pdmodel) + am_params: # the pdiparams file of your am static model (XX.pdipparams) + am_sample_rate: 24000 + phones_dict: + tones_dict: + speaker_dict: + spk_id: 0 + + am_predictor_conf: + device: # set 'gpu:id' or 'cpu' + switch_ir_optim: True + glog_info: False # True -> print glog + summary: True # False -> do not show predictor config + + # voc (vocoder) choices=['pwgan_csmsc', 'mb_melgan_csmsc','hifigan_csmsc'] + voc: 'pwgan_csmsc' + voc_model: # the pdmodel file of your vocoder static model (XX.pdmodel) + voc_params: # the pdiparams file of your vocoder static model (XX.pdipparams) + voc_sample_rate: 24000 + + voc_predictor_conf: + device: # set 'gpu:id' or 'cpu' + switch_ir_optim: True + glog_info: False # True -> print glog + summary: True # False -> do not show predictor config + + # others + lang: 'zh' + + +################################### CLS ######################################### +################### speech task: cls; engine_type: python ####################### +cls_python: + # model choices=['panns_cnn14', 'panns_cnn10', 'panns_cnn6'] + model: 'panns_cnn14' + cfg_path: # [optional] Config of cls task. + ckpt_path: # [optional] Checkpoint file of model. + label_file: # [optional] Label file of cls task. + device: # set 'gpu:id' or 'cpu' + + +################### speech task: cls; engine_type: inference ####################### +cls_inference: + # model_type choices=['panns_cnn14', 'panns_cnn10', 'panns_cnn6'] + model_type: 'panns_cnn14' + cfg_path: + model_path: # the pdmodel file of am static model [optional] + params_path: # the pdiparams file of am static model [optional] + label_file: # [optional] Label file of cls task. + + predictor_conf: + device: # set 'gpu:id' or 'cpu' + switch_ir_optim: True + glog_info: False # True -> print glog + summary: True # False -> do not show predictor config + diff --git a/ernie-sat/paddlespeech/server/conf/ws_application.yaml b/ernie-sat/paddlespeech/server/conf/ws_application.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ef23593ed896ebfc11d2bf3645f4e515b1e90df4 --- /dev/null +++ b/ernie-sat/paddlespeech/server/conf/ws_application.yaml @@ -0,0 +1,51 @@ +# This is the parameter configuration file for PaddleSpeech Serving. + +################################################################################# +# SERVER SETTING # +################################################################################# +host: 0.0.0.0 +port: 8091 + +# The task format in the engin_list is: _ +# task choices = ['asr_online', 'tts_online'] +# protocol = ['websocket', 'http'] (only one can be selected). +# websocket only support online engine type. 
+protocol: 'websocket' +engine_list: ['asr_online'] + + +################################################################################# +# ENGINE CONFIG # +################################################################################# + +################################### ASR ######################################### +################### speech task: asr; engine_type: online ####################### +asr_online: + model_type: 'deepspeech2online_aishell' + am_model: # the pdmodel file of am static model [optional] + am_params: # the pdiparams file of am static model [optional] + lang: 'zh' + sample_rate: 16000 + cfg_path: + decode_method: + force_yes: True + + am_predictor_conf: + device: # set 'gpu:id' or 'cpu' + switch_ir_optim: True + glog_info: False # True -> print glog + summary: True # False -> do not show predictor config + + chunk_buffer_conf: + frame_duration_ms: 80 + shift_ms: 40 + sample_rate: 16000 + sample_width: 2 + + vad_conf: + aggressiveness: 2 + sample_rate: 16000 + frame_duration_ms: 20 + sample_width: 2 + padding_ms: 200 + padding_ratio: 0.9 diff --git a/ernie-sat/paddlespeech/server/download.py b/ernie-sat/paddlespeech/server/download.py new file mode 100644 index 0000000000000000000000000000000000000000..ea943dd8745c17cacdb0575a8552ba1a75ab4a7c --- /dev/null +++ b/ernie-sat/paddlespeech/server/download.py @@ -0,0 +1,329 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import hashlib +import os +import os.path as osp +import shutil +import subprocess +import tarfile +import time +import zipfile + +import requests +from tqdm import tqdm + +from paddlespeech.cli.log import logger + +__all__ = ['get_path_from_url'] + +DOWNLOAD_RETRY_LIMIT = 3 + + +def _is_url(path): + """ + Whether path is URL. + Args: + path (string): URL string or not. + """ + return path.startswith('http://') or path.startswith('https://') + + +def _map_path(url, root_dir): + # parse path after download under root_dir + fname = osp.split(url)[-1] + fpath = fname + return osp.join(root_dir, fpath) + + +def _get_unique_endpoints(trainer_endpoints): + # Sorting is to avoid different environmental variables for each card + trainer_endpoints.sort() + ips = set() + unique_endpoints = set() + for endpoint in trainer_endpoints: + ip = endpoint.split(":")[0] + if ip in ips: + continue + ips.add(ip) + unique_endpoints.add(endpoint) + logger.info("unique_endpoints {}".format(unique_endpoints)) + return unique_endpoints + + +def get_path_from_url(url, + root_dir, + md5sum=None, + check_exist=True, + decompress=True, + method='get'): + """ Download from given url to root_dir. + if file or directory specified by url is exists under + root_dir, return the path directly, otherwise download + from url and decompress it, return the path. 
+ Args: + url (str): download url + root_dir (str): root dir for downloading, it should be + WEIGHTS_HOME or DATASET_HOME + md5sum (str): md5 sum of download package + decompress (bool): decompress zip or tar file. Default is `True` + method (str): which download method to use. Support `wget` and `get`. Default is `get`. + Returns: + str: a local path to save downloaded models & weights & datasets. + """ + + from paddle.fluid.dygraph.parallel import ParallelEnv + + assert _is_url(url), "downloading from {} not a url".format(url) + # parse path after download to decompress under root_dir + fullpath = _map_path(url, root_dir) + # Mainly used to solve the problem of downloading data from different + # machines in the case of multiple machines. Different ips will download + # data, and the same ip will only download data once. + unique_endpoints = _get_unique_endpoints(ParallelEnv().trainer_endpoints[:]) + if osp.exists(fullpath) and check_exist and _md5check(fullpath, md5sum): + logger.info("Found {}".format(fullpath)) + else: + if ParallelEnv().current_endpoint in unique_endpoints: + fullpath = _download(url, root_dir, md5sum, method=method) + else: + while not os.path.exists(fullpath): + time.sleep(1) + + if ParallelEnv().current_endpoint in unique_endpoints: + if decompress and (tarfile.is_tarfile(fullpath) or + zipfile.is_zipfile(fullpath)): + fullpath = _decompress(fullpath) + + return fullpath + + +def _get_download(url, fullname): + # using requests.get method + fname = osp.basename(fullname) + try: + req = requests.get(url, stream=True) + except Exception as e: # requests.exceptions.ConnectionError + logger.info("Downloading {} from {} failed with exception {}".format( + fname, url, str(e))) + return False + + if req.status_code != 200: + raise RuntimeError("Downloading from {} failed with code " + "{}!".format(url, req.status_code)) + + # For protecting download interupted, download to + # tmp_fullname firstly, move tmp_fullname to fullname + # after download finished + tmp_fullname = fullname + "_tmp" + total_size = req.headers.get('content-length') + with open(tmp_fullname, 'wb') as f: + if total_size: + with tqdm(total=(int(total_size) + 1023) // 1024) as pbar: + for chunk in req.iter_content(chunk_size=1024): + f.write(chunk) + pbar.update(1) + else: + for chunk in req.iter_content(chunk_size=1024): + if chunk: + f.write(chunk) + shutil.move(tmp_fullname, fullname) + + return fullname + + +def _wget_download(url, fullname): + # using wget to download url + tmp_fullname = fullname + "_tmp" + # –user-agent + command = 'wget -O {} -t {} {}'.format(tmp_fullname, DOWNLOAD_RETRY_LIMIT, + url) + subprc = subprocess.Popen( + command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + _ = subprc.communicate() + + if subprc.returncode != 0: + raise RuntimeError( + '{} failed. Please make sure `wget` is installed or {} exists'. + format(command, url)) + + shutil.move(tmp_fullname, fullname) + + return fullname + + +_download_methods = { + 'get': _get_download, + 'wget': _wget_download, +} + + +def _download(url, path, md5sum=None, method='get'): + """ + Download from url, save to path. + url (str): download url + path (str): download to given path + md5sum (str): md5 sum of download package + method (str): which download method to use. Support `wget` and `get`. Default is `get`. 
+ """ + assert method in _download_methods, 'make sure `{}` implemented'.format( + method) + + if not osp.exists(path): + os.makedirs(path) + + fname = osp.split(url)[-1] + fullname = osp.join(path, fname) + retry_cnt = 0 + + logger.info("Downloading {} from {}".format(fname, url)) + while not (osp.exists(fullname) and _md5check(fullname, md5sum)): + if retry_cnt < DOWNLOAD_RETRY_LIMIT: + retry_cnt += 1 + else: + raise RuntimeError("Download from {} failed. " + "Retry limit reached".format(url)) + + if not _download_methods[method](url, fullname): + time.sleep(1) + continue + + return fullname + + +def _md5check(fullname, md5sum=None): + if md5sum is None: + return True + + logger.info("File {} md5 checking...".format(fullname)) + md5 = hashlib.md5() + with open(fullname, 'rb') as f: + for chunk in iter(lambda: f.read(4096), b""): + md5.update(chunk) + calc_md5sum = md5.hexdigest() + + if calc_md5sum != md5sum: + logger.info("File {} md5 check failed, {}(calc) != " + "{}(base)".format(fullname, calc_md5sum, md5sum)) + return False + return True + + +def _decompress(fname): + """ + Decompress for zip and tar file + """ + logger.info("Decompressing {}...".format(fname)) + + # For protecting decompressing interupted, + # decompress to fpath_tmp directory firstly, if decompress + # successed, move decompress files to fpath and delete + # fpath_tmp and remove download compress file. + + if tarfile.is_tarfile(fname): + uncompressed_path = _uncompress_file_tar(fname) + elif zipfile.is_zipfile(fname): + uncompressed_path = _uncompress_file_zip(fname) + else: + raise TypeError("Unsupport compress file type {}".format(fname)) + + return uncompressed_path + + +def _uncompress_file_zip(filepath): + files = zipfile.ZipFile(filepath, 'r') + file_list = files.namelist() + + file_dir = os.path.dirname(filepath) + + if _is_a_single_file(file_list): + rootpath = file_list[0] + uncompressed_path = os.path.join(file_dir, rootpath) + + for item in file_list: + files.extract(item, file_dir) + + elif _is_a_single_dir(file_list): + rootpath = os.path.splitext(file_list[0])[0].split(os.sep)[0] + uncompressed_path = os.path.join(file_dir, rootpath) + + for item in file_list: + files.extract(item, file_dir) + + else: + rootpath = os.path.splitext(filepath)[0].split(os.sep)[-1] + uncompressed_path = os.path.join(file_dir, rootpath) + if not os.path.exists(uncompressed_path): + os.makedirs(uncompressed_path) + for item in file_list: + files.extract(item, os.path.join(file_dir, rootpath)) + + files.close() + + return uncompressed_path + + +def _uncompress_file_tar(filepath, mode="r:*"): + files = tarfile.open(filepath, mode) + file_list = files.getnames() + + file_dir = os.path.dirname(filepath) + + if _is_a_single_file(file_list): + rootpath = file_list[0] + uncompressed_path = os.path.join(file_dir, rootpath) + for item in file_list: + files.extract(item, file_dir) + elif _is_a_single_dir(file_list): + rootpath = os.path.splitext(file_list[0])[0].split(os.sep)[-1] + uncompressed_path = os.path.join(file_dir, rootpath) + for item in file_list: + files.extract(item, file_dir) + else: + rootpath = os.path.splitext(filepath)[0].split(os.sep)[-1] + uncompressed_path = os.path.join(file_dir, rootpath) + if not os.path.exists(uncompressed_path): + os.makedirs(uncompressed_path) + + for item in file_list: + files.extract(item, os.path.join(file_dir, rootpath)) + + files.close() + + return uncompressed_path + + +def _is_a_single_file(file_list): + if len(file_list) == 1 and file_list[0].find(os.sep) < -1: + return True + 
return False + + +def _is_a_single_dir(file_list): + new_file_list = [] + for file_path in file_list: + if '/' in file_path: + file_path = file_path.replace('/', os.sep) + elif '\\' in file_path: + file_path = file_path.replace('\\', os.sep) + new_file_list.append(file_path) + + file_name = new_file_list[0].split(os.sep)[0] + for i in range(1, len(new_file_list)): + if file_name != new_file_list[i].split(os.sep)[0]: + return False + return True diff --git a/ernie-sat/paddlespeech/server/engine/__init__.py b/ernie-sat/paddlespeech/server/engine/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..97043fd7ba6885aac81cad5a49924c23c67d4d47 --- /dev/null +++ b/ernie-sat/paddlespeech/server/engine/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/ernie-sat/paddlespeech/server/engine/asr/__init__.py b/ernie-sat/paddlespeech/server/engine/asr/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..97043fd7ba6885aac81cad5a49924c23c67d4d47 --- /dev/null +++ b/ernie-sat/paddlespeech/server/engine/asr/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/ernie-sat/paddlespeech/server/engine/asr/online/__init__.py b/ernie-sat/paddlespeech/server/engine/asr/online/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..97043fd7ba6885aac81cad5a49924c23c67d4d47 --- /dev/null +++ b/ernie-sat/paddlespeech/server/engine/asr/online/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
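As a quick reference for the download helper above: `get_path_from_url` is the module's only public symbol (`__all__`). It downloads a package into `root_dir`, optionally verifies its md5 sum, unpacks tar/zip archives, and returns the local path. Below is a minimal sketch of a call site; the URL is a placeholder, not a real release artifact.

```python
from paddlespeech.server.download import get_path_from_url

# Placeholder URL: substitute a real model package for this repo.
MODEL_URL = "https://example.com/models/asr0_deepspeech2_online_aishell.tar.gz"

# Downloads into ./download, skips the md5 check (no checksum supplied),
# unpacks the archive, and returns the local extraction path.
local_path = get_path_from_url(
    MODEL_URL,
    root_dir="./download",
    md5sum=None,
    decompress=True,
    method="get")  # "wget" is also supported if the wget binary is installed
print(local_path)
```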
diff --git a/ernie-sat/paddlespeech/server/engine/asr/online/asr_engine.py b/ernie-sat/paddlespeech/server/engine/asr/online/asr_engine.py new file mode 100644 index 0000000000000000000000000000000000000000..9029aa6e9e45a24f06c6a806bff0c82dd1e84d95 --- /dev/null +++ b/ernie-sat/paddlespeech/server/engine/asr/online/asr_engine.py @@ -0,0 +1,352 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +from typing import Optional + +import numpy as np +import paddle +from numpy import float32 +from yacs.config import CfgNode + +from paddlespeech.cli.asr.infer import ASRExecutor +from paddlespeech.cli.log import logger +from paddlespeech.cli.utils import MODEL_HOME +from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer +from paddlespeech.s2t.frontend.speech import SpeechSegment +from paddlespeech.s2t.modules.ctc import CTCDecoder +from paddlespeech.s2t.utils.utility import UpdateConfig +from paddlespeech.server.engine.base_engine import BaseEngine +from paddlespeech.server.utils.paddle_predictor import init_predictor + +__all__ = ['ASREngine'] + +pretrained_models = { + "deepspeech2online_aishell-zh-16k": { + 'url': + 'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_ckpt_0.1.1.model.tar.gz', + 'md5': + '23e16c69730a1cb5d735c98c83c21e16', + 'cfg_path': + 'model.yaml', + 'ckpt_path': + 'exp/deepspeech2_online/checkpoints/avg_1', + 'model': + 'exp/deepspeech2_online/checkpoints/avg_1.jit.pdmodel', + 'params': + 'exp/deepspeech2_online/checkpoints/avg_1.jit.pdiparams', + 'lm_url': + 'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm', + 'lm_md5': + '29e02312deb2e59b3c8686c7966d4fe3' + }, +} + + +class ASRServerExecutor(ASRExecutor): + def __init__(self): + super().__init__() + pass + + def _init_from_path(self, + model_type: str='wenetspeech', + am_model: Optional[os.PathLike]=None, + am_params: Optional[os.PathLike]=None, + lang: str='zh', + sample_rate: int=16000, + cfg_path: Optional[os.PathLike]=None, + decode_method: str='attention_rescoring', + am_predictor_conf: dict=None): + """ + Init model and other resources from a specific path. 
+ """ + + if cfg_path is None or am_model is None or am_params is None: + sample_rate_str = '16k' if sample_rate == 16000 else '8k' + tag = model_type + '-' + lang + '-' + sample_rate_str + res_path = self._get_pretrained_path(tag) # wenetspeech_zh + self.res_path = res_path + self.cfg_path = os.path.join(res_path, + pretrained_models[tag]['cfg_path']) + + self.am_model = os.path.join(res_path, + pretrained_models[tag]['model']) + self.am_params = os.path.join(res_path, + pretrained_models[tag]['params']) + logger.info(res_path) + logger.info(self.cfg_path) + logger.info(self.am_model) + logger.info(self.am_params) + else: + self.cfg_path = os.path.abspath(cfg_path) + self.am_model = os.path.abspath(am_model) + self.am_params = os.path.abspath(am_params) + self.res_path = os.path.dirname( + os.path.dirname(os.path.abspath(self.cfg_path))) + + #Init body. + self.config = CfgNode(new_allowed=True) + self.config.merge_from_file(self.cfg_path) + + with UpdateConfig(self.config): + if "deepspeech2online" in model_type or "deepspeech2offline" in model_type: + from paddlespeech.s2t.io.collator import SpeechCollator + self.vocab = self.config.vocab_filepath + self.config.decode.lang_model_path = os.path.join( + MODEL_HOME, 'language_model', + self.config.decode.lang_model_path) + self.collate_fn_test = SpeechCollator.from_config(self.config) + self.text_feature = TextFeaturizer( + unit_type=self.config.unit_type, vocab=self.vocab) + + lm_url = pretrained_models[tag]['lm_url'] + lm_md5 = pretrained_models[tag]['lm_md5'] + self.download_lm( + lm_url, + os.path.dirname(self.config.decode.lang_model_path), lm_md5) + elif "conformer" in model_type or "transformer" in model_type or "wenetspeech" in model_type: + raise Exception("wrong type") + else: + raise Exception("wrong type") + + # AM predictor + self.am_predictor_conf = am_predictor_conf + self.am_predictor = init_predictor( + model_file=self.am_model, + params_file=self.am_params, + predictor_conf=self.am_predictor_conf) + + # decoder + self.decoder = CTCDecoder( + odim=self.config.output_dim, # is in vocab + enc_n_units=self.config.rnn_layer_size * 2, + blank_id=self.config.blank_id, + dropout_rate=0.0, + reduction=True, # sum + batch_average=True, # sum / batch_size + grad_norm_type=self.config.get('ctc_grad_norm_type', None)) + + # init decoder + cfg = self.config.decode + decode_batch_size = 1 # for online + self.decoder.init_decoder( + decode_batch_size, self.text_feature.vocab_list, + cfg.decoding_method, cfg.lang_model_path, cfg.alpha, cfg.beta, + cfg.beam_size, cfg.cutoff_prob, cfg.cutoff_top_n, + cfg.num_proc_bsearch) + + # init state box + self.chunk_state_h_box = np.zeros( + (self.config.num_rnn_layers, 1, self.config.rnn_layer_size), + dtype=float32) + self.chunk_state_c_box = np.zeros( + (self.config.num_rnn_layers, 1, self.config.rnn_layer_size), + dtype=float32) + + def reset_decoder_and_chunk(self): + """reset decoder and chunk state for an new audio + """ + self.decoder.reset_decoder(batch_size=1) + # init state box, for new audio request + self.chunk_state_h_box = np.zeros( + (self.config.num_rnn_layers, 1, self.config.rnn_layer_size), + dtype=float32) + self.chunk_state_c_box = np.zeros( + (self.config.num_rnn_layers, 1, self.config.rnn_layer_size), + dtype=float32) + + def decode_one_chunk(self, x_chunk, x_chunk_lens, model_type: str): + """decode one chunk + + Args: + x_chunk (numpy.array): shape[B, T, D] + x_chunk_lens (numpy.array): shape[B] + model_type (str): online model type + + Returns: + [type]: [description] + """ + 
if "deepspeech2online" in model_type: + input_names = self.am_predictor.get_input_names() + audio_handle = self.am_predictor.get_input_handle(input_names[0]) + audio_len_handle = self.am_predictor.get_input_handle( + input_names[1]) + h_box_handle = self.am_predictor.get_input_handle(input_names[2]) + c_box_handle = self.am_predictor.get_input_handle(input_names[3]) + + audio_handle.reshape(x_chunk.shape) + audio_handle.copy_from_cpu(x_chunk) + + audio_len_handle.reshape(x_chunk_lens.shape) + audio_len_handle.copy_from_cpu(x_chunk_lens) + + h_box_handle.reshape(self.chunk_state_h_box.shape) + h_box_handle.copy_from_cpu(self.chunk_state_h_box) + + c_box_handle.reshape(self.chunk_state_c_box.shape) + c_box_handle.copy_from_cpu(self.chunk_state_c_box) + + output_names = self.am_predictor.get_output_names() + output_handle = self.am_predictor.get_output_handle(output_names[0]) + output_lens_handle = self.am_predictor.get_output_handle( + output_names[1]) + output_state_h_handle = self.am_predictor.get_output_handle( + output_names[2]) + output_state_c_handle = self.am_predictor.get_output_handle( + output_names[3]) + + self.am_predictor.run() + + output_chunk_probs = output_handle.copy_to_cpu() + output_chunk_lens = output_lens_handle.copy_to_cpu() + self.chunk_state_h_box = output_state_h_handle.copy_to_cpu() + self.chunk_state_c_box = output_state_c_handle.copy_to_cpu() + + self.decoder.next(output_chunk_probs, output_chunk_lens) + trans_best, trans_beam = self.decoder.decode() + + return trans_best[0] + + elif "conformer" in model_type or "transformer" in model_type: + raise Exception("invalid model name") + else: + raise Exception("invalid model name") + + def _pcm16to32(self, audio): + """pcm int16 to float32 + + Args: + audio(numpy.array): numpy.int16 + + Returns: + audio(numpy.array): numpy.float32 + """ + if audio.dtype == np.int16: + audio = audio.astype("float32") + bits = np.iinfo(np.int16).bits + audio = audio / (2**(bits - 1)) + return audio + + def extract_feat(self, samples, sample_rate): + """extract feat + + Args: + samples (numpy.array): numpy.float32 + sample_rate (int): sample rate + + Returns: + x_chunk (numpy.array): shape[B, T, D] + x_chunk_lens (numpy.array): shape[B] + """ + # pcm16 -> pcm 32 + samples = self._pcm16to32(samples) + + # read audio + speech_segment = SpeechSegment.from_pcm( + samples, sample_rate, transcript=" ") + # audio augment + self.collate_fn_test.augmentation.transform_audio(speech_segment) + + # extract speech feature + spectrum, transcript_part = self.collate_fn_test._speech_featurizer.featurize( + speech_segment, self.collate_fn_test.keep_transcription_text) + # CMVN spectrum + if self.collate_fn_test._normalizer: + spectrum = self.collate_fn_test._normalizer.apply(spectrum) + + # spectrum augment + audio = self.collate_fn_test.augmentation.transform_feature(spectrum) + + audio_len = audio.shape[0] + audio = paddle.to_tensor(audio, dtype='float32') + # audio_len = paddle.to_tensor(audio_len) + audio = paddle.unsqueeze(audio, axis=0) + + x_chunk = audio.numpy() + x_chunk_lens = np.array([audio_len]) + + return x_chunk, x_chunk_lens + + +class ASREngine(BaseEngine): + """ASR server engine + + Args: + metaclass: Defaults to Singleton. 
+ """ + + def __init__(self): + super(ASREngine, self).__init__() + + def init(self, config: dict) -> bool: + """init engine resource + + Args: + config_file (str): config file + + Returns: + bool: init failed or success + """ + self.input = None + self.output = "" + self.executor = ASRServerExecutor() + self.config = config + + self.executor._init_from_path( + model_type=self.config.model_type, + am_model=self.config.am_model, + am_params=self.config.am_params, + lang=self.config.lang, + sample_rate=self.config.sample_rate, + cfg_path=self.config.cfg_path, + decode_method=self.config.decode_method, + am_predictor_conf=self.config.am_predictor_conf) + + logger.info("Initialize ASR server engine successfully.") + return True + + def preprocess(self, samples, sample_rate): + """preprocess + + Args: + samples (numpy.array): numpy.float32 + sample_rate (int): sample rate + + Returns: + x_chunk (numpy.array): shape[B, T, D] + x_chunk_lens (numpy.array): shape[B] + """ + x_chunk, x_chunk_lens = self.executor.extract_feat(samples, sample_rate) + return x_chunk, x_chunk_lens + + def run(self, x_chunk, x_chunk_lens, decoder_chunk_size=1): + """run online engine + + Args: + x_chunk (numpy.array): shape[B, T, D] + x_chunk_lens (numpy.array): shape[B] + decoder_chunk_size(int) + """ + self.output = self.executor.decode_one_chunk(x_chunk, x_chunk_lens, + self.config.model_type) + + def postprocess(self): + """postprocess + """ + return self.output + + def reset(self): + """reset engine decoder and inference state + """ + self.executor.reset_decoder_and_chunk() + self.output = "" diff --git a/ernie-sat/paddlespeech/server/engine/asr/paddleinference/__init__.py b/ernie-sat/paddlespeech/server/engine/asr/paddleinference/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..97043fd7ba6885aac81cad5a49924c23c67d4d47 --- /dev/null +++ b/ernie-sat/paddlespeech/server/engine/asr/paddleinference/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/ernie-sat/paddlespeech/server/engine/asr/paddleinference/asr_engine.py b/ernie-sat/paddlespeech/server/engine/asr/paddleinference/asr_engine.py new file mode 100644 index 0000000000000000000000000000000000000000..1925bf1d623613d073bb028133a348842b591127 --- /dev/null +++ b/ernie-sat/paddlespeech/server/engine/asr/paddleinference/asr_engine.py @@ -0,0 +1,240 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +import io +import os +import time +from typing import Optional + +import paddle +from yacs.config import CfgNode + +from paddlespeech.cli.asr.infer import ASRExecutor +from paddlespeech.cli.log import logger +from paddlespeech.cli.utils import MODEL_HOME +from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer +from paddlespeech.s2t.modules.ctc import CTCDecoder +from paddlespeech.s2t.utils.utility import UpdateConfig +from paddlespeech.server.engine.base_engine import BaseEngine +from paddlespeech.server.utils.paddle_predictor import init_predictor +from paddlespeech.server.utils.paddle_predictor import run_model + +__all__ = ['ASREngine'] + +pretrained_models = { + "deepspeech2offline_aishell-zh-16k": { + 'url': + 'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_aishell_ckpt_0.1.1.model.tar.gz', + 'md5': + '932c3593d62fe5c741b59b31318aa314', + 'cfg_path': + 'model.yaml', + 'ckpt_path': + 'exp/deepspeech2/checkpoints/avg_1', + 'model': + 'exp/deepspeech2/checkpoints/avg_1.jit.pdmodel', + 'params': + 'exp/deepspeech2/checkpoints/avg_1.jit.pdiparams', + 'lm_url': + 'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm', + 'lm_md5': + '29e02312deb2e59b3c8686c7966d4fe3' + }, +} + + +class ASRServerExecutor(ASRExecutor): + def __init__(self): + super().__init__() + pass + + def _init_from_path(self, + model_type: str='wenetspeech', + am_model: Optional[os.PathLike]=None, + am_params: Optional[os.PathLike]=None, + lang: str='zh', + sample_rate: int=16000, + cfg_path: Optional[os.PathLike]=None, + decode_method: str='attention_rescoring', + am_predictor_conf: dict=None): + """ + Init model and other resources from a specific path. + """ + + if cfg_path is None or am_model is None or am_params is None: + sample_rate_str = '16k' if sample_rate == 16000 else '8k' + tag = model_type + '-' + lang + '-' + sample_rate_str + res_path = self._get_pretrained_path(tag) # wenetspeech_zh + self.res_path = res_path + self.cfg_path = os.path.join(res_path, + pretrained_models[tag]['cfg_path']) + + self.am_model = os.path.join(res_path, + pretrained_models[tag]['model']) + self.am_params = os.path.join(res_path, + pretrained_models[tag]['params']) + logger.info(res_path) + logger.info(self.cfg_path) + logger.info(self.am_model) + logger.info(self.am_params) + else: + self.cfg_path = os.path.abspath(cfg_path) + self.am_model = os.path.abspath(am_model) + self.am_params = os.path.abspath(am_params) + self.res_path = os.path.dirname( + os.path.dirname(os.path.abspath(self.cfg_path))) + + #Init body. 
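+ # The remainder of the setup: merge the yaml config, build the speech
+ # collator / text featurizer and fetch the external language model
+ # (DeepSpeech2 models only), then create the Paddle Inference predictor
+ # from the exported *.pdmodel / *.pdiparams files and the CTC decoder
+ # that infer() drives.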
+ self.config = CfgNode(new_allowed=True) + self.config.merge_from_file(self.cfg_path) + + with UpdateConfig(self.config): + if "deepspeech2online" in model_type or "deepspeech2offline" in model_type: + from paddlespeech.s2t.io.collator import SpeechCollator + self.vocab = self.config.vocab_filepath + self.config.decode.lang_model_path = os.path.join( + MODEL_HOME, 'language_model', + self.config.decode.lang_model_path) + self.collate_fn_test = SpeechCollator.from_config(self.config) + self.text_feature = TextFeaturizer( + unit_type=self.config.unit_type, vocab=self.vocab) + + lm_url = pretrained_models[tag]['lm_url'] + lm_md5 = pretrained_models[tag]['lm_md5'] + self.download_lm( + lm_url, + os.path.dirname(self.config.decode.lang_model_path), lm_md5) + elif "conformer" in model_type or "transformer" in model_type or "wenetspeech" in model_type: + raise Exception("wrong type") + else: + raise Exception("wrong type") + + # AM predictor + self.am_predictor_conf = am_predictor_conf + self.am_predictor = init_predictor( + model_file=self.am_model, + params_file=self.am_params, + predictor_conf=self.am_predictor_conf) + + # decoder + self.decoder = CTCDecoder( + odim=self.config.output_dim, # is in vocab + enc_n_units=self.config.rnn_layer_size * 2, + blank_id=self.config.blank_id, + dropout_rate=0.0, + reduction=True, # sum + batch_average=True, # sum / batch_size + grad_norm_type=self.config.get('ctc_grad_norm_type', None)) + + @paddle.no_grad() + def infer(self, model_type: str): + """ + Model inference and result stored in self.output. + """ + cfg = self.config.decode + audio = self._inputs["audio"] + audio_len = self._inputs["audio_len"] + if "deepspeech2online" in model_type or "deepspeech2offline" in model_type: + decode_batch_size = audio.shape[0] + # init once + self.decoder.init_decoder( + decode_batch_size, self.text_feature.vocab_list, + cfg.decoding_method, cfg.lang_model_path, cfg.alpha, cfg.beta, + cfg.beam_size, cfg.cutoff_prob, cfg.cutoff_top_n, + cfg.num_proc_bsearch) + + output_data = run_model(self.am_predictor, + [audio.numpy(), audio_len.numpy()]) + + probs = output_data[0] + eouts_len = output_data[1] + + batch_size = probs.shape[0] + self.decoder.reset_decoder(batch_size=batch_size) + self.decoder.next(probs, eouts_len) + trans_best, trans_beam = self.decoder.decode() + + # self.model.decoder.del_decoder() + self._outputs["result"] = trans_best[0] + + elif "conformer" in model_type or "transformer" in model_type: + raise Exception("invalid model name") + else: + raise Exception("invalid model name") + + +class ASREngine(BaseEngine): + """ASR server engine + + Args: + metaclass: Defaults to Singleton. 
+ """ + + def __init__(self): + super(ASREngine, self).__init__() + + def init(self, config: dict) -> bool: + """init engine resource + + Args: + config_file (str): config file + + Returns: + bool: init failed or success + """ + self.input = None + self.output = None + self.executor = ASRServerExecutor() + self.config = config + + self.executor._init_from_path( + model_type=self.config.model_type, + am_model=self.config.am_model, + am_params=self.config.am_params, + lang=self.config.lang, + sample_rate=self.config.sample_rate, + cfg_path=self.config.cfg_path, + decode_method=self.config.decode_method, + am_predictor_conf=self.config.am_predictor_conf) + + logger.info("Initialize ASR server engine successfully.") + return True + + def run(self, audio_data): + """engine run + + Args: + audio_data (bytes): base64.b64decode + """ + if self.executor._check( + io.BytesIO(audio_data), self.config.sample_rate, + self.config.force_yes): + logger.info("start running asr engine") + self.executor.preprocess(self.config.model_type, + io.BytesIO(audio_data)) + st = time.time() + self.executor.infer(self.config.model_type) + infer_time = time.time() - st + self.output = self.executor.postprocess() # Retrieve result of asr. + logger.info("end inferring asr engine") + else: + logger.info("file check failed!") + self.output = None + + logger.info("inference time: {}".format(infer_time)) + logger.info("asr engine type: paddle inference") + + def postprocess(self): + """postprocess + """ + return self.output diff --git a/ernie-sat/paddlespeech/server/engine/asr/python/__init__.py b/ernie-sat/paddlespeech/server/engine/asr/python/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..97043fd7ba6885aac81cad5a49924c23c67d4d47 --- /dev/null +++ b/ernie-sat/paddlespeech/server/engine/asr/python/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/ernie-sat/paddlespeech/server/engine/asr/python/asr_engine.py b/ernie-sat/paddlespeech/server/engine/asr/python/asr_engine.py new file mode 100644 index 0000000000000000000000000000000000000000..e76c49a79a66be505f239f9f04b5fdd050701fda --- /dev/null +++ b/ernie-sat/paddlespeech/server/engine/asr/python/asr_engine.py @@ -0,0 +1,100 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import io +import time + +import paddle + +from paddlespeech.cli.asr.infer import ASRExecutor +from paddlespeech.cli.log import logger +from paddlespeech.server.engine.base_engine import BaseEngine + +__all__ = ['ASREngine'] + + +class ASRServerExecutor(ASRExecutor): + def __init__(self): + super().__init__() + pass + + +class ASREngine(BaseEngine): + """ASR server engine + + Args: + metaclass: Defaults to Singleton. + """ + + def __init__(self): + super(ASREngine, self).__init__() + + def init(self, config: dict) -> bool: + """init engine resource + + Args: + config_file (str): config file + + Returns: + bool: init failed or success + """ + self.input = None + self.output = None + self.executor = ASRServerExecutor() + self.config = config + try: + if self.config.device: + self.device = self.config.device + else: + self.device = paddle.get_device() + paddle.set_device(self.device) + except BaseException: + logger.error( + "Set device failed, please check if device is already used and the parameter 'device' in the yaml file" + ) + + self.executor._init_from_path( + self.config.model, self.config.lang, self.config.sample_rate, + self.config.cfg_path, self.config.decode_method, + self.config.ckpt_path) + + logger.info("Initialize ASR server engine successfully on device: %s." % + (self.device)) + return True + + def run(self, audio_data): + """engine run + + Args: + audio_data (bytes): base64.b64decode + """ + if self.executor._check( + io.BytesIO(audio_data), self.config.sample_rate, + self.config.force_yes): + logger.info("start run asr engine") + self.executor.preprocess(self.config.model, io.BytesIO(audio_data)) + st = time.time() + self.executor.infer(self.config.model) + infer_time = time.time() - st + self.output = self.executor.postprocess() # Retrieve result of asr. + else: + logger.info("file check failed!") + self.output = None + + logger.info("inference time: {}".format(infer_time)) + logger.info("asr engine type: python") + + def postprocess(self): + """postprocess + """ + return self.output diff --git a/ernie-sat/paddlespeech/server/engine/base_engine.py b/ernie-sat/paddlespeech/server/engine/base_engine.py new file mode 100644 index 0000000000000000000000000000000000000000..0f020d1c783e194f96af84de9326eba25595435c --- /dev/null +++ b/ernie-sat/paddlespeech/server/engine/base_engine.py @@ -0,0 +1,58 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +from typing import Union + +from pattern_singleton import Singleton + +__all__ = ['BaseEngine'] + + +class BaseEngine(metaclass=Singleton): + """ + An base engine class + """ + + def __init__(self): + self._inputs = dict() + self._outputs = dict() + + def init(self, *args, **kwargs): + """ + init the engine + + Returns: + bool: true or false + """ + pass + + def postprocess(self, *args, **kwargs) -> Union[str, os.PathLike]: + """ + Output postprocess and return results. 
+ This method get model output from self._outputs and convert it into human-readable results. + + Returns: + Union[str, os.PathLike]: Human-readable results such as texts and audio files. + """ + pass + + def run(self, *args, **kwargs) -> Union[str, os.PathLike]: + """ + Output postprocess and return results. + This method get model output from self._outputs and convert it into human-readable results. + + Returns: + Union[str, os.PathLike]: Human-readable results such as texts and audio files. + """ + pass diff --git a/ernie-sat/paddlespeech/server/engine/cls/__init__.py b/ernie-sat/paddlespeech/server/engine/cls/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..97043fd7ba6885aac81cad5a49924c23c67d4d47 --- /dev/null +++ b/ernie-sat/paddlespeech/server/engine/cls/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/ernie-sat/paddlespeech/server/engine/cls/paddleinference/__init__.py b/ernie-sat/paddlespeech/server/engine/cls/paddleinference/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..97043fd7ba6885aac81cad5a49924c23c67d4d47 --- /dev/null +++ b/ernie-sat/paddlespeech/server/engine/cls/paddleinference/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/ernie-sat/paddlespeech/server/engine/cls/paddleinference/cls_engine.py b/ernie-sat/paddlespeech/server/engine/cls/paddleinference/cls_engine.py new file mode 100644 index 0000000000000000000000000000000000000000..3982effd902c9d79b7b7684a7bd0268d0e8c1049 --- /dev/null +++ b/ernie-sat/paddlespeech/server/engine/cls/paddleinference/cls_engine.py @@ -0,0 +1,224 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import io +import os +import time +from typing import Optional + +import numpy as np +import paddle +import yaml + +from paddlespeech.cli.cls.infer import CLSExecutor +from paddlespeech.cli.log import logger +from paddlespeech.cli.utils import download_and_decompress +from paddlespeech.cli.utils import MODEL_HOME +from paddlespeech.server.engine.base_engine import BaseEngine +from paddlespeech.server.utils.paddle_predictor import init_predictor +from paddlespeech.server.utils.paddle_predictor import run_model + +__all__ = ['CLSEngine'] + +pretrained_models = { + "panns_cnn6-32k": { + 'url': + 'https://paddlespeech.bj.bcebos.com/cls/inference_model/panns_cnn6_static.tar.gz', + 'md5': + 'da087c31046d23281d8ec5188c1967da', + 'cfg_path': + 'panns.yaml', + 'model_path': + 'inference.pdmodel', + 'params_path': + 'inference.pdiparams', + 'label_file': + 'audioset_labels.txt', + }, + "panns_cnn10-32k": { + 'url': + 'https://paddlespeech.bj.bcebos.com/cls/inference_model/panns_cnn10_static.tar.gz', + 'md5': + '5460cc6eafbfaf0f261cc75b90284ae1', + 'cfg_path': + 'panns.yaml', + 'model_path': + 'inference.pdmodel', + 'params_path': + 'inference.pdiparams', + 'label_file': + 'audioset_labels.txt', + }, + "panns_cnn14-32k": { + 'url': + 'https://paddlespeech.bj.bcebos.com/cls/inference_model/panns_cnn14_static.tar.gz', + 'md5': + 'ccc80b194821274da79466862b2ab00f', + 'cfg_path': + 'panns.yaml', + 'model_path': + 'inference.pdmodel', + 'params_path': + 'inference.pdiparams', + 'label_file': + 'audioset_labels.txt', + }, +} + + +class CLSServerExecutor(CLSExecutor): + def __init__(self): + super().__init__() + pass + + def _get_pretrained_path(self, tag: str) -> os.PathLike: + """ + Download and returns pretrained resources path of current task. + """ + support_models = list(pretrained_models.keys()) + assert tag in pretrained_models, 'The model "{}" you want to use has not been supported, please choose other models.\nThe support models includes:\n\t\t{}\n'.format( + tag, '\n\t\t'.join(support_models)) + + res_path = os.path.join(MODEL_HOME, tag) + decompressed_path = download_and_decompress(pretrained_models[tag], + res_path) + decompressed_path = os.path.abspath(decompressed_path) + logger.info( + 'Use pretrained model stored in: {}'.format(decompressed_path)) + + return decompressed_path + + def _init_from_path( + self, + model_type: str='panns_cnn14', + cfg_path: Optional[os.PathLike]=None, + model_path: Optional[os.PathLike]=None, + params_path: Optional[os.PathLike]=None, + label_file: Optional[os.PathLike]=None, + predictor_conf: dict=None, ): + """ + Init model and other resources from a specific path. 
+ """ + + if cfg_path is None or model_path is None or params_path is None or label_file is None: + tag = model_type + '-' + '32k' + self.res_path = self._get_pretrained_path(tag) + self.cfg_path = os.path.join(self.res_path, + pretrained_models[tag]['cfg_path']) + self.model_path = os.path.join(self.res_path, + pretrained_models[tag]['model_path']) + self.params_path = os.path.join( + self.res_path, pretrained_models[tag]['params_path']) + self.label_file = os.path.join(self.res_path, + pretrained_models[tag]['label_file']) + else: + self.cfg_path = os.path.abspath(cfg_path) + self.model_path = os.path.abspath(model_path) + self.params_path = os.path.abspath(params_path) + self.label_file = os.path.abspath(label_file) + + logger.info(self.cfg_path) + logger.info(self.model_path) + logger.info(self.params_path) + logger.info(self.label_file) + + # config + with open(self.cfg_path, 'r') as f: + self._conf = yaml.safe_load(f) + logger.info("Read cfg file successfully.") + + # labels + self._label_list = [] + with open(self.label_file, 'r') as f: + for line in f: + self._label_list.append(line.strip()) + logger.info("Read label file successfully.") + + # Create predictor + self.predictor_conf = predictor_conf + self.predictor = init_predictor( + model_file=self.model_path, + params_file=self.params_path, + predictor_conf=self.predictor_conf) + logger.info("Create predictor successfully.") + + @paddle.no_grad() + def infer(self): + """ + Model inference and result stored in self.output. + """ + output = run_model(self.predictor, [self._inputs['feats'].numpy()]) + self._outputs['logits'] = output[0] + + +class CLSEngine(BaseEngine): + """CLS server engine + + Args: + metaclass: Defaults to Singleton. + """ + + def __init__(self): + super(CLSEngine, self).__init__() + + def init(self, config: dict) -> bool: + """init engine resource + + Args: + config_file (str): config file + + Returns: + bool: init failed or success + """ + self.executor = CLSServerExecutor() + self.config = config + self.executor._init_from_path( + self.config.model_type, self.config.cfg_path, + self.config.model_path, self.config.params_path, + self.config.label_file, self.config.predictor_conf) + + logger.info("Initialize CLS server engine successfully.") + return True + + def run(self, audio_data): + """engine run + + Args: + audio_data (bytes): base64.b64decode + """ + + self.executor.preprocess(io.BytesIO(audio_data)) + st = time.time() + self.executor.infer() + infer_time = time.time() - st + + logger.info("inference time: {}".format(infer_time)) + logger.info("cls engine type: inference") + + def postprocess(self, topk: int): + """postprocess + """ + assert topk <= len(self.executor._label_list + ), 'Value of topk is larger than number of labels.' + + result = np.squeeze(self.executor._outputs['logits'], axis=0) + topk_idx = (-result).argsort()[:topk] + topk_results = [] + for idx in topk_idx: + res = {} + label, score = self.executor._label_list[idx], result[idx] + res['class_name'] = label + res['prob'] = score + topk_results.append(res) + + return topk_results diff --git a/ernie-sat/paddlespeech/server/engine/cls/python/__init__.py b/ernie-sat/paddlespeech/server/engine/cls/python/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..97043fd7ba6885aac81cad5a49924c23c67d4d47 --- /dev/null +++ b/ernie-sat/paddlespeech/server/engine/cls/python/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/ernie-sat/paddlespeech/server/engine/cls/python/cls_engine.py b/ernie-sat/paddlespeech/server/engine/cls/python/cls_engine.py new file mode 100644 index 0000000000000000000000000000000000000000..1a975b0a05b4d0163e47877b5141da529ad5f004 --- /dev/null +++ b/ernie-sat/paddlespeech/server/engine/cls/python/cls_engine.py @@ -0,0 +1,124 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import io +import time +from typing import List + +import paddle + +from paddlespeech.cli.cls.infer import CLSExecutor +from paddlespeech.cli.log import logger +from paddlespeech.server.engine.base_engine import BaseEngine + +__all__ = ['CLSEngine'] + + +class CLSServerExecutor(CLSExecutor): + def __init__(self): + super().__init__() + pass + + def get_topk_results(self, topk: int) -> List: + assert topk <= len( + self._label_list), 'Value of topk is larger than number of labels.' + + result = self._outputs['logits'].squeeze(0).numpy() + topk_idx = (-result).argsort()[:topk] + res = {} + topk_results = [] + for idx in topk_idx: + label, score = self._label_list[idx], result[idx] + res['class'] = label + res['prob'] = score + topk_results.append(res) + return topk_results + + +class CLSEngine(BaseEngine): + """CLS server engine + + Args: + metaclass: Defaults to Singleton. + """ + + def __init__(self): + super(CLSEngine, self).__init__() + + def init(self, config: dict) -> bool: + """init engine resource + + Args: + config_file (str): config file + + Returns: + bool: init failed or success + """ + self.input = None + self.output = None + self.executor = CLSServerExecutor() + self.config = config + try: + if self.config.device: + self.device = self.config.device + else: + self.device = paddle.get_device() + paddle.set_device(self.device) + except BaseException: + logger.error( + "Set device failed, please check if device is already used and the parameter 'device' in the yaml file" + ) + + try: + self.executor._init_from_path( + self.config.model, self.config.cfg_path, self.config.ckpt_path, + self.config.label_file) + except BaseException: + logger.error("Initialize CLS server engine Failed.") + return False + + logger.info("Initialize CLS server engine successfully on device: %s." 
% + (self.device)) + return True + + def run(self, audio_data): + """engine run + + Args: + audio_data (bytes): base64.b64decode + """ + self.executor.preprocess(io.BytesIO(audio_data)) + st = time.time() + self.executor.infer() + infer_time = time.time() - st + + logger.info("inference time: {}".format(infer_time)) + logger.info("cls engine type: python") + + def postprocess(self, topk: int): + """postprocess + """ + assert topk <= len(self.executor._label_list + ), 'Value of topk is larger than number of labels.' + + result = self.executor._outputs['logits'].squeeze(0).numpy() + topk_idx = (-result).argsort()[:topk] + topk_results = [] + for idx in topk_idx: + res = {} + label, score = self.executor._label_list[idx], result[idx] + res['class_name'] = label + res['prob'] = score + topk_results.append(res) + + return topk_results diff --git a/ernie-sat/paddlespeech/server/engine/engine_factory.py b/ernie-sat/paddlespeech/server/engine/engine_factory.py new file mode 100644 index 0000000000000000000000000000000000000000..2a39fb79b9c4ece5f16ef6761f03af90ddaed79e --- /dev/null +++ b/ernie-sat/paddlespeech/server/engine/engine_factory.py @@ -0,0 +1,44 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Text + +__all__ = ['EngineFactory'] + + +class EngineFactory(object): + @staticmethod + def get_engine(engine_name: Text, engine_type: Text): + if engine_name == 'asr' and engine_type == 'inference': + from paddlespeech.server.engine.asr.paddleinference.asr_engine import ASREngine + return ASREngine() + elif engine_name == 'asr' and engine_type == 'python': + from paddlespeech.server.engine.asr.python.asr_engine import ASREngine + return ASREngine() + elif engine_name == 'asr' and engine_type == 'online': + from paddlespeech.server.engine.asr.online.asr_engine import ASREngine + return ASREngine() + elif engine_name == 'tts' and engine_type == 'inference': + from paddlespeech.server.engine.tts.paddleinference.tts_engine import TTSEngine + return TTSEngine() + elif engine_name == 'tts' and engine_type == 'python': + from paddlespeech.server.engine.tts.python.tts_engine import TTSEngine + return TTSEngine() + elif engine_name == 'cls' and engine_type == 'inference': + from paddlespeech.server.engine.cls.paddleinference.cls_engine import CLSEngine + return CLSEngine() + elif engine_name == 'cls' and engine_type == 'python': + from paddlespeech.server.engine.cls.python.cls_engine import CLSEngine + return CLSEngine() + else: + return None diff --git a/ernie-sat/paddlespeech/server/engine/engine_pool.py b/ernie-sat/paddlespeech/server/engine/engine_pool.py new file mode 100644 index 0000000000000000000000000000000000000000..9de73567e47c8150a7b2807d4bf1cc299e0e1b40 --- /dev/null +++ b/ernie-sat/paddlespeech/server/engine/engine_pool.py @@ -0,0 +1,40 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from paddlespeech.server.engine.engine_factory import EngineFactory + +# global value +ENGINE_POOL = {} + + +def get_engine_pool() -> dict: + """ Get engine pool + """ + global ENGINE_POOL + return ENGINE_POOL + + +def init_engine_pool(config) -> bool: + """ Init engine pool + """ + global ENGINE_POOL + + for engine_and_type in config.engine_list: + engine = engine_and_type.split("_")[0] + engine_type = engine_and_type.split("_")[1] + ENGINE_POOL[engine] = EngineFactory.get_engine( + engine_name=engine, engine_type=engine_type) + if not ENGINE_POOL[engine].init(config=config[engine_and_type]): + return False + + return True diff --git a/ernie-sat/paddlespeech/server/engine/tts/__init__.py b/ernie-sat/paddlespeech/server/engine/tts/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..97043fd7ba6885aac81cad5a49924c23c67d4d47 --- /dev/null +++ b/ernie-sat/paddlespeech/server/engine/tts/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/ernie-sat/paddlespeech/server/engine/tts/paddleinference/__init__.py b/ernie-sat/paddlespeech/server/engine/tts/paddleinference/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..97043fd7ba6885aac81cad5a49924c23c67d4d47 --- /dev/null +++ b/ernie-sat/paddlespeech/server/engine/tts/paddleinference/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/ernie-sat/paddlespeech/server/engine/tts/paddleinference/tts_engine.py b/ernie-sat/paddlespeech/server/engine/tts/paddleinference/tts_engine.py new file mode 100644 index 0000000000000000000000000000000000000000..db8813ba901a93fa935ce003b8a7abdeec245485 --- /dev/null +++ b/ernie-sat/paddlespeech/server/engine/tts/paddleinference/tts_engine.py @@ -0,0 +1,534 @@ +# Copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import base64 +import io +import os +import time +from typing import Optional + +import librosa +import numpy as np +import paddle +import soundfile as sf +from scipy.io import wavfile + +from paddlespeech.cli.log import logger +from paddlespeech.cli.tts.infer import TTSExecutor +from paddlespeech.cli.utils import download_and_decompress +from paddlespeech.cli.utils import MODEL_HOME +from paddlespeech.server.engine.base_engine import BaseEngine +from paddlespeech.server.utils.audio_process import change_speed +from paddlespeech.server.utils.errors import ErrorCode +from paddlespeech.server.utils.exception import ServerBaseException +from paddlespeech.server.utils.paddle_predictor import init_predictor +from paddlespeech.server.utils.paddle_predictor import run_model +from paddlespeech.t2s.frontend import English +from paddlespeech.t2s.frontend.zh_frontend import Frontend + +__all__ = ['TTSEngine'] + +# Static model applied on paddle inference +pretrained_models = { + # speedyspeech + "speedyspeech_csmsc-zh": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_nosil_baker_static_0.5.zip', + 'md5': + 'f10cbdedf47dc7a9668d2264494e1823', + 'model': + 'speedyspeech_csmsc.pdmodel', + 'params': + 'speedyspeech_csmsc.pdiparams', + 'phones_dict': + 'phone_id_map.txt', + 'tones_dict': + 'tone_id_map.txt', + 'sample_rate': + 24000, + }, + # fastspeech2 + "fastspeech2_csmsc-zh": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_static_0.4.zip', + 'md5': + '9788cd9745e14c7a5d12d32670b2a5a7', + 'model': + 'fastspeech2_csmsc.pdmodel', + 'params': + 'fastspeech2_csmsc.pdiparams', + 'phones_dict': + 'phone_id_map.txt', + 'sample_rate': + 24000, + }, + # pwgan + "pwgan_csmsc-zh": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_static_0.4.zip', + 'md5': + 'e3504aed9c5a290be12d1347836d2742', + 'model': + 'pwgan_csmsc.pdmodel', + 'params': + 'pwgan_csmsc.pdiparams', + 'sample_rate': + 24000, + }, + # mb_melgan + "mb_melgan_csmsc-zh": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_static_0.1.1.zip', + 'md5': + 'ac6eee94ba483421d750433f4c3b8d36', + 'model': + 'mb_melgan_csmsc.pdmodel', + 'params': + 'mb_melgan_csmsc.pdiparams', + 'sample_rate': + 24000, + }, + # hifigan + "hifigan_csmsc-zh": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_static_0.1.1.zip', + 'md5': + '7edd8c436b3a5546b3a7cb8cff9d5a0c', + 'model': + 'hifigan_csmsc.pdmodel', + 'params': + 'hifigan_csmsc.pdiparams', + 'sample_rate': + 24000, + }, +} + + +class TTSServerExecutor(TTSExecutor): + def __init__(self): + super().__init__() + pass + + def _get_pretrained_path(self, tag: str) -> os.PathLike: + """ + Download and returns pretrained resources path of current task. 
+ """ + assert tag in pretrained_models, 'Can not find pretrained resources of {}.'.format( + tag) + + res_path = os.path.join(MODEL_HOME, tag) + decompressed_path = download_and_decompress(pretrained_models[tag], + res_path) + decompressed_path = os.path.abspath(decompressed_path) + logger.info( + 'Use pretrained model stored in: {}'.format(decompressed_path)) + return decompressed_path + + def _init_from_path( + self, + am: str='fastspeech2_csmsc', + am_model: Optional[os.PathLike]=None, + am_params: Optional[os.PathLike]=None, + am_sample_rate: int=24000, + phones_dict: Optional[os.PathLike]=None, + tones_dict: Optional[os.PathLike]=None, + speaker_dict: Optional[os.PathLike]=None, + voc: str='pwgan_csmsc', + voc_model: Optional[os.PathLike]=None, + voc_params: Optional[os.PathLike]=None, + voc_sample_rate: int=24000, + lang: str='zh', + am_predictor_conf: dict=None, + voc_predictor_conf: dict=None, ): + """ + Init model and other resources from a specific path. + """ + if hasattr(self, 'am_predictor') and hasattr(self, 'voc_predictor'): + logger.info('Models had been initialized.') + return + # am + am_tag = am + '-' + lang + if am_model is None or am_params is None or phones_dict is None: + am_res_path = self._get_pretrained_path(am_tag) + self.am_res_path = am_res_path + self.am_model = os.path.join(am_res_path, + pretrained_models[am_tag]['model']) + self.am_params = os.path.join(am_res_path, + pretrained_models[am_tag]['params']) + # must have phones_dict in acoustic + self.phones_dict = os.path.join( + am_res_path, pretrained_models[am_tag]['phones_dict']) + self.am_sample_rate = pretrained_models[am_tag]['sample_rate'] + + logger.info(am_res_path) + logger.info(self.am_model) + logger.info(self.am_params) + else: + self.am_model = os.path.abspath(am_model) + self.am_params = os.path.abspath(am_params) + self.phones_dict = os.path.abspath(phones_dict) + self.am_sample_rate = am_sample_rate + self.am_res_path = os.path.dirname(os.path.abspath(self.am_model)) + logger.info("self.phones_dict: {}".format(self.phones_dict)) + + # for speedyspeech + self.tones_dict = None + if 'tones_dict' in pretrained_models[am_tag]: + self.tones_dict = os.path.join( + am_res_path, pretrained_models[am_tag]['tones_dict']) + if tones_dict: + self.tones_dict = tones_dict + + # for multi speaker fastspeech2 + self.speaker_dict = None + if 'speaker_dict' in pretrained_models[am_tag]: + self.speaker_dict = os.path.join( + am_res_path, pretrained_models[am_tag]['speaker_dict']) + if speaker_dict: + self.speaker_dict = speaker_dict + + # voc + voc_tag = voc + '-' + lang + if voc_model is None or voc_params is None: + voc_res_path = self._get_pretrained_path(voc_tag) + self.voc_res_path = voc_res_path + self.voc_model = os.path.join(voc_res_path, + pretrained_models[voc_tag]['model']) + self.voc_params = os.path.join(voc_res_path, + pretrained_models[voc_tag]['params']) + self.voc_sample_rate = pretrained_models[voc_tag]['sample_rate'] + logger.info(voc_res_path) + logger.info(self.voc_model) + logger.info(self.voc_params) + else: + self.voc_model = os.path.abspath(voc_model) + self.voc_params = os.path.abspath(voc_params) + self.voc_sample_rate = voc_sample_rate + self.voc_res_path = os.path.dirname(os.path.abspath(self.voc_model)) + + assert ( + self.voc_sample_rate == self.am_sample_rate + ), "The sample rate of AM and Vocoder model are different, please check model." + + # Init body. 
+ with open(self.phones_dict, "r") as f: + phn_id = [line.strip().split() for line in f.readlines()] + vocab_size = len(phn_id) + logger.info("vocab_size: {}".format(vocab_size)) + + tone_size = None + if self.tones_dict: + with open(self.tones_dict, "r") as f: + tone_id = [line.strip().split() for line in f.readlines()] + tone_size = len(tone_id) + logger.info("tone_size: {}".format(tone_size)) + + spk_num = None + if self.speaker_dict: + with open(self.speaker_dict, 'rt') as f: + spk_id = [line.strip().split() for line in f.readlines()] + spk_num = len(spk_id) + logger.info("spk_num: {}".format(spk_num)) + + # frontend + if lang == 'zh': + self.frontend = Frontend( + phone_vocab_path=self.phones_dict, + tone_vocab_path=self.tones_dict) + + elif lang == 'en': + self.frontend = English(phone_vocab_path=self.phones_dict) + logger.info("frontend done!") + + # Create am predictor + self.am_predictor_conf = am_predictor_conf + self.am_predictor = init_predictor( + model_file=self.am_model, + params_file=self.am_params, + predictor_conf=self.am_predictor_conf) + logger.info("Create AM predictor successfully.") + + # Create voc predictor + self.voc_predictor_conf = voc_predictor_conf + self.voc_predictor = init_predictor( + model_file=self.voc_model, + params_file=self.voc_params, + predictor_conf=self.voc_predictor_conf) + logger.info("Create Vocoder predictor successfully.") + + @paddle.no_grad() + def infer(self, + text: str, + lang: str='zh', + am: str='fastspeech2_csmsc', + spk_id: int=0): + """ + Model inference and result stored in self.output. + """ + am_name = am[:am.rindex('_')] + am_dataset = am[am.rindex('_') + 1:] + get_tone_ids = False + merge_sentences = False + frontend_st = time.time() + if am_name == 'speedyspeech': + get_tone_ids = True + if lang == 'zh': + input_ids = self.frontend.get_input_ids( + text, + merge_sentences=merge_sentences, + get_tone_ids=get_tone_ids) + phone_ids = input_ids["phone_ids"] + if get_tone_ids: + tone_ids = input_ids["tone_ids"] + elif lang == 'en': + input_ids = self.frontend.get_input_ids( + text, merge_sentences=merge_sentences) + phone_ids = input_ids["phone_ids"] + else: + logger.error("lang should in {'zh', 'en'}!") + self.frontend_time = time.time() - frontend_st + + self.am_time = 0 + self.voc_time = 0 + flags = 0 + for i in range(len(phone_ids)): + am_st = time.time() + part_phone_ids = phone_ids[i] + # am + if am_name == 'speedyspeech': + part_tone_ids = tone_ids[i] + am_result = run_model( + self.am_predictor, + [part_phone_ids.numpy(), part_tone_ids.numpy()]) + mel = am_result[0] + + # fastspeech2 + else: + # multi speaker do not have static model + if am_dataset in {"aishell3", "vctk"}: + pass + else: + am_result = run_model(self.am_predictor, + [part_phone_ids.numpy()]) + mel = am_result[0] + self.am_time += (time.time() - am_st) + + # voc + voc_st = time.time() + voc_result = run_model(self.voc_predictor, [mel]) + wav = voc_result[0] + wav = paddle.to_tensor(wav) + + if flags == 0: + wav_all = wav + flags = 1 + else: + wav_all = paddle.concat([wav_all, wav]) + self.voc_time += (time.time() - voc_st) + self._outputs['wav'] = wav_all + + +class TTSEngine(BaseEngine): + """TTS server engine + + Args: + metaclass: Defaults to Singleton. 
+ """ + + def __init__(self): + """Initialize TTS server engine + """ + super(TTSEngine, self).__init__() + + def init(self, config: dict) -> bool: + self.executor = TTSServerExecutor() + + self.config = config + self.executor._init_from_path( + am=self.config.am, + am_model=self.config.am_model, + am_params=self.config.am_params, + am_sample_rate=self.config.am_sample_rate, + phones_dict=self.config.phones_dict, + tones_dict=self.config.tones_dict, + speaker_dict=self.config.speaker_dict, + voc=self.config.voc, + voc_model=self.config.voc_model, + voc_params=self.config.voc_params, + voc_sample_rate=self.config.voc_sample_rate, + lang=self.config.lang, + am_predictor_conf=self.config.am_predictor_conf, + voc_predictor_conf=self.config.voc_predictor_conf, ) + + logger.info("Initialize TTS server engine successfully.") + return True + + def postprocess(self, + wav, + original_fs: int, + target_fs: int=0, + volume: float=1.0, + speed: float=1.0, + audio_path: str=None): + """Post-processing operations, including speech, volume, sample rate, save audio file + + Args: + wav (numpy(float)): Synthesized audio sample points + original_fs (int): original audio sample rate + target_fs (int): target audio sample rate + volume (float): target volume + speed (float): target speed + + Raises: + ServerBaseException: Throws an exception if the change speed unsuccessfully. + + Returns: + target_fs: target sample rate for synthesized audio. + wav_base64: The base64 format of the synthesized audio. + """ + + # transform sample_rate + if target_fs == 0 or target_fs > original_fs: + target_fs = original_fs + wav_tar_fs = wav + logger.info( + "The sample rate of synthesized audio is the same as model, which is {}Hz". + format(original_fs)) + else: + wav_tar_fs = librosa.resample( + np.squeeze(wav), original_fs, target_fs) + logger.info( + "The sample rate of model is {}Hz and the target sample rate is {}Hz. Converting the sample rate of the synthesized audio successfully.". + format(original_fs, target_fs)) + # transform volume + wav_vol = wav_tar_fs * volume + logger.info("Transform the volume of the audio successfully.") + + # transform speed + try: # windows not support soxbindings + wav_speed = change_speed(wav_vol, speed, target_fs) + logger.info("Transform the speed of the audio successfully.") + except ServerBaseException: + raise ServerBaseException( + ErrorCode.SERVER_INTERNAL_ERR, + "Failed to transform speed. Can not install soxbindings on your system. 
\ + You need to set speed value 1.0.") + except BaseException: + logger.error("Failed to transform speed.") + + # wav to base64 + buf = io.BytesIO() + wavfile.write(buf, target_fs, wav_speed) + base64_bytes = base64.b64encode(buf.read()) + wav_base64 = base64_bytes.decode('utf-8') + logger.info("Audio to string successfully.") + + # save audio + if audio_path is not None: + if audio_path.endswith(".wav"): + sf.write(audio_path, wav_speed, target_fs) + elif audio_path.endswith(".pcm"): + wav_norm = wav_speed * (32767 / max(0.001, + np.max(np.abs(wav_speed)))) + with open(audio_path, "wb") as f: + f.write(wav_norm.astype(np.int16)) + logger.info("Save audio to {} successfully.".format(audio_path)) + else: + logger.info("There is no need to save audio.") + + return target_fs, wav_base64 + + def run(self, + sentence: str, + spk_id: int=0, + speed: float=1.0, + volume: float=1.0, + sample_rate: int=0, + save_path: str=None): + """get the result of the server response + + Args: + sentence (str): sentence to be synthesized + spk_id (int, optional): speaker id. Defaults to 0. + speed (float, optional): audio speed, 0 < speed <=3.0. Defaults to 1.0. + volume (float, optional): The volume relative to the audio synthesized by the model, + 0 < volume <=3.0. Defaults to 1.0. + sample_rate (int, optional): Set the sample rate of the synthesized audio. + 0 represents the sample rate for model synthesis. Defaults to 0. + save_path (str, optional): The save path of the synthesized audio. Defaults to None. + + Raises: + ServerBaseException: Throws an exception if tts inference unsuccessfully. + ServerBaseException: Throws an exception if postprocess unsuccessfully. + + Returns: + lang: model language + target_sample_rate: target sample rate for synthesized audio. + wav_base64: The base64 format of the synthesized audio. + """ + + lang = self.config.lang + + try: + infer_st = time.time() + self.executor.infer( + text=sentence, lang=lang, am=self.config.am, spk_id=spk_id) + infer_et = time.time() + infer_time = infer_et - infer_st + + except ServerBaseException: + raise ServerBaseException(ErrorCode.SERVER_INTERNAL_ERR, + "tts infer failed.") + except BaseException: + logger.error("tts infer failed.") + + try: + postprocess_st = time.time() + target_sample_rate, wav_base64 = self.postprocess( + wav=self.executor._outputs['wav'].numpy(), + original_fs=self.executor.am_sample_rate, + target_fs=sample_rate, + volume=volume, + speed=speed, + audio_path=save_path) + postprocess_et = time.time() + postprocess_time = postprocess_et - postprocess_st + duration = len(self.executor._outputs['wav'] + .numpy()) / self.executor.am_sample_rate + rtf = infer_time / duration + + except ServerBaseException: + raise ServerBaseException(ErrorCode.SERVER_INTERNAL_ERR, + "tts postprocess failed.") + except BaseException: + logger.error("tts postprocess failed.") + + logger.info("AM model: {}".format(self.config.am)) + logger.info("Vocoder model: {}".format(self.config.voc)) + logger.info("Language: {}".format(lang)) + logger.info("tts engine type: paddle inference") + + logger.info("audio duration: {}".format(duration)) + logger.info( + "frontend inference time: {}".format(self.executor.frontend_time)) + logger.info("AM inference time: {}".format(self.executor.am_time)) + logger.info("Vocoder inference time: {}".format(self.executor.voc_time)) + logger.info("total inference time: {}".format(infer_time)) + logger.info( + "postprocess (change speed, volume, target sample rate) time: {}". 
+ format(postprocess_time)) + logger.info("total generate audio time: {}".format(infer_time + + postprocess_time)) + logger.info("RTF: {}".format(rtf)) + + return lang, target_sample_rate, duration, wav_base64 diff --git a/ernie-sat/paddlespeech/server/engine/tts/python/__init__.py b/ernie-sat/paddlespeech/server/engine/tts/python/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..97043fd7ba6885aac81cad5a49924c23c67d4d47 --- /dev/null +++ b/ernie-sat/paddlespeech/server/engine/tts/python/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/ernie-sat/paddlespeech/server/engine/tts/python/tts_engine.py b/ernie-sat/paddlespeech/server/engine/tts/python/tts_engine.py new file mode 100644 index 0000000000000000000000000000000000000000..f153f60b966682fea72418643b29adc38ffa1f07 --- /dev/null +++ b/ernie-sat/paddlespeech/server/engine/tts/python/tts_engine.py @@ -0,0 +1,253 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import base64 +import io +import time + +import librosa +import numpy as np +import paddle +import soundfile as sf +from scipy.io import wavfile + +from paddlespeech.cli.log import logger +from paddlespeech.cli.tts.infer import TTSExecutor +from paddlespeech.server.engine.base_engine import BaseEngine +from paddlespeech.server.utils.audio_process import change_speed +from paddlespeech.server.utils.errors import ErrorCode +from paddlespeech.server.utils.exception import ServerBaseException + +__all__ = ['TTSEngine'] + + +class TTSServerExecutor(TTSExecutor): + def __init__(self): + super().__init__() + pass + + +class TTSEngine(BaseEngine): + """TTS server engine + + Args: + metaclass: Defaults to Singleton. + """ + + def __init__(self, name=None): + """Initialize TTS server engine + """ + super(TTSEngine, self).__init__() + + def init(self, config: dict) -> bool: + self.executor = TTSServerExecutor() + + try: + self.config = config + if self.config.device: + self.device = self.config.device + else: + self.device = paddle.get_device() + paddle.set_device(self.device) + except BaseException: + logger.error( + "Set device failed, please check if device is already used and the parameter 'device' in the yaml file" + ) + logger.error("Initialize TTS server engine Failed on device: %s." 
% + (self.device)) + return False + + try: + self.executor._init_from_path( + am=self.config.am, + am_config=self.config.am_config, + am_ckpt=self.config.am_ckpt, + am_stat=self.config.am_stat, + phones_dict=self.config.phones_dict, + tones_dict=self.config.tones_dict, + speaker_dict=self.config.speaker_dict, + voc=self.config.voc, + voc_config=self.config.voc_config, + voc_ckpt=self.config.voc_ckpt, + voc_stat=self.config.voc_stat, + lang=self.config.lang) + except BaseException: + logger.error("Failed to get model related files.") + logger.error("Initialize TTS server engine Failed on device: %s." % + (self.device)) + return False + + logger.info("Initialize TTS server engine successfully on device: %s." % + (self.device)) + return True + + def postprocess(self, + wav, + original_fs: int, + target_fs: int=0, + volume: float=1.0, + speed: float=1.0, + audio_path: str=None): + """Post-processing operations, including speech, volume, sample rate, save audio file + + Args: + wav (numpy(float)): Synthesized audio sample points + original_fs (int): original audio sample rate + target_fs (int): target audio sample rate + volume (float): target volume + speed (float): target speed + + Raises: + ServerBaseException: Throws an exception if the change speed unsuccessfully. + + Returns: + target_fs: target sample rate for synthesized audio. + wav_base64: The base64 format of the synthesized audio. + """ + + # transform sample_rate + if target_fs == 0 or target_fs > original_fs: + target_fs = original_fs + wav_tar_fs = wav + logger.info( + "The sample rate of synthesized audio is the same as model, which is {}Hz". + format(original_fs)) + else: + wav_tar_fs = librosa.resample( + np.squeeze(wav), original_fs, target_fs) + logger.info( + "The sample rate of model is {}Hz and the target sample rate is {}Hz. Converting the sample rate of the synthesized audio successfully.". + format(original_fs, target_fs)) + # transform volume + wav_vol = wav_tar_fs * volume + logger.info("Transform the volume of the audio successfully.") + + # transform speed + try: # windows not support soxbindings + wav_speed = change_speed(wav_vol, speed, target_fs) + logger.info("Transform the speed of the audio successfully.") + except ServerBaseException: + raise ServerBaseException( + ErrorCode.SERVER_INTERNAL_ERR, + "Failed to transform speed. Can not install soxbindings on your system. \ + You need to set speed value 1.0.") + except BaseException: + logger.error("Failed to transform speed.") + + # wav to base64 + buf = io.BytesIO() + wavfile.write(buf, target_fs, wav_speed) + base64_bytes = base64.b64encode(buf.read()) + wav_base64 = base64_bytes.decode('utf-8') + logger.info("Audio to string successfully.") + + # save audio + if audio_path is not None: + if audio_path.endswith(".wav"): + sf.write(audio_path, wav_speed, target_fs) + elif audio_path.endswith(".pcm"): + wav_norm = wav_speed * (32767 / max(0.001, + np.max(np.abs(wav_speed)))) + with open(audio_path, "wb") as f: + f.write(wav_norm.astype(np.int16)) + logger.info("Save audio to {} successfully.".format(audio_path)) + else: + logger.info("There is no need to save audio.") + + return target_fs, wav_base64 + + def run(self, + sentence: str, + spk_id: int=0, + speed: float=1.0, + volume: float=1.0, + sample_rate: int=0, + save_path: str=None): + """ run include inference and postprocess. + + Args: + sentence (str): text to be synthesized + spk_id (int, optional): speaker id for multi-speaker speech synthesis. Defaults to 0. + speed (float, optional): speed. 
Defaults to 1.0. + volume (float, optional): volume. Defaults to 1.0. + sample_rate (int, optional): target sample rate for synthesized audio, + 0 means the same as the model sampling rate. Defaults to 0. + save_path (str, optional): The save path of the synthesized audio. + None means do not save audio. Defaults to None. + + Raises: + ServerBaseException: Throws an exception if tts inference unsuccessfully. + ServerBaseException: Throws an exception if postprocess unsuccessfully. + + Returns: + lang: model language + target_sample_rate: target sample rate for synthesized audio. + wav_base64: The base64 format of the synthesized audio. + """ + + lang = self.config.lang + + try: + infer_st = time.time() + self.executor.infer( + text=sentence, lang=lang, am=self.config.am, spk_id=spk_id) + infer_et = time.time() + infer_time = infer_et - infer_st + duration = len(self.executor._outputs['wav'] + .numpy()) / self.executor.am_config.fs + rtf = infer_time / duration + + except ServerBaseException: + raise ServerBaseException(ErrorCode.SERVER_INTERNAL_ERR, + "tts infer failed.") + except BaseException: + logger.error("tts infer failed.") + + try: + postprocess_st = time.time() + target_sample_rate, wav_base64 = self.postprocess( + wav=self.executor._outputs['wav'].numpy(), + original_fs=self.executor.am_config.fs, + target_fs=sample_rate, + volume=volume, + speed=speed, + audio_path=save_path) + postprocess_et = time.time() + postprocess_time = postprocess_et - postprocess_st + + except ServerBaseException: + raise ServerBaseException(ErrorCode.SERVER_INTERNAL_ERR, + "tts postprocess failed.") + except BaseException: + logger.error("tts postprocess failed.") + + logger.info("AM model: {}".format(self.config.am)) + logger.info("Vocoder model: {}".format(self.config.voc)) + logger.info("Language: {}".format(lang)) + logger.info("tts engine type: python") + + logger.info("audio duration: {}".format(duration)) + logger.info( + "frontend inference time: {}".format(self.executor.frontend_time)) + logger.info("AM inference time: {}".format(self.executor.am_time)) + logger.info("Vocoder inference time: {}".format(self.executor.voc_time)) + logger.info("total inference time: {}".format(infer_time)) + logger.info( + "postprocess (change speed, volume, target sample rate) time: {}". + format(postprocess_time)) + logger.info("total generate audio time: {}".format(infer_time + + postprocess_time)) + logger.info("RTF: {}".format(rtf)) + logger.info("device: {}".format(self.device)) + + return lang, target_sample_rate, duration, wav_base64 diff --git a/ernie-sat/paddlespeech/server/entry.py b/ernie-sat/paddlespeech/server/entry.py new file mode 100644 index 0000000000000000000000000000000000000000..f817321d06544db844fc6000616e70307a548379 --- /dev/null +++ b/ernie-sat/paddlespeech/server/entry.py @@ -0,0 +1,57 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import sys +from collections import defaultdict + +__all__ = ['server_commands', 'client_commands'] + + +def _CommandDict(): + return defaultdict(_CommandDict) + + +def server_execute(): + com = server_commands + idx = 0 + for _argv in (['paddlespeech_server'] + sys.argv[1:]): + if _argv not in com: + break + idx += 1 + com = com[_argv] + + # The method 'execute' of a command instance returns 'True' for a success + # while 'False' for a failure. Here converts this result into a exit status + # in bash: 0 for a success and 1 for a failure. + status = 0 if com['_entry']().execute(sys.argv[idx:]) else 1 + return status + + +def client_execute(): + com = client_commands + idx = 0 + for _argv in (['paddlespeech_client'] + sys.argv[1:]): + if _argv not in com: + break + idx += 1 + com = com[_argv] + + # The method 'execute' of a command instance returns 'True' for a success + # while 'False' for a failure. Here converts this result into a exit status + # in bash: 0 for a success and 1 for a failure. + status = 0 if com['_entry']().execute(sys.argv[idx:]) else 1 + return status + + +server_commands = _CommandDict() +client_commands = _CommandDict() diff --git a/ernie-sat/paddlespeech/server/executor.py b/ernie-sat/paddlespeech/server/executor.py new file mode 100644 index 0000000000000000000000000000000000000000..fa2d01a9749c3543aaed22a846a80142c8823b45 --- /dev/null +++ b/ernie-sat/paddlespeech/server/executor.py @@ -0,0 +1,46 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse +from abc import ABC +from abc import abstractmethod +from typing import List + + +class BaseExecutor(ABC): + """ + An abstract executor of paddlespeech server tasks. + """ + + def __init__(self): + self.parser = argparse.ArgumentParser() + + @abstractmethod + def execute(self, argv: List[str]) -> bool: + """ + Command line entry. This method can only be accessed by a command line such as `paddlespeech asr`. + + Args: + argv (List[str]): Arguments from command line. + + Returns: + int: Result of the command execution. `True` for a success and `False` for a failure. + """ + pass + + @abstractmethod + def __call__(self, *arg, **kwargs): + """ + Python API to call an executor. + """ + pass diff --git a/ernie-sat/paddlespeech/server/restful/__init__.py b/ernie-sat/paddlespeech/server/restful/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..97043fd7ba6885aac81cad5a49924c23c67d4d47 --- /dev/null +++ b/ernie-sat/paddlespeech/server/restful/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/ernie-sat/paddlespeech/server/restful/api.py b/ernie-sat/paddlespeech/server/restful/api.py new file mode 100644 index 0000000000000000000000000000000000000000..3f91a03b6473f95a69f6a4f0da3ce3c9b911eeae --- /dev/null +++ b/ernie-sat/paddlespeech/server/restful/api.py @@ -0,0 +1,44 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import List + +from fastapi import APIRouter + +from paddlespeech.server.restful.asr_api import router as asr_router +from paddlespeech.server.restful.cls_api import router as cls_router +from paddlespeech.server.restful.tts_api import router as tts_router + +_router = APIRouter() + + +def setup_router(api_list: List): + """setup router for fastapi + + Args: + api_list (List): [asr, tts, cls] + + Returns: + APIRouter + """ + for api_name in api_list: + if api_name == 'asr': + _router.include_router(asr_router) + elif api_name == 'tts': + _router.include_router(tts_router) + elif api_name == 'cls': + _router.include_router(cls_router) + else: + pass + + return _router diff --git a/ernie-sat/paddlespeech/server/restful/asr_api.py b/ernie-sat/paddlespeech/server/restful/asr_api.py new file mode 100644 index 0000000000000000000000000000000000000000..cf46735dcc84dc92c8bfcfa71b426604ed7c1843 --- /dev/null +++ b/ernie-sat/paddlespeech/server/restful/asr_api.py @@ -0,0 +1,91 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import base64 +import traceback +from typing import Union + +from fastapi import APIRouter + +from paddlespeech.server.engine.engine_pool import get_engine_pool +from paddlespeech.server.restful.request import ASRRequest +from paddlespeech.server.restful.response import ASRResponse +from paddlespeech.server.restful.response import ErrorResponse +from paddlespeech.server.utils.errors import ErrorCode +from paddlespeech.server.utils.errors import failed_response +from paddlespeech.server.utils.exception import ServerBaseException + +router = APIRouter() + + +@router.get('/paddlespeech/asr/help') +def help(): + """help + + Returns: + json: [description] + """ + response = { + "success": "True", + "code": 200, + "message": { + "global": "success" + }, + "result": { + "description": "asr server", + "input": "base64 string of wavfile", + "output": "transcription" + } + } + return response + + +@router.post( + "/paddlespeech/asr", response_model=Union[ASRResponse, ErrorResponse]) +def asr(request_body: ASRRequest): + """asr api + + Args: + request_body (ASRRequest): [description] + + Returns: + json: [description] + """ + try: + audio_data = base64.b64decode(request_body.audio) + + # get single engine from engine pool + engine_pool = get_engine_pool() + asr_engine = engine_pool['asr'] + + asr_engine.run(audio_data) + asr_results = asr_engine.postprocess() + + response = { + "success": True, + "code": 200, + "message": { + "description": "success" + }, + "result": { + "transcription": asr_results + } + } + + except ServerBaseException as e: + response = failed_response(e.error_code, e.msg) + except BaseException: + response = failed_response(ErrorCode.SERVER_UNKOWN_ERR) + traceback.print_exc() + + return response diff --git a/ernie-sat/paddlespeech/server/restful/cls_api.py b/ernie-sat/paddlespeech/server/restful/cls_api.py new file mode 100644 index 0000000000000000000000000000000000000000..306d9ca9c11ce824cba3982492ea285f6d99a3ff --- /dev/null +++ b/ernie-sat/paddlespeech/server/restful/cls_api.py @@ -0,0 +1,92 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import base64 +import traceback +from typing import Union + +from fastapi import APIRouter + +from paddlespeech.server.engine.engine_pool import get_engine_pool +from paddlespeech.server.restful.request import CLSRequest +from paddlespeech.server.restful.response import CLSResponse +from paddlespeech.server.restful.response import ErrorResponse +from paddlespeech.server.utils.errors import ErrorCode +from paddlespeech.server.utils.errors import failed_response +from paddlespeech.server.utils.exception import ServerBaseException + +router = APIRouter() + + +@router.get('/paddlespeech/cls/help') +def help(): + """help + + Returns: + json: [description] + """ + response = { + "success": "True", + "code": 200, + "message": { + "global": "success" + }, + "result": { + "description": "cls server", + "input": "base64 string of wavfile", + "output": "classification result" + } + } + return response + + +@router.post( + "/paddlespeech/cls", response_model=Union[CLSResponse, ErrorResponse]) +def cls(request_body: CLSRequest): + """cls api + + Args: + request_body (CLSRequest): [description] + + Returns: + json: [description] + """ + try: + audio_data = base64.b64decode(request_body.audio) + + # get single engine from engine pool + engine_pool = get_engine_pool() + cls_engine = engine_pool['cls'] + + cls_engine.run(audio_data) + cls_results = cls_engine.postprocess(request_body.topk) + + response = { + "success": True, + "code": 200, + "message": { + "description": "success" + }, + "result": { + "topk": request_body.topk, + "results": cls_results + } + } + + except ServerBaseException as e: + response = failed_response(e.error_code, e.msg) + except BaseException: + response = failed_response(ErrorCode.SERVER_UNKOWN_ERR) + traceback.print_exc() + + return response diff --git a/ernie-sat/paddlespeech/server/restful/request.py b/ernie-sat/paddlespeech/server/restful/request.py new file mode 100644 index 0000000000000000000000000000000000000000..dbac9dac881f7b3ed04e0ab17592b0eb5ff5884d --- /dev/null +++ b/ernie-sat/paddlespeech/server/restful/request.py @@ -0,0 +1,80 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from typing import Optional + +from pydantic import BaseModel + +__all__ = ['ASRRequest', 'TTSRequest', 'CLSRequest'] + + +#****************************************************************************************/ +#************************************ ASR request ***************************************/ +#****************************************************************************************/ +class ASRRequest(BaseModel): + """ + request body example + { + "audio": "exSI6ICJlbiIsCgkgICAgInBvc2l0aW9uIjogImZhbHNlIgoJf...", + "audio_format": "wav", + "sample_rate": 16000, + "lang": "zh_cn", + "punc":false + } + """ + audio: str + audio_format: str + sample_rate: int + lang: str + punc: Optional[bool] = None + + +#****************************************************************************************/ +#************************************ TTS request ***************************************/ +#****************************************************************************************/ +class TTSRequest(BaseModel): + """TTS request + + request body example + { + "text": "你好,欢迎使用百度飞桨语音合成服务。", + "spk_id": 0, + "speed": 1.0, + "volume": 1.0, + "sample_rate": 0, + "tts_audio_path": "./tts.wav" + } + + """ + + text: str + spk_id: int = 0 + speed: float = 1.0 + volume: float = 1.0 + sample_rate: int = 0 + save_path: str = None + + +#****************************************************************************************/ +#************************************ CLS request ***************************************/ +#****************************************************************************************/ +class CLSRequest(BaseModel): + """ + request body example + { + "audio": "exSI6ICJlbiIsCgkgICAgInBvc2l0aW9uIjogImZhbHNlIgoJf...", + "topk": 1 + } + """ + audio: str + topk: int = 1 diff --git a/ernie-sat/paddlespeech/server/restful/response.py b/ernie-sat/paddlespeech/server/restful/response.py new file mode 100644 index 0000000000000000000000000000000000000000..a2a207e4f689103c96bd513a8552fcdc3cce24d4 --- /dev/null +++ b/ernie-sat/paddlespeech/server/restful/response.py @@ -0,0 +1,148 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
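
Since the request bodies are plain pydantic models, they can be exercised directly; a small sketch (not part of the patch) showing defaults and validation:

```python
from pydantic import ValidationError

from paddlespeech.server.restful.request import TTSRequest

# Fields left out fall back to the defaults declared on the model.
req = TTSRequest(text="你好,欢迎使用百度飞桨语音合成服务。")
assert req.spk_id == 0 and req.speed == 1.0 and req.sample_rate == 0

# Types are validated (and coerced where possible) on construction.
try:
    TTSRequest(text="hello", speed="fast")
except ValidationError as e:
    print(e.errors()[0]['loc'])  # ('speed',)
```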
+from typing import List + +from pydantic import BaseModel + +__all__ = ['ASRResponse', 'TTSResponse', 'CLSResponse'] + + +class Message(BaseModel): + description: str + + +#****************************************************************************************/ +#************************************ ASR response **************************************/ +#****************************************************************************************/ +class AsrResult(BaseModel): + transcription: str + + +class ASRResponse(BaseModel): + """ + response example + { + "success": true, + "code": 0, + "message": { + "description": "success" + }, + "result": { + "transcription": "你好,飞桨" + } + } + """ + success: bool + code: int + message: Message + result: AsrResult + + +#****************************************************************************************/ +#************************************ TTS response **************************************/ +#****************************************************************************************/ +class TTSResult(BaseModel): + lang: str = "zh" + spk_id: int = 0 + speed: float = 1.0 + volume: float = 1.0 + sample_rate: int + duration: float + save_path: str = None + audio: str + + +class TTSResponse(BaseModel): + """ + response example + { + "success": true, + "code": 200, + "message": { + "description": "success" + }, + "result": { + "lang": "zh", + "spk_id": 0, + "speed": 1.0, + "volume": 1.0, + "sample_rate": 24000, + "duration": 3.6125, + "audio": "LTI1OTIuNjI1OTUwMzQsOTk2OS41NDk4...", + "save_path": "./tts.wav" + } + } + """ + success: bool + code: int + message: Message + result: TTSResult + + +#****************************************************************************************/ +#************************************ CLS response **************************************/ +#****************************************************************************************/ +class CLSResults(BaseModel): + class_name: str + prob: float + + +class CLSResult(BaseModel): + topk: int + results: List[CLSResults] + + +class CLSResponse(BaseModel): + """ + response example + { + "success": true, + "code": 0, + "message": { + "description": "success" + }, + "result": { + topk: 1 + results: [ + { + "class":"Speech", + "prob": 0.9027184844017029 + } + ] + } + } + """ + success: bool + code: int + message: Message + result: CLSResult + + +#****************************************************************************************/ +#********************************** Error response **************************************/ +#****************************************************************************************/ +class ErrorResponse(BaseModel): + """ + response example + { + "success": false, + "code": 0, + "message": { + "description": "Unknown error occurred." + } + } + """ + success: bool + code: int + message: Message diff --git a/ernie-sat/paddlespeech/server/restful/tts_api.py b/ernie-sat/paddlespeech/server/restful/tts_api.py new file mode 100644 index 0000000000000000000000000000000000000000..4e9bbe23ed333d134ab535a499d04b653f7bdf87 --- /dev/null +++ b/ernie-sat/paddlespeech/server/restful/tts_api.py @@ -0,0 +1,127 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import traceback +from typing import Union + +from fastapi import APIRouter + +from paddlespeech.cli.log import logger +from paddlespeech.server.engine.engine_pool import get_engine_pool +from paddlespeech.server.restful.request import TTSRequest +from paddlespeech.server.restful.response import ErrorResponse +from paddlespeech.server.restful.response import TTSResponse +from paddlespeech.server.utils.errors import ErrorCode +from paddlespeech.server.utils.errors import failed_response +from paddlespeech.server.utils.exception import ServerBaseException + +router = APIRouter() + + +@router.get('/paddlespeech/tts/help') +def help(): + """help + + Returns: + json: [description] + """ + response = { + "success": "True", + "code": 200, + "message": { + "global": "success" + }, + "result": { + "description": "tts server", + "text": "sentence to be synthesized", + "audio": "the base64 of audio" + } + } + return response + + +@router.post( + "/paddlespeech/tts", response_model=Union[TTSResponse, ErrorResponse]) +def tts(request_body: TTSRequest): + """tts api + + Args: + request_body (TTSRequest): [description] + + Returns: + json: [description] + """ + + logger.info("request: {}".format(request_body)) + + # get params + text = request_body.text + spk_id = request_body.spk_id + speed = request_body.speed + volume = request_body.volume + sample_rate = request_body.sample_rate + save_path = request_body.save_path + + # Check parameters + if speed <= 0 or speed > 3: + return failed_response( + ErrorCode.SERVER_PARAM_ERR, + "invalid speed value, the value should be between 0 and 3.") + if volume <= 0 or volume > 3: + return failed_response( + ErrorCode.SERVER_PARAM_ERR, + "invalid volume value, the value should be between 0 and 3.") + if sample_rate not in [0, 16000, 8000]: + return failed_response( + ErrorCode.SERVER_PARAM_ERR, + "invalid sample_rate value, the choice of value is 0, 8000, 16000.") + if save_path is not None and not save_path.endswith( + "pcm") and not save_path.endswith("wav"): + return failed_response( + ErrorCode.SERVER_PARAM_ERR, + "invalid save_path, saved audio formats support pcm and wav") + + # run + try: + # get single engine from engine pool + engine_pool = get_engine_pool() + tts_engine = engine_pool['tts'] + logger.info("Get tts engine successfully.") + + lang, target_sample_rate, duration, wav_base64 = tts_engine.run( + text, spk_id, speed, volume, sample_rate, save_path) + + response = { + "success": True, + "code": 200, + "message": { + "description": "success." 
+ }, + "result": { + "lang": lang, + "spk_id": spk_id, + "speed": speed, + "volume": volume, + "sample_rate": target_sample_rate, + "duration": duration, + "save_path": save_path, + "audio": wav_base64 + } + } + except ServerBaseException as e: + response = failed_response(e.error_code, e.msg) + except BaseException: + response = failed_response(ErrorCode.SERVER_UNKOWN_ERR) + traceback.print_exc() + + return response diff --git a/ernie-sat/paddlespeech/server/tests/asr/http_client.py b/ernie-sat/paddlespeech/server/tests/asr/http_client.py new file mode 100644 index 0000000000000000000000000000000000000000..49f2adf7c28954af1fc2efc42b81169989ad471e --- /dev/null +++ b/ernie-sat/paddlespeech/server/tests/asr/http_client.py @@ -0,0 +1,59 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the +import base64 +import json +import time + +import requests + + +def readwav2base64(wav_file): + """ + read wave file and covert to base64 string + """ + with open(wav_file, 'rb') as f: + base64_bytes = base64.b64encode(f.read()) + base64_string = base64_bytes.decode('utf-8') + return base64_string + + +def main(): + """ + main func + """ + url = "http://127.0.0.1:8090/paddlespeech/asr" + + # start Timestamp + time_start = time.time() + + test_audio_dir = "./16_audio.wav" + audio = readwav2base64(test_audio_dir) + + data = { + "audio": audio, + "audio_format": "wav", + "sample_rate": 16000, + "lang": "zh_cn", + } + + r = requests.post(url=url, data=json.dumps(data)) + + # ending Timestamp + time_end = time.time() + print('time cost', time_end - time_start, 's') + + print(r.json()) + + +if __name__ == "__main__": + main() diff --git a/ernie-sat/paddlespeech/server/tests/asr/online/microphone_client.py b/ernie-sat/paddlespeech/server/tests/asr/online/microphone_client.py new file mode 100644 index 0000000000000000000000000000000000000000..2ceaf6d03a07ab922477505c016e3870351d2574 --- /dev/null +++ b/ernie-sat/paddlespeech/server/tests/asr/online/microphone_client.py @@ -0,0 +1,161 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" +record wave from the mic +""" +import asyncio +import json +import logging +import threading +import wave +from signal import SIGINT +from signal import SIGTERM + +import pyaudio +import websockets + + +class ASRAudioHandler(threading.Thread): + def __init__(self, url="127.0.0.1", port=8091): + threading.Thread.__init__(self) + self.url = url + self.port = port + self.url = "ws://" + self.url + ":" + str(self.port) + "/ws/asr" + self.fileName = "./output.wav" + self.chunk = 5120 + self.format = pyaudio.paInt16 + self.channels = 1 + self.rate = 16000 + self._running = True + self._frames = [] + self.data_backup = [] + + def startrecord(self): + """ + start a new thread to record wave + """ + threading._start_new_thread(self.recording, ()) + + def recording(self): + """ + recording wave + """ + self._running = True + self._frames = [] + p = pyaudio.PyAudio() + stream = p.open( + format=self.format, + channels=self.channels, + rate=self.rate, + input=True, + frames_per_buffer=self.chunk) + while (self._running): + data = stream.read(self.chunk) + self._frames.append(data) + self.data_backup.append(data) + + stream.stop_stream() + stream.close() + p.terminate() + + def save(self): + """ + save wave data + """ + p = pyaudio.PyAudio() + wf = wave.open(self.fileName, 'wb') + wf.setnchannels(self.channels) + wf.setsampwidth(p.get_sample_size(self.format)) + wf.setframerate(self.rate) + wf.writeframes(b''.join(self.data_backup)) + wf.close() + p.terminate() + + def stoprecord(self): + """ + stop recording + """ + self._running = False + + async def run(self): + aa = input("是否开始录音? (y/n)") + if aa.strip() == "y": + self.startrecord() + logging.info("*" * 10 + "开始录音,请输入语音") + + async with websockets.connect(self.url) as ws: + # 发送开始指令 + audio_info = json.dumps( + { + "name": "test.wav", + "signal": "start", + "nbest": 5 + }, + sort_keys=True, + indent=4, + separators=(',', ': ')) + await ws.send(audio_info) + msg = await ws.recv() + logging.info("receive msg={}".format(msg)) + + # send bytes data + logging.info("结束录音请: Ctrl + c。继续请按回车。") + try: + while True: + while len(self._frames) > 0: + await ws.send(self._frames.pop(0)) + msg = await ws.recv() + logging.info("receive msg={}".format(msg)) + except asyncio.CancelledError: + # quit + # send finished + audio_info = json.dumps( + { + "name": "test.wav", + "signal": "end", + "nbest": 5 + }, + sort_keys=True, + indent=4, + separators=(',', ': ')) + await ws.send(audio_info) + msg = await ws.recv() + logging.info("receive msg={}".format(msg)) + + self.stoprecord() + logging.info("*" * 10 + "录音结束") + self.save() + elif aa.strip() == "n": + exit() + else: + print("无效输入!") + exit() + + +if __name__ == "__main__": + + logging.basicConfig(level=logging.INFO) + logging.info("asr websocket client start") + + handler = ASRAudioHandler("127.0.0.1", 8091) + loop = asyncio.get_event_loop() + main_task = asyncio.ensure_future(handler.run()) + for signal in [SIGINT, SIGTERM]: + loop.add_signal_handler(signal, main_task.cancel) + try: + loop.run_until_complete(main_task) + finally: + loop.close() + + logging.info("asr websocket client finished") diff --git a/ernie-sat/paddlespeech/server/tests/asr/online/websocket_client.py b/ernie-sat/paddlespeech/server/tests/asr/online/websocket_client.py new file mode 100644 index 0000000000000000000000000000000000000000..58b1a452c19a2e330b32be8826e4d4f693dae440 --- /dev/null +++ b/ernie-sat/paddlespeech/server/tests/asr/online/websocket_client.py @@ -0,0 +1,115 @@ +# Copyright (c) 2022 PaddlePaddle Authors. 
All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +#!/usr/bin/python +# -*- coding: UTF-8 -*- +import argparse +import asyncio +import json +import logging + +import numpy as np +import soundfile +import websockets + + +class ASRAudioHandler: + def __init__(self, url="127.0.0.1", port=8090): + self.url = url + self.port = port + self.url = "ws://" + self.url + ":" + str(self.port) + "/ws/asr" + + def read_wave(self, wavfile_path: str): + samples, sample_rate = soundfile.read(wavfile_path, dtype='int16') + x_len = len(samples) + chunk_stride = 40 * 16 #40ms, sample_rate = 16kHz + chunk_size = 80 * 16 #80ms, sample_rate = 16kHz + + if (x_len - chunk_size) % chunk_stride != 0: + padding_len_x = chunk_stride - (x_len - chunk_size) % chunk_stride + else: + padding_len_x = 0 + + padding = np.zeros((padding_len_x), dtype=samples.dtype) + padded_x = np.concatenate([samples, padding], axis=0) + + num_chunk = (x_len + padding_len_x - chunk_size) / chunk_stride + 1 + num_chunk = int(num_chunk) + + for i in range(0, num_chunk): + start = i * chunk_stride + end = start + chunk_size + x_chunk = padded_x[start:end] + yield x_chunk + + async def run(self, wavfile_path: str): + logging.info("send a message to the server") + # 读取音频 + # self.read_wave() + # 发送 websocket 的 handshake 协议头 + async with websockets.connect(self.url) as ws: + # server 端已经接收到 handshake 协议头 + # 发送开始指令 + audio_info = json.dumps( + { + "name": "test.wav", + "signal": "start", + "nbest": 5 + }, + sort_keys=True, + indent=4, + separators=(',', ': ')) + await ws.send(audio_info) + msg = await ws.recv() + logging.info("receive msg={}".format(msg)) + + # send chunk audio data to engine + for chunk_data in self.read_wave(wavfile_path): + await ws.send(chunk_data.tobytes()) + msg = await ws.recv() + logging.info("receive msg={}".format(msg)) + + # finished + audio_info = json.dumps( + { + "name": "test.wav", + "signal": "end", + "nbest": 5 + }, + sort_keys=True, + indent=4, + separators=(',', ': ')) + await ws.send(audio_info) + msg = await ws.recv() + logging.info("receive msg={}".format(msg)) + + +def main(args): + logging.basicConfig(level=logging.INFO) + logging.info("asr websocket client start") + handler = ASRAudioHandler("127.0.0.1", 8091) + loop = asyncio.get_event_loop() + loop.run_until_complete(handler.run(args.wavfile)) + logging.info("asr websocket client finished") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--wavfile", + action="store", + help="wav file path ", + default="./16_audio.wav") + args = parser.parse_args() + + main(args) diff --git a/ernie-sat/paddlespeech/server/tests/tts/test_client.py b/ernie-sat/paddlespeech/server/tests/tts/test_client.py new file mode 100644 index 0000000000000000000000000000000000000000..e42c9bcfa1cf586333ca333251f63e9b50a1b62f --- /dev/null +++ b/ernie-sat/paddlespeech/server/tests/tts/test_client.py @@ -0,0 +1,104 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
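
The 80 ms / 40 ms numbers in `read_wave` translate into a fixed chunk size and stride in samples; a quick worked check of the padding arithmetic, mirroring the code above, for one second of 16 kHz audio:

```python
chunk_stride = 40 * 16   # 640 samples  = 40 ms at 16 kHz
chunk_size = 80 * 16     # 1280 samples = 80 ms at 16 kHz

x_len = 16000            # exactly one second of audio
remainder = (x_len - chunk_size) % chunk_stride           # 14720 % 640 == 0
padding_len_x = 0 if remainder == 0 else chunk_stride - remainder
num_chunk = (x_len + padding_len_x - chunk_size) // chunk_stride + 1

print(padding_len_x, num_chunk)  # 0 24 -> 24 overlapping 80 ms windows, hopped by 40 ms
```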
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse +import base64 +import io +import json +import os +import random +import time + +import numpy as np +import requests +import soundfile + +from paddlespeech.server.utils.audio_process import wav2pcm + + +# Request and response +def tts_client(args): + """ Request and response + Args: + text: A sentence to be synthesized + outfile: Synthetic audio file + """ + url = 'http://127.0.0.1:8090/paddlespeech/tts' + request = { + "text": args.text, + "spk_id": args.spk_id, + "speed": args.speed, + "volume": args.volume, + "sample_rate": args.sample_rate, + "save_path": args.output + } + + response = requests.post(url, json.dumps(request)) + response_dict = response.json() + wav_base64 = response_dict["result"]["audio"] + + audio_data_byte = base64.b64decode(wav_base64) + # from byte + samples, sample_rate = soundfile.read( + io.BytesIO(audio_data_byte), dtype='float32') + + # transform audio + outfile = args.output + if outfile.endswith(".wav"): + soundfile.write(outfile, samples, sample_rate) + elif outfile.endswith(".pcm"): + temp_wav = str(random.getrandbits(128)) + ".wav" + soundfile.write(temp_wav, samples, sample_rate) + wav2pcm(temp_wav, outfile, data_type=np.int16) + os.system("rm %s" % (temp_wav)) + else: + print("The format for saving audio only supports wav or pcm") + + return len(samples), sample_rate + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + '--text', + type=str, + default="你好,欢迎使用语音合成服务", + help='A sentence to be synthesized') + parser.add_argument('--spk_id', type=int, default=0, help='Speaker id') + parser.add_argument('--speed', type=float, default=1.0, help='Audio speed') + parser.add_argument( + '--volume', type=float, default=1.0, help='Audio volume') + parser.add_argument( + '--sample_rate', + type=int, + default=0, + help='Sampling rate, the default is the same as the model') + parser.add_argument( + '--output', + type=str, + default="./out.wav", + help='Synthesized audio file') + args = parser.parse_args() + + st = time.time() + try: + samples_length, sample_rate = tts_client(args) + time_consume = time.time() - st + duration = samples_length / sample_rate + rtf = time_consume / duration + print("Synthesized audio successfully.") + print("Inference time: %f" % (time_consume)) + print("The duration of synthesized audio: %f" % (duration)) + print("The RTF is: %f" % (rtf)) + except BaseException: + print("Failed to synthesized audio.") diff --git a/ernie-sat/paddlespeech/server/util.py b/ernie-sat/paddlespeech/server/util.py new file mode 100644 index 0000000000000000000000000000000000000000..1f1b0be1bd82f112bfa7c6162fde42c236739243 --- /dev/null +++ b/ernie-sat/paddlespeech/server/util.py @@ -0,0 +1,367 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import hashlib +import inspect +import json +import os +import tarfile +import threading +import time +import uuid +import zipfile +from typing import Any +from typing import Dict + +import paddle +import requests +import yaml +from paddle.framework import load + +import paddleaudio +from . import download +from .entry import client_commands +from .entry import server_commands +try: + from .. import __version__ +except ImportError: + __version__ = "0.0.0" # for develop branch + +requests.adapters.DEFAULT_RETRIES = 3 + +__all__ = [ + 'cli_server_register', + 'get_server_command', + 'cli_client_register', + 'get_client_command', + 'download_and_decompress', + 'load_state_dict_from_url', + 'stats_wrapper', +] + + +def cli_server_register(name: str, description: str='') -> Any: + def _warpper(command): + items = name.split('.') + + com = server_commands + for item in items: + com = com[item] + com['_entry'] = command + if description: + com['_description'] = description + return command + + return _warpper + + +def get_server_command(name: str) -> Any: + items = name.split('.') + com = server_commands + for item in items: + com = com[item] + + return com['_entry'] + + +def cli_client_register(name: str, description: str='') -> Any: + def _warpper(command): + items = name.split('.') + + com = client_commands + for item in items: + com = com[item] + com['_entry'] = command + if description: + com['_description'] = description + return command + + return _warpper + + +def get_client_command(name: str) -> Any: + items = name.split('.') + com = client_commands + for item in items: + com = com[item] + + return com['_entry'] + + +def _get_uncompress_path(filepath: os.PathLike) -> os.PathLike: + file_dir = os.path.dirname(filepath) + is_zip_file = False + if tarfile.is_tarfile(filepath): + files = tarfile.open(filepath, "r:*") + file_list = files.getnames() + elif zipfile.is_zipfile(filepath): + files = zipfile.ZipFile(filepath, 'r') + file_list = files.namelist() + is_zip_file = True + else: + return file_dir + + if download._is_a_single_file(file_list): + rootpath = file_list[0] + uncompressed_path = os.path.join(file_dir, rootpath) + elif download._is_a_single_dir(file_list): + if is_zip_file: + rootpath = os.path.splitext(file_list[0])[0].split(os.sep)[0] + else: + rootpath = os.path.splitext(file_list[0])[0].split(os.sep)[-1] + uncompressed_path = os.path.join(file_dir, rootpath) + else: + rootpath = os.path.splitext(filepath)[0].split(os.sep)[-1] + uncompressed_path = os.path.join(file_dir, rootpath) + + files.close() + return uncompressed_path + + +def download_and_decompress(archive: Dict[str, str], path: str) -> os.PathLike: + """ + Download archieves and decompress to specific path. 
+ """ + if not os.path.isdir(path): + os.makedirs(path) + + assert 'url' in archive and 'md5' in archive, \ + 'Dictionary keys of "url" and "md5" are required in the archive, but got: {}'.format(list(archive.keys())) + + filepath = os.path.join(path, os.path.basename(archive['url'])) + if os.path.isfile(filepath) and download._md5check(filepath, + archive['md5']): + uncompress_path = _get_uncompress_path(filepath) + if not os.path.isdir(uncompress_path): + download._decompress(filepath) + else: + StatsWorker( + task='download', + version=__version__, + extra_info={ + 'download_url': archive['url'], + 'paddle_version': paddle.__version__ + }).start() + uncompress_path = download.get_path_from_url(archive['url'], path, + archive['md5']) + + return uncompress_path + + +def load_state_dict_from_url(url: str, path: str, md5: str=None) -> os.PathLike: + """ + Download and load a state dict from url + """ + if not os.path.isdir(path): + os.makedirs(path) + + download.get_path_from_url(url, path, md5) + return load(os.path.join(path, os.path.basename(url))) + + +def _get_user_home(): + return os.path.expanduser('~') + + +def _get_paddlespcceh_home(): + if 'PPSPEECH_HOME' in os.environ: + home_path = os.environ['PPSPEECH_HOME'] + if os.path.exists(home_path): + if os.path.isdir(home_path): + return home_path + else: + raise RuntimeError( + 'The environment variable PPSPEECH_HOME {} is not a directory.'. + format(home_path)) + else: + return home_path + return os.path.join(_get_user_home(), '.paddlespeech') + + +def _get_sub_home(directory): + home = os.path.join(_get_paddlespcceh_home(), directory) + if not os.path.exists(home): + os.makedirs(home) + return home + + +PPSPEECH_HOME = _get_paddlespcceh_home() +MODEL_HOME = _get_sub_home('models') +CONF_HOME = _get_sub_home('conf') + + +def _md5(text: str): + '''Calculate the md5 value of the input text.''' + md5code = hashlib.md5(text.encode()) + return md5code.hexdigest() + + +class ConfigCache: + def __init__(self): + self._data = {} + self._initialize() + self.file = os.path.join(CONF_HOME, 'cache.yaml') + if not os.path.exists(self.file): + self.flush() + return + + with open(self.file, 'r') as file: + try: + cfg = yaml.load(file, Loader=yaml.FullLoader) + self._data.update(cfg) + except BaseException: + self.flush() + + @property + def cache_info(self): + return self._data['cache_info'] + + def _initialize(self): + # Set default configuration values. 
+ cache_info = _md5(str(uuid.uuid1())[-12:]) + "-" + str(int(time.time())) + self._data['cache_info'] = cache_info + + def flush(self): + '''Flush the current configuration into the configuration file.''' + with open(self.file, 'w') as file: + cfg = json.loads(json.dumps(self._data)) + yaml.dump(cfg, file) + + +stats_api = "http://paddlepaddle.org.cn/paddlehub/stat" +cache_info = ConfigCache().cache_info + + +class StatsWorker(threading.Thread): + def __init__(self, + task="asr", + model=None, + version=__version__, + extra_info={}): + threading.Thread.__init__(self) + self._task = task + self._model = model + self._version = version + self._extra_info = extra_info + + def run(self): + params = { + 'task': self._task, + 'version': self._version, + 'from': 'ppspeech' + } + if self._model: + params['model'] = self._model + + self._extra_info.update({ + 'cache_info': cache_info, + }) + params.update({"extra": json.dumps(self._extra_info)}) + + try: + requests.get(stats_api, params) + except Exception: + pass + + return + + +def _note_one_stat(cls_name, params={}): + task = cls_name.replace('Executor', '').lower() # XXExecutor + extra_info = { + 'paddle_version': paddle.__version__, + } + + if 'model' in params: + model = params['model'] + else: + model = None + + if 'audio_file' in params: + try: + _, sr = paddleaudio.load(params['audio_file']) + except Exception: + sr = -1 + + if task == 'asr': + extra_info.update({ + 'lang': params['lang'], + 'inp_sr': sr, + 'model_sr': params['sample_rate'], + }) + elif task == 'st': + extra_info.update({ + 'lang': + params['src_lang'] + '-' + params['tgt_lang'], + 'inp_sr': + sr, + 'model_sr': + params['sample_rate'], + }) + elif task == 'tts': + model = params['am'] + extra_info.update({ + 'lang': params['lang'], + 'vocoder': params['voc'], + }) + elif task == 'cls': + extra_info.update({ + 'inp_sr': sr, + }) + elif task == 'text': + extra_info.update({ + 'sub_task': params['task'], + 'lang': params['lang'], + }) + else: + return + + StatsWorker( + task=task, + model=model, + version=__version__, + extra_info=extra_info, ).start() + + +def _parse_args(func, *args, **kwargs): + # FullArgSpec(args, varargs, varkw, defaults, kwonlyargs, kwonlydefaults, annotations) + argspec = inspect.getfullargspec(func) + + keys = argspec[0] + if keys[0] == 'self': # Remove self pointer. + keys = keys[1:] + + default_values = argspec[3] + values = [None] * (len(keys) - len(default_values)) + values.extend(list(default_values)) + params = dict(zip(keys, values)) + + for idx, v in enumerate(args): + params[keys[idx]] = v + for k, v in kwargs.items(): + params[k] = v + + return params + + +def stats_wrapper(executor_func): + def _warpper(self, *args, **kwargs): + try: + _note_one_stat( + type(self).__name__, _parse_args(executor_func, *args, + **kwargs)) + except Exception: + pass + return executor_func(self, *args, **kwargs) + + return _warpper diff --git a/ernie-sat/paddlespeech/server/utils/__init__.py b/ernie-sat/paddlespeech/server/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..97043fd7ba6885aac81cad5a49924c23c67d4d47 --- /dev/null +++ b/ernie-sat/paddlespeech/server/utils/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
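
The `stats_wrapper` decorator above is meant to sit on an executor's Python entry point; here is a hedged sketch with a made-up executor (the class name and its arguments are illustrative, not the real CLI executors):

```python
from paddlespeech.server.util import stats_wrapper


class ASRExecutor:  # hypothetical executor, named so the task is inferred as 'asr'
    @stats_wrapper
    def __call__(self,
                 audio_file: str,
                 lang: str='zh',
                 sample_rate: int=16000,
                 model: str=None):
        # real inference would happen here
        return "transcription"


# _note_one_stat() derives task='asr' from the class name, records lang and
# sample rates, and swallows any reporting failure, so the call itself is unaffected.
print(ASRExecutor()("./16_audio.wav"))
```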
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/ernie-sat/paddlespeech/server/utils/audio_process.py b/ernie-sat/paddlespeech/server/utils/audio_process.py new file mode 100644 index 0000000000000000000000000000000000000000..3cbb495a67ffcb54444fd44173571eccb02addef --- /dev/null +++ b/ernie-sat/paddlespeech/server/utils/audio_process.py @@ -0,0 +1,105 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import wave + +import numpy as np + +from paddlespeech.cli.log import logger + + +def wav2pcm(wavfile, pcmfile, data_type=np.int16): + """ Save the wav file as a pcm file + + Args: + wavfile (str): wav file path + pcmfile (str): pcm file save path + data_type (type, optional): pcm sample type. Defaults to np.int16. + """ + with open(wavfile, "rb") as f: + f.seek(0) + f.read(44) + data = np.fromfile(f, dtype=data_type) + data.tofile(pcmfile) + + +def pcm2wav(pcm_file, wav_file, channels=1, bits=16, sample_rate=16000): + """Save the pcm file as a wav file + + Args: + pcm_file (str): pcm file path + wav_file (str): wav file save path + channels (int, optional): audio channel. Defaults to 1. + bits (int, optional): Bit depth. Defaults to 16. + sample_rate (int, optional): sample rate. Defaults to 16000. + """ + pcmf = open(pcm_file, 'rb') + pcmdata = pcmf.read() + pcmf.close() + + if bits % 8 != 0: + logger.error("bits % 8 must == 0. now bits:" + str(bits)) + + wavfile = wave.open(wav_file, 'wb') + wavfile.setnchannels(channels) + wavfile.setsampwidth(bits // 8) + wavfile.setframerate(sample_rate) + wavfile.writeframes(pcmdata) + wavfile.close() + + +def change_speed(sample_raw, speed_rate, sample_rate): + """Change the audio speed by linear interpolation. + Note that this is an in-place transformation. + :param speed_rate: Rate of speed change: + speed_rate > 1.0, speed up the audio; + speed_rate = 1.0, unchanged; + speed_rate < 1.0, slow down the audio; + speed_rate <= 0.0, not allowed, raise ValueError. + :type speed_rate: float + :raises ValueError: If speed_rate <= 0.0. 
+ """ + if speed_rate == 1.0: + return sample_raw + if speed_rate <= 0: + raise ValueError("speed_rate should be greater than zero.") + + # numpy + # old_length = self._samples.shape[0] + # new_length = int(old_length / speed_rate) + # old_indices = np.arange(old_length) + # new_indices = np.linspace(start=0, stop=old_length, num=new_length) + # self._samples = np.interp(new_indices, old_indices, self._samples) + + # sox, slow + try: + import soxbindings as sox + except ImportError: + try: + from paddlespeech.s2t.utils import dynamic_pip_install + package = "sox" + dynamic_pip_install.install(package) + package = "soxbindings" + dynamic_pip_install.install(package) + import soxbindings as sox + except Exception: + raise RuntimeError("Can not install soxbindings on your system.") + + tfm = sox.Transformer() + tfm.set_globals(multithread=False) + tfm.tempo(speed_rate) + sample_speed = tfm.build_array( + input_array=sample_raw, + sample_rate_in=sample_rate).squeeze(-1).astype(np.float32).copy() + + return sample_speed diff --git a/ernie-sat/paddlespeech/server/utils/buffer.py b/ernie-sat/paddlespeech/server/utils/buffer.py new file mode 100644 index 0000000000000000000000000000000000000000..682357b34f542fe62d9819d225b3e5bdde3a30be --- /dev/null +++ b/ernie-sat/paddlespeech/server/utils/buffer.py @@ -0,0 +1,59 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +class Frame(object): + """Represents a "frame" of audio data.""" + + def __init__(self, bytes, timestamp, duration): + self.bytes = bytes + self.timestamp = timestamp + self.duration = duration + + +class ChunkBuffer(object): + def __init__(self, + frame_duration_ms=80, + shift_ms=40, + sample_rate=16000, + sample_width=2): + self.sample_rate = sample_rate + self.frame_duration_ms = frame_duration_ms + self.shift_ms = shift_ms + self.remained_audio = b'' + self.sample_width = sample_width # int16 = 2; float32 = 4 + + def frame_generator(self, audio): + """Generates audio frames from PCM audio data. + Takes the desired frame duration in milliseconds, the PCM data, and + the sample rate. + Yields Frames of the requested duration. 
+ """ + audio = self.remained_audio + audio + self.remained_audio = b'' + + n = int(self.sample_rate * (self.frame_duration_ms / 1000.0) * + self.sample_width) + shift_n = int(self.sample_rate * (self.shift_ms / 1000.0) * + self.sample_width) + offset = 0 + timestamp = 0.0 + duration = (float(n) / self.sample_rate) / self.sample_width + shift_duration = (float(shift_n) / self.sample_rate) / self.sample_width + while offset + n <= len(audio): + yield Frame(audio[offset:offset + n], timestamp, duration) + timestamp += shift_duration + offset += shift_n + + self.remained_audio += audio[offset:] diff --git a/ernie-sat/paddlespeech/server/utils/config.py b/ernie-sat/paddlespeech/server/utils/config.py new file mode 100644 index 0000000000000000000000000000000000000000..8c75f536f5de654f1a09fa82187cfef4ef442e90 --- /dev/null +++ b/ernie-sat/paddlespeech/server/utils/config.py @@ -0,0 +1,30 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import yaml +from yacs.config import CfgNode + + +def get_config(config_file: str): + """[summary] + + Args: + config_file (str): config_file + + Returns: + CfgNode: + """ + with open(config_file, 'rt') as f: + config = CfgNode(yaml.safe_load(f)) + + return config diff --git a/ernie-sat/paddlespeech/server/utils/errors.py b/ernie-sat/paddlespeech/server/utils/errors.py new file mode 100644 index 0000000000000000000000000000000000000000..17ff75512cd447648ecedf9238809a42743b708c --- /dev/null +++ b/ernie-sat/paddlespeech/server/utils/errors.py @@ -0,0 +1,57 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import json +from enum import IntEnum + +from fastapi import Response + + +class ErrorCode(IntEnum): + SERVER_OK = 200 # success. + + SERVER_PARAM_ERR = 400 # Input parameters are not valid. + SERVER_TASK_NOT_EXIST = 404 # Task is not exist. + + SERVER_INTERNAL_ERR = 500 # Internal error. + SERVER_NETWORK_ERR = 502 # Network exception. + SERVER_UNKOWN_ERR = 509 # Unknown error occurred. + + +ErrorMsg = { + ErrorCode.SERVER_OK: "success.", + ErrorCode.SERVER_PARAM_ERR: "Input parameters are not valid.", + ErrorCode.SERVER_TASK_NOT_EXIST: "Task is not exist.", + ErrorCode.SERVER_INTERNAL_ERR: "Internal error.", + ErrorCode.SERVER_NETWORK_ERR: "Network exception.", + ErrorCode.SERVER_UNKOWN_ERR: "Unknown error occurred." 
+} + + +def failed_response(code, msg=""): + """Interface call failure response + + Args: + code (int): error code number + msg (str, optional): Interface call failure information. Defaults to "". + + Returns: + Response (json): failure json information. + """ + + if not msg: + msg = ErrorMsg.get(code, "Unknown error occurred.") + + res = {"success": False, "code": int(code), "message": {"description": msg}} + + return Response(content=json.dumps(res), media_type="application/json") diff --git a/ernie-sat/paddlespeech/server/utils/exception.py b/ernie-sat/paddlespeech/server/utils/exception.py new file mode 100644 index 0000000000000000000000000000000000000000..58ea777ca520c489c7c88090f6c758cad3bac1df --- /dev/null +++ b/ernie-sat/paddlespeech/server/utils/exception.py @@ -0,0 +1,30 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import traceback + +from paddlespeech.server.utils.errors import ErrorMsg + + +class ServerBaseException(Exception): + """ Server Base exception + """ + + def __init__(self, error_code, msg=None): + #if msg: + #log.error(msg) + msg = msg if msg else ErrorMsg.get(error_code, "") + super(ServerBaseException, self).__init__(error_code, msg) + self.error_code = error_code + self.msg = msg + traceback.print_exc() diff --git a/ernie-sat/paddlespeech/server/utils/log.py b/ernie-sat/paddlespeech/server/utils/log.py new file mode 100644 index 0000000000000000000000000000000000000000..8644064c73ef407476e7870e65d1149019762723 --- /dev/null +++ b/ernie-sat/paddlespeech/server/utils/log.py @@ -0,0 +1,59 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
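
The REST handlers above all follow the same error convention; a minimal sketch of how `ServerBaseException` and `failed_response` fit together outside a route (the engine step is invented for illustration):

```python
from paddlespeech.server.utils.errors import ErrorCode
from paddlespeech.server.utils.errors import failed_response
from paddlespeech.server.utils.exception import ServerBaseException


def run_engine_step():
    # An engine signals a handled failure with an error code plus optional detail.
    raise ServerBaseException(ErrorCode.SERVER_INTERNAL_ERR, "tts engine not initialized")


try:
    run_engine_step()
except ServerBaseException as e:
    # failed_response() packs {"success": False, "code": ..., "message": {...}}
    # into a fastapi Response with a JSON media type.
    resp = failed_response(e.error_code, e.msg)
except BaseException:
    resp = failed_response(ErrorCode.SERVER_UNKOWN_ERR)
```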
+import functools
+import logging
+
+__all__ = [
+    'logger',
+]
+
+
+class Logger(object):
+    def __init__(self, name: str=None):
+        name = 'PaddleSpeech' if not name else name
+        self.logger = logging.getLogger(name)
+
+        log_config = {
+            'DEBUG': 10,
+            'INFO': 20,
+            'TRAIN': 21,
+            'EVAL': 22,
+            'WARNING': 30,
+            'ERROR': 40,
+            'CRITICAL': 50,
+            'EXCEPTION': 100,
+        }
+        for key, level in log_config.items():
+            logging.addLevelName(level, key)
+            if key == 'EXCEPTION':
+                self.__dict__[key.lower()] = self.logger.exception
+            else:
+                self.__dict__[key.lower()] = functools.partial(self.__call__,
+                                                               level)
+
+        self.format = logging.Formatter(
+            fmt='[%(asctime)-15s] [%(levelname)8s] - %(message)s')
+
+        self.handler = logging.StreamHandler()
+        self.handler.setFormatter(self.format)
+
+        self.logger.addHandler(self.handler)
+        self.logger.setLevel(logging.DEBUG)
+        self.logger.propagate = False
+
+    def __call__(self, log_level: str, msg: str):
+        self.logger.log(log_level, msg)
+
+
+logger = Logger()
diff --git a/ernie-sat/paddlespeech/server/utils/paddle_predictor.py b/ernie-sat/paddlespeech/server/utils/paddle_predictor.py
new file mode 100644
index 0000000000000000000000000000000000000000..16653cf372e696cea432fb0f1562324287937043
--- /dev/null
+++ b/ernie-sat/paddlespeech/server/utils/paddle_predictor.py
@@ -0,0 +1,98 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+from typing import List
+from typing import Optional
+
+import paddle
+from paddle.inference import Config
+from paddle.inference import create_predictor
+
+
+def init_predictor(model_dir: Optional[os.PathLike]=None,
+                   model_file: Optional[os.PathLike]=None,
+                   params_file: Optional[os.PathLike]=None,
+                   predictor_conf: dict=None):
+    """Create predictor with Paddle inference
+
+    Args:
+        model_dir (Optional[os.PathLike], optional): The path of the static model saved in the model layer. Defaults to None.
+        model_file (Optional[os.PathLike], optional): *.pdmodel file path. Defaults to None.
+        params_file (Optional[os.PathLike], optional): *.pdiparams file path. Defaults to None.
+        predictor_conf (dict, optional): The configuration parameters of predictor. Defaults to None.
+
+    Returns:
+        predictor (PaddleInferPredictor): created predictor
+    """
+    if model_dir is not None:
+        assert os.path.isdir(model_dir), 'Please check model dir.'
+        config = Config(model_dir)
+    else:
+        assert os.path.isfile(model_file) and os.path.isfile(
+            params_file), 'Please check model and parameter files.'
+ config = Config(model_file, params_file) + + # set device + if predictor_conf["device"]: + device = predictor_conf["device"] + else: + device = paddle.get_device() + if "gpu" in device: + gpu_id = device.split(":")[-1] + config.enable_use_gpu(1000, int(gpu_id)) + + # IR optim + if predictor_conf["switch_ir_optim"]: + config.switch_ir_optim() + + # glog + if not predictor_conf["glog_info"]: + config.disable_glog_info() + + # config summary + if predictor_conf["summary"]: + print(config.summary()) + + # memory optim + config.enable_memory_optim() + + predictor = create_predictor(config) + return predictor + + +def run_model(predictor, input: List) -> List: + """ run predictor + + Args: + predictor: paddle inference predictor + input (list): The input of predictor + + Returns: + list: result list + """ + input_names = predictor.get_input_names() + for i, name in enumerate(input_names): + input_handle = predictor.get_input_handle(name) + input_handle.copy_from_cpu(input[i]) + # do the inference + predictor.run() + results = [] + # get out data from output tensor + output_names = predictor.get_output_names() + for i, name in enumerate(output_names): + output_handle = predictor.get_output_handle(name) + output_data = output_handle.copy_to_cpu() + results.append(output_data) + + return results diff --git a/ernie-sat/paddlespeech/server/utils/util.py b/ernie-sat/paddlespeech/server/utils/util.py new file mode 100644 index 0000000000000000000000000000000000000000..e9104fa2d56283c48304d4676fae19e8dccd1ba5 --- /dev/null +++ b/ernie-sat/paddlespeech/server/utils/util.py @@ -0,0 +1,33 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the +import base64 + + +def wav2base64(wav_file: str): + """ + read wave file and covert to base64 string + """ + with open(wav_file, 'rb') as f: + base64_bytes = base64.b64encode(f.read()) + base64_string = base64_bytes.decode('utf-8') + return base64_string + + +def base64towav(base64_string: str): + pass + + +def self_check(): + """ self check resource + """ + return True diff --git a/ernie-sat/paddlespeech/server/utils/vad.py b/ernie-sat/paddlespeech/server/utils/vad.py new file mode 100644 index 0000000000000000000000000000000000000000..a2dcf68b80f2985a41ffb44d9501b973730b4ba2 --- /dev/null +++ b/ernie-sat/paddlespeech/server/utils/vad.py @@ -0,0 +1,78 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
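
A sketch of how these two helpers are meant to be used together; the file paths, the input shape, and the `predictor_conf` values below are placeholders, chosen to mirror the config keys the function reads above:

```python
import numpy as np

from paddlespeech.server.utils.paddle_predictor import init_predictor
from paddlespeech.server.utils.paddle_predictor import run_model

# Placeholder paths; in the server these come from the engine's config.
predictor = init_predictor(
    model_file="./inference/model.pdmodel",
    params_file="./inference/model.pdiparams",
    predictor_conf={
        "device": "gpu:0",        # falls back to paddle.get_device() if empty
        "switch_ir_optim": True,
        "glog_info": False,
        "summary": False,
    })

# One numpy array per model input, in the order reported by get_input_names().
dummy_input = np.zeros((1, 80, 100), dtype=np.float32)  # shape is illustrative only
outputs = run_model(predictor, [dummy_input])
print([o.shape for o in outputs])
```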
+import collections + +import webrtcvad + + +class VADAudio(): + def __init__(self, + aggressiveness=2, + rate=16000, + frame_duration_ms=20, + sample_width=2, + padding_ms=200, + padding_ratio=0.9): + """Initializes VAD with given aggressivenes and sets up internal queues""" + self.vad = webrtcvad.Vad(aggressiveness) + self.rate = rate + self.sample_width = sample_width + self.frame_duration_ms = frame_duration_ms + self._frame_length = int(rate * (frame_duration_ms / 1000.0) * + self.sample_width) + self._buffer_queue = collections.deque() + self.ring_buffer = collections.deque(maxlen=padding_ms // + frame_duration_ms) + self._ratio = padding_ratio + self.triggered = False + + def add_audio(self, audio): + """Adds new audio to internal queue""" + for x in audio: + self._buffer_queue.append(x) + + def frame_generator(self): + """Generator that yields audio frames of frame_duration_ms""" + while len(self._buffer_queue) > self._frame_length: + frame = bytearray() + for _ in range(self._frame_length): + frame.append(self._buffer_queue.popleft()) + yield bytes(frame) + + def vad_collector(self): + """Generator that yields series of consecutive audio frames comprising each utterence, separated by yielding a single None. + Determines voice activity by ratio of frames in padding_ms. Uses a buffer to include padding_ms prior to being triggered. + Example: (frame, ..., frame, None, frame, ..., frame, None, ...) + |---utterence---| |---utterence---| + """ + for frame in self.frame_generator(): + is_speech = self.vad.is_speech(frame, self.rate) + if not self.triggered: + self.ring_buffer.append((frame, is_speech)) + num_voiced = len( + [f for f, speech in self.ring_buffer if speech]) + if num_voiced > self._ratio * self.ring_buffer.maxlen: + self.triggered = True + for f, s in self.ring_buffer: + yield f + self.ring_buffer.clear() + else: + yield frame + self.ring_buffer.append((frame, is_speech)) + num_unvoiced = len( + [f for f, speech in self.ring_buffer if not speech]) + if num_unvoiced > self._ratio * self.ring_buffer.maxlen: + self.triggered = False + yield None + self.ring_buffer.clear() diff --git a/ernie-sat/paddlespeech/server/ws/__init__.py b/ernie-sat/paddlespeech/server/ws/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..97043fd7ba6885aac81cad5a49924c23c67d4d47 --- /dev/null +++ b/ernie-sat/paddlespeech/server/ws/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/ernie-sat/paddlespeech/server/ws/api.py b/ernie-sat/paddlespeech/server/ws/api.py new file mode 100644 index 0000000000000000000000000000000000000000..10664d11470ba4c98816b1c3a1fa30d40fe67a02 --- /dev/null +++ b/ernie-sat/paddlespeech/server/ws/api.py @@ -0,0 +1,38 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
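
A short usage sketch for `VADAudio`; the byte string stands in for real 16 kHz, 16-bit mono PCM, and with pure silence the collector simply yields nothing:

```python
from paddlespeech.server.utils.vad import VADAudio

vad = VADAudio(aggressiveness=2, rate=16000, frame_duration_ms=20)

# Feed raw PCM bytes (int16, mono, 16 kHz); 640 bytes correspond to one 20 ms frame.
pcm_bytes = b"\x00\x00" * 16000  # one second of silence, for illustration
vad.add_audio(pcm_bytes)

segments = []
for frame in vad.vad_collector():
    if frame is None:
        print("utterance boundary")
    else:
        segments.append(frame)

print(len(segments), "voiced frames")  # 0 for pure silence
```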
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import List + +from fastapi import APIRouter + +from paddlespeech.server.ws.asr_socket import router as asr_router + +_router = APIRouter() + + +def setup_router(api_list: List): + """setup router for fastapi + Args: + api_list (List): [asr, tts] + Returns: + APIRouter + """ + for api_name in api_list: + if api_name == 'asr': + _router.include_router(asr_router) + elif api_name == 'tts': + pass + else: + pass + + return _router diff --git a/ernie-sat/paddlespeech/server/ws/asr_socket.py b/ernie-sat/paddlespeech/server/ws/asr_socket.py new file mode 100644 index 0000000000000000000000000000000000000000..ea19816b69ff4719220784099205a6d8a5bae4ed --- /dev/null +++ b/ernie-sat/paddlespeech/server/ws/asr_socket.py @@ -0,0 +1,100 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
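
As a usage illustration only (not part of the patch), the `setup_router` helper above could be mounted on a FastAPI app roughly as follows; the module name and the uvicorn invocation are assumptions.

```python
from fastapi import FastAPI

from paddlespeech.server.ws.api import setup_router

app = FastAPI()
# Only the 'asr' entry currently wires up a websocket route; 'tts' is accepted but is a no-op.
app.include_router(setup_router(["asr"]))

# Launched with e.g.:  uvicorn main:app --host 0.0.0.0 --port 8090
```
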
+import json + +import numpy as np +from fastapi import APIRouter +from fastapi import WebSocket +from fastapi import WebSocketDisconnect +from starlette.websockets import WebSocketState as WebSocketState + +from paddlespeech.server.engine.engine_pool import get_engine_pool +from paddlespeech.server.utils.buffer import ChunkBuffer +from paddlespeech.server.utils.vad import VADAudio + +router = APIRouter() + + +@router.websocket('/ws/asr') +async def websocket_endpoint(websocket: WebSocket): + + await websocket.accept() + + engine_pool = get_engine_pool() + asr_engine = engine_pool['asr'] + # init buffer + chunk_buffer_conf = asr_engine.config.chunk_buffer_conf + chunk_buffer = ChunkBuffer( + sample_rate=chunk_buffer_conf['sample_rate'], + sample_width=chunk_buffer_conf['sample_width']) + # init vad + vad_conf = asr_engine.config.vad_conf + vad = VADAudio( + aggressiveness=vad_conf['aggressiveness'], + rate=vad_conf['sample_rate'], + frame_duration_ms=vad_conf['frame_duration_ms']) + + try: + while True: + # careful here, changed the source code from starlette.websockets + assert websocket.application_state == WebSocketState.CONNECTED + message = await websocket.receive() + websocket._raise_on_disconnect(message) + if "text" in message: + message = json.loads(message["text"]) + if 'signal' not in message: + resp = {"status": "ok", "message": "no valid json data"} + await websocket.send_json(resp) + + if message['signal'] == 'start': + resp = {"status": "ok", "signal": "server_ready"} + # do something at begining here + await websocket.send_json(resp) + elif message['signal'] == 'end': + engine_pool = get_engine_pool() + asr_engine = engine_pool['asr'] + # reset single engine for an new connection + asr_engine.reset() + resp = {"status": "ok", "signal": "finished"} + await websocket.send_json(resp) + break + else: + resp = {"status": "ok", "message": "no valid json data"} + await websocket.send_json(resp) + elif "bytes" in message: + message = message["bytes"] + + # vad for input bytes audio + vad.add_audio(message) + message = b''.join(f for f in vad.vad_collector() + if f is not None) + + engine_pool = get_engine_pool() + asr_engine = engine_pool['asr'] + asr_results = "" + frames = chunk_buffer.frame_generator(message) + for frame in frames: + samples = np.frombuffer(frame.bytes, dtype=np.int16) + sample_rate = asr_engine.config.sample_rate + x_chunk, x_chunk_lens = asr_engine.preprocess(samples, + sample_rate) + asr_engine.run(x_chunk, x_chunk_lens) + asr_results = asr_engine.postprocess() + + asr_results = asr_engine.postprocess() + resp = {'asr_results': asr_results} + + await websocket.send_json(resp) + except WebSocketDisconnect: + pass diff --git a/ernie-sat/paddlespeech/t2s/__init__.py b/ernie-sat/paddlespeech/t2s/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..7d93c026ecedda485d52b84c349e8fc1806daaf5 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/__init__.py @@ -0,0 +1,22 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +import logging + +from . import datasets +from . import exps +from . import frontend +from . import models +from . import modules +from . import training +from . import utils diff --git a/ernie-sat/paddlespeech/t2s/audio/__init__.py b/ernie-sat/paddlespeech/t2s/audio/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..0deefc8bc368fa5e4f6236569d539082fb4c37cb --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/audio/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from .audio import AudioProcessor +from .codec import * +from .spec_normalizer import LogMagnitude +from .spec_normalizer import NormalizerBase diff --git a/ernie-sat/paddlespeech/t2s/audio/audio.py b/ernie-sat/paddlespeech/t2s/audio/audio.py new file mode 100644 index 0000000000000000000000000000000000000000..59ea8c877771fc7fed607d8c616ae4aa935038f8 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/audio/audio.py @@ -0,0 +1,102 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
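
A hedged sketch of the client side of the `/ws/asr` protocol implemented above: a JSON `start` signal, raw PCM bytes, then a JSON `end` signal. The `websockets` client library, the server address and port, and the 16 kHz mono PCM format are assumptions for illustration (the actual rate comes from the server's VAD config).

```python
import asyncio
import json

import websockets  # third-party client library, used here only for illustration


async def transcribe(pcm_bytes: bytes, uri: str = "ws://127.0.0.1:8090/ws/asr"):
    async with websockets.connect(uri) as ws:
        await ws.send(json.dumps({"signal": "start"}))
        print(await ws.recv())     # expect {"status": "ok", "signal": "server_ready"}
        await ws.send(pcm_bytes)   # raw PCM audio bytes (assumed 16 kHz, 16-bit mono)
        print(await ws.recv())     # partial {"asr_results": ...}
        await ws.send(json.dumps({"signal": "end"}))
        print(await ws.recv())     # expect {"status": "ok", "signal": "finished"}

# asyncio.run(transcribe(open("sample.pcm", "rb").read()))
```
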
+import librosa +import numpy as np +import soundfile as sf + +__all__ = ["AudioProcessor"] + + +class AudioProcessor(object): + def __init__(self, + sample_rate: int, + n_fft: int, + win_length: int, + hop_length: int, + n_mels: int=80, + fmin: int=0, + fmax: int=None, + window="hann", + center=True, + pad_mode="reflect", + normalize=True): + # read & write + self.sample_rate = sample_rate + self.normalize = normalize + + # stft + self.n_fft = n_fft + self.win_length = win_length + self.hop_length = hop_length + self.window = window + self.center = center + self.pad_mode = pad_mode + + # mel + self.n_mels = n_mels + self.fmin = fmin + self.fmax = fmax + + self.mel_filter = self._create_mel_filter() + self.inv_mel_filter = np.linalg.pinv(self.mel_filter) + + def _create_mel_filter(self): + mel_filter = librosa.filters.mel( + sr=self.sample_rate, + n_fft=self.n_fft, + n_mels=self.n_mels, + fmin=self.fmin, + fmax=self.fmax) + return mel_filter + + def read_wav(self, filename): + # resampling may occur + wav, _ = librosa.load(filename, sr=self.sample_rate) + + # normalize the volume + if self.normalize: + wav = wav / np.max(np.abs(wav)) * 0.999 + return wav + + def write_wav(self, path, wav): + sf.write(path, wav, samplerate=self.sample_rate) + + def stft(self, wav): + D = librosa.core.stft( + wav, + n_fft=self.n_fft, + hop_length=self.hop_length, + win_length=self.win_length, + window=self.window, + center=self.center, + pad_mode=self.pad_mode) + return D + + def istft(self, D): + wav = librosa.core.istft( + D, + hop_length=self.hop_length, + win_length=self.win_length, + window=self.window, + center=self.center) + return wav + + def spectrogram(self, wav): + D = self.stft(wav) + return np.abs(D) + + def mel_spectrogram(self, wav): + S = self.spectrogram(wav) + mel = np.dot(self.mel_filter, S) + return mel diff --git a/ernie-sat/paddlespeech/t2s/audio/codec.py b/ernie-sat/paddlespeech/t2s/audio/codec.py new file mode 100644 index 0000000000000000000000000000000000000000..2a759ce4ce231485204ac2b7b88f6d4943cc534a --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/audio/codec.py @@ -0,0 +1,51 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import math + +import numpy as np +import paddle + + +# x: [0: 2**bit-1], return: [-1, 1] +def label_2_float(x, bits): + return 2 * x / (2**bits - 1.) - 1. + + +#x: [-1, 1], return: [0, 2**bits-1] +def float_2_label(x, bits): + assert abs(x).max() <= 1.0 + x = (x + 1.) 
* (2**bits - 1) / 2 + return x.clip(0, 2**bits - 1) + + +# y: [-1, 1], mu: 2**bits, return: [0, 2**bits-1] +# see https://en.wikipedia.org/wiki/%CE%9C-law_algorithm +# be careful the input `mu` here, which is +1 than that of the link above +def encode_mu_law(x, mu): + mu = mu - 1 + fx = np.sign(x) * np.log(1 + mu * np.abs(x)) / np.log(1 + mu) + return np.floor((fx + 1) / 2 * mu + 0.5) + + +# from_labels = True: +# y: [0: 2**bit-1], mu: 2**bits, return: [-1,1] +# from_labels = False: +# y: [-1, 1], return: [-1, 1] +def decode_mu_law(y, mu, from_labels=True): + # TODO: get rid of log2 - makes no sense + if from_labels: + y = label_2_float(y, math.log2(mu)) + mu = mu - 1 + x = paddle.sign(y) / mu * ((1 + mu)**paddle.abs(y) - 1) + return x diff --git a/ernie-sat/paddlespeech/t2s/audio/spec_normalizer.py b/ernie-sat/paddlespeech/t2s/audio/spec_normalizer.py new file mode 100644 index 0000000000000000000000000000000000000000..d8cd67a23fbe23555ee1812769dc29fcac262391 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/audio/spec_normalizer.py @@ -0,0 +1,74 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This modules contains normalizers for spectrogram magnitude. +Normalizers are invertible transformations. They can be used to process +magnitude of spectrogram before training and can also be used to recover from +the generated spectrogram so as to be used with vocoders like griffin lim. + +The base class describe the interface. `transform` is used to perform +transformation and `inverse` is used to perform the inverse transformation. + +check issues: +https://github.com/mozilla/TTS/issues/377 +""" +import numpy as np + +__all__ = ["NormalizerBase", "LogMagnitude", "UnitMagnitude"] + + +class NormalizerBase(object): + def transform(self, spec): + raise NotImplementedError("transform must be implemented") + + def inverse(self, normalized): + raise NotImplementedError("inverse must be implemented") + + +class LogMagnitude(NormalizerBase): + """ + This is a simple normalizer used in Waveglow, Waveflow, tacotron2... 
+ """ + + def __init__(self, min=1e-5): + self.min = min + + def transform(self, x): + x = np.maximum(x, self.min) + x = np.log(x) + return x + + def inverse(self, x): + return np.exp(x) + + +class UnitMagnitude(NormalizerBase): + # dbscale and (0, 1) normalization + """ + This is the normalizer used in the + """ + + def __init__(self, min=1e-5): + self.min = min + + def transform(self, x): + db_scale = 20 * np.log10(np.maximum(self.min, x)) - 20 + normalized = (db_scale + 100) / 100 + clipped = np.clip(normalized, 0, 1) + return clipped + + def inverse(self, x): + denormalized = np.clip(x, 0, 1) * 100 - 100 + out = np.exp((denormalized + 20) / 20 * np.log(10)) + return out diff --git a/ernie-sat/paddlespeech/t2s/datasets/__init__.py b/ernie-sat/paddlespeech/t2s/datasets/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..caf20aac4e59220b00b049ec134c80af9283f016 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/datasets/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from .ljspeech import * diff --git a/ernie-sat/paddlespeech/t2s/datasets/am_batch_fn.py b/ernie-sat/paddlespeech/t2s/datasets/am_batch_fn.py new file mode 100644 index 0000000000000000000000000000000000000000..4e3ad3c125da181f397d57e9ca6e41e68eb9623b --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/datasets/am_batch_fn.py @@ -0,0 +1,295 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
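
To make the μ-law helpers above concrete, here is a small round-trip sketch (not part of the patch): samples in [-1, 1] are companded to integer labels and then expanded back, leaving only a small quantization error. The 9-bit setting mirrors a WaveRNN-style default but is otherwise an arbitrary choice.

```python
import numpy as np
import paddle

from paddlespeech.t2s.audio.codec import decode_mu_law, encode_mu_law

bits = 9
x = np.linspace(-1.0, 1.0, num=8, dtype=np.float32)          # fake samples in [-1, 1]
labels = encode_mu_law(x, mu=2**bits)                         # integer labels in [0, 2**bits - 1]
x_hat = decode_mu_law(paddle.to_tensor(labels), mu=2**bits)   # from_labels=True by default
print(np.abs(x - x_hat.numpy()).max())                        # small quantization error
```
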
+import numpy as np +import paddle + +from paddlespeech.t2s.datasets.batch import batch_sequences + + +def tacotron2_single_spk_batch_fn(examples): + # fields = ["text", "text_lengths", "speech", "speech_lengths"] + text = [np.array(item["text"], dtype=np.int64) for item in examples] + speech = [np.array(item["speech"], dtype=np.float32) for item in examples] + text_lengths = [ + np.array(item["text_lengths"], dtype=np.int64) for item in examples + ] + speech_lengths = [ + np.array(item["speech_lengths"], dtype=np.int64) for item in examples + ] + + text = batch_sequences(text) + speech = batch_sequences(speech) + + # convert each batch to paddle.Tensor + text = paddle.to_tensor(text) + speech = paddle.to_tensor(speech) + text_lengths = paddle.to_tensor(text_lengths) + speech_lengths = paddle.to_tensor(speech_lengths) + + batch = { + "text": text, + "text_lengths": text_lengths, + "speech": speech, + "speech_lengths": speech_lengths, + } + return batch + + +def tacotron2_multi_spk_batch_fn(examples): + # fields = ["text", "text_lengths", "speech", "speech_lengths"] + text = [np.array(item["text"], dtype=np.int64) for item in examples] + speech = [np.array(item["speech"], dtype=np.float32) for item in examples] + text_lengths = [ + np.array(item["text_lengths"], dtype=np.int64) for item in examples + ] + speech_lengths = [ + np.array(item["speech_lengths"], dtype=np.int64) for item in examples + ] + + text = batch_sequences(text) + speech = batch_sequences(speech) + + # convert each batch to paddle.Tensor + text = paddle.to_tensor(text) + speech = paddle.to_tensor(speech) + text_lengths = paddle.to_tensor(text_lengths) + speech_lengths = paddle.to_tensor(speech_lengths) + + batch = { + "text": text, + "text_lengths": text_lengths, + "speech": speech, + "speech_lengths": speech_lengths, + } + # spk_emb has a higher priority than spk_id + if "spk_emb" in examples[0]: + spk_emb = [ + np.array(item["spk_emb"], dtype=np.float32) for item in examples + ] + spk_emb = batch_sequences(spk_emb) + spk_emb = paddle.to_tensor(spk_emb) + batch["spk_emb"] = spk_emb + elif "spk_id" in examples[0]: + spk_id = [np.array(item["spk_id"], dtype=np.int64) for item in examples] + spk_id = paddle.to_tensor(spk_id) + batch["spk_id"] = spk_id + return batch + + +def speedyspeech_single_spk_batch_fn(examples): + # fields = ["phones", "tones", "num_phones", "num_frames", "feats", "durations"] + phones = [np.array(item["phones"], dtype=np.int64) for item in examples] + tones = [np.array(item["tones"], dtype=np.int64) for item in examples] + feats = [np.array(item["feats"], dtype=np.float32) for item in examples] + durations = [ + np.array(item["durations"], dtype=np.int64) for item in examples + ] + num_phones = [ + np.array(item["num_phones"], dtype=np.int64) for item in examples + ] + num_frames = [ + np.array(item["num_frames"], dtype=np.int64) for item in examples + ] + + phones = batch_sequences(phones) + tones = batch_sequences(tones) + feats = batch_sequences(feats) + durations = batch_sequences(durations) + + # convert each batch to paddle.Tensor + phones = paddle.to_tensor(phones) + tones = paddle.to_tensor(tones) + feats = paddle.to_tensor(feats) + durations = paddle.to_tensor(durations) + num_phones = paddle.to_tensor(num_phones) + num_frames = paddle.to_tensor(num_frames) + batch = { + "phones": phones, + "tones": tones, + "num_phones": num_phones, + "num_frames": num_frames, + "feats": feats, + "durations": durations, + } + return batch + + +def speedyspeech_multi_spk_batch_fn(examples): + # fields = 
["phones", "tones", "num_phones", "num_frames", "feats", "durations", "spk_id"] + phones = [np.array(item["phones"], dtype=np.int64) for item in examples] + tones = [np.array(item["tones"], dtype=np.int64) for item in examples] + feats = [np.array(item["feats"], dtype=np.float32) for item in examples] + durations = [ + np.array(item["durations"], dtype=np.int64) for item in examples + ] + num_phones = [ + np.array(item["num_phones"], dtype=np.int64) for item in examples + ] + num_frames = [ + np.array(item["num_frames"], dtype=np.int64) for item in examples + ] + + phones = batch_sequences(phones) + tones = batch_sequences(tones) + feats = batch_sequences(feats) + durations = batch_sequences(durations) + + # convert each batch to paddle.Tensor + phones = paddle.to_tensor(phones) + tones = paddle.to_tensor(tones) + feats = paddle.to_tensor(feats) + durations = paddle.to_tensor(durations) + num_phones = paddle.to_tensor(num_phones) + num_frames = paddle.to_tensor(num_frames) + batch = { + "phones": phones, + "tones": tones, + "num_phones": num_phones, + "num_frames": num_frames, + "feats": feats, + "durations": durations, + } + if "spk_id" in examples[0]: + spk_id = [np.array(item["spk_id"], dtype=np.int64) for item in examples] + spk_id = paddle.to_tensor(spk_id) + batch["spk_id"] = spk_id + return batch + + +def fastspeech2_single_spk_batch_fn(examples): + # fields = ["text", "text_lengths", "speech", "speech_lengths", "durations", "pitch", "energy"] + text = [np.array(item["text"], dtype=np.int64) for item in examples] + speech = [np.array(item["speech"], dtype=np.float32) for item in examples] + pitch = [np.array(item["pitch"], dtype=np.float32) for item in examples] + energy = [np.array(item["energy"], dtype=np.float32) for item in examples] + durations = [ + np.array(item["durations"], dtype=np.int64) for item in examples + ] + + text_lengths = [ + np.array(item["text_lengths"], dtype=np.int64) for item in examples + ] + speech_lengths = [ + np.array(item["speech_lengths"], dtype=np.int64) for item in examples + ] + + text = batch_sequences(text) + pitch = batch_sequences(pitch) + speech = batch_sequences(speech) + durations = batch_sequences(durations) + energy = batch_sequences(energy) + + # convert each batch to paddle.Tensor + text = paddle.to_tensor(text) + pitch = paddle.to_tensor(pitch) + speech = paddle.to_tensor(speech) + durations = paddle.to_tensor(durations) + energy = paddle.to_tensor(energy) + text_lengths = paddle.to_tensor(text_lengths) + speech_lengths = paddle.to_tensor(speech_lengths) + + batch = { + "text": text, + "text_lengths": text_lengths, + "durations": durations, + "speech": speech, + "speech_lengths": speech_lengths, + "pitch": pitch, + "energy": energy + } + return batch + + +def fastspeech2_multi_spk_batch_fn(examples): + # fields = ["text", "text_lengths", "speech", "speech_lengths", "durations", "pitch", "energy", "spk_id"/"spk_emb"] + text = [np.array(item["text"], dtype=np.int64) for item in examples] + speech = [np.array(item["speech"], dtype=np.float32) for item in examples] + pitch = [np.array(item["pitch"], dtype=np.float32) for item in examples] + energy = [np.array(item["energy"], dtype=np.float32) for item in examples] + durations = [ + np.array(item["durations"], dtype=np.int64) for item in examples + ] + text_lengths = [ + np.array(item["text_lengths"], dtype=np.int64) for item in examples + ] + speech_lengths = [ + np.array(item["speech_lengths"], dtype=np.int64) for item in examples + ] + + text = batch_sequences(text) + pitch = 
batch_sequences(pitch) + speech = batch_sequences(speech) + durations = batch_sequences(durations) + energy = batch_sequences(energy) + + # convert each batch to paddle.Tensor + text = paddle.to_tensor(text) + pitch = paddle.to_tensor(pitch) + speech = paddle.to_tensor(speech) + durations = paddle.to_tensor(durations) + energy = paddle.to_tensor(energy) + text_lengths = paddle.to_tensor(text_lengths) + speech_lengths = paddle.to_tensor(speech_lengths) + + batch = { + "text": text, + "text_lengths": text_lengths, + "durations": durations, + "speech": speech, + "speech_lengths": speech_lengths, + "pitch": pitch, + "energy": energy + } + # spk_emb has a higher priority than spk_id + if "spk_emb" in examples[0]: + spk_emb = [ + np.array(item["spk_emb"], dtype=np.float32) for item in examples + ] + spk_emb = batch_sequences(spk_emb) + spk_emb = paddle.to_tensor(spk_emb) + batch["spk_emb"] = spk_emb + elif "spk_id" in examples[0]: + spk_id = [np.array(item["spk_id"], dtype=np.int64) for item in examples] + spk_id = paddle.to_tensor(spk_id) + batch["spk_id"] = spk_id + return batch + + +def transformer_single_spk_batch_fn(examples): + # fields = ["text", "text_lengths", "speech", "speech_lengths"] + text = [np.array(item["text"], dtype=np.int64) for item in examples] + speech = [np.array(item["speech"], dtype=np.float32) for item in examples] + text_lengths = [ + np.array(item["text_lengths"], dtype=np.int64) for item in examples + ] + speech_lengths = [ + np.array(item["speech_lengths"], dtype=np.int64) for item in examples + ] + + text = batch_sequences(text) + speech = batch_sequences(speech) + + # convert each batch to paddle.Tensor + text = paddle.to_tensor(text) + speech = paddle.to_tensor(speech) + text_lengths = paddle.to_tensor(text_lengths) + speech_lengths = paddle.to_tensor(speech_lengths) + + batch = { + "text": text, + "text_lengths": text_lengths, + "speech": speech, + "speech_lengths": speech_lengths, + } + return batch diff --git a/ernie-sat/paddlespeech/t2s/datasets/batch.py b/ernie-sat/paddlespeech/t2s/datasets/batch.py new file mode 100644 index 0000000000000000000000000000000000000000..9d83bbe09c60d25d15ba0fcfd7197d257d4993cf --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/datasets/batch.py @@ -0,0 +1,188 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Utility functions to create batch for arrays which satisfy some conditions. +Batch functions for text sequences, audio and spectrograms are provided. 
+""" +import numpy as np + +__all__ = [ + "batch_text_id", + "batch_wav", + "batch_spec", + "TextIDBatcher", + "WavBatcher", + "SpecBatcher", +] + + +class TextIDBatcher(object): + """A wrapper class for `batch_text_id`.""" + + def __init__(self, pad_id=0, dtype=np.int64): + self.pad_id = pad_id + self.dtype = dtype + + def __call__(self, minibatch): + out = batch_text_id(minibatch, pad_id=self.pad_id, dtype=self.dtype) + return out + + +def batch_text_id(minibatch, pad_id=0, dtype=np.int64): + """Pad sequences to text_ids to the largest length and batch them. + + Args: + minibatch (List[np.ndarray]): list of rank-1 arrays, shape(T,), dtype np.int64, text_ids. + pad_id (int, optional): the id which correspond to the special pad token. Defaults to 0. + dtype (np.dtype, optional): the data dtype of the output. Defaults to np.int64. + + Returns: + np.ndarray: rank-2 array of text_ids, shape(B, T), B stands for batch_size, T stands for length. The output batch. + """ + peek_example = minibatch[0] + assert len(peek_example.shape) == 1, "text example is an 1D tensor" + # assume (channel, n_samples) or (n_samples, ) + lengths = [example.shape[0] for example in minibatch] + max_len = np.max(lengths) + + batch = [] + for example in minibatch: + pad_len = max_len - example.shape[0] + batch.append( + np.pad( + example, [(0, pad_len)], + mode='constant', + constant_values=pad_id)) + + return np.array(batch, dtype=dtype), np.array(lengths, dtype=np.int64) + + +class WavBatcher(object): + """A wrapper class for `batch_wav`.""" + + def __init__(self, pad_value=0., dtype=np.float32): + self.pad_value = pad_value + self.dtype = dtype + + def __call__(self, minibatch): + out = batch_wav(minibatch, pad_value=self.pad_value, dtype=self.dtype) + return out + + +def batch_wav(minibatch, pad_value=0., dtype=np.float32): + """pad audios to the largest length and batch them. + + Args: + minibatch (List[np.ndarray]): list of rank-1 float arrays(mono-channel audio, shape(T,)), dtype float. + pad_value (float, optional): the pad value. Defaults to 0.. + dtype (np.dtype, optional): the data type of the output. Defaults to np.float32. + + Returns: + np.ndarray: shape(B, T), the output batch. + """ + + peek_example = minibatch[0] + assert len(peek_example.shape) == 1, "we only handles mono-channel wav" + + # assume (channel, n_samples) or (n_samples, ) + lengths = [example.shape[-1] for example in minibatch] + max_len = np.max(lengths) + + batch = [] + for example in minibatch: + pad_len = max_len - example.shape[-1] + batch.append( + np.pad( + example, [(0, pad_len)], + mode='constant', + constant_values=pad_value)) + return np.array(batch, dtype=dtype), np.array(lengths, dtype=np.int64) + + +class SpecBatcher(object): + """A wrapper class for `batch_spec`""" + + def __init__(self, pad_value=0., time_major=False, dtype=np.float32): + self.pad_value = pad_value + self.dtype = dtype + self.time_major = time_major + + def __call__(self, minibatch): + out = batch_spec( + minibatch, + pad_value=self.pad_value, + time_major=self.time_major, + dtype=self.dtype) + return out + + +def batch_spec(minibatch, pad_value=0., time_major=False, dtype=np.float32): + """Pad spectra to the largest length and batch them. + + Args: + minibatch (List[np.ndarray]): list of rank-2 arrays of shape(F, T) for mono-channel spectrograms, or list of rank-3 arrays of shape(C, F, T) for multi-channel spectrograms(F stands for frequency bands.), dtype float. + pad_value (float, optional): the pad value. Defaults to 0.. 
+ dtype (np.dtype, optional): data type of the output. Defaults to np.float32. + + Returns: + np.ndarray: a rank-3 array of shape(B, F, T) or (B, T, F). + """ + # assume (F, T) or (T, F) + peek_example = minibatch[0] + assert len( + peek_example.shape) == 2, "we only handles mono channel spectrogram" + + # assume (F, n_frame) or (n_frame, F) + time_idx = 0 if time_major else -1 + lengths = [example.shape[time_idx] for example in minibatch] + max_len = np.max(lengths) + + batch = [] + for example in minibatch: + pad_len = max_len - example.shape[time_idx] + if time_major: + batch.append( + np.pad( + example, [(0, pad_len), (0, 0)], + mode='constant', + constant_values=pad_value)) + else: + batch.append( + np.pad( + example, [(0, 0), (0, pad_len)], + mode='constant', + constant_values=pad_value)) + return np.array(batch, dtype=dtype), np.array(lengths, dtype=np.int64) + + +def batch_sequences(sequences, axis=0, pad_value=0): + # import pdb; pdb.set_trace() + seq = sequences[0] + ndim = seq.ndim + if axis < 0: + axis += ndim + dtype = seq.dtype + pad_value = dtype.type(pad_value) + seq_lengths = [seq.shape[axis] for seq in sequences] + max_length = np.max(seq_lengths) + + padded_sequences = [] + for seq, length in zip(sequences, seq_lengths): + padding = [(0, 0)] * axis + [(0, max_length - length)] + [(0, 0)] * ( + ndim - axis - 1) + padded_seq = np.pad( + seq, padding, mode='constant', constant_values=pad_value) + padded_sequences.append(padded_seq) + batch = np.stack(padded_sequences) + return batch diff --git a/ernie-sat/paddlespeech/t2s/datasets/data_table.py b/ernie-sat/paddlespeech/t2s/datasets/data_table.py new file mode 100644 index 0000000000000000000000000000000000000000..c9815af215d8f6136dca4100f35c8c6e3a007c2e --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/datasets/data_table.py @@ -0,0 +1,133 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from multiprocessing import Manager +from typing import Any +from typing import Callable +from typing import Dict +from typing import List + +from paddle.io import Dataset + + +class DataTable(Dataset): + """Dataset to load and convert data for general purpose. + Args: + data (List[Dict[str, Any]]): Metadata, a list of meta datum, each of which is composed of several fields + fields (List[str], optional): Fields to use, if not specified, all the fields in the data are used, by default None + converters (Dict[str, Callable], optional): Converters used to process each field, by default None + use_cache (bool, optional): Whether to use cache, by default False + + Raises: + ValueError: + If there is some field that does not exist in data. + ValueError: + If there is some field in converters that does not exist in fields. 
+ """ + + def __init__(self, + data: List[Dict[str, Any]], + fields: List[str]=None, + converters: Dict[str, Callable]=None, + use_cache: bool=False): + # metadata + self.data = data + assert len(data) > 0, "This dataset has no examples" + + # peak an example to get existing fields. + first_example = self.data[0] + fields_in_data = first_example.keys() + + # check all the requested fields exist + if fields is None: + self.fields = fields_in_data + else: + for field in fields: + if field not in fields_in_data: + raise ValueError( + f"The requested field ({field}) is not found" + f"in the data. Fields in the data is {fields_in_data}") + self.fields = fields + + # check converters + if converters is None: + self.converters = {} + else: + for field in converters.keys(): + if field not in self.fields: + raise ValueError( + f"The converter has a non existing field ({field})") + self.converters = converters + + self.use_cache = use_cache + if use_cache: + self._initialize_cache() + + def _initialize_cache(self): + self.manager = Manager() + self.caches = self.manager.list() + self.caches += [None for _ in range(len(self))] + + def _get_metadata(self, idx: int) -> Dict[str, Any]: + """Return a meta-datum given an index.""" + return self.data[idx] + + def _convert(self, meta_datum: Dict[str, Any]) -> Dict[str, Any]: + """Convert a meta datum to an example by applying the corresponding + converters to each fields requested. + + Args: + meta_datum (Dict[str, Any]): Meta datum + + Returns: + Dict[str, Any]: Converted example + """ + example = {} + for field in self.fields: + converter = self.converters.get(field, None) + meta_datum_field = meta_datum[field] + if converter is not None: + converted_field = converter(meta_datum_field) + else: + converted_field = meta_datum_field + example[field] = converted_field + return example + + def __getitem__(self, idx: int) -> Dict[str, Any]: + """Get an example given an index. + Args: + idx (int): Index of the example to get + + Returns: + Dict[str, Any]: A converted example + """ + if self.use_cache and self.caches[idx] is not None: + return self.caches[idx] + + meta_datum = self._get_metadata(idx) + example = self._convert(meta_datum) + + if self.use_cache: + self.caches[idx] = example + + return example + + def __len__(self) -> int: + """Returns the size of the dataset. + + Returns + ------- + int + The length of the dataset + """ + return len(self.data) diff --git a/ernie-sat/paddlespeech/t2s/datasets/dataset.py b/ernie-sat/paddlespeech/t2s/datasets/dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..2d6c03cb19c585a0736e1da61266d31e88b90dc8 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/datasets/dataset.py @@ -0,0 +1,261 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
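
A toy usage sketch of the `DataTable` above (illustration only, not part of the patch): the metadata rows, field names, and the upper-casing converter are made up; in practice converters typically load features from disk.

```python
from paddlespeech.t2s.datasets.data_table import DataTable

# Made-up metadata; real metadata rows usually point at feature files.
metadata = [{"utt_id": "u1", "text": "hello"}, {"utt_id": "u2", "text": "world"}]

table = DataTable(
    data=metadata,
    fields=["utt_id", "text"],
    converters={"text": str.upper},  # applied lazily per field in __getitem__
    use_cache=False)

print(len(table))   # 2
print(table[0])     # {'utt_id': 'u1', 'text': 'HELLO'}
```
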
+import six +from paddle.io import Dataset + +__all__ = [ + "split", + "TransformDataset", + "CacheDataset", + "TupleDataset", + "DictDataset", + "SliceDataset", + "SubsetDataset", + "FilterDataset", + "ChainDataset", +] + + +def split(dataset, first_size): + """A utility function to split a dataset into two datasets.""" + first = SliceDataset(dataset, 0, first_size) + second = SliceDataset(dataset, first_size, len(dataset)) + return first, second + + +class TransformDataset(Dataset): + def __init__(self, dataset, transform): + """Dataset which is transformed from another with a transform. + + Args: + dataset (Dataset): the base dataset. + transform (callable): the transform which takes an example of the base dataset as parameter and return a new example. + """ + self._dataset = dataset + self._transform = transform + + def __len__(self): + return len(self._dataset) + + def __getitem__(self, i): + in_data = self._dataset[i] + return self._transform(in_data) + + +class CacheDataset(Dataset): + def __init__(self, dataset): + """A lazy cache of the base dataset. + + Args: + dataset (Dataset): the base dataset to cache. + """ + self._dataset = dataset + self._cache = dict() + + def __len__(self): + return len(self._dataset) + + def __getitem__(self, i): + if i not in self._cache: + self._cache[i] = self._dataset[i] + return self._cache[i] + + +class TupleDataset(Dataset): + def __init__(self, *datasets): + """A compound dataset made from several datasets of the same length. An example of the `TupleDataset` is a tuple of examples from the constituent datasets. + + Args: + datasets: tuple[Dataset], the constituent datasets. + """ + if not datasets: + raise ValueError("no datasets are given") + length = len(datasets[0]) + for i, dataset in enumerate(datasets): + if len(dataset) != length: + raise ValueError("all the datasets should have the same length." + "dataset {} has a different length".format(i)) + self._datasets = datasets + self._length = length + + def __getitem__(self, index): + # SOA + batches = [dataset[index] for dataset in self._datasets] + if isinstance(index, slice): + length = len(batches[0]) + # AOS + return [ + tuple([batch[i] for batch in batches]) + for i in six.moves.range(length) + ] + else: + return tuple(batches) + + def __len__(self): + return self._length + + +class DictDataset(Dataset): + def __init__(self, **datasets): + """ + A compound dataset made from several datasets of the same length. An + example of the `DictDataset` is a dict of examples from the constituent + datasets. + + WARNING: paddle does not have a good support for DictDataset, because + every batch yield from a DataLoader is a list, but it cannot be a dict. + So you have to provide a collate function because you cannot use the + default one. + + Args: + datasets: Dict[Dataset], the constituent datasets. + """ + if not datasets: + raise ValueError("no datasets are given") + length = None + for key, dataset in six.iteritems(datasets): + if length is None: + length = len(dataset) + elif len(dataset) != length: + raise ValueError( + "all the datasets should have the same length." 
+ "dataset {} has a different length".format(key)) + self._datasets = datasets + self._length = length + + def __getitem__(self, index): + batches = { + key: dataset[index] + for key, dataset in six.iteritems(self._datasets) + } + if isinstance(index, slice): + length = len(six.next(six.itervalues(batches))) + return [{key: batch[i] + for key, batch in six.iteritems(batches)} + for i in six.moves.range(length)] + else: + return batches + + def __len__(self): + return self._length + + +class SliceDataset(Dataset): + def __init__(self, dataset, start, finish, order=None): + """A Dataset which is a slice of the base dataset. + + Args: + dataset (Dataset): the base dataset. + start (int): the start of the slice. + finish (int): the end of the slice, not inclusive. + order (List[int], optional): the order, it is a permutation of the valid example ids of the base dataset. If `order` is provided, the slice is taken in `order`. Defaults to None. + """ + if start < 0 or finish > len(dataset): + raise ValueError("subset overruns the dataset.") + self._dataset = dataset + self._start = start + self._finish = finish + self._size = finish - start + + if order is not None and len(order) != len(dataset): + raise ValueError( + "order should have the same length as the dataset" + "len(order) = {} which does not euqals len(dataset) = {} ". + format(len(order), len(dataset))) + self._order = order + + def __len__(self): + return self._size + + def __getitem__(self, i): + if i >= 0: + if i >= self._size: + raise IndexError('dataset index out of range') + index = self._start + i + else: + if i < -self._size: + raise IndexError('dataset index out of range') + index = self._finish + i + + if self._order is not None: + index = self._order[index] + return self._dataset[index] + + +class SubsetDataset(Dataset): + def __init__(self, dataset, indices): + """A Dataset which is a subset of the base dataset. + + Args: + dataset (Dataset): the base dataset. + indices (Iterable[int]): the indices of the examples to pick. + """ + self._dataset = dataset + if len(indices) > len(dataset): + raise ValueError("subset's size larger that dataset's size!") + self._indices = indices + self._size = len(indices) + + def __len__(self): + return self._size + + def __getitem__(self, i): + index = self._indices[i] + return self._dataset[index] + + +class FilterDataset(Dataset): + def __init__(self, dataset, filter_fn): + """A filtered dataset. + + Args: + dataset (Dataset): the base dataset. + filter_fn (callable): a callable which takes an example of the base dataset and return a boolean. + """ + self._dataset = dataset + self._indices = [ + i for i in range(len(dataset)) if filter_fn(dataset[i]) + ] + self._size = len(self._indices) + + def __len__(self): + return self._size + + def __getitem__(self, i): + index = self._indices[i] + return self._dataset[index] + + +class ChainDataset(Dataset): + def __init__(self, *datasets): + """A concatenation of the several datasets which the same structure. + + Args: + datasets (Iterable[Dataset]): datasets to concat. 
+ """ + self._datasets = datasets + + def __len__(self): + return sum(len(dataset) for dataset in self._datasets) + + def __getitem__(self, i): + if i < 0: + raise IndexError("ChainDataset doesnot support negative indexing.") + + for dataset in self._datasets: + if i < len(dataset): + return dataset[i] + i -= len(dataset) + + raise IndexError("dataset index out of range") diff --git a/ernie-sat/paddlespeech/t2s/datasets/get_feats.py b/ernie-sat/paddlespeech/t2s/datasets/get_feats.py new file mode 100644 index 0000000000000000000000000000000000000000..a38cfff09ca15d742f602defbcef6a7220a89942 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/datasets/get_feats.py @@ -0,0 +1,226 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from espnet(https://github.com/espnet/espnet) +import librosa +import numpy as np +import pyworld +from scipy.interpolate import interp1d + + +class LogMelFBank(): + def __init__(self, + sr=24000, + n_fft=2048, + hop_length=300, + win_length=None, + window="hann", + n_mels=80, + fmin=80, + fmax=7600, + eps=1e-10): + self.sr = sr + # stft + self.n_fft = n_fft + self.win_length = win_length + self.hop_length = hop_length + self.window = window + self.center = True + self.pad_mode = "reflect" + + # mel + self.n_mels = n_mels + self.fmin = 0 if fmin is None else fmin + self.fmax = sr / 2 if fmax is None else fmax + + self.mel_filter = self._create_mel_filter() + + def _create_mel_filter(self): + mel_filter = librosa.filters.mel( + sr=self.sr, + n_fft=self.n_fft, + n_mels=self.n_mels, + fmin=self.fmin, + fmax=self.fmax) + return mel_filter + + def _stft(self, wav): + D = librosa.core.stft( + wav, + n_fft=self.n_fft, + hop_length=self.hop_length, + win_length=self.win_length, + window=self.window, + center=self.center, + pad_mode=self.pad_mode) + f = open('/mnt/home/xiaoran/projects/wave_summit/espnet_dual_mask/tmp_var_stft.out.1', 'w') + print('stft shape is', D.size()) + # for item in [round(item, 6) for item in output["speech"][0].tolist()]: + # f.write(str(item)+'\n') + # f.close() + return D + + def _spectrogram(self, wav): + D = self._stft(wav) + return np.abs(D) + + def _mel_spectrogram(self, wav): + S = self._spectrogram(wav) + mel = np.dot(self.mel_filter, S) + return mel + + # We use different definition for log-spec between TTS and ASR + # TTS: log_10(abs(stft)) + # ASR: log_e(power(stft)) + + def get_log_mel_fbank(self, wav, base='10'): + mel = self._mel_spectrogram(wav) + mel = np.clip(mel, a_min=1e-10, a_max=float("inf")) + if base == '10': + mel = np.log10(mel.T) + elif base == 'e': + mel = np.log(mel.T) + # (num_frames, n_mels) + return mel + + +class Pitch(): + def __init__(self, sr=24000, hop_length=300, f0min=80, f0max=7600): + + self.sr = sr + self.hop_length = hop_length + self.f0min = f0min + self.f0max = f0max + + def _convert_to_continuous_f0(self, f0: np.array) -> np.array: + if (f0 == 0).all(): + print("All frames seems to be unvoiced.") + return f0 + + # padding start and end of 
f0 sequence + start_f0 = f0[f0 != 0][0] + end_f0 = f0[f0 != 0][-1] + start_idx = np.where(f0 == start_f0)[0][0] + end_idx = np.where(f0 == end_f0)[0][-1] + f0[:start_idx] = start_f0 + f0[end_idx:] = end_f0 + + # get non-zero frame index + nonzero_idxs = np.where(f0 != 0)[0] + + # perform linear interpolation + interp_fn = interp1d(nonzero_idxs, f0[nonzero_idxs]) + f0 = interp_fn(np.arange(0, f0.shape[0])) + + return f0 + + def _calculate_f0(self, + input: np.array, + use_continuous_f0=True, + use_log_f0=True) -> np.array: + input = input.astype(np.float) + frame_period = 1000 * self.hop_length / self.sr + f0, timeaxis = pyworld.dio( + input, + fs=self.sr, + f0_floor=self.f0min, + f0_ceil=self.f0max, + frame_period=frame_period) + f0 = pyworld.stonemask(input, f0, timeaxis, self.sr) + if use_continuous_f0: + f0 = self._convert_to_continuous_f0(f0) + if use_log_f0: + nonzero_idxs = np.where(f0 != 0)[0] + f0[nonzero_idxs] = np.log(f0[nonzero_idxs]) + return f0.reshape(-1) + + def _average_by_duration(self, input: np.array, d: np.array) -> np.array: + d_cumsum = np.pad(d.cumsum(0), (1, 0), 'constant') + arr_list = [] + for start, end in zip(d_cumsum[:-1], d_cumsum[1:]): + arr = input[start:end] + mask = arr == 0 + arr[mask] = 0 + avg_arr = np.mean(arr, axis=0) if len(arr) != 0 else np.array(0) + arr_list.append(avg_arr) + # shape (T,1) + arr_list = np.expand_dims(np.array(arr_list), 0).T + + return arr_list + + def get_pitch(self, + wav, + use_continuous_f0=True, + use_log_f0=True, + use_token_averaged_f0=True, + duration=None): + f0 = self._calculate_f0(wav, use_continuous_f0, use_log_f0) + if use_token_averaged_f0 and duration is not None: + f0 = self._average_by_duration(f0, duration) + return f0 + + +class Energy(): + def __init__(self, + sr=24000, + n_fft=2048, + hop_length=300, + win_length=None, + window="hann", + center=True, + pad_mode="reflect"): + + self.sr = sr + self.n_fft = n_fft + self.win_length = win_length + self.hop_length = hop_length + self.window = window + self.center = center + self.pad_mode = pad_mode + + def _stft(self, wav): + D = librosa.core.stft( + wav, + n_fft=self.n_fft, + hop_length=self.hop_length, + win_length=self.win_length, + window=self.window, + center=self.center, + pad_mode=self.pad_mode) + return D + + def _calculate_energy(self, input): + input = input.astype(np.float32) + input_stft = self._stft(input) + input_power = np.abs(input_stft)**2 + energy = np.sqrt( + np.clip( + np.sum(input_power, axis=0), a_min=1.0e-10, a_max=float('inf'))) + return energy + + def _average_by_duration(self, input: np.array, d: np.array) -> np.array: + d_cumsum = np.pad(d.cumsum(0), (1, 0), 'constant') + arr_list = [] + for start, end in zip(d_cumsum[:-1], d_cumsum[1:]): + arr = input[start:end] + avg_arr = np.mean(arr, axis=0) if len(arr) != 0 else np.array(0) + arr_list.append(avg_arr) + # shape (T,1) + arr_list = np.expand_dims(np.array(arr_list), 0).T + return arr_list + + def get_energy(self, wav, use_token_averaged_energy=True, duration=None): + energy = self._calculate_energy(wav) + if use_token_averaged_energy and duration is not None: + energy = self._average_by_duration(energy, duration) + return energy diff --git a/ernie-sat/paddlespeech/t2s/datasets/ljspeech.py b/ernie-sat/paddlespeech/t2s/datasets/ljspeech.py new file mode 100644 index 0000000000000000000000000000000000000000..85cc3c183c1c1352bd2ef40c8f58c8316499c5b4 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/datasets/ljspeech.py @@ -0,0 +1,39 @@ +# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from pathlib import Path + +from paddle.io import Dataset + +__all__ = ["LJSpeechMetaData"] + + +class LJSpeechMetaData(Dataset): + def __init__(self, root): + self.root = Path(root).expanduser() + wav_dir = self.root / "wavs" + csv_path = self.root / "metadata.csv" + records = [] + speaker_name = "ljspeech" + with open(str(csv_path), 'rt', encoding='utf-8') as f: + for line in f: + filename, _, normalized_text = line.strip().split("|") + filename = str(wav_dir / (filename + ".wav")) + records.append([filename, normalized_text, speaker_name]) + self.records = records + + def __getitem__(self, i): + return self.records[i] + + def __len__(self): + return len(self.records) diff --git a/ernie-sat/paddlespeech/t2s/datasets/preprocess_utils.py b/ernie-sat/paddlespeech/t2s/datasets/preprocess_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..445b69bda1c8901569a109a261297a6ace60565e --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/datasets/preprocess_utils.py @@ -0,0 +1,169 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import re + + +# speaker|utt_id|phn dur phn dur ... 
+def get_phn_dur(file_name): + ''' + read MFA duration.txt + Args: + file_name (str or Path): path of gen_duration_from_textgrid.py's result + Returns: + Dict: sentence: {'utt': ([char], [int])} + ''' + f = open(file_name, 'r') + sentence = {} + speaker_set = set() + for line in f: + line_list = line.strip().split('|') + utt = line_list[0] + speaker = line_list[1] + p_d = line_list[-1] + speaker_set.add(speaker) + phn_dur = p_d.split() + phn = phn_dur[::2] + dur = phn_dur[1::2] + assert len(phn) == len(dur) + sentence[utt] = (phn, [int(i) for i in dur], speaker) + f.close() + return sentence, speaker_set + + +def merge_silence(sentence): + ''' + merge silences + Args: + sentence (Dict): sentence: {'utt': (([char], [int]), str)} + ''' + for utt in sentence: + cur_phn, cur_dur, speaker = sentence[utt] + new_phn = [] + new_dur = [] + + # merge sp and sil + for i, p in enumerate(cur_phn): + if i > 0 and 'sil' == p and cur_phn[i - 1] in {"sil", "sp"}: + new_dur[-1] += cur_dur[i] + new_phn[-1] = 'sil' + else: + new_phn.append(p) + new_dur.append(cur_dur[i]) + + for i, (p, d) in enumerate(zip(new_phn, new_dur)): + if p in {"sp"}: + if d < 14: + new_phn[i] = 'sp' + else: + new_phn[i] = 'spl' + + assert len(new_phn) == len(new_dur) + sentence[utt] = [new_phn, new_dur, speaker] + + +def get_input_token(sentence, output_path, dataset="baker"): + ''' + get phone set from training data and save it + Args: + sentence (Dict): sentence: {'utt': ([char], [int])} + output_path (str or path):path to save phone_id_map + ''' + phn_token = set() + for utt in sentence: + for phn in sentence[utt][0]: + phn_token.add(phn) + phn_token = list(phn_token) + phn_token.sort() + phn_token = ["", ""] + phn_token + if dataset in {"baker", "aishell3"}: + phn_token += [",", "。", "?", "!"] + else: + phn_token += [",", ".", "?", "!"] + phn_token += [""] + + with open(output_path, 'w') as f: + for i, phn in enumerate(phn_token): + f.write(phn + ' ' + str(i) + '\n') + + +def get_phones_tones(sentence, + phones_output_path, + tones_output_path, + dataset="baker"): + ''' + get phone set and tone set from training data and save it + Args: + sentence (Dict): sentence: {'utt': ([char], [int])} + phones_output_path (str or path): path to save phone_id_map + tones_output_path (str or path): path to save tone_id_map + ''' + phn_token = set() + tone_token = set() + for utt in sentence: + for label in sentence[utt][0]: + # split tone from finals + match = re.match(r'^(\w+)([012345])$', label) + if match: + phn_token.add(match.group(1)) + tone_token.add(match.group(2)) + else: + phn_token.add(label) + tone_token.add('0') + phn_token = list(phn_token) + tone_token = list(tone_token) + phn_token.sort() + tone_token.sort() + phn_token = ["", ""] + phn_token + if dataset in {"baker", "aishell3"}: + phn_token += [",", "。", "?", "!"] + else: + phn_token += [",", ".", "?", "!"] + phn_token += [""] + + with open(phones_output_path, 'w') as f: + for i, phn in enumerate(phn_token): + f.write(phn + ' ' + str(i) + '\n') + with open(tones_output_path, 'w') as f: + for i, tone in enumerate(tone_token): + f.write(tone + ' ' + str(i) + '\n') + + +def get_spk_id_map(speaker_set, output_path): + speakers = sorted(list(speaker_set)) + with open(output_path, 'w') as f: + for i, spk in enumerate(speakers): + f.write(spk + ' ' + str(i) + '\n') + + +def compare_duration_and_mel_length(sentences, utt, mel): + ''' + check duration error, correct sentences[utt] if possible, else pop sentences[utt] + Args: + sentences (Dict): sentences[utt] = [phones_list 
,durations_list] + utt (str): utt_id + mel (np.ndarry): features (num_frames, n_mels) + ''' + + if utt in sentences: + len_diff = mel.shape[0] - sum(sentences[utt][1]) + if len_diff != 0: + if len_diff > 0: + sentences[utt][1][-1] += len_diff + elif sentences[utt][1][-1] + len_diff > 0: + sentences[utt][1][-1] += len_diff + elif sentences[utt][1][0] + len_diff > 0: + sentences[utt][1][0] += len_diff + else: + print("the len_diff is unable to correct:", len_diff) + sentences.pop(utt) diff --git a/ernie-sat/paddlespeech/t2s/datasets/vocoder_batch_fn.py b/ernie-sat/paddlespeech/t2s/datasets/vocoder_batch_fn.py new file mode 100644 index 0000000000000000000000000000000000000000..08748de021c87e252df9ec3569141cb0385cac28 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/datasets/vocoder_batch_fn.py @@ -0,0 +1,220 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import numpy as np +import paddle + +from paddlespeech.t2s.audio.codec import encode_mu_law +from paddlespeech.t2s.audio.codec import float_2_label +from paddlespeech.t2s.audio.codec import label_2_float + + +class Clip(object): + """Collate functor for training vocoders. + """ + + def __init__( + self, + batch_max_steps=20480, + hop_size=256, + aux_context_window=0, ): + """Initialize customized collater for DataLoader. + Args: + + batch_max_steps (int): The maximum length of input signal in batch. + hop_size (int): Hop size of auxiliary features. + aux_context_window (int): Context window size for auxiliary feature conv. + + """ + if batch_max_steps % hop_size != 0: + batch_max_steps += -(batch_max_steps % hop_size) + assert batch_max_steps % hop_size == 0 + self.batch_max_steps = batch_max_steps + self.batch_max_frames = batch_max_steps // hop_size + self.hop_size = hop_size + self.aux_context_window = aux_context_window + + # set useful values in random cutting + self.start_offset = aux_context_window + self.end_offset = -(self.batch_max_frames + aux_context_window) + self.mel_threshold = self.batch_max_frames + 2 * aux_context_window + + def __call__(self, batch): + """Convert into batch tensors. + + Args: + batch (list): list of tuple of the pair of audio and features. Audio shape (T, ), features shape(T', C). + + Returns: + Tensor: + Auxiliary feature batch (B, C, T'), where + T = (T' - 2 * aux_context_window) * hop_size. + Tensor: + Target signal batch (B, 1, T). 
+ + """ + # check length + batch = [ + self._adjust_length(b['wave'], b['feats']) for b in batch + if b['feats'].shape[0] > self.mel_threshold + ] + xs, cs = [b[0] for b in batch], [b[1] for b in batch] + + # make batch with random cut + c_lengths = [c.shape[0] for c in cs] + start_frames = np.array([ + np.random.randint(self.start_offset, cl + self.end_offset) + for cl in c_lengths + ]) + x_starts = start_frames * self.hop_size + x_ends = x_starts + self.batch_max_steps + + c_starts = start_frames - self.aux_context_window + c_ends = start_frames + self.batch_max_frames + self.aux_context_window + y_batch = np.stack( + [x[start:end] for x, start, end in zip(xs, x_starts, x_ends)]) + c_batch = np.stack( + [c[start:end] for c, start, end in zip(cs, c_starts, c_ends)]) + + # convert each batch to tensor, assume that each item in batch has the same length + y_batch = paddle.to_tensor( + y_batch, dtype=paddle.float32).unsqueeze(1) # (B, 1, T) + c_batch = paddle.to_tensor( + c_batch, dtype=paddle.float32).transpose([0, 2, 1]) # (B, C, T') + + return y_batch, c_batch + + def _adjust_length(self, x, c): + """Adjust the audio and feature lengths. + + Note: + Basically we assume that the length of x and c are adjusted + through preprocessing stage, but if we use other library processed + features, this process will be needed. + + """ + if len(x) < c.shape[0] * self.hop_size: + x = np.pad(x, (0, c.shape[0] * self.hop_size - len(x)), mode="edge") + elif len(x) > c.shape[0] * self.hop_size: + # print( + # f"wave length: ({len(x)}), mel length: ({c.shape[0]}), hop size: ({self.hop_size })" + # ) + x = x[:c.shape[0] * self.hop_size] + + # check the legnth is valid + assert len(x) == c.shape[ + 0] * self.hop_size, f"wave length: ({len(x)}), mel length: ({c.shape[0]})" + + return x, c + + +class WaveRNNClip(Clip): + def __init__(self, + mode: str='RAW', + batch_max_steps: int=4500, + hop_size: int=300, + aux_context_window: int=2, + bits: int=9, + mu_law: bool=True): + self.mode = mode + self.mel_win = batch_max_steps // hop_size + 2 * aux_context_window + self.batch_max_steps = batch_max_steps + self.hop_size = hop_size + self.aux_context_window = aux_context_window + self.mu_law = mu_law + self.batch_max_frames = batch_max_steps // hop_size + self.mel_threshold = self.batch_max_frames + 2 * aux_context_window + if self.mode == 'MOL': + self.bits = 16 + else: + self.bits = bits + + def to_quant(self, wav): + if self.mode == 'RAW': + if self.mu_law: + quant = encode_mu_law(wav, mu=2**self.bits) + else: + quant = float_2_label(wav, bits=self.bits) + elif self.mode == 'MOL': + quant = float_2_label(wav, bits=16) + quant = quant.astype(np.int64) + return quant + + def __call__(self, batch): + # voc_pad = 2 this will pad the input so that the resnet can 'see' wider than input length + # max_offsets = n_frames - 2 - (mel_win + 2 * hp.voc_pad) = n_frames - 15 + """Convert into batch tensors. + Args: + batch (list): list of tuple of the pair of audio and features. Audio shape (T, ), features shape(T', C). + + Returns: + Tensor: Input signal batch (B, 1, T). + Tensor: Target signal batch (B, 1, T). + Tensor: Auxiliary feature batch (B, C, T'), + where T = (T' - 2 * aux_context_window) * hop_size. 
+ + """ + # check length + batch = [ + self._adjust_length(b['wave'], b['feats']) for b in batch + if b['feats'].shape[0] > self.mel_threshold + ] + wav, mel = [b[0] for b in batch], [b[1] for b in batch] + # mel 此处需要转置 + mel = [x.T for x in mel] + max_offsets = [ + x.shape[-1] - 2 - (self.mel_win + 2 * self.aux_context_window) + for x in mel + ] + # the slice point of mel selecting randomly + mel_offsets = [np.random.randint(0, offset) for offset in max_offsets] + # the slice point of wav selecting randomly, which is behind 2(=pad) frames + sig_offsets = [(offset + self.aux_context_window) * self.hop_size + for offset in mel_offsets] + # mels.shape[1] = voc_seq_len // hop_length + 2 * voc_pad + mels = [ + x[:, mel_offsets[i]:mel_offsets[i] + self.mel_win] + for i, x in enumerate(mel) + ] + # label.shape[1] = voc_seq_len + 1 + wav = [self.to_quant(x) for x in wav] + + labels = [ + x[sig_offsets[i]:sig_offsets[i] + self.batch_max_steps + 1] + for i, x in enumerate(wav) + ] + + mels = np.stack(mels).astype(np.float32) + labels = np.stack(labels).astype(np.int64) + + mels = paddle.to_tensor(mels) + labels = paddle.to_tensor(labels, dtype='int64') + # x is input, y is label + x = labels[:, :self.batch_max_steps] + y = labels[:, 1:] + ''' + mode = RAW: + mu_law = True: + quant: bits = 9 0, 1, 2, ..., 509, 510, 511 int + mu_law = False + quant bits = 9 [0, 511] float + mode = MOL: + quant: bits = 16 [0. 65536] float + ''' + # x should be normalizes in.[0, 1] in RAW mode + x = label_2_float(paddle.cast(x, dtype='float32'), self.bits) + # y should be normalizes in.[0, 1] in MOL mode + if self.mode == 'MOL': + y = label_2_float(paddle.cast(y, dtype='float32'), self.bits) + + return x, y, mels diff --git a/ernie-sat/paddlespeech/t2s/exps/__init__.py b/ernie-sat/paddlespeech/t2s/exps/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..abf198b97e6e818e1fbe59006f98492640bcee54 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/exps/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
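For orientation, the `Clip` collate functor added in `vocoder_batch_fn.py` above is what the GAN vocoder training scripts later in this patch hand to their `DataLoader`s. Below is a minimal sketch of that wiring; the metadata paths, utterance ids and batch/hop sizes are illustrative only and assume waveform and mel features dumped as `.npy` files:

```python
import numpy as np
from paddle.io import DataLoader

from paddlespeech.t2s.datasets.data_table import DataTable
from paddlespeech.t2s.datasets.vocoder_batch_fn import Clip

# illustrative metadata entries: each item references a dumped waveform and its mel features
metadata = [
    {"wave": "dump/train/raw/009901_wave.npy", "feats": "dump/train/raw/009901_feats.npy"},
    {"wave": "dump/train/raw/009902_wave.npy", "feats": "dump/train/raw/009902_feats.npy"},
]
dataset = DataTable(
    data=metadata,
    fields=["wave", "feats"],
    converters={"wave": np.load, "feats": np.load})

# batch_max_steps must be a multiple of hop_size (Clip rounds it down otherwise);
# utterances with mel length <= batch_max_frames + 2 * aux_context_window are filtered out
collate_fn = Clip(batch_max_steps=20480, hop_size=256, aux_context_window=2)

loader = DataLoader(
    dataset, batch_size=2, shuffle=True, drop_last=True, collate_fn=collate_fn)

for wav_batch, mel_batch in loader:
    # wav_batch: (B, 1, T), mel_batch: (B, n_mels, T'),
    # where T = (T' - 2 * aux_context_window) * hop_size
    break
```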
diff --git a/ernie-sat/paddlespeech/t2s/exps/csmsc_test.txt b/ernie-sat/paddlespeech/t2s/exps/csmsc_test.txt new file mode 100644 index 0000000000000000000000000000000000000000..d8cf367cd0c4f17203e00502f379671f7acc7b95 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/exps/csmsc_test.txt @@ -0,0 +1,100 @@ +009901 昨日,这名伤者与医生全部被警方依法刑事拘留。 +009902 钱伟长想到上海来办学校是经过深思熟虑的。 +009903 她见我一进门就骂,吃饭时也骂,骂得我抬不起头。 +009904 李述德在离开之前,只说了一句柱驼杀父亲了。 +009905 这种车票和保险单捆绑出售属于重复性购买。 +009906 戴佩妮的男友西米露接唱情歌,让她非常开心。 +009907 观大势,谋大局,出大策始终是该院的办院方针。 +009908 他们骑着摩托回家,正好为农忙时的父母帮忙。 +009909 但是因为还没到退休年龄,只能掰着指头捱日子。 +009910 这几天雨水不断,人们恨不得待在家里不出门。 +009911 没想到徐赟,张海翔两人就此玩起了人间蒸发。 +009912 藤村此番发言可能是为了凸显野田的领导能力。 +009913 程长庚,生在清王朝嘉庆年间,安徽的潜山小县。 +009914 南海海域综合补给基地码头项目正在论证中。 +009915 也就是说今晚成都市民极有可能再次看到飘雪。 +009916 随着天气转热,各地的游泳场所开始人头攒动。 +009917 更让徐先生纳闷的是,房客的手机也打不通了。 +009918 遇到颠簸时,应听从乘务员的安全指令,回座位坐好。 +009919 他在后面呆惯了,怕自己一插身后的人会不满,不敢排进去。 +009920 傍晚七个小人回来了,白雪公主说,你们就是我命中的七个小矮人吧。 +009921 他本想说,教育局管这个,他们是一路的,这样一管岂不是妓女起嫖客? +009922 一种表示商品所有权的财物证券,也称商品证券,如提货单,交货单。 +009923 会有很丰富的东西留下来,说都说不完。 +009924 这句话像从天而降,吓得四周一片寂静。 +009925 记者所在的是受害人家属所在的右区。 +009926 不管哈大爷去哪,它都一步不离地跟着。 +009927 大家抬头望去,一只老鼠正趴在吊顶上。 +009928 我决定过年就辞职,接手我爸的废品站! +009929 最终,中国男子乒乓球队获得此奖项。 +009930 防汛抗旱两手抓,抗旱相对抓的不够。 +009931 图们江下游地区开发开放的进展如何? +009932 这要求中国必须有一个坚强的政党领导。 +009933 再说,关于利益上的事俺俩都不好开口。 +009934 明代瓦剌,鞑靼入侵明境也是通过此地。 +009935 咪咪舔着孩子,把它身上的毛舔干净。 +009936 是否这次的国标修订被大企业绑架了? +009937 判决后,姚某妻子胡某不服,提起上诉。 +009938 由此可以看出邯钢的经济效益来自何处。 +009939 琳达说,是瑜伽改变了她和马儿的生活。 +009940 楼下的保安告诉记者,这里不租也不卖。 +009941 习近平说,中斯两国人民传统友谊深厚。 +009942 传闻越来越多,后来连老汉儿自己都怕了。 +009943 我怒吼一声冲上去,举起砖头砸了过去。 +009944 我现在还不会,这就回去问问发明我的人。 +009945 显然,洛阳性奴案不具备上述两个前提。 +009946 另外,杰克逊有文唇线,眼线,眉毛的动作。 +009947 昨晚,华西都市报记者电话采访了尹琪。 +009948 涅拉季科未透露这些航空公司的名称。 +009949 从运行轨迹上来说,它也不可能是星星。 +009950 目前看,如果继续加息也存在两难问题。 +009951 曾宝仪在节目录制现场大爆观众糗事。 +009952 但任凭周某怎么叫,男子仍酣睡不醒。 +009953 老大爷说,小子,你挡我财路了,知道不? +009954 没料到,闯下大头佛的阿伟还不知悔改。 +009955 卡扎菲部落式统治已遭遇部落内讧。 +009956 这个孩子的生命一半来源于另一位女士捐赠的冷冻卵子。 +009957 出现这种泥鳅内阁的局面既是野田有意为之,也实属无奈。 +009958 济青高速济南,华山,章丘,邹平,周村,淄博,临淄站。 +009959 赵凌飞的话,反映了沈阳赛区所有奥运志愿者的共同心声。 +009960 因为,我们所发出的力量必会因难度加大而减弱。 +009961 发生事故的楼梯拐角处仍可看到血迹。 +009962 想过进公安,可能身高不够,老汉儿也不让我进去。 +009963 路上关卡很多,为了方便撤离,只好轻装前进。 +009964 原来比尔盖茨就是美国微软公司联合创始人呀。 +009965 之后他们一家三口将与双方父母往峇里岛旅游。 +009966 谢谢总理,也感谢广大网友的参与,我们明年再见。 +009967 事实上是,从来没有一个欺善怕恶的人能作出过稍大一点的成就。 +009968 我会打开邮件,你可以从那里继续。 +009969 美方对近期东海局势表示关切。 +009970 据悉,奥巴马一家人对这座冬季白宫极为满意。 +009971 打扫完你会很有成就感的,试一试,你就信了。 +009972 诺曼站在滑板车上,各就各位,准备出发啦! +009973 塔河的寒夜,气温降到了零下三十多摄氏度。 +009974 其间,连破六点六,六点五,六点四,六点三五等多个重要关口。 +009975 算命其实只是人们的一种自我安慰和自我暗示而已,我们还是要相信科学才好。 +009976 这一切都令人欢欣鼓舞,阿讷西没理由不坚持到最后。 +009977 直至公元前一万一千年,它又再次出现。 +009978 尽量少玩电脑,少看电视,少打游戏。 +009979 从五到七,前后也就是六个月的时间。 +009980 一进咖啡店,他就遇见一张熟悉的脸。 +009981 好在众弟兄看到了把她追了回来。 +009982 有一个人说,哥们儿我们跑过它才能活。 +009983 捅了她以后,模糊记得她没咋动了。 +009984 从小到大,葛启义没有收到过压岁钱。 +009985 舞台下的你会对舞台上的你说什么? +009986 但考生普遍认为,试题的怪多过难。 +009987 我希望每个人都能够尊重我们的隐私。 +009988 漫天的红霞使劲给两人增添气氛。 +009989 晚上加完班开车回家,太累了,迷迷糊糊开着车,走一半的时候,铛一声! +009990 该车将三人撞倒后,在大雾中逃窜。 +009991 这人一哆嗦,方向盘也把不稳了,差点撞上了高速边道护栏。 +009992 那女孩儿委屈的说,我一回头见你已经进去了我不敢进去啊! +009993 小明摇摇头说,不是,我只是美女看多了,想换个口味而已。 +009994 接下来,红娘要求记者交费,记者表示不知表姐身份证号码。 +009995 李东蓊表示,自己当时在法庭上发表了一次独特的公诉意见。 +009996 另一男子扑了上来,手里拿着明晃晃的长刀,向他胸口直刺。 +009997 今天,快递员拿着一个快递在办公室喊,秦王是哪个,有他快递? +009998 这场抗议活动究竟是如何发展演变的,又究竟是谁伤害了谁? 
+009999 因华国锋肖鸡,墓地设计根据其属相设计。 +010000 在狱中,张明宝悔恨交加,写了一份忏悔书。 diff --git a/ernie-sat/paddlespeech/t2s/exps/fastspeech2/__init__.py b/ernie-sat/paddlespeech/t2s/exps/fastspeech2/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..abf198b97e6e818e1fbe59006f98492640bcee54 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/exps/fastspeech2/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/ernie-sat/paddlespeech/t2s/exps/fastspeech2/gen_gta_mel.py b/ernie-sat/paddlespeech/t2s/exps/fastspeech2/gen_gta_mel.py new file mode 100644 index 0000000000000000000000000000000000000000..4c92ad1cc46ee22e165f560eb4095d4044a559fe --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/exps/fastspeech2/gen_gta_mel.py @@ -0,0 +1,226 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
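+# The durations.txt consumed by this script (--dur-file below) is read with
+# get_phn_dur() from paddlespeech/t2s/datasets/preprocess_utils.py; each line
+# has the form
+#     utt_id|speaker|phn_1 dur_1 phn_2 dur_2 ...
+# where durations are frame counts at the feature hop size. The concrete ids,
+# speaker names and phone labels are dataset dependent; this example is
+# illustrative only:
+#     009901|baker|sil 10 zh 4 uo2 9 sil 12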
+# generate mels using durations.txt +# for mb melgan finetune +import argparse +import os +from pathlib import Path + +import numpy as np +import paddle +import yaml +from tqdm import tqdm +from yacs.config import CfgNode + +from paddlespeech.t2s.datasets.preprocess_utils import get_phn_dur +from paddlespeech.t2s.datasets.preprocess_utils import merge_silence +from paddlespeech.t2s.models.fastspeech2 import FastSpeech2 +from paddlespeech.t2s.models.fastspeech2 import StyleFastSpeech2Inference +from paddlespeech.t2s.modules.normalizer import ZScore +from paddlespeech.t2s.utils import str2bool + + +def evaluate(args, fastspeech2_config): + rootdir = Path(args.rootdir).expanduser() + assert rootdir.is_dir() + + # construct dataset for evaluation + with open(args.phones_dict, "r") as f: + phn_id = [line.strip().split() for line in f.readlines()] + vocab_size = len(phn_id) + print("vocab_size:", vocab_size) + + phone_dict = {} + for phn, id in phn_id: + phone_dict[phn] = int(id) + + if args.speaker_dict: + with open(args.speaker_dict, 'rt') as f: + spk_id_list = [line.strip().split() for line in f.readlines()] + spk_num = len(spk_id_list) + else: + spk_num = None + + odim = fastspeech2_config.n_mels + model = FastSpeech2( + idim=vocab_size, + odim=odim, + **fastspeech2_config["model"], + spk_num=spk_num) + + model.set_state_dict( + paddle.load(args.fastspeech2_checkpoint)["main_params"]) + model.eval() + + stat = np.load(args.fastspeech2_stat) + mu, std = stat + mu = paddle.to_tensor(mu) + std = paddle.to_tensor(std) + fastspeech2_normalizer = ZScore(mu, std) + + fastspeech2_inference = StyleFastSpeech2Inference(fastspeech2_normalizer, + model) + fastspeech2_inference.eval() + + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + sentences, speaker_set = get_phn_dur(args.dur_file) + merge_silence(sentences) + + if args.dataset == "baker": + wav_files = sorted(list((rootdir / "Wave").rglob("*.wav"))) + # split data into 3 sections + num_train = 9800 + num_dev = 100 + train_wav_files = wav_files[:num_train] + dev_wav_files = wav_files[num_train:num_train + num_dev] + test_wav_files = wav_files[num_train + num_dev:] + elif args.dataset == "aishell3": + sub_num_dev = 5 + wav_dir = rootdir / "train" / "wav" + train_wav_files = [] + dev_wav_files = [] + test_wav_files = [] + for speaker in os.listdir(wav_dir): + wav_files = sorted(list((wav_dir / speaker).rglob("*.wav"))) + if len(wav_files) > 100: + train_wav_files += wav_files[:-sub_num_dev * 2] + dev_wav_files += wav_files[-sub_num_dev * 2:-sub_num_dev] + test_wav_files += wav_files[-sub_num_dev:] + else: + train_wav_files += wav_files + + train_wav_files = [ + os.path.basename(str(str_path)) for str_path in train_wav_files + ] + dev_wav_files = [ + os.path.basename(str(str_path)) for str_path in dev_wav_files + ] + test_wav_files = [ + os.path.basename(str(str_path)) for str_path in test_wav_files + ] + + for i, utt_id in enumerate(tqdm(sentences)): + phones = sentences[utt_id][0] + durations = sentences[utt_id][1] + speaker = sentences[utt_id][2] + # 裁剪掉开头和结尾的 sil + if args.cut_sil: + if phones[0] == "sil" and len(durations) > 1: + durations = durations[1:] + phones = phones[1:] + if phones[-1] == 'sil' and len(durations) > 1: + durations = durations[:-1] + phones = phones[:-1] + # sentences[utt_id][0] = phones + # sentences[utt_id][1] = durations + + phone_ids = [phone_dict[phn] for phn in phones] + phone_ids = paddle.to_tensor(np.array(phone_ids)) + + if args.speaker_dict: + speaker_id = int( + [item[1] for 
item in spk_id_list if speaker == item[0]][0]) + speaker_id = paddle.to_tensor(speaker_id) + else: + speaker_id = None + + durations = paddle.to_tensor(np.array(durations)) + # 生成的和真实的可能有 1, 2 帧的差距,但是 batch_fn 会修复 + # split data into 3 sections + + wav_path = utt_id + ".wav" + + if wav_path in train_wav_files: + sub_output_dir = output_dir / ("train/raw") + elif wav_path in dev_wav_files: + sub_output_dir = output_dir / ("dev/raw") + elif wav_path in test_wav_files: + sub_output_dir = output_dir / ("test/raw") + + sub_output_dir.mkdir(parents=True, exist_ok=True) + + with paddle.no_grad(): + mel = fastspeech2_inference( + phone_ids, durations=durations, spk_id=speaker_id) + np.save(sub_output_dir / (utt_id + "_feats.npy"), mel) + + +def main(): + # parse args and config and redirect to train_sp + parser = argparse.ArgumentParser( + description="Synthesize with fastspeech2 & parallel wavegan.") + parser.add_argument( + "--dataset", + default="baker", + type=str, + help="name of dataset, should in {baker, ljspeech, vctk} now") + parser.add_argument( + "--rootdir", default=None, type=str, help="directory to dataset.") + parser.add_argument( + "--fastspeech2-config", type=str, help="fastspeech2 config file.") + parser.add_argument( + "--fastspeech2-checkpoint", + type=str, + help="fastspeech2 checkpoint to load.") + parser.add_argument( + "--fastspeech2-stat", + type=str, + help="mean and standard deviation used to normalize spectrogram when training fastspeech2." + ) + + parser.add_argument( + "--phones-dict", + type=str, + default="phone_id_map.txt", + help="phone vocabulary file.") + + parser.add_argument( + "--speaker-dict", type=str, default=None, help="speaker id map file.") + + parser.add_argument( + "--dur-file", default=None, type=str, help="path to durations.txt.") + parser.add_argument("--output-dir", type=str, help="output dir.") + parser.add_argument( + "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") + + parser.add_argument( + "--cut-sil", + type=str2bool, + default=True, + help="whether cut sil in the edge of audio") + + args = parser.parse_args() + + if args.ngpu == 0: + paddle.set_device("cpu") + elif args.ngpu > 0: + paddle.set_device("gpu") + else: + print("ngpu should >= 0 !") + + with open(args.fastspeech2_config) as f: + fastspeech2_config = CfgNode(yaml.safe_load(f)) + + print("========Args========") + print(yaml.safe_dump(vars(args))) + print("========Config========") + print(fastspeech2_config) + + evaluate(args, fastspeech2_config) + + +if __name__ == "__main__": + main() diff --git a/ernie-sat/paddlespeech/t2s/exps/fastspeech2/normalize.py b/ernie-sat/paddlespeech/t2s/exps/fastspeech2/normalize.py new file mode 100644 index 0000000000000000000000000000000000000000..8ec20ebf0f8f1865c45cdeed99d487e079e498b0 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/exps/fastspeech2/normalize.py @@ -0,0 +1,184 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Normalize feature files and dump them.""" +import argparse +import logging +from operator import itemgetter +from pathlib import Path + +import jsonlines +import numpy as np +from sklearn.preprocessing import StandardScaler +from tqdm import tqdm + +from paddlespeech.t2s.datasets.data_table import DataTable + + +def main(): + """Run preprocessing process.""" + parser = argparse.ArgumentParser( + description="Normalize dumped raw features (See detail in parallel_wavegan/bin/normalize.py)." + ) + parser.add_argument( + "--metadata", + type=str, + required=True, + help="directory including feature files to be normalized. " + "you need to specify either *-scp or rootdir.") + + parser.add_argument( + "--dumpdir", + type=str, + required=True, + help="directory to dump normalized feature files.") + parser.add_argument( + "--speech-stats", + type=str, + required=True, + help="speech statistics file.") + parser.add_argument( + "--pitch-stats", type=str, required=True, help="pitch statistics file.") + parser.add_argument( + "--energy-stats", + type=str, + required=True, + help="energy statistics file.") + parser.add_argument( + "--phones-dict", type=str, default=None, help="phone vocabulary file.") + parser.add_argument( + "--speaker-dict", type=str, default=None, help="speaker id map file.") + parser.add_argument( + "--verbose", + type=int, + default=1, + help="logging level. higher is more logging. (default=1)") + args = parser.parse_args() + + # set logger + if args.verbose > 1: + logging.basicConfig( + level=logging.DEBUG, + format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s" + ) + elif args.verbose > 0: + logging.basicConfig( + level=logging.INFO, + format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s" + ) + else: + logging.basicConfig( + level=logging.WARN, + format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s" + ) + logging.warning('Skip DEBUG/INFO messages') + + dumpdir = Path(args.dumpdir).expanduser() + # use absolute path + dumpdir = dumpdir.resolve() + dumpdir.mkdir(parents=True, exist_ok=True) + + # get dataset + with jsonlines.open(args.metadata, 'r') as reader: + metadata = list(reader) + dataset = DataTable( + metadata, + converters={ + "speech": np.load, + "pitch": np.load, + "energy": np.load, + }) + logging.info(f"The number of files = {len(dataset)}.") + + # restore scaler + speech_scaler = StandardScaler() + speech_scaler.mean_ = np.load(args.speech_stats)[0] + speech_scaler.scale_ = np.load(args.speech_stats)[1] + speech_scaler.n_features_in_ = speech_scaler.mean_.shape[0] + + pitch_scaler = StandardScaler() + pitch_scaler.mean_ = np.load(args.pitch_stats)[0] + pitch_scaler.scale_ = np.load(args.pitch_stats)[1] + pitch_scaler.n_features_in_ = pitch_scaler.mean_.shape[0] + + energy_scaler = StandardScaler() + energy_scaler.mean_ = np.load(args.energy_stats)[0] + energy_scaler.scale_ = np.load(args.energy_stats)[1] + energy_scaler.n_features_in_ = energy_scaler.mean_.shape[0] + + vocab_phones = {} + with open(args.phones_dict, 'rt') as f: + phn_id = [line.strip().split() for line in f.readlines()] + for phn, id in phn_id: + vocab_phones[phn] = int(id) + + vocab_speaker = {} + with open(args.speaker_dict, 'rt') as f: + spk_id = [line.strip().split() for line in f.readlines()] + for spk, id in spk_id: + vocab_speaker[spk] = int(id) + + # process each file + output_metadata = [] + + for item in tqdm(dataset): + utt_id = item['utt_id'] + speech = item['speech'] + pitch = item['pitch'] + energy = item['energy'] + # normalize 
+ speech = speech_scaler.transform(speech) + speech_dir = dumpdir / "data_speech" + speech_dir.mkdir(parents=True, exist_ok=True) + speech_path = speech_dir / f"{utt_id}_speech.npy" + np.save(speech_path, speech.astype(np.float32), allow_pickle=False) + + pitch = pitch_scaler.transform(pitch) + pitch_dir = dumpdir / "data_pitch" + pitch_dir.mkdir(parents=True, exist_ok=True) + pitch_path = pitch_dir / f"{utt_id}_pitch.npy" + np.save(pitch_path, pitch.astype(np.float32), allow_pickle=False) + + energy = energy_scaler.transform(energy) + energy_dir = dumpdir / "data_energy" + energy_dir.mkdir(parents=True, exist_ok=True) + energy_path = energy_dir / f"{utt_id}_energy.npy" + np.save(energy_path, energy.astype(np.float32), allow_pickle=False) + phone_ids = [vocab_phones[p] for p in item['phones']] + spk_id = vocab_speaker[item["speaker"]] + record = { + "utt_id": item['utt_id'], + "spk_id": spk_id, + "text": phone_ids, + "text_lengths": item['text_lengths'], + "speech_lengths": item['speech_lengths'], + "durations": item['durations'], + "speech": str(speech_path), + "pitch": str(pitch_path), + "energy": str(energy_path) + } + # add spk_emb for voice cloning + if "spk_emb" in item: + record["spk_emb"] = str(item["spk_emb"]) + + output_metadata.append(record) + output_metadata.sort(key=itemgetter('utt_id')) + output_metadata_path = Path(args.dumpdir) / "metadata.jsonl" + with jsonlines.open(output_metadata_path, 'w') as writer: + for item in output_metadata: + writer.write(item) + logging.info(f"metadata dumped into {output_metadata_path}") + + +if __name__ == "__main__": + main() diff --git a/ernie-sat/paddlespeech/t2s/exps/fastspeech2/preprocess.py b/ernie-sat/paddlespeech/t2s/exps/fastspeech2/preprocess.py new file mode 100644 index 0000000000000000000000000000000000000000..db1842b2e89fe3044e96ca4babb07c1796d06da3 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/exps/fastspeech2/preprocess.py @@ -0,0 +1,370 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
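+# Expected output layout under --dumpdir (directories and file names follow the
+# mkdir/np.save calls below; <utt_id> is a placeholder):
+#     dumpdir/
+#         phone_id_map.txt
+#         speaker_id_map.txt
+#         {train,dev,test}/raw/
+#             metadata.jsonl
+#             data_speech/<utt_id>_speech.npy
+#             data_pitch/<utt_id>_pitch.npy
+#             data_energy/<utt_id>_energy.npy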
+import argparse +import os +from concurrent.futures import ThreadPoolExecutor +from operator import itemgetter +from pathlib import Path +from typing import Any +from typing import Dict +from typing import List + +import jsonlines +import librosa +import numpy as np +import tqdm +import yaml +from yacs.config import CfgNode + +from paddlespeech.t2s.datasets.get_feats import Energy +from paddlespeech.t2s.datasets.get_feats import LogMelFBank +from paddlespeech.t2s.datasets.get_feats import Pitch +from paddlespeech.t2s.datasets.preprocess_utils import compare_duration_and_mel_length +from paddlespeech.t2s.datasets.preprocess_utils import get_input_token +from paddlespeech.t2s.datasets.preprocess_utils import get_phn_dur +from paddlespeech.t2s.datasets.preprocess_utils import get_spk_id_map +from paddlespeech.t2s.datasets.preprocess_utils import merge_silence +from paddlespeech.t2s.utils import str2bool + + +def process_sentence(config: Dict[str, Any], + fp: Path, + sentences: Dict, + output_dir: Path, + mel_extractor=None, + pitch_extractor=None, + energy_extractor=None, + cut_sil: bool=True, + spk_emb_dir: Path=None): + utt_id = fp.stem + # for vctk + if utt_id.endswith("_mic2"): + utt_id = utt_id[:-5] + record = None + if utt_id in sentences: + # reading, resampling may occur + wav, _ = librosa.load(str(fp), sr=config.fs) + if len(wav.shape) != 1 or np.abs(wav).max() > 1.0: + return record + assert len(wav.shape) == 1, f"{utt_id} is not a mono-channel audio." + assert np.abs(wav).max( + ) <= 1.0, f"{utt_id} is seems to be different that 16 bit PCM." + phones = sentences[utt_id][0] + durations = sentences[utt_id][1] + speaker = sentences[utt_id][2] + d_cumsum = np.pad(np.array(durations).cumsum(0), (1, 0), 'constant') + # little imprecise than use *.TextGrid directly + times = librosa.frames_to_time( + d_cumsum, sr=config.fs, hop_length=config.n_shift) + if cut_sil: + start = 0 + end = d_cumsum[-1] + if phones[0] == "sil" and len(durations) > 1: + start = times[1] + durations = durations[1:] + phones = phones[1:] + if phones[-1] == 'sil' and len(durations) > 1: + end = times[-2] + durations = durations[:-1] + phones = phones[:-1] + sentences[utt_id][0] = phones + sentences[utt_id][1] = durations + start, end = librosa.time_to_samples([start, end], sr=config.fs) + wav = wav[start:end] + # extract mel feats + logmel = mel_extractor.get_log_mel_fbank(wav) + # change duration according to mel_length + compare_duration_and_mel_length(sentences, utt_id, logmel) + # utt_id may be popped in compare_duration_and_mel_length + if utt_id not in sentences: + return None + phones = sentences[utt_id][0] + durations = sentences[utt_id][1] + num_frames = logmel.shape[0] + assert sum(durations) == num_frames + mel_dir = output_dir / "data_speech" + mel_dir.mkdir(parents=True, exist_ok=True) + mel_path = mel_dir / (utt_id + "_speech.npy") + np.save(mel_path, logmel) + # extract pitch and energy + f0 = pitch_extractor.get_pitch(wav, duration=np.array(durations)) + assert f0.shape[0] == len(durations) + f0_dir = output_dir / "data_pitch" + f0_dir.mkdir(parents=True, exist_ok=True) + f0_path = f0_dir / (utt_id + "_pitch.npy") + np.save(f0_path, f0) + energy = energy_extractor.get_energy(wav, duration=np.array(durations)) + assert energy.shape[0] == len(durations) + energy_dir = output_dir / "data_energy" + energy_dir.mkdir(parents=True, exist_ok=True) + energy_path = energy_dir / (utt_id + "_energy.npy") + np.save(energy_path, energy) + record = { + "utt_id": utt_id, + "phones": phones, + "text_lengths": 
len(phones), + "speech_lengths": num_frames, + "durations": durations, + "speech": str(mel_path), + "pitch": str(f0_path), + "energy": str(energy_path), + "speaker": speaker + } + if spk_emb_dir: + if speaker in os.listdir(spk_emb_dir): + embed_name = utt_id + ".npy" + embed_path = spk_emb_dir / speaker / embed_name + if embed_path.is_file(): + record["spk_emb"] = str(embed_path) + else: + return None + return record + + +def process_sentences(config, + fps: List[Path], + sentences: Dict, + output_dir: Path, + mel_extractor=None, + pitch_extractor=None, + energy_extractor=None, + nprocs: int=1, + cut_sil: bool=True, + spk_emb_dir: Path=None): + if nprocs == 1: + results = [] + for fp in fps: + record = process_sentence(config, fp, sentences, output_dir, + mel_extractor, pitch_extractor, + energy_extractor, cut_sil, spk_emb_dir) + if record: + results.append(record) + else: + with ThreadPoolExecutor(nprocs) as pool: + futures = [] + with tqdm.tqdm(total=len(fps)) as progress: + for fp in fps: + future = pool.submit(process_sentence, config, fp, + sentences, output_dir, mel_extractor, + pitch_extractor, energy_extractor, + cut_sil, spk_emb_dir) + future.add_done_callback(lambda p: progress.update()) + futures.append(future) + + results = [] + for ft in futures: + record = ft.result() + if record: + results.append(record) + + results.sort(key=itemgetter("utt_id")) + with jsonlines.open(output_dir / "metadata.jsonl", 'w') as writer: + for item in results: + writer.write(item) + print("Done") + + +def main(): + # parse config and args + parser = argparse.ArgumentParser( + description="Preprocess audio and then extract features.") + + parser.add_argument( + "--dataset", + default="baker", + type=str, + help="name of dataset, should in {baker, aishell3, ljspeech, vctk} now") + + parser.add_argument( + "--rootdir", default=None, type=str, help="directory to dataset.") + + parser.add_argument( + "--dumpdir", + type=str, + required=True, + help="directory to dump feature files.") + parser.add_argument( + "--dur-file", default=None, type=str, help="path to durations.txt.") + + parser.add_argument("--config", type=str, help="fastspeech2 config file.") + + parser.add_argument( + "--verbose", + type=int, + default=1, + help="logging level. higher is more logging. 
(default=1)") + parser.add_argument( + "--num-cpu", type=int, default=1, help="number of process.") + + parser.add_argument( + "--cut-sil", + type=str2bool, + default=True, + help="whether cut sil in the edge of audio") + + parser.add_argument( + "--spk_emb_dir", + default=None, + type=str, + help="directory to speaker embedding files.") + args = parser.parse_args() + + rootdir = Path(args.rootdir).expanduser() + dumpdir = Path(args.dumpdir).expanduser() + # use absolute path + dumpdir = dumpdir.resolve() + dumpdir.mkdir(parents=True, exist_ok=True) + dur_file = Path(args.dur_file).expanduser() + + if args.spk_emb_dir: + spk_emb_dir = Path(args.spk_emb_dir).expanduser().resolve() + else: + spk_emb_dir = None + + assert rootdir.is_dir() + assert dur_file.is_file() + + with open(args.config, 'rt') as f: + config = CfgNode(yaml.safe_load(f)) + + if args.verbose > 1: + print(vars(args)) + print(config) + + sentences, speaker_set = get_phn_dur(dur_file) + + merge_silence(sentences) + phone_id_map_path = dumpdir / "phone_id_map.txt" + speaker_id_map_path = dumpdir / "speaker_id_map.txt" + get_input_token(sentences, phone_id_map_path, args.dataset) + get_spk_id_map(speaker_set, speaker_id_map_path) + + if args.dataset == "baker": + wav_files = sorted(list((rootdir / "Wave").rglob("*.wav"))) + # split data into 3 sections + num_train = 9800 + num_dev = 100 + train_wav_files = wav_files[:num_train] + dev_wav_files = wav_files[num_train:num_train + num_dev] + test_wav_files = wav_files[num_train + num_dev:] + elif args.dataset == "aishell3": + sub_num_dev = 5 + wav_dir = rootdir / "train" / "wav" + train_wav_files = [] + dev_wav_files = [] + test_wav_files = [] + for speaker in os.listdir(wav_dir): + wav_files = sorted(list((wav_dir / speaker).rglob("*.wav"))) + if len(wav_files) > 100: + train_wav_files += wav_files[:-sub_num_dev * 2] + dev_wav_files += wav_files[-sub_num_dev * 2:-sub_num_dev] + test_wav_files += wav_files[-sub_num_dev:] + else: + train_wav_files += wav_files + + elif args.dataset == "ljspeech": + wav_files = sorted(list((rootdir / "wavs").rglob("*.wav"))) + # split data into 3 sections + num_train = 12900 + num_dev = 100 + train_wav_files = wav_files[:num_train] + dev_wav_files = wav_files[num_train:num_train + num_dev] + test_wav_files = wav_files[num_train + num_dev:] + elif args.dataset == "vctk": + sub_num_dev = 5 + wav_dir = rootdir / "wav48_silence_trimmed" + train_wav_files = [] + dev_wav_files = [] + test_wav_files = [] + for speaker in os.listdir(wav_dir): + wav_files = sorted(list((wav_dir / speaker).rglob("*_mic2.flac"))) + if len(wav_files) > 100: + train_wav_files += wav_files[:-sub_num_dev * 2] + dev_wav_files += wav_files[-sub_num_dev * 2:-sub_num_dev] + test_wav_files += wav_files[-sub_num_dev:] + else: + train_wav_files += wav_files + + else: + print("dataset should in {baker, aishell3, ljspeech, vctk} now!") + + train_dump_dir = dumpdir / "train" / "raw" + train_dump_dir.mkdir(parents=True, exist_ok=True) + dev_dump_dir = dumpdir / "dev" / "raw" + dev_dump_dir.mkdir(parents=True, exist_ok=True) + test_dump_dir = dumpdir / "test" / "raw" + test_dump_dir.mkdir(parents=True, exist_ok=True) + + # Extractor + mel_extractor = LogMelFBank( + sr=config.fs, + n_fft=config.n_fft, + hop_length=config.n_shift, + win_length=config.win_length, + window=config.window, + n_mels=config.n_mels, + fmin=config.fmin, + fmax=config.fmax) + pitch_extractor = Pitch( + sr=config.fs, + hop_length=config.n_shift, + f0min=config.f0min, + f0max=config.f0max) + energy_extractor = Energy( + 
sr=config.fs, + n_fft=config.n_fft, + hop_length=config.n_shift, + win_length=config.win_length, + window=config.window) + + # process for the 3 sections + if train_wav_files: + process_sentences( + config, + train_wav_files, + sentences, + train_dump_dir, + mel_extractor, + pitch_extractor, + energy_extractor, + nprocs=args.num_cpu, + cut_sil=args.cut_sil, + spk_emb_dir=spk_emb_dir) + if dev_wav_files: + process_sentences( + config, + dev_wav_files, + sentences, + dev_dump_dir, + mel_extractor, + pitch_extractor, + energy_extractor, + cut_sil=args.cut_sil, + spk_emb_dir=spk_emb_dir) + if test_wav_files: + process_sentences( + config, + test_wav_files, + sentences, + test_dump_dir, + mel_extractor, + pitch_extractor, + energy_extractor, + nprocs=args.num_cpu, + cut_sil=args.cut_sil, + spk_emb_dir=spk_emb_dir) + + +if __name__ == "__main__": + main() diff --git a/ernie-sat/paddlespeech/t2s/exps/fastspeech2/train.py b/ernie-sat/paddlespeech/t2s/exps/fastspeech2/train.py new file mode 100644 index 0000000000000000000000000000000000000000..10e023d0cbbd60faea4b64ee04f2410a86ffd341 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/exps/fastspeech2/train.py @@ -0,0 +1,212 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
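+# Illustrative invocation (all paths are placeholders; the flags are defined in
+# main() below, and the metadata files are produced by normalize.py):
+#     python3 train.py \
+#         --config=conf/default.yaml \
+#         --train-metadata=dump/train/norm/metadata.jsonl \
+#         --dev-metadata=dump/dev/norm/metadata.jsonl \
+#         --phones-dict=dump/phone_id_map.txt \
+#         --speaker-dict=dump/speaker_id_map.txt \
+#         --output-dir=exp/default \
+#         --ngpu=1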
+import argparse +import logging +import os +import shutil +from pathlib import Path + +import jsonlines +import numpy as np +import paddle +import yaml +from paddle import DataParallel +from paddle import distributed as dist +from paddle.io import DataLoader +from paddle.io import DistributedBatchSampler +from yacs.config import CfgNode + +from paddlespeech.t2s.datasets.am_batch_fn import fastspeech2_multi_spk_batch_fn +from paddlespeech.t2s.datasets.am_batch_fn import fastspeech2_single_spk_batch_fn +from paddlespeech.t2s.datasets.data_table import DataTable +from paddlespeech.t2s.models.fastspeech2 import FastSpeech2 +from paddlespeech.t2s.models.fastspeech2 import FastSpeech2Evaluator +from paddlespeech.t2s.models.fastspeech2 import FastSpeech2Updater +from paddlespeech.t2s.training.extensions.snapshot import Snapshot +from paddlespeech.t2s.training.extensions.visualizer import VisualDL +from paddlespeech.t2s.training.optimizer import build_optimizers +from paddlespeech.t2s.training.seeding import seed_everything +from paddlespeech.t2s.training.trainer import Trainer +from paddlespeech.t2s.utils import str2bool + + +def train_sp(args, config): + # decides device type and whether to run in parallel + # setup running environment correctly + if (not paddle.is_compiled_with_cuda()) or args.ngpu == 0: + paddle.set_device("cpu") + else: + paddle.set_device("gpu") + world_size = paddle.distributed.get_world_size() + if world_size > 1: + paddle.distributed.init_parallel_env() + + # set the random seed, it is a must for multiprocess training + seed_everything(config.seed) + + print( + f"rank: {dist.get_rank()}, pid: {os.getpid()}, parent_pid: {os.getppid()}", + ) + fields = [ + "text", "text_lengths", "speech", "speech_lengths", "durations", + "pitch", "energy" + ] + converters = {"speech": np.load, "pitch": np.load, "energy": np.load} + spk_num = None + if args.speaker_dict is not None: + print("multiple speaker fastspeech2!") + collate_fn = fastspeech2_multi_spk_batch_fn + with open(args.speaker_dict, 'rt') as f: + spk_id = [line.strip().split() for line in f.readlines()] + spk_num = len(spk_id) + fields += ["spk_id"] + elif args.voice_cloning: + print("Training voice cloning!") + collate_fn = fastspeech2_multi_spk_batch_fn + fields += ["spk_emb"] + converters["spk_emb"] = np.load + else: + print("single speaker fastspeech2!") + collate_fn = fastspeech2_single_spk_batch_fn + print("spk_num:", spk_num) + + # dataloader has been too verbose + logging.getLogger("DataLoader").disabled = True + + # construct dataset for training and validation + with jsonlines.open(args.train_metadata, 'r') as reader: + train_metadata = list(reader) + train_dataset = DataTable( + data=train_metadata, + fields=fields, + converters=converters, ) + with jsonlines.open(args.dev_metadata, 'r') as reader: + dev_metadata = list(reader) + dev_dataset = DataTable( + data=dev_metadata, + fields=fields, + converters=converters, ) + + # collate function and dataloader + + train_sampler = DistributedBatchSampler( + train_dataset, + batch_size=config.batch_size, + shuffle=True, + drop_last=True) + + print("samplers done!") + + train_dataloader = DataLoader( + train_dataset, + batch_sampler=train_sampler, + collate_fn=collate_fn, + num_workers=config.num_workers) + + dev_dataloader = DataLoader( + dev_dataset, + shuffle=False, + drop_last=False, + batch_size=config.batch_size, + collate_fn=collate_fn, + num_workers=config.num_workers) + print("dataloaders done!") + + with open(args.phones_dict, "r") as f: + phn_id = 
[line.strip().split() for line in f.readlines()] + vocab_size = len(phn_id) + print("vocab_size:", vocab_size) + + odim = config.n_mels + model = FastSpeech2( + idim=vocab_size, odim=odim, spk_num=spk_num, **config["model"]) + if world_size > 1: + model = DataParallel(model) + print("model done!") + + optimizer = build_optimizers(model, **config["optimizer"]) + print("optimizer done!") + + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + if dist.get_rank() == 0: + config_name = args.config.split("/")[-1] + # copy conf to output_dir + shutil.copyfile(args.config, output_dir / config_name) + + updater = FastSpeech2Updater( + model=model, + optimizer=optimizer, + dataloader=train_dataloader, + output_dir=output_dir, + **config["updater"]) + + trainer = Trainer(updater, (config.max_epoch, 'epoch'), output_dir) + + evaluator = FastSpeech2Evaluator( + model, dev_dataloader, output_dir=output_dir, **config["updater"]) + + if dist.get_rank() == 0: + trainer.extend(evaluator, trigger=(1, "epoch")) + trainer.extend(VisualDL(output_dir), trigger=(1, "iteration")) + trainer.extend( + Snapshot(max_size=config.num_snapshots), trigger=(1, 'epoch')) + trainer.run() + + +def main(): + # parse args and config and redirect to train_sp + parser = argparse.ArgumentParser(description="Train a FastSpeech2 model.") + parser.add_argument("--config", type=str, help="fastspeech2 config file.") + parser.add_argument("--train-metadata", type=str, help="training data.") + parser.add_argument("--dev-metadata", type=str, help="dev data.") + parser.add_argument("--output-dir", type=str, help="output dir.") + parser.add_argument( + "--ngpu", type=int, default=1, help="if ngpu=0, use cpu.") + parser.add_argument( + "--phones-dict", type=str, default=None, help="phone vocabulary file.") + parser.add_argument( + "--speaker-dict", + type=str, + default=None, + help="speaker id map file for multiple speaker model.") + + parser.add_argument( + "--voice-cloning", + type=str2bool, + default=False, + help="whether training voice cloning model.") + + args = parser.parse_args() + + with open(args.config) as f: + config = CfgNode(yaml.safe_load(f)) + + print("========Args========") + print(yaml.safe_dump(vars(args))) + print("========Config========") + print(config) + print( + f"master see the word size: {dist.get_world_size()}, from pid: {os.getpid()}" + ) + + # dispatch + if args.ngpu > 1: + dist.spawn(train_sp, (args, config), nprocs=args.ngpu) + else: + train_sp(args, config) + + +if __name__ == "__main__": + main() diff --git a/ernie-sat/paddlespeech/t2s/exps/gan_vocoder/README.md b/ernie-sat/paddlespeech/t2s/exps/gan_vocoder/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3109be17b6a2b43f494ea4b79cbe84a0365e7129 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/exps/gan_vocoder/README.md @@ -0,0 +1 @@ +different GAN Vocoders have the same preprocess.py and normalize.py diff --git a/ernie-sat/paddlespeech/t2s/exps/gan_vocoder/__init__.py b/ernie-sat/paddlespeech/t2s/exps/gan_vocoder/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..abf198b97e6e818e1fbe59006f98492640bcee54 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/exps/gan_vocoder/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/ernie-sat/paddlespeech/t2s/exps/gan_vocoder/hifigan/__init__.py b/ernie-sat/paddlespeech/t2s/exps/gan_vocoder/hifigan/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..185a92b8d94d3426d616c0624f0f2ee04339349e --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/exps/gan_vocoder/hifigan/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/ernie-sat/paddlespeech/t2s/exps/gan_vocoder/hifigan/train.py b/ernie-sat/paddlespeech/t2s/exps/gan_vocoder/hifigan/train.py new file mode 100644 index 0000000000000000000000000000000000000000..c70821e78fe6e4063d74e8c5608ede225ed1b230 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/exps/gan_vocoder/hifigan/train.py @@ -0,0 +1,275 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import argparse +import logging +import os +import shutil +from pathlib import Path + +import jsonlines +import numpy as np +import paddle +import yaml +from paddle import DataParallel +from paddle import distributed as dist +from paddle import nn +from paddle.io import DataLoader +from paddle.io import DistributedBatchSampler +from paddle.optimizer import Adam +from paddle.optimizer.lr import MultiStepDecay +from yacs.config import CfgNode + +from paddlespeech.t2s.datasets.data_table import DataTable +from paddlespeech.t2s.datasets.vocoder_batch_fn import Clip +from paddlespeech.t2s.models.hifigan import HiFiGANEvaluator +from paddlespeech.t2s.models.hifigan import HiFiGANGenerator +from paddlespeech.t2s.models.hifigan import HiFiGANMultiScaleMultiPeriodDiscriminator +from paddlespeech.t2s.models.hifigan import HiFiGANUpdater +from paddlespeech.t2s.modules.losses import DiscriminatorAdversarialLoss +from paddlespeech.t2s.modules.losses import FeatureMatchLoss +from paddlespeech.t2s.modules.losses import GeneratorAdversarialLoss +from paddlespeech.t2s.modules.losses import MelSpectrogramLoss +from paddlespeech.t2s.training.extensions.snapshot import Snapshot +from paddlespeech.t2s.training.extensions.visualizer import VisualDL +from paddlespeech.t2s.training.seeding import seed_everything +from paddlespeech.t2s.training.trainer import Trainer + + +def train_sp(args, config): + # decides device type and whether to run in parallel + # setup running environment correctly + world_size = paddle.distributed.get_world_size() + if (not paddle.is_compiled_with_cuda()) or args.ngpu == 0: + paddle.set_device("cpu") + else: + paddle.set_device("gpu") + if world_size > 1: + paddle.distributed.init_parallel_env() + + # set the random seed, it is a must for multiprocess training + seed_everything(config.seed) + + print( + f"rank: {dist.get_rank()}, pid: {os.getpid()}, parent_pid: {os.getppid()}", + ) + + # dataloader has been too verbose + logging.getLogger("DataLoader").disabled = True + + # construct dataset for training and validation + with jsonlines.open(args.train_metadata, 'r') as reader: + train_metadata = list(reader) + train_dataset = DataTable( + data=train_metadata, + fields=["wave", "feats"], + converters={ + "wave": np.load, + "feats": np.load, + }, ) + with jsonlines.open(args.dev_metadata, 'r') as reader: + dev_metadata = list(reader) + dev_dataset = DataTable( + data=dev_metadata, + fields=["wave", "feats"], + converters={ + "wave": np.load, + "feats": np.load, + }, ) + + # collate function and dataloader + train_sampler = DistributedBatchSampler( + train_dataset, + batch_size=config.batch_size, + shuffle=True, + drop_last=True) + dev_sampler = DistributedBatchSampler( + dev_dataset, + batch_size=config.batch_size, + shuffle=False, + drop_last=False) + print("samplers done!") + + if "aux_context_window" in config.generator_params: + aux_context_window = config.generator_params.aux_context_window + else: + aux_context_window = 0 + train_batch_fn = Clip( + batch_max_steps=config.batch_max_steps, + hop_size=config.n_shift, + aux_context_window=aux_context_window) + + train_dataloader = DataLoader( + train_dataset, + batch_sampler=train_sampler, + collate_fn=train_batch_fn, + num_workers=config.num_workers) + + dev_dataloader = DataLoader( + dev_dataset, + batch_sampler=dev_sampler, + collate_fn=train_batch_fn, + num_workers=config.num_workers) + print("dataloaders done!") + + generator = HiFiGANGenerator(**config["generator_params"]) + discriminator = 
HiFiGANMultiScaleMultiPeriodDiscriminator( + **config["discriminator_params"]) + if world_size > 1: + generator = DataParallel(generator) + discriminator = DataParallel(discriminator) + print("models done!") + + criterion_feat_match = FeatureMatchLoss(**config["feat_match_loss_params"]) + criterion_mel = MelSpectrogramLoss( + fs=config.fs, + fft_size=config.n_fft, + hop_size=config.n_shift, + win_length=config.win_length, + window=config.window, + num_mels=config.n_mels, + fmin=config.fmin, + fmax=config.fmax, ) + criterion_gen_adv = GeneratorAdversarialLoss( + **config["generator_adv_loss_params"]) + criterion_dis_adv = DiscriminatorAdversarialLoss( + **config["discriminator_adv_loss_params"]) + print("criterions done!") + + lr_schedule_g = MultiStepDecay(**config["generator_scheduler_params"]) + # Compared to multi_band_melgan.v1 config, Adam optimizer without gradient norm is used + generator_grad_norm = config["generator_grad_norm"] + gradient_clip_g = nn.ClipGradByGlobalNorm( + generator_grad_norm) if generator_grad_norm > 0 else None + print("gradient_clip_g:", gradient_clip_g) + + optimizer_g = Adam( + learning_rate=lr_schedule_g, + grad_clip=gradient_clip_g, + parameters=generator.parameters(), + **config["generator_optimizer_params"]) + lr_schedule_d = MultiStepDecay(**config["discriminator_scheduler_params"]) + discriminator_grad_norm = config["discriminator_grad_norm"] + gradient_clip_d = nn.ClipGradByGlobalNorm( + discriminator_grad_norm) if discriminator_grad_norm > 0 else None + print("gradient_clip_d:", gradient_clip_d) + optimizer_d = Adam( + learning_rate=lr_schedule_d, + grad_clip=gradient_clip_d, + parameters=discriminator.parameters(), + **config["discriminator_optimizer_params"]) + print("optimizers done!") + + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + if dist.get_rank() == 0: + config_name = args.config.split("/")[-1] + # copy conf to output_dir + shutil.copyfile(args.config, output_dir / config_name) + + updater = HiFiGANUpdater( + models={ + "generator": generator, + "discriminator": discriminator, + }, + optimizers={ + "generator": optimizer_g, + "discriminator": optimizer_d, + }, + criterions={ + "mel": criterion_mel, + "feat_match": criterion_feat_match, + "gen_adv": criterion_gen_adv, + "dis_adv": criterion_dis_adv, + }, + schedulers={ + "generator": lr_schedule_g, + "discriminator": lr_schedule_d, + }, + dataloader=train_dataloader, + discriminator_train_start_steps=config.discriminator_train_start_steps, + # only hifigan have generator_train_start_steps + generator_train_start_steps=config.generator_train_start_steps, + lambda_adv=config.lambda_adv, + lambda_aux=config.lambda_aux, + lambda_feat_match=config.lambda_feat_match, + output_dir=output_dir) + + evaluator = HiFiGANEvaluator( + models={ + "generator": generator, + "discriminator": discriminator, + }, + criterions={ + "mel": criterion_mel, + "feat_match": criterion_feat_match, + "gen_adv": criterion_gen_adv, + "dis_adv": criterion_dis_adv, + }, + dataloader=dev_dataloader, + lambda_adv=config.lambda_adv, + lambda_aux=config.lambda_aux, + lambda_feat_match=config.lambda_feat_match, + output_dir=output_dir) + + trainer = Trainer( + updater, + stop_trigger=(config.train_max_steps, "iteration"), + out=output_dir) + + if dist.get_rank() == 0: + trainer.extend( + evaluator, trigger=(config.eval_interval_steps, 'iteration')) + trainer.extend(VisualDL(output_dir), trigger=(1, 'iteration')) + trainer.extend( + Snapshot(max_size=config.num_snapshots), + 
trigger=(config.save_interval_steps, 'iteration')) + + print("Trainer Done!") + trainer.run() + + +def main(): + # parse args and config and redirect to train_sp + + parser = argparse.ArgumentParser(description="Train a HiFiGAN model.") + parser.add_argument( + "--config", type=str, help="config file to overwrite default config.") + parser.add_argument("--train-metadata", type=str, help="training data.") + parser.add_argument("--dev-metadata", type=str, help="dev data.") + parser.add_argument("--output-dir", type=str, help="output dir.") + parser.add_argument( + "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") + + args = parser.parse_args() + + with open(args.config, 'rt') as f: + config = CfgNode(yaml.safe_load(f)) + + print("========Args========") + print(yaml.safe_dump(vars(args))) + print("========Config========") + print(config) + print( + f"master see the word size: {dist.get_world_size()}, from pid: {os.getpid()}" + ) + + # dispatch + if args.ngpu > 1: + dist.spawn(train_sp, (args, config), nprocs=args.ngpu) + else: + train_sp(args, config) + + +if __name__ == "__main__": + main() diff --git a/ernie-sat/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/__init__.py b/ernie-sat/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..185a92b8d94d3426d616c0624f0f2ee04339349e --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/ernie-sat/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/train.py b/ernie-sat/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/train.py new file mode 100644 index 0000000000000000000000000000000000000000..27ffded63b3621c0f2110815b27fd4420ef0bc5a --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/train.py @@ -0,0 +1,264 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import argparse +import logging +import os +import shutil +from pathlib import Path + +import jsonlines +import numpy as np +import paddle +import yaml +from paddle import DataParallel +from paddle import distributed as dist +from paddle import nn +from paddle.io import DataLoader +from paddle.io import DistributedBatchSampler +from paddle.optimizer import Adam +from paddle.optimizer.lr import MultiStepDecay +from yacs.config import CfgNode + +from paddlespeech.t2s.datasets.data_table import DataTable +from paddlespeech.t2s.datasets.vocoder_batch_fn import Clip +from paddlespeech.t2s.models.melgan import MBMelGANEvaluator +from paddlespeech.t2s.models.melgan import MBMelGANUpdater +from paddlespeech.t2s.models.melgan import MelGANGenerator +from paddlespeech.t2s.models.melgan import MelGANMultiScaleDiscriminator +from paddlespeech.t2s.modules.losses import DiscriminatorAdversarialLoss +from paddlespeech.t2s.modules.losses import GeneratorAdversarialLoss +from paddlespeech.t2s.modules.losses import MultiResolutionSTFTLoss +from paddlespeech.t2s.modules.pqmf import PQMF +from paddlespeech.t2s.training.extensions.snapshot import Snapshot +from paddlespeech.t2s.training.extensions.visualizer import VisualDL +from paddlespeech.t2s.training.seeding import seed_everything +from paddlespeech.t2s.training.trainer import Trainer + + +def train_sp(args, config): + # decides device type and whether to run in parallel + # setup running environment correctly + world_size = paddle.distributed.get_world_size() + if (not paddle.is_compiled_with_cuda()) or args.ngpu == 0: + paddle.set_device("cpu") + else: + paddle.set_device("gpu") + if world_size > 1: + paddle.distributed.init_parallel_env() + + # set the random seed, it is a must for multiprocess training + seed_everything(config.seed) + + print( + f"rank: {dist.get_rank()}, pid: {os.getpid()}, parent_pid: {os.getppid()}", + ) + + # dataloader has been too verbose + logging.getLogger("DataLoader").disabled = True + + # construct dataset for training and validation + with jsonlines.open(args.train_metadata, 'r') as reader: + train_metadata = list(reader) + train_dataset = DataTable( + data=train_metadata, + fields=["wave", "feats"], + converters={ + "wave": np.load, + "feats": np.load, + }, ) + with jsonlines.open(args.dev_metadata, 'r') as reader: + dev_metadata = list(reader) + dev_dataset = DataTable( + data=dev_metadata, + fields=["wave", "feats"], + converters={ + "wave": np.load, + "feats": np.load, + }, ) + + # collate function and dataloader + train_sampler = DistributedBatchSampler( + train_dataset, + batch_size=config.batch_size, + shuffle=True, + drop_last=True) + dev_sampler = DistributedBatchSampler( + dev_dataset, + batch_size=config.batch_size, + shuffle=False, + drop_last=False) + print("samplers done!") + + if "aux_context_window" in config.generator_params: + aux_context_window = config.generator_params.aux_context_window + else: + aux_context_window = 0 + train_batch_fn = Clip( + batch_max_steps=config.batch_max_steps, + hop_size=config.n_shift, + aux_context_window=aux_context_window) + + train_dataloader = DataLoader( + train_dataset, + batch_sampler=train_sampler, + collate_fn=train_batch_fn, + num_workers=config.num_workers) + + dev_dataloader = DataLoader( + dev_dataset, + batch_sampler=dev_sampler, + collate_fn=train_batch_fn, + num_workers=config.num_workers) + print("dataloaders done!") + + generator = MelGANGenerator(**config["generator_params"]) + discriminator = MelGANMultiScaleDiscriminator( + 
**config["discriminator_params"]) + if world_size > 1: + generator = DataParallel(generator) + discriminator = DataParallel(discriminator) + print("models done!") + criterion_stft = MultiResolutionSTFTLoss(**config["stft_loss_params"]) + criterion_sub_stft = MultiResolutionSTFTLoss( + **config["subband_stft_loss_params"]) + criterion_gen_adv = GeneratorAdversarialLoss() + criterion_dis_adv = DiscriminatorAdversarialLoss() + # define special module for subband processing + criterion_pqmf = PQMF(subbands=config["generator_params"]["out_channels"]) + print("criterions done!") + + lr_schedule_g = MultiStepDecay(**config["generator_scheduler_params"]) + # Compared to multi_band_melgan.v1 config, Adam optimizer without gradient norm is used + generator_grad_norm = config["generator_grad_norm"] + gradient_clip_g = nn.ClipGradByGlobalNorm( + generator_grad_norm) if generator_grad_norm > 0 else None + print("gradient_clip_g:", gradient_clip_g) + + optimizer_g = Adam( + learning_rate=lr_schedule_g, + grad_clip=gradient_clip_g, + parameters=generator.parameters(), + **config["generator_optimizer_params"]) + lr_schedule_d = MultiStepDecay(**config["discriminator_scheduler_params"]) + discriminator_grad_norm = config["discriminator_grad_norm"] + gradient_clip_d = nn.ClipGradByGlobalNorm( + discriminator_grad_norm) if discriminator_grad_norm > 0 else None + print("gradient_clip_d:", gradient_clip_d) + optimizer_d = Adam( + learning_rate=lr_schedule_d, + grad_clip=gradient_clip_d, + parameters=discriminator.parameters(), + **config["discriminator_optimizer_params"]) + print("optimizers done!") + + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + if dist.get_rank() == 0: + config_name = args.config.split("/")[-1] + # copy conf to output_dir + shutil.copyfile(args.config, output_dir / config_name) + + updater = MBMelGANUpdater( + models={ + "generator": generator, + "discriminator": discriminator, + }, + optimizers={ + "generator": optimizer_g, + "discriminator": optimizer_d, + }, + criterions={ + "stft": criterion_stft, + "sub_stft": criterion_sub_stft, + "gen_adv": criterion_gen_adv, + "dis_adv": criterion_dis_adv, + "pqmf": criterion_pqmf + }, + schedulers={ + "generator": lr_schedule_g, + "discriminator": lr_schedule_d, + }, + dataloader=train_dataloader, + discriminator_train_start_steps=config.discriminator_train_start_steps, + lambda_adv=config.lambda_adv, + output_dir=output_dir) + + evaluator = MBMelGANEvaluator( + models={ + "generator": generator, + "discriminator": discriminator, + }, + criterions={ + "stft": criterion_stft, + "sub_stft": criterion_sub_stft, + "gen_adv": criterion_gen_adv, + "dis_adv": criterion_dis_adv, + "pqmf": criterion_pqmf + }, + dataloader=dev_dataloader, + lambda_adv=config.lambda_adv, + output_dir=output_dir) + + trainer = Trainer( + updater, + stop_trigger=(config.train_max_steps, "iteration"), + out=output_dir) + + if dist.get_rank() == 0: + trainer.extend( + evaluator, trigger=(config.eval_interval_steps, 'iteration')) + trainer.extend(VisualDL(output_dir), trigger=(1, 'iteration')) + trainer.extend( + Snapshot(max_size=config.num_snapshots), + trigger=(config.save_interval_steps, 'iteration')) + + print("Trainer Done!") + trainer.run() + + +def main(): + # parse args and config and redirect to train_sp + + parser = argparse.ArgumentParser( + description="Train a Multi-Band MelGAN model.") + parser.add_argument( + "--config", type=str, help="config file to overwrite default config.") + parser.add_argument("--train-metadata", 
type=str, help="training data.") + parser.add_argument("--dev-metadata", type=str, help="dev data.") + parser.add_argument("--output-dir", type=str, help="output dir.") + parser.add_argument( + "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") + + args = parser.parse_args() + + with open(args.config, 'rt') as f: + config = CfgNode(yaml.safe_load(f)) + + print("========Args========") + print(yaml.safe_dump(vars(args))) + print("========Config========") + print(config) + print( + f"master see the word size: {dist.get_world_size()}, from pid: {os.getpid()}" + ) + + # dispatch + if args.ngpu > 1: + dist.spawn(train_sp, (args, config), nprocs=args.ngpu) + else: + train_sp(args, config) + + +if __name__ == "__main__": + main() diff --git a/ernie-sat/paddlespeech/t2s/exps/gan_vocoder/normalize.py b/ernie-sat/paddlespeech/t2s/exps/gan_vocoder/normalize.py new file mode 100644 index 0000000000000000000000000000000000000000..ba95d3ed61e341ebc458846a1f79099066c2cc7a --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/exps/gan_vocoder/normalize.py @@ -0,0 +1,133 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Normalize feature files and dump them.""" +import argparse +import logging +from operator import itemgetter +from pathlib import Path + +import jsonlines +import numpy as np +from sklearn.preprocessing import StandardScaler +from tqdm import tqdm + +from paddlespeech.t2s.datasets.data_table import DataTable + + +def main(): + """Run preprocessing process.""" + parser = argparse.ArgumentParser( + description="Normalize dumped raw features.") + parser.add_argument( + "--metadata", + type=str, + required=True, + help="directory including feature files to be normalized. " + "you need to specify either *-scp or rootdir.") + parser.add_argument( + "--dumpdir", + type=str, + required=True, + help="directory to dump normalized feature files.") + parser.add_argument( + "--stats", type=str, required=True, help="statistics file.") + parser.add_argument( + "--skip-wav-copy", + default=False, + action="store_true", + help="whether to skip the copy of wav files.") + parser.add_argument( + "--verbose", + type=int, + default=1, + help="logging level. higher is more logging. 
(default=1)") + args = parser.parse_args() + + # set logger + if args.verbose > 1: + logging.basicConfig( + level=logging.DEBUG, + format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s" + ) + elif args.verbose > 0: + logging.basicConfig( + level=logging.INFO, + format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s" + ) + else: + logging.basicConfig( + level=logging.WARN, + format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s" + ) + logging.warning('Skip DEBUG/INFO messages') + + dumpdir = Path(args.dumpdir).expanduser() + # use absolute path + dumpdir = dumpdir.resolve() + dumpdir.mkdir(parents=True, exist_ok=True) + + # get dataset + with jsonlines.open(args.metadata, 'r') as reader: + metadata = list(reader) + dataset = DataTable( + metadata, + fields=["utt_id", "wave", "feats"], + converters={ + 'utt_id': None, + 'wave': None if args.skip_wav_copy else np.load, + 'feats': np.load, + }) + logging.info(f"The number of files = {len(dataset)}.") + + # restore scaler + scaler = StandardScaler() + scaler.mean_ = np.load(args.stats)[0] + scaler.scale_ = np.load(args.stats)[1] + + # from version 0.23.0, this information is needed + scaler.n_features_in_ = scaler.mean_.shape[0] + + # process each file + output_metadata = [] + + for item in tqdm(dataset): + utt_id = item['utt_id'] + wave = item['wave'] + mel = item['feats'] + # normalize + mel = scaler.transform(mel) + + # save + mel_path = dumpdir / f"{utt_id}_feats.npy" + np.save(mel_path, mel.astype(np.float32), allow_pickle=False) + if not args.skip_wav_copy: + wav_path = dumpdir / f"{utt_id}_wave.npy" + np.save(wav_path, wave.astype(np.float32), allow_pickle=False) + else: + wav_path = wave + output_metadata.append({ + 'utt_id': utt_id, + 'wave': str(wav_path), + 'feats': str(mel_path), + }) + output_metadata.sort(key=itemgetter('utt_id')) + output_metadata_path = Path(args.dumpdir) / "metadata.jsonl" + with jsonlines.open(output_metadata_path, 'w') as writer: + for item in output_metadata: + writer.write(item) + logging.info(f"metadata dumped into {output_metadata_path}") + + +if __name__ == "__main__": + main() diff --git a/ernie-sat/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/__init__.py b/ernie-sat/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..185a92b8d94d3426d616c0624f0f2ee04339349e --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/ernie-sat/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/synthesize_from_wav.py b/ernie-sat/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/synthesize_from_wav.py new file mode 100644 index 0000000000000000000000000000000000000000..def30e67a5ec9359f81d7bcceaced2e42611ff28 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/synthesize_from_wav.py @@ -0,0 +1,118 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse +import logging +import os +from pathlib import Path + +import librosa +import numpy as np +import paddle +import soundfile as sf +import yaml +from yacs.config import CfgNode + +from paddlespeech.t2s.datasets.get_feats import LogMelFBank +from paddlespeech.t2s.models.parallel_wavegan import PWGGenerator +from paddlespeech.t2s.models.parallel_wavegan import PWGInference +from paddlespeech.t2s.modules.normalizer import ZScore + + +def evaluate(args, config): + # dataloader has been too verbose + logging.getLogger("DataLoader").disabled = True + + vocoder = PWGGenerator(**config["generator_params"]) + state_dict = paddle.load(args.checkpoint) + vocoder.set_state_dict(state_dict["generator_params"]) + vocoder.remove_weight_norm() + vocoder.eval() + print("model done!") + + stat = np.load(args.stat) + mu, std = stat + mu = paddle.to_tensor(mu) + std = paddle.to_tensor(std) + normalizer = ZScore(mu, std) + + pwg_inference = PWGInference(normalizer, vocoder) + + input_dir = Path(args.input_dir) + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + mel_extractor = LogMelFBank( + sr=config.fs, + n_fft=config.n_fft, + hop_length=config.n_shift, + win_length=config.win_length, + window=config.window, + n_mels=config.n_mels, + fmin=config.fmin, + fmax=config.fmax) + + for utt_name in os.listdir(input_dir): + wav, _ = librosa.load(str(input_dir / utt_name), sr=config.fs) + # extract mel feats + mel = mel_extractor.get_log_mel_fbank(wav) + mel = paddle.to_tensor(mel) + with paddle.no_grad(): + gen_wav = pwg_inference(mel) + sf.write( + str(output_dir / ("gen_" + utt_name)), + gen_wav.numpy(), + samplerate=config.fs) + print(f"{utt_name} done!") + + +def main(): + # parse args and config and redirect to train_sp + parser = argparse.ArgumentParser( + description="Synthesize with parallel wavegan.") + + parser.add_argument( + "--config", type=str, help="parallel wavegan config file.") + parser.add_argument("--checkpoint", type=str, help="snapshot to load.") + parser.add_argument( + "--stat", + type=str, + help="mean and standard deviation used to normalize spectrogram when training parallel wavegan." 
+ ) + parser.add_argument("--input-dir", type=str, help="input dir of wavs.") + parser.add_argument("--output-dir", type=str, help="output dir.") + parser.add_argument( + "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") + + args = parser.parse_args() + + if args.ngpu == 0: + paddle.set_device("cpu") + elif args.ngpu > 0: + paddle.set_device("gpu") + else: + print("ngpu should >= 0 !") + + with open(args.config) as f: + config = CfgNode(yaml.safe_load(f)) + + print("========Args========") + print(yaml.safe_dump(vars(args))) + print("========Config========") + print(config) + + evaluate(args, config) + + +if __name__ == "__main__": + main() diff --git a/ernie-sat/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/train.py b/ernie-sat/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/train.py new file mode 100644 index 0000000000000000000000000000000000000000..92de7a2c4e7a04ed28b7b30dfa47be4796acc93f --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/train.py @@ -0,0 +1,264 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse +import logging +import os +import shutil +from pathlib import Path + +import jsonlines +import numpy as np +import paddle +import yaml +from paddle import DataParallel +from paddle import distributed as dist +from paddle import nn +from paddle.io import DataLoader +from paddle.io import DistributedBatchSampler +from paddle.optimizer import Adam # No RAdaom +from paddle.optimizer.lr import StepDecay +from yacs.config import CfgNode + +from paddlespeech.t2s.datasets.data_table import DataTable +from paddlespeech.t2s.datasets.vocoder_batch_fn import Clip +from paddlespeech.t2s.models.parallel_wavegan import PWGDiscriminator +from paddlespeech.t2s.models.parallel_wavegan import PWGEvaluator +from paddlespeech.t2s.models.parallel_wavegan import PWGGenerator +from paddlespeech.t2s.models.parallel_wavegan import PWGUpdater +from paddlespeech.t2s.modules.losses import MultiResolutionSTFTLoss +from paddlespeech.t2s.training.extensions.snapshot import Snapshot +from paddlespeech.t2s.training.extensions.visualizer import VisualDL +from paddlespeech.t2s.training.seeding import seed_everything +from paddlespeech.t2s.training.trainer import Trainer +from paddlespeech.t2s.utils import str2bool + + +def train_sp(args, config): + # decides device type and whether to run in parallel + # setup running environment correctly + world_size = paddle.distributed.get_world_size() + if (not paddle.is_compiled_with_cuda()) or args.ngpu == 0: + paddle.set_device("cpu") + else: + paddle.set_device("gpu") + if world_size > 1: + paddle.distributed.init_parallel_env() + + # set the random seed, it is a must for multiprocess training + seed_everything(config.seed) + + print( + f"rank: {dist.get_rank()}, pid: {os.getpid()}, parent_pid: {os.getppid()}", + ) + + # dataloader has been too verbose + logging.getLogger("DataLoader").disabled = True + + # construct dataset for training and 
validation + with jsonlines.open(args.train_metadata, 'r') as reader: + train_metadata = list(reader) + train_dataset = DataTable( + data=train_metadata, + fields=["wave", "feats"], + converters={ + "wave": np.load, + "feats": np.load, + }, ) + with jsonlines.open(args.dev_metadata, 'r') as reader: + dev_metadata = list(reader) + dev_dataset = DataTable( + data=dev_metadata, + fields=["wave", "feats"], + converters={ + "wave": np.load, + "feats": np.load, + }, ) + + # collate function and dataloader + train_sampler = DistributedBatchSampler( + train_dataset, + batch_size=config.batch_size, + shuffle=True, + drop_last=True) + dev_sampler = DistributedBatchSampler( + dev_dataset, + batch_size=config.batch_size, + shuffle=False, + drop_last=False) + print("samplers done!") + + train_batch_fn = Clip( + batch_max_steps=config.batch_max_steps, + hop_size=config.n_shift, + aux_context_window=config.generator_params.aux_context_window) + + train_dataloader = DataLoader( + train_dataset, + batch_sampler=train_sampler, + collate_fn=train_batch_fn, + num_workers=config.num_workers) + + dev_dataloader = DataLoader( + dev_dataset, + batch_sampler=dev_sampler, + collate_fn=train_batch_fn, + num_workers=config.num_workers) + print("dataloaders done!") + + generator = PWGGenerator(**config["generator_params"]) + discriminator = PWGDiscriminator(**config["discriminator_params"]) + if world_size > 1: + generator = DataParallel(generator) + discriminator = DataParallel(discriminator) + print("models done!") + + criterion_stft = MultiResolutionSTFTLoss(**config["stft_loss_params"]) + criterion_mse = nn.MSELoss() + print("criterions done!") + + lr_schedule_g = StepDecay(**config["generator_scheduler_params"]) + gradient_clip_g = nn.ClipGradByGlobalNorm(config["generator_grad_norm"]) + optimizer_g = Adam( + learning_rate=lr_schedule_g, + grad_clip=gradient_clip_g, + parameters=generator.parameters(), + **config["generator_optimizer_params"]) + lr_schedule_d = StepDecay(**config["discriminator_scheduler_params"]) + gradient_clip_d = nn.ClipGradByGlobalNorm(config["discriminator_grad_norm"]) + optimizer_d = Adam( + learning_rate=lr_schedule_d, + grad_clip=gradient_clip_d, + parameters=discriminator.parameters(), + **config["discriminator_optimizer_params"]) + print("optimizers done!") + + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + if dist.get_rank() == 0: + config_name = args.config.split("/")[-1] + # copy conf to output_dir + shutil.copyfile(args.config, output_dir / config_name) + + updater = PWGUpdater( + models={ + "generator": generator, + "discriminator": discriminator, + }, + optimizers={ + "generator": optimizer_g, + "discriminator": optimizer_d, + }, + criterions={ + "stft": criterion_stft, + "mse": criterion_mse, + }, + schedulers={ + "generator": lr_schedule_g, + "discriminator": lr_schedule_d, + }, + dataloader=train_dataloader, + discriminator_train_start_steps=config.discriminator_train_start_steps, + lambda_adv=config.lambda_adv, + output_dir=output_dir) + + evaluator = PWGEvaluator( + models={ + "generator": generator, + "discriminator": discriminator, + }, + criterions={ + "stft": criterion_stft, + "mse": criterion_mse, + }, + dataloader=dev_dataloader, + lambda_adv=config.lambda_adv, + output_dir=output_dir) + trainer = Trainer( + updater, + stop_trigger=(config.train_max_steps, "iteration"), + out=output_dir, + profiler_options=args.profiler_options) + + if dist.get_rank() == 0: + trainer.extend( + evaluator, trigger=(config.eval_interval_steps, 
'iteration')) + trainer.extend(VisualDL(output_dir), trigger=(1, 'iteration')) + trainer.extend( + Snapshot(max_size=config.num_snapshots), + trigger=(config.save_interval_steps, 'iteration')) + + print("Trainer Done!") + trainer.run() + + +def main(): + # parse args and config and redirect to train_sp + + parser = argparse.ArgumentParser( + description="Train a ParallelWaveGAN model.") + parser.add_argument( + "--config", type=str, help="config file to overwrite default config.") + parser.add_argument("--train-metadata", type=str, help="training data.") + parser.add_argument("--dev-metadata", type=str, help="dev data.") + parser.add_argument("--output-dir", type=str, help="output dir.") + parser.add_argument( + "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") + + benchmark_group = parser.add_argument_group( + 'benchmark', 'arguments related to benchmark.') + benchmark_group.add_argument( + "--batch-size", type=int, default=8, help="batch size.") + benchmark_group.add_argument( + "--max-iter", type=int, default=400000, help="train max steps.") + + benchmark_group.add_argument( + "--run-benchmark", + type=str2bool, + default=False, + help="runing benchmark or not, if True, use the --batch-size and --max-iter." + ) + benchmark_group.add_argument( + "--profiler_options", + type=str, + default=None, + help="The option of profiler, which should be in format \"key1=value1;key2=value2;key3=value3\"." + ) + + args = parser.parse_args() + + with open(args.config, 'rt') as f: + config = CfgNode(yaml.safe_load(f)) + + # 增加 --batch_size --max_iter 用于 benchmark 调用 + if args.run_benchmark: + config.batch_size = args.batch_size + config.train_max_steps = args.max_iter + + print("========Args========") + print(yaml.safe_dump(vars(args))) + print("========Config========") + print(config) + print( + f"master see the word size: {dist.get_world_size()}, from pid: {os.getpid()}" + ) + + # dispatch + if args.ngpu > 1: + dist.spawn(train_sp, (args, config), nprocs=args.ngpu) + else: + train_sp(args, config) + + +if __name__ == "__main__": + main() diff --git a/ernie-sat/paddlespeech/t2s/exps/gan_vocoder/preprocess.py b/ernie-sat/paddlespeech/t2s/exps/gan_vocoder/preprocess.py new file mode 100644 index 0000000000000000000000000000000000000000..4871bca7130c418444152561975dff19bf629792 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/exps/gan_vocoder/preprocess.py @@ -0,0 +1,292 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
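+"""Preprocess GAN vocoder training data: optionally cut leading/trailing silence, extract log-mel features with LogMelFBank, dump per-utterance wave/feats .npy files and a metadata.jsonl for each of the train/dev/test splits."""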
+import argparse +import os +from concurrent.futures import ThreadPoolExecutor +from operator import itemgetter +from pathlib import Path +from typing import Any +from typing import Dict +from typing import List + +import jsonlines +import librosa +import numpy as np +import tqdm +import yaml +from yacs.config import CfgNode + +from paddlespeech.t2s.datasets.get_feats import LogMelFBank +from paddlespeech.t2s.datasets.preprocess_utils import get_phn_dur +from paddlespeech.t2s.datasets.preprocess_utils import merge_silence +from paddlespeech.t2s.utils import str2bool + + +def process_sentence(config: Dict[str, Any], + fp: Path, + sentences: Dict, + output_dir: Path, + mel_extractor=None, + cut_sil: bool=True): + utt_id = fp.stem + # for vctk + if utt_id.endswith("_mic2"): + utt_id = utt_id[:-5] + record = None + if utt_id in sentences: + # reading, resampling may occur + y, _ = librosa.load(str(fp), sr=config.fs) + if len(y.shape) != 1 or np.abs(y).max() > 1.0: + return record + assert len(y.shape) == 1, f"{utt_id} is not a mono-channel audio." + assert np.abs(y).max( + ) <= 1.0, f"{utt_id} is seems to be different that 16 bit PCM." + phones = sentences[utt_id][0] + durations = sentences[utt_id][1] + speaker = sentences[utt_id][2] + d_cumsum = np.pad(np.array(durations).cumsum(0), (1, 0), 'constant') + # little imprecise than use *.TextGrid directly + times = librosa.frames_to_time( + d_cumsum, sr=config.fs, hop_length=config.n_shift) + if cut_sil: + start = 0 + end = d_cumsum[-1] + if phones[0] == "sil" and len(durations) > 1: + start = times[1] + durations = durations[1:] + phones = phones[1:] + if phones[-1] == 'sil' and len(durations) > 1: + end = times[-2] + durations = durations[:-1] + phones = phones[:-1] + sentences[utt_id][0] = phones + sentences[utt_id][1] = durations + start, end = librosa.time_to_samples([start, end], sr=config.fs) + y = y[start:end] + + # extract mel feats + logmel = mel_extractor.get_log_mel_fbank(y) + + # adjust time to make num_samples == num_frames * hop_length + num_frames = logmel.shape[0] + if y.size < num_frames * config.n_shift: + y = np.pad( + y, (0, num_frames * config.n_shift - y.size), mode="reflect") + else: + y = y[:num_frames * config.n_shift] + num_sample = y.shape[0] + + mel_path = output_dir / (utt_id + "_feats.npy") + wav_path = output_dir / (utt_id + "_wave.npy") + np.save(wav_path, y) # (num_samples, ) + np.save(mel_path, logmel) # (num_frames, n_mels) + record = { + "utt_id": utt_id, + "num_samples": num_sample, + "num_frames": num_frames, + "feats": str(mel_path), + "wave": str(wav_path), + } + return record + + +def process_sentences(config, + fps: List[Path], + sentences: Dict, + output_dir: Path, + mel_extractor=None, + nprocs: int=1, + cut_sil: bool=True): + if nprocs == 1: + results = [] + for fp in tqdm.tqdm(fps, total=len(fps)): + record = process_sentence(config, fp, sentences, output_dir, + mel_extractor, cut_sil) + if record: + results.append(record) + else: + with ThreadPoolExecutor(nprocs) as pool: + futures = [] + with tqdm.tqdm(total=len(fps)) as progress: + for fp in fps: + future = pool.submit(process_sentence, config, fp, + sentences, output_dir, mel_extractor, + cut_sil) + future.add_done_callback(lambda p: progress.update()) + futures.append(future) + + results = [] + for ft in futures: + record = ft.result() + if record: + results.append(record) + + results.sort(key=itemgetter("utt_id")) + with jsonlines.open(output_dir / "metadata.jsonl", 'w') as writer: + for item in results: + writer.write(item) + print("Done") + + 
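+# main() below parses the CLI arguments, splits the chosen corpus (baker / ljspeech / vctk / aishell3) into train/dev/test file lists, and runs process_sentences() on each split with a shared LogMelFBank extractor.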
+def main(): + # parse config and args + parser = argparse.ArgumentParser( + description="Preprocess audio and then extract features .") + parser.add_argument( + "--dataset", + default="baker", + type=str, + help="name of dataset, should in {baker, ljspeech, vctk} now") + parser.add_argument( + "--rootdir", default=None, type=str, help="directory to dataset.") + parser.add_argument( + "--dumpdir", + type=str, + required=True, + help="directory to dump feature files.") + parser.add_argument("--config", type=str, help="vocoder config file.") + parser.add_argument( + "--verbose", + type=int, + default=1, + help="logging level. higher is more logging. (default=1)") + parser.add_argument( + "--num-cpu", type=int, default=1, help="number of process.") + parser.add_argument( + "--dur-file", default=None, type=str, help="path to durations.txt.") + + parser.add_argument( + "--cut-sil", + type=str2bool, + default=True, + help="whether cut sil in the edge of audio") + args = parser.parse_args() + + rootdir = Path(args.rootdir).expanduser() + dumpdir = Path(args.dumpdir).expanduser() + # use absolute path + dumpdir = dumpdir.resolve() + dumpdir.mkdir(parents=True, exist_ok=True) + dur_file = Path(args.dur_file).expanduser() + + assert rootdir.is_dir() + assert dur_file.is_file() + + with open(args.config, 'rt') as f: + config = CfgNode(yaml.safe_load(f)) + + if args.verbose > 1: + print(vars(args)) + print(config) + + sentences, speaker_set = get_phn_dur(dur_file) + merge_silence(sentences) + + # split data into 3 sections + if args.dataset == "baker": + wav_files = sorted(list((rootdir / "Wave").rglob("*.wav"))) + num_train = 9800 + num_dev = 100 + train_wav_files = wav_files[:num_train] + dev_wav_files = wav_files[num_train:num_train + num_dev] + test_wav_files = wav_files[num_train + num_dev:] + + elif args.dataset == "ljspeech": + wav_files = sorted(list((rootdir / "wavs").rglob("*.wav"))) + # split data into 3 sections + num_train = 12900 + num_dev = 100 + train_wav_files = wav_files[:num_train] + dev_wav_files = wav_files[num_train:num_train + num_dev] + test_wav_files = wav_files[num_train + num_dev:] + elif args.dataset == "vctk": + sub_num_dev = 5 + wav_dir = rootdir / "wav48_silence_trimmed" + train_wav_files = [] + dev_wav_files = [] + test_wav_files = [] + for speaker in os.listdir(wav_dir): + wav_files = sorted(list((wav_dir / speaker).rglob("*_mic2.flac"))) + if len(wav_files) > 100: + train_wav_files += wav_files[:-sub_num_dev * 2] + dev_wav_files += wav_files[-sub_num_dev * 2:-sub_num_dev] + test_wav_files += wav_files[-sub_num_dev:] + else: + train_wav_files += wav_files + elif args.dataset == "aishell3": + sub_num_dev = 5 + wav_dir = rootdir / "train" / "wav" + train_wav_files = [] + dev_wav_files = [] + test_wav_files = [] + for speaker in os.listdir(wav_dir): + wav_files = sorted(list((wav_dir / speaker).rglob("*.wav"))) + if len(wav_files) > 100: + train_wav_files += wav_files[:-sub_num_dev * 2] + dev_wav_files += wav_files[-sub_num_dev * 2:-sub_num_dev] + test_wav_files += wav_files[-sub_num_dev:] + else: + train_wav_files += wav_files + else: + print("dataset should in {baker, ljspeech, vctk, aishell3} now!") + + train_dump_dir = dumpdir / "train" / "raw" + train_dump_dir.mkdir(parents=True, exist_ok=True) + dev_dump_dir = dumpdir / "dev" / "raw" + dev_dump_dir.mkdir(parents=True, exist_ok=True) + test_dump_dir = dumpdir / "test" / "raw" + test_dump_dir.mkdir(parents=True, exist_ok=True) + + mel_extractor = LogMelFBank( + sr=config.fs, + n_fft=config.n_fft, + 
hop_length=config.n_shift, + win_length=config.win_length, + window=config.window, + n_mels=config.n_mels, + fmin=config.fmin, + fmax=config.fmax) + + # process for the 3 sections + if train_wav_files: + process_sentences( + config, + train_wav_files, + sentences, + train_dump_dir, + mel_extractor=mel_extractor, + nprocs=args.num_cpu, + cut_sil=args.cut_sil) + if dev_wav_files: + process_sentences( + config, + dev_wav_files, + sentences, + dev_dump_dir, + mel_extractor=mel_extractor, + nprocs=args.num_cpu, + cut_sil=args.cut_sil) + if test_wav_files: + process_sentences( + config, + test_wav_files, + sentences, + test_dump_dir, + mel_extractor=mel_extractor, + nprocs=args.num_cpu, + cut_sil=args.cut_sil) + + +if __name__ == "__main__": + main() diff --git a/ernie-sat/paddlespeech/t2s/exps/gan_vocoder/style_melgan/__init__.py b/ernie-sat/paddlespeech/t2s/exps/gan_vocoder/style_melgan/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..185a92b8d94d3426d616c0624f0f2ee04339349e --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/exps/gan_vocoder/style_melgan/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/ernie-sat/paddlespeech/t2s/exps/gan_vocoder/style_melgan/train.py b/ernie-sat/paddlespeech/t2s/exps/gan_vocoder/style_melgan/train.py new file mode 100644 index 0000000000000000000000000000000000000000..be3ba74251d92cf90be713651837205fa8dc582a --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/exps/gan_vocoder/style_melgan/train.py @@ -0,0 +1,256 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
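+"""Train a StyleMelGAN vocoder: a StyleMelGANGenerator and a StyleMelGANDiscriminator optimized with multi-resolution STFT and adversarial losses."""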
+import argparse +import logging +import os +import shutil +from pathlib import Path + +import jsonlines +import numpy as np +import paddle +import yaml +from paddle import DataParallel +from paddle import distributed as dist +from paddle import nn +from paddle.io import DataLoader +from paddle.io import DistributedBatchSampler +from paddle.optimizer import Adam +from paddle.optimizer.lr import MultiStepDecay +from yacs.config import CfgNode + +from paddlespeech.t2s.datasets.data_table import DataTable +from paddlespeech.t2s.datasets.vocoder_batch_fn import Clip +from paddlespeech.t2s.models.melgan import StyleMelGANDiscriminator +from paddlespeech.t2s.models.melgan import StyleMelGANEvaluator +from paddlespeech.t2s.models.melgan import StyleMelGANGenerator +from paddlespeech.t2s.models.melgan import StyleMelGANUpdater +from paddlespeech.t2s.modules.losses import DiscriminatorAdversarialLoss +from paddlespeech.t2s.modules.losses import GeneratorAdversarialLoss +from paddlespeech.t2s.modules.losses import MultiResolutionSTFTLoss +from paddlespeech.t2s.training.extensions.snapshot import Snapshot +from paddlespeech.t2s.training.extensions.visualizer import VisualDL +from paddlespeech.t2s.training.seeding import seed_everything +from paddlespeech.t2s.training.trainer import Trainer + + +def train_sp(args, config): + # decides device type and whether to run in parallel + # setup running environment correctly + world_size = paddle.distributed.get_world_size() + if (not paddle.is_compiled_with_cuda()) or args.ngpu == 0: + paddle.set_device("cpu") + else: + paddle.set_device("gpu") + if world_size > 1: + paddle.distributed.init_parallel_env() + + # set the random seed, it is a must for multiprocess training + seed_everything(config.seed) + + print( + f"rank: {dist.get_rank()}, pid: {os.getpid()}, parent_pid: {os.getppid()}", + ) + + # dataloader has been too verbose + logging.getLogger("DataLoader").disabled = True + + # construct dataset for training and validation + with jsonlines.open(args.train_metadata, 'r') as reader: + train_metadata = list(reader) + train_dataset = DataTable( + data=train_metadata, + fields=["wave", "feats"], + converters={ + "wave": np.load, + "feats": np.load, + }, ) + with jsonlines.open(args.dev_metadata, 'r') as reader: + dev_metadata = list(reader) + dev_dataset = DataTable( + data=dev_metadata, + fields=["wave", "feats"], + converters={ + "wave": np.load, + "feats": np.load, + }, ) + + # collate function and dataloader + train_sampler = DistributedBatchSampler( + train_dataset, + batch_size=config.batch_size, + shuffle=True, + drop_last=True) + dev_sampler = DistributedBatchSampler( + dev_dataset, + batch_size=config.batch_size, + shuffle=False, + drop_last=False) + print("samplers done!") + + if "aux_context_window" in config.generator_params: + aux_context_window = config.generator_params.aux_context_window + else: + aux_context_window = 0 + train_batch_fn = Clip( + batch_max_steps=config.batch_max_steps, + hop_size=config.n_shift, + aux_context_window=aux_context_window) + + train_dataloader = DataLoader( + train_dataset, + batch_sampler=train_sampler, + collate_fn=train_batch_fn, + num_workers=config.num_workers) + + dev_dataloader = DataLoader( + dev_dataset, + batch_sampler=dev_sampler, + collate_fn=train_batch_fn, + num_workers=config.num_workers) + print("dataloaders done!") + + generator = StyleMelGANGenerator(**config["generator_params"]) + discriminator = StyleMelGANDiscriminator(**config["discriminator_params"]) + if world_size > 1: + generator = 
DataParallel(generator) + discriminator = DataParallel(discriminator) + print("models done!") + criterion_stft = MultiResolutionSTFTLoss(**config["stft_loss_params"]) + + criterion_gen_adv = GeneratorAdversarialLoss( + **config["generator_adv_loss_params"]) + criterion_dis_adv = DiscriminatorAdversarialLoss( + **config["discriminator_adv_loss_params"]) + print("criterions done!") + + lr_schedule_g = MultiStepDecay(**config["generator_scheduler_params"]) + # Compared to multi_band_melgan.v1 config, Adam optimizer without gradient norm is used + generator_grad_norm = config["generator_grad_norm"] + gradient_clip_g = nn.ClipGradByGlobalNorm( + generator_grad_norm) if generator_grad_norm > 0 else None + print("gradient_clip_g:", gradient_clip_g) + + optimizer_g = Adam( + learning_rate=lr_schedule_g, + grad_clip=gradient_clip_g, + parameters=generator.parameters(), + **config["generator_optimizer_params"]) + lr_schedule_d = MultiStepDecay(**config["discriminator_scheduler_params"]) + discriminator_grad_norm = config["discriminator_grad_norm"] + gradient_clip_d = nn.ClipGradByGlobalNorm( + discriminator_grad_norm) if discriminator_grad_norm > 0 else None + print("gradient_clip_d:", gradient_clip_d) + optimizer_d = Adam( + learning_rate=lr_schedule_d, + grad_clip=gradient_clip_d, + parameters=discriminator.parameters(), + **config["discriminator_optimizer_params"]) + print("optimizers done!") + + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + if dist.get_rank() == 0: + config_name = args.config.split("/")[-1] + # copy conf to output_dir + shutil.copyfile(args.config, output_dir / config_name) + + updater = StyleMelGANUpdater( + models={ + "generator": generator, + "discriminator": discriminator, + }, + optimizers={ + "generator": optimizer_g, + "discriminator": optimizer_d, + }, + criterions={ + "stft": criterion_stft, + "gen_adv": criterion_gen_adv, + "dis_adv": criterion_dis_adv, + }, + schedulers={ + "generator": lr_schedule_g, + "discriminator": lr_schedule_d, + }, + dataloader=train_dataloader, + discriminator_train_start_steps=config.discriminator_train_start_steps, + lambda_adv=config.lambda_adv, + output_dir=output_dir) + + evaluator = StyleMelGANEvaluator( + models={ + "generator": generator, + "discriminator": discriminator, + }, + criterions={ + "stft": criterion_stft, + "gen_adv": criterion_gen_adv, + "dis_adv": criterion_dis_adv, + }, + dataloader=dev_dataloader, + lambda_adv=config.lambda_adv, + output_dir=output_dir) + + trainer = Trainer( + updater, + stop_trigger=(config.train_max_steps, "iteration"), + out=output_dir) + + if dist.get_rank() == 0: + trainer.extend( + evaluator, trigger=(config.eval_interval_steps, 'iteration')) + trainer.extend(VisualDL(output_dir), trigger=(1, 'iteration')) + trainer.extend( + Snapshot(max_size=config.num_snapshots), + trigger=(config.save_interval_steps, 'iteration')) + + print("Trainer Done!") + trainer.run() + + +def main(): + # parse args and config and redirect to train_sp + + parser = argparse.ArgumentParser(description="Train a Style MelGAN model.") + parser.add_argument( + "--config", type=str, help="config file to overwrite default config.") + parser.add_argument("--train-metadata", type=str, help="training data.") + parser.add_argument("--dev-metadata", type=str, help="dev data.") + parser.add_argument("--output-dir", type=str, help="output dir.") + parser.add_argument( + "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") + + args = parser.parse_args() + + with open(args.config, 'rt') as 
f: + config = CfgNode(yaml.safe_load(f)) + + print("========Args========") + print(yaml.safe_dump(vars(args))) + print("========Config========") + print(config) + print( + f"master see the word size: {dist.get_world_size()}, from pid: {os.getpid()}" + ) + + # dispatch + if args.ngpu > 1: + dist.spawn(train_sp, (args, config), nprocs=args.ngpu) + else: + train_sp(args, config) + + +if __name__ == "__main__": + main() diff --git a/ernie-sat/paddlespeech/t2s/exps/gan_vocoder/synthesize.py b/ernie-sat/paddlespeech/t2s/exps/gan_vocoder/synthesize.py new file mode 100644 index 0000000000000000000000000000000000000000..9d9a8c49b683376f446adf4fde84c466acd360a0 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/exps/gan_vocoder/synthesize.py @@ -0,0 +1,121 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse +import os +from pathlib import Path + +import jsonlines +import numpy as np +import paddle +import soundfile as sf +import yaml +from paddle import distributed as dist +from timer import timer +from yacs.config import CfgNode + +import paddlespeech +from paddlespeech.t2s.datasets.data_table import DataTable + + +def main(): + parser = argparse.ArgumentParser(description="Synthesize with GANVocoder.") + parser.add_argument( + "--generator-type", + type=str, + default="pwgan", + help="type of GANVocoder, should in {pwgan, mb_melgan, style_melgan, hifigan, } now" + ) + parser.add_argument("--config", type=str, help="GANVocoder config file.") + parser.add_argument("--checkpoint", type=str, help="snapshot to load.") + parser.add_argument("--test-metadata", type=str, help="dev data.") + parser.add_argument("--output-dir", type=str, help="output dir.") + parser.add_argument( + "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") + + args = parser.parse_args() + + with open(args.config) as f: + config = CfgNode(yaml.safe_load(f)) + + print("========Args========") + print(yaml.safe_dump(vars(args))) + print("========Config========") + print(config) + print( + f"master see the word size: {dist.get_world_size()}, from pid: {os.getpid()}" + ) + + if args.ngpu == 0: + paddle.set_device("cpu") + elif args.ngpu > 0: + paddle.set_device("gpu") + else: + print("ngpu should >= 0 !") + + class_map = { + "hifigan": "HiFiGANGenerator", + "mb_melgan": "MelGANGenerator", + "pwgan": "PWGGenerator", + "style_melgan": "StyleMelGANGenerator", + } + + generator_type = args.generator_type + + assert generator_type in class_map + + print("generator_type:", generator_type) + + generator_class = getattr(paddlespeech.t2s.models, + class_map[generator_type]) + generator = generator_class(**config["generator_params"]) + state_dict = paddle.load(args.checkpoint) + generator.set_state_dict(state_dict["generator_params"]) + generator.remove_weight_norm() + generator.eval() + + with jsonlines.open(args.test_metadata, 'r') as reader: + metadata = list(reader) + test_dataset = DataTable( + metadata, + fields=['utt_id', 'feats'], + converters={ + 
'utt_id': None, + 'feats': np.load, + }) + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + N = 0 + T = 0 + for example in test_dataset: + utt_id = example['utt_id'] + mel = example['feats'] + mel = paddle.to_tensor(mel) # (T, C) + with timer() as t: + with paddle.no_grad(): + wav = generator.inference(c=mel) + wav = wav.numpy() + N += wav.size + T += t.elapse + speed = wav.size / t.elapse + rtf = config.fs / speed + print( + f"{utt_id}, mel: {mel.shape}, wave: {wav.shape}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}." + ) + sf.write(str(output_dir / (utt_id + ".wav")), wav, samplerate=config.fs) + print(f"generation speed: {N / T}Hz, RTF: {config.fs / (N / T) }") + + +if __name__ == "__main__": + main() diff --git a/ernie-sat/paddlespeech/t2s/exps/gan_vocoder/synthesize_fxr.py b/ernie-sat/paddlespeech/t2s/exps/gan_vocoder/synthesize_fxr.py new file mode 100644 index 0000000000000000000000000000000000000000..9d9a8c49b683376f446adf4fde84c466acd360a0 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/exps/gan_vocoder/synthesize_fxr.py @@ -0,0 +1,121 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse +import os +from pathlib import Path + +import jsonlines +import numpy as np +import paddle +import soundfile as sf +import yaml +from paddle import distributed as dist +from timer import timer +from yacs.config import CfgNode + +import paddlespeech +from paddlespeech.t2s.datasets.data_table import DataTable + + +def main(): + parser = argparse.ArgumentParser(description="Synthesize with GANVocoder.") + parser.add_argument( + "--generator-type", + type=str, + default="pwgan", + help="type of GANVocoder, should in {pwgan, mb_melgan, style_melgan, hifigan, } now" + ) + parser.add_argument("--config", type=str, help="GANVocoder config file.") + parser.add_argument("--checkpoint", type=str, help="snapshot to load.") + parser.add_argument("--test-metadata", type=str, help="dev data.") + parser.add_argument("--output-dir", type=str, help="output dir.") + parser.add_argument( + "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") + + args = parser.parse_args() + + with open(args.config) as f: + config = CfgNode(yaml.safe_load(f)) + + print("========Args========") + print(yaml.safe_dump(vars(args))) + print("========Config========") + print(config) + print( + f"master see the word size: {dist.get_world_size()}, from pid: {os.getpid()}" + ) + + if args.ngpu == 0: + paddle.set_device("cpu") + elif args.ngpu > 0: + paddle.set_device("gpu") + else: + print("ngpu should >= 0 !") + + class_map = { + "hifigan": "HiFiGANGenerator", + "mb_melgan": "MelGANGenerator", + "pwgan": "PWGGenerator", + "style_melgan": "StyleMelGANGenerator", + } + + generator_type = args.generator_type + + assert generator_type in class_map + + print("generator_type:", generator_type) + + generator_class = getattr(paddlespeech.t2s.models, + class_map[generator_type]) + generator = 
generator_class(**config["generator_params"]) + state_dict = paddle.load(args.checkpoint) + generator.set_state_dict(state_dict["generator_params"]) + generator.remove_weight_norm() + generator.eval() + + with jsonlines.open(args.test_metadata, 'r') as reader: + metadata = list(reader) + test_dataset = DataTable( + metadata, + fields=['utt_id', 'feats'], + converters={ + 'utt_id': None, + 'feats': np.load, + }) + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + N = 0 + T = 0 + for example in test_dataset: + utt_id = example['utt_id'] + mel = example['feats'] + mel = paddle.to_tensor(mel) # (T, C) + with timer() as t: + with paddle.no_grad(): + wav = generator.inference(c=mel) + wav = wav.numpy() + N += wav.size + T += t.elapse + speed = wav.size / t.elapse + rtf = config.fs / speed + print( + f"{utt_id}, mel: {mel.shape}, wave: {wav.shape}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}." + ) + sf.write(str(output_dir / (utt_id + ".wav")), wav, samplerate=config.fs) + print(f"generation speed: {N / T}Hz, RTF: {config.fs / (N / T) }") + + +if __name__ == "__main__": + main() diff --git a/ernie-sat/paddlespeech/t2s/exps/inference.py b/ernie-sat/paddlespeech/t2s/exps/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..62602a01f28c4365c80ba6fb01b98cef2572a579 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/exps/inference.py @@ -0,0 +1,246 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
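+"""Run TTS inference with exported static-graph models via paddle.inference: the acoustic model predictor produces mel spectrograms and the vocoder predictor converts them to waveforms."""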
+import argparse +from pathlib import Path + +import numpy +import soundfile as sf +from paddle import inference +from timer import timer + +from paddlespeech.t2s.exps.syn_utils import get_frontend +from paddlespeech.t2s.exps.syn_utils import get_sentences +from paddlespeech.t2s.utils import str2bool + + +def get_predictor(args, filed='am'): + full_name = '' + if filed == 'am': + full_name = args.am + elif filed == 'voc': + full_name = args.voc + model_name = full_name[:full_name.rindex('_')] + config = inference.Config( + str(Path(args.inference_dir) / (full_name + ".pdmodel")), + str(Path(args.inference_dir) / (full_name + ".pdiparams"))) + if args.device == "gpu": + config.enable_use_gpu(100, 0) + elif args.device == "cpu": + config.disable_gpu() + # This line must be commented for fastspeech2, if not, it will OOM + if model_name != 'fastspeech2': + config.enable_memory_optim() + predictor = inference.create_predictor(config) + return predictor + + +def get_am_output(args, am_predictor, frontend, merge_sentences, input): + am_name = args.am[:args.am.rindex('_')] + am_dataset = args.am[args.am.rindex('_') + 1:] + am_input_names = am_predictor.get_input_names() + get_tone_ids = False + get_spk_id = False + if am_name == 'speedyspeech': + get_tone_ids = True + if am_dataset in {"aishell3", "vctk"} and args.speaker_dict: + get_spk_id = True + spk_id = numpy.array([args.spk_id]) + if args.lang == 'zh': + input_ids = frontend.get_input_ids( + input, merge_sentences=merge_sentences, get_tone_ids=get_tone_ids) + phone_ids = input_ids["phone_ids"] + elif args.lang == 'en': + input_ids = frontend.get_input_ids( + input, merge_sentences=merge_sentences) + phone_ids = input_ids["phone_ids"] + else: + print("lang should in {'zh', 'en'}!") + + if get_tone_ids: + tone_ids = input_ids["tone_ids"] + tones = tone_ids[0].numpy() + tones_handle = am_predictor.get_input_handle(am_input_names[1]) + tones_handle.reshape(tones.shape) + tones_handle.copy_from_cpu(tones) + if get_spk_id: + spk_id_handle = am_predictor.get_input_handle(am_input_names[1]) + spk_id_handle.reshape(spk_id.shape) + spk_id_handle.copy_from_cpu(spk_id) + phones = phone_ids[0].numpy() + phones_handle = am_predictor.get_input_handle(am_input_names[0]) + phones_handle.reshape(phones.shape) + phones_handle.copy_from_cpu(phones) + + am_predictor.run() + am_output_names = am_predictor.get_output_names() + am_output_handle = am_predictor.get_output_handle(am_output_names[0]) + am_output_data = am_output_handle.copy_to_cpu() + return am_output_data + + +def get_voc_output(args, voc_predictor, input): + voc_input_names = voc_predictor.get_input_names() + mel_handle = voc_predictor.get_input_handle(voc_input_names[0]) + mel_handle.reshape(input.shape) + mel_handle.copy_from_cpu(input) + + voc_predictor.run() + voc_output_names = voc_predictor.get_output_names() + voc_output_handle = voc_predictor.get_output_handle(voc_output_names[0]) + wav = voc_output_handle.copy_to_cpu() + return wav + + +def parse_args(): + parser = argparse.ArgumentParser( + description="Paddle Infernce with acoustic model & vocoder.") + # acoustic model + parser.add_argument( + '--am', + type=str, + default='fastspeech2_csmsc', + choices=[ + 'speedyspeech_csmsc', 'fastspeech2_csmsc', 'fastspeech2_aishell3', + 'fastspeech2_vctk', 'tacotron2_csmsc' + ], + help='Choose acoustic model type of tts task.') + parser.add_argument( + "--phones_dict", type=str, default=None, help="phone vocabulary file.") + parser.add_argument( + "--tones_dict", type=str, default=None, help="tone 
vocabulary file.") + parser.add_argument( + "--speaker_dict", type=str, default=None, help="speaker id map file.") + parser.add_argument( + '--spk_id', + type=int, + default=0, + help='spk id for multi speaker acoustic model') + # voc + parser.add_argument( + '--voc', + type=str, + default='pwgan_csmsc', + choices=[ + 'pwgan_csmsc', 'mb_melgan_csmsc', 'hifigan_csmsc', 'pwgan_aishell3', + 'pwgan_vctk', 'wavernn_csmsc' + ], + help='Choose vocoder type of tts task.') + # other + parser.add_argument( + '--lang', + type=str, + default='zh', + help='Choose model language. zh or en') + parser.add_argument( + "--text", + type=str, + help="text to synthesize, a 'utt_id sentence' pair per line") + parser.add_argument( + "--inference_dir", type=str, help="dir to save inference models") + parser.add_argument("--output_dir", type=str, help="output dir") + # inference + parser.add_argument( + "--use_trt", + type=str2bool, + default=False, + help="Whether to use inference engin TensorRT.", ) + parser.add_argument( + "--int8", + type=str2bool, + default=False, + help="Whether to use int8 inference.", ) + parser.add_argument( + "--fp16", + type=str2bool, + default=False, + help="Whether to use float16 inference.", ) + parser.add_argument( + "--device", + default="gpu", + choices=["gpu", "cpu"], + help="Device selected for inference.", ) + + args, _ = parser.parse_known_args() + return args + + +# only inference for models trained with csmsc now +def main(): + args = parse_args() + # frontend + frontend = get_frontend(args) + + # am_predictor + am_predictor = get_predictor(args, filed='am') + # model: {model_name}_{dataset} + am_dataset = args.am[args.am.rindex('_') + 1:] + + # voc_predictor + voc_predictor = get_predictor(args, filed='voc') + + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + sentences = get_sentences(args) + + merge_sentences = True + fs = 24000 if am_dataset != 'ljspeech' else 22050 + # warmup + for utt_id, sentence in sentences[:3]: + with timer() as t: + am_output_data = get_am_output( + args, + am_predictor=am_predictor, + frontend=frontend, + merge_sentences=merge_sentences, + input=sentence) + wav = get_voc_output( + args, voc_predictor=voc_predictor, input=am_output_data) + speed = wav.size / t.elapse + rtf = fs / speed + print( + f"{utt_id}, mel: {am_output_data.shape}, wave: {wav.shape}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}." + ) + + print("warm up done!") + + N = 0 + T = 0 + for utt_id, sentence in sentences: + with timer() as t: + am_output_data = get_am_output( + args, + am_predictor=am_predictor, + frontend=frontend, + merge_sentences=merge_sentences, + input=sentence) + wav = get_voc_output( + args, voc_predictor=voc_predictor, input=am_output_data) + + N += wav.size + T += t.elapse + speed = wav.size / t.elapse + rtf = fs / speed + + sf.write(output_dir / (utt_id + ".wav"), wav, samplerate=24000) + print( + f"{utt_id}, mel: {am_output_data.shape}, wave: {wav.shape}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}." + ) + + print(f"{utt_id} done!") + print(f"generation speed: {N / T}Hz, RTF: {fs / (N / T) }") + + +if __name__ == "__main__": + main() diff --git a/ernie-sat/paddlespeech/t2s/exps/ort_predict.py b/ernie-sat/paddlespeech/t2s/exps/ort_predict.py new file mode 100644 index 0000000000000000000000000000000000000000..e8d4d61c32e09983e66346a2b9f6a26a7c269846 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/exps/ort_predict.py @@ -0,0 +1,156 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse +from pathlib import Path + +import jsonlines +import numpy as np +import onnxruntime as ort +import soundfile as sf +from timer import timer + +from paddlespeech.t2s.exps.syn_utils import get_test_dataset +from paddlespeech.t2s.utils import str2bool + + +def get_sess(args, filed='am'): + full_name = '' + if filed == 'am': + full_name = args.am + elif filed == 'voc': + full_name = args.voc + model_dir = str(Path(args.inference_dir) / (full_name + ".onnx")) + sess_options = ort.SessionOptions() + sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL + sess_options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL + + if args.device == "gpu": + # fastspeech2/mb_melgan can't use trt now! + if args.use_trt: + providers = ['TensorrtExecutionProvider'] + else: + providers = ['CUDAExecutionProvider'] + elif args.device == "cpu": + providers = ['CPUExecutionProvider'] + sess_options.intra_op_num_threads = args.cpu_threads + sess = ort.InferenceSession( + model_dir, providers=providers, sess_options=sess_options) + return sess + + +def ort_predict(args): + # construct dataset for evaluation + with jsonlines.open(args.test_metadata, 'r') as reader: + test_metadata = list(reader) + am_name = args.am[:args.am.rindex('_')] + am_dataset = args.am[args.am.rindex('_') + 1:] + test_dataset = get_test_dataset(args, test_metadata, am_name, am_dataset) + + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + fs = 24000 if am_dataset != 'ljspeech' else 22050 + + # am + am_sess = get_sess(args, filed='am') + + # vocoder + voc_sess = get_sess(args, filed='voc') + + # am warmup + for T in [27, 38, 54]: + data = np.random.randint(1, 266, size=(T, )) + am_sess.run(None, {"text": data}) + + # voc warmup + for T in [227, 308, 544]: + data = np.random.rand(T, 80).astype("float32") + voc_sess.run(None, {"logmel": data}) + print("warm up done!") + + N = 0 + T = 0 + for example in test_dataset: + utt_id = example['utt_id'] + phone_ids = example["text"] + with timer() as t: + mel = am_sess.run(output_names=None, input_feed={'text': phone_ids}) + mel = mel[0] + wav = voc_sess.run(output_names=None, input_feed={'logmel': mel}) + + N += len(wav[0]) + T += t.elapse + speed = len(wav[0]) / t.elapse + rtf = fs / speed + sf.write( + str(output_dir / (utt_id + ".wav")), + np.array(wav)[0], + samplerate=fs) + print( + f"{utt_id}, mel: {mel.shape}, wave: {len(wav[0])}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}." 
+ ) + print(f"generation speed: {N / T}Hz, RTF: {fs / (N / T) }") + + +def parse_args(): + parser = argparse.ArgumentParser(description="Infernce with onnxruntime.") + # acoustic model + parser.add_argument( + '--am', + type=str, + default='fastspeech2_csmsc', + choices=[ + 'fastspeech2_csmsc', + ], + help='Choose acoustic model type of tts task.') + + # voc + parser.add_argument( + '--voc', + type=str, + default='hifigan_csmsc', + choices=['hifigan_csmsc', 'mb_melgan_csmsc'], + help='Choose vocoder type of tts task.') + # other + parser.add_argument( + "--inference_dir", type=str, help="dir to save inference models") + parser.add_argument("--test_metadata", type=str, help="test metadata.") + parser.add_argument("--output_dir", type=str, help="output dir") + + # inference + parser.add_argument( + "--use_trt", + type=str2bool, + default=False, + help="Whether to use inference engin TensorRT.", ) + + parser.add_argument( + "--device", + default="gpu", + choices=["gpu", "cpu"], + help="Device selected for inference.", ) + parser.add_argument('--cpu_threads', type=int, default=1) + + args, _ = parser.parse_known_args() + return args + + +def main(): + args = parse_args() + + ort_predict(args) + + +if __name__ == "__main__": + main() diff --git a/ernie-sat/paddlespeech/t2s/exps/ort_predict_e2e.py b/ernie-sat/paddlespeech/t2s/exps/ort_predict_e2e.py new file mode 100644 index 0000000000000000000000000000000000000000..8aa04cbc556ad36d7275c2b5ccad3cd9fa5b139b --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/exps/ort_predict_e2e.py @@ -0,0 +1,183 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse +from pathlib import Path + +import numpy as np +import onnxruntime as ort +import soundfile as sf +from timer import timer + +from paddlespeech.t2s.exps.syn_utils import get_frontend +from paddlespeech.t2s.exps.syn_utils import get_sentences +from paddlespeech.t2s.utils import str2bool + + +def get_sess(args, filed='am'): + full_name = '' + if filed == 'am': + full_name = args.am + elif filed == 'voc': + full_name = args.voc + model_dir = str(Path(args.inference_dir) / (full_name + ".onnx")) + sess_options = ort.SessionOptions() + sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL + sess_options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL + + if args.device == "gpu": + # fastspeech2/mb_melgan can't use trt now! 
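Both onnxruntime entry points (`ort_predict.py` above and `ort_predict_e2e.py`, which continues below) drive the same two-stage pipeline: the acoustic-model session turns phone ids into a log-mel spectrogram, and the vocoder session turns that spectrogram into a waveform. A minimal sketch of the call chain, assuming exported models with the `text` and `logmel` input names used in the warm-up code; the paths are placeholders:

```python
import numpy as np
import onnxruntime as ort


def synthesize(am_path: str, voc_path: str, phone_ids: np.ndarray) -> np.ndarray:
    """Phone ids -> mel via the acoustic model, then mel -> waveform via the vocoder."""
    am_sess = ort.InferenceSession(am_path, providers=["CPUExecutionProvider"])
    voc_sess = ort.InferenceSession(voc_path, providers=["CPUExecutionProvider"])
    mel = am_sess.run(None, {"text": phone_ids})[0]   # first output: log-mel frames
    wav = voc_sess.run(None, {"logmel": mel})[0]      # first output: waveform samples
    return np.asarray(wav)


# hypothetical usage once the models have been exported to ONNX:
# wav = synthesize("inference/fastspeech2_csmsc.onnx", "inference/hifigan_csmsc.onnx",
#                  np.random.randint(1, 266, size=(30,)))
```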
+ if args.use_trt: + providers = ['TensorrtExecutionProvider'] + else: + providers = ['CUDAExecutionProvider'] + elif args.device == "cpu": + providers = ['CPUExecutionProvider'] + sess_options.intra_op_num_threads = args.cpu_threads + sess = ort.InferenceSession( + model_dir, providers=providers, sess_options=sess_options) + return sess + + +def ort_predict(args): + + # frontend + frontend = get_frontend(args) + + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + sentences = get_sentences(args) + + am_name = args.am[:args.am.rindex('_')] + am_dataset = args.am[args.am.rindex('_') + 1:] + fs = 24000 if am_dataset != 'ljspeech' else 22050 + + # am + am_sess = get_sess(args, filed='am') + + # vocoder + voc_sess = get_sess(args, filed='voc') + + # am warmup + for T in [27, 38, 54]: + data = np.random.randint(1, 266, size=(T, )) + am_sess.run(None, {"text": data}) + + # voc warmup + for T in [227, 308, 544]: + data = np.random.rand(T, 80).astype("float32") + voc_sess.run(None, {"logmel": data}) + print("warm up done!") + + # frontend warmup + # Loading model cost 0.5+ seconds + if args.lang == 'zh': + frontend.get_input_ids("你好,欢迎使用飞桨框架进行深度学习研究!", merge_sentences=True) + else: + print("lang should in be 'zh' here!") + + N = 0 + T = 0 + merge_sentences = True + for utt_id, sentence in sentences: + with timer() as t: + if args.lang == 'zh': + input_ids = frontend.get_input_ids( + sentence, merge_sentences=merge_sentences) + + phone_ids = input_ids["phone_ids"] + else: + print("lang should in be 'zh' here!") + # merge_sentences=True here, so we only use the first item of phone_ids + phone_ids = phone_ids[0].numpy() + mel = am_sess.run(output_names=None, input_feed={'text': phone_ids}) + mel = mel[0] + wav = voc_sess.run(output_names=None, input_feed={'logmel': mel}) + + N += len(wav[0]) + T += t.elapse + speed = len(wav[0]) / t.elapse + rtf = fs / speed + sf.write( + str(output_dir / (utt_id + ".wav")), + np.array(wav)[0], + samplerate=fs) + print( + f"{utt_id}, mel: {mel.shape}, wave: {len(wav[0])}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}." + ) + print(f"generation speed: {N / T}Hz, RTF: {fs / (N / T) }") + + +def parse_args(): + parser = argparse.ArgumentParser(description="Infernce with onnxruntime.") + # acoustic model + parser.add_argument( + '--am', + type=str, + default='fastspeech2_csmsc', + choices=[ + 'fastspeech2_csmsc', + ], + help='Choose acoustic model type of tts task.') + parser.add_argument( + "--phones_dict", type=str, default=None, help="phone vocabulary file.") + parser.add_argument( + "--tones_dict", type=str, default=None, help="tone vocabulary file.") + + # voc + parser.add_argument( + '--voc', + type=str, + default='hifigan_csmsc', + choices=['hifigan_csmsc', 'mb_melgan_csmsc'], + help='Choose vocoder type of tts task.') + # other + parser.add_argument( + "--inference_dir", type=str, help="dir to save inference models") + parser.add_argument( + "--text", + type=str, + help="text to synthesize, a 'utt_id sentence' pair per line") + parser.add_argument("--output_dir", type=str, help="output dir") + parser.add_argument( + '--lang', + type=str, + default='zh', + help='Choose model language. 
zh or en') + + # inference + parser.add_argument( + "--use_trt", + type=str2bool, + default=False, + help="Whether to use inference engin TensorRT.", ) + + parser.add_argument( + "--device", + default="gpu", + choices=["gpu", "cpu"], + help="Device selected for inference.", ) + parser.add_argument('--cpu_threads', type=int, default=1) + + args, _ = parser.parse_known_args() + return args + + +def main(): + args = parse_args() + + ort_predict(args) + + +if __name__ == "__main__": + main() diff --git a/ernie-sat/paddlespeech/t2s/exps/sentences.txt b/ernie-sat/paddlespeech/t2s/exps/sentences.txt new file mode 100644 index 0000000000000000000000000000000000000000..3aa5376b4e08bae4e5f75ead2e419679dd2d6ffe --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/exps/sentences.txt @@ -0,0 +1,16 @@ +001 凯莫瑞安联合体的经济崩溃,迫在眉睫。 +002 对于所有想要离开那片废土,去寻找更美好生活的人来说。 +003 克哈,是你们所有人安全的港湾。 +004 为了保护尤摩扬人民不受异虫的残害,我所做的,比他们自己的领导委员会都多。 +005 无论他们如何诽谤我,我将继续为所有泰伦人的最大利益,而努力奋斗。 +006 身为你们的元首,我带领泰伦人实现了人类统治领地和经济的扩张。 +007 我们将继续成长,用行动回击那些只会说风凉话,不愿意和我们相向而行的害群之马。 +008 帝国武装力量,无数的优秀儿女,正时刻守卫着我们的家园大门,但是他们孤木难支。 +009 凡是今天应征入伍者,所获的所有刑罚罪责,减半。 +010 激进分子和异见者希望你们一听见枪声,就背弃多年的和平与繁荣。 +011 他们没有勇气和能力,带领人类穿越一个充满危险的星系。 +012 法治是我们的命脉,然而它却受到前所未有的挑战。 +013 我将恢复我们帝国的荣光,绝不会向任何外星势力低头。 +014 我已经驯服了异虫,荡平了星灵。如今它们的创造者,想要夺走我们拥有的一切。 +015 永远记住,谁才是最能保护你们的人。 +016 不要听信别人的谗言,我不是什么克隆人。 \ No newline at end of file diff --git a/ernie-sat/paddlespeech/t2s/exps/sentences_en.txt b/ernie-sat/paddlespeech/t2s/exps/sentences_en.txt new file mode 100644 index 0000000000000000000000000000000000000000..36b73a528461513938d6ad7dfec7987259799ccd --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/exps/sentences_en.txt @@ -0,0 +1,9 @@ +001 Life was like a box of chocolates, you never know what you're gonna get. +002 With great power there must come great responsibility. +003 To be or not to be, that’s a question. +004 A man can be destroyed but not defeated +005 Do not, for one repulse, give up the purpose that you resolved to effort. +006 Death is just a part of life, something we're all destined to do. +007 I think it's hard winning a war with words. +008 Don’t argue with the people of strong determination, because they may change the fact! +009 Love you three thousand times. \ No newline at end of file diff --git a/ernie-sat/paddlespeech/t2s/exps/speedyspeech/__init__.py b/ernie-sat/paddlespeech/t2s/exps/speedyspeech/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..abf198b97e6e818e1fbe59006f98492640bcee54 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/exps/speedyspeech/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
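`sentences.txt` and `sentences_en.txt` above follow the `'utt_id sentence'` one-pair-per-line format that the `--text` option expects. A small sketch of parsing such a file into `(utt_id, sentence)` tuples, joining Chinese tokens without spaces and English tokens with spaces, mirroring how the synthesis scripts handle the two languages:

```python
from typing import List, Tuple


def read_sentences(path: str, lang: str = "zh") -> List[Tuple[str, str]]:
    """Parse a text file whose lines are '<utt_id> <sentence>' pairs."""
    sentences = []
    with open(path, "rt", encoding="utf-8") as f:
        for line in f:
            items = line.strip().split()
            if not items:
                continue  # skip blank lines
            utt_id = items[0]
            # Chinese text is written without spaces between characters; English keeps them.
            sentence = "".join(items[1:]) if lang == "zh" else " ".join(items[1:])
            sentences.append((utt_id, sentence))
    return sentences


# e.g. read_sentences("sentences.txt")[0] == ("001", "凯莫瑞安联合体的经济崩溃,迫在眉睫。")
```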
diff --git a/ernie-sat/paddlespeech/t2s/exps/speedyspeech/gen_gta_mel.py b/ernie-sat/paddlespeech/t2s/exps/speedyspeech/gen_gta_mel.py new file mode 100644 index 0000000000000000000000000000000000000000..31b7d2eac400f5e9172797ec4739eb422d7d9200 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/exps/speedyspeech/gen_gta_mel.py @@ -0,0 +1,244 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# generate mels using durations.txt +# for mb melgan finetune +# 长度和原本的 mel 不一致怎么办? +import argparse +import os +from pathlib import Path + +import numpy as np +import paddle +import yaml +from tqdm import tqdm +from yacs.config import CfgNode + +from paddlespeech.t2s.datasets.preprocess_utils import get_phn_dur +from paddlespeech.t2s.datasets.preprocess_utils import merge_silence +from paddlespeech.t2s.frontend.zh_frontend import Frontend +from paddlespeech.t2s.models.speedyspeech import SpeedySpeech +from paddlespeech.t2s.models.speedyspeech import SpeedySpeechInference +from paddlespeech.t2s.modules.normalizer import ZScore +from paddlespeech.t2s.utils import str2bool + + +def evaluate(args, speedyspeech_config): + rootdir = Path(args.rootdir).expanduser() + assert rootdir.is_dir() + + # construct dataset for evaluation + with open(args.phones_dict, "r") as f: + phn_id = [line.strip().split() for line in f.readlines()] + vocab_size = len(phn_id) + print("vocab_size:", vocab_size) + + phone_dict = {} + for phn, id in phn_id: + phone_dict[phn] = int(id) + + with open(args.tones_dict, "r") as f: + tone_id = [line.strip().split() for line in f.readlines()] + tone_size = len(tone_id) + print("tone_size:", tone_size) + + frontend = Frontend( + phone_vocab_path=args.phones_dict, tone_vocab_path=args.tones_dict) + + if args.speaker_dict: + with open(args.speaker_dict, 'rt') as f: + spk_id_list = [line.strip().split() for line in f.readlines()] + spk_num = len(spk_id_list) + else: + spk_num = None + + model = SpeedySpeech( + vocab_size=vocab_size, + tone_size=tone_size, + **speedyspeech_config["model"], + spk_num=spk_num) + + model.set_state_dict( + paddle.load(args.speedyspeech_checkpoint)["main_params"]) + model.eval() + + stat = np.load(args.speedyspeech_stat) + mu, std = stat + mu = paddle.to_tensor(mu) + std = paddle.to_tensor(std) + speedyspeech_normalizer = ZScore(mu, std) + + speedyspeech_inference = SpeedySpeechInference(speedyspeech_normalizer, + model) + speedyspeech_inference.eval() + + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + sentences, speaker_set = get_phn_dur(args.dur_file) + merge_silence(sentences) + + if args.dataset == "baker": + wav_files = sorted(list((rootdir / "Wave").rglob("*.wav"))) + # split data into 3 sections + num_train = 9800 + num_dev = 100 + train_wav_files = wav_files[:num_train] + dev_wav_files = wav_files[num_train:num_train + num_dev] + test_wav_files = wav_files[num_train + num_dev:] + elif args.dataset == "aishell3": + sub_num_dev = 5 + wav_dir = rootdir / "train" / 
"wav" + train_wav_files = [] + dev_wav_files = [] + test_wav_files = [] + for speaker in os.listdir(wav_dir): + wav_files = sorted(list((wav_dir / speaker).rglob("*.wav"))) + if len(wav_files) > 100: + train_wav_files += wav_files[:-sub_num_dev * 2] + dev_wav_files += wav_files[-sub_num_dev * 2:-sub_num_dev] + test_wav_files += wav_files[-sub_num_dev:] + else: + train_wav_files += wav_files + + train_wav_files = [ + os.path.basename(str(str_path)) for str_path in train_wav_files + ] + dev_wav_files = [ + os.path.basename(str(str_path)) for str_path in dev_wav_files + ] + test_wav_files = [ + os.path.basename(str(str_path)) for str_path in test_wav_files + ] + + for i, utt_id in enumerate(tqdm(sentences)): + phones = sentences[utt_id][0] + durations = sentences[utt_id][1] + speaker = sentences[utt_id][2] + # 裁剪掉开头和结尾的 sil + if args.cut_sil: + if phones[0] == "sil" and len(durations) > 1: + durations = durations[1:] + phones = phones[1:] + if phones[-1] == 'sil' and len(durations) > 1: + durations = durations[:-1] + phones = phones[:-1] + + phones, tones = frontend._get_phone_tone(phones, get_tone_ids=True) + if tones: + tone_ids = frontend._t2id(tones) + tone_ids = paddle.to_tensor(tone_ids) + if phones: + phone_ids = frontend._p2id(phones) + phone_ids = paddle.to_tensor(phone_ids) + + if args.speaker_dict: + speaker_id = int( + [item[1] for item in spk_id_list if speaker == item[0]][0]) + speaker_id = paddle.to_tensor(speaker_id) + else: + speaker_id = None + + durations = paddle.to_tensor(np.array(durations)) + durations = paddle.unsqueeze(durations, axis=0) + + # 生成的和真实的可能有 1, 2 帧的差距,但是 batch_fn 会修复 + # split data into 3 sections + + wav_path = utt_id + ".wav" + + if wav_path in train_wav_files: + sub_output_dir = output_dir / ("train/raw") + elif wav_path in dev_wav_files: + sub_output_dir = output_dir / ("dev/raw") + elif wav_path in test_wav_files: + sub_output_dir = output_dir / ("test/raw") + + sub_output_dir.mkdir(parents=True, exist_ok=True) + + with paddle.no_grad(): + mel = speedyspeech_inference( + phone_ids, tone_ids, durations=durations, spk_id=speaker_id) + np.save(sub_output_dir / (utt_id + "_feats.npy"), mel) + + +def main(): + # parse args and config and redirect to train_sp + parser = argparse.ArgumentParser( + description="Synthesize with speedyspeech & parallel wavegan.") + parser.add_argument( + "--dataset", + default="baker", + type=str, + help="name of dataset, should in {baker, ljspeech, vctk} now") + parser.add_argument( + "--rootdir", default=None, type=str, help="directory to dataset.") + parser.add_argument( + "--speedyspeech-config", type=str, help="speedyspeech config file.") + parser.add_argument( + "--speedyspeech-checkpoint", + type=str, + help="speedyspeech checkpoint to load.") + parser.add_argument( + "--speedyspeech-stat", + type=str, + help="mean and standard deviation used to normalize spectrogram when training speedyspeech." 
+ ) + + parser.add_argument( + "--phones-dict", + type=str, + default="phone_id_map.txt", + help="phone vocabulary file.") + parser.add_argument( + "--tones-dict", + type=str, + default="tone_id_map.txt", + help="tone vocabulary file.") + parser.add_argument( + "--speaker-dict", type=str, default=None, help="speaker id map file.") + + parser.add_argument( + "--dur-file", default=None, type=str, help="path to durations.txt.") + parser.add_argument("--output-dir", type=str, help="output dir.") + parser.add_argument( + "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") + + parser.add_argument( + "--cut-sil", + type=str2bool, + default=True, + help="whether cut sil in the edge of audio") + + args = parser.parse_args() + + if args.ngpu == 0: + paddle.set_device("cpu") + elif args.ngpu > 0: + paddle.set_device("gpu") + else: + print("ngpu should >= 0 !") + + with open(args.speedyspeech_config) as f: + speedyspeech_config = CfgNode(yaml.safe_load(f)) + + print("========Args========") + print(yaml.safe_dump(vars(args))) + print("========Config========") + print(speedyspeech_config) + + evaluate(args, speedyspeech_config) + + +if __name__ == "__main__": + main() diff --git a/ernie-sat/paddlespeech/t2s/exps/speedyspeech/inference.py b/ernie-sat/paddlespeech/t2s/exps/speedyspeech/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..d4958bc49a8494dd4b50cf373ef422e61f57852e --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/exps/speedyspeech/inference.py @@ -0,0 +1,115 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
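`gen_gta_mel.py` above (and the preprocessing script later in this patch) guards the `--cut-sil` option with the same pattern: drop a leading or trailing `sil` phone together with its duration, but only while more than one phone would remain. A standalone sketch of that trimming step:

```python
from typing import List, Tuple


def cut_sil(phones: List[str], durations: List[int]) -> Tuple[List[str], List[int]]:
    """Remove a leading/trailing 'sil' and its duration, keeping at least one phone."""
    assert len(phones) == len(durations)
    if phones and phones[0] == "sil" and len(durations) > 1:
        phones, durations = phones[1:], durations[1:]
    if phones and phones[-1] == "sil" and len(durations) > 1:
        phones, durations = phones[:-1], durations[:-1]
    return phones, durations


# e.g. cut_sil(["sil", "n", "i3", "sil"], [12, 8, 10, 15]) -> (["n", "i3"], [8, 10])
```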
+# remain for chains +import argparse +from pathlib import Path + +import soundfile as sf +from paddle import inference + +from paddlespeech.t2s.frontend.zh_frontend import Frontend + + +def main(): + parser = argparse.ArgumentParser( + description="Paddle Infernce with speedyspeech & parallel wavegan.") + parser.add_argument( + "--inference-dir", type=str, help="dir to save inference models") + parser.add_argument( + "--text", + type=str, + help="text to synthesize, a 'utt_id sentence' pair per line") + parser.add_argument("--output-dir", type=str, help="output dir") + parser.add_argument( + "--phones-dict", + type=str, + default="phones.txt", + help="phone vocabulary file.") + parser.add_argument( + "--tones-dict", + type=str, + default="tones.txt", + help="tone vocabulary file.") + + args, _ = parser.parse_known_args() + + frontend = Frontend( + phone_vocab_path=args.phones_dict, tone_vocab_path=args.tones_dict) + print("frontend done!") + + speedyspeech_config = inference.Config( + str(Path(args.inference_dir) / "speedyspeech.pdmodel"), + str(Path(args.inference_dir) / "speedyspeech.pdiparams")) + speedyspeech_config.enable_use_gpu(100, 0) + speedyspeech_config.enable_memory_optim() + speedyspeech_predictor = inference.create_predictor(speedyspeech_config) + + pwg_config = inference.Config( + str(Path(args.inference_dir) / "pwg.pdmodel"), + str(Path(args.inference_dir) / "pwg.pdiparams")) + pwg_config.enable_use_gpu(100, 0) + pwg_config.enable_memory_optim() + pwg_predictor = inference.create_predictor(pwg_config) + + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + sentences = [] + + with open(args.text, 'rt') as f: + for line in f: + items = line.strip().split() + utt_id = items[0] + sentence = "".join(items[1:]) + sentences.append((utt_id, sentence)) + + for utt_id, sentence in sentences: + input_ids = frontend.get_input_ids( + sentence, merge_sentences=True, get_tone_ids=True) + phone_ids = input_ids["phone_ids"] + tone_ids = input_ids["tone_ids"] + phones = phone_ids[0].numpy() + tones = tone_ids[0].numpy() + + input_names = speedyspeech_predictor.get_input_names() + phones_handle = speedyspeech_predictor.get_input_handle(input_names[0]) + tones_handle = speedyspeech_predictor.get_input_handle(input_names[1]) + + phones_handle.reshape(phones.shape) + phones_handle.copy_from_cpu(phones) + tones_handle.reshape(tones.shape) + tones_handle.copy_from_cpu(tones) + + speedyspeech_predictor.run() + output_names = speedyspeech_predictor.get_output_names() + output_handle = speedyspeech_predictor.get_output_handle( + output_names[0]) + output_data = output_handle.copy_to_cpu() + + input_names = pwg_predictor.get_input_names() + mel_handle = pwg_predictor.get_input_handle(input_names[0]) + mel_handle.reshape(output_data.shape) + mel_handle.copy_from_cpu(output_data) + + pwg_predictor.run() + output_names = pwg_predictor.get_output_names() + output_handle = pwg_predictor.get_output_handle(output_names[0]) + wav = output_data = output_handle.copy_to_cpu() + + sf.write(output_dir / (utt_id + ".wav"), wav, samplerate=24000) + + print(f"{utt_id} done!") + + +if __name__ == "__main__": + main() diff --git a/ernie-sat/paddlespeech/t2s/exps/speedyspeech/normalize.py b/ernie-sat/paddlespeech/t2s/exps/speedyspeech/normalize.py new file mode 100644 index 0000000000000000000000000000000000000000..249a4d6d83e59c933994a1532d0e836a0a8679c3 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/exps/speedyspeech/normalize.py @@ -0,0 +1,166 @@ +# Copyright (c) 2021 PaddlePaddle 
Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Normalize feature files and dump them.""" +import argparse +import logging +from operator import itemgetter +from pathlib import Path + +import jsonlines +import numpy as np +from sklearn.preprocessing import StandardScaler +from tqdm import tqdm + +from paddlespeech.t2s.datasets.data_table import DataTable +from paddlespeech.t2s.utils import str2bool + + +def main(): + """Run preprocessing process.""" + parser = argparse.ArgumentParser( + description="Normalize dumped raw features (See detail in parallel_wavegan/bin/normalize.py)." + ) + parser.add_argument( + "--metadata", + type=str, + required=True, + help="directory including feature files to be normalized. " + "you need to specify either *-scp or rootdir.") + parser.add_argument( + "--dumpdir", + type=str, + required=True, + help="directory to dump normalized feature files.") + parser.add_argument( + "--stats", type=str, required=True, help="statistics file.") + parser.add_argument( + "--phones-dict", type=str, default=None, help="phone vocabulary file.") + parser.add_argument( + "--tones-dict", type=str, default=None, help="tone vocabulary file.") + parser.add_argument( + "--speaker-dict", type=str, default=None, help="speaker id map file.") + parser.add_argument( + "--verbose", + type=int, + default=1, + help="logging level. higher is more logging. 
(default=1)") + + parser.add_argument( + "--use-relative-path", + type=str2bool, + default=False, + help="whether use relative path in metadata") + args = parser.parse_args() + + # set logger + if args.verbose > 1: + logging.basicConfig( + level=logging.DEBUG, + format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s" + ) + elif args.verbose > 0: + logging.basicConfig( + level=logging.INFO, + format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s" + ) + else: + logging.basicConfig( + level=logging.WARN, + format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s" + ) + logging.warning('Skip DEBUG/INFO messages') + + dumpdir = Path(args.dumpdir).expanduser() + # use absolute path + dumpdir = dumpdir.resolve() + dumpdir.mkdir(parents=True, exist_ok=True) + + # get dataset + with jsonlines.open(args.metadata, 'r') as reader: + metadata = list(reader) + if args.use_relative_path: + # if use_relative_path in preprocess, covert it to absolute path here + metadata_dir = Path(args.metadata).parent + for item in metadata: + item["feats"] = str(metadata_dir / item["feats"]) + + dataset = DataTable( + metadata, converters={ + 'feats': np.load, + }) + logging.info(f"The number of files = {len(dataset)}.") + + # restore scaler + scaler = StandardScaler() + scaler.mean_ = np.load(args.stats)[0] + scaler.scale_ = np.load(args.stats)[1] + # from version 0.23.0, this information is needed + scaler.n_features_in_ = scaler.mean_.shape[0] + + vocab_phones = {} + with open(args.phones_dict, 'rt') as f: + phn_id = [line.strip().split() for line in f.readlines()] + for phn, id in phn_id: + vocab_phones[phn] = int(id) + + vocab_tones = {} + with open(args.tones_dict, 'rt') as f: + tone_id = [line.strip().split() for line in f.readlines()] + for tone, id in tone_id: + vocab_tones[tone] = int(id) + + vocab_speaker = {} + with open(args.speaker_dict, 'rt') as f: + spk_id = [line.strip().split() for line in f.readlines()] + for spk, id in spk_id: + vocab_speaker[spk] = int(id) + + # process each file + output_metadata = [] + + for item in tqdm(dataset): + utt_id = item['utt_id'] + mel = item['feats'] + # normalize + mel = scaler.transform(mel) + + # save + mel_path = dumpdir / f"{utt_id}_feats.npy" + np.save(mel_path, mel.astype(np.float32), allow_pickle=False) + phone_ids = [vocab_phones[p] for p in item['phones']] + tone_ids = [vocab_tones[p] for p in item['tones']] + spk_id = vocab_speaker[item["speaker"]] + if args.use_relative_path: + # convert absolute path to relative path: + mel_path = mel_path.relative_to(dumpdir) + output_metadata.append({ + 'utt_id': utt_id, + "spk_id": spk_id, + 'phones': phone_ids, + 'tones': tone_ids, + 'num_phones': item['num_phones'], + 'num_frames': item['num_frames'], + 'durations': item['durations'], + 'feats': str(mel_path), + }) + output_metadata.sort(key=itemgetter('utt_id')) + output_metadata_path = Path(args.dumpdir) / "metadata.jsonl" + with jsonlines.open(output_metadata_path, 'w') as writer: + for item in output_metadata: + writer.write(item) + logging.info(f"metadata dumped into {output_metadata_path}") + + +if __name__ == "__main__": + main() diff --git a/ernie-sat/paddlespeech/t2s/exps/speedyspeech/preprocess.py b/ernie-sat/paddlespeech/t2s/exps/speedyspeech/preprocess.py new file mode 100644 index 0000000000000000000000000000000000000000..e833d13940530f293842a842b65f33cf6d03d9bd --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/exps/speedyspeech/preprocess.py @@ -0,0 +1,298 @@ +# Copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse +import re +from concurrent.futures import ThreadPoolExecutor +from operator import itemgetter +from pathlib import Path +from typing import Any +from typing import Dict +from typing import List + +import jsonlines +import librosa +import numpy as np +import tqdm +import yaml +from yacs.config import CfgNode + +from paddlespeech.t2s.datasets.get_feats import LogMelFBank +from paddlespeech.t2s.datasets.preprocess_utils import compare_duration_and_mel_length +from paddlespeech.t2s.datasets.preprocess_utils import get_phn_dur +from paddlespeech.t2s.datasets.preprocess_utils import get_phones_tones +from paddlespeech.t2s.datasets.preprocess_utils import get_spk_id_map +from paddlespeech.t2s.datasets.preprocess_utils import merge_silence +from paddlespeech.t2s.utils import str2bool + + +def process_sentence(config: Dict[str, Any], + fp: Path, + sentences: Dict, + output_dir: Path, + mel_extractor=None, + cut_sil: bool=True): + utt_id = fp.stem + record = None + if utt_id in sentences: + # reading, resampling may occur + wav, _ = librosa.load(str(fp), sr=config.fs) + if len(wav.shape) != 1 or np.abs(wav).max() > 1.0: + return record + assert len(wav.shape) == 1, f"{utt_id} is not a mono-channel audio." + assert np.abs(wav).max( + ) <= 1.0, f"{utt_id} is seems to be different that 16 bit PCM." 
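`normalize.py` above restores a `StandardScaler` by indexing row 0 of the stats file for the mean and row 1 for the scale. A minimal sketch of writing and reloading a stats file under that assumption, with random frames standing in for real mel features:

```python
import numpy as np
from sklearn.preprocessing import StandardScaler

# fit a scaler on placeholder mel frames (n_frames x n_mels)
feats = np.random.rand(1000, 80).astype(np.float32)
scaler = StandardScaler()
scaler.partial_fit(feats)

# save a 2 x n_mels array: row 0 = mean, row 1 = scale
np.save("stats.npy", np.stack([scaler.mean_, scaler.scale_]).astype(np.float32))

# restore it the same way normalize.py does
restored = StandardScaler()
stats = np.load("stats.npy")
restored.mean_ = stats[0]
restored.scale_ = stats[1]
restored.n_features_in_ = restored.mean_.shape[0]  # required since scikit-learn 0.23

normalized = restored.transform(feats)  # per-bin zero mean, unit variance
```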
+ phones = sentences[utt_id][0] + durations = sentences[utt_id][1] + speaker = sentences[utt_id][2] + d_cumsum = np.pad(np.array(durations).cumsum(0), (1, 0), 'constant') + # little imprecise than use *.TextGrid directly + times = librosa.frames_to_time( + d_cumsum, sr=config.fs, hop_length=config.n_shift) + if cut_sil: + start = 0 + end = d_cumsum[-1] + if phones[0] == "sil" and len(durations) > 1: + start = times[1] + durations = durations[1:] + phones = phones[1:] + if phones[-1] == 'sil' and len(durations) > 1: + end = times[-2] + durations = durations[:-1] + phones = phones[:-1] + sentences[utt_id][0] = phones + sentences[utt_id][1] = durations + start, end = librosa.time_to_samples([start, end], sr=config.fs) + wav = wav[start:end] + + # extract mel feats + logmel = mel_extractor.get_log_mel_fbank(wav) + # change duration according to mel_length + compare_duration_and_mel_length(sentences, utt_id, logmel) + # utt_id may be popped in compare_duration_and_mel_length + if utt_id not in sentences: + return None + labels = sentences[utt_id][0] + # extract phone and duration + phones = [] + tones = [] + for label in labels: + # split tone from finals + match = re.match(r'^(\w+)([012345])$', label) + if match: + phones.append(match.group(1)) + tones.append(match.group(2)) + else: + phones.append(label) + tones.append('0') + durations = sentences[utt_id][1] + num_frames = logmel.shape[0] + assert sum(durations) == num_frames + assert len(phones) == len(tones) == len(durations) + + mel_path = output_dir / (utt_id + "_feats.npy") + np.save(mel_path, logmel) # (num_frames, n_mels) + record = { + "utt_id": utt_id, + "phones": phones, + "tones": tones, + "speaker": speaker, + "num_phones": len(phones), + "num_frames": num_frames, + "durations": durations, + "feats": str(mel_path), # Path object + } + return record + + +def process_sentences(config, + fps: List[Path], + sentences: Dict, + output_dir: Path, + mel_extractor=None, + nprocs: int=1, + cut_sil: bool=True, + use_relative_path: bool=False): + if nprocs == 1: + results = [] + for fp in tqdm.tqdm(fps, total=len(fps)): + record = process_sentence(config, fp, sentences, output_dir, + mel_extractor, cut_sil) + if record: + results.append(record) + else: + with ThreadPoolExecutor(nprocs) as pool: + futures = [] + with tqdm.tqdm(total=len(fps)) as progress: + for fp in fps: + future = pool.submit(process_sentence, config, fp, + sentences, output_dir, mel_extractor, + cut_sil) + future.add_done_callback(lambda p: progress.update()) + futures.append(future) + + results = [] + for ft in futures: + record = ft.result() + if record: + results.append(record) + + results.sort(key=itemgetter("utt_id")) + output_dir = Path(output_dir) + metadata_path = output_dir / "metadata.jsonl" + # NOTE: use relative path to the meta jsonlines file for Full Chain Project + with jsonlines.open(metadata_path, 'w') as writer: + for item in results: + if use_relative_path: + item["feats"] = str(Path(item["feats"]).relative_to(output_dir)) + writer.write(item) + print("Done") + + +def main(): + # parse config and args + parser = argparse.ArgumentParser( + description="Preprocess audio and then extract features.") + + parser.add_argument( + "--dataset", + default="baker", + type=str, + help="name of dataset, should in {baker} now") + + parser.add_argument( + "--rootdir", default=None, type=str, help="directory to dataset.") + parser.add_argument( + "--dumpdir", + type=str, + required=True, + help="directory to dump feature files.") + + parser.add_argument( + "--dur-file", 
+ default=None, + type=str, + help="path to baker durations.txt.") + + parser.add_argument("--config", type=str, help="fastspeech2 config file.") + + parser.add_argument( + "--verbose", + type=int, + default=1, + help="logging level. higher is more logging. (default=1)") + parser.add_argument( + "--num-cpu", type=int, default=1, help="number of process.") + + parser.add_argument( + "--cut-sil", + type=str2bool, + default=True, + help="whether cut sil in the edge of audio") + + parser.add_argument( + "--use-relative-path", + type=str2bool, + default=False, + help="whether use relative path in metadata") + + args = parser.parse_args() + + rootdir = Path(args.rootdir).expanduser() + dumpdir = Path(args.dumpdir).expanduser() + # use absolute path + dumpdir = dumpdir.resolve() + dumpdir.mkdir(parents=True, exist_ok=True) + dur_file = Path(args.dur_file).expanduser() + + assert rootdir.is_dir() + assert dur_file.is_file() + + with open(args.config, 'rt') as f: + config = CfgNode(yaml.safe_load(f)) + + if args.verbose > 1: + print(vars(args)) + print(config) + + sentences, speaker_set = get_phn_dur(dur_file) + + merge_silence(sentences) + phone_id_map_path = dumpdir / "phone_id_map.txt" + tone_id_map_path = dumpdir / "tone_id_map.txt" + get_phones_tones(sentences, phone_id_map_path, tone_id_map_path, + args.dataset) + speaker_id_map_path = dumpdir / "speaker_id_map.txt" + get_spk_id_map(speaker_set, speaker_id_map_path) + + if args.dataset == "baker": + wav_files = sorted(list((rootdir / "Wave").rglob("*.wav"))) + # split data into 3 sections + num_train = 9800 + num_dev = 100 + train_wav_files = wav_files[:num_train] + dev_wav_files = wav_files[num_train:num_train + num_dev] + test_wav_files = wav_files[num_train + num_dev:] + + train_dump_dir = dumpdir / "train" / "raw" + train_dump_dir.mkdir(parents=True, exist_ok=True) + dev_dump_dir = dumpdir / "dev" / "raw" + dev_dump_dir.mkdir(parents=True, exist_ok=True) + test_dump_dir = dumpdir / "test" / "raw" + test_dump_dir.mkdir(parents=True, exist_ok=True) + + # Extractor + mel_extractor = LogMelFBank( + sr=config.fs, + n_fft=config.n_fft, + hop_length=config.n_shift, + win_length=config.win_length, + window=config.window, + n_mels=config.n_mels, + fmin=config.fmin, + fmax=config.fmax) + + # process for the 3 sections + if train_wav_files: + process_sentences( + config, + train_wav_files, + sentences, + train_dump_dir, + mel_extractor, + nprocs=args.num_cpu, + cut_sil=args.cut_sil, + use_relative_path=args.use_relative_path) + if dev_wav_files: + process_sentences( + config, + dev_wav_files, + sentences, + dev_dump_dir, + mel_extractor, + cut_sil=args.cut_sil, + use_relative_path=args.use_relative_path) + if test_wav_files: + process_sentences( + config, + test_wav_files, + sentences, + test_dump_dir, + mel_extractor, + nprocs=args.num_cpu, + cut_sil=args.cut_sil, + use_relative_path=args.use_relative_path) + + +if __name__ == "__main__": + main() diff --git a/ernie-sat/paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py b/ernie-sat/paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py new file mode 100644 index 0000000000000000000000000000000000000000..cb742c59587fa91f442d4ba5868c7b13a23fe085 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py @@ -0,0 +1,203 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
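The preprocessing script above separates the tone digit from each pinyin final with the regex `^(\w+)([012345])$`, and falls back to tone `'0'` for labels without a tone. A small sketch of that split:

```python
import re
from typing import List, Tuple


def split_phones_tones(labels: List[str]) -> Tuple[List[str], List[str]]:
    """Split finals such as 'ao3' into ('ao', '3'); untoned labels get tone '0'."""
    phones, tones = [], []
    for label in labels:
        match = re.match(r"^(\w+)([012345])$", label)
        if match:
            phones.append(match.group(1))
            tones.append(match.group(2))
        else:
            phones.append(label)
            tones.append("0")
    return phones, tones


# e.g. split_phones_tones(["sil", "n", "i3", "h", "ao3"])
#      -> (["sil", "n", "i", "h", "ao"], ["0", "0", "3", "0", "3"])
```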
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# remain for chains +import argparse +import logging +import os +from pathlib import Path + +import numpy as np +import paddle +import soundfile as sf +import yaml +from paddle import jit +from paddle.static import InputSpec +from yacs.config import CfgNode + +from paddlespeech.t2s.frontend.zh_frontend import Frontend +from paddlespeech.t2s.models.parallel_wavegan import PWGGenerator +from paddlespeech.t2s.models.parallel_wavegan import PWGInference +from paddlespeech.t2s.models.speedyspeech import SpeedySpeech +from paddlespeech.t2s.models.speedyspeech import SpeedySpeechInference +from paddlespeech.t2s.modules.normalizer import ZScore + + +def evaluate(args, speedyspeech_config, pwg_config): + # dataloader has been too verbose + logging.getLogger("DataLoader").disabled = True + + # construct dataset for evaluation + sentences = [] + with open(args.text, 'rt') as f: + for line in f: + items = line.strip().split() + utt_id = items[0] + sentence = "".join(items[1:]) + sentences.append((utt_id, sentence)) + + with open(args.phones_dict, "r") as f: + phn_id = [line.strip().split() for line in f.readlines()] + vocab_size = len(phn_id) + print("vocab_size:", vocab_size) + with open(args.tones_dict, "r") as f: + tone_id = [line.strip().split() for line in f.readlines()] + tone_size = len(tone_id) + print("tone_size:", tone_size) + + model = SpeedySpeech( + vocab_size=vocab_size, + tone_size=tone_size, + **speedyspeech_config["model"]) + model.set_state_dict( + paddle.load(args.speedyspeech_checkpoint)["main_params"]) + model.eval() + + vocoder = PWGGenerator(**pwg_config["generator_params"]) + vocoder.set_state_dict(paddle.load(args.pwg_checkpoint)["generator_params"]) + vocoder.remove_weight_norm() + vocoder.eval() + print("model done!") + + stat = np.load(args.speedyspeech_stat) + mu, std = stat + mu = paddle.to_tensor(mu) + std = paddle.to_tensor(std) + speedyspeech_normalizer = ZScore(mu, std) + + stat = np.load(args.pwg_stat) + mu, std = stat + mu = paddle.to_tensor(mu) + std = paddle.to_tensor(std) + pwg_normalizer = ZScore(mu, std) + + speedyspeech_inference = SpeedySpeechInference(speedyspeech_normalizer, + model) + speedyspeech_inference.eval() + speedyspeech_inference = jit.to_static( + speedyspeech_inference, + input_spec=[ + InputSpec([-1], dtype=paddle.int64), InputSpec( + [-1], dtype=paddle.int64) + ]) + paddle.jit.save(speedyspeech_inference, + os.path.join(args.inference_dir, "speedyspeech")) + speedyspeech_inference = paddle.jit.load( + os.path.join(args.inference_dir, "speedyspeech")) + + pwg_inference = PWGInference(pwg_normalizer, vocoder) + pwg_inference.eval() + pwg_inference = jit.to_static( + pwg_inference, input_spec=[ + InputSpec([-1, 80], dtype=paddle.float32), + ]) + paddle.jit.save(pwg_inference, os.path.join(args.inference_dir, "pwg")) + pwg_inference = paddle.jit.load(os.path.join(args.inference_dir, "pwg")) + + frontend = Frontend( + phone_vocab_path=args.phones_dict, tone_vocab_path=args.tones_dict) + print("frontend done!") + + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + for utt_id, sentence in sentences: + 
input_ids = frontend.get_input_ids( + sentence, merge_sentences=True, get_tone_ids=True) + phone_ids = input_ids["phone_ids"] + tone_ids = input_ids["tone_ids"] + + flags = 0 + for i in range(len(phone_ids)): + part_phone_ids = phone_ids[i] + part_tone_ids = tone_ids[i] + with paddle.no_grad(): + mel = speedyspeech_inference(part_phone_ids, part_tone_ids) + temp_wav = pwg_inference(mel) + if flags == 0: + wav = temp_wav + flags = 1 + else: + wav = paddle.concat([wav, temp_wav]) + sf.write( + output_dir / (utt_id + ".wav"), + wav.numpy(), + samplerate=speedyspeech_config.fs) + print(f"{utt_id} done!") + + +def main(): + # parse args and config and redirect to train_sp + parser = argparse.ArgumentParser( + description="Synthesize with speedyspeech & parallel wavegan.") + parser.add_argument( + "--speedyspeech-config", type=str, help="config file for speedyspeech.") + parser.add_argument( + "--speedyspeech-checkpoint", + type=str, + help="speedyspeech checkpoint to load.") + parser.add_argument( + "--speedyspeech-stat", + type=str, + help="mean and standard deviation used to normalize spectrogram when training speedyspeech." + ) + parser.add_argument( + "--pwg-config", type=str, help="config file for parallelwavegan.") + parser.add_argument( + "--pwg-checkpoint", + type=str, + help="parallel wavegan checkpoint to load.") + parser.add_argument( + "--pwg-stat", + type=str, + help="mean and standard deviation used to normalize spectrogram when training speedyspeech." + ) + parser.add_argument( + "--text", + type=str, + help="text to synthesize, a 'utt_id sentence' pair per line") + parser.add_argument( + "--phones-dict", type=str, default=None, help="phone vocabulary file.") + parser.add_argument( + "--tones-dict", type=str, default=None, help="tone vocabulary file.") + parser.add_argument("--output-dir", type=str, help="output dir") + parser.add_argument( + "--inference-dir", type=str, help="dir to save inference models") + parser.add_argument( + "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") + + args, _ = parser.parse_known_args() + + if args.ngpu == 0: + paddle.set_device("cpu") + elif args.ngpu > 0: + paddle.set_device("gpu") + else: + print("ngpu should >= 0 !") + + with open(args.speedyspeech_config) as f: + speedyspeech_config = CfgNode(yaml.safe_load(f)) + with open(args.pwg_config) as f: + pwg_config = CfgNode(yaml.safe_load(f)) + + print("========Args========") + print(yaml.safe_dump(vars(args))) + print("========Config========") + print(speedyspeech_config) + print(pwg_config) + + evaluate(args, speedyspeech_config, pwg_config) + + +if __name__ == "__main__": + main() diff --git a/ernie-sat/paddlespeech/t2s/exps/speedyspeech/train.py b/ernie-sat/paddlespeech/t2s/exps/speedyspeech/train.py new file mode 100644 index 0000000000000000000000000000000000000000..bda5370c1da9ceb1569a27ef068ec57b289e6888 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/exps/speedyspeech/train.py @@ -0,0 +1,239 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
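`synthesize_e2e.py` above exports both the acoustic model and the vocoder with `jit.to_static` plus `InputSpec`, saves them, and immediately reloads the static-graph version for inference. A toy sketch of that export/reload round-trip; `ToyMel` and the output path are made-up stand-ins, not part of the repository:

```python
import os

import paddle
from paddle import jit, nn
from paddle.static import InputSpec


class ToyMel(nn.Layer):
    """Stand-in for an acoustic-model inference Layer: token ids -> fake 80-bin 'mel'."""

    def __init__(self):
        super().__init__()
        self.emb = nn.Embedding(300, 80)

    def forward(self, text):
        return self.emb(text)


model = ToyMel()
model.eval()

# trace with a variable-length int64 input, as the speedyspeech/pwg export above does
static_model = jit.to_static(model, input_spec=[InputSpec([-1], dtype=paddle.int64)])
os.makedirs("toy_inference", exist_ok=True)
paddle.jit.save(static_model, os.path.join("toy_inference", "toy_mel"))

# reload the exported program and run it without the original Python class
reloaded = paddle.jit.load(os.path.join("toy_inference", "toy_mel"))
mel = reloaded(paddle.to_tensor([3, 7, 11], dtype=paddle.int64))
print(mel.shape)  # [3, 80]
```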
+# See the License for the specific language governing permissions and +# limitations under the License. +import argparse +import logging +import os +import shutil +from pathlib import Path + +import jsonlines +import numpy as np +import paddle +import yaml +from paddle import DataParallel +from paddle import distributed as dist +from paddle.io import DataLoader +from paddle.io import DistributedBatchSampler +from yacs.config import CfgNode + +from paddlespeech.t2s.datasets.am_batch_fn import speedyspeech_multi_spk_batch_fn +from paddlespeech.t2s.datasets.am_batch_fn import speedyspeech_single_spk_batch_fn +from paddlespeech.t2s.datasets.data_table import DataTable +from paddlespeech.t2s.models.speedyspeech import SpeedySpeech +from paddlespeech.t2s.models.speedyspeech import SpeedySpeechEvaluator +from paddlespeech.t2s.models.speedyspeech import SpeedySpeechUpdater +from paddlespeech.t2s.training.extensions.snapshot import Snapshot +from paddlespeech.t2s.training.extensions.visualizer import VisualDL +from paddlespeech.t2s.training.optimizer import build_optimizers +from paddlespeech.t2s.training.seeding import seed_everything +from paddlespeech.t2s.training.trainer import Trainer +from paddlespeech.t2s.utils import str2bool + + +def train_sp(args, config): + # decides device type and whether to run in parallel + # setup running environment correctly + world_size = paddle.distributed.get_world_size() + if (not paddle.is_compiled_with_cuda()) or args.ngpu == 0: + paddle.set_device("cpu") + else: + paddle.set_device("gpu") + if world_size > 1: + paddle.distributed.init_parallel_env() + + # set the random seed, it is a must for multiprocess training + seed_everything(config.seed) + + print( + f"rank: {dist.get_rank()}, pid: {os.getpid()}, parent_pid: {os.getppid()}", + ) + + fields = [ + "phones", "tones", "num_phones", "num_frames", "feats", "durations" + ] + + spk_num = None + if args.speaker_dict is not None: + print("multiple speaker speedyspeech!") + collate_fn = speedyspeech_multi_spk_batch_fn + with open(args.speaker_dict, 'rt') as f: + spk_id = [line.strip().split() for line in f.readlines()] + spk_num = len(spk_id) + fields += ["spk_id"] + else: + print("single speaker speedyspeech!") + collate_fn = speedyspeech_single_spk_batch_fn + print("spk_num:", spk_num) + + # dataloader has been too verbose + logging.getLogger("DataLoader").disabled = True + + # construct dataset for training and validation + with jsonlines.open(args.train_metadata, 'r') as reader: + train_metadata = list(reader) + if args.use_relative_path: + # if use_relative_path in preprocess, covert it to absolute path here + metadata_dir = Path(args.train_metadata).parent + for item in train_metadata: + item["feats"] = str(metadata_dir / item["feats"]) + + train_dataset = DataTable( + data=train_metadata, + fields=fields, + converters={ + "feats": np.load, + }, ) + with jsonlines.open(args.dev_metadata, 'r') as reader: + dev_metadata = list(reader) + if args.use_relative_path: + # if use_relative_path in preprocess, covert it to absolute path here + metadata_dir = Path(args.dev_metadata).parent + for item in dev_metadata: + item["feats"] = str(metadata_dir / item["feats"]) + + dev_dataset = DataTable( + data=dev_metadata, + fields=fields, + converters={ + "feats": np.load, + }, ) + + # collate function and dataloader + train_sampler = DistributedBatchSampler( + train_dataset, + batch_size=config.batch_size, + shuffle=True, + drop_last=True) + print("samplers done!") + + train_dataloader = DataLoader( + train_dataset, + 
batch_sampler=train_sampler, + collate_fn=collate_fn, + num_workers=config.num_workers) + dev_dataloader = DataLoader( + dev_dataset, + shuffle=False, + drop_last=False, + batch_size=config.batch_size, + collate_fn=collate_fn, + num_workers=config.num_workers) + print("dataloaders done!") + with open(args.phones_dict, "r") as f: + phn_id = [line.strip().split() for line in f.readlines()] + vocab_size = len(phn_id) + print("vocab_size:", vocab_size) + with open(args.tones_dict, "r") as f: + tone_id = [line.strip().split() for line in f.readlines()] + tone_size = len(tone_id) + print("tone_size:", tone_size) + + model = SpeedySpeech( + vocab_size=vocab_size, + tone_size=tone_size, + spk_num=spk_num, + **config["model"]) + if world_size > 1: + model = DataParallel(model) + print("model done!") + optimizer = build_optimizers(model, **config["optimizer"]) + print("optimizer done!") + + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + if dist.get_rank() == 0: + config_name = args.config.split("/")[-1] + # copy conf to output_dir + shutil.copyfile(args.config, output_dir / config_name) + + updater = SpeedySpeechUpdater( + model=model, + optimizer=optimizer, + dataloader=train_dataloader, + output_dir=output_dir) + + trainer = Trainer(updater, (config.max_epoch, 'epoch'), output_dir) + + evaluator = SpeedySpeechEvaluator( + model, dev_dataloader, output_dir=output_dir) + + if dist.get_rank() == 0: + trainer.extend(evaluator, trigger=(1, "epoch")) + trainer.extend(VisualDL(output_dir), trigger=(1, "iteration")) + trainer.extend( + Snapshot(max_size=config.num_snapshots), trigger=(1, 'epoch')) + trainer.run() + + +def main(): + # parse args and config and redirect to train_sp + parser = argparse.ArgumentParser( + description="Train a Speedyspeech model with a single speaker dataset.") + parser.add_argument("--config", type=str, help="config file.") + parser.add_argument("--train-metadata", type=str, help="training data.") + parser.add_argument("--dev-metadata", type=str, help="dev data.") + parser.add_argument("--output-dir", type=str, help="output dir.") + parser.add_argument( + "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") + + parser.add_argument( + "--use-relative-path", + type=str2bool, + default=False, + help="whether use relative path in metadata") + + parser.add_argument( + "--phones-dict", type=str, default=None, help="phone vocabulary file.") + + parser.add_argument( + "--tones-dict", type=str, default=None, help="tone vocabulary file.") + + parser.add_argument( + "--speaker-dict", + type=str, + default=None, + help="speaker id map file for multiple speaker model.") + + # 这里可以多传入 max_epoch 等 + args, rest = parser.parse_known_args() + + with open(args.config) as f: + config = CfgNode(yaml.safe_load(f)) + + if rest: + extra = [] + # to support key=value format + for item in rest: + # remove "--" + item = item[2:] + extra.extend(item.split("=", maxsplit=1)) + config.merge_from_list(extra) + + print("========Args========") + print(yaml.safe_dump(vars(args))) + print("========Config========") + print(config) + print( + f"master see the word size: {dist.get_world_size()}, from pid: {os.getpid()}" + ) + + # dispatch + if args.ngpu > 1: + dist.spawn(train_sp, (args, config), nprocs=args.ngpu) + else: + train_sp(args, config) + + +if __name__ == "__main__": + main() diff --git a/ernie-sat/paddlespeech/t2s/exps/syn_utils.py b/ernie-sat/paddlespeech/t2s/exps/syn_utils.py new file mode 100644 index 
0000000000000000000000000000000000000000..c52cb372710bd4308d9a54507284e7d10cafa6a1 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/exps/syn_utils.py @@ -0,0 +1,243 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os + +import numpy as np +import paddle +from paddle import jit +from paddle.static import InputSpec + +from paddlespeech.s2t.utils.dynamic_import import dynamic_import +from paddlespeech.t2s.datasets.data_table import DataTable +from paddlespeech.t2s.frontend import English +from paddlespeech.t2s.frontend.zh_frontend import Frontend +from paddlespeech.t2s.modules.normalizer import ZScore + +model_alias = { + # acoustic model + "speedyspeech": + "paddlespeech.t2s.models.speedyspeech:SpeedySpeech", + "speedyspeech_inference": + "paddlespeech.t2s.models.speedyspeech:SpeedySpeechInference", + "fastspeech2": + "paddlespeech.t2s.models.fastspeech2:FastSpeech2", + "fastspeech2_inference": + "paddlespeech.t2s.models.fastspeech2:FastSpeech2Inference", + "tacotron2": + "paddlespeech.t2s.models.tacotron2:Tacotron2", + "tacotron2_inference": + "paddlespeech.t2s.models.tacotron2:Tacotron2Inference", + # voc + "pwgan": + "paddlespeech.t2s.models.parallel_wavegan:PWGGenerator", + "pwgan_inference": + "paddlespeech.t2s.models.parallel_wavegan:PWGInference", + "mb_melgan": + "paddlespeech.t2s.models.melgan:MelGANGenerator", + "mb_melgan_inference": + "paddlespeech.t2s.models.melgan:MelGANInference", + "style_melgan": + "paddlespeech.t2s.models.melgan:StyleMelGANGenerator", + "style_melgan_inference": + "paddlespeech.t2s.models.melgan:StyleMelGANInference", + "hifigan": + "paddlespeech.t2s.models.hifigan:HiFiGANGenerator", + "hifigan_inference": + "paddlespeech.t2s.models.hifigan:HiFiGANInference", + "wavernn": + "paddlespeech.t2s.models.wavernn:WaveRNN", + "wavernn_inference": + "paddlespeech.t2s.models.wavernn:WaveRNNInference", +} + + +# input +def get_sentences(args): + # construct dataset for evaluation + sentences = [] + with open(args.text, 'rt') as f: + for line in f: + items = line.strip().split() + utt_id = items[0] + if 'lang' in args and args.lang == 'zh': + sentence = "".join(items[1:]) + elif 'lang' in args and args.lang == 'en': + sentence = " ".join(items[1:]) + sentences.append((utt_id, sentence)) + return sentences + + +def get_test_dataset(args, test_metadata, am_name, am_dataset): + if am_name == 'fastspeech2': + fields = ["utt_id", "text"] + if am_dataset in {"aishell3", "vctk"} and args.speaker_dict: + print("multiple speaker fastspeech2!") + fields += ["spk_id"] + elif 'voice_cloning' in args and args.voice_cloning: + print("voice cloning!") + fields += ["spk_emb"] + else: + print("single speaker fastspeech2!") + elif am_name == 'speedyspeech': + fields = ["utt_id", "phones", "tones"] + elif am_name == 'tacotron2': + fields = ["utt_id", "text"] + if 'voice_cloning' in args and args.voice_cloning: + print("voice cloning!") + fields += ["spk_emb"] + + test_dataset = DataTable(data=test_metadata, 
fields=fields) + return test_dataset + + +# frontend +def get_frontend(args): + if 'lang' in args and args.lang == 'zh': + frontend = Frontend( + phone_vocab_path=args.phones_dict, tone_vocab_path=args.tones_dict) + elif 'lang' in args and args.lang == 'en': + frontend = English(phone_vocab_path=args.phones_dict) + else: + print("wrong lang!") + print("frontend done!") + return frontend + + +# dygraph +def get_am_inference(args, am_config): + with open(args.phones_dict, "r") as f: + phn_id = [line.strip().split() for line in f.readlines()] + vocab_size = len(phn_id) + print("vocab_size:", vocab_size) + + tone_size = None + if 'tones_dict' in args and args.tones_dict: + with open(args.tones_dict, "r") as f: + tone_id = [line.strip().split() for line in f.readlines()] + tone_size = len(tone_id) + print("tone_size:", tone_size) + + spk_num = None + if 'speaker_dict' in args and args.speaker_dict: + with open(args.speaker_dict, 'rt') as f: + spk_id = [line.strip().split() for line in f.readlines()] + spk_num = len(spk_id) + print("spk_num:", spk_num) + + odim = am_config.n_mels + # model: {model_name}_{dataset} + am_name = args.am[:args.am.rindex('_')] + am_dataset = args.am[args.am.rindex('_') + 1:] + + am_class = dynamic_import(am_name, model_alias) + am_inference_class = dynamic_import(am_name + '_inference', model_alias) + + if am_name == 'fastspeech2': + am = am_class( + idim=vocab_size, odim=odim, spk_num=spk_num, **am_config["model"]) + elif am_name == 'speedyspeech': + am = am_class( + vocab_size=vocab_size, + tone_size=tone_size, + spk_num=spk_num, + **am_config["model"]) + elif am_name == 'tacotron2': + am = am_class(idim=vocab_size, odim=odim, **am_config["model"]) + + am.set_state_dict(paddle.load(args.am_ckpt)["main_params"]) + am.eval() + am_mu, am_std = np.load(args.am_stat) + am_mu = paddle.to_tensor(am_mu) + am_std = paddle.to_tensor(am_std) + am_normalizer = ZScore(am_mu, am_std) + am_inference = am_inference_class(am_normalizer, am) + am_inference.eval() + print("acoustic model done!") + return am_inference, am_name, am_dataset + + +def get_voc_inference(args, voc_config): + # model: {model_name}_{dataset} + voc_name = args.voc[:args.voc.rindex('_')] + voc_class = dynamic_import(voc_name, model_alias) + voc_inference_class = dynamic_import(voc_name + '_inference', model_alias) + if voc_name != 'wavernn': + voc = voc_class(**voc_config["generator_params"]) + voc.set_state_dict(paddle.load(args.voc_ckpt)["generator_params"]) + voc.remove_weight_norm() + voc.eval() + else: + voc = voc_class(**voc_config["model"]) + voc.set_state_dict(paddle.load(args.voc_ckpt)["main_params"]) + voc.eval() + + voc_mu, voc_std = np.load(args.voc_stat) + voc_mu = paddle.to_tensor(voc_mu) + voc_std = paddle.to_tensor(voc_std) + voc_normalizer = ZScore(voc_mu, voc_std) + voc_inference = voc_inference_class(voc_normalizer, voc) + voc_inference.eval() + print("voc done!") + return voc_inference + + +# to static +def am_to_static(args, am_inference, am_name, am_dataset): + if am_name == 'fastspeech2': + if am_dataset in {"aishell3", "vctk"} and args.speaker_dict: + am_inference = jit.to_static( + am_inference, + input_spec=[ + InputSpec([-1], dtype=paddle.int64), + InputSpec([1], dtype=paddle.int64), + ], ) + else: + am_inference = jit.to_static( + am_inference, input_spec=[InputSpec([-1], dtype=paddle.int64)]) + + elif am_name == 'speedyspeech': + if am_dataset in {"aishell3", "vctk"} and args.speaker_dict: + am_inference = jit.to_static( + am_inference, + input_spec=[ + InputSpec([-1], 
dtype=paddle.int64), # text + InputSpec([-1], dtype=paddle.int64), # tone + InputSpec([1], dtype=paddle.int64), # spk_id + None # duration + ]) + else: + am_inference = jit.to_static( + am_inference, + input_spec=[ + InputSpec([-1], dtype=paddle.int64), + InputSpec([-1], dtype=paddle.int64) + ]) + + elif am_name == 'tacotron2': + am_inference = jit.to_static( + am_inference, input_spec=[InputSpec([-1], dtype=paddle.int64)]) + + paddle.jit.save(am_inference, os.path.join(args.inference_dir, args.am)) + am_inference = paddle.jit.load(os.path.join(args.inference_dir, args.am)) + return am_inference + + +def voc_to_static(args, voc_inference): + voc_inference = jit.to_static( + voc_inference, input_spec=[ + InputSpec([-1, 80], dtype=paddle.float32), + ]) + paddle.jit.save(voc_inference, os.path.join(args.inference_dir, args.voc)) + voc_inference = paddle.jit.load(os.path.join(args.inference_dir, args.voc)) + return voc_inference diff --git a/ernie-sat/paddlespeech/t2s/exps/synthesize.py b/ernie-sat/paddlespeech/t2s/exps/synthesize.py new file mode 100644 index 0000000000000000000000000000000000000000..abb1eb4eb6e395b835c56d64faa226d51e8049ea --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/exps/synthesize.py @@ -0,0 +1,200 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse +import logging +from pathlib import Path + +import jsonlines +import numpy as np +import paddle +import soundfile as sf +import yaml +from timer import timer +from yacs.config import CfgNode + +from paddlespeech.t2s.exps.syn_utils import get_am_inference +from paddlespeech.t2s.exps.syn_utils import get_test_dataset +from paddlespeech.t2s.exps.syn_utils import get_voc_inference +from paddlespeech.t2s.utils import str2bool + + +def evaluate(args): + # dataloader has been too verbose + logging.getLogger("DataLoader").disabled = True + + # construct dataset for evaluation + with jsonlines.open(args.test_metadata, 'r') as reader: + test_metadata = list(reader) + + # Init body. 
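+ # Both the acoustic-model and vocoder configs are plain YAML; CfgNode wraps them so fields such as am_config.fs and am_config.n_mels can be read as attributes below.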
+ with open(args.am_config) as f: + am_config = CfgNode(yaml.safe_load(f)) + with open(args.voc_config) as f: + voc_config = CfgNode(yaml.safe_load(f)) + + print("========Args========") + print(yaml.safe_dump(vars(args))) + print("========Config========") + print(am_config) + print(voc_config) + + # acoustic model + am_inference, am_name, am_dataset = get_am_inference(args, am_config) + test_dataset = get_test_dataset(args, test_metadata, am_name, am_dataset) + + # vocoder + voc_inference = get_voc_inference(args, voc_config) + + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + N = 0 + T = 0 + + for datum in test_dataset: + utt_id = datum["utt_id"] + with timer() as t: + with paddle.no_grad(): + # acoustic model + if am_name == 'fastspeech2': + phone_ids = paddle.to_tensor(datum["text"]) + spk_emb = None + spk_id = None + # multi speaker + if args.voice_cloning and "spk_emb" in datum: + spk_emb = paddle.to_tensor(np.load(datum["spk_emb"])) + elif "spk_id" in datum: + spk_id = paddle.to_tensor(datum["spk_id"]) + mel = am_inference( + phone_ids, spk_id=spk_id, spk_emb=spk_emb) + elif am_name == 'speedyspeech': + phone_ids = paddle.to_tensor(datum["phones"]) + tone_ids = paddle.to_tensor(datum["tones"]) + mel = am_inference(phone_ids, tone_ids) + elif am_name == 'tacotron2': + phone_ids = paddle.to_tensor(datum["text"]) + spk_emb = None + # multi speaker + if args.voice_cloning and "spk_emb" in datum: + spk_emb = paddle.to_tensor(np.load(datum["spk_emb"])) + mel = am_inference(phone_ids, spk_emb=spk_emb) + # vocoder + wav = voc_inference(mel) + + wav = wav.numpy() + N += wav.size + T += t.elapse + speed = wav.size / t.elapse + rtf = am_config.fs / speed + print( + f"{utt_id}, mel: {mel.shape}, wave: {wav.size}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}." + ) + sf.write( + str(output_dir / (utt_id + ".wav")), wav, samplerate=am_config.fs) + print(f"{utt_id} done!") + print(f"generation speed: {N / T}Hz, RTF: {am_config.fs / (N / T) }") + + +def parse_args(): + # parse args and config and redirect to train_sp + parser = argparse.ArgumentParser( + description="Synthesize with acoustic model & vocoder") + # acoustic model + parser.add_argument( + '--am', + type=str, + default='fastspeech2_csmsc', + choices=[ + 'speedyspeech_csmsc', 'fastspeech2_csmsc', 'fastspeech2_ljspeech', + 'fastspeech2_aishell3', 'fastspeech2_vctk', 'tacotron2_csmsc', + 'tacotron2_ljspeech', 'tacotron2_aishell3' + ], + help='Choose acoustic model type of tts task.') + parser.add_argument( + '--am_config', + type=str, + default=None, + help='Config of acoustic model. Use deault config when it is None.') + parser.add_argument( + '--am_ckpt', + type=str, + default=None, + help='Checkpoint file of acoustic model.') + parser.add_argument( + "--am_stat", + type=str, + default=None, + help="mean and standard deviation used to normalize spectrogram when training acoustic model." 
+ ) + parser.add_argument( + "--phones_dict", type=str, default=None, help="phone vocabulary file.") + parser.add_argument( + "--tones_dict", type=str, default=None, help="tone vocabulary file.") + parser.add_argument( + "--speaker_dict", type=str, default=None, help="speaker id map file.") + parser.add_argument( + "--voice-cloning", + type=str2bool, + default=False, + help="whether training voice cloning model.") + # vocoder + parser.add_argument( + '--voc', + type=str, + default='pwgan_csmsc', + choices=[ + 'pwgan_csmsc', 'pwgan_ljspeech', 'pwgan_aishell3', 'pwgan_vctk', + 'mb_melgan_csmsc', 'wavernn_csmsc', 'hifigan_csmsc', + 'hifigan_ljspeech', 'hifigan_aishell3', 'hifigan_vctk', + 'style_melgan_csmsc' + ], + help='Choose vocoder type of tts task.') + parser.add_argument( + '--voc_config', + type=str, + default=None, + help='Config of voc. Use deault config when it is None.') + parser.add_argument( + '--voc_ckpt', type=str, default=None, help='Checkpoint file of voc.') + parser.add_argument( + "--voc_stat", + type=str, + default=None, + help="mean and standard deviation used to normalize spectrogram when training voc." + ) + # other + parser.add_argument( + "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") + parser.add_argument("--test_metadata", type=str, help="test metadata.") + parser.add_argument("--output_dir", type=str, help="output dir.") + + args = parser.parse_args() + return args + + +def main(): + + args = parse_args() + if args.ngpu == 0: + paddle.set_device("cpu") + elif args.ngpu > 0: + paddle.set_device("gpu") + else: + print("ngpu should >= 0 !") + + evaluate(args) + + +if __name__ == "__main__": + main() diff --git a/ernie-sat/paddlespeech/t2s/exps/synthesize_e2e.py b/ernie-sat/paddlespeech/t2s/exps/synthesize_e2e.py new file mode 100644 index 0000000000000000000000000000000000000000..10b33c60acfafa93cbf6c8400cc3ba1b733c8a30 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/exps/synthesize_e2e.py @@ -0,0 +1,247 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse +from pathlib import Path + +import paddle +import soundfile as sf +import yaml +from timer import timer +from yacs.config import CfgNode + +from paddlespeech.t2s.exps.syn_utils import am_to_static +from paddlespeech.t2s.exps.syn_utils import get_am_inference +from paddlespeech.t2s.exps.syn_utils import get_frontend +from paddlespeech.t2s.exps.syn_utils import get_sentences +from paddlespeech.t2s.exps.syn_utils import get_voc_inference +from paddlespeech.t2s.exps.syn_utils import voc_to_static + + +def evaluate(args): + + # Init body. 
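+ # End-to-end pipeline: raw text -> frontend (phone/tone ids) -> acoustic model (mel) -> vocoder (wav).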
+ with open(args.am_config) as f: + am_config = CfgNode(yaml.safe_load(f)) + with open(args.voc_config) as f: + voc_config = CfgNode(yaml.safe_load(f)) + + print("========Args========") + print(yaml.safe_dump(vars(args))) + print("========Config========") + print(am_config) + print(voc_config) + + sentences = get_sentences(args) + + # frontend + frontend = get_frontend(args) + + # acoustic model + am_inference, am_name, am_dataset = get_am_inference(args, am_config) + + # vocoder + voc_inference = get_voc_inference(args, voc_config) + + # whether dygraph to static + if args.inference_dir: + # acoustic model + am_inference = am_to_static(args, am_inference, am_name, am_dataset) + + # vocoder + voc_inference = voc_to_static(args, voc_inference) + + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + merge_sentences = False + # Avoid not stopping at the end of a sub sentence when tacotron2_ljspeech dygraph to static graph + # but still not stopping in the end (NOTE by yuantian01 Feb 9 2022) + if am_name == 'tacotron2': + merge_sentences = True + N = 0 + T = 0 + for utt_id, sentence in sentences: + with timer() as t: + get_tone_ids = False + if am_name == 'speedyspeech': + get_tone_ids = True + if args.lang == 'zh': + input_ids = frontend.get_input_ids( + sentence, + merge_sentences=merge_sentences, + get_tone_ids=get_tone_ids) + phone_ids = input_ids["phone_ids"] + if get_tone_ids: + tone_ids = input_ids["tone_ids"] + elif args.lang == 'en': + input_ids = frontend.get_input_ids( + sentence, merge_sentences=merge_sentences) + phone_ids = input_ids["phone_ids"] + else: + print("lang should in {'zh', 'en'}!") + with paddle.no_grad(): + flags = 0 + for i in range(len(phone_ids)): + part_phone_ids = phone_ids[i] + # acoustic model + if am_name == 'fastspeech2': + # multi speaker + if am_dataset in {"aishell3", "vctk"}: + spk_id = paddle.to_tensor(args.spk_id) + mel = am_inference(part_phone_ids, spk_id) + else: + mel = am_inference(part_phone_ids) + elif am_name == 'speedyspeech': + part_tone_ids = tone_ids[i] + if am_dataset in {"aishell3", "vctk"}: + spk_id = paddle.to_tensor(args.spk_id) + mel = am_inference(part_phone_ids, part_tone_ids, + spk_id) + else: + mel = am_inference(part_phone_ids, part_tone_ids) + elif am_name == 'tacotron2': + mel = am_inference(part_phone_ids) + # vocoder + wav = voc_inference(mel) + if flags == 0: + wav_all = wav + flags = 1 + else: + wav_all = paddle.concat([wav_all, wav]) + wav = wav_all.numpy() + N += wav.size + T += t.elapse + speed = wav.size / t.elapse + rtf = am_config.fs / speed + print( + f"{utt_id}, mel: {mel.shape}, wave: {wav.shape}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}." + ) + sf.write( + str(output_dir / (utt_id + ".wav")), wav, samplerate=am_config.fs) + print(f"{utt_id} done!") + print(f"generation speed: {N / T}Hz, RTF: {am_config.fs / (N / T) }") + + +def parse_args(): + # parse args and config and redirect to train_sp + parser = argparse.ArgumentParser( + description="Synthesize with acoustic model & vocoder") + # acoustic model + parser.add_argument( + '--am', + type=str, + default='fastspeech2_csmsc', + choices=[ + 'speedyspeech_csmsc', 'speedyspeech_aishell3', 'fastspeech2_csmsc', + 'fastspeech2_ljspeech', 'fastspeech2_aishell3', 'fastspeech2_vctk', + 'tacotron2_csmsc', 'tacotron2_ljspeech' + ], + help='Choose acoustic model type of tts task.') + parser.add_argument( + '--am_config', + type=str, + default=None, + help='Config of acoustic model. 
Use deault config when it is None.') + parser.add_argument( + '--am_ckpt', + type=str, + default=None, + help='Checkpoint file of acoustic model.') + parser.add_argument( + "--am_stat", + type=str, + default=None, + help="mean and standard deviation used to normalize spectrogram when training acoustic model." + ) + parser.add_argument( + "--phones_dict", type=str, default=None, help="phone vocabulary file.") + parser.add_argument( + "--tones_dict", type=str, default=None, help="tone vocabulary file.") + parser.add_argument( + "--speaker_dict", type=str, default=None, help="speaker id map file.") + parser.add_argument( + '--spk_id', + type=int, + default=0, + help='spk id for multi speaker acoustic model') + # vocoder + parser.add_argument( + '--voc', + type=str, + default='pwgan_csmsc', + choices=[ + 'pwgan_csmsc', + 'pwgan_ljspeech', + 'pwgan_aishell3', + 'pwgan_vctk', + 'mb_melgan_csmsc', + 'style_melgan_csmsc', + 'hifigan_csmsc', + 'hifigan_ljspeech', + 'hifigan_aishell3', + 'hifigan_vctk', + 'wavernn_csmsc', + ], + help='Choose vocoder type of tts task.') + parser.add_argument( + '--voc_config', + type=str, + default=None, + help='Config of voc. Use deault config when it is None.') + parser.add_argument( + '--voc_ckpt', type=str, default=None, help='Checkpoint file of voc.') + parser.add_argument( + "--voc_stat", + type=str, + default=None, + help="mean and standard deviation used to normalize spectrogram when training voc." + ) + # other + parser.add_argument( + '--lang', + type=str, + default='zh', + help='Choose model language. zh or en') + + parser.add_argument( + "--inference_dir", + type=str, + default=None, + help="dir to save inference models") + parser.add_argument( + "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") + parser.add_argument( + "--text", + type=str, + help="text to synthesize, a 'utt_id sentence' pair per line.") + parser.add_argument("--output_dir", type=str, help="output dir.") + + args = parser.parse_args() + return args + + +def main(): + args = parse_args() + + if args.ngpu == 0: + paddle.set_device("cpu") + elif args.ngpu > 0: + paddle.set_device("gpu") + else: + print("ngpu should >= 0 !") + + evaluate(args) + + +if __name__ == "__main__": + main() diff --git a/ernie-sat/paddlespeech/t2s/exps/synthesize_streaming.py b/ernie-sat/paddlespeech/t2s/exps/synthesize_streaming.py new file mode 100644 index 0000000000000000000000000000000000000000..7b9906c1076edda751c4a10772df0f82e8f1f39b --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/exps/synthesize_streaming.py @@ -0,0 +1,273 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
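+# Streaming synthesis: the FastSpeech2 encoder runs once over the whole phone sequence, +# get_chunks() then slices the hidden states into overlapping windows (with the default +# chunk_size=42 and pad_size=12, a 100-frame encoder output yields chunks [0:54], [30:96] +# and [72:100]); each chunk is decoded and de-normalized separately, the pad frames are +# trimmed, and the mel chunks are concatenated before vocoding.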
+import argparse +import math +from pathlib import Path + +import numpy as np +import paddle +import soundfile as sf +import yaml +from timer import timer +from yacs.config import CfgNode + +from paddlespeech.s2t.utils.dynamic_import import dynamic_import +from paddlespeech.t2s.exps.syn_utils import get_frontend +from paddlespeech.t2s.exps.syn_utils import get_sentences +from paddlespeech.t2s.exps.syn_utils import get_voc_inference +from paddlespeech.t2s.exps.syn_utils import model_alias +from paddlespeech.t2s.utils import str2bool + + +def denorm(data, mean, std): + return data * std + mean + + +def get_chunks(data, chunk_size, pad_size): + data_len = data.shape[1] + chunks = [] + n = math.ceil(data_len / chunk_size) + for i in range(n): + start = max(0, i * chunk_size - pad_size) + end = min((i + 1) * chunk_size + pad_size, data_len) + chunks.append(data[:, start:end, :]) + return chunks + + +def evaluate(args): + + # Init body. + with open(args.am_config) as f: + am_config = CfgNode(yaml.safe_load(f)) + with open(args.voc_config) as f: + voc_config = CfgNode(yaml.safe_load(f)) + + print("========Args========") + print(yaml.safe_dump(vars(args))) + print("========Config========") + print(am_config) + print(voc_config) + + sentences = get_sentences(args) + + # frontend + frontend = get_frontend(args) + + with open(args.phones_dict, "r") as f: + phn_id = [line.strip().split() for line in f.readlines()] + vocab_size = len(phn_id) + print("vocab_size:", vocab_size) + + # acoustic model, only support fastspeech2 here now! + # am_inference, am_name, am_dataset = get_am_inference(args, am_config) + # model: {model_name}_{dataset} + am_name = args.am[:args.am.rindex('_')] + am_dataset = args.am[args.am.rindex('_') + 1:] + odim = am_config.n_mels + + am_class = dynamic_import(am_name, model_alias) + am = am_class(idim=vocab_size, odim=odim, **am_config["model"]) + am.set_state_dict(paddle.load(args.am_ckpt)["main_params"]) + am.eval() + am_mu, am_std = np.load(args.am_stat) + am_mu = paddle.to_tensor(am_mu) + am_std = paddle.to_tensor(am_std) + + # vocoder + voc_inference = get_voc_inference(args, voc_config) + + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + merge_sentences = True + get_tone_ids = False + + N = 0 + T = 0 + chunk_size = args.chunk_size + pad_size = args.pad_size + + for utt_id, sentence in sentences: + with timer() as t: + if args.lang == 'zh': + input_ids = frontend.get_input_ids( + sentence, + merge_sentences=merge_sentences, + get_tone_ids=get_tone_ids) + + phone_ids = input_ids["phone_ids"] + else: + print("lang should be 'zh' here!") + # merge_sentences=True here, so we only use the first item of phone_ids + phone_ids = phone_ids[0] + with paddle.no_grad(): + # acoustic model + orig_hs, h_masks = am.encoder_infer(phone_ids) + + if args.am_streaming: + hss = get_chunks(orig_hs, chunk_size, pad_size) + chunk_num = len(hss) + mel_list = [] + for i, hs in enumerate(hss): + before_outs, _ = am.decoder(hs) + after_outs = before_outs + am.postnet( + before_outs.transpose((0, 2, 1))).transpose( + (0, 2, 1)) + normalized_mel = after_outs[0] + sub_mel = denorm(normalized_mel, am_mu, am_std) + # clip output part of pad + if i == 0: + sub_mel = sub_mel[:-pad_size] + elif i == chunk_num - 1: + # the right side of the last chunk never gets a full pad + sub_mel = sub_mel[pad_size:] + else: + # the right side of the chunks near the end may also be short of pad + sub_mel = sub_mel[pad_size:(chunk_size + pad_size) - + sub_mel.shape[0]] + mel_list.append(sub_mel) + mel = paddle.concat(mel_list, axis=0) + + else: + before_outs, _ =
am.decoder(orig_hs) + after_outs = before_outs + am.postnet( + before_outs.transpose((0, 2, 1))).transpose((0, 2, 1)) + normalized_mel = after_outs[0] + mel = denorm(normalized_mel, am_mu, am_std) + + # vocoder + wav = voc_inference(mel) + + wav = wav.numpy() + N += wav.size + T += t.elapse + speed = wav.size / t.elapse + rtf = am_config.fs / speed + print( + f"{utt_id}, mel: {mel.shape}, wave: {wav.shape}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}." + ) + sf.write( + str(output_dir / (utt_id + ".wav")), wav, samplerate=am_config.fs) + print(f"{utt_id} done!") + print(f"generation speed: {N / T}Hz, RTF: {am_config.fs / (N / T) }") + + +def parse_args(): + # parse args and config and redirect to train_sp + parser = argparse.ArgumentParser( + description="Synthesize with acoustic model & vocoder") + # acoustic model + parser.add_argument( + '--am', + type=str, + default='fastspeech2_csmsc', + choices=['fastspeech2_csmsc'], + help='Choose acoustic model type of tts task.') + parser.add_argument( + '--am_config', + type=str, + default=None, + help='Config of acoustic model. Use deault config when it is None.') + parser.add_argument( + '--am_ckpt', + type=str, + default=None, + help='Checkpoint file of acoustic model.') + parser.add_argument( + "--am_stat", + type=str, + default=None, + help="mean and standard deviation used to normalize spectrogram when training acoustic model." + ) + parser.add_argument( + "--phones_dict", type=str, default=None, help="phone vocabulary file.") + parser.add_argument( + "--tones_dict", type=str, default=None, help="tone vocabulary file.") + + # vocoder + parser.add_argument( + '--voc', + type=str, + default='pwgan_csmsc', + choices=[ + 'pwgan_csmsc', + 'pwgan_ljspeech', + 'pwgan_aishell3', + 'pwgan_vctk', + 'mb_melgan_csmsc', + 'style_melgan_csmsc', + 'hifigan_csmsc', + 'hifigan_ljspeech', + 'hifigan_aishell3', + 'hifigan_vctk', + 'wavernn_csmsc', + ], + help='Choose vocoder type of tts task.') + parser.add_argument( + '--voc_config', + type=str, + default=None, + help='Config of voc. Use deault config when it is None.') + parser.add_argument( + '--voc_ckpt', type=str, default=None, help='Checkpoint file of voc.') + parser.add_argument( + "--voc_stat", + type=str, + default=None, + help="mean and standard deviation used to normalize spectrogram when training voc." + ) + # other + parser.add_argument( + '--lang', + type=str, + default='zh', + help='Choose model language. 
zh or en') + + parser.add_argument( + "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") + parser.add_argument( + "--text", + type=str, + help="text to synthesize, a 'utt_id sentence' pair per line.") + + parser.add_argument( + "--am_streaming", + type=str2bool, + default=False, + help="whether use streaming acoustic model") + parser.add_argument( + "--chunk_size", type=int, default=42, help="chunk size of am streaming") + parser.add_argument( + "--pad_size", type=int, default=12, help="pad size of am streaming") + + parser.add_argument("--output_dir", type=str, help="output dir.") + + args = parser.parse_args() + return args + + +def main(): + args = parse_args() + + if args.ngpu == 0: + paddle.set_device("cpu") + elif args.ngpu > 0: + paddle.set_device("gpu") + else: + print("ngpu should >= 0 !") + + evaluate(args) + + +if __name__ == "__main__": + main() diff --git a/ernie-sat/paddlespeech/t2s/exps/tacotron2/__init__.py b/ernie-sat/paddlespeech/t2s/exps/tacotron2/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..abf198b97e6e818e1fbe59006f98492640bcee54 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/exps/tacotron2/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/ernie-sat/paddlespeech/t2s/exps/tacotron2/normalize.py b/ernie-sat/paddlespeech/t2s/exps/tacotron2/normalize.py new file mode 100644 index 0000000000000000000000000000000000000000..87e975b88ffb1b27c63885dfbe7fdb3c4cf5b718 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/exps/tacotron2/normalize.py @@ -0,0 +1,146 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Normalize feature files and dump them.""" +import argparse +import logging +from operator import itemgetter +from pathlib import Path + +import jsonlines +import numpy as np +from sklearn.preprocessing import StandardScaler +from tqdm import tqdm + +from paddlespeech.t2s.datasets.data_table import DataTable + + +def main(): + """Run preprocessing process.""" + parser = argparse.ArgumentParser( + description="Normalize dumped raw features (See detail in parallel_wavegan/bin/normalize.py)." + ) + parser.add_argument( + "--metadata", + type=str, + required=True, + help="directory including feature files to be normalized. 
" + "you need to specify either *-scp or rootdir.") + + parser.add_argument( + "--dumpdir", + type=str, + required=True, + help="directory to dump normalized feature files.") + parser.add_argument( + "--speech-stats", + type=str, + required=True, + help="speech statistics file.") + parser.add_argument( + "--phones-dict", type=str, default=None, help="phone vocabulary file.") + parser.add_argument( + "--speaker-dict", type=str, default=None, help="speaker id map file.") + parser.add_argument( + "--verbose", + type=int, + default=1, + help="logging level. higher is more logging. (default=1)") + args = parser.parse_args() + + # set logger + if args.verbose > 1: + logging.basicConfig( + level=logging.DEBUG, + format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s" + ) + elif args.verbose > 0: + logging.basicConfig( + level=logging.INFO, + format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s" + ) + else: + logging.basicConfig( + level=logging.WARN, + format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s" + ) + logging.warning('Skip DEBUG/INFO messages') + + # check directory existence + dumpdir = Path(args.dumpdir).resolve() + dumpdir.mkdir(parents=True, exist_ok=True) + + # get dataset + with jsonlines.open(args.metadata, 'r') as reader: + metadata = list(reader) + dataset = DataTable( + metadata, converters={ + "speech": np.load, + }) + logging.info(f"The number of files = {len(dataset)}.") + + # restore scaler + speech_scaler = StandardScaler() + speech_scaler.mean_ = np.load(args.speech_stats)[0] + speech_scaler.scale_ = np.load(args.speech_stats)[1] + speech_scaler.n_features_in_ = speech_scaler.mean_.shape[0] + + vocab_phones = {} + with open(args.phones_dict, 'rt') as f: + phn_id = [line.strip().split() for line in f.readlines()] + for phn, id in phn_id: + vocab_phones[phn] = int(id) + + vocab_speaker = {} + with open(args.speaker_dict, 'rt') as f: + spk_id = [line.strip().split() for line in f.readlines()] + for spk, id in spk_id: + vocab_speaker[spk] = int(id) + + # process each file + output_metadata = [] + + for item in tqdm(dataset): + utt_id = item['utt_id'] + speech = item['speech'] + # normalize + speech = speech_scaler.transform(speech) + speech_dir = dumpdir / "data_speech" + speech_dir.mkdir(parents=True, exist_ok=True) + speech_path = speech_dir / f"{utt_id}_speech.npy" + np.save(speech_path, speech.astype(np.float32), allow_pickle=False) + + phone_ids = [vocab_phones[p] for p in item['phones']] + spk_id = vocab_speaker[item["speaker"]] + record = { + "utt_id": item['utt_id'], + "spk_id": spk_id, + "text": phone_ids, + "text_lengths": item['text_lengths'], + "speech_lengths": item['speech_lengths'], + "speech": str(speech_path), + } + # add spk_emb for voice cloning + if "spk_emb" in item: + record["spk_emb"] = str(item["spk_emb"]) + output_metadata.append(record) + output_metadata.sort(key=itemgetter('utt_id')) + output_metadata_path = Path(args.dumpdir) / "metadata.jsonl" + with jsonlines.open(output_metadata_path, 'w') as writer: + for item in output_metadata: + writer.write(item) + logging.info(f"metadata dumped into {output_metadata_path}") + + +if __name__ == "__main__": + main() diff --git a/ernie-sat/paddlespeech/t2s/exps/tacotron2/preprocess.py b/ernie-sat/paddlespeech/t2s/exps/tacotron2/preprocess.py new file mode 100644 index 0000000000000000000000000000000000000000..14a0d7eae227f5650a716bc656f3d0c32ee077e3 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/exps/tacotron2/preprocess.py @@ -0,0 +1,329 @@ +# Copyright 
(c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse +import os +from concurrent.futures import ThreadPoolExecutor +from operator import itemgetter +from pathlib import Path +from typing import Any +from typing import Dict +from typing import List + +import jsonlines +import librosa +import numpy as np +import tqdm +import yaml +from yacs.config import CfgNode + +from paddlespeech.t2s.datasets.get_feats import LogMelFBank +from paddlespeech.t2s.datasets.preprocess_utils import compare_duration_and_mel_length +from paddlespeech.t2s.datasets.preprocess_utils import get_input_token +from paddlespeech.t2s.datasets.preprocess_utils import get_phn_dur +from paddlespeech.t2s.datasets.preprocess_utils import get_spk_id_map +from paddlespeech.t2s.datasets.preprocess_utils import merge_silence +from paddlespeech.t2s.utils import str2bool + + +def process_sentence(config: Dict[str, Any], + fp: Path, + sentences: Dict, + output_dir: Path, + mel_extractor=None, + cut_sil: bool=True, + spk_emb_dir: Path=None): + utt_id = fp.stem + # for vctk + if utt_id.endswith("_mic2"): + utt_id = utt_id[:-5] + record = None + if utt_id in sentences: + # reading, resampling may occur + wav, _ = librosa.load(str(fp), sr=config.fs) + if len(wav.shape) != 1 or np.abs(wav).max() > 1.0: + return record + assert len(wav.shape) == 1, f"{utt_id} is not a mono-channel audio." + assert np.abs(wav).max( + ) <= 1.0, f"{utt_id} does not seem to be normalized 16-bit PCM."
+ phones = sentences[utt_id][0] + durations = sentences[utt_id][1] + speaker = sentences[utt_id][2] + d_cumsum = np.pad(np.array(durations).cumsum(0), (1, 0), 'constant') + # little imprecise than use *.TextGrid directly + times = librosa.frames_to_time( + d_cumsum, sr=config.fs, hop_length=config.n_shift) + if cut_sil: + start = 0 + end = d_cumsum[-1] + if phones[0] == "sil" and len(durations) > 1: + start = times[1] + durations = durations[1:] + phones = phones[1:] + if phones[-1] == 'sil' and len(durations) > 1: + end = times[-2] + durations = durations[:-1] + phones = phones[:-1] + sentences[utt_id][0] = phones + sentences[utt_id][1] = durations + start, end = librosa.time_to_samples([start, end], sr=config.fs) + wav = wav[start:end] + # extract mel feats + logmel = mel_extractor.get_log_mel_fbank(wav) + # change duration according to mel_length + compare_duration_and_mel_length(sentences, utt_id, logmel) + # utt_id may be popped in compare_duration_and_mel_length + if utt_id not in sentences: + return None + phones = sentences[utt_id][0] + durations = sentences[utt_id][1] + num_frames = logmel.shape[0] + assert sum(durations) == num_frames + mel_dir = output_dir / "data_speech" + mel_dir.mkdir(parents=True, exist_ok=True) + mel_path = mel_dir / (utt_id + "_speech.npy") + np.save(mel_path, logmel) + record = { + "utt_id": utt_id, + "phones": phones, + "text_lengths": len(phones), + "speech_lengths": num_frames, + "speech": str(mel_path), + "speaker": speaker + } + if spk_emb_dir: + if speaker in os.listdir(spk_emb_dir): + embed_name = utt_id + ".npy" + embed_path = spk_emb_dir / speaker / embed_name + if embed_path.is_file(): + record["spk_emb"] = str(embed_path) + else: + return None + return record + + +def process_sentences(config, + fps: List[Path], + sentences: Dict, + output_dir: Path, + mel_extractor=None, + nprocs: int=1, + cut_sil: bool=True, + spk_emb_dir: Path=None): + if nprocs == 1: + results = [] + for fp in fps: + record = process_sentence(config, fp, sentences, output_dir, + mel_extractor, cut_sil, spk_emb_dir) + if record: + results.append(record) + else: + with ThreadPoolExecutor(nprocs) as pool: + futures = [] + with tqdm.tqdm(total=len(fps)) as progress: + for fp in fps: + future = pool.submit(process_sentence, config, fp, + sentences, output_dir, mel_extractor, + cut_sil, spk_emb_dir) + future.add_done_callback(lambda p: progress.update()) + futures.append(future) + + results = [] + for ft in futures: + record = ft.result() + if record: + results.append(record) + + results.sort(key=itemgetter("utt_id")) + with jsonlines.open(output_dir / "metadata.jsonl", 'w') as writer: + for item in results: + writer.write(item) + print("Done") + + +def main(): + # parse config and args + parser = argparse.ArgumentParser( + description="Preprocess audio and then extract features.") + + parser.add_argument( + "--dataset", + default="baker", + type=str, + help="name of dataset, should in {baker, aishell3, ljspeech, vctk} now") + + parser.add_argument( + "--rootdir", default=None, type=str, help="directory to dataset.") + + parser.add_argument( + "--dumpdir", + type=str, + required=True, + help="directory to dump feature files.") + parser.add_argument( + "--dur-file", default=None, type=str, help="path to durations.txt.") + + parser.add_argument("--config", type=str, help="fastspeech2 config file.") + + parser.add_argument( + "--verbose", + type=int, + default=1, + help="logging level. higher is more logging. 
(default=1)") + parser.add_argument( + "--num-cpu", type=int, default=1, help="number of process.") + + parser.add_argument( + "--cut-sil", + type=str2bool, + default=True, + help="whether cut sil in the edge of audio") + + parser.add_argument( + "--spk_emb_dir", + default=None, + type=str, + help="directory to speaker embedding files.") + args = parser.parse_args() + + rootdir = Path(args.rootdir).expanduser() + dumpdir = Path(args.dumpdir).expanduser() + # use absolute path + dumpdir = dumpdir.resolve() + dumpdir.mkdir(parents=True, exist_ok=True) + dur_file = Path(args.dur_file).expanduser() + + if args.spk_emb_dir: + spk_emb_dir = Path(args.spk_emb_dir).expanduser().resolve() + else: + spk_emb_dir = None + + assert rootdir.is_dir() + assert dur_file.is_file() + + with open(args.config, 'rt') as f: + config = CfgNode(yaml.safe_load(f)) + + if args.verbose > 1: + print(vars(args)) + print(config) + + sentences, speaker_set = get_phn_dur(dur_file) + + merge_silence(sentences) + phone_id_map_path = dumpdir / "phone_id_map.txt" + speaker_id_map_path = dumpdir / "speaker_id_map.txt" + get_input_token(sentences, phone_id_map_path, args.dataset) + get_spk_id_map(speaker_set, speaker_id_map_path) + + if args.dataset == "baker": + wav_files = sorted(list((rootdir / "Wave").rglob("*.wav"))) + # split data into 3 sections + num_train = 9800 + num_dev = 100 + train_wav_files = wav_files[:num_train] + dev_wav_files = wav_files[num_train:num_train + num_dev] + test_wav_files = wav_files[num_train + num_dev:] + elif args.dataset == "aishell3": + sub_num_dev = 5 + wav_dir = rootdir / "train" / "wav" + train_wav_files = [] + dev_wav_files = [] + test_wav_files = [] + for speaker in os.listdir(wav_dir): + wav_files = sorted(list((wav_dir / speaker).rglob("*.wav"))) + if len(wav_files) > 100: + train_wav_files += wav_files[:-sub_num_dev * 2] + dev_wav_files += wav_files[-sub_num_dev * 2:-sub_num_dev] + test_wav_files += wav_files[-sub_num_dev:] + else: + train_wav_files += wav_files + + elif args.dataset == "ljspeech": + wav_files = sorted(list((rootdir / "wavs").rglob("*.wav"))) + # split data into 3 sections + num_train = 12900 + num_dev = 100 + train_wav_files = wav_files[:num_train] + dev_wav_files = wav_files[num_train:num_train + num_dev] + test_wav_files = wav_files[num_train + num_dev:] + elif args.dataset == "vctk": + sub_num_dev = 5 + wav_dir = rootdir / "wav48_silence_trimmed" + train_wav_files = [] + dev_wav_files = [] + test_wav_files = [] + for speaker in os.listdir(wav_dir): + wav_files = sorted(list((wav_dir / speaker).rglob("*_mic2.flac"))) + if len(wav_files) > 100: + train_wav_files += wav_files[:-sub_num_dev * 2] + dev_wav_files += wav_files[-sub_num_dev * 2:-sub_num_dev] + test_wav_files += wav_files[-sub_num_dev:] + else: + train_wav_files += wav_files + + else: + print("dataset should in {baker, aishell3, ljspeech, vctk} now!") + + train_dump_dir = dumpdir / "train" / "raw" + train_dump_dir.mkdir(parents=True, exist_ok=True) + dev_dump_dir = dumpdir / "dev" / "raw" + dev_dump_dir.mkdir(parents=True, exist_ok=True) + test_dump_dir = dumpdir / "test" / "raw" + test_dump_dir.mkdir(parents=True, exist_ok=True) + + # Extractor + mel_extractor = LogMelFBank( + sr=config.fs, + n_fft=config.n_fft, + hop_length=config.n_shift, + win_length=config.win_length, + window=config.window, + n_mels=config.n_mels, + fmin=config.fmin, + fmax=config.fmax) + + # process for the 3 sections + if train_wav_files: + process_sentences( + config, + train_wav_files, + sentences, + train_dump_dir, + 
mel_extractor, + nprocs=args.num_cpu, + cut_sil=args.cut_sil, + spk_emb_dir=spk_emb_dir) + if dev_wav_files: + process_sentences( + config, + dev_wav_files, + sentences, + dev_dump_dir, + mel_extractor, + cut_sil=args.cut_sil, + spk_emb_dir=spk_emb_dir) + if test_wav_files: + process_sentences( + config, + test_wav_files, + sentences, + test_dump_dir, + mel_extractor, + nprocs=args.num_cpu, + cut_sil=args.cut_sil, + spk_emb_dir=spk_emb_dir) + + +if __name__ == "__main__": + main() diff --git a/ernie-sat/paddlespeech/t2s/exps/tacotron2/train.py b/ernie-sat/paddlespeech/t2s/exps/tacotron2/train.py new file mode 100644 index 0000000000000000000000000000000000000000..69ff80e467ab79a6827560c168316d9c221aa84c --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/exps/tacotron2/train.py @@ -0,0 +1,202 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse +import logging +import os +import shutil +from pathlib import Path + +import jsonlines +import numpy as np +import paddle +import yaml +from paddle import DataParallel +from paddle import distributed as dist +from paddle.io import DataLoader +from paddle.io import DistributedBatchSampler +from yacs.config import CfgNode + +from paddlespeech.t2s.datasets.am_batch_fn import tacotron2_multi_spk_batch_fn +from paddlespeech.t2s.datasets.am_batch_fn import tacotron2_single_spk_batch_fn +from paddlespeech.t2s.datasets.data_table import DataTable +from paddlespeech.t2s.models.tacotron2 import Tacotron2 +from paddlespeech.t2s.models.tacotron2 import Tacotron2Evaluator +from paddlespeech.t2s.models.tacotron2 import Tacotron2Updater +from paddlespeech.t2s.training.extensions.snapshot import Snapshot +from paddlespeech.t2s.training.extensions.visualizer import VisualDL +from paddlespeech.t2s.training.optimizer import build_optimizers +from paddlespeech.t2s.training.seeding import seed_everything +from paddlespeech.t2s.training.trainer import Trainer +from paddlespeech.t2s.utils import str2bool + + +def train_sp(args, config): + # decides device type and whether to run in parallel + # setup running environment correctly + if (not paddle.is_compiled_with_cuda()) or args.ngpu == 0: + paddle.set_device("cpu") + else: + paddle.set_device("gpu") + world_size = paddle.distributed.get_world_size() + if world_size > 1: + paddle.distributed.init_parallel_env() + + # set the random seed, it is a must for multiprocess training + seed_everything(config.seed) + + print( + f"rank: {dist.get_rank()}, pid: {os.getpid()}, parent_pid: {os.getppid()}", + ) + + # dataloader has been too verbose + logging.getLogger("DataLoader").disabled = True + + fields = [ + "text", + "text_lengths", + "speech", + "speech_lengths", + ] + + converters = { + "speech": np.load, + } + if args.voice_cloning: + print("Training voice cloning!") + collate_fn = tacotron2_multi_spk_batch_fn + fields += ["spk_emb"] + converters["spk_emb"] = np.load + else: + print("single speaker tacotron2!") + collate_fn = tacotron2_single_spk_batch_fn 
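+ # "speech" (and, for voice cloning, "spk_emb") entries in the metadata are paths to .npy files; the np.load converters turn them into arrays when a batch is assembled.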
+ + # construct dataset for training and validation + with jsonlines.open(args.train_metadata, 'r') as reader: + train_metadata = list(reader) + train_dataset = DataTable( + data=train_metadata, + fields=fields, + converters=converters, ) + with jsonlines.open(args.dev_metadata, 'r') as reader: + dev_metadata = list(reader) + dev_dataset = DataTable( + data=dev_metadata, + fields=fields, + converters=converters, ) + + # collate function and dataloader + train_sampler = DistributedBatchSampler( + train_dataset, + batch_size=config.batch_size, + shuffle=True, + drop_last=True) + + print("samplers done!") + + train_dataloader = DataLoader( + train_dataset, + batch_sampler=train_sampler, + collate_fn=collate_fn, + num_workers=config.num_workers) + + dev_dataloader = DataLoader( + dev_dataset, + shuffle=False, + drop_last=False, + batch_size=config.batch_size, + collate_fn=collate_fn, + num_workers=config.num_workers) + print("dataloaders done!") + + with open(args.phones_dict, "r") as f: + phn_id = [line.strip().split() for line in f.readlines()] + vocab_size = len(phn_id) + print("vocab_size:", vocab_size) + + odim = config.n_mels + model = Tacotron2(idim=vocab_size, odim=odim, **config["model"]) + if world_size > 1: + model = DataParallel(model) + print("model done!") + + optimizer = build_optimizers(model, **config["optimizer"]) + print("optimizer done!") + + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + if dist.get_rank() == 0: + config_name = args.config.split("/")[-1] + # copy conf to output_dir + shutil.copyfile(args.config, output_dir / config_name) + + updater = Tacotron2Updater( + model=model, + optimizer=optimizer, + dataloader=train_dataloader, + output_dir=output_dir, + **config["updater"]) + + trainer = Trainer(updater, (config.max_epoch, 'epoch'), output_dir) + + evaluator = Tacotron2Evaluator( + model, dev_dataloader, output_dir=output_dir, **config["updater"]) + + if dist.get_rank() == 0: + trainer.extend(evaluator, trigger=(1, "epoch")) + trainer.extend(VisualDL(output_dir), trigger=(1, "iteration")) + trainer.extend( + Snapshot(max_size=config.num_snapshots), trigger=(1, 'epoch')) + trainer.run() + + +def main(): + # parse args and config and redirect to train_sp + parser = argparse.ArgumentParser(description="Train a Tacotron2 model.") + parser.add_argument("--config", type=str, help="tacotron2 config file.") + parser.add_argument("--train-metadata", type=str, help="training data.") + parser.add_argument("--dev-metadata", type=str, help="dev data.") + parser.add_argument("--output-dir", type=str, help="output dir.") + parser.add_argument( + "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") + parser.add_argument( + "--phones-dict", type=str, default=None, help="phone vocabulary file.") + + parser.add_argument( + "--voice-cloning", + type=str2bool, + default=False, + help="whether training voice cloning model.") + + args = parser.parse_args() + + with open(args.config) as f: + config = CfgNode(yaml.safe_load(f)) + + print("========Args========") + print(yaml.safe_dump(vars(args))) + print("========Config========") + print(config) + print( + f"master see the word size: {dist.get_world_size()}, from pid: {os.getpid()}" + ) + + # dispatch + if args.ngpu > 1: + dist.spawn(train_sp, (args, config), nprocs=args.ngpu) + else: + train_sp(args, config) + + +if __name__ == "__main__": + main() diff --git a/ernie-sat/paddlespeech/t2s/exps/transformer_tts/__init__.py b/ernie-sat/paddlespeech/t2s/exps/transformer_tts/__init__.py new file 
mode 100644 index 0000000000000000000000000000000000000000..abf198b97e6e818e1fbe59006f98492640bcee54 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/exps/transformer_tts/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/ernie-sat/paddlespeech/t2s/exps/transformer_tts/normalize.py b/ernie-sat/paddlespeech/t2s/exps/transformer_tts/normalize.py new file mode 100644 index 0000000000000000000000000000000000000000..87e975b88ffb1b27c63885dfbe7fdb3c4cf5b718 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/exps/transformer_tts/normalize.py @@ -0,0 +1,146 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Normalize feature files and dump them.""" +import argparse +import logging +from operator import itemgetter +from pathlib import Path + +import jsonlines +import numpy as np +from sklearn.preprocessing import StandardScaler +from tqdm import tqdm + +from paddlespeech.t2s.datasets.data_table import DataTable + + +def main(): + """Run preprocessing process.""" + parser = argparse.ArgumentParser( + description="Normalize dumped raw features (See detail in parallel_wavegan/bin/normalize.py)." + ) + parser.add_argument( + "--metadata", + type=str, + required=True, + help="directory including feature files to be normalized. " + "you need to specify either *-scp or rootdir.") + + parser.add_argument( + "--dumpdir", + type=str, + required=True, + help="directory to dump normalized feature files.") + parser.add_argument( + "--speech-stats", + type=str, + required=True, + help="speech statistics file.") + parser.add_argument( + "--phones-dict", type=str, default=None, help="phone vocabulary file.") + parser.add_argument( + "--speaker-dict", type=str, default=None, help="speaker id map file.") + parser.add_argument( + "--verbose", + type=int, + default=1, + help="logging level. higher is more logging. 
(default=1)") + args = parser.parse_args() + + # set logger + if args.verbose > 1: + logging.basicConfig( + level=logging.DEBUG, + format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s" + ) + elif args.verbose > 0: + logging.basicConfig( + level=logging.INFO, + format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s" + ) + else: + logging.basicConfig( + level=logging.WARN, + format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s" + ) + logging.warning('Skip DEBUG/INFO messages') + + # check directory existence + dumpdir = Path(args.dumpdir).resolve() + dumpdir.mkdir(parents=True, exist_ok=True) + + # get dataset + with jsonlines.open(args.metadata, 'r') as reader: + metadata = list(reader) + dataset = DataTable( + metadata, converters={ + "speech": np.load, + }) + logging.info(f"The number of files = {len(dataset)}.") + + # restore scaler + speech_scaler = StandardScaler() + speech_scaler.mean_ = np.load(args.speech_stats)[0] + speech_scaler.scale_ = np.load(args.speech_stats)[1] + speech_scaler.n_features_in_ = speech_scaler.mean_.shape[0] + + vocab_phones = {} + with open(args.phones_dict, 'rt') as f: + phn_id = [line.strip().split() for line in f.readlines()] + for phn, id in phn_id: + vocab_phones[phn] = int(id) + + vocab_speaker = {} + with open(args.speaker_dict, 'rt') as f: + spk_id = [line.strip().split() for line in f.readlines()] + for spk, id in spk_id: + vocab_speaker[spk] = int(id) + + # process each file + output_metadata = [] + + for item in tqdm(dataset): + utt_id = item['utt_id'] + speech = item['speech'] + # normalize + speech = speech_scaler.transform(speech) + speech_dir = dumpdir / "data_speech" + speech_dir.mkdir(parents=True, exist_ok=True) + speech_path = speech_dir / f"{utt_id}_speech.npy" + np.save(speech_path, speech.astype(np.float32), allow_pickle=False) + + phone_ids = [vocab_phones[p] for p in item['phones']] + spk_id = vocab_speaker[item["speaker"]] + record = { + "utt_id": item['utt_id'], + "spk_id": spk_id, + "text": phone_ids, + "text_lengths": item['text_lengths'], + "speech_lengths": item['speech_lengths'], + "speech": str(speech_path), + } + # add spk_emb for voice cloning + if "spk_emb" in item: + record["spk_emb"] = str(item["spk_emb"]) + output_metadata.append(record) + output_metadata.sort(key=itemgetter('utt_id')) + output_metadata_path = Path(args.dumpdir) / "metadata.jsonl" + with jsonlines.open(output_metadata_path, 'w') as writer: + for item in output_metadata: + writer.write(item) + logging.info(f"metadata dumped into {output_metadata_path}") + + +if __name__ == "__main__": + main() diff --git a/ernie-sat/paddlespeech/t2s/exps/transformer_tts/preprocess.py b/ernie-sat/paddlespeech/t2s/exps/transformer_tts/preprocess.py new file mode 100644 index 0000000000000000000000000000000000000000..9aa87e91a84b8a14f8196337c8a2f3e4a5a69470 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/exps/transformer_tts/preprocess.py @@ -0,0 +1,275 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +import argparse +from concurrent.futures import ThreadPoolExecutor +from operator import itemgetter +from pathlib import Path +from typing import Any +from typing import Dict +from typing import List + +import jsonlines +import librosa +import numpy as np +import tqdm +import yaml +from yacs.config import CfgNode as Configuration + +from paddlespeech.t2s.datasets.get_feats import LogMelFBank +from paddlespeech.t2s.frontend import English + + +def get_lj_sentences(file_name, frontend): + '''read the LJSpeech metadata file and phoneticize each raw sentence + + Args: + file_name (str or Path) + Returns: + Dict: sentence: {'utt': ([phoneme], speaker)} + Set: speaker_set + ''' + f = open(file_name, 'r') + sentence = {} + speaker_set = set() + for line in f: + line_list = line.strip().split('|') + utt = line_list[0] + speaker = utt.split("-")[0][:2] + speaker_set.add(speaker) + raw_text = line_list[-1] + phonemes = frontend.phoneticize(raw_text) + phonemes = phonemes[1:-1] + phonemes = [phn for phn in phonemes if not phn.isspace()] + sentence[utt] = (phonemes, speaker) + f.close() + return sentence, speaker_set + + +def get_input_token(sentence, output_path): + '''get phone set from training data and save it + + Args: + sentence (Dict): sentence: {'utt': ([char], str)} + output_path (str or path): path to save phone_id_map + ''' + phn_token = set() + for utt in sentence: + for phn in sentence[utt][0]: + if phn != "": + phn_token.add(phn) + phn_token = list(phn_token) + phn_token.sort() + phn_token = ["<pad>", "<unk>"] + phn_token + phn_token += ["<eos>"] + + with open(output_path, 'w') as f: + for i, phn in enumerate(phn_token): + f.write(phn + ' ' + str(i) + '\n') + + +def get_spk_id_map(speaker_set, output_path): + speakers = sorted(list(speaker_set)) + with open(output_path, 'w') as f: + for i, spk in enumerate(speakers): + f.write(spk + ' ' + str(i) + '\n') + + +def process_sentence(config: Dict[str, Any], + fp: Path, + sentences: Dict, + output_dir: Path, + mel_extractor=None): + utt_id = fp.stem + record = None + if utt_id in sentences: + # reading, resampling may occur + wav, _ = librosa.load(str(fp), sr=config.fs) + if len(wav.shape) != 1 or np.abs(wav).max() > 1.0: + return record + assert len(wav.shape) == 1, f"{utt_id} is not a mono-channel audio." + assert np.abs(wav).max( + ) <= 1.0, f"{utt_id} does not seem to be normalized 16-bit PCM."
+ phones = sentences[utt_id][0] + speaker = sentences[utt_id][1] + logmel = mel_extractor.get_log_mel_fbank(wav, base='e') + # change duration according to mel_length + num_frames = logmel.shape[0] + mel_dir = output_dir / "data_speech" + mel_dir.mkdir(parents=True, exist_ok=True) + mel_path = mel_dir / (utt_id + "_speech.npy") + np.save(mel_path, logmel) + record = { + "utt_id": utt_id, + "phones": phones, + "text_lengths": len(phones), + "speech_lengths": num_frames, + "speech": str(mel_path), + "speaker": speaker + } + return record + + +def process_sentences(config, + fps: List[Path], + sentences: Dict, + output_dir: Path, + mel_extractor=None, + nprocs: int=1): + if nprocs == 1: + results = [] + for fp in tqdm.tqdm(fps, total=len(fps)): + record = process_sentence(config, fp, sentences, output_dir, + mel_extractor) + if record: + results.append(record) + else: + with ThreadPoolExecutor(nprocs) as pool: + futures = [] + with tqdm.tqdm(total=len(fps)) as progress: + for fp in fps: + future = pool.submit(process_sentence, config, fp, + sentences, output_dir, mel_extractor) + future.add_done_callback(lambda p: progress.update()) + futures.append(future) + + results = [] + for ft in futures: + record = ft.result() + if record: + results.append(record) + + results.sort(key=itemgetter("utt_id")) + with jsonlines.open(output_dir / "metadata.jsonl", 'w') as writer: + for item in results: + writer.write(item) + print("Done") + + +def main(): + # parse config and args + parser = argparse.ArgumentParser( + description="Preprocess audio and then extract features.") + + parser.add_argument( + "--dataset", + default="ljspeech", + type=str, + help="name of dataset, should in {ljspeech} now") + + parser.add_argument( + "--rootdir", default=None, type=str, help="directory to dataset.") + + parser.add_argument( + "--dumpdir", + type=str, + required=True, + help="directory to dump feature files.") + + parser.add_argument( + "--config-path", + default="conf/default.yaml", + type=str, + help="yaml format configuration file.") + + parser.add_argument( + "--verbose", + type=int, + default=1, + help="logging level. higher is more logging. 
(default=1)") + parser.add_argument( + "--num-cpu", type=int, default=1, help="number of process.") + + args = parser.parse_args() + + config_path = Path(args.config_path).resolve() + root_dir = Path(args.rootdir).expanduser() + dumpdir = Path(args.dumpdir).expanduser() + # use absolute path + dumpdir = dumpdir.resolve() + dumpdir.mkdir(parents=True, exist_ok=True) + + assert root_dir.is_dir() + + with open(config_path, 'rt') as f: + _C = yaml.safe_load(f) + _C = Configuration(_C) + config = _C.clone() + + if args.verbose > 1: + print(vars(args)) + print(config) + + phone_id_map_path = dumpdir / "phone_id_map.txt" + speaker_id_map_path = dumpdir / "speaker_id_map.txt" + + if args.dataset == "ljspeech": + wav_files = sorted(list((root_dir / "wavs").rglob("*.wav"))) + frontend = English() + sentences, speaker_set = get_lj_sentences(root_dir / "metadata.csv", + frontend) + get_input_token(sentences, phone_id_map_path) + get_spk_id_map(speaker_set, speaker_id_map_path) + # split data into 3 sections + num_train = 12900 + num_dev = 100 + train_wav_files = wav_files[:num_train] + dev_wav_files = wav_files[num_train:num_train + num_dev] + test_wav_files = wav_files[num_train + num_dev:] + + train_dump_dir = dumpdir / "train" / "raw" + train_dump_dir.mkdir(parents=True, exist_ok=True) + dev_dump_dir = dumpdir / "dev" / "raw" + dev_dump_dir.mkdir(parents=True, exist_ok=True) + test_dump_dir = dumpdir / "test" / "raw" + test_dump_dir.mkdir(parents=True, exist_ok=True) + + # Extractor + mel_extractor = LogMelFBank( + sr=config.fs, + n_fft=config.n_fft, + hop_length=config.n_shift, + win_length=config.win_length, + window=config.window, + n_mels=config.n_mels, + fmin=config.fmin, + fmax=config.fmax) + + # process for the 3 sections + if train_wav_files: + process_sentences( + config, + train_wav_files, + sentences, + train_dump_dir, + mel_extractor, + nprocs=args.num_cpu) + if dev_wav_files: + process_sentences( + config, + dev_wav_files, + sentences, + dev_dump_dir, + mel_extractor, + nprocs=args.num_cpu) + if test_wav_files: + process_sentences( + config, + test_wav_files, + sentences, + test_dump_dir, + mel_extractor, + nprocs=args.num_cpu) + + +if __name__ == "__main__": + main() diff --git a/ernie-sat/paddlespeech/t2s/exps/transformer_tts/synthesize.py b/ernie-sat/paddlespeech/t2s/exps/transformer_tts/synthesize.py new file mode 100644 index 0000000000000000000000000000000000000000..7b6b1873fca65d4765df40ac2c8a233f0e33ae3a --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/exps/transformer_tts/synthesize.py @@ -0,0 +1,146 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import argparse +import logging +from pathlib import Path + +import jsonlines +import numpy as np +import paddle +import soundfile as sf +import yaml +from yacs.config import CfgNode + +from paddlespeech.t2s.datasets.data_table import DataTable +from paddlespeech.t2s.models.transformer_tts import TransformerTTS +from paddlespeech.t2s.models.transformer_tts import TransformerTTSInference +from paddlespeech.t2s.models.waveflow import ConditionalWaveFlow +from paddlespeech.t2s.modules.normalizer import ZScore +from paddlespeech.t2s.utils import layer_tools + + +def evaluate(args, acoustic_model_config, vocoder_config): + # dataloader has been too verbose + logging.getLogger("DataLoader").disabled = True + + # construct dataset for evaluation + with jsonlines.open(args.test_metadata, 'r') as reader: + test_metadata = list(reader) + test_dataset = DataTable(data=test_metadata, fields=["utt_id", "text"]) + + with open(args.phones_dict, "r") as f: + phn_id = [line.strip().split() for line in f.readlines()] + vocab_size = len(phn_id) + print("vocab_size:", vocab_size) + odim = acoustic_model_config.n_mels + model = TransformerTTS( + idim=vocab_size, odim=odim, **acoustic_model_config["model"]) + + model.set_state_dict( + paddle.load(args.transformer_tts_checkpoint)["main_params"]) + model.eval() + # remove ".pdparams" in waveflow_checkpoint + vocoder_checkpoint_path = args.waveflow_checkpoint[:-9] if args.waveflow_checkpoint.endswith( + ".pdparams") else args.waveflow_checkpoint + vocoder = ConditionalWaveFlow.from_pretrained(vocoder_config, + vocoder_checkpoint_path) + layer_tools.recursively_remove_weight_norm(vocoder) + vocoder.eval() + print("model done!") + + stat = np.load(args.transformer_tts_stat) + mu, std = stat + mu = paddle.to_tensor(mu) + std = paddle.to_tensor(std) + transformer_tts_normalizer = ZScore(mu, std) + + transformer_tts_inference = TransformerTTSInference( + transformer_tts_normalizer, model) + + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + for datum in test_dataset: + utt_id = datum["utt_id"] + text = paddle.to_tensor(datum["text"]) + + with paddle.no_grad(): + mel = transformer_tts_inference(text) + # mel shape is (T, feats) and waveflow's input shape is (batch, feats, T) + mel = mel.unsqueeze(0).transpose([0, 2, 1]) + # wavflow's output shape is (B, T) + wav = vocoder.infer(mel)[0] + + sf.write( + str(output_dir / (utt_id + ".wav")), + wav.numpy(), + samplerate=acoustic_model_config.fs) + print(f"{utt_id} done!") + + +def main(): + # parse args and config and redirect to train_sp + parser = argparse.ArgumentParser( + description="Synthesize with transformer tts & waveflow.") + parser.add_argument( + "--transformer-tts-config", + type=str, + help="transformer tts config file.") + parser.add_argument( + "--transformer-tts-checkpoint", + type=str, + help="transformer tts checkpoint to load.") + parser.add_argument( + "--transformer-tts-stat", + type=str, + help="mean and standard deviation used to normalize spectrogram when training transformer tts." 
+ ) + parser.add_argument( + "--waveflow-config", type=str, help="waveflow config file.") + # not normalize when training waveflow + parser.add_argument( + "--waveflow-checkpoint", type=str, help="waveflow checkpoint to load.") + parser.add_argument( + "--phones-dict", type=str, default=None, help="phone vocabulary file.") + + parser.add_argument("--test-metadata", type=str, help="test metadata.") + parser.add_argument("--output-dir", type=str, help="output dir.") + parser.add_argument( + "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") + + args = parser.parse_args() + + if args.ngpu == 0: + paddle.set_device("cpu") + elif args.ngpu > 0: + paddle.set_device("gpu") + else: + print("ngpu should >= 0 !") + + with open(args.transformer_tts_config) as f: + transformer_tts_config = CfgNode(yaml.safe_load(f)) + with open(args.waveflow_config) as f: + waveflow_config = CfgNode(yaml.safe_load(f)) + + print("========Args========") + print(yaml.safe_dump(vars(args))) + print("========Config========") + print(transformer_tts_config) + print(waveflow_config) + + evaluate(args, transformer_tts_config, waveflow_config) + + +if __name__ == "__main__": + main() diff --git a/ernie-sat/paddlespeech/t2s/exps/transformer_tts/synthesize_e2e.py b/ernie-sat/paddlespeech/t2s/exps/transformer_tts/synthesize_e2e.py new file mode 100644 index 0000000000000000000000000000000000000000..0cd7d224e0b983d1461bfd42ffebd812f79380b4 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/exps/transformer_tts/synthesize_e2e.py @@ -0,0 +1,165 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
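+# End-to-end synthesis from raw text: an English frontend phoneticizes each
+# "utt_id sentence" line from --text, TransformerTTS predicts a mel
+# spectrogram, and WaveFlow vocodes it into <output-dir>/<utt_id>.wav.
+# Illustrative --text file contents (any utt_id / sentence pairs work):
+#   001 Life was like a box of chocolates.
+#   002 The quick brown fox jumps over the lazy dog.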
+import argparse +import logging +from pathlib import Path + +import numpy as np +import paddle +import soundfile as sf +import yaml +from yacs.config import CfgNode + +from paddlespeech.t2s.frontend import English +from paddlespeech.t2s.models.transformer_tts import TransformerTTS +from paddlespeech.t2s.models.transformer_tts import TransformerTTSInference +from paddlespeech.t2s.models.waveflow import ConditionalWaveFlow +from paddlespeech.t2s.modules.normalizer import ZScore +from paddlespeech.t2s.utils import layer_tools + + +def evaluate(args, acoustic_model_config, vocoder_config): + # dataloader has been too verbose + logging.getLogger("DataLoader").disabled = True + + # construct dataset for evaluation + sentences = [] + with open(args.text, 'rt') as f: + for line in f: + line_list = line.strip().split() + utt_id = line_list[0] + sentence = " ".join(line_list[1:]) + sentences.append((utt_id, sentence)) + + with open(args.phones_dict, "r") as f: + phn_id = [line.strip().split() for line in f.readlines()] + + vocab_size = len(phn_id) + phone_id_map = {} + for phn, id in phn_id: + phone_id_map[phn] = int(id) + print("vocab_size:", vocab_size) + odim = acoustic_model_config.n_mels + model = TransformerTTS( + idim=vocab_size, odim=odim, **acoustic_model_config["model"]) + + model.set_state_dict( + paddle.load(args.transformer_tts_checkpoint)["main_params"]) + model.eval() + + # remove ".pdparams" in waveflow_checkpoint + vocoder_checkpoint_path = args.waveflow_checkpoint[:-9] if args.waveflow_checkpoint.endswith( + ".pdparams") else args.waveflow_checkpoint + vocoder = ConditionalWaveFlow.from_pretrained(vocoder_config, + vocoder_checkpoint_path) + layer_tools.recursively_remove_weight_norm(vocoder) + vocoder.eval() + print("model done!") + + frontend = English() + print("frontend done!") + + stat = np.load(args.transformer_tts_stat) + mu, std = stat + mu = paddle.to_tensor(mu) + std = paddle.to_tensor(std) + transformer_tts_normalizer = ZScore(mu, std) + + transformer_tts_inference = TransformerTTSInference( + transformer_tts_normalizer, model) + + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + for utt_id, sentence in sentences: + phones = frontend.phoneticize(sentence) + # remove start_symbol and end_symbol + phones = phones[1:-1] + phones = [phn for phn in phones if not phn.isspace()] + phones = [phn if phn in phone_id_map else "," for phn in phones] + phone_ids = [phone_id_map[phn] for phn in phones] + with paddle.no_grad(): + mel = transformer_tts_inference(paddle.to_tensor(phone_ids)) + # mel shape is (T, feats) and waveflow's input shape is (batch, feats, T) + mel = mel.unsqueeze(0).transpose([0, 2, 1]) + # wavflow's output shape is (B, T) + wav = vocoder.infer(mel)[0] + + sf.write( + str(output_dir / (utt_id + ".wav")), + wav.numpy(), + samplerate=acoustic_model_config.fs) + print(f"{utt_id} done!") + + +def main(): + # parse args and config and redirect to train_sp + parser = argparse.ArgumentParser( + description="Synthesize with transformer tts & waveflow.") + parser.add_argument( + "--transformer-tts-config", + type=str, + help="transformer tts config file.") + parser.add_argument( + "--transformer-tts-checkpoint", + type=str, + help="transformer tts checkpoint to load.") + parser.add_argument( + "--transformer-tts-stat", + type=str, + help="mean and standard deviation used to normalize spectrogram when training transformer tts." 
+ ) + parser.add_argument( + "--waveflow-config", type=str, help="waveflow config file.") + # not normalize when training waveflow + parser.add_argument( + "--waveflow-checkpoint", type=str, help="waveflow checkpoint to load.") + parser.add_argument( + "--phones-dict", + type=str, + default="phone_id_map.txt", + help="phone vocabulary file.") + parser.add_argument( + "--text", + type=str, + help="text to synthesize, a 'utt_id sentence' pair per line.") + parser.add_argument("--output-dir", type=str, help="output dir.") + parser.add_argument( + "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") + + args = parser.parse_args() + + if args.ngpu == 0: + paddle.set_device("cpu") + elif args.ngpu > 0: + paddle.set_device("gpu") + else: + print("ngpu should >= 0 !") + + with open(args.transformer_tts_config) as f: + transformer_tts_config = CfgNode(yaml.safe_load(f)) + with open(args.waveflow_config) as f: + waveflow_config = CfgNode(yaml.safe_load(f)) + + print("========Args========") + print(yaml.safe_dump(vars(args))) + print("========Config========") + print(transformer_tts_config) + print(waveflow_config) + + evaluate(args, transformer_tts_config, waveflow_config) + + +if __name__ == "__main__": + main() diff --git a/ernie-sat/paddlespeech/t2s/exps/transformer_tts/train.py b/ernie-sat/paddlespeech/t2s/exps/transformer_tts/train.py new file mode 100644 index 0000000000000000000000000000000000000000..45ecb269bac033fed4287e5083ced6ce92b89f35 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/exps/transformer_tts/train.py @@ -0,0 +1,193 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
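+# Single-speaker TransformerTTS training on LJSpeech: reads the metadata.jsonl
+# files produced by preprocessing, batches them with
+# transformer_single_spk_batch_fn, and runs the Trainer/Updater/Evaluator loop
+# with VisualDL logging and periodic snapshots; multi-GPU runs are dispatched
+# through dist.spawn when --ngpu > 1.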
+import argparse +import logging +import os +import shutil +from pathlib import Path + +import jsonlines +import numpy as np +import paddle +import yaml +from paddle import DataParallel +from paddle import distributed as dist +from paddle.io import DataLoader +from paddle.io import DistributedBatchSampler +from yacs.config import CfgNode + +from paddlespeech.t2s.datasets.am_batch_fn import transformer_single_spk_batch_fn +from paddlespeech.t2s.datasets.data_table import DataTable +from paddlespeech.t2s.models.transformer_tts import TransformerTTS +from paddlespeech.t2s.models.transformer_tts import TransformerTTSEvaluator +from paddlespeech.t2s.models.transformer_tts import TransformerTTSUpdater +from paddlespeech.t2s.training.extensions.snapshot import Snapshot +from paddlespeech.t2s.training.extensions.visualizer import VisualDL +from paddlespeech.t2s.training.optimizer import build_optimizers +from paddlespeech.t2s.training.seeding import seed_everything +from paddlespeech.t2s.training.trainer import Trainer + + +def train_sp(args, config): + # decides device type and whether to run in parallel + # setup running environment correctly + if paddle.is_compiled_with_cuda() and args.ngpu > 0: + paddle.set_device("gpu") + elif paddle.is_compiled_with_npu() and args.ngpu > 0: + paddle.set_device("npu") + else: + paddle.set_device("cpu") + world_size = paddle.distributed.get_world_size() + if world_size > 1: + paddle.distributed.init_parallel_env() + + # set the random seed, it is a must for multiprocess training + seed_everything(config.seed) + + print( + f"rank: {dist.get_rank()}, pid: {os.getpid()}, parent_pid: {os.getppid()}", + ) + + # dataloader has been too verbose + logging.getLogger("DataLoader").disabled = True + + # construct dataset for training and validation + with jsonlines.open(args.train_metadata, 'r') as reader: + train_metadata = list(reader) + train_dataset = DataTable( + data=train_metadata, + fields=[ + "text", + "text_lengths", + "speech", + "speech_lengths", + ], + converters={ + "speech": np.load, + }, ) + with jsonlines.open(args.dev_metadata, 'r') as reader: + dev_metadata = list(reader) + dev_dataset = DataTable( + data=dev_metadata, + fields=[ + "text", + "text_lengths", + "speech", + "speech_lengths", + ], + converters={ + "speech": np.load, + }, ) + + # collate function and dataloader + train_sampler = DistributedBatchSampler( + train_dataset, + batch_size=config.batch_size, + shuffle=True, + drop_last=True) + + print("samplers done!") + + train_dataloader = DataLoader( + train_dataset, + batch_sampler=train_sampler, + collate_fn=transformer_single_spk_batch_fn, + num_workers=config.num_workers) + + dev_dataloader = DataLoader( + dev_dataset, + shuffle=False, + drop_last=False, + batch_size=config.batch_size, + collate_fn=transformer_single_spk_batch_fn, + num_workers=config.num_workers) + print("dataloaders done!") + + with open(args.phones_dict, "r") as f: + phn_id = [line.strip().split() for line in f.readlines()] + vocab_size = len(phn_id) + print("vocab_size:", vocab_size) + + odim = config.n_mels + model = TransformerTTS(idim=vocab_size, odim=odim, **config["model"]) + if world_size > 1: + model = DataParallel(model) + print("model done!") + + optimizer = build_optimizers(model, **config["optimizer"]) + print("optimizer done!") + + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + if dist.get_rank() == 0: + config_name = args.config.split("/")[-1] + # copy conf to output_dir + shutil.copyfile(args.config, output_dir / 
config_name) + + updater = TransformerTTSUpdater( + model=model, + optimizer=optimizer, + dataloader=train_dataloader, + output_dir=output_dir, + **config["updater"]) + + trainer = Trainer(updater, (config.max_epoch, 'epoch'), output_dir) + + evaluator = TransformerTTSEvaluator( + model, dev_dataloader, output_dir=output_dir, **config["updater"]) + + if dist.get_rank() == 0: + trainer.extend(evaluator, trigger=(1, "epoch")) + trainer.extend(VisualDL(output_dir), trigger=(1, "iteration")) + trainer.extend( + Snapshot(max_size=config.num_snapshots), trigger=(1, 'epoch')) + trainer.run() + + +def main(): + # parse args and config and redirect to train_sp + parser = argparse.ArgumentParser(description="Train a TransformerTTS " + "model with LJSpeech TTS dataset.") + parser.add_argument( + "--config", type=str, help="config file to overwrite default config.") + parser.add_argument("--train-metadata", type=str, help="training data.") + parser.add_argument("--dev-metadata", type=str, help="dev data.") + parser.add_argument("--output-dir", type=str, help="output dir.") + parser.add_argument( + "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") + parser.add_argument( + "--phones-dict", type=str, default=None, help="phone vocabulary file.") + + args = parser.parse_args() + + with open(args.config) as f: + config = CfgNode(yaml.safe_load(f)) + + print("========Args========") + print(yaml.safe_dump(vars(args))) + print("========Config========") + print(config) + print( + f"master see the word size: {dist.get_world_size()}, from pid: {os.getpid()}" + ) + + # dispatch + if args.ngpu > 1: + dist.spawn(train_sp, (args, config), nprocs=args.ngpu) + else: + train_sp(args, config) + + +if __name__ == "__main__": + main() diff --git a/ernie-sat/paddlespeech/t2s/exps/voice_cloning.py b/ernie-sat/paddlespeech/t2s/exps/voice_cloning.py new file mode 100644 index 0000000000000000000000000000000000000000..1afd21dfffb74946f6212294cf0ddb535db146d2 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/exps/voice_cloning.py @@ -0,0 +1,202 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse +import os +from pathlib import Path + +import numpy as np +import paddle +import soundfile as sf +import yaml +from yacs.config import CfgNode + +from paddlespeech.t2s.exps.syn_utils import get_am_inference +from paddlespeech.t2s.exps.syn_utils import get_voc_inference +from paddlespeech.t2s.frontend.zh_frontend import Frontend +from paddlespeech.vector.exps.ge2e.audio_processor import SpeakerVerificationPreprocessor +from paddlespeech.vector.models.lstm_speaker_encoder import LSTMSpeakerEncoder + + +def voice_cloning(args): + # Init body. 
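+ # Load the acoustic model and vocoder yaml configs into CfgNode objects
+ # before building the speaker encoder, frontend and inference wrappers.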
+ with open(args.am_config) as f: + am_config = CfgNode(yaml.safe_load(f)) + with open(args.voc_config) as f: + voc_config = CfgNode(yaml.safe_load(f)) + + print("========Args========") + print(yaml.safe_dump(vars(args))) + print("========Config========") + print(am_config) + print(voc_config) + + # speaker encoder + p = SpeakerVerificationPreprocessor( + sampling_rate=16000, + audio_norm_target_dBFS=-30, + vad_window_length=30, + vad_moving_average_width=8, + vad_max_silence_length=6, + mel_window_length=25, + mel_window_step=10, + n_mels=40, + partial_n_frames=160, + min_pad_coverage=0.75, + partial_overlap_ratio=0.5) + print("Audio Processor Done!") + + speaker_encoder = LSTMSpeakerEncoder( + n_mels=40, num_layers=3, hidden_size=256, output_size=256) + speaker_encoder.set_state_dict(paddle.load(args.ge2e_params_path)) + speaker_encoder.eval() + print("GE2E Done!") + + frontend = Frontend(phone_vocab_path=args.phones_dict) + print("frontend done!") + + # acoustic model + am_inference, *_ = get_am_inference(args, am_config) + + # vocoder + voc_inference = get_voc_inference(args, voc_config) + + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + input_dir = Path(args.input_dir) + + sentence = args.text + + input_ids = frontend.get_input_ids(sentence, merge_sentences=True) + phone_ids = input_ids["phone_ids"][0] + + for name in os.listdir(input_dir): + utt_id = name.split(".")[0] + ref_audio_path = input_dir / name + mel_sequences = p.extract_mel_partials(p.preprocess_wav(ref_audio_path)) + # print("mel_sequences: ", mel_sequences.shape) + with paddle.no_grad(): + spk_emb = speaker_encoder.embed_utterance( + paddle.to_tensor(mel_sequences)) + # print("spk_emb shape: ", spk_emb.shape) + + with paddle.no_grad(): + wav = voc_inference(am_inference(phone_ids, spk_emb=spk_emb)) + + sf.write( + str(output_dir / (utt_id + ".wav")), + wav.numpy(), + samplerate=am_config.fs) + print(f"{utt_id} done!") + # Randomly generate numbers of 0 ~ 0.2, 256 is the dim of spk_emb + random_spk_emb = np.random.rand(256) * 0.2 + random_spk_emb = paddle.to_tensor(random_spk_emb) + utt_id = "random_spk_emb" + with paddle.no_grad(): + wav = voc_inference(am_inference(phone_ids, spk_emb=spk_emb)) + sf.write( + str(output_dir / (utt_id + ".wav")), + wav.numpy(), + samplerate=am_config.fs) + print(f"{utt_id} done!") + + +def parse_args(): + # parse args and config and redirect to train_sp + parser = argparse.ArgumentParser(description="") + parser.add_argument( + '--am', + type=str, + default='fastspeech2_csmsc', + choices=['fastspeech2_aishell3', 'tacotron2_aishell3'], + help='Choose acoustic model type of tts task.') + parser.add_argument( + '--am_config', + type=str, + default=None, + help='Config of acoustic model. Use deault config when it is None.') + parser.add_argument( + '--am_ckpt', + type=str, + default=None, + help='Checkpoint file of acoustic model.') + parser.add_argument( + "--am_stat", + type=str, + default=None, + help="mean and standard deviation used to normalize spectrogram when training acoustic model." + ) + parser.add_argument( + "--phones-dict", + type=str, + default="phone_id_map.txt", + help="phone vocabulary file.") + # vocoder + parser.add_argument( + '--voc', + type=str, + default='pwgan_csmsc', + choices=['pwgan_aishell3'], + help='Choose vocoder type of tts task.') + + parser.add_argument( + '--voc_config', + type=str, + default=None, + help='Config of voc. 
Use deault config when it is None.') + parser.add_argument( + '--voc_ckpt', type=str, default=None, help='Checkpoint file of voc.') + parser.add_argument( + "--voc_stat", + type=str, + default=None, + help="mean and standard deviation used to normalize spectrogram when training voc." + ) + parser.add_argument( + "--text", + type=str, + default="每当你觉得,想要批评什么人的时候,你切要记着,这个世界上的人,并非都具备你禀有的条件。", + help="text to synthesize, a line") + + parser.add_argument( + "--ge2e_params_path", type=str, help="ge2e params path.") + + parser.add_argument( + "--ngpu", type=int, default=1, help="if ngpu=0, use cpu.") + + parser.add_argument( + "--input-dir", + type=str, + help="input dir of *.wav, the sample rate will be resample to 16k.") + parser.add_argument("--output-dir", type=str, help="output dir.") + + args = parser.parse_args() + return args + + +def main(): + args = parse_args() + + if args.ngpu == 0: + paddle.set_device("cpu") + elif args.ngpu > 0: + paddle.set_device("gpu") + else: + print("ngpu should >= 0 !") + + voice_cloning(args) + + +if __name__ == "__main__": + main() diff --git a/ernie-sat/paddlespeech/t2s/exps/waveflow/__init__.py b/ernie-sat/paddlespeech/t2s/exps/waveflow/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..abf198b97e6e818e1fbe59006f98492640bcee54 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/exps/waveflow/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/ernie-sat/paddlespeech/t2s/exps/waveflow/config.py b/ernie-sat/paddlespeech/t2s/exps/waveflow/config.py new file mode 100644 index 0000000000000000000000000000000000000000..869caa6a2c43e902f897011d637e80987b5f383a --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/exps/waveflow/config.py @@ -0,0 +1,55 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
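+# Default WaveFlow hyper-parameters, grouped into data / model / training
+# sections with yacs CfgNode. Callers clone these defaults and may overwrite
+# them from a yaml file (--config) or KEY VALUE pairs (--opts) at run time.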
+from yacs.config import CfgNode as CN + +_C = CN() +_C.data = CN( + dict( + batch_size=8, # batch size + valid_size=16, # the first N examples are reserved for validation + sample_rate=22050, # Hz, sample rate + n_fft=1024, # fft frame size + win_length=1024, # window size + hop_length=256, # hop size between adjacent frames + fmin=0, + fmax=8000, # Hz, max frequency when converting to mel + n_mels=80, # mel bands + clip_frames=65, # mel clip frames + )) + +_C.model = CN( + dict( + upsample_factors=[16, 16], + n_flows=8, # number of flows in WaveFlow + n_layers=8, # number of conv blocks in each flow + n_group=16, # folding factor of audio and spectrogram + channels=128, # residual channels in each flow + kernel_size=[3, 3], # kernel size in each conv block + sigma=1.0, # stddev of the random noise + )) + +_C.training = CN( + dict( + lr=2e-4, # learning rate + valid_interval=1000, # validation interval + save_interval=10000, # checkpoint interval + max_iteration=3000000, # max iteration to train + )) + + +def get_cfg_defaults(): + """Get a yacs CfgNode object with default values for WaveFlow.""" + # Return a clone so that the defaults will not be altered + # This is for the "local variable" use pattern + return _C.clone() diff --git a/ernie-sat/paddlespeech/t2s/exps/waveflow/ljspeech.py b/ernie-sat/paddlespeech/t2s/exps/waveflow/ljspeech.py new file mode 100644 index 0000000000000000000000000000000000000000..a6efa9ec221f7c2f3a644ddb3968a002dc7254c1 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/exps/waveflow/ljspeech.py @@ -0,0 +1,89 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
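+# Dataset adaptor and batching helpers for the processed LJSpeech dump:
+# LJSpeech yields (mel, wav) numpy pairs, LJSpeechCollector zero-pads whole
+# utterances for validation, and LJSpeechClipCollector cuts aligned clips of
+# clip_frames mel frames (clip_frames * hop_length audio samples) for training.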
+from pathlib import Path + +import numpy as np +import pandas +from paddle.io import Dataset + +from paddlespeech.t2s.datasets.batch import batch_spec +from paddlespeech.t2s.datasets.batch import batch_wav + + +class LJSpeech(Dataset): + """A simple dataset adaptor for the processed ljspeech dataset.""" + + def __init__(self, root): + self.root = Path(root).expanduser() + meta_data = pandas.read_csv( + str(self.root / "metadata.csv"), + sep="\t", + header=None, + names=["fname", "frames", "samples"]) + + records = [] + for row in meta_data.itertuples(): + mel_path = str(self.root / "mel" / (row.fname + ".npy")) + wav_path = str(self.root / "wav" / (row.fname + ".npy")) + records.append((mel_path, wav_path)) + self.records = records + + def __getitem__(self, i): + mel_name, wav_name = self.records[i] + mel = np.load(mel_name) + wav = np.load(wav_name) + return mel, wav + + def __len__(self): + return len(self.records) + + +class LJSpeechCollector(object): + """A simple callable to batch LJSpeech examples.""" + + def __init__(self, padding_value=0.): + self.padding_value = padding_value + + def __call__(self, examples): + mels = [example[0] for example in examples] + wavs = [example[1] for example in examples] + mels, _ = batch_spec(mels, pad_value=self.padding_value) + wavs, _ = batch_wav(wavs, pad_value=self.padding_value) + return mels, wavs + + +class LJSpeechClipCollector(object): + def __init__(self, clip_frames=65, hop_length=256): + self.clip_frames = clip_frames + self.hop_length = hop_length + + def __call__(self, examples): + mels = [] + wavs = [] + for example in examples: + mel_clip, wav_clip = self.clip(example) + mels.append(mel_clip) + wavs.append(wav_clip) + mels = np.stack(mels) + wavs = np.stack(wavs) + return mels, wavs + + def clip(self, example): + mel, wav = example + frames = mel.shape[-1] + start = np.random.randint(0, frames - self.clip_frames) + mel_clip = mel[:, start:start + self.clip_frames] + wav_clip = wav[start * self.hop_length:(start + self.clip_frames) * + self.hop_length] + return mel_clip, wav_clip diff --git a/ernie-sat/paddlespeech/t2s/exps/waveflow/preprocess.py b/ernie-sat/paddlespeech/t2s/exps/waveflow/preprocess.py new file mode 100644 index 0000000000000000000000000000000000000000..ef3a29175896d7d02f7a9df4dcc930d33f9476af --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/exps/waveflow/preprocess.py @@ -0,0 +1,160 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
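+# Offline feature extraction for WaveFlow: each LJSpeech wav is reflect-padded,
+# peak-normalized, and turned into a log-magnitude mel spectrogram; paired
+# wav/mel .npy files plus a metadata.csv index are written to --output.
+# Illustrative run (paths are placeholders):
+#   python preprocess.py --input LJSpeech-1.1 --output dump/ljspeech_waveflow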
+import argparse +import os +from pathlib import Path + +import librosa +import numpy as np +import pandas as pd +import tqdm + +from paddlespeech.t2s.audio import LogMagnitude +from paddlespeech.t2s.datasets import LJSpeechMetaData +from paddlespeech.t2s.exps.waveflow.config import get_cfg_defaults + + +class Transform(object): + def __init__(self, sample_rate, n_fft, win_length, hop_length, n_mels, fmin, + fmax): + self.sample_rate = sample_rate + self.n_fft = n_fft + self.win_length = win_length + self.hop_length = hop_length + self.n_mels = n_mels + self.fmin = fmin + self.fmax = fmax + + self.spec_normalizer = LogMagnitude(min=1e-5) + + def __call__(self, example): + wav_path, _, _ = example + + sr = self.sample_rate + n_fft = self.n_fft + win_length = self.win_length + hop_length = self.hop_length + n_mels = self.n_mels + fmin = self.fmin + fmax = self.fmax + + wav, loaded_sr = librosa.load(wav_path, sr=None) + assert loaded_sr == sr, "sample rate does not match, resampling applied" + + # Pad audio to the right size. + frames = int(np.ceil(float(wav.size) / hop_length)) + fft_padding = (n_fft - hop_length) // 2 # sound + desired_length = frames * hop_length + fft_padding * 2 + pad_amount = (desired_length - wav.size) // 2 + + if wav.size % 2 == 0: + wav = np.pad(wav, (pad_amount, pad_amount), mode='reflect') + else: + wav = np.pad(wav, (pad_amount, pad_amount + 1), mode='reflect') + + # Normalize audio. + wav = wav / np.abs(wav).max() * 0.999 + + # Compute mel-spectrogram. + # Turn center to False to prevent internal padding. + spectrogram = librosa.core.stft( + wav, + hop_length=hop_length, + win_length=win_length, + n_fft=n_fft, + center=False) + spectrogram_magnitude = np.abs(spectrogram) + + # Compute mel-spectrograms. + mel_filter_bank = librosa.filters.mel( + sr=sr, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax) + mel_spectrogram = np.dot(mel_filter_bank, spectrogram_magnitude) + + # log scale mel_spectrogram. + mel_spectrogram = self.spec_normalizer.transform(mel_spectrogram) + + # Extract the center of audio that corresponds to mel spectrograms. 
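+ # The reflect padding added fft_padding samples on each side, so trimming
+ # them keeps audio.size == mel frames * hop_length (checked by the assert below).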
+ audio = wav[fft_padding:-fft_padding] + assert mel_spectrogram.shape[1] * hop_length == audio.size + + # there is no clipping here + return audio, mel_spectrogram + + +def create_dataset(config, input_dir, output_dir): + input_dir = Path(input_dir).expanduser() + dataset = LJSpeechMetaData(input_dir) + + output_dir = Path(output_dir).expanduser() + output_dir.mkdir(exist_ok=True) + + transform = Transform(config.sample_rate, config.n_fft, config.win_length, + config.hop_length, config.n_mels, config.fmin, + config.fmax) + file_names = [] + + for example in tqdm.tqdm(dataset): + fname, _, _ = example + base_name = os.path.splitext(os.path.basename(fname))[0] + wav_dir = output_dir / "wav" + mel_dir = output_dir / "mel" + wav_dir.mkdir(exist_ok=True) + mel_dir.mkdir(exist_ok=True) + + audio, mel = transform(example) + np.save(str(wav_dir / base_name), audio) + np.save(str(mel_dir / base_name), mel) + + file_names.append((base_name, mel.shape[-1], audio.shape[-1])) + + meta_data = pd.DataFrame.from_records(file_names) + meta_data.to_csv( + str(output_dir / "metadata.csv"), sep="\t", index=None, header=None) + print("saved meta data in to {}".format( + os.path.join(output_dir, "metadata.csv"))) + + print("Done!") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="create dataset") + parser.add_argument( + "--config", + type=str, + metavar="FILE", + help="extra config to overwrite the default config") + parser.add_argument( + "--input", type=str, help="path of the ljspeech dataset") + parser.add_argument( + "--output", type=str, help="path to save output dataset") + parser.add_argument( + "--opts", + nargs=argparse.REMAINDER, + help="options to overwrite --config file and the default config, passing in KEY VALUE pairs" + ) + parser.add_argument( + "-v", "--verbose", action="store_true", help="print msg") + + config = get_cfg_defaults() + args = parser.parse_args() + if args.config: + config.merge_from_file(args.config) + if args.opts: + config.merge_from_list(args.opts) + config.freeze() + if args.verbose: + print(config.data) + print(args) + + create_dataset(config.data, args.input, args.output) diff --git a/ernie-sat/paddlespeech/t2s/exps/waveflow/synthesize.py b/ernie-sat/paddlespeech/t2s/exps/waveflow/synthesize.py new file mode 100644 index 0000000000000000000000000000000000000000..53715b01ea0f89fd7cf19f18c4643e07f28d0422 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/exps/waveflow/synthesize.py @@ -0,0 +1,87 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
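+# Copy-synthesis with a trained WaveFlow vocoder: every *.npy mel spectrogram
+# under --input is converted back to a waveform (inside paddle.amp.auto_cast)
+# and written to --output at config.data.sample_rate.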
+import argparse +import os +from pathlib import Path + +import numpy as np +import paddle +import soundfile as sf + +from paddlespeech.t2s.exps.waveflow.config import get_cfg_defaults +from paddlespeech.t2s.models.waveflow import ConditionalWaveFlow +from paddlespeech.t2s.utils import layer_tools + + +def main(config, args): + if args.ngpu == 0: + paddle.set_device("cpu") + elif args.ngpu > 0: + paddle.set_device("gpu") + else: + print("ngpu should >= 0 !") + + model = ConditionalWaveFlow.from_pretrained(config, args.checkpoint_path) + layer_tools.recursively_remove_weight_norm(model) + model.eval() + + mel_dir = Path(args.input).expanduser() + output_dir = Path(args.output).expanduser() + output_dir.mkdir(parents=True, exist_ok=True) + for file_path in mel_dir.glob("*.npy"): + mel = np.load(str(file_path)) + with paddle.amp.auto_cast(): + audio = model.predict(mel) + audio_path = output_dir / (os.path.splitext(file_path.name)[0] + ".wav") + sf.write(audio_path, audio, config.data.sample_rate) + print("[synthesize] {} -> {}".format(file_path, audio_path)) + + +if __name__ == "__main__": + config = get_cfg_defaults() + + parser = argparse.ArgumentParser( + description="generate mel spectrogram with TransformerTTS.") + parser.add_argument( + "--config", + type=str, + metavar="FILE", + help="extra config to overwrite the default config") + parser.add_argument( + "--checkpoint_path", type=str, help="path of the checkpoint to load.") + parser.add_argument( + "--input", + type=str, + help="path of directory containing mel spectrogram (in .npy format)") + parser.add_argument("--output", type=str, help="path to save outputs") + parser.add_argument( + "--ngpu", type=int, default=1, help="if ngpu=0, use cpu.") + parser.add_argument( + "--opts", + nargs=argparse.REMAINDER, + help="options to overwrite --config file and the default config, passing in KEY VALUE pairs" + ) + parser.add_argument( + "-v", "--verbose", action="store_true", help="print msg") + + args = parser.parse_args() + if args.config: + config.merge_from_file(args.config) + if args.opts: + config.merge_from_list(args.opts) + config.freeze() + print(config) + print(args) + + main(config, args) diff --git a/ernie-sat/paddlespeech/t2s/exps/waveflow/train.py b/ernie-sat/paddlespeech/t2s/exps/waveflow/train.py new file mode 100644 index 0000000000000000000000000000000000000000..cf03f5ef17064d43a663c757fff8d605be2cb2f1 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/exps/waveflow/train.py @@ -0,0 +1,160 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
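+# WaveFlow training on top of ExperimentBase: the flow maps (wav, mel) pairs to
+# a latent z and a log-determinant term, and WaveFlowLoss turns them into a
+# negative log-likelihood to minimize. Training batches are fixed-length clips
+# from LJSpeechClipCollector; validation uses whole utterances.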
+import time + +import numpy as np +import paddle +from paddle import distributed as dist +from paddle.io import DataLoader +from paddle.io import DistributedBatchSampler + +from paddlespeech.t2s.datasets import dataset +from paddlespeech.t2s.exps.waveflow.config import get_cfg_defaults +from paddlespeech.t2s.exps.waveflow.ljspeech import LJSpeech +from paddlespeech.t2s.exps.waveflow.ljspeech import LJSpeechClipCollector +from paddlespeech.t2s.exps.waveflow.ljspeech import LJSpeechCollector +from paddlespeech.t2s.models.waveflow import ConditionalWaveFlow +from paddlespeech.t2s.models.waveflow import WaveFlowLoss +from paddlespeech.t2s.training.cli import default_argument_parser +from paddlespeech.t2s.training.experiment import ExperimentBase +from paddlespeech.t2s.utils import mp_tools + + +class Experiment(ExperimentBase): + def setup_model(self): + config = self.config + model = ConditionalWaveFlow( + upsample_factors=config.model.upsample_factors, + n_flows=config.model.n_flows, + n_layers=config.model.n_layers, + n_group=config.model.n_group, + channels=config.model.channels, + n_mels=config.data.n_mels, + kernel_size=config.model.kernel_size) + + if self.parallel: + model = paddle.DataParallel(model) + optimizer = paddle.optimizer.Adam( + config.training.lr, parameters=model.parameters()) + criterion = WaveFlowLoss(sigma=config.model.sigma) + + self.model = model + self.optimizer = optimizer + self.criterion = criterion + + def setup_dataloader(self): + config = self.config + args = self.args + + ljspeech_dataset = LJSpeech(args.data) + valid_set, train_set = dataset.split(ljspeech_dataset, + config.data.valid_size) + + batch_fn = LJSpeechClipCollector(config.data.clip_frames, + config.data.hop_length) + + if not self.parallel: + train_loader = DataLoader( + train_set, + batch_size=config.data.batch_size, + shuffle=True, + drop_last=True, + collate_fn=batch_fn) + else: + sampler = DistributedBatchSampler( + train_set, + batch_size=config.data.batch_size, + num_replicas=dist.get_world_size(), + rank=dist.get_rank(), + shuffle=True, + drop_last=True) + train_loader = DataLoader( + train_set, batch_sampler=sampler, collate_fn=batch_fn) + + valid_batch_fn = LJSpeechCollector() + valid_loader = DataLoader( + valid_set, batch_size=1, collate_fn=valid_batch_fn) + + self.train_loader = train_loader + self.valid_loader = valid_loader + + def compute_outputs(self, mel, wav): + # model_core = model._layers if isinstance(model, paddle.DataParallel) else model + z, log_det_jocobian = self.model(wav, mel) + return z, log_det_jocobian + + def train_batch(self): + start = time.time() + batch = self.read_batch() + data_loader_time = time.time() - start + + self.model.train() + self.optimizer.clear_grad() + mel, wav = batch + z, log_det_jocobian = self.compute_outputs(mel, wav) + loss = self.criterion(z, log_det_jocobian) + loss.backward() + self.optimizer.step() + iteration_time = time.time() - start + + loss_value = float(loss) + msg = "Rank: {}, ".format(dist.get_rank()) + msg += "step: {}, ".format(self.iteration) + msg += "time: {:>.3f}s/{:>.3f}s, ".format(data_loader_time, + iteration_time) + msg += "loss: {:>.6f}".format(loss_value) + self.logger.info(msg) + if dist.get_rank() == 0: + self.visualizer.add_scalar("train/loss", loss_value, self.iteration) + + @mp_tools.rank_zero_only + @paddle.no_grad() + def valid(self): + valid_iterator = iter(self.valid_loader) + valid_losses = [] + mel, wav = next(valid_iterator) + z, log_det_jocobian = self.compute_outputs(mel, wav) + loss = self.criterion(z, 
log_det_jocobian) + valid_losses.append(float(loss)) + valid_loss = np.mean(valid_losses) + self.visualizer.add_scalar("valid/loss", valid_loss, self.iteration) + + +def main_sp(config, args): + exp = Experiment(config, args) + exp.setup() + exp.resume_or_load() + exp.run() + + +def main(config, args): + if args.ngpu > 1: + dist.spawn(main_sp, args=(config, args), nprocs=args.ngpu) + else: + main_sp(config, args) + + +if __name__ == "__main__": + config = get_cfg_defaults() + parser = default_argument_parser() + args = parser.parse_args() + if args.config: + config.merge_from_file(args.config) + if args.opts: + config.merge_from_list(args.opts) + config.freeze() + print(config) + print(args) + + main(config, args) diff --git a/ernie-sat/paddlespeech/t2s/exps/wavernn/__init__.py b/ernie-sat/paddlespeech/t2s/exps/wavernn/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..abf198b97e6e818e1fbe59006f98492640bcee54 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/exps/wavernn/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/ernie-sat/paddlespeech/t2s/exps/wavernn/synthesize.py b/ernie-sat/paddlespeech/t2s/exps/wavernn/synthesize.py new file mode 100644 index 0000000000000000000000000000000000000000..d23e9cb7ed99104d20d73af6541cab462a4c5e11 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/exps/wavernn/synthesize.py @@ -0,0 +1,108 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
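+# Batch inference with a trained WaveRNN vocoder: mel features listed in
+# --test-metadata are loaded with np.load, generated with the batched /
+# target / overlap settings from config.inference, and written as wavs;
+# per-utterance speed and RTF (real-time factor) are reported.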
+import argparse +import os +from pathlib import Path + +import jsonlines +import numpy as np +import paddle +import soundfile as sf +import yaml +from paddle import distributed as dist +from timer import timer +from yacs.config import CfgNode + +from paddlespeech.t2s.datasets.data_table import DataTable +from paddlespeech.t2s.models.wavernn import WaveRNN + + +def main(): + parser = argparse.ArgumentParser(description="Synthesize with WaveRNN.") + + parser.add_argument("--config", type=str, help="Vocoder config file.") + parser.add_argument("--checkpoint", type=str, help="snapshot to load.") + parser.add_argument("--test-metadata", type=str, help="dev data.") + parser.add_argument("--output-dir", type=str, help="output dir.") + parser.add_argument( + "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") + + args = parser.parse_args() + + with open(args.config) as f: + config = CfgNode(yaml.safe_load(f)) + + print("========Args========") + print(yaml.safe_dump(vars(args))) + print("========Config========") + print(config) + print( + f"master see the word size: {dist.get_world_size()}, from pid: {os.getpid()}" + ) + + if args.ngpu == 0: + paddle.set_device("cpu") + elif args.ngpu > 0: + paddle.set_device("gpu") + else: + print("ngpu should >= 0 !") + + model = WaveRNN( + hop_length=config.n_shift, sample_rate=config.fs, **config["model"]) + state_dict = paddle.load(args.checkpoint) + model.set_state_dict(state_dict["main_params"]) + + model.eval() + + with jsonlines.open(args.test_metadata, 'r') as reader: + metadata = list(reader) + test_dataset = DataTable( + metadata, + fields=['utt_id', 'feats'], + converters={ + 'utt_id': None, + 'feats': np.load, + }) + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + N = 0 + T = 0 + for example in test_dataset: + utt_id = example['utt_id'] + mel = example['feats'] + mel = paddle.to_tensor(mel) # (T, C) + with timer() as t: + with paddle.no_grad(): + wav = model.generate( + c=mel, + batched=config.inference.gen_batched, + target=config.inference.target, + overlap=config.inference.overlap, + mu_law=config.mu_law, + gen_display=False) + wav = wav.numpy() + N += wav.size + T += t.elapse + speed = wav.size / t.elapse + rtf = config.fs / speed + print( + f"{utt_id}, mel: {mel.shape}, wave: {wav.shape}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}." + ) + sf.write(str(output_dir / (utt_id + ".wav")), wav, samplerate=config.fs) + print(f"generation speed: {N / T}Hz, RTF: {config.fs / (N / T) }") + + +if __name__ == "__main__": + main() diff --git a/ernie-sat/paddlespeech/t2s/exps/wavernn/train.py b/ernie-sat/paddlespeech/t2s/exps/wavernn/train.py new file mode 100644 index 0000000000000000000000000000000000000000..8661d311d218bda58142a846f3dedce5a07ffabf --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/exps/wavernn/train.py @@ -0,0 +1,212 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
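+# WaveRNN vocoder training: WaveRNNClip crops aligned (wave, feats) segments,
+# the criterion is CrossEntropyLoss in 'RAW' mode or the discretized
+# mix-logistic loss in 'MOL' mode, and Adam with global-norm gradient clipping
+# drives the Trainer/Updater/Evaluator loop.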
+import argparse +import os +import shutil +from pathlib import Path + +import jsonlines +import numpy as np +import paddle +import yaml +from paddle import DataParallel +from paddle import distributed as dist +from paddle.io import DataLoader +from paddle.io import DistributedBatchSampler +from paddle.optimizer import Adam +from yacs.config import CfgNode + +from paddlespeech.t2s.datasets.data_table import DataTable +from paddlespeech.t2s.datasets.vocoder_batch_fn import WaveRNNClip +from paddlespeech.t2s.models.wavernn import WaveRNN +from paddlespeech.t2s.models.wavernn import WaveRNNEvaluator +from paddlespeech.t2s.models.wavernn import WaveRNNUpdater +from paddlespeech.t2s.modules.losses import discretized_mix_logistic_loss +from paddlespeech.t2s.training.extensions.snapshot import Snapshot +from paddlespeech.t2s.training.extensions.visualizer import VisualDL +from paddlespeech.t2s.training.seeding import seed_everything +from paddlespeech.t2s.training.trainer import Trainer + + +def train_sp(args, config): + # decides device type and whether to run in parallel + # setup running environment correctly + world_size = paddle.distributed.get_world_size() + if (not paddle.is_compiled_with_cuda()) or args.ngpu == 0: + paddle.set_device("cpu") + else: + paddle.set_device("gpu") + if world_size > 1: + paddle.distributed.init_parallel_env() + + # set the random seed, it is a must for multiprocess training + seed_everything(config.seed) + + print( + f"rank: {dist.get_rank()}, pid: {os.getpid()}, parent_pid: {os.getppid()}", + ) + + # construct dataset for training and validation + with jsonlines.open(args.train_metadata, 'r') as reader: + train_metadata = list(reader) + train_dataset = DataTable( + data=train_metadata, + fields=["wave", "feats"], + converters={ + "wave": np.load, + "feats": np.load, + }, ) + + with jsonlines.open(args.dev_metadata, 'r') as reader: + dev_metadata = list(reader) + dev_dataset = DataTable( + data=dev_metadata, + fields=["wave", "feats"], + converters={ + "wave": np.load, + "feats": np.load, + }, ) + + batch_fn = WaveRNNClip( + mode=config.model.mode, + aux_context_window=config.model.aux_context_window, + hop_size=config.n_shift, + batch_max_steps=config.batch_max_steps, + bits=config.model.bits) + + # collate function and dataloader + train_sampler = DistributedBatchSampler( + train_dataset, + batch_size=config.batch_size, + shuffle=True, + drop_last=True) + dev_sampler = DistributedBatchSampler( + dev_dataset, + batch_size=config.batch_size, + shuffle=False, + drop_last=False) + print("samplers done!") + + train_dataloader = DataLoader( + train_dataset, + batch_sampler=train_sampler, + collate_fn=batch_fn, + num_workers=config.num_workers) + + dev_dataloader = DataLoader( + dev_dataset, + collate_fn=batch_fn, + batch_sampler=dev_sampler, + num_workers=config.num_workers) + + valid_generate_loader = DataLoader(dev_dataset, batch_size=1) + + print("dataloaders done!") + + model = WaveRNN( + hop_length=config.n_shift, sample_rate=config.fs, **config["model"]) + if world_size > 1: + model = DataParallel(model) + print("model done!") + + if config.model.mode == 'RAW': + criterion = paddle.nn.CrossEntropyLoss(axis=1) + elif config.model.mode == 'MOL': + criterion = discretized_mix_logistic_loss + else: + criterion = None + RuntimeError('Unknown model mode value - ', config.model.mode) + print("criterions done!") + clip = paddle.nn.ClipGradByGlobalNorm(config.grad_clip) + optimizer = Adam( + parameters=model.parameters(), + learning_rate=config.learning_rate, + 
grad_clip=clip) + + print("optimizer done!") + + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + if dist.get_rank() == 0: + config_name = args.config.split("/")[-1] + # copy conf to output_dir + shutil.copyfile(args.config, output_dir / config_name) + + updater = WaveRNNUpdater( + model=model, + optimizer=optimizer, + criterion=criterion, + dataloader=train_dataloader, + output_dir=output_dir, + mode=config.model.mode) + + evaluator = WaveRNNEvaluator( + model=model, + dataloader=dev_dataloader, + criterion=criterion, + output_dir=output_dir, + valid_generate_loader=valid_generate_loader, + config=config) + + trainer = Trainer( + updater, + stop_trigger=(config.train_max_steps, "iteration"), + out=output_dir) + + if dist.get_rank() == 0: + trainer.extend( + evaluator, trigger=(config.eval_interval_steps, 'iteration')) + trainer.extend(VisualDL(output_dir), trigger=(1, 'iteration')) + trainer.extend( + Snapshot(max_size=config.num_snapshots), + trigger=(config.save_interval_steps, 'iteration')) + + print("Trainer Done!") + trainer.run() + + +def main(): + # parse args and config and redirect to train_sp + + parser = argparse.ArgumentParser(description="Train a WaveRNN model.") + parser.add_argument( + "--config", type=str, help="config file to overwrite default config.") + parser.add_argument("--train-metadata", type=str, help="training data.") + parser.add_argument("--dev-metadata", type=str, help="dev data.") + parser.add_argument("--output-dir", type=str, help="output dir.") + parser.add_argument( + "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") + + args = parser.parse_args() + + with open(args.config, 'rt') as f: + config = CfgNode(yaml.safe_load(f)) + + print("========Args========") + print(yaml.safe_dump(vars(args))) + print("========Config========") + print(config) + print( + f"master see the word size: {dist.get_world_size()}, from pid: {os.getpid()}" + ) + + # dispatch + if args.ngpu > 1: + dist.spawn(train_sp, (args, config), nprocs=args.ngpu) + else: + train_sp(args, config) + + +if __name__ == "__main__": + main() diff --git a/ernie-sat/paddlespeech/t2s/frontend/__init__.py b/ernie-sat/paddlespeech/t2s/frontend/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..64015435eefd7a8f1d3369a49cb0be7e10c8ec60 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/frontend/__init__.py @@ -0,0 +1,20 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
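+# Re-export the text-frontend building blocks (lexicon generation, text
+# normalizers, phonetic converters, punctuation and tone-sandhi handling,
+# vocab, Chinese text normalization) so callers can simply do
+# `from paddlespeech.t2s.frontend import English`.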
+from .generate_lexicon import * +from .normalizer import * +from .phonectic import * +from .punctuation import * +from .tone_sandhi import * +from .vocab import * +from .zh_normalization import * diff --git a/ernie-sat/paddlespeech/t2s/frontend/arpabet.py b/ernie-sat/paddlespeech/t2s/frontend/arpabet.py new file mode 100644 index 0000000000000000000000000000000000000000..7a81b645d426c618d49f2ded1acd73d1bc9ccbbe --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/frontend/arpabet.py @@ -0,0 +1,268 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from paddlespeech.t2s.frontend.phonectic import Phonetics +""" +A phonology system with ARPABET symbols and limited punctuations. The G2P +conversion is done by g2p_en. + +Note that g2p_en does not handle words with hypen well. So make sure the input +sentence is first normalized. +""" +from paddlespeech.t2s.frontend.vocab import Vocab +from g2p_en import G2p + + +class ARPABET(Phonetics): + """A phonology for English that uses ARPABET as the phoneme vocabulary. + See http://www.speech.cs.cmu.edu/cgi-bin/cmudict for more details. + Phoneme Example Translation + ------- ------- ----------- + AA odd AA D + AE at AE T + AH hut HH AH T + AO ought AO T + AW cow K AW + AY hide HH AY D + B be B IY + CH cheese CH IY Z + D dee D IY + DH thee DH IY + EH Ed EH D + ER hurt HH ER T + EY ate EY T + F fee F IY + G green G R IY N + HH he HH IY + IH it IH T + IY eat IY T + JH gee JH IY + K key K IY + L lee L IY + M me M IY + N knee N IY + NG ping P IH NG + OW oat OW T + OY toy T OY + P pee P IY + R read R IY D + S sea S IY + SH she SH IY + T tea T IY + TH theta TH EY T AH + UH hood HH UH D + UW two T UW + V vee V IY + W we W IY + Y yield Y IY L D + Z zee Z IY + ZH seizure S IY ZH ER + """ + phonemes = [ + 'AA', 'AE', 'AH', 'AO', 'AW', 'AY', 'B', 'CH', 'D', 'DH', 'EH', 'ER', + 'EY', 'F', 'G', 'HH', 'IH', 'IY', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW', + 'OY', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UW', 'UH', 'V', 'W', 'Y', 'Z', + 'ZH' + ] + punctuations = [',', '.', '?', '!'] + symbols = phonemes + punctuations + _stress_to_no_stress_ = { + 'AA0': 'AA', + 'AA1': 'AA', + 'AA2': 'AA', + 'AE0': 'AE', + 'AE1': 'AE', + 'AE2': 'AE', + 'AH0': 'AH', + 'AH1': 'AH', + 'AH2': 'AH', + 'AO0': 'AO', + 'AO1': 'AO', + 'AO2': 'AO', + 'AW0': 'AW', + 'AW1': 'AW', + 'AW2': 'AW', + 'AY0': 'AY', + 'AY1': 'AY', + 'AY2': 'AY', + 'EH0': 'EH', + 'EH1': 'EH', + 'EH2': 'EH', + 'ER0': 'ER', + 'ER1': 'ER', + 'ER2': 'ER', + 'EY0': 'EY', + 'EY1': 'EY', + 'EY2': 'EY', + 'IH0': 'IH', + 'IH1': 'IH', + 'IH2': 'IH', + 'IY0': 'IY', + 'IY1': 'IY', + 'IY2': 'IY', + 'OW0': 'OW', + 'OW1': 'OW', + 'OW2': 'OW', + 'OY0': 'OY', + 'OY1': 'OY', + 'OY2': 'OY', + 'UH0': 'UH', + 'UH1': 'UH', + 'UH2': 'UH', + 'UW0': 'UW', + 'UW1': 'UW', + 'UW2': 'UW' + } + + def __init__(self): + self.backend = G2p() + self.vocab = Vocab(self.phonemes + self.punctuations) + + def _remove_vowels(self, phone): + return self._stress_to_no_stress_.get(phone, phone) + + def 
phoneticize(self, sentence, add_start_end=False): + """ Normalize the input text sequence and convert it into pronunciation sequence. + Args: + sentence (str): The input text sequence. + + Returns: + List[str]: The list of pronunciation sequence. + """ + phonemes = [ + self._remove_vowels(item) for item in self.backend(sentence) + ] + if add_start_end: + start = self.vocab.start_symbol + end = self.vocab.end_symbol + phonemes = [start] + phonemes + [end] + phonemes = [item for item in phonemes if item in self.vocab.stoi] + return phonemes + + def numericalize(self, phonemes): + """ Convert pronunciation sequence into pronunciation id sequence. + + Args: + phonemes (List[str]): The list of pronunciation sequence. + + Returns: + List[int]: The list of pronunciation id sequence. + """ + ids = [self.vocab.lookup(item) for item in phonemes] + return ids + + def reverse(self, ids): + """ Reverse the list of pronunciation id sequence to a list of pronunciation sequence. + + Args: + ids( List[int]): The list of pronunciation id sequence. + + Returns: + List[str]: + The list of pronunciation sequence. + """ + return [self.vocab.reverse(i) for i in ids] + + def __call__(self, sentence, add_start_end=False): + """ Convert the input text sequence into pronunciation id sequence. + + Args: + sentence (str): The input text sequence. + + Returns: + List[str]: The list of pronunciation id sequence. + """ + return self.numericalize( + self.phoneticize(sentence, add_start_end=add_start_end)) + + @property + def vocab_size(self): + """ Vocab size. + """ + # 47 = 39 phones + 4 punctuations + 4 special tokens + return len(self.vocab) + + +class ARPABETWithStress(Phonetics): + phonemes = [ + 'AA0', 'AA1', 'AA2', 'AE0', 'AE1', 'AE2', 'AH0', 'AH1', 'AH2', 'AO0', + 'AO1', 'AO2', 'AW0', 'AW1', 'AW2', 'AY0', 'AY1', 'AY2', 'B', 'CH', 'D', + 'DH', 'EH0', 'EH1', 'EH2', 'ER0', 'ER1', 'ER2', 'EY0', 'EY1', 'EY2', + 'F', 'G', 'HH', 'IH0', 'IH1', 'IH2', 'IY0', 'IY1', 'IY2', 'JH', 'K', + 'L', 'M', 'N', 'NG', 'OW0', 'OW1', 'OW2', 'OY0', 'OY1', 'OY2', 'P', 'R', + 'S', 'SH', 'T', 'TH', 'UH0', 'UH1', 'UH2', 'UW0', 'UW1', 'UW2', 'V', + 'W', 'Y', 'Z', 'ZH' + ] + punctuations = [',', '.', '?', '!'] + symbols = phonemes + punctuations + + def __init__(self): + self.backend = G2p() + self.vocab = Vocab(self.phonemes + self.punctuations) + + def phoneticize(self, sentence, add_start_end=False): + """ Normalize the input text sequence and convert it into pronunciation sequence. + + Args: + sentence (str): The input text sequence. + + Returns: + List[str]: The list of pronunciation sequence. + """ + phonemes = self.backend(sentence) + if add_start_end: + start = self.vocab.start_symbol + end = self.vocab.end_symbol + phonemes = [start] + phonemes + [end] + phonemes = [item for item in phonemes if item in self.vocab.stoi] + return phonemes + + def numericalize(self, phonemes): + """ Convert pronunciation sequence into pronunciation id sequence. + + Args: + phonemes (List[str]): The list of pronunciation sequence. + + Returns: + List[int]: The list of pronunciation id sequence. + """ + ids = [self.vocab.lookup(item) for item in phonemes] + return ids + + def reverse(self, ids): + """ Reverse the list of pronunciation id sequence to a list of pronunciation sequence. + Args: + ids (List[int]): The list of pronunciation id sequence. + + Returns: + List[str]: The list of pronunciation sequence. 
+ """ + return [self.vocab.reverse(i) for i in ids] + + def __call__(self, sentence, add_start_end=False): + """ Convert the input text sequence into pronunciation id sequence. + Args: + sentence (str): The input text sequence. + + Returns: + List[str]: The list of pronunciation id sequence. + """ + return self.numericalize( + self.phoneticize(sentence, add_start_end=add_start_end)) + + @property + def vocab_size(self): + """ Vocab size. + """ + # 77 = 69 phones + 4 punctuations + 4 special tokens + return len(self.vocab) diff --git a/ernie-sat/paddlespeech/t2s/frontend/generate_lexicon.py b/ernie-sat/paddlespeech/t2s/frontend/generate_lexicon.py new file mode 100644 index 0000000000000000000000000000000000000000..6b467d00e120692a7b5dab131cb517db2efc6b88 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/frontend/generate_lexicon.py @@ -0,0 +1,158 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Design principles: https://zhuanlan.zhihu.com/p/349600439 +"""Generate lexicon and symbols for Mandarin Chinese phonology. +The lexicon is used for Montreal Force Aligner. +Note that syllables are used as word in this lexicon. Since syllables rather +than words are used in transcriptions produced by `reorganize_baker.py`. +We make this choice to better leverage other software for chinese text to +pinyin tools like pypinyin. This is the convention for G2P in Chinese. +""" +import re +from collections import OrderedDict + +INITIALS = [ + 'b', 'p', 'm', 'f', 'd', 't', 'n', 'l', 'g', 'k', 'h', 'zh', 'ch', 'sh', + 'r', 'z', 'c', 's', 'j', 'q', 'x' +] + +FINALS = [ + 'a', 'ai', 'ao', 'an', 'ang', 'e', 'er', 'ei', 'en', 'eng', 'o', 'ou', + 'ong', 'ii', 'iii', 'i', 'ia', 'iao', 'ian', 'iang', 'ie', 'io', 'iou', + 'iong', 'in', 'ing', 'u', 'ua', 'uai', 'uan', 'uang', 'uei', 'uo', 'uen', + 'ueng', 'v', 've', 'van', 'vn' +] + +SPECIALS = ['sil', 'sp'] + + +def rule(C, V, R, T): + """Generate a syllable given the initial, the final, erhua indicator, and tone. + Orthographical rules for pinyin are applied. (special case for y, w, ui, un, iu) + + Note that in this system, 'ü' is alway written as 'v' when appeared in phoneme, but converted to + 'u' in syllables when certain conditions are satisfied. + + 'i' is distinguished when appeared in phonemes, and separated into 3 categories, 'i', 'ii' and 'iii'. + Erhua is is possibly applied to every finals, except for finals that already ends with 'r'. + When a syllable is impossible or does not have any characters with this pronunciation, return None + to filter it out. 
+ """ + + # 不可拼的音节, ii 只能和 z, c, s 拼 + if V in ["ii"] and (C not in ['z', 'c', 's']): + return None + # iii 只能和 zh, ch, sh, r 拼 + if V in ['iii'] and (C not in ['zh', 'ch', 'sh', 'r']): + return None + + # 齐齿呼或者撮口呼不能和 f, g, k, h, zh, ch, sh, r, z, c, s + if (V not in ['ii', 'iii']) and V[0] in ['i', 'v'] and ( + C in ['f', 'g', 'k', 'h', 'zh', 'ch', 'sh', 'r', 'z', 'c', 's']): + return None + + # 撮口呼只能和 j, q, x l, n 拼 + if V.startswith("v"): + # v, ve 只能和 j ,q , x, n, l 拼 + if V in ['v', 've']: + if C not in ['j', 'q', 'x', 'n', 'l', '']: + return None + # 其他只能和 j, q, x 拼 + else: + if C not in ['j', 'q', 'x', '']: + return None + + # j, q, x 只能和齐齿呼或者撮口呼拼 + if (C in ['j', 'q', 'x']) and not ( + (V not in ['ii', 'iii']) and V[0] in ['i', 'v']): + return None + + # b, p ,m, f 不能和合口呼拼,除了 u 之外 + # bm p, m, f 不能和撮口呼拼 + if (C in ['b', 'p', 'm', 'f']) and ((V[0] in ['u', 'v'] and V != "u") or + V == 'ong'): + return None + + # ua, uai, uang 不能和 d, t, n, l, r, z, c, s 拼 + if V in ['ua', 'uai', + 'uang'] and C in ['d', 't', 'n', 'l', 'r', 'z', 'c', 's']: + return None + + # sh 和 ong 不能拼 + if V == 'ong' and C in ['sh']: + return None + + # o 和 gkh, zh ch sh r z c s 不能拼 + if V == "o" and C in [ + 'd', 't', 'n', 'g', 'k', 'h', 'zh', 'ch', 'sh', 'r', 'z', 'c', 's' + ]: + return None + + # ueng 只是 weng 这个 ad-hoc 其他情况下都是 ong + if V == 'ueng' and C != '': + return + + # 非儿化的 er 只能单独存在 + if V == 'er' and C != '': + return None + + if C == '': + if V in ["i", "in", "ing"]: + C = 'y' + elif V == 'u': + C = 'w' + elif V.startswith('i') and V not in ["ii", "iii"]: + C = 'y' + V = V[1:] + elif V.startswith('u'): + C = 'w' + V = V[1:] + elif V.startswith('v'): + C = 'yu' + V = V[1:] + else: + if C in ['j', 'q', 'x']: + if V.startswith('v'): + V = re.sub('v', 'u', V) + if V == 'iou': + V = 'iu' + elif V == 'uei': + V = 'ui' + elif V == 'uen': + V = 'un' + result = C + V + + # Filter er 不能再儿化 + if result.endswith('r') and R == 'r': + return None + + # ii and iii, change back to i + result = re.sub(r'i+', 'i', result) + + result = result + R + T + return result + + +def generate_lexicon(with_tone=False, with_erhua=False): + """Generate lexicon for Mandarin Chinese.""" + syllables = OrderedDict() + + for C in [''] + INITIALS: + for V in FINALS: + for R in [''] if not with_erhua else ['', 'r']: + for T in [''] if not with_tone else ['1', '2', '3', '4', '5']: + result = rule(C, V, R, T) + if result: + syllables[result] = f'{C} {V}{R}{T}' + return syllables diff --git a/ernie-sat/paddlespeech/t2s/frontend/normalizer/__init__.py b/ernie-sat/paddlespeech/t2s/frontend/normalizer/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a03329f1e7f3e3d69ef49a46241511a4d05c098c --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/frontend/normalizer/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from paddlespeech.t2s.frontend.normalizer.normalizer import * +from paddlespeech.t2s.frontend.normalizer.numbers import * diff --git a/ernie-sat/paddlespeech/t2s/frontend/normalizer/abbrrviation.py b/ernie-sat/paddlespeech/t2s/frontend/normalizer/abbrrviation.py new file mode 100644 index 0000000000000000000000000000000000000000..abf198b97e6e818e1fbe59006f98492640bcee54 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/frontend/normalizer/abbrrviation.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/ernie-sat/paddlespeech/t2s/frontend/normalizer/acronyms.py b/ernie-sat/paddlespeech/t2s/frontend/normalizer/acronyms.py new file mode 100644 index 0000000000000000000000000000000000000000..abf198b97e6e818e1fbe59006f98492640bcee54 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/frontend/normalizer/acronyms.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/ernie-sat/paddlespeech/t2s/frontend/normalizer/normalizer.py b/ernie-sat/paddlespeech/t2s/frontend/normalizer/normalizer.py new file mode 100644 index 0000000000000000000000000000000000000000..421ebd1f54194f7ac421e8eaa7a757baca2715ea --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/frontend/normalizer/normalizer.py @@ -0,0 +1,34 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import re +import unicodedata +from builtins import str as unicode + +from paddlespeech.t2s.frontend.normalizer.numbers import normalize_numbers + + +def normalize(sentence): + """ Normalize English text. 
+ """ + # preprocessing + sentence = unicode(sentence) + sentence = normalize_numbers(sentence) + sentence = ''.join( + char for char in unicodedata.normalize('NFD', sentence) + if unicodedata.category(char) != 'Mn') # Strip accents + sentence = sentence.lower() + sentence = re.sub(r"[^ a-z'.,?!\-]", "", sentence) + sentence = sentence.replace("i.e.", "that is") + sentence = sentence.replace("e.g.", "for example") + return sentence diff --git a/ernie-sat/paddlespeech/t2s/frontend/normalizer/numbers.py b/ernie-sat/paddlespeech/t2s/frontend/normalizer/numbers.py new file mode 100644 index 0000000000000000000000000000000000000000..564fb9b635aac72966662601e8a90f48dcf4e34a --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/frontend/normalizer/numbers.py @@ -0,0 +1,86 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# number expansion is not that easy +import re + +import inflect + +_inflect = inflect.engine() +_comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])') +_decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)') +_pounds_re = re.compile(r'£([0-9\,]*[0-9]+)') +_dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)') +_ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)') +_number_re = re.compile(r'[0-9]+') + + +def _remove_commas(m): + return m.group(1).replace(',', '') + + +def _expand_decimal_point(m): + return m.group(1).replace('.', ' point ') + + +def _expand_dollars(m): + match = m.group(1) + parts = match.split('.') + if len(parts) > 2: + return match + ' dollars' # Unexpected format + dollars = int(parts[0]) if parts[0] else 0 + cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0 + if dollars and cents: + dollar_unit = 'dollar' if dollars == 1 else 'dollars' + cent_unit = 'cent' if cents == 1 else 'cents' + return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit) + elif dollars: + dollar_unit = 'dollar' if dollars == 1 else 'dollars' + return '%s %s' % (dollars, dollar_unit) + elif cents: + cent_unit = 'cent' if cents == 1 else 'cents' + return '%s %s' % (cents, cent_unit) + else: + return 'zero dollars' + + +def _expand_ordinal(m): + return _inflect.number_to_words(m.group(0)) + + +def _expand_number(m): + num = int(m.group(0)) + if num > 1000 and num < 3000: + if num == 2000: + return 'two thousand' + elif num > 2000 and num < 2010: + return 'two thousand ' + _inflect.number_to_words(num % 100) + elif num % 100 == 0: + return _inflect.number_to_words(num // 100) + ' hundred' + else: + return _inflect.number_to_words( + num, andword='', zero='oh', group=2).replace(', ', ' ') + else: + return _inflect.number_to_words(num, andword='') + + +def normalize_numbers(text): + """ Normalize numbers in English text. 
+ """ + text = re.sub(_comma_number_re, _remove_commas, text) + text = re.sub(_pounds_re, r'\1 pounds', text) + text = re.sub(_dollars_re, _expand_dollars, text) + text = re.sub(_decimal_number_re, _expand_decimal_point, text) + text = re.sub(_ordinal_re, _expand_ordinal, text) + text = re.sub(_number_re, _expand_number, text) + return text diff --git a/ernie-sat/paddlespeech/t2s/frontend/normalizer/width.py b/ernie-sat/paddlespeech/t2s/frontend/normalizer/width.py new file mode 100644 index 0000000000000000000000000000000000000000..d655e9274be2403c034240773b3b37d5f49b60b2 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/frontend/normalizer/width.py @@ -0,0 +1,40 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +def full2half_width(ustr): + half = [] + for u in ustr: + num = ord(u) + if num == 0x3000: # 全角空格变半角 + num = 32 + elif 0xFF01 <= num <= 0xFF5E: + num -= 0xfee0 + u = chr(num) + half.append(u) + return ''.join(half) + + +def half2full_width(ustr): + full = [] + for u in ustr: + num = ord(u) + if num == 32: # 半角空格变全角 + num = 0x3000 + elif 0x21 <= num <= 0x7E: + num += 0xfee0 + u = chr(num) # to unicode + full.append(u) + + return ''.join(full) diff --git a/ernie-sat/paddlespeech/t2s/frontend/phonectic.py b/ernie-sat/paddlespeech/t2s/frontend/phonectic.py new file mode 100644 index 0000000000000000000000000000000000000000..8e9f11737d117eb5143d070d971fd81f7f2b41f0 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/frontend/phonectic.py @@ -0,0 +1,294 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from abc import ABC +from abc import abstractmethod +from typing import List + +import numpy as np +import paddle +from g2p_en import G2p +from g2pM import G2pM + +from paddlespeech.t2s.frontend.normalizer.normalizer import normalize +from paddlespeech.t2s.frontend.punctuation import get_punctuations +from paddlespeech.t2s.frontend.vocab import Vocab +from paddlespeech.t2s.frontend.zh_normalization.text_normlization import TextNormalizer + +# discard opencc untill we find an easy solution to install it on windows +# from opencc import OpenCC + +__all__ = ["Phonetics", "English", "EnglishCharacter", "Chinese"] + + +class Phonetics(ABC): + @abstractmethod + def __call__(self, sentence): + pass + + @abstractmethod + def phoneticize(self, sentence): + pass + + @abstractmethod + def numericalize(self, phonemes): + pass + + +class English(Phonetics): + """ Normalize the input text sequence and convert into pronunciation id sequence. + """ + + def __init__(self, phone_vocab_path=None): + self.backend = G2p() + self.phonemes = list(self.backend.phonemes) + self.punctuations = get_punctuations("en") + self.vocab = Vocab(self.phonemes + self.punctuations) + self.vocab_phones = {} + self.punc = ":,;。?!“”‘’':,;.?!" + self.text_normalizer = TextNormalizer() + if phone_vocab_path: + with open(phone_vocab_path, 'rt') as f: + phn_id = [line.strip().split() for line in f.readlines()] + for phn, id in phn_id: + self.vocab_phones[phn] = int(id) + + def phoneticize(self, sentence): + """ Normalize the input text sequence and convert it into pronunciation sequence. + Args: + sentence (str): The input text sequence. + Returns: + List[str]: The list of pronunciation sequence. + """ + start = self.vocab.start_symbol + end = self.vocab.end_symbol + phonemes = ([] if start is None else [start]) \ + + self.backend(sentence) \ + + ([] if end is None else [end]) + phonemes = [item for item in phonemes if item in self.vocab.stoi] + return phonemes + + def _p2id(self, phonemes: List[str]) -> np.array: + phone_ids = [self.vocab_phones[item] for item in phonemes] + return np.array(phone_ids, np.int64) + + def get_input_ids(self, sentence: str, + merge_sentences: bool=False) -> paddle.Tensor: + result = {} + sentences = self.text_normalizer._split(sentence, lang="en") + phones_list = [] + temp_phone_ids = [] + for sentence in sentences: + phones = self.phoneticize(sentence) + # remove start_symbol and end_symbol + phones = phones[1:-1] + phones = [phn for phn in phones if not phn.isspace()] + # replace unk phone with sp + phones = [ + phn + if (phn in self.vocab_phones and phn not in self.punc) else "sp" + for phn in phones + ] + phones_list.append(phones) + + if merge_sentences: + merge_list = sum(phones_list, []) + # rm the last 'sp' to avoid the noise at the end + # cause in the training data, no 'sp' in the end + if merge_list[-1] == 'sp': + merge_list = merge_list[:-1] + phones_list = [] + phones_list.append(merge_list) + + for part_phones_list in phones_list: + phone_ids = self._p2id(part_phones_list) + phone_ids = paddle.to_tensor(phone_ids) + temp_phone_ids.append(phone_ids) + result["phone_ids"] = temp_phone_ids + return result + + def numericalize(self, phonemes): + """ Convert pronunciation sequence into pronunciation id sequence. + Args: + phonemes (List[str]): The list of pronunciation sequence. + Returns: + List[int]: The list of pronunciation id sequence. 
+ """ + ids = [ + self.vocab.lookup(item) for item in phonemes + if item in self.vocab.stoi + ] + return ids + + def reverse(self, ids): + """ Reverse the list of pronunciation id sequence to a list of pronunciation sequence. + Args: + ids (List[int]): The list of pronunciation id sequence. + Returns: + List[str]: The list of pronunciation sequence. + """ + return [self.vocab.reverse(i) for i in ids] + + def __call__(self, sentence): + """ Convert the input text sequence into pronunciation id sequence. + Args: + sentence(str): The input text sequence. + Returns: + List[str]: The list of pronunciation id sequence. + """ + return self.numericalize(self.phoneticize(sentence)) + + @property + def vocab_size(self): + """ Vocab size. + """ + return len(self.vocab) + + +class EnglishCharacter(Phonetics): + """ Normalize the input text sequence and convert it into character id sequence. + """ + + def __init__(self): + self.backend = G2p() + self.graphemes = list(self.backend.graphemes) + self.punctuations = get_punctuations("en") + self.vocab = Vocab(self.graphemes + self.punctuations) + + def phoneticize(self, sentence): + """ Normalize the input text sequence. + Args: + sentence(str): The input text sequence. + Returns: + str: A text sequence after normalize. + """ + words = normalize(sentence) + return words + + def numericalize(self, sentence): + """ Convert a text sequence into ids. + Args: + sentence (str): The input text sequence. + Returns: + List[int]: + List of a character id sequence. + """ + ids = [ + self.vocab.lookup(item) for item in sentence + if item in self.vocab.stoi + ] + return ids + + def reverse(self, ids): + """ Convert a character id sequence into text. + Args: + ids (List[int]): List of a character id sequence. + Returns: + str: The input text sequence. + """ + return [self.vocab.reverse(i) for i in ids] + + def __call__(self, sentence): + """ Normalize the input text sequence and convert it into character id sequence. + Args: + sentence (str): The input text sequence. + Returns: + List[int]: List of a character id sequence. + """ + return self.numericalize(self.phoneticize(sentence)) + + @property + def vocab_size(self): + """ Vocab size. + """ + return len(self.vocab) + + +class Chinese(Phonetics): + """Normalize Chinese text sequence and convert it into ids. + """ + + def __init__(self): + # self.opencc_backend = OpenCC('t2s.json') + self.backend = G2pM() + self.phonemes = self._get_all_syllables() + self.punctuations = get_punctuations("cn") + self.vocab = Vocab(self.phonemes + self.punctuations) + + def _get_all_syllables(self): + all_syllables = set([ + syllable for k, v in self.backend.cedict.items() for syllable in v + ]) + return list(all_syllables) + + def phoneticize(self, sentence): + """ Normalize the input text sequence and convert it into pronunciation sequence. + Args: + sentence(str): The input text sequence. + Returns: + List[str]: The list of pronunciation sequence. 
+ """ + # simplified = self.opencc_backend.convert(sentence) + simplified = sentence + phonemes = self.backend(simplified) + start = self.vocab.start_symbol + end = self.vocab.end_symbol + phonemes = ([] if start is None else [start]) \ + + phonemes \ + + ([] if end is None else [end]) + return self._filter_symbols(phonemes) + + def _filter_symbols(self, phonemes): + cleaned_phonemes = [] + for item in phonemes: + if item in self.vocab.stoi: + cleaned_phonemes.append(item) + else: + for char in item: + if char in self.vocab.stoi: + cleaned_phonemes.append(char) + return cleaned_phonemes + + def numericalize(self, phonemes): + """ Convert pronunciation sequence into pronunciation id sequence. + Args: + phonemes(List[str]): The list of pronunciation sequence. + Returns: + List[int]: The list of pronunciation id sequence. + """ + ids = [self.vocab.lookup(item) for item in phonemes] + return ids + + def __call__(self, sentence): + """ Convert the input text sequence into pronunciation id sequence. + Args: + sentence (str): The input text sequence. + Returns: + List[str]: The list of pronunciation id sequence. + """ + return self.numericalize(self.phoneticize(sentence)) + + @property + def vocab_size(self): + """ Vocab size. + """ + return len(self.vocab) + + def reverse(self, ids): + """ Reverse the list of pronunciation id sequence to a list of pronunciation sequence. + Args: + ids (List[int]): The list of pronunciation id sequence. + Returns: + List[str]: The list of pronunciation sequence. + """ + return [self.vocab.reverse(i) for i in ids] diff --git a/ernie-sat/paddlespeech/t2s/frontend/punctuation.py b/ernie-sat/paddlespeech/t2s/frontend/punctuation.py new file mode 100644 index 0000000000000000000000000000000000000000..23636dc54b1fa1f32ab88dc16d0d80efe217021b --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/frontend/punctuation.py @@ -0,0 +1,36 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +__all__ = ["get_punctuations"] + +EN_PUNCT = [ + " ", + "-", + "...", + ",", + ".", + "?", + "!", +] + +CN_PUNCT = ["、", ",", ";", ":", "。", "?", "!"] + + +def get_punctuations(lang): + if lang == "en": + return EN_PUNCT + elif lang == "cn": + return CN_PUNCT + else: + raise ValueError(f"language {lang} Not supported") diff --git a/ernie-sat/paddlespeech/t2s/frontend/tone_sandhi.py b/ernie-sat/paddlespeech/t2s/frontend/tone_sandhi.py new file mode 100644 index 0000000000000000000000000000000000000000..07f7fa2b8f8615af73fd656b0abd381e551179f9 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/frontend/tone_sandhi.py @@ -0,0 +1,348 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import List +from typing import Tuple + +import jieba +from pypinyin import lazy_pinyin +from pypinyin import Style + + +class ToneSandhi(): + def __init__(self): + self.must_neural_tone_words = { + '麻烦', '麻利', '鸳鸯', '高粱', '骨头', '骆驼', '马虎', '首饰', '馒头', '馄饨', '风筝', + '难为', '队伍', '阔气', '闺女', '门道', '锄头', '铺盖', '铃铛', '铁匠', '钥匙', '里脊', + '里头', '部分', '那么', '道士', '造化', '迷糊', '连累', '这么', '这个', '运气', '过去', + '软和', '转悠', '踏实', '跳蚤', '跟头', '趔趄', '财主', '豆腐', '讲究', '记性', '记号', + '认识', '规矩', '见识', '裁缝', '补丁', '衣裳', '衣服', '衙门', '街坊', '行李', '行当', + '蛤蟆', '蘑菇', '薄荷', '葫芦', '葡萄', '萝卜', '荸荠', '苗条', '苗头', '苍蝇', '芝麻', + '舒服', '舒坦', '舌头', '自在', '膏药', '脾气', '脑袋', '脊梁', '能耐', '胳膊', '胭脂', + '胡萝', '胡琴', '胡同', '聪明', '耽误', '耽搁', '耷拉', '耳朵', '老爷', '老实', '老婆', + '老头', '老太', '翻腾', '罗嗦', '罐头', '编辑', '结实', '红火', '累赘', '糨糊', '糊涂', + '精神', '粮食', '簸箕', '篱笆', '算计', '算盘', '答应', '笤帚', '笑语', '笑话', '窟窿', + '窝囊', '窗户', '稳当', '稀罕', '称呼', '秧歌', '秀气', '秀才', '福气', '祖宗', '砚台', + '码头', '石榴', '石头', '石匠', '知识', '眼睛', '眯缝', '眨巴', '眉毛', '相声', '盘算', + '白净', '痢疾', '痛快', '疟疾', '疙瘩', '疏忽', '畜生', '生意', '甘蔗', '琵琶', '琢磨', + '琉璃', '玻璃', '玫瑰', '玄乎', '狐狸', '状元', '特务', '牲口', '牙碜', '牌楼', '爽快', + '爱人', '热闹', '烧饼', '烟筒', '烂糊', '点心', '炊帚', '灯笼', '火候', '漂亮', '滑溜', + '溜达', '温和', '清楚', '消息', '浪头', '活泼', '比方', '正经', '欺负', '模糊', '槟榔', + '棺材', '棒槌', '棉花', '核桃', '栅栏', '柴火', '架势', '枕头', '枇杷', '机灵', '本事', + '木头', '木匠', '朋友', '月饼', '月亮', '暖和', '明白', '时候', '新鲜', '故事', '收拾', + '收成', '提防', '挖苦', '挑剔', '指甲', '指头', '拾掇', '拳头', '拨弄', '招牌', '招呼', + '抬举', '护士', '折腾', '扫帚', '打量', '打算', '打点', '打扮', '打听', '打发', '扎实', + '扁担', '戒指', '懒得', '意识', '意思', '情形', '悟性', '怪物', '思量', '怎么', '念头', + '念叨', '快活', '忙活', '志气', '心思', '得罪', '张罗', '弟兄', '开通', '应酬', '庄稼', + '干事', '帮手', '帐篷', '希罕', '师父', '师傅', '巴结', '巴掌', '差事', '工夫', '岁数', + '屁股', '尾巴', '少爷', '小气', '小伙', '将就', '对头', '对付', '寡妇', '家伙', '客气', + '实在', '官司', '学问', '学生', '字号', '嫁妆', '媳妇', '媒人', '婆家', '娘家', '委屈', + '姑娘', '姐夫', '妯娌', '妥当', '妖精', '奴才', '女婿', '头发', '太阳', '大爷', '大方', + '大意', '大夫', '多少', '多么', '外甥', '壮实', '地道', '地方', '在乎', '困难', '嘴巴', + '嘱咐', '嘟囔', '嘀咕', '喜欢', '喇嘛', '喇叭', '商量', '唾沫', '哑巴', '哈欠', '哆嗦', + '咳嗽', '和尚', '告诉', '告示', '含糊', '吓唬', '后头', '名字', '名堂', '合同', '吆喝', + '叫唤', '口袋', '厚道', '厉害', '千斤', '包袱', '包涵', '匀称', '勤快', '动静', '动弹', + '功夫', '力气', '前头', '刺猬', '刺激', '别扭', '利落', '利索', '利害', '分析', '出息', + '凑合', '凉快', '冷战', '冤枉', '冒失', '养活', '关系', '先生', '兄弟', '便宜', '使唤', + '佩服', '作坊', '体面', '位置', '似的', '伙计', '休息', '什么', '人家', '亲戚', '亲家', + '交情', '云彩', '事情', '买卖', '主意', '丫头', '丧气', '两口', '东西', '东家', '世故', + '不由', '不在', '下水', '下巴', '上头', '上司', '丈夫', '丈人', '一辈', '那个', '菩萨', + '父亲', '母亲', '咕噜', '邋遢', '费用', '冤家', '甜头', '介绍', '荒唐', '大人', '泥鳅', + '幸福', '熟悉', '计划', '扑腾', '蜡烛', '姥爷', '照顾', '喉咙', '吉他', '弄堂', '蚂蚱', + '凤凰', '拖沓', '寒碜', '糟蹋', '倒腾', '报复', '逻辑', '盘缠', '喽啰', '牢骚', '咖喱', + '扫把', '惦记' + } + self.must_not_neural_tone_words = { + "男子", "女子", "分子", "原子", "量子", "莲子", "石子", "瓜子", "电子", "人人", "虎虎" + } + self.punc = ":,;。?!“”‘’':,;.?!" + + # the meaning of jieba pos tag: https://blog.csdn.net/weixin_44174352/article/details/113731041 + # e.g. 
+ # word: "家里" + # pos: "s" + # finals: ['ia1', 'i3'] + def _neural_sandhi(self, word: str, pos: str, + finals: List[str]) -> List[str]: + + # reduplication words for n. and v. e.g. 奶奶, 试试, 旺旺 + for j, item in enumerate(word): + if j - 1 >= 0 and item == word[j - 1] and pos[0] in { + "n", "v", "a" + } and word not in self.must_not_neural_tone_words: + finals[j] = finals[j][:-1] + "5" + ge_idx = word.find("个") + if len(word) >= 1 and word[-1] in "吧呢哈啊呐噻嘛吖嗨呐哦哒额滴哩哟喽啰耶喔诶": + finals[-1] = finals[-1][:-1] + "5" + elif len(word) >= 1 and word[-1] in "的地得": + finals[-1] = finals[-1][:-1] + "5" + # e.g. 走了, 看着, 去过 + elif len(word) == 1 and word in "了着过" and pos in {"ul", "uz", "ug"}: + finals[-1] = finals[-1][:-1] + "5" + elif len(word) > 1 and word[-1] in "们子" and pos in { + "r", "n" + } and word not in self.must_not_neural_tone_words: + finals[-1] = finals[-1][:-1] + "5" + # e.g. 桌上, 地下, 家里 + elif len(word) > 1 and word[-1] in "上下里" and pos in {"s", "l", "f"}: + finals[-1] = finals[-1][:-1] + "5" + # e.g. 上来, 下去 + elif len(word) > 1 and word[-1] in "来去" and word[-2] in "上下进出回过起开": + finals[-1] = finals[-1][:-1] + "5" + # 个做量词 + elif (ge_idx >= 1 and + (word[ge_idx - 1].isnumeric() or + word[ge_idx - 1] in "几有两半多各整每做是")) or word == '个': + finals[ge_idx] = finals[ge_idx][:-1] + "5" + else: + if word in self.must_neural_tone_words or word[ + -2:] in self.must_neural_tone_words: + finals[-1] = finals[-1][:-1] + "5" + + word_list = self._split_word(word) + finals_list = [finals[:len(word_list[0])], finals[len(word_list[0]):]] + for i, word in enumerate(word_list): + # conventional neural in Chinese + if word in self.must_neural_tone_words or word[ + -2:] in self.must_neural_tone_words: + finals_list[i][-1] = finals_list[i][-1][:-1] + "5" + finals = sum(finals_list, []) + return finals + + def _bu_sandhi(self, word: str, finals: List[str]) -> List[str]: + # e.g. 看不懂 + if len(word) == 3 and word[1] == "不": + finals[1] = finals[1][:-1] + "5" + else: + for i, char in enumerate(word): + # "不" before tone4 should be bu2, e.g. 不怕 + if char == "不" and i + 1 < len(word) and finals[i + + 1][-1] == "4": + finals[i] = finals[i][:-1] + "2" + return finals + + def _yi_sandhi(self, word: str, finals: List[str]) -> List[str]: + # "一" in number sequences, e.g. 一零零, 二一零 + if word.find("一") != -1 and all( + [item.isnumeric() for item in word if item != "一"]): + return finals + # "一" between reduplication words shold be yi5, e.g. 看一看 + elif len(word) == 3 and word[1] == "一" and word[0] == word[-1]: + finals[1] = finals[1][:-1] + "5" + # when "一" is ordinal word, it should be yi1 + elif word.startswith("第一"): + finals[1] = finals[1][:-1] + "1" + else: + for i, char in enumerate(word): + if char == "一" and i + 1 < len(word): + # "一" before tone4 should be yi2, e.g. 一段 + if finals[i + 1][-1] == "4": + finals[i] = finals[i][:-1] + "2" + # "一" before non-tone4 should be yi4, e.g. 
一天 + else: + # "一" 后面如果是标点,还读一声 + if word[i + 1] not in self.punc: + finals[i] = finals[i][:-1] + "4" + return finals + + def _split_word(self, word: str) -> List[str]: + word_list = jieba.cut_for_search(word) + word_list = sorted(word_list, key=lambda i: len(i), reverse=False) + first_subword = word_list[0] + first_begin_idx = word.find(first_subword) + if first_begin_idx == 0: + second_subword = word[len(first_subword):] + new_word_list = [first_subword, second_subword] + else: + second_subword = word[:-len(first_subword)] + new_word_list = [second_subword, first_subword] + return new_word_list + + def _three_sandhi(self, word: str, finals: List[str]) -> List[str]: + if len(word) == 2 and self._all_tone_three(finals): + finals[0] = finals[0][:-1] + "2" + elif len(word) == 3: + word_list = self._split_word(word) + if self._all_tone_three(finals): + # disyllabic + monosyllabic, e.g. 蒙古/包 + if len(word_list[0]) == 2: + finals[0] = finals[0][:-1] + "2" + finals[1] = finals[1][:-1] + "2" + # monosyllabic + disyllabic, e.g. 纸/老虎 + elif len(word_list[0]) == 1: + finals[1] = finals[1][:-1] + "2" + else: + finals_list = [ + finals[:len(word_list[0])], finals[len(word_list[0]):] + ] + if len(finals_list) == 2: + for i, sub in enumerate(finals_list): + # e.g. 所有/人 + if self._all_tone_three(sub) and len(sub) == 2: + finals_list[i][0] = finals_list[i][0][:-1] + "2" + # e.g. 好/喜欢 + elif i == 1 and not self._all_tone_three(sub) and finals_list[i][0][-1] == "3" and \ + finals_list[0][-1][-1] == "3": + + finals_list[0][-1] = finals_list[0][-1][:-1] + "2" + finals = sum(finals_list, []) + # split idiom into two words who's length is 2 + elif len(word) == 4: + finals_list = [finals[:2], finals[2:]] + finals = [] + for sub in finals_list: + if self._all_tone_three(sub): + sub[0] = sub[0][:-1] + "2" + finals += sub + + return finals + + def _all_tone_three(self, finals: List[str]) -> bool: + return all(x[-1] == "3" for x in finals) + + # merge "不" and the word behind it + # if don't merge, "不" sometimes appears alone according to jieba, which may occur sandhi error + def _merge_bu(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: + new_seg = [] + last_word = "" + for word, pos in seg: + if last_word == "不": + word = last_word + word + if word != "不": + new_seg.append((word, pos)) + last_word = word[:] + if last_word == "不": + new_seg.append((last_word, 'd')) + last_word = "" + return new_seg + + # function 1: merge "一" and reduplication words in it's left and right, e.g. "听","一","听" ->"听一听" + # function 2: merge single "一" and the word behind it + # if don't merge, "一" sometimes appears alone according to jieba, which may occur sandhi error + # e.g. 
+ # input seg: [('听', 'v'), ('一', 'm'), ('听', 'v')] + # output seg: [['听一听', 'v']] + def _merge_yi(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: + new_seg = [] + # function 1 + for i, (word, pos) in enumerate(seg): + if i - 1 >= 0 and word == "一" and i + 1 < len(seg) and seg[i - 1][ + 0] == seg[i + 1][0] and seg[i - 1][1] == "v": + new_seg[i - 1][0] = new_seg[i - 1][0] + "一" + new_seg[i - 1][0] + else: + if i - 2 >= 0 and seg[i - 1][0] == "一" and seg[i - 2][ + 0] == word and pos == "v": + continue + else: + new_seg.append([word, pos]) + seg = new_seg + new_seg = [] + # function 2 + for i, (word, pos) in enumerate(seg): + if new_seg and new_seg[-1][0] == "一": + new_seg[-1][0] = new_seg[-1][0] + word + else: + new_seg.append([word, pos]) + return new_seg + + # the first and the second words are all_tone_three + def _merge_continuous_three_tones( + self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: + new_seg = [] + sub_finals_list = [ + lazy_pinyin( + word, neutral_tone_with_five=True, style=Style.FINALS_TONE3) + for (word, pos) in seg + ] + assert len(sub_finals_list) == len(seg) + merge_last = [False] * len(seg) + for i, (word, pos) in enumerate(seg): + if i - 1 >= 0 and self._all_tone_three( + sub_finals_list[i - 1]) and self._all_tone_three( + sub_finals_list[i]) and not merge_last[i - 1]: + # if the last word is reduplication, not merge, because reduplication need to be _neural_sandhi + if not self._is_reduplication(seg[i - 1][0]) and len( + seg[i - 1][0]) + len(seg[i][0]) <= 3: + new_seg[-1][0] = new_seg[-1][0] + seg[i][0] + merge_last[i] = True + else: + new_seg.append([word, pos]) + else: + new_seg.append([word, pos]) + + return new_seg + + def _is_reduplication(self, word: str) -> bool: + return len(word) == 2 and word[0] == word[1] + + # the last char of first word and the first char of second word is tone_three + def _merge_continuous_three_tones_2( + self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: + new_seg = [] + sub_finals_list = [ + lazy_pinyin( + word, neutral_tone_with_five=True, style=Style.FINALS_TONE3) + for (word, pos) in seg + ] + assert len(sub_finals_list) == len(seg) + merge_last = [False] * len(seg) + for i, (word, pos) in enumerate(seg): + if i - 1 >= 0 and sub_finals_list[i - 1][-1][-1] == "3" and sub_finals_list[i][0][-1] == "3" and not \ + merge_last[i - 1]: + # if the last word is reduplication, not merge, because reduplication need to be _neural_sandhi + if not self._is_reduplication(seg[i - 1][0]) and len( + seg[i - 1][0]) + len(seg[i][0]) <= 3: + new_seg[-1][0] = new_seg[-1][0] + seg[i][0] + merge_last[i] = True + else: + new_seg.append([word, pos]) + else: + new_seg.append([word, pos]) + return new_seg + + def _merge_er(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: + new_seg = [] + for i, (word, pos) in enumerate(seg): + if i - 1 >= 0 and word == "儿": + new_seg[-1][0] = new_seg[-1][0] + seg[i][0] + else: + new_seg.append([word, pos]) + return new_seg + + def _merge_reduplication( + self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: + new_seg = [] + for i, (word, pos) in enumerate(seg): + if new_seg and word == new_seg[-1][0]: + new_seg[-1][0] = new_seg[-1][0] + seg[i][0] + else: + new_seg.append([word, pos]) + return new_seg + + def pre_merge_for_modify( + self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: + seg = self._merge_bu(seg) + seg = self._merge_yi(seg) + seg = self._merge_reduplication(seg) + seg = self._merge_continuous_three_tones(seg) + seg = 
self._merge_continuous_three_tones_2(seg) + seg = self._merge_er(seg) + return seg + + def modified_tone(self, word: str, pos: str, + finals: List[str]) -> List[str]: + finals = self._bu_sandhi(word, finals) + finals = self._yi_sandhi(word, finals) + finals = self._neural_sandhi(word, pos, finals) + finals = self._three_sandhi(word, finals) + return finals diff --git a/ernie-sat/paddlespeech/t2s/frontend/vocab.py b/ernie-sat/paddlespeech/t2s/frontend/vocab.py new file mode 100644 index 0000000000000000000000000000000000000000..76bb3c7bb59fc286e8bcaa5db572dc71aec0f7df --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/frontend/vocab.py @@ -0,0 +1,120 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from collections import OrderedDict +from typing import Iterable + +__all__ = ["Vocab"] + + +class Vocab(object): + """ Vocabulary. + + Args: + symbols (Iterable[str]): Common symbols. + padding_symbol (str, optional): Symbol for pad. Defaults to "". + unk_symbol (str, optional): Symbol for unknow. Defaults to "" + start_symbol (str, optional): Symbol for start. Defaults to "" + end_symbol (str, optional): Symbol for end. Defaults to "" + """ + + def __init__(self, + symbols: Iterable[str], + padding_symbol="", + unk_symbol="", + start_symbol="", + end_symbol=""): + self.special_symbols = OrderedDict() + for i, item in enumerate( + [padding_symbol, unk_symbol, start_symbol, end_symbol]): + if item: + self.special_symbols[item] = len(self.special_symbols) + + self.padding_symbol = padding_symbol + self.unk_symbol = unk_symbol + self.start_symbol = start_symbol + self.end_symbol = end_symbol + + self.stoi = OrderedDict() + self.stoi.update(self.special_symbols) + + for i, s in enumerate(symbols): + if s not in self.stoi: + self.stoi[s] = len(self.stoi) + self.itos = {v: k for k, v in self.stoi.items()} + + def __len__(self): + return len(self.stoi) + + @property + def num_specials(self): + """ The number of special symbols. + """ + return len(self.special_symbols) + + # special tokens + @property + def padding_index(self): + """ The index of padding symbol + """ + return self.stoi.get(self.padding_symbol, -1) + + @property + def unk_index(self): + """The index of unknow symbol. + """ + return self.stoi.get(self.unk_symbol, -1) + + @property + def start_index(self): + """The index of start symbol. + """ + return self.stoi.get(self.start_symbol, -1) + + @property + def end_index(self): + """ The index of end symbol. + """ + return self.stoi.get(self.end_symbol, -1) + + def __repr__(self): + fmt = "Vocab(size: {},\nstoi:\n{})" + return fmt.format(len(self), self.stoi) + + def __str__(self): + return self.__repr__() + + def lookup(self, symbol): + """ The index that symbol correspond. + """ + return self.stoi[symbol] + + def reverse(self, index): + """ The symbol thar index cottespond. + """ + return self.itos[index] + + def add_symbol(self, symbol): + """ Add a new symbol in vocab. 
+ """ + if symbol in self.stoi: + return + N = len(self.stoi) + self.stoi[symbol] = N + self.itos[N] = symbol + + def add_symbols(self, symbols): + """ Add multiple symbols in vocab. + """ + for symbol in symbols: + self.add_symbol(symbol) diff --git a/ernie-sat/paddlespeech/t2s/frontend/zh_frontend.py b/ernie-sat/paddlespeech/t2s/frontend/zh_frontend.py new file mode 100644 index 0000000000000000000000000000000000000000..bb8ed5b4919ecfb67d3f54aade65b0d31e1d1a00 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/frontend/zh_frontend.py @@ -0,0 +1,314 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import re +from typing import Dict +from typing import List + +import jieba.posseg as psg +import numpy as np +import paddle +from g2pM import G2pM +from pypinyin import lazy_pinyin +from pypinyin import load_phrases_dict +from pypinyin import load_single_dict +from pypinyin import Style +from pypinyin_dict.phrase_pinyin_data import large_pinyin + +from paddlespeech.t2s.frontend.generate_lexicon import generate_lexicon +from paddlespeech.t2s.frontend.tone_sandhi import ToneSandhi +from paddlespeech.t2s.frontend.zh_normalization.text_normlization import TextNormalizer + + +class Frontend(): + def __init__(self, + g2p_model="pypinyin", + phone_vocab_path=None, + tone_vocab_path=None): + self.tone_modifier = ToneSandhi() + self.text_normalizer = TextNormalizer() + self.punc = ":,;。?!“”‘’':,;.?!" 
+ # g2p_model can be pypinyin and g2pM + self.g2p_model = g2p_model + if self.g2p_model == "g2pM": + self.g2pM_model = G2pM() + self.pinyin2phone = generate_lexicon( + with_tone=True, with_erhua=False) + else: + self.__init__pypinyin() + self.must_erhua = {"小院儿", "胡同儿", "范儿", "老汉儿", "撒欢儿", "寻老礼儿", "妥妥儿"} + self.not_erhua = { + "虐儿", "为儿", "护儿", "瞒儿", "救儿", "替儿", "有儿", "一儿", "我儿", "俺儿", "妻儿", + "拐儿", "聋儿", "乞儿", "患儿", "幼儿", "孤儿", "婴儿", "婴幼儿", "连体儿", "脑瘫儿", + "流浪儿", "体弱儿", "混血儿", "蜜雪儿", "舫儿", "祖儿", "美儿", "应采儿", "可儿", "侄儿", + "孙儿", "侄孙儿", "女儿", "男儿", "红孩儿", "花儿", "虫儿", "马儿", "鸟儿", "猪儿", "猫儿", + "狗儿" + } + self.vocab_phones = {} + self.vocab_tones = {} + if phone_vocab_path: + with open(phone_vocab_path, 'rt') as f: + phn_id = [line.strip().split() for line in f.readlines()] + for phn, id in phn_id: + self.vocab_phones[phn] = int(id) + if tone_vocab_path: + with open(tone_vocab_path, 'rt') as f: + tone_id = [line.strip().split() for line in f.readlines()] + for tone, id in tone_id: + self.vocab_tones[tone] = int(id) + + def __init__pypinyin(self): + large_pinyin.load() + + load_phrases_dict({u'开户行': [[u'ka1i'], [u'hu4'], [u'hang2']]}) + load_phrases_dict({u'发卡行': [[u'fa4'], [u'ka3'], [u'hang2']]}) + load_phrases_dict({u'放款行': [[u'fa4ng'], [u'kua3n'], [u'hang2']]}) + load_phrases_dict({u'茧行': [[u'jia3n'], [u'hang2']]}) + load_phrases_dict({u'行号': [[u'hang2'], [u'ha4o']]}) + load_phrases_dict({u'各地': [[u'ge4'], [u'di4']]}) + load_phrases_dict({u'借还款': [[u'jie4'], [u'hua2n'], [u'kua3n']]}) + load_phrases_dict({u'时间为': [[u'shi2'], [u'jia1n'], [u'we2i']]}) + load_phrases_dict({u'为准': [[u'we2i'], [u'zhu3n']]}) + load_phrases_dict({u'色差': [[u'se4'], [u'cha1']]}) + + # 调整字的拼音顺序 + load_single_dict({ord(u'地'): u'de,di4'}) + + def _get_initials_finals(self, word: str) -> List[List[str]]: + initials = [] + finals = [] + if self.g2p_model == "pypinyin": + orig_initials = lazy_pinyin( + word, neutral_tone_with_five=True, style=Style.INITIALS) + orig_finals = lazy_pinyin( + word, neutral_tone_with_five=True, style=Style.FINALS_TONE3) + for c, v in zip(orig_initials, orig_finals): + if re.match(r'i\d', v): + if c in ['z', 'c', 's']: + v = re.sub('i', 'ii', v) + elif c in ['zh', 'ch', 'sh', 'r']: + v = re.sub('i', 'iii', v) + initials.append(c) + finals.append(v) + elif self.g2p_model == "g2pM": + pinyins = self.g2pM_model(word, tone=True, char_split=False) + for pinyin in pinyins: + pinyin = pinyin.replace("u:", "v") + if pinyin in self.pinyin2phone: + initial_final_list = self.pinyin2phone[pinyin].split(" ") + if len(initial_final_list) == 2: + initials.append(initial_final_list[0]) + finals.append(initial_final_list[1]) + elif len(initial_final_list) == 1: + initials.append('') + finals.append(initial_final_list[1]) + else: + # If it's not pinyin (possibly punctuation) or no conversion is required + initials.append(pinyin) + finals.append(pinyin) + return initials, finals + + # if merge_sentences, merge all sentences into one phone sequence + def _g2p(self, + sentences: List[str], + merge_sentences: bool=True, + with_erhua: bool=True) -> List[List[str]]: + segments = sentences + phones_list = [] + for seg in segments: + phones = [] + # Replace all English words in the sentence + seg = re.sub('[a-zA-Z]+', '', seg) + seg_cut = psg.lcut(seg) + initials = [] + finals = [] + seg_cut = self.tone_modifier.pre_merge_for_modify(seg_cut) + for word, pos in seg_cut: + if pos == 'eng': + continue + sub_initials, sub_finals = self._get_initials_finals(word) + sub_finals = self.tone_modifier.modified_tone(word, pos, + 
sub_finals) + if with_erhua: + sub_initials, sub_finals = self._merge_erhua( + sub_initials, sub_finals, word, pos) + initials.append(sub_initials) + finals.append(sub_finals) + # assert len(sub_initials) == len(sub_finals) == len(word) + initials = sum(initials, []) + finals = sum(finals, []) + + for c, v in zip(initials, finals): + # NOTE: post process for pypinyin outputs + # we discriminate i, ii and iii + if c and c not in self.punc: + phones.append(c) + if c and c in self.punc: + phones.append('sp') + if v and v not in self.punc: + phones.append(v) + + phones_list.append(phones) + if merge_sentences: + merge_list = sum(phones_list, []) + # rm the last 'sp' to avoid the noise at the end + # cause in the training data, no 'sp' in the end + if merge_list[-1] == 'sp': + merge_list = merge_list[:-1] + phones_list = [] + phones_list.append(merge_list) + return phones_list + + def _merge_erhua(self, + initials: List[str], + finals: List[str], + word: str, + pos: str) -> List[List[str]]: + if word not in self.must_erhua and (word in self.not_erhua or + pos in {"a", "j", "nr"}): + return initials, finals + # "……" 等情况直接返回 + if len(finals) != len(word): + return initials, finals + + assert len(finals) == len(word) + + new_initials = [] + new_finals = [] + for i, phn in enumerate(finals): + if i == len(finals) - 1 and word[i] == "儿" and phn in { + "er2", "er5" + } and word[-2:] not in self.not_erhua and new_finals: + new_finals[-1] = new_finals[-1][:-1] + "r" + new_finals[-1][-1] + else: + new_finals.append(phn) + new_initials.append(initials[i]) + return new_initials, new_finals + + def _p2id(self, phonemes: List[str]) -> np.array: + # replace unk phone with sp + phonemes = [ + phn if phn in self.vocab_phones else "sp" for phn in phonemes + ] + phone_ids = [self.vocab_phones[item] for item in phonemes] + return np.array(phone_ids, np.int64) + + def _t2id(self, tones: List[str]) -> np.array: + # replace unk phone with sp + tones = [tone if tone in self.vocab_tones else "0" for tone in tones] + tone_ids = [self.vocab_tones[item] for item in tones] + return np.array(tone_ids, np.int64) + + def _get_phone_tone(self, phonemes: List[str], + get_tone_ids: bool=False) -> List[List[str]]: + phones = [] + tones = [] + if get_tone_ids and self.vocab_tones: + for full_phone in phonemes: + # split tone from finals + match = re.match(r'^(\w+)([012345])$', full_phone) + if match: + phone = match.group(1) + tone = match.group(2) + # if the merged erhua not in the vocab + # assume that the input is ['iaor3'] and 'iaor' not in self.vocab_phones, we split 'iaor' into ['iao','er'] + # and the tones accordingly change from ['3'] to ['3','2'], while '2' is the tone of 'er2' + if len(phone) >= 2 and phone != "er" and phone[ + -1] == 'r' and phone not in self.vocab_phones and phone[: + -1] in self.vocab_phones: + phones.append(phone[:-1]) + phones.append("er") + tones.append(tone) + tones.append("2") + else: + phones.append(phone) + tones.append(tone) + else: + phones.append(full_phone) + tones.append('0') + else: + for phone in phonemes: + # if the merged erhua not in the vocab + # assume that the input is ['iaor3'] and 'iaor' not in self.vocab_phones, change ['iaor3'] to ['iao3','er2'] + if len(phone) >= 3 and phone[:-1] != "er" and phone[ + -2] == 'r' and phone not in self.vocab_phones and ( + phone[:-2] + phone[-1]) in self.vocab_phones: + phones.append((phone[:-2] + phone[-1])) + phones.append("er2") + else: + phones.append(phone) + return phones, tones + + def get_phonemes(self, + sentence: str, + 
merge_sentences: bool=True, + with_erhua: bool=True, + robot: bool=False, + print_info: bool=False) -> List[List[str]]: + sentences = self.text_normalizer.normalize(sentence) + phonemes = self._g2p( + sentences, merge_sentences=merge_sentences, with_erhua=with_erhua) + # change all tones to `1` + if robot: + new_phonemes = [] + for sentence in phonemes: + new_sentence = [] + for item in sentence: + # `er` only have tone `2` + if item[-1] in "12345" and item != "er2": + item = item[:-1] + "1" + new_sentence.append(item) + new_phonemes.append(new_sentence) + phonemes = new_phonemes + if print_info: + print("----------------------------") + print("text norm results:") + print(sentences) + print("----------------------------") + print("g2p results:") + print(phonemes) + print("----------------------------") + return phonemes + + def get_input_ids(self, + sentence: str, + merge_sentences: bool=True, + get_tone_ids: bool=False, + robot: bool=False, + print_info: bool=False) -> Dict[str, List[paddle.Tensor]]: + phonemes = self.get_phonemes( + sentence, + merge_sentences=merge_sentences, + print_info=print_info, + robot=robot) + result = {} + phones = [] + tones = [] + temp_phone_ids = [] + temp_tone_ids = [] + for part_phonemes in phonemes: + phones, tones = self._get_phone_tone( + part_phonemes, get_tone_ids=get_tone_ids) + if tones: + tone_ids = self._t2id(tones) + tone_ids = paddle.to_tensor(tone_ids) + temp_tone_ids.append(tone_ids) + if phones: + phone_ids = self._p2id(phones) + phone_ids = paddle.to_tensor(phone_ids) + temp_phone_ids.append(phone_ids) + if temp_tone_ids: + result["tone_ids"] = temp_tone_ids + if temp_phone_ids: + result["phone_ids"] = temp_phone_ids + return result diff --git a/ernie-sat/paddlespeech/t2s/frontend/zh_normalization/README.md b/ernie-sat/paddlespeech/t2s/frontend/zh_normalization/README.md new file mode 100644 index 0000000000000000000000000000000000000000..92eea9f54630dfd41dfc3ce53bc511cc7595062c --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/frontend/zh_normalization/README.md @@ -0,0 +1,16 @@ +## Supported NSW (Non-Standard-Word) Normalization + +|NSW type|raw|normalized| +|:--|:-|:-| +|serial number|电影中梁朝伟扮演的陈永仁的编号27149|电影中梁朝伟扮演的陈永仁的编号二七一四九| +|cardinal|这块黄金重达324.75克
我们班的最高总分为583分|这块黄金重达三百二十四点七五克<br>我们班的最高总分为五百八十三分|
+|numeric range |12\~23<br>-1.5\~2|十二到二十三<br>负一点五到二|
+|date|她出生于86年8月18日,她弟弟出生于1995年3月1日|她出生于八六年八月十八日, 她弟弟出生于一九九五年三月一日|
+|time|等会请在12:05请通知我|等会请在十二点零五分请通知我|
+|temperature|今天的最低气温达到-10°C|今天的最低气温达到零下十度|
+|fraction|现场有7/12的观众投出了赞成票|现场有十二分之七的观众投出了赞成票|
+|percentage|明天有62%的概率降雨|明天有百分之六十二的概率降雨|
+|money|随便来几个价格12块5,34.5元,20.1万|随便来几个价格十二块五,三十四点五元,二十点一万|
+|telephone|这是固话0421-33441122<br>这是手机+86 18544139121|这是固话零四二一三三四四一一二二<br>这是手机八六一八五四四一三九一二一|
+## References
+[Pull requests #658 of DeepSpeech](https://github.com/PaddlePaddle/DeepSpeech/pull/658/files) diff --git a/ernie-sat/paddlespeech/t2s/frontend/zh_normalization/__init__.py b/ernie-sat/paddlespeech/t2s/frontend/zh_normalization/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a9d1f44d707fc182f2ba22a34be1cd200e9dafd8 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/frontend/zh_normalization/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from paddlespeech.t2s.frontend.zh_normalization.text_normlization import * diff --git a/ernie-sat/paddlespeech/t2s/frontend/zh_normalization/char_convert.py b/ernie-sat/paddlespeech/t2s/frontend/zh_normalization/char_convert.py new file mode 100644 index 0000000000000000000000000000000000000000..dcf95d72861348b36e4f21c31350f47e56934fc1 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/frontend/zh_normalization/char_convert.py @@ -0,0 +1,46 @@ +# coding=utf-8 +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Traditional and simplified Chinese conversion, a simplified character may correspond to multiple traditional characters.
+""" +simplified_charcters = '制咖片型超声盘鉴定仔点他命书歌粉巾字帐恤手指记忆棒形转弯沟光○〇㐄㐅㐆㐌㐖毒㐜㐡㐤㐰㐺㑇㑳㒳㒸㔾㗂㗎㝵㞎㞙㞞以㢲㢴㤅㥁㥯㨗㫺㬎㮎㮚㮸㲋㲱㲾㳮涧㵪㶸㷖㷭㹢㹴犬㺢狓㺵碗㽮㿝䍃䔢䖟䖸䗈䗥䗪䝓射䥯䦉䯝鲃鱼䲔䳗鹅䵹鼄䶑一对应映射丁不识下儿子做二休世丘之貉并中台原则串为甚谓干净了百事无成八变五十些人得道鸡升天代如并来去个国政策劲幽灵在欧洲游荡接样萝卜坑侧化传价元论醇共再准刀两断切分耕耘收获钱货物向看旧就绪险刻千金动劳永逸匙零夜半卡通回复返影踪反常态口咬气句话同吐快吹周味呼诺呜品红锅哄而散起唱和问三知生熟团漆黑火糟堆场空块面塌糊涂尘染壁厢夔已足多情露水大早到晚夫妻当关万莫开失古恨套所料既往孔见提师要家主审寸阴难买斗牛小撮部阵局展身层巴掌帆风顺席地带过年计于春头载四季期被蛇怕井绳度愿式份弹顷深前律径心意念差愁孤行俱全房厅交遮打技长把抓死拿眼泪鼻涕钥锁折段抿拍即合扫排掬挥拨拥上入击洞掷揽改故辙败文值名斑方面旁族日秋餐隔雅里终父旦时晌会霎间晃暴寒曝更月望垠际朝夕本正经利杯羹东西板枝独秀根筋杆进条龙服务概模次函数又性程总付步脚印趋登毛拔呵氧氮碳决雌雄波未平派谎言流清楚白准溜烟潭有获闻是处降琴鹤甲病发可拾沙目然了直以相眨穿睹瞥瞬矢的解石鸟神教秉虔诚秘种窝蜂穷窍笑置笔苟勾销抹杀煞等奖箍节吃箭仇双雕诗筹箩筐系列纸级士官统丝毫挂维网尽线微吭响股脑胎脉承腔臂力致效资源址器举功投般说讲规贸易叶障着慎满皆输号木电池衣倾钟高低视仁觉醒览遗角银币触溃九鼎蔽抄出驷马追重语破贫洗贯走路安蹴至几蹶振跃役胆汗较辈轮辞赞退六连遍递边针血锤音错门思闪真倒项栽雾类保护川先惊乍体哄鳞爪鸣滴泡邻域党专鼓作齐炒丑烯亥克内酯冬加奴卯肝炎基尺梁街裤镐客宠庭巳汝昌烷玲磊糖肇酉醛啷青县韪良香骨鲷丂七集河市弦喜嘴张舌堵区工业姊妹星架构巧彩扭歪拼凑余热曜武州爷浮屠美乡老阶树荤素碎落能魄鳃鳗珠丄丅丆万俟丈尚摸母娘量管群亚虎必我堂令申件装伏位博侠义界表女墟台戏臭皮匠胜诸葛亮赛顶倍催请运算包立叉戟离疫苗土史志演围揭瓦晒夷姑婆帝村宝烂尖杉碱屉桌山岔岛由纪峡坝库镇废从德后拗汤治旬食明昧曹朋友框栏极权幂曲归依猫民氟硼氯磷铁江侗自旅法司洋浦梅园温暖湾焦班幸用田略番叠皇炮捶硝苯酸腺苷棱草镜穗跳远索锦纲聚氰胺联店胚膲爱色堇紫罗兰芝茶饭菱云虫藏藩乱叛苏亲债凳学座恐恋柱测肌腹衩锥系貂企乌跪叩军车农题迭都甘油屯奏键短阿姨陪姐只顾茅庐槽驾魂鲜鹿页其菜单乘任供势午齿汉组织吊调泻唇坡城报坟外夸将尉建筑岸岗公床扬新剑升杭林栗校楼标款汽社浣海商馆剧院钢华港机械广媒环球融第医科证券综财乐育游涨犹岭疏瘾睑确兵领导缴肢膛船艾瑟尔苍蔡虞效衫覆访诉课谕议轨述野钩限敌鞋颌颔颚饶首龈站例修凡划垂届属崽颏厨拜挫摆放旋削棋榻槛礼沉注滑营狱画确仪聘花葬诏员跌辖周达酒锚闸陷陆雨雪飞威丌于丹久乏予理评产亢卑亦乎舞己悲矩圆词害志但住佞佳便俗信票案幅翁倦伦假偏倚斜亏鬼敲停备伤脾胃仅此像俭匮免宜穴焉戴兼容许冻伯仲负彼昼皂轩轾实刊划颠卫战哥比省非好黄饰别拘束掩奶睬选择摇扰烦苦枚写协厌及格受欢迎约只估侵犯割状告或缺抗拒挽撤救药喻磨灭端倪少逆逾越避靠适吉誉吝玉含延咎歹听啻渊善谋均匀堪忍够太惹妙妥妨孕症孝术室完纳推冠积宣疑辩栗碴称屈挠屑干涉衡待很忙恶忿怎么怠急耻恭息悦惑惜惟想愉愧怍慌愤启懂懈怀材才紧招认扣抵拉舍也罢插揣冒搭撞南墙扩核支攻敢雷攀敬里吗需景智暇曾罪遇朽枉止况竞争辱求愈渝溶济左右袒困补爽特寂寞示弱找谢畏强疾徐痛痒冤符眠睦瞅董何厚云措活疲羞者轻玻璃祥兆禁移稂莠稳佛换答简结果盟绝缕途给谈否羁翼耐肖胫毋宁兴舒若菲莱痕迹窠臼虚衰脸兔撒鹰棺范该详讳抬泰让须眉象众赀账费灰赖奇虑训辍辨菽麦辛近送透逞徒速续逮捕遂遑违逊斧钺艰醉锈随观弃显饱脂肪使丏丐帮丒且慢末丕替桃宗王尊凉爵各图屋脊粮署录坛吾禄职胄袭君厦丗北壑桐疹损逢陵鹬丙寅戌氨腈唑纶辰酮脱氢酶醚丞丢现掉纱帽弄扯炮碗丠両丣坐存激肩臻蒂莲悖序驱丨丩丫挺杈髻鬟细介俄伊犁京尼布订普渡央委监察检查剂圈设警队斯督剩震境航舶革防托播促质版蝾螈锋研艺历残消频谱精密制造陲邮候埔坚压坜凹汇执府究邦俘摄寮彬狼岳肺肿庸英讯诊埋粒胞括控码韩暑枪枢砥澳哇牟寿甸钻探篇签缀缝继耳肯照妇埃悬璧轴柜台辣搁浅邪跑纤阮阳私囊魔丮丰姿采丱烧丳丵丶丷丸参寨朗桂瑞砂衷霞貌凤仆舰因嫌宰峰干络牌持旨祭祷簿编罚宾办丼丿乀乂乃乄仰慕盛旷留考验阔乆乇么丑麽乊湖燃乑乒乓乕乖僻忤戾离谬迕乗危肥劫除隙浪婿乙炔肠酰吡咯盐乚乛乜嘢卿玄宫尾狐龟塔嶷兄弟泉章霄钉耙乞扎哀怜恕讨乢乣乤乥乧乨乩童乪乫乭乳晕汁液瑶浆牙癌突窦罩腐胶猪酪蛋糕菌瘤乴乵乶乷乸乹乺乼乾俸冰嘉哕嚎坤妈尸垒旱枯涸俐渴潮涩煸豆燥爹瘦瘪癣瞪袋脆姜贝隆馏乿亀亁叫咕攘扔搞男砸窜蓬麻亃亄亅却亇迟典今临繁累卵奉婚聪躬巨与迁添裂副宿岁怪恶尕仑愣杆硅硫钛铀锰芑杂异钠砷胂磺琥珀舱棍簧胡茬盗浩盆贩郎腿亍洪亐互欠助勉惠操斥诿系户译亓墓碑刑铃卅渠缤纷斗米旗宪钒灯徽瘟祖拳福谷丰脏腑绑肉腌苓蕴桥铺霸颜闹判喷冈底蛙陉矿亖亘亜罕们娜桑那努哈喀弗烈曼松森杜氏杯奥琛敦戊穆圣裔汇薛孙亟亡佚虏羊牢奋释卷卸契媾感额睫缠谊趾塞挤纽阻还配驰庄亨洛祚亪享津沪畿郊慈菴枇杷膏亭阁锃丽亳亶亹诛初责翻疯偶杰丛稠妖拖寰居吸授慧蜗吞壮魅狗矛盾益渣患忧稀描猿梦暂涯畜祸缘沸搜引擎臣横纭谁混援蒸兽狮税剖亻亼亽亡什献刹邡么仂仃仄仆富怨仈仉毕昔晨壳绍仍仏仒仕宦仗欺恃腰叹叹炬梓讫施仙后琼逝仚仝仞仟悔仡佬偿填泊拓扑簇羔购顿钦佩发棻阃驭养亿儆尤借帧赈凌叙帖李柔刚沃眦睚戒讹取飨读仨仫仮著泳卧躺韶夏裁仳仵唯贤凭钓诞仿似宋佛讽伀硕盼鹅伄儅伈伉俪柯始娃迈戈坦堡帕茨萨庙玛莉莎藤霍姆伋伍奢胥廷芳豪伎俩侍汛勒希羲雏伐憩整谟闲闲伕伙伴颐伜伝伢叔恒兹恩翰伱伲侣伶俜悧鼬伸懒缩喇叭伹伺伻伽倻辐伾似佃伫布乔妮墨佉卢佌贷劣廉昂档浓矮伞洼缓耗胸谷迷挡率龋宅沫舍疗佐贰佑占优据铧尝呢须鲁晓佗佘余坪寺瓜铳僧蒙芒陀龛哼呕坊奸孽弊揖祟茧缚誓贼佝偻瞀佟你夺赶佡佢佣佤佧贾佪佫佯佰佱洁绩酿肴佴卷佶佷佸佹佺佻佼佽佾具唤窘坏娱怒慨硬习惯聋膨胀蔓骇贵痹侀侁侂侃侄侅鸿燕侇侈糜靡侉侌妾侏儒仓鼠侐侑侔仑侘侚链侜偎傍钴循柳葫芦附価侮骂蔑侯岩截蚀局贴壶嬛宴捷携桶笺酌俣狭膝狄俅俉俊俏俎俑俓俔谚俚俛黎健呈固墒增守康箱湿祐镖镳杠盒靖膜龄俞豹猎噪孚封札筒托衍鸽剪撰稿炼厂禊练缮葺俯瞰撑冲效俳俴俵俶俷俺备俾伥倂倅储卒惶敷猝逃颉蓄崇隐倌倏忽刺蜡烛噍嚼坍扁抽毙葱楣灌灶粪背薮卖赔闭霉腾倓倔幸倘倜傥倝借箸挹浇阅倡狂倢倣値倥偬倨傲倩匡嗣冲柝珍倬倭寇猩倮倶倷倹勤赞偁偃充伪吏嗓寐惺扮拱芫茜藉虢钞偈伟晶偌宕距析滤殿疼瘫注颇偓偕鸭歇滞偝偟偢忘怡旺偨偩逼偫偭偯偰偱偲侦缉蹄偷减惰漏窥窃偸偺迹傀儡傅傈僳骂篱傎奎琳迪叟芭傒傔傕伧悉荒傜傞傢傣芽逼佣婢傮睨寄檄诵谣颂伛担辜弓惨蒿悼疤傺傻屄臆巢泄箧羡盖轧颓傿㑩僄僇佥僊働僎侨僔僖僚僝伪僣僤侥僦猴偾僩僬僭僮僯僰雇僵殖签静僾僿征陇儁侬儃儇侩朴薄儊儋儌儍傧儓俦侪拟尽儜儞儤儦儩汰哉寡渥裕酷儭儱罐儳儵儹傩俨儽兀臬臲鹫允勋勋宙宵帅憝彝谐嫂阋畅沛溢盈饥赫凶悍狠猛顽愚妣斩秦遣鞭耀敏荣槃泽爆碟磁秃缆辉霁卤朵娄孜烽酱勃汀箕裘钳耶蒙蕾彻兑软遭黜兎児韵媳爸兕觥兖兙兛兜售鍪肚兝兞兟兡兢兣樽殓涅睡禀籍赘泌啡肽奸幕涵涝熵疚眷稃衬讧赴焕椒歼植跏没试误猜栖窗肋袖颊兪卦撇胡岐廓轿疸枫茴珑厕秩募勺吨寓斤历亩迫筷厘最淫螺韬兮宽匪筛襄赢轭复兲诈刃堰戎痞蚁饷它冀铸冂冃円冇冉册嫁厉砺竭醮冏牧冑冓冔冕冖冗冘冞冢窄抑诬冥冫烘菇蛰冷凝坨橇淇淋炭饼砖碛窖醋雕雹霜冱冶炉艳嘲峻滩淡漠煖飕饮冼冽凃凄怆梗凅凇净凊凋敝蒙凔凛遵汞脢凞几凢処凰凯凵凶焰凸折刷纹预丧喽奔巡榜殡芙蓉租笼辑鞘萃凼锯镬刁蛮刂娩崩批拆摊掰蘖骤歧颗秒袂赃勿嘱忌磋琢肤刈羽刎讼戮舂桨艇刓刖霹雳刜创犊刡恙墅帜筵致劫劫刨昏默攸尿欲熏润薰圭删刮痧铲刱刲刳刴刵踏磅戳柏槐绣芹苋猬舟铭鹄鹜劫剁剃辫刭锉履铅克剌姻咽哨廊掠桅沿召瞻翅赵卜渺茫郭剒剔剕沥剚愎毅讷才剜剥啄采剞剟剡剣剤䌽剐肾驶黏剰袍剀紊铲剸剺剽剿劁劂札劈啪柴扳啦刘奭姥夼昫涓熙禅禹锡翔雁鹗刽刿弩柄蜻蛉劒劓劖劘劙澜篑赏矶釜晋甜薪逐劦熔纣虐赤囚劬劭労劵效劻劼劾峭艮勅勇励勍勐腊脖庞漫饲荡粥辄勖勗勘骄馁碌泮雇捐竹骑殊阱绩朴恳谨剿勧勩勯勰劢勋勷劝惩慰诫谏勹芡践阑匁庇拯粟扎袱裹饺匆遽匈匉匊匋匍匐茎匏匕妆痰脓蛹斋苑烤蹈塘羌熊阀螳螂疆碚竿纬荷茵邙魏匚匜匝匟扶稷匣匦拢匸匹耦匽匾匿卂叮疮禧轸堤棚迢钧炼卄卆遐卉瓷盲瓶当胱腱裸卋卌卍卐怯污贱鄙龌龊陋卓溪唐梯渔陈枣泥漳浔涧梨芬谯赡辕迦郑単驴弈洽鳌卛占筮卝卞卟吩啉屎翠厄卣卨卪卬卮榫袄玺绶钮蚤惧殆笃耸卲帘帙绕恤卼卽厂厎厓厔厖厗奚厘厍厜厝谅厕厤厥厪腻孢厮厰厳厣厹厺粕垢芜菁厼厾叁悟茸薯叄吵笄
悌哺讥坫垄弧芯杠潜婴刍袁诘贪谍煽馈驳収岳缔灾贿骗叚叡吻拦蘑蜜诀燧玩砚筝椎蔺铜逗骊另觅叨唠谒杵姓喊嚷嚣咚咛塑寻恼憎擦只泣渗蝠叱吒咄咤喝籀黛舵舷叵叶铎懿昭穰苴辽叻叼吁堑嫖赌瞧爬众抒吅吆夥卺橡涤抱纵摩郡唁坠扇篮膀袜颈吋忾谘酬哭妓媛暗表缰迩妃羿絮蕃浑拐葵暮隅吔吖啶嗪戚吜啬噬咽吟哦咏吠吧唧嗒咐吪隽咀征燐苞茹钙哧吮吰吱嘎吲哚吴栋娇窟孟箫忠晗淞阖闾趼宇呐睛嘘拂捧疵熄竽笛糠吼吽呀吕韦蒙呃呆笨呇贡呉罄呋喃呎呏呔呠呡痴呣呤呦呧瑛眩扒晬淑姬瑜璇鹃呪呫哔嚅嗫呬呯呰呱呲咧噌钝呴呶呷呸呺呻哱咻啸噜吁坎坷逻呿咁咂咆哮咇咈咋蟹煦珅蔼咍咑咒诅咔哒嚓咾哝哩喱咗咠咡咢咣咥咦咨嗟询咩咪咫啮啮咭咮咱咲咳呛嗽咴啕咸咹咺呙喉咿婉恸悯赋矜绿茗蓝哂抢瞒哆嗦啰噻啾滨彗哋哌哎唷哟哏哐哞哢哤哪里哫啼喘哰哲萎蚌哳咩哽哿呗唅唆唈唉唎唏哗尧棣殇璜睿肃唔睇唕吣唞唣喳唪唬唰喏唲唳唵嘛唶唸唹唻唼唾唿啁啃鹦鹉啅埠栈榷祺铺鞅飙啊啍啎啐啓啕啖啗啜哑祈啢衔啤啥啫啱啲啵啺饥啽噶昆沁喁喂喆裙喈咙喋喌喎喑喒喓喔粗喙幛庆滋鹊喟喣喤喥喦喧骚喨喩梆吃葡萄喭驼挑吓碰枞瓣纯疱藻趟铬喵営喹喺喼喿嗀嗃嗄嗅嗈嗉嗊嗍嗐嗑嗔诟嗕嗖嗙嗛嗜痂癖嗝嗡嗤嗥嗨唢嗬嗯嗰嗲嗵叽嗷嗹嗾嗿嘀嘁嘂嘅惋嘈峪禾荫啀嘌嘏嘐嘒啯啧嘚唛嘞嘟囔嘣嘥嘦嘧嘬嘭这谑严敞馋松哓嘶嗥呒虾嘹嘻啴嘿噀噂噅噇噉噎噏噔噗噘噙噚咝噞噢噤蝉皿噩噫噭嗳噱哙噳嚏涌洒欲巫霏噷噼嚃嚄嚆抖哜尝嚔苏嚚嚜嚞嚟呖嚬嚭嚮嚯亸喾饬按竣苛嚵嘤啭冁呓膪谦囍囒囓囗囘萧酚飘溅谛囝溯眸纥銮鹘囟殉囡団囤囥囧囨囱囫囵囬囮囯囲図囶囷囸囹圄圉拟囻囿圀圂圃圊粹蠹赦圌垦圏滚鲱凿枘圕圛圜圞坯埂壤骸炕祠窑豚绅魠鲮鳖圧握圩圪垯圬圮圯炸岬幔毯祇窨菩溉圳圴圻圾坂坆沾坋坌舛壈昆垫墩椅坒坓坩埚坭坰坱坳坴坵坻坼杨挣涎帘垃垈垌垍垓垔垕垗垚垛垝垣垞垟垤垧垮垵垺垾垿埀畔埄埆埇埈埌殃隍埏埒埕埗埜垭埤埦埧埭埯埰埲埳埴埵埶绋埸培怖桩础辅埼埽堀诃侄庑堃堄摧磐贞韧砌堈堉垩堋堌堍堎垴堙堞堠礁堧堨舆堭堮蜓摘堲堳堽堿塁塄塈煤茔棵塍垲埘塓绸塕鸦沽虱塙冢塝缪塡坞埙塥塩塬塱场螨塼塽塾塿墀墁墈墉墐夯増毁墝墠墦渍钵墫墬堕墰墺墙橱壅壆壊壌壎壒榨蒜壔壕壖圹垆壜壝垅壡壬壭壱売壴壹壻壸寝壿夂夅夆変夊夌漱邑夓腕泄甥御骼夗夘夙衮瑙妊娠醣枭珊莺鹭戗幻魇夤蹀秘擂鸫姚宛闺屿庾挞拇賛蛤裨菠氅漓捞湄蚊霆鲨箐篆篷荆肆舅荔鲆巷惭骰辟邱镕镰阪漂烩鲵鲽鳄鸨胪鹏妒峨谭枰晏玑癸祝秤竺牡籁恢罡蝼蝎赐绒御梭夬夭砣榆怙枕夶夹馅奄崛葩谲奈贺祀赠奌奂奓奕䜣詝奘奜奠奡奣陶奨奁魁奫奬奰娲孩贬隶酥宄狡猾她姹嫣妁毡荼皋膻蝇嫔妄妍嫉媚娆妗趣妚妞妤碍妬娅妯娌妲妳妵妺姁姅姉姗姒姘姙姜姝姞姣姤姧姫姮娥姱姸姺姽婀娀诱慑胁娉婷娑娓娟娣娭娯娵娶娸娼婊婐婕婞婤婥溪孺婧婪婬婹婺婼婽媁媄媊媕媞媟媠媢媬媮妫媲媵媸媺媻媪眯媿嫄嫈袅嫏嫕妪嫘嫚嫜嫠嫡嫦嫩嫪毐嫫嫬嫰妩嫺娴嫽嫿妫嬃嬅嬉耍婵痴艳嬔嬖嬗嫱袅嫒嬢嬷嬦嬬嬭幼嬲嬴婶嬹嬾嬿孀娘孅娈孏曰癫屏孑孓雀孖斟篓谜摺孛矻鸠崮轲祜鸾孥邈毓棠膑孬孭孰孱孳孵泛罔衔孻孪宀宁冗拙株薇掣抚琪瓿榴谧弥宊濂祁瑕宍宏碁宓邸谳実潢町宥宧宨宬徵崎骏掖阙臊煮禽蚕宸豫寀寁寥寃檐庶寎暄碜寔寖寘寙寛寠苫寤肘洱滥蒗陕核寪弘绰螽宝擅疙瘩晷対檐専尃尅赎绌缭畴衅尌峙醌襟痲碧屁昊槌淘恵瀑牝畑莓缸羚觑蔻脏躁尔尓锐尗尙尜尟尢尥尨尪尬尭尰擒尲尶尴尸尹潽蠖蛾尻扣梢蚴鳍脬蹲屇屌蚵屐屃挪屖屘屙屛屝屡屣峦嶂岩舄屧屦屩屪屃屮戍驻钾崖嵛巅旮旯楂榄榉芋茱萸靛麓屴屹屺屼岀岊岌岍阜岑彭巩岒岝岢岚岣岧岨岫岱岵岷峁峇峋峒峓峞峠嵋峨峰峱岘峹峿崀崁崆祯崋崌崃岖昆崒崔嵬巍萤颢崚崞崟崠峥巆崤崦崧殂岽崱崳崴崶崿嵂嵇嵊泗嵌嵎嵒嵓岁嵙嵞嵡嵩嵫嵯嵴嵼嵾嵝崭崭晴嶋嶌嶒嶓嵚崂嶙嶝嶞峤嶡嶢峄嶨嶭嶮嶰嶲岙嵘巂巃巇巉岿巌巓巘巛滇芎巟巠弋回巣巤炊擘蜥蟒蛊觋巰蜀彦淖杏茂甫楞巻巽帼巿帛斐鲫蕊帑帔帗帚琉汶帟帡帣帨裙帯帰帷帹暆帏幄帮幋幌幏帻幙帮幞幠幡幢幦幨幩幪帱幭幯幰遥蹉跎馀庚鉴幵幷稚邃庀庁広庄庈庉笠庋跋庖牺庠庤庥鲸庬庱庳庴庵馨衢庹庿廃厩廆廋廌廎廏廐廑廒荫廖廛厮搏锣廞弛袤廥廧廨廪廱绵踵髓廸迫瓯邺廻廼廾廿躔弁皱弇弌弍弎弐弑吊诡憾荐弝弢弣弤弨弭弮弰弪霖繇焘斌旭溥骞弶弸弼弾彀彄别累纠强彔彖彘彟彟陌彤贻彧绘虹彪炳雕蔚鸥彰瘅彲彳彴仿彷徉徨彸彽踩敛旆徂徇徊渭畲铉裼従筌徘徙徜徕膳苏萌渐徬徭醺徯徳徴潘徻徼忀瘁胖燎怦悸颤扉犀澎湃砰恍惚绞隘忉惮挨饿忐忑忒忖応忝忞耿忡忪忭忮忱忸怩忻悠懑怏遏怔怗怚怛怞怼黍讶怫怭懦怱怲恍怵惕怸怹恁恂恇恉恌恏恒恓恔恘恚恛恝恞恟恠恣恧眄恪恫恬澹恰恿悀悁悃悄悆悊悐悒晦悚悛悜悝悤您悩悪悮悰悱凄恻德悴怅惘闷悻悾惄愫钟蒐惆惇惌惎惏惓惔惙惛耄惝疟浊恿惦德恽惴蠢惸拈愀愃愆愈愊愍愐愑愒愓愔愕恪氓蠢騃昵惬赧悫愬愮愯恺愼慁恿慅慆慇霭慉慊愠慝慥怄怂慬慱悭慴慵慷戚焚憀灼郁憃惫憋憍眺捏轼愦憔憖憙憧憬憨憪憭怃憯憷憸憹憺懃懅懆邀懊懋怿懔懐懞懠懤懥恹懫懮懰懱毖懵遁梁雍忏懽戁戄戆戉戋戕戛戝戛戠戡戢戣戤戥戦戬戭戯轰戱披菊牖戸戹戺戻卯戽锹扂楔扃扆扈扊杖牵绢铐镯赉扐搂搅烊盹瞌跟趸镲靶鼾払扗玫腮扛扞扠扡扢盔押扤扦扱罾揄绥鞍郤窾扻扼扽抃抆抈抉抌抏瞎抔缳缢擞抜拗択抨摔歉蹿牾抶抻搐泵菸拃拄拊髀抛拌脯拎拏拑擢秧沓曳挛迂拚拝拠拡拫拭拮踢拴拶拷攒拽掇芥橐簪摹疔挈瓢骥捺蹻挌挍挎挐拣挓挖掘浚挙揍聩挲挶挟挿捂捃捄捅捆捉捋胳膊揎捌捍捎躯蛛捗捘捙捜捥捩扪捭据捱捻捼捽掀掂抡臀膘掊掎掏掐笙掔掗掞棉芍掤搪阐掫掮掯揉掱掲掽掾揃揅揆搓揌诨揕揗揘揜揝揞揠揥揩揪揫橥遒麈揰揲揵揶揸背揺搆搉搊搋搌搎搔搕撼橹捣搘搠搡搢搣搤搥搦搧搨搬楦裢讪赸掏搰搲搳搴揾搷搽搾搿摀摁摂摃摎掴摒摓跤摙摛掼摞摠摦喉羯摭摮挚摰摲抠摴抟摷掺摽撂撃撅稻撊撋挦锏泼撕撙撚㧑挢撢掸撦撅撩撬撱朔揿蚍蜉挝捡擀掳闯擉缶觚擐擕擖擗擡擣擤澡腚擧擨擩擫擭摈拧撷擸撸擽擿攃摅撵攉攥攐攓撄搀撺每攩攫辔澄攮攰攲攴轶攷砭讦攽碘敁敃敇敉叙敎筏敔敕敖闰诲敜煌敧敪敳敹敺敻敿斁衽斄牒绉诌斉斎斓鹑谰驳鳢斒筲斛斝斞斠斡斢斨斫斮晾沂潟颖绛邵斲斸釳於琅斾斿旀旗旃旄涡旌旎旐旒旓旖旛旝旟旡旣浴旰獭魃旴时旻旼旽昀昃昄昇昉晰躲澈熹皎皓矾昑昕昜昝昞昡昤晖笋昦昨是昱昳昴昶昺昻晁蹇隧蔬髦晄晅晒晛晜晞晟晡晢晤晥曦晩萘莹顗晿暁暋暌暍暐暔暕煅旸暝暠暡曚暦暨暪朦胧昵暲殄冯暵暸暹暻暾曀晔昙曈曌曏曐暧曘曙曛叠昽曩骆曱甴肱曷牍禺锟曽沧耽朁朅朆杪栓夸竟粘绦朊膺朏朐朓朕朘朙瞄觐溘饔飧朠朢朣栅椆淀虱朩朮朰朱炆璋钰炽鹮朳槿朵朾朿杅杇杌陧欣钊湛漼楷瀍煜玟缨翱肇舜贽适逵杓杕杗杙荀蘅杝杞脩珓筊杰榔狍閦颦缅莞杲杳眇杴杶杸杻杼枋枌枒枓衾葄翘纾逋枙狸桠枟槁枲枳枴枵枷枸橼枹枻柁柂柃柅柈柊柎某柑橘柒柘柙柚柜柞栎柟柢柣柤柩柬柮柰柲橙柶柷柸柺査柿栃栄栒栔栘栝栟柏栩栫栭栱栲栳栴檀栵栻桀骜桁镁桄桉桋桎梏椹葚桓桔桕桜桟桫椤桭杯桯桲桴桷桹湘溟梃梊梍梐潼栀枧梜梠梡梣梧梩梱梲梳梴梵梹棁棃樱棐棑棕榈簑绷蓑枨棘棜棨棩棪棫棬棯棰棱棳棸棹椁棼碗椄苕椈椊椋椌椐椑椓椗検椤椪椰椳椴椵椷椸椽椿楀匾楅篪楋楍楎楗楘楙楛楝楟楠楢楥桢楩楪楫楬楮楯楰梅楸楹楻楽榀榃榊榎槺榕榖榘榛狉莽搒笞榠榡榤榥榦榧杩榭榰榱梿霰榼榾桤槊闩槎槑槔槖様槜槢槥椠槪槭椮槱槲槻槼槾樆樊樏樑樕樗樘樛樟樠樧樨権樲樴樵猢狲桦樻罍樾樿橁橄橆桡笥龠橕橚橛辆椭橤橧竖膈跨橾橿檩檃檇柽檍檎檑檖檗桧槚檠樯檨檫檬梼槟檴檵柠棹櫆櫌栉櫜椟櫡槠栌枥榇栊櫹棂茄櫽欀欂欃欐欑栾欙棂溴欨欬欱欵欶欷歔欸欹欻欼欿歁歃歆艎歈歊莳蝶歓歕歘歙歛歜欤歠蹦诠镶蹒跚升陟歩歮歯歰歳歴璞歺瞑歾殁夭殈殍殑殗殜殙殛殒殢殣殥殪殚僵殰殳荃殷殸殹蛟殻肴谤殴毈毉喂毎毑蕈毗毘毚茛邓毧毬毳毷毹毽毾毵牦氄氆靴氉氊氇氍氐聊氕氖気氘氙氚氛氜氝氡汹焊痉氤氲氥氦铝锌氪烃氩铵痤汪浒漉痘盂碾菖蒲蕹蛭螅氵冰氹氺氽烫氾氿渚汆汊汋汍汎汏汐汔汕褟汙汚汜蓠沼秽蔑汧汨汩汭汲汳汴堤汾沄沅沆瀣沇沈葆浸沦湎溺痼疴沌沍沏沐沔沕沘浜畹砾沚沢沬沭沮沰沱灢沴沷籽沺烹濡洄泂肛泅泆涌肓泐泑泒泓泔泖泙泚泜泝泠漩馍涛粼泞藓鳅泩泫泭泯铢泱泲洇洊泾琵琶荽蓟箔洌洎洏洑潄濯洙洚洟洢洣洧洨洩痢滔洫洮洳洴洵洸洹洺洼洿淌蜚浄浉浙赣渫浠浡浤浥淼瀚浬浭翩萍浯浰蜃淀苔蛞蝓蜇螵蛸煲鲤浃浼浽溦涂涊涐涑涒涔滂莅涘涙涪涫涬涮涴涶涷涿淄淅淆淊凄黯淓淙涟淜淝淟淠淢淤渌淦淩猥藿亵淬淮淯淰淳诣涞纺淸淹炖癯绮渇済渉渋渓渕涣渟渢滓渤澥渧渨渮渰渲渶渼湅湉湋湍湑湓湔黔湜湝浈湟湢湣湩湫湮麟湱湲湴涅満沩溍溎溏溛舐漭溠溤溧驯溮溱溲溳溵溷溻溼溽溾滁滃滉滊荥滏稽滕滘汇滝滫滮羼耷卤滹浐煎漈漊漎绎漕漖漘漙沤漜漪漾漥漦漯漰溆漶漷濞潀颍潎潏潕潗潚潝潞潠潦祉疡潲潵滗潸潺潾涠澁澂澃澉澌澍澐澒澔澙渑澣澦澧澨澫澬浍澰澴澶澼熏郁濆濇濈濉濊貊濔疣濜濠濩觞浚濮盥潍濲泺瀁滢渎渖瀌浏瀒瀔濒泸瀛潇潆瀡潴泷濑瀬弥潋瀳瀵瀹瀺瀼沣滠灉灋灒漓灖灏灞灠滦灥灨滟灪蜴灮烬獴灴灸灺炁炅鱿炗炘炙炤炫疽烙钎炯炰炱炲炴炷毁炻烀烋瘴鲳烓烔焙烜烝烳饪
烺焃焄耆焌焐焓焗焜焞焠焢焮焯焱焼煁煃煆煇煊熠煍熬煐炜煕暖熏硷霾煚煝煟煠茕矸煨琐炀萁煳煺煻熀熅熇熉罴荧穹炝熘熛熜稔谙烁熤熨熯熰眶蚂颎熳熸熿燀烨燂燄盏燊燋燏燔隼燖焖燠燡灿燨燮燹燻燽燿爇爊爓爚爝爟爨蟾爯爰为爻丬爿牀牁牂牄牋窗牏牓窗釉牚腩蒡虻牠虽蛎牣牤牮牯牲牳牴牷牸牼绊牿靬犂犄犆犇犉犍犎犒荦犗犛犟犠犨犩犪犮犰狳犴犵犺狁甩狃狆狎狒獾狘狙黠狨狩狫狴狷狺狻豕狈蜘猁猇猈猊猋猓猖獗猗猘狰狞犸猞猟獕猭猱猲猳猷猸猹猺玃獀獃獉獍獏獐獒毙獙獚獜獝獞獠獢獣獧鼇蹊狯猃獬豸狝獯鬻獳犷猕猡玁菟玅玆玈珉糁禛郅玍玎玓瓅玔玕玖玗玘玞玠玡玢玤玥玦珏瑰玭玳瑁玶玷玹玼珂珇珈瑚珌馐馔珔珖珙珛珞珡珣珥珧珩珪佩珶珷珺珽琀琁陨玡琇琖琚琠琤琦琨琫琬琭琮琯琰琱琲琅琴珐珲瑀瑂瑄瑉玮瑑瑔瑗瑢瑭瑱瑲瑳瑽瑾瑿璀璨璁璅璆璈琏璊璐璘璚璝璟璠璡璥瑷璩璪璫璯璲玙璸璺璿瓀璎瓖瓘瓒瓛脐瓞瓠瓤瓧瓩瓮瓰瓱瓴瓸瓻瓼甀甁甃甄甇甋甍甎甏甑甒甓甔瓮甖甗饴蔗甙诧钜粱盎锈团甡褥産甪甬甭甮宁铠甹甽甾甿畀畁畇畈畊畋畎畓畚畛畟鄂畤畦畧荻畯畳畵畷畸畽畾疃叠疋疍疎箪疐疒疕疘疝疢疥疧疳疶疿痁痄痊痌痍痏痐痒痔痗瘢痚痠痡痣痦痩痭痯痱痳痵痻痿瘀痖瘃瘈瘉瘊瘌瘏瘐痪瘕瘖瘙瘚瘛疭瘜瘝瘗瘠瘥瘨瘭瘆瘯瘰疬瘳疠瘵瘸瘺瘘瘼癃痨痫癈癎癐癔癙癜癠疖症癞蟆癪瘿痈発踔绀蔫酵皙砬砒翎翳蔹钨镴皑鹎驹暨粤褶皀皁荚皃镈皈皌皋皒朱皕皖皘皜皝皞皤皦皨皪皫皭糙绽皴皲皻皽盅盋碗盍盚盝踞盦盩秋千盬盭眦睁瞤盯盱眙裰盵盻睐眂眅眈眊県眑眕眚眛眞眢眣眭眳眴眵眹瞓眽郛睃睅睆睊睍睎困睒睖睙睟睠睢睥睪睾睯睽睾眯瞈瞋瞍逛瞏瞕瞖眍䁖瞟瞠瞢瞫瞭瞳瞵瞷瞹瞽阇瞿眬矉矍铄矔矗矙瞩矞矟矠矣矧矬矫矰矱硪碇磙罅舫阡、矼矽礓砃砅砆砉砍砑砕砝砟砠砢砦砧砩砫砮砳艏砵砹砼硇硌硍硎硏硐硒硜硖砗磲茚钡硭硻硾碃碉碏碣碓碔碞碡碪碫碬砀碯碲砜碻礴磈磉磎硙磔磕磖磛磟磠磡磤磥蹭磪磬磴磵磹磻硗礀硚礅礌礐礚礜礞礤礧礮砻礲礵礽礿祂祄祅祆禳祊祍祏祓祔祕祗祘祛祧祫祲祻祼饵脔锢禂禇禋祦禔祎隋禖禘禚禜禝禠祃禢禤禥禨禫祢禴禸秆秈秊闱飒秋秏秕笈蘵赁秠秣秪秫秬秭秷秸稊稌稍稑稗稙稛稞稬秸稲稹稼颡稿穂穄穇穈穉穋稣贮穏穜穟秾穑穣穤穧穨穭穮穵穸窿阒窀窂窅窆窈窕窊窋窌窒窗窔窞窣窬黩蹙窑窳窴窵窭窸窗竁竃竈竑竜并竦竖篦篾笆鲛竾笉笊笎笏笐靥笓笤箓笪笫笭笮笰笱笲笳笵笸笻筀筅筇筈筎筑筘筠筤筥筦笕筒筭箸筰筱筳筴宴筸箂个箊箎箑箒箘箙箛箜篌箝箠箬镞箯箴箾篁筼筜篘篙篚篛篜篝篟篠篡篢篥篧篨篭篰篲筚篴篶篹篼箦簁簃簆簉簋簌簏簜簟簠簥簦簨簬簰簸簻籊藤籒籓籔签籚篯箨籣籥籧笾簖籫籯芾麴籵籸籹籼粁秕粋粑粔粝粛粞粢粧粨粲粳稗粻粽辟粿糅糆糈糌糍糒糔萼糗蛆蹋糢糨糬粽糯糱籴粜糸糺紃蹼鲣霉纡纨绔纫闽襻紑纰纮锭鸢鹞纴紞紟扎紩紬绂绁纻紽紾绐絁絃絅経絍绗絏缡褵絓絖絘絜绚絣螯絪絫聒絰絵绝絺絻絿綀绡綅绠绨绣綌綍綎捆綖綘継続缎绻綦綪线綮綯绾罟蝽綷縩绺绫緁绲緅緆缁绯緌緎総緑绱緖缃缄缂绵缗緤褓缌纂緪緰缑缈缏缇縁縃縄萦缙缒縏缣縕缞縚缜缟缛縠縡縢縦绦縯縰骋缧縳纤缦絷缥縻衙縿繄缫繈繊繋繐缯繖繘繙繠缋繣繨缰缲繸繻缱纁纆纇缬缵纩纑纕缵纙纚纛缾罃罆坛罋罂罎罏罖罘罛罝罠罣罥罦罨罫罭锾罳罶罹罻罽罿羂羃羇芈蕉51鸵羑羖羌羜羝羢羣羟羧羭羮羰羱羵羶羸藜鲐翀翃翅翊翌翏翕翛翟翡翣翥翦跹翪翫翚翮翯翱翽翾翿板饕鸹锨耋耇耎耏专耒耜耔耞耡耤耨耩耪耧耰鬓耵聍聃聆聎聝聡聦聱聴聂聼阈聿肄肏肐肕腋肙肜肟肧胛肫肬肭肰肴肵肸肼胊胍胏胑胔胗胙胝胠铨胤胦胩胬胭胯胰胲胴胹胻胼胾脇脘脝脞脡脣脤脥脧脰脲脳腆腊腌臜腍腒腓胨腜腠脶腥腧腬腯踝蹬镣腴腶蠕诽膂腽嗉膇膋膔腘膗膙膟黐膣膦膫膰膴膵膷脍臃臄臇臈臌臐臑臓膘臖臙臛臝臞臧蓐诩臽臾臿舀舁鳑鲏舋舎舔舗馆舝舠舡舢舨舭舲舳舴舸舺艁艄艅艉艋艑艕艖艗艘艚艜艟艣舣艨艩舻艬艭荏艴艳艸艹艻艿芃芄芊萰陂藭芏芔芘芚蕙芟芣芤茉芧芨芩芪芮芰鲢芴芷芸荛豢芼芿苄苒苘苙苜蓿苠苡苣荬苤苎苪镑苶苹苺苻苾茀茁范蠡萣茆茇茈茌茍茖茞茠茢茥茦菰茭茯茳藨茷藘茼荁荄荅荇荈菅蜢鸮荍荑荘豆荵荸荠莆莒莔莕莘莙莚莛莜莝莦莨菪莩莪莭莰莿菀菆菉菎菏菐菑菓菔芲菘菝菡菢菣菥蓂菧菫毂蓥菶菷菹醢菺菻菼菾萅萆苌萋萏萐萑萜萩萱萴莴扁萻葇葍葎葑荭葖葙葠葥苇葧葭药葳葴葶葸葹葽蒄蒎莼茏薹莅蒟蒻蒢蒦蒨蒭藁蒯蒱鉾蒴蒹蒺蒽荪蓁蓆蓇蓊蓌蓍蓏蓓蓖蓧蓪蓫荜跣藕苁蓰蓱莼蓷蓺蓼蔀蔂蔃蔆蔇蔉蔊蔋蔌蔎蔕蔘蔙蒌蔟锷蒋雯茑蔯蔳麻蔵蔸蔾荨蒇蕋蕍荞蕐蕑芸莸蕖蕗蕝蕞蕠蕡蒉蕣蕤蕨蕳蓣蕸蕺蕻薀薁薃薅薆荟薉芗薏薐蔷薖薘剃谔钗薜薠薢薤薧薨薫薬薳薶薷薸薽薾薿藄藇藋荩藐藙藚藟藦藳藴苈藷藾蘀蘁蕲苹蘗蘘蘝蘤蘧蘩蘸蘼虀虆虍蟠虒虓虖虡虣虥虩虬虰蛵蛇虷鳟虺虼蚆蚈蚋蚓蚔蚖蚘蚜蚡蚣蚧蚨蚩蚪蚯蚰蜒蚱蚳蚶蚹蚺蚻蚿蛀蛁蛄蛅蝮蛌蛍蛐蟮蛑蛓蛔蛘蛚蛜蛡蛣蜊蛩蛱蜕螫蜅蚬蜈蝣蜋蜍蜎蜑蠊蜛饯蜞蜣蜨蜩蜮蜱蜷蜺蜾蜿蝀蝃蝋蝌蝍蝎蝏蝗蝘蝙蝝鲼蝡蝤蝥猿蝰虻蝲蝴蝻螃蠏蛳螉螋螒螓螗螘螙螚蟥螟螣螥螬螭䗖螾螀蟀蟅蝈蟊蟋蟑蟓蟛蟜蟟蟢虮蟨蟪蟭蛲蟳蛏蟷蟺蟿蠁蠂蠃虿蠋蛴蠓蚝蠗蠙蠚蠛蠜蠧蟏蠩蜂蠮蠰蠲蠵蠸蠼蠽衁衄衄衇衈衉衋衎衒同衖胡衞裳钩衭衲衵衹衺衿袈裟袗袚袟袢袪袮袲袴袷袺袼褙袽裀裉袅裋夹裍裎裒裛裯裱裲裴裾褀褂褉褊裈褎褐褒褓褔褕袆褚褡褢褦褧褪褫袅褯褰褱裆褛褽褾襁褒襆裥襉襋襌襏襚襛襜裣襞襡襢褴襦襫襬襭襮襕襶襼襽襾覂覃覅霸覉覊覌覗觇覚覜觍觎覧覩觊觏覰観觌觔觕觖觜觽觝觡酲觩觫觭觱觳觯觷觼觾觿言赅讣訇訏訑訒诂讬訧訬訳訹证訾詀詅诋毁詈詊讵詑诒诐詗诎察詨诜詶詸詹詻诙诖誂誃诔锄诓誋诳诶悖誙诮诰誧説読誯谇訚谄谆諆諌诤诹诼諕谂谀諝谝諟喧谥諴諵谌谖誊謆謇歌謍謏謑谡谥謡謦謪谪讴謷謼谩哗譅譆譈譊讹譒撰谮鑫譞噪譩谵譬譱譲谴譸譹谫讅讆詟䜩雠讐谗谶讙谠讟谽豁豉豇岂豊豋豌豏豔豞豖豗豜豝豣豦豨豭豱豳豵豶豷豺豻貅貆狸猊貔貘䝙貜貤餍贳餸贶贲赂賏赊赇赒賝赓赕賨赍斗賮賵賸赚赙赜赟贉赆赑贕赝赬赭赱赳迄趁趂趄趐趑趒趔趡趦趫趮趯趱趴趵趷趹趺趿跁跂跅跆踬跄跐跕跖跗跙跛跦跧跩跫跬跮跱跲跴跺跼跽踅踆踈踉踊踒踖踘踜踟躇蹰踠踡踣踤踥踦踧跷踫踮逾踱踊踶踹踺踼踽躞蹁蹂躏蹎蹐蹓蹔跸蹚蹜蹝迹蹠蹡蹢跶蹧蹩蹪蹯鞠蹽躃躄躅踌跻躐踯跞躘躙躗躝躠蹑躜躧躩躭躰躬躶軃軆辊軏轫軘軜軝腭転軥軨軭軱轱辘軷轵轺軽軿輀輂辇辂辁輈挽輗辄辎辋輠輤輬輭輮辏輴輵輶輹輼辗辒轇轏轑轒辚轕轖轗轘轙轝轞轹轳罪辣辞辵辶辺込辿迅迋迍麿迓迣迤逦迥迨迮迸迺迻迿逄逅逌逍逑逓迳逖逡逭逯逴逶逹遄遅侦遘遛遝遢遨遫遯遰遴绕遹遻邂邅邉邋邎邕邗邘邛邠邢邧邨邯郸邰邲邳邴邶邷邽邾邿郃郄郇郈郔郕郗郙郚郜郝郞郏郠郢郪郫郯郰郲郳郴郷郹郾郿鄀鄄郓鄇鄈鄋鄍鄎鄏鄐鄑邹邬鄕郧鄗鄘鄚鄜鄞鄠鄢鄣鄤鄦鄩鄫鄬鄮鄯鄱郐鄷鄹邝鄻鄾鄿酃酅酆酇郦酊酋酎酏酐酣酔酕醄酖酗酞酡酢酤酩酴酹酺醁醅醆醊醍醐醑醓醖醝酝醡醤醨醪醭醯醰酦醲醴醵醸醹醼醽醾釂酾酽釆釈鲈镏阊钆钇钌钯钋鼢鼹钐钏釪釬釭釱钍釸钕钫鈃钭鈆鈇钚鈊鈌钤钣鈒鈤钬钪鈬铌铈钶铛钹铍钸钿鉄鉆铊铇鉌铋鉏铂钷铆钵鉥钲鉨钼钽鉱鉲鉶铰铒鉼铪銍銎铣銕镂铫铦铑铷銤铱铟銧铥铕铯銭銰焊銶锑锉汞鋂锒鋆鋈鋊铤鋍铗鋐鋑鋕鋘鋙锊锓锔锇铓鋭铖锆锂铽鋳鋹鋺鉴镚钎錀锞锖锫锩錍铔锕錔锱铮锛錞锬锜錤錩錬録铼錼锝钔锴鍉镀鍏鍐铡鍚锻锽锸锲锘鍫鍭鍱鍴锶鍹锗针锺锿镅鎉鎋鎌鎍鎏鎒鎓鎗镉鎚鎞镃鎤铩锼鎭鎯镒镍鎴镓鎸鎹镎镟鏊镆镠镝鏖铿锵鏚镗镘镛鏠鏦錾镤鏸镪鏻鏽鏾铙鐄鐇鐏铹镦镡鐗馗镫镢镨鐡锎镄鐩镌鐬鐱镭鐶鐻鐽镱鑀鑅镔鑐鑕鑚鑛鑢鑤镥鑪镧鑯鑱鑴鑵镊镢钃镻闫闬闶闳閒闵閗閟阂関合閤哄阆閲阉閺阎阏阍阌暗闉阕阗闑闒闿闘闚阚闟闠闤闼阞阢阤阨阬阯阹阼阽陁陑陔陛陜陡陥陬骘陴険陼陾阴隃隈隒隗隞隠隣隤隩隮隰颧隳隷隹雂雈雉雊雎雑雒雗雘雚雝雟雩雰雱驿霂霅霈霊沾霒霓霙霝霢霣霤霨霩霪霫霮靁叇叆靑靓靣腼靪靮靰靳靷靸靺靼靿鞀鞃鞄鞍鞗鞙鞚鞝鞞鞡鞣鞨鞫鞬鞮鞶鞹鞾鞑韅鞯驮韍韎韔韖韘韝韫韡韣韭韭韱韹韺頀刮頄顸顼頍颀颃颁頖頞頠頫頬颅頯頲颕頼悴顋顑颙颛颜顕顚顜颟顣颥颞飐飑台飓颸飏飖颽颾颿飀飂飚飌翻飡飣饲飥饨饫飮飧飶餀餂饸饹餇餈饽哺馂餖餗餚馄馃餟餠餤餧餩餪餫糊餮糇餲饧馎糕饩馈馊馌馒饇馑馓膳饎饐饘饟馕馘馥馝馡馣骝骡馵馹駃駄駅駆駉駋驽駓驵駗骀驸駜骂骈駪駬骃駴骎駹駽駾騂騄骓騆騉騋骒骐麟騑騒験騕骛騠騢騣騤騧骧騵驺骟騺蓦骖骠骢驆驈骅驌骁驎骣驒驔驖驙驦驩驫骺鲠骫骭肮骱骴骶骷髅骾髁髂髄髆膀髇髑髌髋髙髝髞髟髡髣髧髪髫髭髯髲髳髹髺髽髾鬁鬃鬅鬈鬋鬎鬏鬐鬑鬒鬖鬗鬘鬙鬠鬣斗鬫鬬阄鬯鬰鬲鬵鬷魆魈魊魋魍魉魑魖鳔魛魟魣魦魨魬鲂魵魸鮀鲅鮆鲧鲇鲍鲋鮓鲒鲕鮟鱇鮠鮦鮨鲔鲑鮶鮸鮿鲧鯄鯆鲩鯈鲻鯕鲭鲞鯙鯠鲲鯥鲰鲶鳀鯸鳊鲗䲠鹣鳇鰋鳄鳆鰕鰛鰜鲥鰤鳏鰦鳎鳐鳁鳓鰶鲦鲡鰼鰽鱀鱄鳙鱆鳕鱎鱐鳝鳝鳜鲟鲎鱠鳣鱨鲚鱮鱲鱵鱻鲅鳦凫鳯鳲鳷鳻鴂鴃鴄鸩鴈鴎鸰鴔鴗鸳鸯鸲鹆鸱鴠鴢鸪鴥鸸鹋鴳鸻鴷鴽鵀鵁鸺鹁鵖鵙鹈鹕鹅鵟鵩鹌鵫鵵鵷鵻鹍鶂鶊鶏鶒鹙鶗鶡鶤鶦鶬鶱鹟鶵鶸鶹鹡鶿鹚鷁鷃鷄鷇䴘䴘鷊鷏鹧鷕鹥鸷鷞鷟鸶鹪鹩鷩鷫鷭鹇鹇鸴鷾䴙鸂鸇䴙鸏鸑鸒鸓鸬鹳鸜鹂鹸咸鹾麀麂
麃麄麇麋麌麐麑麒麚麛麝麤麸面麫麮麯麰麺麾黁黈黉黢黒黓黕黙黝黟黥黦黧黮黰黱黪黶黹黻黼黾鼋鼂鼃鼅鼈鼍鼏鼐鼒冬鼖鼙鼚鼛鼡鼩鼱鼪鼫鼯鼷鼽齁齆齇齈齉齌赍齑龀齕齗龅齚龇齞龃龉龆齢出齧齩齮齯齰齱齵齾厐龑龒龚龖龘龝龡龢龤' + +traditional_characters = '制咖片型超聲盤鑒定仔點他命書歌粉巾字帳恤手指記憶棒形轉彎溝光○〇㐄㐅㐆㐌㐖毒㐜㐡㐤㐰㐺㑇㑳㒳㒸㔾㗂㗎㝵㞎㞙㞞㠯㢲㢴㤅㥁㥯㨗㫺㬎㮎㮚㮸㲋㲱㲾㳮㵎㵪㶸㷖㷭㹢㹴犬㺢狓㺵㼝㽮㿝䍃䔢䖟䖸䗈䗥䗪䝓䠶䥯䦉䯝䰾魚䲔䳗䳘䵹鼄䶑一對應映射丁不識下兒子做二休世丘之貉並中台原則串為甚謂乾淨了百事無成八變五十些人得道雞升天代如併來去個國政策勁幽靈在歐洲遊蕩接樣蘿蔔坑側化傳價元論醇共再准刀兩斷切分耕耘收穫錢貨物向看舊就緒險刻千金動勞永逸匙零夜半卡通回復返影蹤反常態口咬氣句話同吐快吹周味呼諾嗚品紅鍋哄而散起唱和問三知生熟團漆黑火糟堆場空塊麵塌糊塗塵染壁廂夔已足多情露水大早到晚夫妻當關萬莫開失古恨套所料既往孔見提師要家主審寸陰難買鬥牛小撮部陣局展身層巴掌帆風順席地帶過年計於春頭載四季期被蛇怕井繩度願式份彈頃深前律徑心意念差愁孤行俱全房廳交遮打技長把抓死拿眼淚鼻涕鑰鎖折段抿拍即合掃排掬揮撥擁上入擊洞擲攬改故轍敗文值名斑方面旁族日秋餐隔雅里終父旦時晌會霎間晃暴寒曝更月望垠際朝夕本正經利杯羹東西板枝獨秀根筋桿進條龍服務概模次函數又性程總付步腳印趨登毛拔呵氧氮碳決雌雄波未平派謊言流清楚白準溜煙潭有獲聞是處降琴鶴甲病發可拾沙目然瞭直以相眨穿睹瞥瞬矢的解石鳥神教秉虔誠秘種窩蜂窮竅笑置筆苟勾銷抹殺煞等獎箍節吃箭仇雙鵰詩籌籮筐系列紙級士官統絲毫掛維網盡線微吭響股腦胎脈承腔臂力致效資源址器舉功投般說講規貿易葉障著慎滿皆輸號木電池衣傾鐘高低視仁覺醒覽遺角銀幣觸潰九鼎蔽抄出駟馬追重語破貧洗貫走路安蹴至幾蹶振躍役膽汗較輩輪辭贊退六連遍遞邊針血錘音錯門思閃真倒項栽霧類保護川先驚乍體鬨鱗爪鳴滴泡鄰域黨專鼓作齊炒丑烯亥克內酯冬加奴卯肝炎基尺梁街褲鎬客寵庭巳汝昌烷玲磊糖肇酉醛啷青縣韙良香骨鯛丂七集河市弦喜嘴張舌堵區工業姊妹星架構巧彩扭歪拼湊餘熱曜武州爺浮屠美鄉老階樹葷素碎落能魄鰓鰻珠丄丅丆万俟丈尚摸母娘量管群亞虎必我堂令申件裝伏位博俠義界表女墟臺戲臭皮匠勝諸葛亮賽頂倍催請運算包立叉戟離疫苗土史志演圍揭瓦曬夷姑婆帝村寶爛尖杉鹼屜桌山岔島由紀峽壩庫鎮廢從德後拗湯治旬食明昧曹朋友框欄極權冪曲歸依貓民氟硼氯磷鐵江侗自旅法司洋浦梅園溫暖灣焦班幸用田略番疊皇炮捶硝苯酸腺苷稜草鏡穗跳遠索錦綱聚氰胺聯店胚膲愛色堇紫羅蘭芝茶飯菱雲蟲藏藩亂叛蘇親債凳學座恐戀柱測肌腹衩錐係貂企烏跪叩軍車農題迭都甘油屯奏鍵短阿姨陪姐隻顧茅廬槽駕魂鮮鹿頁其菜單乘任供勢午齒漢組織吊調瀉唇坡城報墳外夸將尉建築岸崗公床揚新劍昇杭林栗校樓標款汽社浣海商館劇院鋼華港機械廣媒環球融第醫科證券綜財樂育游漲猶嶺疏癮瞼確兵領導繳肢膛船艾瑟爾蒼蔡虞傚衫覆訪訴課諭議軌述野鉤限敵鞋頜頷顎饒首齦站例修凡劃垂屆屬崽頦廚拜挫擺放旋削棋榻檻禮沉注滑營獄畫确儀聘花葬詔員跌轄週達酒錨閘陷陸雨雪飛威丌于丹久乏予理評產亢卑亦乎舞己悲矩圓詞害誌但住佞佳便俗信票案幅翁倦倫假偏倚斜虧鬼敲停備傷脾胃僅此像儉匱免宜穴焉戴兼容許凍伯仲負彼晝皂軒輊實刊划顛衛戰哥比省非好黃飾別拘束掩奶睬選擇搖擾煩苦枚寫協厭及格受歡迎約只估侵犯割狀告或缺抗拒挽撤救藥喻磨滅端倪少逆逾越避靠適吉譽吝玉含延咎歹聽啻淵善謀均勻堪忍夠太惹妙妥妨孕症孝術室完納推冠積宣疑辯慄碴稱屈撓屑干涉衡待很忙惡忿怎麼怠急恥恭息悅惑惜惟想愉愧怍慌憤啟懂懈懷材才緊招認扣抵拉捨也罷插揣冒搭撞南牆擴核支攻敢雷攀敬裡嗎需景智暇曾罪遇朽枉止況競爭辱求癒渝溶濟左右袒困補爽特寂寞示弱找謝畏強疾徐痛癢冤符眠睦瞅董何厚云措活疲羞者輕玻璃祥兆禁移稂莠穩佛換答簡結果盟絕縷途給談否羈翼耐肖脛毋寧興舒若菲萊痕跡窠臼虛衰臉兔撒鷹棺範該詳諱抬泰讓鬚眉象眾貲賬費灰賴奇慮訓輟辨菽麥辛近送透逞徒速續逮捕遂遑違遜斧鉞艱醉鏽隨觀棄顯飽脂肪使丏丐幫丒且慢末丕替桃宗王尊涼爵各圖屋脊糧署錄壇吾祿職胄襲君廈丗北壑桐疹損逢陵鷸丙寅戌氨腈唑綸辰酮脫氫酶醚丞丟現掉紗帽弄扯砲碗丠両丣坐存激肩臻蒂蓮悖序驅丨丩丫挺杈髻鬟細介俄伊犁京尼布訂普渡央委監察檢查劑圈設警隊斯督剩震境航舶革防托播促質版蠑螈鋒研藝歷殘消頻譜精密製造陲郵候埔堅壓壢凹匯執府究邦俘攝寮彬狼嶽肺腫庸英訊診埋粒胞括控碼韓暑槍樞砥澳哇牟壽甸鑽探篇簽綴縫繼耳肯照婦埃懸璧軸櫃檯辣擱淺邪跑纖阮陽私囊魔丮丰姿采丱燒丳丵丶丷丸參寨朗桂瑞砂衷霞貌鳳僕艦因嫌宰峰幹絡牌持旨祭禱簿編罰賓辦丼丿乀乂乃乄仰慕盛曠留考驗闊乆乇么醜麼乊湖燃乑乒乓乕乖僻忤戾离謬迕乗危肥劫除隙浪婿乙炔腸酰吡咯鹽乚乛乜嘢卿玄宮尾狐龜塔嶷兄弟泉章霄釘耙乞扎哀憐恕討乢乣乤乥乧乨乩童乪乫乭乳暈汁液瑤漿牙癌突竇罩腐膠豬酪蛋糕菌瘤乴乵乶乷乸乹乺乼乾俸冰嘉噦嚎坤媽屍壘旱枯涸俐渴潮澀煸豆燥爹瘦癟癬瞪袋脆薑貝隆餾乿亀亁叫咕攘扔搞男砸竄蓬麻亃亄亅卻亇遲典今臨繁累卵奉婚聰躬巨與遷添裂副宿歲怪噁尕崙愣杆硅硫鈦鈾錳芑雜異鈉砷胂磺琥珀艙棍簧胡茬盜浩盆販郎腿亍洪亐互欠助勉惠操斥諉繫戶譯亓墓碑刑鈴卅渠繽紛斗米旗憲釩燈徽瘟祖拳福穀豐臟腑綁肉醃苓蘊橋鋪霸顏鬧判噴岡底蛙陘礦亖亙亜罕們娜桑那努哈喀弗烈曼松森杜氏盃奧琛敦戊穆聖裔彙薛孫亟亡佚虜羊牢奮釋卷卸契媾感額睫纏誼趾塞擠紐阻還配馳莊亨洛祚亪享津滬畿郊慈菴枇杷膏亭閣鋥麗亳亶亹誅初責翻瘋偶傑叢稠妖拖寰居吸授慧蝸吞壯魅狗矛盾益渣患憂稀描猿夢暫涯畜禍緣沸搜引擎臣橫紜誰混援蒸獸獅稅剖亻亼亽亾什獻剎邡麽仂仃仄仆富怨仈仉畢昔晨殼紹仍仏仒仕宦仗欺恃腰嘆歎炬梓訖施仙后瓊逝仚仝仞仟悔仡佬償填泊拓撲簇羔購頓欽佩髮棻閫馭養億儆尤藉幀賑凌敘帖李柔剛沃眥睚戒訛取饗讀仨仫仮著泳臥躺韶夏裁仳仵唯賢憑釣誕仿似宋彿諷伀碩盼鵝伄儅伈伉儷柯始娃邁戈坦堡帕茨薩廟瑪莉莎藤霍姆伋伍奢胥廷芳豪伎倆侍汛勒希羲雛伐憩整謨閑閒伕伙伴頤伜伝伢叔恆茲恩翰伱伲侶伶俜悧鼬伸懶縮喇叭伹伺伻伽倻輻伾佀佃佇佈喬妮墨佉盧佌貸劣廉昂檔濃矮傘窪緩耗胸谷迷擋率齲宅沫舍療佐貳佑佔優據鏵嘗呢須魯曉佗佘余坪寺瓜銃僧蒙芒陀龕哼嘔坊姦孽弊揖祟繭縛誓賊佝僂瞀佟你奪趕佡佢佣佤佧賈佪佫佯佰佱潔績釀餚佴捲佶佷佸佹佺佻佼佽佾具喚窘壞娛怒慨硬習慣聾膨脹蔓駭貴痺侀侁侂侃侄侅鴻燕侇侈糜靡侉侌妾侏儒倉鼠侐侑侔侖侘侚鏈侜偎傍鈷循柳葫蘆附価侮罵蔑侯岩截蝕侷貼壺嬛宴捷攜桶箋酌俁狹膝狄俅俉俊俏俎俑俓俔諺俚俛黎健呈固墒增守康箱濕祐鏢鑣槓盒靖膜齡俞豹獵噪孚封札筒託衍鴿剪撰稿煉廠禊練繕葺俯瞰撐衝俲俳俴俵俶俷俺俻俾倀倂倅儲卒惶敷猝逃頡蓄崇隱倌倏忽刺蠟燭噍嚼坍扁抽斃蔥楣灌灶糞背藪賣賠閉霉騰倓倔倖倘倜儻倝借箸挹澆閱倡狂倢倣値倥傯倨傲倩匡嗣沖柝珍倬倭寇猩倮倶倷倹勤讚偁偃充偽吏嗓寐惺扮拱芫茜藉虢鈔偈偉晶偌宕距析濾殿疼癱註頗偓偕鴨歇滯偝偟偢忘怡旺偨偩偪偫偭偯偰偱偲偵緝蹄偷減惰漏窺竊偸偺迹傀儡傅傈僳傌籬傎奎琳迪叟芭傒傔傕傖悉荒傜傞傢傣芽逼傭婢傮睨寄檄誦謠頌傴擔辜弓慘蒿悼疤傺傻屄臆巢洩篋羨蓋軋頹傿儸僄僇僉僊働僎僑僔僖僚僝僞僣僤僥僦猴僨僩僬僭僮僯僰僱僵殖籤靜僾僿征隴儁儂儃儇儈朴薄儊儋儌儍儐儓儔儕儗儘儜儞儤儦儩汰哉寡渥裕酷儭儱罐儳儵儹儺儼儽兀臬臲鷲允勛勳宙宵帥憝彞諧嫂鬩暢沛溢盈飢赫兇悍狠猛頑愚妣斬秦遣鞭耀敏榮槃澤爆碟磁禿纜輝霽鹵朵婁孜烽醬勃汀箕裘鉗耶懞蕾徹兌軟遭黜兎児韻媳爸兕觥兗兙兛兜售鍪肚兝兞兟兡兢兣樽殮涅睡稟籍贅泌啡肽奸幕涵澇熵疚眷稃襯訌赴煥椒殲植跏沒試誤猜棲窗肋袖頰兪卦撇鬍岐廓轎疸楓茴瓏廁秩募勺噸寓斤曆畝迫筷釐最淫螺韜兮寬匪篩襄贏軛複兲詐刃堰戎痞蟻餉它冀鑄冂冃円冇冉冊嫁厲礪竭醮冏牧冑冓冔冕冖冗冘冞冢窄抑誣冥冫烘菇蟄冷凝坨橇淇淋炭餅磚磧窖醋雕雹霜冱冶爐艷嘲峻灘淡漠煖颼飲冼冽凃凄愴梗凅凇凈凊凋敝濛凔凜遵汞脢凞几凢処凰凱凵凶焰凸摺刷紋預喪嘍奔巡榜殯芙蓉租籠輯鞘萃凼鋸鑊刁蠻刂娩崩批拆攤掰櫱驟歧顆秒袂贓勿囑忌磋琢膚刈羽刎訟戮舂槳艇刓刖霹靂刜創犢刡恙墅幟筵緻刦刧刨昏默攸尿慾薰潤薰圭刪刮痧鏟刱刲刳刴刵踏磅戳柏槐繡芹莧蝟舟銘鵠鶩刼剁剃辮剄剉履鉛剋剌姻咽哨廊掠桅沿召瞻翅趙卜渺茫郭剒剔剕瀝剚愎毅訥纔剜剝啄採剞剟剡剣剤綵剮腎駛黏剰袍剴紊剷剸剺剽剿劁劂劄劈啪柴扳啦劉奭姥夼昫涓熙禪禹錫翔雁鶚劊劌弩柄蜻蛉劒劓劖劘劙瀾簣賞磯釜晉甜薪逐劦熔紂虐赤囚劬劭労劵効劻劼劾峭艮勅勇勵勍勐臘脖龐漫飼盪粥輒勖勗勘驕餒碌泮雇捐竹騎殊阱勣樸懇謹勦勧勩勯勰勱勲勷勸懲慰誡諫勹芡踐闌匁庇拯粟紮袱裹餃匆遽匈匉匊匋匍匐莖匏匕妝痰膿蛹齋苑烤蹈塘羌熊閥螳螂疆碚竿緯荷茵邙魏匚匜匝匟扶稷匣匭攏匸匹耦匽匾匿卂叮瘡禧軫堤棚迢鈞鍊卄卆遐卉瓷盲瓶噹胱腱裸卋
卌卍卐怯污賤鄙齷齪陋卓溪唐梯漁陳棗泥漳潯澗梨芬譙贍轅迦鄭単驢弈洽鰲卛占筮卝卞卟吩啉屎翠厄卣卨卪卬卮榫襖璽綬鈕蚤懼殆篤聳卲帘帙繞卹卼卽厂厎厓厔厖厗奚厘厙厜厝諒厠厤厥厪膩孢厮厰厳厴厹厺粕垢蕪菁厼厾叁悟茸薯叄吵笄悌哺譏坫壟弧芯杠潛嬰芻袁詰貪諜煽饋駁収岳締災賄騙叚叡吻攔蘑蜜訣燧玩硯箏椎藺銅逗驪另覓叨嘮謁杵姓喊嚷囂咚嚀塑尋惱憎擦祇泣滲蝠叱吒咄咤喝籀黛舵舷叵叶鐸懿昭穰苴遼叻叼吁塹嫖賭瞧爬衆抒吅吆夥巹橡滌抱縱摩郡唁墜扇籃膀襪頸吋愾諮酬哭妓媛暗錶韁邇妃羿絮蕃渾拐葵暮隅吔吖啶嗪戚吜嗇噬嚥吟哦詠吠吧唧嗒咐吪雋咀徵燐苞茹鈣哧吮吰吱嘎吲哚吳棟嬌窟孟簫忠晗淞闔閭趼宇吶睛噓拂捧疵熄竽笛糠吼吽呀呂韋矇呃呆笨呇貢呉罄呋喃呎呏呔呠呡癡呣呤呦呧瑛眩扒晬淑姬瑜璇鵑呪呫嗶嚅囁呬呯呰呱呲咧噌鈍呴呶呷呸呺呻哱咻嘯嚕籲坎坷邏呿咁咂咆哮咇咈咋蟹煦珅藹咍咑咒詛咔噠嚓咾噥哩喱咗咠咡咢咣咥咦咨嗟詢咩咪咫嚙齧咭咮咱咲咳嗆嗽咴咷咸咹咺咼喉咿婉慟憫賦矜綠茗藍哂搶瞞哆嗦囉噻啾濱彗哋哌哎唷喲哏哐哞哢哤哪裏哫啼喘哰哲萎蚌哳哶哽哿唄唅唆唈唉唎唏嘩堯棣殤璜睿肅唔睇唕唚唞唣喳唪唬唰喏唲唳唵嘛唶唸唹唻唼唾唿啁啃鸚鵡啅埠棧榷祺舖鞅飆啊啍啎啐啓啕啖啗啜啞祈啢啣啤啥啫啱啲啵啺饑啽噶崑沁喁喂喆裙喈嚨喋喌喎喑喒喓喔粗喙幛慶滋鵲喟喣喤喥喦喧騷喨喩梆喫葡萄喭駝挑嚇碰樅瓣純皰藻趟鉻喵営喹喺喼喿嗀嗃嗄嗅嗈嗉嗊嗍嗐嗑嗔詬嗕嗖嗙嗛嗜痂癖嗝嗡嗤嗥嗨嗩嗬嗯嗰嗲嗵嘰嗷嗹嗾嗿嘀嘁嘂嘅惋嘈峪禾蔭嘊嘌嘏嘐嘒嘓嘖嘚嘜嘞嘟囔嘣嘥嘦嘧嘬嘭這謔嚴敞饞鬆嘵嘶嘷嘸蝦嘹嘻嘽嘿噀噂噅噇噉噎噏噔噗噘噙噚噝噞噢噤蟬皿噩噫噭噯噱噲噳嚏涌灑欲巫霏噷噼嚃嚄嚆抖嚌嚐嚔囌嚚嚜嚞嚟嚦嚬嚭嚮嚯嚲嚳飭按竣苛嚵嚶囀囅囈膪謙囍囒囓囗囘蕭酚飄濺諦囝溯眸紇鑾鶻囟殉囡団囤囥囧囨囪囫圇囬囮囯囲図囶囷囸囹圄圉擬囻囿圀圂圃圊粹蠹赦圌墾圏滾鯡鑿枘圕圛圜圞坯埂壤骸炕祠窯豚紳魠鯪鱉圧握圩圪垯圬圮圯炸岬幔毯祇窨菩溉圳圴圻圾坂坆沾坋坌舛壈昆墊墩椅坒坓坩堝坭坰坱坳坴坵坻坼楊掙涎簾垃垈垌垍垓垔垕垗垚垛垝垣垞垟垤垧垮垵垺垾垿埀畔埄埆埇埈埌殃隍埏埒埕埗埜埡埤埦埧埭埯埰埲埳埴埵埶紼埸培怖樁礎輔埼埽堀訶姪廡堃堄摧磐貞韌砌堈堉堊堋堌堍堎堖堙堞堠礁堧堨輿堭堮蜓摘堲堳堽堿塁塄塈煤塋棵塍塏塒塓綢塕鴉沽虱塙塚塝繆塡塢塤塥塩塬塱塲蟎塼塽塾塿墀墁墈墉墐夯増毀墝墠墦漬缽墫墬墮墰墺墻櫥壅壆壊壌壎壒榨蒜壔壕壖壙壚壜壝壠壡壬壭壱売壴壹壻壼寢壿夂夅夆変夊夌漱邑夓腕泄甥禦骼夗夘夙袞瑙妊娠醣梟珊鶯鷺戧幻魘夤蹀祕擂鶇姚宛閨嶼庾撻拇賛蛤裨菠氅漓撈湄蚊霆鯊箐篆篷荊肆舅荔鮃巷慚骰辟邱鎔鐮阪漂燴鯢鰈鱷鴇臚鵬妒峨譚枰晏璣癸祝秤竺牡籟恢罡螻蠍賜絨御梭夬夭砣榆怙枕夶夾餡奄崛葩譎奈賀祀贈奌奐奓奕訢詝奘奜奠奡奣陶奨奩魁奫奬奰媧孩貶隸酥宄狡猾她奼嫣妁氈荼皋膻蠅嬪妄妍嫉媚嬈妗趣妚妞妤礙妬婭妯娌妲妳妵妺姁姅姉姍姒姘姙姜姝姞姣姤姧姫姮娥姱姸姺姽婀娀誘懾脅娉婷娑娓娟娣娭娯娵娶娸娼婊婐婕婞婤婥谿孺婧婪婬婹婺婼婽媁媄媊媕媞媟媠媢媬媮媯媲媵媸媺媻媼眯媿嫄嫈嫋嫏嫕嫗嫘嫚嫜嫠嫡嫦嫩嫪毐嫫嫬嫰嫵嫺嫻嫽嫿嬀嬃嬅嬉耍嬋痴豔嬔嬖嬗嬙嬝嬡嬢嬤嬦嬬嬭幼嬲嬴嬸嬹嬾嬿孀孃孅孌孏曰癲屏孑孓雀孖斟簍謎摺孛矻鳩崮軻祜鸞孥邈毓棠臏孬孭孰孱孳孵泛罔銜孻孿宀宁宂拙株薇掣撫琪瓿榴謐彌宊濂祁瑕宍宏碁宓邸讞実潢町宥宧宨宬徵崎駿掖闕臊煮禽蠶宸豫寀寁寥寃簷庶寎暄磣寔寖寘寙寛寠苫寤肘洱濫蒗陝覈寪弘綽螽寳擅疙瘩晷対檐専尃尅贖絀繚疇釁尌峙醌襟痲碧屁昊槌淘恵瀑牝畑莓缸羚覷蔻髒躁尒尓銳尗尙尜尟尢尥尨尪尬尭尰擒尲尶尷尸尹潽蠖蛾尻釦梢蚴鰭脬蹲屇屌蚵屐屓挪屖屘屙屛屝屢屣巒嶂巖舄屧屨屩屪屭屮戍駐鉀崖嵛巔旮旯楂欖櫸芋茱萸靛麓屴屹屺屼岀岊岌岍阜岑彭鞏岒岝岢嵐岣岧岨岫岱岵岷峁峇峋峒峓峞峠嵋峩峯峱峴峹峿崀崁崆禎崋崌崍嶇崐崒崔嵬巍螢顥崚崞崟崠崢巆崤崦崧殂崬崱崳崴崶崿嵂嵇嵊泗嵌嵎嵒嵓嵗嵙嵞嵡嵩嵫嵯嵴嵼嵾嶁嶃嶄晴嶋嶌嶒嶓嶔嶗嶙嶝嶞嶠嶡嶢嶧嶨嶭嶮嶰嶲嶴嶸巂巃巇巉巋巌巓巘巛滇芎巟巠弋迴巣巤炊擘蜥蟒蠱覡巰蜀彥淖杏茂甫楞巻巽幗巿帛斐鯽蕊帑帔帗帚琉汶帟帡帣帨帬帯帰帷帹暆幃幄幇幋幌幏幘幙幚幞幠幡幢幦幨幩幪幬幭幯幰遙蹉跎餘庚鑑幵幷稚邃庀庁広庄庈庉笠庋跋庖犧庠庤庥鯨庬庱庳庴庵馨衢庹庿廃廄廆廋廌廎廏廐廑廒廕廖廛廝搏鑼廞弛袤廥廧廨廩廱綿踵髓廸廹甌鄴廻廼廾廿躔弁皺弇弌弍弎弐弒弔詭憾薦弝弢弣弤弨弭弮弰弳霖繇燾斌旭溥騫弶弸弼弾彀彄彆纍糾彊彔彖彘彟彠陌彤貽彧繪虹彪炳彫蔚鷗彰癉彲彳彴彷彷徉徨彸彽踩斂旆徂徇徊渭畬鉉裼従筌徘徙徜徠膳甦萌漸徬徭醺徯徳徴潘徻徼忀瘁胖燎怦悸顫扉犀澎湃砰恍惚絞隘忉憚挨餓忐忑忒忖応忝忞耿忡忪忭忮忱忸怩忻悠懣怏遏怔怗怚怛怞懟黍訝怫怭懦怱怲怳怵惕怸怹恁恂恇恉恌恏恒恓恔恘恚恛恝恞恟恠恣恧眄恪恫恬澹恰恿悀悁悃悄悆悊悐悒晦悚悛悜悝悤您悩悪悮悰悱悽惻悳悴悵惘悶悻悾惄愫鍾蒐惆惇惌惎惏惓惔惙惛耄惝瘧濁惥惦惪惲惴惷惸拈愀愃愆愈愊愍愐愑愒愓愔愕愙氓蠢騃昵愜赧愨愬愮愯愷愼慁慂慅慆慇靄慉慊慍慝慥慪慫慬慱慳慴慵慷慼焚憀灼鬱憃憊憋憍眺捏軾憒憔憖憙憧憬憨憪憭憮憯憷憸憹憺懃懅懆邀懊懋懌懍懐懞懠懤懥懨懫懮懰懱毖懵遁樑雍懺懽戁戄戇戉戔戕戛戝戞戠戡戢戣戤戥戦戩戭戯轟戱披菊牖戸戹戺戻戼戽鍬扂楔扃扆扈扊杖牽絹銬鐲賚扐摟攪烊盹瞌跟躉鑔靶鼾払扗玫腮扛扞扠扡扢盔押扤扦扱罾揄綏鞍郤窾扻扼扽抃抆抈抉抌抏瞎抔繯縊擻抜抝択抨摔歉躥牾抶抻搐泵菸拃拄拊髀拋拌脯拎拏拑擢秧沓曳攣迂拚拝拠拡拫拭拮踢拴拶拷攢拽掇芥橐簪摹疔挈瓢驥捺蹻挌挍挎挐揀挓挖掘浚挙揍聵挲挶挾挿捂捃捄捅捆捉捋胳膊揎捌捍捎軀蛛捗捘捙捜捥捩捫捭据捱捻捼捽掀掂掄臀膘掊掎掏掐笙掔掗掞棉芍掤搪闡掫掮掯揉掱掲掽掾揃揅揆搓揌諢揕揗揘揜揝揞揠揥揩揪揫櫫遒麈揰揲揵揶揸揹揺搆搉搊搋搌搎搔搕撼櫓搗搘搠搡搢搣搤搥搦搧搨搬楦褳訕赸搯搰搲搳搴搵搷搽搾搿摀摁摂摃摎摑摒摓跤摙摛摜摞摠摦睺羯摭摮摯摰摲摳摴摶摷摻摽撂撃撅稻撊撋撏鐧潑撕撙撚撝撟撢撣撦撧撩撬撱朔撳蚍蜉撾撿擀擄闖擉缶觚擐擕擖擗擡擣擤澡腚擧擨擩擫擭擯擰擷擸擼擽擿攃攄攆攉攥攐攓攖攙攛每攩攫轡澄攮攰攲攴軼攷砭訐攽碘敁敃敇敉敍敎筏敔敕敖閏誨敜煌敧敪敱敹敺敻敿斁衽斄牒縐謅斉斎斕鶉讕駮鱧斒筲斛斝斞斠斡斢斨斫斮晾沂潟穎絳邵斲斸釳於琅斾斿旀旂旃旄渦旌旎旐旒旓旖旛旝旟旡旣浴旰獺魃旴旹旻旼旽昀昃昄昇昉晰躲澈熹皎皓礬昑昕昜昝昞昡昤暉筍昦昨昰昱昳昴昶昺昻晁蹇隧蔬髦晄晅晒晛晜晞晟晡晢晤晥曦晩萘瑩顗晿暁暋暌暍暐暔暕煅暘暝暠暡曚暦暨暪朦朧暱暲殄馮暵暸暹暻暾曀曄曇曈曌曏曐曖曘曙曛曡曨曩駱曱甴肱曷牘禺錕曽滄耽朁朅朆杪栓誇竟粘絛朊膺朏朐朓朕朘朙瞄覲溘饔飧朠朢朣柵椆澱蝨朩朮朰朱炆璋鈺熾鹮朳槿朶朾朿杅杇杌隉欣釗湛漼楷瀍煜玟纓翱肈舜贄适逵杓杕杗杙荀蘅杝杞脩珓筊杰榔狍閦顰緬莞杲杳眇杴杶杸杻杼枋枌枒枓衾葄翹紓逋枙狸椏枟槁枲枳枴枵枷枸櫞枹枻柁柂柃柅柈柊柎某柑橘柒柘柙柚柜柞櫟柟柢柣柤柩柬柮柰柲橙柶柷柸柺査柿栃栄栒栔栘栝栟栢栩栫栭栱栲栳栴檀栵栻桀驁桁鎂桄桉桋桎梏椹葚桓桔桕桜桟桫欏桭桮桯桲桴桷桹湘溟梃梊梍梐潼梔梘梜梠梡梣梧梩梱梲梳梴梵梹棁棃櫻棐棑棕櫚簑繃蓑棖棘棜棨棩棪棫棬棯棰棱棳棸棹槨棼椀椄苕椈椊椋椌椐椑椓椗検椤椪椰椳椴椵椷椸椽椿楀楄楅篪楋楍楎楗楘楙楛楝楟楠楢楥楨楩楪楫楬楮楯楰楳楸楹楻楽榀榃榊榎槺榕榖榘榛狉莽榜笞榠榡榤榥榦榧榪榭榰榱槤霰榼榾榿槊閂槎槑槔槖様槜槢槥槧槪槭槮槱槲槻槼槾樆樊樏樑樕樗樘樛樟樠樧樨権樲樴樵猢猻樺樻罍樾樿橁橄橆橈笥龠橕橚橛輛橢橤橧豎膈跨橾橿檁檃檇檉檍檎檑檖檗檜檟檠檣檨檫檬檮檳檴檵檸櫂櫆櫌櫛櫜櫝櫡櫧櫨櫪櫬櫳櫹櫺茄櫽欀欂欃欐欑欒欙欞溴欨欬欱欵欶欷歔欸欹欻欼欿歁歃歆艎歈歊蒔蝶歓歕歘歙歛歜歟歠蹦詮鑲蹣跚陞陟歩歮歯歰歳歴璞歺瞑歾歿殀殈殍殑殗殜殙殛殞殢殣殥殪殫殭殰殳荃殷殸殹蛟殻殽謗毆毈毉餵毎毑蕈毗毘毚茛鄧毧毬毳毷毹毽毾毿氂氄氆靴氉氊氌氍氐聊氕氖気氘氙氚氛氜氝氡洶焊痙氤氳氥氦鋁鋅氪烴氬銨痤汪滸漉痘盂碾菖蒲蕹蛭螅氵氷氹氺氽燙氾氿渚汆汊汋汍汎汏汐汔汕褟汙汚汜蘺沼穢衊汧汨汩汭汲汳汴隄汾沄沅沆瀣沇沈葆浸淪湎溺痼痾沌沍沏沐沔沕沘浜畹礫沚沢沬沭沮沰沱灢沴沷籽沺烹濡洄泂肛泅泆湧肓泐泑泒泓泔泖泙泚泜泝泠漩饃濤粼濘蘚鰍泩泫泭泯銖泱泲洇洊涇琵琶荽薊箔洌洎洏洑潄濯洙洚洟洢洣洧洨洩痢滔洫洮洳洴洵洸洹洺洼洿淌蜚浄浉浙贛渫浠浡浤浥淼瀚浬浭翩萍浯浰蜃淀苔蛞蝓蜇螵蛸煲鯉浹浼浽溦涂涊涐涑涒涔滂涖涘涙涪涫涬涮涴涶涷涿淄淅淆淊淒黯淓淙漣淜淝淟淠淢淤淥淦淩猥藿褻淬淮淯淰淳詣淶紡淸淹燉癯綺渇済渉渋渓渕渙渟渢滓渤澥渧渨渮渰渲渶渼湅湉湋湍湑湓湔黔湜湝湞湟湢湣湩湫湮麟湱湲湴湼満溈溍溎溏溛舐漭溠溤溧馴溮溱溲溳溵溷溻溼溽溾滁滃滉滊滎滏稽滕滘滙滝滫滮羼耷滷滹滻煎漈漊漎繹漕漖漘漙漚漜漪漾漥漦漯漰漵漶漷濞潀潁潎潏潕潗潚潝潞潠潦祉瘍潲潵潷潸潺潾潿澁澂澃澉澌澍澐澒澔澙澠澣澦澧澨澫
澬澮澰澴澶澼熏郁濆濇濈濉濊貊濔疣濜濠濩觴濬濮盥濰濲濼瀁瀅瀆瀋瀌瀏瀒瀔瀕瀘瀛瀟瀠瀡瀦瀧瀨瀬瀰瀲瀳瀵瀹瀺瀼灃灄灉灋灒灕灖灝灞灠灤灥灨灩灪蜴灮燼獴灴灸灺炁炅魷炗炘炙炤炫疽烙釺炯炰炱炲炴炷燬炻烀烋瘴鯧烓烔焙烜烝烳飪烺焃焄耆焌焐焓焗焜焞焠焢焮焯焱焼煁煃煆煇煊熠煍熬煐煒煕煗燻礆霾煚煝煟煠煢矸煨瑣煬萁煳煺煻熀熅熇熉羆熒穹熗熘熛熜稔諳爍熤熨熯熰眶螞熲熳熸熿燀燁燂燄盞燊燋燏燔隼燖燜燠燡燦燨燮燹燻燽燿爇爊爓爚爝爟爨蟾爯爰爲爻爿爿牀牁牂牄牋牎牏牓牕釉牚腩蒡虻牠雖蠣牣牤牮牯牲牳牴牷牸牼絆牿靬犂犄犆犇犉犍犎犒犖犗犛犟犠犨犩犪犮犰狳犴犵犺狁甩狃狆狎狒獾狘狙黠狨狩狫狴狷狺狻豕狽蜘猁猇猈猊猋猓猖獗猗猘猙獰獁猞猟獕猭猱猲猳猷猸猹猺玃獀獃獉獍獏獐獒獘獙獚獜獝獞獠獢獣獧鼇蹊獪獫獬豸獮獯鬻獳獷獼玀玁菟玅玆玈珉糝禛郅玍玎玓瓅玔玕玖玗玘玞玠玡玢玤玥玦玨瑰玭玳瑁玶玷玹玼珂珇珈瑚珌饈饌珔珖珙珛珞珡珣珥珧珩珪珮珶珷珺珽琀琁隕琊琇琖琚琠琤琦琨琫琬琭琮琯琰琱琲瑯琹琺琿瑀瑂瑄瑉瑋瑑瑔瑗瑢瑭瑱瑲瑳瑽瑾瑿璀璨璁璅璆璈璉璊璐璘璚璝璟璠璡璥璦璩璪璫璯璲璵璸璺璿瓀瓔瓖瓘瓚瓛臍瓞瓠瓤瓧瓩瓮瓰瓱瓴瓸瓻瓼甀甁甃甄甇甋甍甎甏甑甒甓甔甕甖甗飴蔗甙詫鉅粱盎銹糰甡褥産甪甬甭甮甯鎧甹甽甾甿畀畁畇畈畊畋畎畓畚畛畟鄂畤畦畧荻畯畳畵畷畸畽畾疃疉疋疍疎簞疐疒疕疘疝疢疥疧疳疶疿痁痄痊痌痍痏痐痒痔痗瘢痚痠痡痣痦痩痭痯痱痳痵痻痿瘀瘂瘃瘈瘉瘊瘌瘏瘐瘓瘕瘖瘙瘚瘛瘲瘜瘝瘞瘠瘥瘨瘭瘮瘯瘰癧瘳癘瘵瘸瘺瘻瘼癃癆癇癈癎癐癔癙癜癠癤癥癩蟆癪癭癰発踔紺蔫酵皙砬砒翎翳蘞鎢鑞皚鵯駒鱀粵褶皀皁莢皃鎛皈皌皐皒硃皕皖皘皜皝皞皤皦皨皪皫皭糙綻皴皸皻皽盅盋盌盍盚盝踞盦盩鞦韆盬盭眦睜瞤盯盱眙裰盵盻睞眂眅眈眊県眑眕眚眛眞眢眣眭眳眴眵眹瞓眽郛睃睅睆睊睍睎睏睒睖睙睟睠睢睥睪睪睯睽睾瞇瞈瞋瞍逛瞏瞕瞖瞘瞜瞟瞠瞢瞫瞭瞳瞵瞷瞹瞽闍瞿矓矉矍鑠矔矗矙矚矞矟矠矣矧矬矯矰矱硪碇磙罅舫阡、矼矽礓砃砅砆砉砍砑砕砝砟砠砢砦砧砩砫砮砳艏砵砹砼硇硌硍硎硏硐硒硜硤硨磲茚鋇硭硻硾碃碉碏碣碓碔碞碡碪碫碬碭碯碲碸碻礡磈磉磎磑磔磕磖磛磟磠磡磤磥蹭磪磬磴磵磹磻磽礀礄礅礌礐礚礜礞礤礧礮礱礲礵礽礿祂祄祅祆禳祊祍祏祓祔祕祗祘祛祧祫祲祻祼餌臠錮禂禇禋禑禔禕隋禖禘禚禜禝禠禡禢禤禥禨禫禰禴禸稈秈秊闈颯秌秏秕笈蘵賃秠秣秪秫秬秭秷秸稊稌稍稑稗稙稛稞稬稭稲稹稼顙稾穂穄穇穈穉穋穌貯穏穜穟穠穡穣穤穧穨穭穮穵穸窿闃窀窂窅窆窈窕窊窋窌窒窓窔窞窣窬黷蹙窰窳窴窵窶窸窻竁竃竈竑竜竝竦竪篦篾笆鮫竾笉笊笎笏笐靨笓笤籙笪笫笭笮笰笱笲笳笵笸笻筀筅筇筈筎筑筘筠筤筥筦筧筩筭筯筰筱筳筴讌筸箂箇箊箎箑箒箘箙箛箜篌箝箠箬鏃箯箴箾篁篔簹篘篙篚篛篜篝篟篠篡篢篥篧篨篭篰篲篳篴篶篹篼簀簁簃簆簉簋簌簏簜簟簠簥簦簨簬簰簸簻籊籐籒籓籔籖籚籛籜籣籥籧籩籪籫籯芾麴籵籸籹籼粁粃粋粑粔糲粛粞粢粧粨粲粳粺粻粽闢粿糅糆糈糌糍糒糔萼糗蛆蹋糢糨糬糭糯糱糴糶糸糺紃蹼鰹黴紆紈絝紉閩襻紑紕紘錠鳶鷂紝紞紟紥紩紬紱紲紵紽紾紿絁絃絅経絍絎絏縭褵絓絖絘絜絢絣螯絪絫聒絰絵絶絺絻絿綀綃綅綆綈綉綌綍綎綑綖綘継続緞綣綦綪綫綮綯綰罟蝽綷縩綹綾緁緄緅緆緇緋緌緎総緑緔緖緗緘緙緜緡緤緥緦纂緪緰緱緲緶緹縁縃縄縈縉縋縏縑縕縗縚縝縞縟縠縡縢縦縧縯縰騁縲縳縴縵縶縹縻衙縿繄繅繈繊繋繐繒繖繘繙繠繢繣繨繮繰繸繻繾纁纆纇纈纉纊纑纕纘纙纚纛缾罃罆罈罋罌罎罏罖罘罛罝罠罣罥罦罨罫罭鍰罳罶罹罻罽罿羂羃羇羋蕉51鴕羑羖羗羜羝羢羣羥羧羭羮羰羱羵羶羸藜鮐翀翃翄翊翌翏翕翛翟翡翣翥翦躚翪翫翬翮翯翺翽翾翿闆饕鴰鍁耋耇耎耏耑耒耜耔耞耡耤耨耩耪耬耰鬢耵聹聃聆聎聝聡聦聱聴聶聼閾聿肄肏肐肕腋肙肜肟肧胛肫肬肭肰肴肵肸肼胊胍胏胑胔胗胙胝胠銓胤胦胩胬胭胯胰胲胴胹胻胼胾脇脘脝脞脡脣脤脥脧脰脲脳腆腊腌臢腍腒腓腖腜腠腡腥腧腬腯踝蹬鐐腴腶蠕誹膂膃膆膇膋膔膕膗膙膟黐膣膦膫膰膴膵膷膾臃臄臇臈臌臐臑臓臕臖臙臛臝臞臧蓐詡臽臾臿舀舁鰟鮍舋舎舔舗舘舝舠舡舢舨舭舲舳舴舸舺艁艄艅艉艋艑艕艖艗艘艚艜艟艣艤艨艩艫艬艭荏艴艶艸艹艻艿芃芄芊萰陂藭芏芔芘芚蕙芟芣芤茉芧芨芩芪芮芰鰱芴芷芸蕘豢芼芿苄苒苘苙苜蓿苠苡苣蕒苤苧苪鎊苶苹苺苻苾茀茁范蠡萣茆茇茈茌茍茖茞茠茢茥茦菰茭茯茳藨茷藘茼荁荄荅荇荈菅蜢鴞荍荑荘荳荵荸薺莆莒莔莕莘莙莚莛莜莝莦莨菪莩莪莭莰莿菀菆菉菎菏菐菑菓菔菕菘菝菡菢菣菥蓂菧菫轂鎣菶菷菹醢菺菻菼菾萅萆萇萋萏萐萑萜萩萱萴萵萹萻葇葍葎葑葒葖葙葠葥葦葧葭葯葳葴葶葸葹葽蒄蒎蒓蘢薹蒞蒟蒻蒢蒦蒨蒭藁蒯蒱鉾蒴蒹蒺蒽蓀蓁蓆蓇蓊蓌蓍蓏蓓蓖蓧蓪蓫蓽跣藕蓯蓰蓱蓴蓷蓺蓼蔀蔂蔃蔆蔇蔉蔊蔋蔌蔎蔕蔘蔙蔞蔟鍔蔣雯蔦蔯蔳蔴蔵蔸蔾蕁蕆蕋蕍蕎蕐蕑蕓蕕蕖蕗蕝蕞蕠蕡蕢蕣蕤蕨蕳蕷蕸蕺蕻薀薁薃薅薆薈薉薌薏薐薔薖薘薙諤釵薜薠薢薤薧薨薫薬薳薶薷薸薽薾薿藄藇藋藎藐藙藚藟藦藳藴藶藷藾蘀蘁蘄蘋蘗蘘蘝蘤蘧蘩蘸蘼虀虆虍蟠虒虓虖虡虣虥虩虯虰蛵虵虷鱒虺虼蚆蚈蚋蚓蚔蚖蚘蚜蚡蚣蚧蚨蚩蚪蚯蚰蜒蚱蚳蚶蚹蚺蚻蚿蛀蛁蛄蛅蝮蛌蛍蛐蟮蛑蛓蛔蛘蛚蛜蛡蛣蜊蛩蛺蛻螫蜅蜆蜈蝣蜋蜍蜎蜑蠊蜛餞蜞蜣蜨蜩蜮蜱蜷蜺蜾蜿蝀蝃蝋蝌蝍蝎蝏蝗蝘蝙蝝鱝蝡蝤蝥蝯蝰蝱蝲蝴蝻螃蠏螄螉螋螒螓螗螘螙螚蟥螟螣螥螬螭螮螾螿蟀蟅蟈蟊蟋蟑蟓蟛蟜蟟蟢蟣蟨蟪蟭蟯蟳蟶蟷蟺蟿蠁蠂蠃蠆蠋蠐蠓蠔蠗蠙蠚蠛蠜蠧蠨蠩蠭蠮蠰蠲蠵蠸蠼蠽衁衂衄衇衈衉衋衎衒衕衖衚衞裳鈎衭衲衵衹衺衿袈裟袗袚袟袢袪袮袲袴袷袺袼褙袽裀裉裊裋裌裍裎裒裛裯裱裲裴裾褀褂褉褊褌褎褐褒褓褔褕褘褚褡褢褦褧褪褫褭褯褰褱襠褸褽褾襁襃襆襇襉襋襌襏襚襛襜襝襞襡襢襤襦襫襬襭襮襴襶襼襽襾覂覃覅覇覉覊覌覗覘覚覜覥覦覧覩覬覯覰観覿觔觕觖觜觽觝觡酲觩觫觭觱觳觶觷觼觾觿言賅訃訇訏訑訒詁託訧訬訳訹証訾詀詅詆譭詈詊詎詑詒詖詗詘詧詨詵詶詸詹詻詼詿誂誃誄鋤誆誋誑誒誖誙誚誥誧説読誯誶誾諂諄諆諌諍諏諑諕諗諛諝諞諟諠諡諴諵諶諼謄謆謇謌謍謏謑謖謚謡謦謪謫謳謷謼謾譁譅譆譈譊譌譒譔譖鑫譞譟譩譫譬譱譲譴譸譹譾讅讆讋讌讎讐讒讖讙讜讟谽豁豉豇豈豊豋豌豏豔豞豖豗豜豝豣豦豨豭豱豳豵豶豷豺豻貅貆貍貎貔貘貙貜貤饜貰餸貺賁賂賏賒賕賙賝賡賧賨賫鬭賮賵賸賺賻賾贇贉贐贔贕贗赬赭赱赳迄趁趂趄趐趑趒趔趡趦趫趮趯趲趴趵趷趹趺趿跁跂跅跆躓蹌跐跕跖跗跙跛跦跧跩跫跬跮跱跲跴跺跼跽踅踆踈踉踊踒踖踘踜踟躇躕踠踡踣踤踥踦踧蹺踫踮踰踱踴踶踹踺踼踽躞蹁蹂躪蹎蹐蹓蹔蹕蹚蹜蹝蹟蹠蹡蹢躂蹧蹩蹪蹯鞠蹽躃躄躅躊躋躐躑躒躘躙躛躝躠躡躦躧躩躭躰躳躶軃軆輥軏軔軘軜軝齶転軥軨軭軱軲轆軷軹軺軽軿輀輂輦輅輇輈輓輗輙輜輞輠輤輬輭輮輳輴輵輶輹輼輾轀轇轏轑轒轔轕轖轗轘轙轝轞轢轤辠辢辤辵辶辺込辿迅迋迍麿迓迣迤邐迥迨迮迸迺迻迿逄逅逌逍逑逓逕逖逡逭逯逴逶逹遄遅遉遘遛遝遢遨遫遯遰遴遶遹遻邂邅邉邋邎邕邗邘邛邠邢邧邨邯鄲邰邲邳邴邶邷邽邾邿郃郄郇郈郔郕郗郙郚郜郝郞郟郠郢郪郫郯郰郲郳郴郷郹郾郿鄀鄄鄆鄇鄈鄋鄍鄎鄏鄐鄑鄒鄔鄕鄖鄗鄘鄚鄜鄞鄠鄢鄣鄤鄦鄩鄫鄬鄮鄯鄱鄶鄷鄹鄺鄻鄾鄿酃酅酆酇酈酊酋酎酏酐酣酔酕醄酖酗酞酡酢酤酩酴酹酺醁醅醆醊醍醐醑醓醖醝醞醡醤醨醪醭醯醰醱醲醴醵醸醹醼醽醾釂釃釅釆釈鱸鎦閶釓釔釕鈀釙鼢鼴釤釧釪釬釭釱釷釸釹鈁鈃鈄鈆鈇鈈鈊鈌鈐鈑鈒鈤鈥鈧鈬鈮鈰鈳鐺鈸鈹鈽鈿鉄鉆鉈鉋鉌鉍鉏鉑鉕鉚鉢鉥鉦鉨鉬鉭鉱鉲鉶鉸鉺鉼鉿銍銎銑銕鏤銚銛銠銣銤銥銦銧銩銪銫銭銰銲銶銻銼銾鋂鋃鋆鋈鋊鋌鋍鋏鋐鋑鋕鋘鋙鋝鋟鋦鋨鋩鋭鋮鋯鋰鋱鋳鋹鋺鋻鏰鐱錀錁錆錇錈錍錏錒錔錙錚錛錞錟錡錤錩錬録錸錼鍀鍆鍇鍉鍍鍏鍐鍘鍚鍛鍠鍤鍥鍩鍫鍭鍱鍴鍶鍹鍺鍼鍾鎄鎇鎉鎋鎌鎍鎏鎒鎓鎗鎘鎚鎞鎡鎤鎩鎪鎭鎯鎰鎳鎴鎵鎸鎹鎿鏇鏊鏌鏐鏑鏖鏗鏘鏚鏜鏝鏞鏠鏦鏨鏷鏸鏹鏻鏽鏾鐃鐄鐇鐏鐒鐓鐔鐗馗鐙鐝鐠鐡鐦鐨鐩鐫鐬鐱鐳鐶鐻鐽鐿鑀鑅鑌鑐鑕鑚鑛鑢鑤鑥鑪鑭鑯鑱鑴鑵鑷钁钃镻閆閈閌閎閒閔閗閟閡関閤閤閧閬閲閹閺閻閼閽閿闇闉闋闐闑闒闓闘闚闞闟闠闤闥阞阢阤阨阬阯阹阼阽陁陑陔陛陜陡陥陬騭陴険陼陾隂隃隈隒隗隞隠隣隤隩隮隰顴隳隷隹雂雈雉雊雎雑雒雗雘雚雝雟雩雰雱驛霂霅霈霊霑霒霓霙霝霢霣霤霨霩霪霫霮靁靆靉靑靚靣靦靪靮靰靳靷靸靺靼靿鞀鞃鞄鞌鞗鞙鞚鞝鞞鞡鞣鞨鞫鞬鞮鞶鞹鞾韃韅韉馱韍韎韔韖韘韝韞韡韣韭韮韱韹韺頀颳頄頇頊頍頎頏頒頖頞頠頫頬顱頯頲頴頼顇顋顑顒顓顔顕顚顜顢顣顬顳颭颮颱颶颸颺颻颽颾颿飀飂飈飌飜飡飣飤飥飩飫飮飱飶餀餂餄餎餇餈餑餔餕餖餗餚餛餜餟餠餤餧餩餪餫餬餮餱餲餳餺餻餼餽餿饁饅饇饉饊饍饎饐饘饟饢馘馥馝馡馣騮騾馵馹駃駄駅駆駉駋駑駓駔駗駘駙駜駡駢駪駬駰駴駸駹駽駾騂騄騅騆騉騋騍騏驎騑騒験騕騖騠騢騣騤騧驤騵騶騸騺驀驂驃驄驆驈驊驌驍驎驏驒驔驖驙驦驩驫骺鯁骫骭骯骱骴骶骷髏骾髁髂髄髆髈髐髑髕髖髙髝髞髟髡髣髧髪髫髭髯髲髳髹髺髽髾鬁鬃鬅鬈鬋鬎鬏鬐鬑鬒鬖鬗鬘鬙鬠鬣鬪鬫鬬鬮鬯鬰鬲鬵鬷魆魈魊魋魍魎魑魖鰾魛魟魣魦魨魬魴魵魸鮀鮁鮆鮌鮎鮑鮒鮓鮚鮞鮟鱇鮠鮦鮨鮪鮭鮶鮸鮿鯀鯄鯆鯇鯈鯔鯕鯖鯗鯙鯠鯤鯥鯫鯰鯷鯸鯿鰂鰆鶼鰉鰋鰐鰒鰕鰛鰜鰣鰤鰥鰦鰨鰩鰮鰳鰶鰷鱺鰼鰽鱀鱄鱅鱆鱈鱎鱐鱓鱔鱖鱘鱟鱠鱣鱨鱭鱮鱲鱵鱻鲅鳦鳧鳯
鳲鳷鳻鴂鴃鴄鴆鴈鴎鴒鴔鴗鴛鴦鴝鵒鴟鴠鴢鴣鴥鴯鶓鴳鴴鴷鴽鵀鵁鵂鵓鵖鵙鵜鶘鵞鵟鵩鵪鵫鵵鵷鵻鵾鶂鶊鶏鶒鶖鶗鶡鶤鶦鶬鶱鶲鶵鶸鶹鶺鶿鷀鷁鷃鷄鷇鷈鷉鷊鷏鷓鷕鷖鷙鷞鷟鷥鷦鷯鷩鷫鷭鷳鷴鷽鷾鷿鸂鸇鸊鸏鸑鸒鸓鸕鸛鸜鸝鹸鹹鹺麀麂麃麄麇麋麌麐麑麒麚麛麝麤麩麪麫麮麯麰麺麾黁黈黌黢黒黓黕黙黝黟黥黦黧黮黰黱黲黶黹黻黼黽黿鼂鼃鼅鼈鼉鼏鼐鼒鼕鼖鼙鼚鼛鼡鼩鼱鼪鼫鼯鼷鼽齁齆齇齈齉齌齎齏齔齕齗齙齚齜齞齟齬齠齢齣齧齩齮齯齰齱齵齾龎龑龒龔龖龘龝龡龢龤' + +assert len(simplified_charcters) == len(simplified_charcters) + +s2t_dict = {} +t2s_dict = {} +for i, item in enumerate(simplified_charcters): + s2t_dict[item] = traditional_characters[i] + t2s_dict[traditional_characters[i]] = item + + +def tranditional_to_simplified(text: str) -> str: + return "".join( + [t2s_dict[item] if item in t2s_dict else item for item in text]) + + +def simplified_to_traditional(text: str) -> str: + return "".join( + [s2t_dict[item] if item in s2t_dict else item for item in text]) + + +if __name__ == "__main__": + text = "一般是指存取一個應用程式啟動時始終顯示在網站或網頁瀏覽器中的一個或多個初始網頁等畫面存在的站點" + print(text) + text_simple = tranditional_to_simplified(text) + print(text_simple) + text_traditional = simplified_to_traditional(text_simple) + print(text_traditional) diff --git a/ernie-sat/paddlespeech/t2s/frontend/zh_normalization/chronology.py b/ernie-sat/paddlespeech/t2s/frontend/zh_normalization/chronology.py new file mode 100644 index 0000000000000000000000000000000000000000..ea4558e2a7abba4ff454656b82e67b3f5c483bf2 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/frontend/zh_normalization/chronology.py @@ -0,0 +1,134 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import re + +from .num import DIGITS +from .num import num2str +from .num import verbalize_cardinal +from .num import verbalize_digit + + +def _time_num2str(num_string: str) -> str: + """A special case for verbalizing number in time.""" + result = num2str(num_string.lstrip('0')) + if num_string.startswith('0'): + result = DIGITS['0'] + result + return result + + +# 时刻表达式 +RE_TIME = re.compile(r'([0-1]?[0-9]|2[0-3])' + r':([0-5][0-9])' + r'(:([0-5][0-9]))?') + +# 时间范围,如8:30-12:30 +RE_TIME_RANGE = re.compile(r'([0-1]?[0-9]|2[0-3])' + r':([0-5][0-9])' + r'(:([0-5][0-9]))?' + r'(~|-)' + r'([0-1]?[0-9]|2[0-3])' + r':([0-5][0-9])' + r'(:([0-5][0-9]))?') + + +def replace_time(match) -> str: + """ + Args: + match (re.Match) + Returns: + str + """ + + is_range = len(match.groups()) > 5 + + hour = match.group(1) + minute = match.group(2) + second = match.group(4) + + if is_range: + hour_2 = match.group(6) + minute_2 = match.group(7) + second_2 = match.group(9) + + result = f"{num2str(hour)}点" + if minute.lstrip('0'): + if int(minute) == 30: + result += "半" + else: + result += f"{_time_num2str(minute)}分" + if second and second.lstrip('0'): + result += f"{_time_num2str(second)}秒" + + if is_range: + result += "至" + result += f"{num2str(hour_2)}点" + if minute_2.lstrip('0'): + if int(minute) == 30: + result += "半" + else: + result += f"{_time_num2str(minute_2)}分" + if second_2 and second_2.lstrip('0'): + result += f"{_time_num2str(second_2)}秒" + + return result + + +RE_DATE = re.compile(r'(\d{4}|\d{2})年' + r'((0?[1-9]|1[0-2])月)?' 
+ r'(((0?[1-9])|((1|2)[0-9])|30|31)([日号]))?') + + +def replace_date(match) -> str: + """ + Args: + match (re.Match) + Returns: + str + """ + year = match.group(1) + month = match.group(3) + day = match.group(5) + result = "" + if year: + result += f"{verbalize_digit(year)}年" + if month: + result += f"{verbalize_cardinal(month)}月" + if day: + result += f"{verbalize_cardinal(day)}{match.group(9)}" + return result + + +# 用 / 或者 - 分隔的 YY/MM/DD 或者 YY-MM-DD 日期 +RE_DATE2 = re.compile( + r'(\d{4})([- /.])(0[1-9]|1[012])\2(0[1-9]|[12][0-9]|3[01])') + + +def replace_date2(match) -> str: + """ + Args: + match (re.Match) + Returns: + str + """ + year = match.group(1) + month = match.group(3) + day = match.group(4) + result = "" + if year: + result += f"{verbalize_digit(year)}年" + if month: + result += f"{verbalize_cardinal(month)}月" + if day: + result += f"{verbalize_cardinal(day)}日" + return result diff --git a/ernie-sat/paddlespeech/t2s/frontend/zh_normalization/constants.py b/ernie-sat/paddlespeech/t2s/frontend/zh_normalization/constants.py new file mode 100644 index 0000000000000000000000000000000000000000..5d2b0b34ea345a678de8d6f8e2b68a9e92a7996b --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/frontend/zh_normalization/constants.py @@ -0,0 +1,62 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
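# Usage sketch for the chronology rules above: each RE_* pattern is applied
# with re.sub and its replace_* callback, the same way text_normlization.py
# does further down in this diff. The import path assumes the ernie-sat/
# directory is on sys.path so the paddlespeech package added here is importable.
from paddlespeech.t2s.frontend.zh_normalization.chronology import (
    RE_DATE, RE_TIME, replace_date, replace_time)

sentence = "她出生于1995年3月1日,等会请在12:05通知我"
sentence = RE_DATE.sub(replace_date, sentence)  # -> ...一九九五年三月一日...
sentence = RE_TIME.sub(replace_time, sentence)  # -> ...十二点零五分...
print(sentence)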
+import re +import string + +from pypinyin.constants import SUPPORT_UCS4 + +# 全角半角转换 +# 英文字符全角 -> 半角映射表 (num: 52) +F2H_ASCII_LETTERS = { + chr(ord(char) + 65248): char + for char in string.ascii_letters +} + +# 英文字符半角 -> 全角映射表 +H2F_ASCII_LETTERS = {value: key for key, value in F2H_ASCII_LETTERS.items()} + +# 数字字符全角 -> 半角映射表 (num: 10) +F2H_DIGITS = {chr(ord(char) + 65248): char for char in string.digits} +# 数字字符半角 -> 全角映射表 +H2F_DIGITS = {value: key for key, value in F2H_DIGITS.items()} + +# 标点符号全角 -> 半角映射表 (num: 32) +F2H_PUNCTUATIONS = {chr(ord(char) + 65248): char for char in string.punctuation} +# 标点符号半角 -> 全角映射表 +H2F_PUNCTUATIONS = {value: key for key, value in F2H_PUNCTUATIONS.items()} + +# 空格 (num: 1) +F2H_SPACE = {'\u3000': ' '} +H2F_SPACE = {' ': '\u3000'} + +# 非"有拼音的汉字"的字符串,可用于NSW提取 +if SUPPORT_UCS4: + RE_NSW = re.compile(r'(?:[^' + r'\u3007' # 〇 + r'\u3400-\u4dbf' # CJK扩展A:[3400-4DBF] + r'\u4e00-\u9fff' # CJK基本:[4E00-9FFF] + r'\uf900-\ufaff' # CJK兼容:[F900-FAFF] + r'\U00020000-\U0002A6DF' # CJK扩展B:[20000-2A6DF] + r'\U0002A703-\U0002B73F' # CJK扩展C:[2A700-2B73F] + r'\U0002B740-\U0002B81D' # CJK扩展D:[2B740-2B81D] + r'\U0002F80A-\U0002FA1F' # CJK兼容扩展:[2F800-2FA1F] + r'])+') +else: + RE_NSW = re.compile( # pragma: no cover + r'(?:[^' + r'\u3007' # 〇 + r'\u3400-\u4dbf' # CJK扩展A:[3400-4DBF] + r'\u4e00-\u9fff' # CJK基本:[4E00-9FFF] + r'\uf900-\ufaff' # CJK兼容:[F900-FAFF] + r'])+') diff --git a/ernie-sat/paddlespeech/t2s/frontend/zh_normalization/num.py b/ernie-sat/paddlespeech/t2s/frontend/zh_normalization/num.py new file mode 100644 index 0000000000000000000000000000000000000000..a83b42a47b70b30452d5908e58d6e7a5b1c2f93c --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/frontend/zh_normalization/num.py @@ -0,0 +1,238 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Rules to verbalize numbers into Chinese characters. 
+https://zh.wikipedia.org/wiki/中文数字#現代中文 +""" +import re +from collections import OrderedDict +from typing import List + +DIGITS = {str(i): tran for i, tran in enumerate('零一二三四五六七八九')} +UNITS = OrderedDict({ + 1: '十', + 2: '百', + 3: '千', + 4: '万', + 8: '亿', +}) + +COM_QUANTIFIERS = '(所|朵|匹|张|座|回|场|尾|条|个|首|阙|阵|网|炮|顶|丘|棵|只|支|袭|辆|挑|担|颗|壳|窠|曲|墙|群|腔|砣|座|客|贯|扎|捆|刀|令|打|手|罗|坡|山|岭|江|溪|钟|队|单|双|对|出|口|头|脚|板|跳|枝|件|贴|针|线|管|名|位|身|堂|课|本|页|家|户|层|丝|毫|厘|分|钱|两|斤|担|铢|石|钧|锱|忽|(千|毫|微)克|毫|厘|(公)分|分|寸|尺|丈|里|寻|常|铺|程|(千|分|厘|毫|微)米|米|撮|勺|合|升|斗|石|盘|碗|碟|叠|桶|笼|盆|盒|杯|钟|斛|锅|簋|篮|盘|桶|罐|瓶|壶|卮|盏|箩|箱|煲|啖|袋|钵|年|月|日|季|刻|时|周|天|秒|分|小时|旬|纪|岁|世|更|夜|春|夏|秋|冬|代|伏|辈|丸|泡|粒|颗|幢|堆|条|根|支|道|面|片|张|颗|块|元|(亿|千万|百万|万|千|百)|(亿|千万|百万|万|千|百|美|)元|(亿|千万|百万|万|千|百|)块|角|毛|分)' + +# 分数表达式 +RE_FRAC = re.compile(r'(-?)(\d+)/(\d+)') + + +def replace_frac(match) -> str: + """ + Args: + match (re.Match) + Returns: + str + """ + sign = match.group(1) + nominator = match.group(2) + denominator = match.group(3) + sign: str = "负" if sign else "" + nominator: str = num2str(nominator) + denominator: str = num2str(denominator) + result = f"{sign}{denominator}分之{nominator}" + return result + + +# 百分数表达式 +RE_PERCENTAGE = re.compile(r'(-?)(\d+(\.\d+)?)%') + + +def replace_percentage(match) -> str: + """ + Args: + match (re.Match) + Returns: + str + """ + sign = match.group(1) + percent = match.group(2) + sign: str = "负" if sign else "" + percent: str = num2str(percent) + result = f"{sign}百分之{percent}" + return result + + +# 整数表达式 +# 带负号的整数 -10 +RE_INTEGER = re.compile(r'(-)' r'(\d+)') + + +def replace_negative_num(match) -> str: + """ + Args: + match (re.Match) + Returns: + str + """ + sign = match.group(1) + number = match.group(2) + sign: str = "负" if sign else "" + number: str = num2str(number) + result = f"{sign}{number}" + return result + + +# 编号-无符号整形 +# 00078 +RE_DEFAULT_NUM = re.compile(r'\d{3}\d*') + + +def replace_default_num(match): + """ + Args: + match (re.Match) + Returns: + str + """ + number = match.group(0) + return verbalize_digit(number) + + +# 数字表达式 +# 纯小数 +RE_DECIMAL_NUM = re.compile(r'(-?)((\d+)(\.\d+))' r'|(\.(\d+))') +# 正整数 + 量词 +RE_POSITIVE_QUANTIFIERS = re.compile(r"(\d+)([多余几\+])?" 
+ COM_QUANTIFIERS) +RE_NUMBER = re.compile(r'(-?)((\d+)(\.\d+)?)' r'|(\.(\d+))') + + +def replace_positive_quantifier(match) -> str: + """ + Args: + match (re.Match) + Returns: + str + """ + number = match.group(1) + match_2 = match.group(2) + if match_2 == "+": + match_2 = "多" + match_2: str = match_2 if match_2 else "" + quantifiers: str = match.group(3) + number: str = num2str(number) + result = f"{number}{match_2}{quantifiers}" + return result + + +def replace_number(match) -> str: + """ + Args: + match (re.Match) + Returns: + str + """ + sign = match.group(1) + number = match.group(2) + pure_decimal = match.group(5) + if pure_decimal: + result = num2str(pure_decimal) + else: + sign: str = "负" if sign else "" + number: str = num2str(number) + result = f"{sign}{number}" + return result + + +# 范围表达式 +# match.group(1) and match.group(8) are copy from RE_NUMBER + +RE_RANGE = re.compile( + r'((-?)((\d+)(\.\d+)?)|(\.(\d+)))[-~]((-?)((\d+)(\.\d+)?)|(\.(\d+)))') + + +def replace_range(match) -> str: + """ + Args: + match (re.Match) + Returns: + str + """ + first, second = match.group(1), match.group(8) + first = RE_NUMBER.sub(replace_number, first) + second = RE_NUMBER.sub(replace_number, second) + result = f"{first}到{second}" + return result + + +def _get_value(value_string: str, use_zero: bool=True) -> List[str]: + stripped = value_string.lstrip('0') + if len(stripped) == 0: + return [] + elif len(stripped) == 1: + if use_zero and len(stripped) < len(value_string): + return [DIGITS['0'], DIGITS[stripped]] + else: + return [DIGITS[stripped]] + else: + largest_unit = next( + power for power in reversed(UNITS.keys()) if power < len(stripped)) + first_part = value_string[:-largest_unit] + second_part = value_string[-largest_unit:] + return _get_value(first_part) + [UNITS[largest_unit]] + _get_value( + second_part) + + +def verbalize_cardinal(value_string: str) -> str: + if not value_string: + return '' + + # 000 -> '零' , 0 -> '零' + value_string = value_string.lstrip('0') + if len(value_string) == 0: + return DIGITS['0'] + + result_symbols = _get_value(value_string) + # verbalized number starting with '一十*' is abbreviated as `十*` + if len(result_symbols) >= 2 and result_symbols[0] == DIGITS[ + '1'] and result_symbols[1] == UNITS[1]: + result_symbols = result_symbols[1:] + return ''.join(result_symbols) + + +def verbalize_digit(value_string: str, alt_one=False) -> str: + result_symbols = [DIGITS[digit] for digit in value_string] + result = ''.join(result_symbols) + if alt_one: + result = result.replace("一", "幺") + return result + + +def num2str(value_string: str) -> str: + integer_decimal = value_string.split('.') + if len(integer_decimal) == 1: + integer = integer_decimal[0] + decimal = '' + elif len(integer_decimal) == 2: + integer, decimal = integer_decimal + else: + raise ValueError( + f"The value string: '${value_string}' has more than one point in it." 
+    )
+
+    result = verbalize_cardinal(integer)
+
+    decimal = decimal.rstrip('0')
+    if decimal:
+        # '.22' is verbalized as '零点二二'
+        # '3.20' is verbalized as '三点二'
+        result = result if result else "零"
+        result += '点' + verbalize_digit(decimal)
+    return result
diff --git a/ernie-sat/paddlespeech/t2s/frontend/zh_normalization/phonecode.py b/ernie-sat/paddlespeech/t2s/frontend/zh_normalization/phonecode.py
new file mode 100644
index 0000000000000000000000000000000000000000..06b5d41b8b606dd29d7c5960f6fa516ce249c462
--- /dev/null
+++ b/ernie-sat/paddlespeech/t2s/frontend/zh_normalization/phonecode.py
@@ -0,0 +1,63 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import re
+
+from .num import verbalize_digit
+
+# 规范化固话/手机号码
+# 手机
+# http://www.jihaoba.com/news/show/13680
+# 移动:139、138、137、136、135、134、159、158、157、150、151、152、188、187、182、183、184、178、198
+# 联通:130、131、132、156、155、186、185、176
+# 电信:133、153、189、180、181、177
+RE_MOBILE_PHONE = re.compile(
+    r"(?<!\d)((\+?86 ?)?1([38]\d|5[0-35-9]|7[678]|9[89])\d{8})(?!\d)")
+RE_TELEPHONE = re.compile(
+    r"(?<!\d)((0(10|2[1-3]|[3-9]\d{2})-?)?[1-9]\d{6,7})(?!\d)")
+# 全国统一的号码400开头
+RE_NATIONAL_UNIFORM_NUMBER = re.compile(r"400(-)?\d{3}(-)?\d{4}")
+
+
+def phone2str(phone_string: str, mobile=True) -> str:
+    if mobile:
+        sp_parts = phone_string.strip('+').split()
+        result = ','.join(
+            [verbalize_digit(part, alt_one=True) for part in sp_parts])
+        return result
+    else:
+        sil_parts = phone_string.split('-')
+        result = ','.join(
+            [verbalize_digit(part, alt_one=True) for part in sil_parts])
+        return result
+
+
+def replace_phone(match) -> str:
+    """
+    Args:
+        match (re.Match)
+    Returns:
+        str
+    """
+    return phone2str(match.group(0), mobile=False)
+
+
+def replace_mobile(match) -> str:
+    """
+    Args:
+        match (re.Match)
+    Returns:
+        str
+    """
+    return phone2str(match.group(0))
diff --git a/ernie-sat/paddlespeech/t2s/frontend/zh_normalization/quantifier.py b/ernie-sat/paddlespeech/t2s/frontend/zh_normalization/quantifier.py
new file mode 100644
index 0000000000000000000000000000000000000000..268d7229b8cf778b103321f910aa202abb835381
--- /dev/null
+++ b/ernie-sat/paddlespeech/t2s/frontend/zh_normalization/quantifier.py
@@ -0,0 +1,37 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
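# Usage sketch for the number rules in num.py above; the expected strings
# mirror the cardinal and percentage rows of the zh_normalization README in
# this diff. Assumes the ernie-sat/ directory is on sys.path.
from paddlespeech.t2s.frontend.zh_normalization.num import (
    RE_PERCENTAGE, num2str, replace_percentage)

assert num2str("583") == "五百八十三"
assert num2str("324.75") == "三百二十四点七五"
assert RE_PERCENTAGE.sub(replace_percentage, "62%") == "百分之六十二"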
+import re + +from .num import num2str + +# 温度表达式,温度会影响负号的读法 +# -3°C 零下三度 +RE_TEMPERATURE = re.compile(r'(-?)(\d+(\.\d+)?)(°C|℃|度|摄氏度)') + + +def replace_temperature(match) -> str: + """ + Args: + match (re.Match) + Returns: + str + """ + sign = match.group(1) + temperature = match.group(2) + unit = match.group(3) + sign: str = "零下" if sign else "" + temperature: str = num2str(temperature) + unit: str = "摄氏度" if unit == "摄氏度" else "度" + result = f"{sign}{temperature}{unit}" + return result diff --git a/ernie-sat/paddlespeech/t2s/frontend/zh_normalization/text_normlization.py b/ernie-sat/paddlespeech/t2s/frontend/zh_normalization/text_normlization.py new file mode 100644 index 0000000000000000000000000000000000000000..bc663c70d77da24c9ef9b21fea64a5b1fc6cf2e9 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/frontend/zh_normalization/text_normlization.py @@ -0,0 +1,116 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import re +from typing import List + +from .char_convert import tranditional_to_simplified +from .chronology import RE_DATE +from .chronology import RE_DATE2 +from .chronology import RE_TIME +from .chronology import RE_TIME_RANGE +from .chronology import replace_date +from .chronology import replace_date2 +from .chronology import replace_time +from .constants import F2H_ASCII_LETTERS +from .constants import F2H_DIGITS +from .constants import F2H_SPACE +from .num import RE_DECIMAL_NUM +from .num import RE_DEFAULT_NUM +from .num import RE_FRAC +from .num import RE_INTEGER +from .num import RE_NUMBER +from .num import RE_PERCENTAGE +from .num import RE_POSITIVE_QUANTIFIERS +from .num import RE_RANGE +from .num import replace_default_num +from .num import replace_frac +from .num import replace_negative_num +from .num import replace_number +from .num import replace_percentage +from .num import replace_positive_quantifier +from .num import replace_range +from .phonecode import RE_MOBILE_PHONE +from .phonecode import RE_NATIONAL_UNIFORM_NUMBER +from .phonecode import RE_TELEPHONE +from .phonecode import replace_mobile +from .phonecode import replace_phone +from .quantifier import RE_TEMPERATURE +from .quantifier import replace_temperature + + +class TextNormalizer(): + def __init__(self): + self.SENTENCE_SPLITOR = re.compile(r'([:、,;。?!,;?!][”’]?)') + + def _split(self, text: str, lang="zh") -> List[str]: + """Split long text into sentences with sentence-splitting punctuations. + Args: + text (str): The input text. + Returns: + List[str]: Sentences. 
+ """ + # Only for pure Chinese here + if lang == "zh": + text = text.replace(" ", "") + # 过滤掉特殊字符 + text = re.sub(r'[《》【】<=>{}()()#&@“”^_|…\\]', '', text) + text = self.SENTENCE_SPLITOR.sub(r'\1\n', text) + text = text.strip() + sentences = [sentence.strip() for sentence in re.split(r'\n+', text)] + return sentences + + def _post_replace(self, sentence: str) -> str: + sentence = sentence.replace('/', '每') + sentence = sentence.replace('~', '至') + + return sentence + + def normalize_sentence(self, sentence: str) -> str: + # basic character conversions + sentence = tranditional_to_simplified(sentence) + sentence = sentence.translate(F2H_ASCII_LETTERS).translate( + F2H_DIGITS).translate(F2H_SPACE) + + # number related NSW verbalization + sentence = RE_DATE.sub(replace_date, sentence) + sentence = RE_DATE2.sub(replace_date2, sentence) + + # range first + sentence = RE_TIME_RANGE.sub(replace_time, sentence) + sentence = RE_TIME.sub(replace_time, sentence) + + sentence = RE_TEMPERATURE.sub(replace_temperature, sentence) + sentence = RE_FRAC.sub(replace_frac, sentence) + sentence = RE_PERCENTAGE.sub(replace_percentage, sentence) + sentence = RE_MOBILE_PHONE.sub(replace_mobile, sentence) + + sentence = RE_TELEPHONE.sub(replace_phone, sentence) + sentence = RE_NATIONAL_UNIFORM_NUMBER.sub(replace_phone, sentence) + + sentence = RE_RANGE.sub(replace_range, sentence) + sentence = RE_INTEGER.sub(replace_negative_num, sentence) + sentence = RE_DECIMAL_NUM.sub(replace_number, sentence) + sentence = RE_POSITIVE_QUANTIFIERS.sub(replace_positive_quantifier, + sentence) + sentence = RE_DEFAULT_NUM.sub(replace_default_num, sentence) + sentence = RE_NUMBER.sub(replace_number, sentence) + sentence = self._post_replace(sentence) + + return sentence + + def normalize(self, text: str) -> List[str]: + sentences = self._split(text) + + sentences = [self.normalize_sentence(sent) for sent in sentences] + return sentences diff --git a/ernie-sat/paddlespeech/t2s/models/__init__.py b/ernie-sat/paddlespeech/t2s/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..41be7c1db92508271121047ba997ac95fc064505 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/models/__init__.py @@ -0,0 +1,22 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from .fastspeech2 import * +from .hifigan import * +from .melgan import * +from .parallel_wavegan import * +from .speedyspeech import * +from .tacotron2 import * +from .transformer_tts import * +from .waveflow import * +from .wavernn import * diff --git a/ernie-sat/paddlespeech/t2s/models/fastspeech2/__init__.py b/ernie-sat/paddlespeech/t2s/models/fastspeech2/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..52925ef8ce732b87999097fff469b19a7dd8f719 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/models/fastspeech2/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from .fastspeech2 import * +from .fastspeech2_updater import * diff --git a/ernie-sat/paddlespeech/t2s/models/fastspeech2/fastspeech2.py b/ernie-sat/paddlespeech/t2s/models/fastspeech2/fastspeech2.py new file mode 100644 index 0000000000000000000000000000000000000000..c2f1e218f15ba7178bb20751984db8c2b130fe12 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/models/fastspeech2/fastspeech2.py @@ -0,0 +1,1057 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from espnet(https://github.com/espnet/espnet) +"""Fastspeech2 related modules for paddle""" +from typing import Dict +from typing import List +from typing import Sequence +from typing import Tuple +from typing import Union + +import numpy as np +import paddle +import paddle.nn.functional as F +from paddle import nn +from typeguard import check_argument_types + +from paddlespeech.t2s.modules.nets_utils import initialize +from paddlespeech.t2s.modules.nets_utils import make_non_pad_mask +from paddlespeech.t2s.modules.nets_utils import make_pad_mask +from paddlespeech.t2s.modules.predictor.duration_predictor import DurationPredictor +from paddlespeech.t2s.modules.predictor.duration_predictor import DurationPredictorLoss +from paddlespeech.t2s.modules.predictor.length_regulator import LengthRegulator +from paddlespeech.t2s.modules.predictor.variance_predictor import VariancePredictor +from paddlespeech.t2s.modules.tacotron2.decoder import Postnet +from paddlespeech.t2s.modules.transformer.encoder import CNNDecoder +from paddlespeech.t2s.modules.transformer.encoder import CNNPostnet +from paddlespeech.t2s.modules.transformer.encoder import ConformerEncoder +from paddlespeech.t2s.modules.transformer.encoder import TransformerEncoder + + +class FastSpeech2(nn.Layer): + """FastSpeech2 module. + + This is a module of FastSpeech2 described in `FastSpeech 2: Fast and + High-Quality End-to-End Text to Speech`_. Instead of quantized pitch and + energy, we use token-averaged value introduced in `FastPitch: Parallel + Text-to-speech with Pitch Prediction`_. + + .. _`FastSpeech 2: Fast and High-Quality End-to-End Text to Speech`: + https://arxiv.org/abs/2006.04558 + .. 
_`FastPitch: Parallel Text-to-speech with Pitch Prediction`: + https://arxiv.org/abs/2006.06873 + + Args: + + Returns: + + """ + + def __init__( + self, + # network structure related + idim: int, + odim: int, + adim: int=384, + aheads: int=4, + elayers: int=6, + eunits: int=1536, + dlayers: int=6, + dunits: int=1536, + postnet_layers: int=5, + postnet_chans: int=512, + postnet_filts: int=5, + postnet_dropout_rate: float=0.5, + positionwise_layer_type: str="conv1d", + positionwise_conv_kernel_size: int=1, + use_scaled_pos_enc: bool=True, + use_batch_norm: bool=True, + encoder_normalize_before: bool=True, + decoder_normalize_before: bool=True, + encoder_concat_after: bool=False, + decoder_concat_after: bool=False, + reduction_factor: int=1, + encoder_type: str="transformer", + decoder_type: str="transformer", + # for transformer + transformer_enc_dropout_rate: float=0.1, + transformer_enc_positional_dropout_rate: float=0.1, + transformer_enc_attn_dropout_rate: float=0.1, + transformer_dec_dropout_rate: float=0.1, + transformer_dec_positional_dropout_rate: float=0.1, + transformer_dec_attn_dropout_rate: float=0.1, + # for conformer + conformer_pos_enc_layer_type: str="rel_pos", + conformer_self_attn_layer_type: str="rel_selfattn", + conformer_activation_type: str="swish", + use_macaron_style_in_conformer: bool=True, + use_cnn_in_conformer: bool=True, + zero_triu: bool=False, + conformer_enc_kernel_size: int=7, + conformer_dec_kernel_size: int=31, + # for CNN Decoder + cnn_dec_dropout_rate: float=0.2, + cnn_postnet_dropout_rate: float=0.2, + cnn_postnet_resblock_kernel_sizes: List[int]=[256, 256], + cnn_postnet_kernel_size: int=5, + cnn_decoder_embedding_dim: int=256, + # duration predictor + duration_predictor_layers: int=2, + duration_predictor_chans: int=384, + duration_predictor_kernel_size: int=3, + duration_predictor_dropout_rate: float=0.1, + # energy predictor + energy_predictor_layers: int=2, + energy_predictor_chans: int=384, + energy_predictor_kernel_size: int=3, + energy_predictor_dropout: float=0.5, + energy_embed_kernel_size: int=9, + energy_embed_dropout: float=0.5, + stop_gradient_from_energy_predictor: bool=False, + # pitch predictor + pitch_predictor_layers: int=2, + pitch_predictor_chans: int=384, + pitch_predictor_kernel_size: int=3, + pitch_predictor_dropout: float=0.5, + pitch_embed_kernel_size: int=9, + pitch_embed_dropout: float=0.5, + stop_gradient_from_pitch_predictor: bool=False, + # spk emb + spk_num: int=None, + spk_embed_dim: int=None, + spk_embed_integration_type: str="add", + # tone emb + tone_num: int=None, + tone_embed_dim: int=None, + tone_embed_integration_type: str="add", + # training related + init_type: str="xavier_uniform", + init_enc_alpha: float=1.0, + init_dec_alpha: float=1.0, ): + """Initialize FastSpeech2 module. + Args: + idim (int): Dimension of the inputs. + odim (int): Dimension of the outputs. + adim (int): Attention dimension. + aheads (int): Number of attention heads. + elayers (int): Number of encoder layers. + eunits (int): Number of encoder hidden units. + dlayers (int): Number of decoder layers. + dunits (int): Number of decoder hidden units. + postnet_layers (int): Number of postnet layers. + postnet_chans (int): Number of postnet channels. + postnet_filts (int): Kernel size of postnet. + postnet_dropout_rate (float): Dropout rate in postnet. + use_scaled_pos_enc (bool): Whether to use trainable scaled pos encoding. + use_batch_norm (bool): Whether to use batch normalization in encoder prenet. 
+ encoder_normalize_before (bool): Whether to apply layernorm layer before encoder block. + decoder_normalize_before (bool): Whether to apply layernorm layer before decoder block. + encoder_concat_after (bool): Whether to concatenate attention layer's input and output in encoder. + decoder_concat_after (bool): Whether to concatenate attention layer's input and output in decoder. + reduction_factor (int): Reduction factor. + encoder_type (str): Encoder type ("transformer" or "conformer"). + decoder_type (str): Decoder type ("transformer" or "conformer"). + transformer_enc_dropout_rate (float): Dropout rate in encoder except attention and positional encoding. + transformer_enc_positional_dropout_rate (float): Dropout rate after encoder positional encoding. + transformer_enc_attn_dropout_rate (float): Dropout rate in encoder self-attention module. + transformer_dec_dropout_rate (float): Dropout rate in decoder except attention & positional encoding. + transformer_dec_positional_dropout_rate (float): Dropout rate after decoder positional encoding. + transformer_dec_attn_dropout_rate (float): Dropout rate in decoder self-attention module. + conformer_pos_enc_layer_type (str): Pos encoding layer type in conformer. + conformer_self_attn_layer_type (str): Self-attention layer type in conformer + conformer_activation_type (str): Activation function type in conformer. + use_macaron_style_in_conformer (bool): Whether to use macaron style FFN. + use_cnn_in_conformer (bool): Whether to use CNN in conformer. + zero_triu (bool): Whether to use zero triu in relative self-attention module. + conformer_enc_kernel_size (int): Kernel size of encoder conformer. + conformer_dec_kernel_size (int): Kernel size of decoder conformer. + duration_predictor_layers (int): Number of duration predictor layers. + duration_predictor_chans (int): Number of duration predictor channels. + duration_predictor_kernel_size (int): Kernel size of duration predictor. + duration_predictor_dropout_rate (float): Dropout rate in duration predictor. + pitch_predictor_layers (int): Number of pitch predictor layers. + pitch_predictor_chans (int): Number of pitch predictor channels. + pitch_predictor_kernel_size (int): Kernel size of pitch predictor. + pitch_predictor_dropout_rate (float): Dropout rate in pitch predictor. + pitch_embed_kernel_size (float): Kernel size of pitch embedding. + pitch_embed_dropout_rate (float): Dropout rate for pitch embedding. + stop_gradient_from_pitch_predictor (bool): Whether to stop gradient from pitch predictor to encoder. + energy_predictor_layers (int): Number of energy predictor layers. + energy_predictor_chans (int): Number of energy predictor channels. + energy_predictor_kernel_size (int): Kernel size of energy predictor. + energy_predictor_dropout_rate (float): Dropout rate in energy predictor. + energy_embed_kernel_size (float): Kernel size of energy embedding. + energy_embed_dropout_rate (float): Dropout rate for energy embedding. + stop_gradient_from_energy_predictor(bool): Whether to stop gradient from energy predictor to encoder. + spk_num (Optional[int]): Number of speakers. If not None, assume that the spk_embed_dim is not None, + spk_ids will be provided as the input and use spk_embedding_table. + spk_embed_dim (Optional[int]): Speaker embedding dimension. If not None, + assume that spk_emb will be provided as the input or spk_num is not None. + spk_embed_integration_type (str): How to integrate speaker embedding. + tone_num (Optional[int]): Number of tones. 
If not None, assume that the + tone_ids will be provided as the input and use tone_embedding_table. + tone_embed_dim (Optional[int]): Tone embedding dimension. If not None, assume that tone_num is not None. + tone_embed_integration_type (str): How to integrate tone embedding. + init_type (str): How to initialize transformer parameters. + init_enc_alpha (float): Initial value of alpha in scaled pos encoding of the encoder. + init_dec_alpha (float): Initial value of alpha in scaled pos encoding of the decoder. + + """ + assert check_argument_types() + super().__init__() + + # store hyperparameters + self.idim = idim + self.odim = odim + self.eos = idim - 1 + self.reduction_factor = reduction_factor + self.encoder_type = encoder_type + self.decoder_type = decoder_type + self.stop_gradient_from_pitch_predictor = stop_gradient_from_pitch_predictor + self.stop_gradient_from_energy_predictor = stop_gradient_from_energy_predictor + self.use_scaled_pos_enc = use_scaled_pos_enc + + self.spk_embed_dim = spk_embed_dim + if self.spk_embed_dim is not None: + self.spk_embed_integration_type = spk_embed_integration_type + + self.tone_embed_dim = tone_embed_dim + if self.tone_embed_dim is not None: + self.tone_embed_integration_type = tone_embed_integration_type + + # use idx 0 as padding idx + self.padding_idx = 0 + + # initialize parameters + initialize(self, init_type) + + if spk_num and self.spk_embed_dim: + self.spk_embedding_table = nn.Embedding( + num_embeddings=spk_num, + embedding_dim=self.spk_embed_dim, + padding_idx=self.padding_idx) + + if self.tone_embed_dim is not None: + self.tone_embedding_table = nn.Embedding( + num_embeddings=tone_num, + embedding_dim=self.tone_embed_dim, + padding_idx=self.padding_idx) + + # get positional encoding layer type + transformer_pos_enc_layer_type = "scaled_abs_pos" if self.use_scaled_pos_enc else "abs_pos" + + # define encoder + encoder_input_layer = nn.Embedding( + num_embeddings=idim, + embedding_dim=adim, + padding_idx=self.padding_idx) + + if encoder_type == "transformer": + print("encoder_type is transformer") + self.encoder = TransformerEncoder( + idim=idim, + attention_dim=adim, + attention_heads=aheads, + linear_units=eunits, + num_blocks=elayers, + input_layer=encoder_input_layer, + dropout_rate=transformer_enc_dropout_rate, + positional_dropout_rate=transformer_enc_positional_dropout_rate, + attention_dropout_rate=transformer_enc_attn_dropout_rate, + pos_enc_layer_type=transformer_pos_enc_layer_type, + normalize_before=encoder_normalize_before, + concat_after=encoder_concat_after, + positionwise_layer_type=positionwise_layer_type, + positionwise_conv_kernel_size=positionwise_conv_kernel_size, ) + elif encoder_type == "conformer": + print("encoder_type is conformer") + self.encoder = ConformerEncoder( + idim=idim, + attention_dim=adim, + attention_heads=aheads, + linear_units=eunits, + num_blocks=elayers, + input_layer=encoder_input_layer, + dropout_rate=transformer_enc_dropout_rate, + positional_dropout_rate=transformer_enc_positional_dropout_rate, + attention_dropout_rate=transformer_enc_attn_dropout_rate, + normalize_before=encoder_normalize_before, + concat_after=encoder_concat_after, + positionwise_layer_type=positionwise_layer_type, + positionwise_conv_kernel_size=positionwise_conv_kernel_size, + macaron_style=use_macaron_style_in_conformer, + pos_enc_layer_type=conformer_pos_enc_layer_type, + selfattention_layer_type=conformer_self_attn_layer_type, + activation_type=conformer_activation_type, + use_cnn_module=use_cnn_in_conformer, + 
cnn_module_kernel=conformer_enc_kernel_size, + zero_triu=zero_triu, ) + else: + raise ValueError(f"{encoder_type} is not supported.") + + # define additional projection for speaker embedding + if self.spk_embed_dim is not None: + if self.spk_embed_integration_type == "add": + self.spk_projection = nn.Linear(self.spk_embed_dim, adim) + else: + self.spk_projection = nn.Linear(adim + self.spk_embed_dim, adim) + + # define additional projection for tone embedding + if self.tone_embed_dim is not None: + if self.tone_embed_integration_type == "add": + self.tone_projection = nn.Linear(self.tone_embed_dim, adim) + else: + self.tone_projection = nn.Linear(adim + self.tone_embed_dim, + adim) + + # define duration predictor + self.duration_predictor = DurationPredictor( + idim=adim, + n_layers=duration_predictor_layers, + n_chans=duration_predictor_chans, + kernel_size=duration_predictor_kernel_size, + dropout_rate=duration_predictor_dropout_rate, ) + + # define pitch predictor + self.pitch_predictor = VariancePredictor( + idim=adim, + n_layers=pitch_predictor_layers, + n_chans=pitch_predictor_chans, + kernel_size=pitch_predictor_kernel_size, + dropout_rate=pitch_predictor_dropout, ) + # We use continuous pitch + FastPitch style avg + self.pitch_embed = nn.Sequential( + nn.Conv1D( + in_channels=1, + out_channels=adim, + kernel_size=pitch_embed_kernel_size, + padding=(pitch_embed_kernel_size - 1) // 2, ), + nn.Dropout(pitch_embed_dropout), ) + + # define energy predictor + self.energy_predictor = VariancePredictor( + idim=adim, + n_layers=energy_predictor_layers, + n_chans=energy_predictor_chans, + kernel_size=energy_predictor_kernel_size, + dropout_rate=energy_predictor_dropout, ) + # We use continuous enegy + FastPitch style avg + self.energy_embed = nn.Sequential( + nn.Conv1D( + in_channels=1, + out_channels=adim, + kernel_size=energy_embed_kernel_size, + padding=(energy_embed_kernel_size - 1) // 2, ), + nn.Dropout(energy_embed_dropout), ) + + # define length regulator + self.length_regulator = LengthRegulator() + + # define decoder + # NOTE: we use encoder as decoder + # because fastspeech's decoder is the same as encoder + if decoder_type == "transformer": + print("decoder_type is transformer") + self.decoder = TransformerEncoder( + idim=0, + attention_dim=adim, + attention_heads=aheads, + linear_units=dunits, + num_blocks=dlayers, + # in decoder, don't need layer before pos_enc_class (we use embedding here in encoder) + input_layer=None, + dropout_rate=transformer_dec_dropout_rate, + positional_dropout_rate=transformer_dec_positional_dropout_rate, + attention_dropout_rate=transformer_dec_attn_dropout_rate, + pos_enc_layer_type=transformer_pos_enc_layer_type, + normalize_before=decoder_normalize_before, + concat_after=decoder_concat_after, + positionwise_layer_type=positionwise_layer_type, + positionwise_conv_kernel_size=positionwise_conv_kernel_size, ) + elif decoder_type == "conformer": + print("decoder_type is conformer") + self.decoder = ConformerEncoder( + idim=0, + attention_dim=adim, + attention_heads=aheads, + linear_units=dunits, + num_blocks=dlayers, + input_layer=None, + dropout_rate=transformer_dec_dropout_rate, + positional_dropout_rate=transformer_dec_positional_dropout_rate, + attention_dropout_rate=transformer_dec_attn_dropout_rate, + normalize_before=decoder_normalize_before, + concat_after=decoder_concat_after, + positionwise_layer_type=positionwise_layer_type, + positionwise_conv_kernel_size=positionwise_conv_kernel_size, + macaron_style=use_macaron_style_in_conformer, + 
pos_enc_layer_type=conformer_pos_enc_layer_type, + selfattention_layer_type=conformer_self_attn_layer_type, + activation_type=conformer_activation_type, + use_cnn_module=use_cnn_in_conformer, + cnn_module_kernel=conformer_dec_kernel_size, ) + elif decoder_type == 'cnndecoder': + self.decoder = CNNDecoder( + emb_dim=adim, + odim=odim, + kernel_size=cnn_postnet_kernel_size, + dropout_rate=cnn_dec_dropout_rate, + resblock_kernel_sizes=cnn_postnet_resblock_kernel_sizes) + else: + raise ValueError(f"{decoder_type} is not supported.") + + # define final projection + self.feat_out = nn.Linear(adim, odim * reduction_factor) + + # define postnet + if decoder_type == 'cnndecoder': + self.postnet = CNNPostnet( + odim=odim, + kernel_size=cnn_postnet_kernel_size, + dropout_rate=cnn_postnet_dropout_rate, + resblock_kernel_sizes=cnn_postnet_resblock_kernel_sizes) + else: + self.postnet = (None if postnet_layers == 0 else Postnet( + idim=idim, + odim=odim, + n_layers=postnet_layers, + n_chans=postnet_chans, + n_filts=postnet_filts, + use_batch_norm=use_batch_norm, + dropout_rate=postnet_dropout_rate, )) + + nn.initializer.set_global_initializer(None) + + self._reset_parameters( + init_enc_alpha=init_enc_alpha, + init_dec_alpha=init_dec_alpha, ) + + def forward( + self, + text: paddle.Tensor, + text_lengths: paddle.Tensor, + speech: paddle.Tensor, + speech_lengths: paddle.Tensor, + durations: paddle.Tensor, + pitch: paddle.Tensor, + energy: paddle.Tensor, + tone_id: paddle.Tensor=None, + spk_emb: paddle.Tensor=None, + spk_id: paddle.Tensor=None + ) -> Tuple[paddle.Tensor, Dict[str, paddle.Tensor], paddle.Tensor]: + """Calculate forward propagation. + + Args: + text(Tensor(int64)): Batch of padded token ids (B, Tmax). + text_lengths(Tensor(int64)): Batch of lengths of each input (B,). + speech(Tensor): Batch of padded target features (B, Lmax, odim). + speech_lengths(Tensor(int64)): Batch of the lengths of each target (B,). + durations(Tensor(int64)): Batch of padded durations (B, Tmax). + pitch(Tensor): Batch of padded token-averaged pitch (B, Tmax, 1). + energy(Tensor): Batch of padded token-averaged energy (B, Tmax, 1). + tone_id(Tensor, optional(int64)): Batch of padded tone ids (B, Tmax). + spk_emb(Tensor, optional): Batch of speaker embeddings (B, spk_embed_dim). 
+ spk_id(Tnesor, optional(int64)): Batch of speaker ids (B,) + + Returns: + + + """ + + # input of embedding must be int64 + xs = paddle.cast(text, 'int64') + ilens = paddle.cast(text_lengths, 'int64') + ds = paddle.cast(durations, 'int64') + olens = paddle.cast(speech_lengths, 'int64') + ys = speech + ps = pitch + es = energy + if spk_id is not None: + spk_id = paddle.cast(spk_id, 'int64') + if tone_id is not None: + tone_id = paddle.cast(tone_id, 'int64') + # forward propagation + before_outs, after_outs, d_outs, p_outs, e_outs = self._forward( + xs, + ilens, + olens, + ds, + ps, + es, + is_inference=False, + spk_emb=spk_emb, + spk_id=spk_id, + tone_id=tone_id) + # modify mod part of groundtruth + if self.reduction_factor > 1: + olens = olens - olens % self.reduction_factor + max_olen = max(olens) + ys = ys[:, :max_olen] + + return before_outs, after_outs, d_outs, p_outs, e_outs, ys, olens + + def _forward(self, + xs: paddle.Tensor, + ilens: paddle.Tensor, + olens: paddle.Tensor=None, + ds: paddle.Tensor=None, + ps: paddle.Tensor=None, + es: paddle.Tensor=None, + is_inference: bool=False, + return_after_enc=False, + alpha: float=1.0, + spk_emb=None, + spk_id=None, + tone_id=None) -> Sequence[paddle.Tensor]: + # forward encoder + x_masks = self._source_mask(ilens) + # (B, Tmax, adim) + hs, _ = self.encoder(xs, x_masks) + + # integrate speaker embedding + if self.spk_embed_dim is not None: + # spk_emb has a higher priority than spk_id + if spk_emb is not None: + hs = self._integrate_with_spk_embed(hs, spk_emb) + elif spk_id is not None: + spk_emb = self.spk_embedding_table(spk_id) + hs = self._integrate_with_spk_embed(hs, spk_emb) + + # integrate tone embedding + if self.tone_embed_dim is not None: + if tone_id is not None: + tone_embs = self.tone_embedding_table(tone_id) + hs = self._integrate_with_tone_embed(hs, tone_embs) + # forward duration predictor and variance predictors + d_masks = make_pad_mask(ilens) + + if self.stop_gradient_from_pitch_predictor: + p_outs = self.pitch_predictor(hs.detach(), d_masks.unsqueeze(-1)) + else: + p_outs = self.pitch_predictor(hs, d_masks.unsqueeze(-1)) + if self.stop_gradient_from_energy_predictor: + e_outs = self.energy_predictor(hs.detach(), d_masks.unsqueeze(-1)) + else: + e_outs = self.energy_predictor(hs, d_masks.unsqueeze(-1)) + + if is_inference: + # (B, Tmax) + if ds is not None: + d_outs = ds + else: + d_outs = self.duration_predictor.inference(hs, d_masks) + if ps is not None: + p_outs = ps + if es is not None: + e_outs = es + + # use prediction in inference + # (B, Tmax, 1) + + p_embs = self.pitch_embed(p_outs.transpose((0, 2, 1))).transpose( + (0, 2, 1)) + e_embs = self.energy_embed(e_outs.transpose((0, 2, 1))).transpose( + (0, 2, 1)) + hs = hs + e_embs + p_embs + + # (B, Lmax, adim) + hs = self.length_regulator(hs, d_outs, alpha, is_inference=True) + else: + d_outs = self.duration_predictor(hs, d_masks) + # use groundtruth in training + p_embs = self.pitch_embed(ps.transpose((0, 2, 1))).transpose( + (0, 2, 1)) + e_embs = self.energy_embed(es.transpose((0, 2, 1))).transpose( + (0, 2, 1)) + hs = hs + e_embs + p_embs + + # (B, Lmax, adim) + hs = self.length_regulator(hs, ds, is_inference=False) + + # forward decoder + if olens is not None and not is_inference: + if self.reduction_factor > 1: + olens_in = paddle.to_tensor( + [olen // self.reduction_factor for olen in olens.numpy()]) + else: + olens_in = olens + # (B, 1, T) + h_masks = self._source_mask(olens_in) + else: + h_masks = None + + if return_after_enc: + return hs, h_masks + # (B, 
Lmax, adim) + zs, _ = self.decoder(hs, h_masks) + # (B, Lmax, odim) + if self.decoder_type == 'cnndecoder': + before_outs = zs + else: + before_outs = self.feat_out(zs).reshape( + (paddle.shape(zs)[0], -1, self.odim)) + + # postnet -> (B, Lmax//r * r, odim) + if self.postnet is None: + after_outs = before_outs + else: + after_outs = before_outs + self.postnet( + before_outs.transpose((0, 2, 1))).transpose((0, 2, 1)) + + return before_outs, after_outs, d_outs, p_outs, e_outs + + def encoder_infer( + self, + text: paddle.Tensor, + alpha: float=1.0, + spk_emb=None, + spk_id=None, + tone_id=None, + ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]: + # input of embedding must be int64 + x = paddle.cast(text, 'int64') + # setup batch axis + ilens = paddle.shape(x)[0] + + xs = x.unsqueeze(0) + + if spk_emb is not None: + spk_emb = spk_emb.unsqueeze(0) + + if tone_id is not None: + tone_id = tone_id.unsqueeze(0) + + # (1, L, odim) + hs, h_masks = self._forward( + xs, + ilens, + is_inference=True, + return_after_enc=True, + alpha=alpha, + spk_emb=spk_emb, + spk_id=spk_id, + tone_id=tone_id) + return hs, h_masks + + def inference( + self, + text: paddle.Tensor, + durations: paddle.Tensor=None, + pitch: paddle.Tensor=None, + energy: paddle.Tensor=None, + alpha: float=1.0, + use_teacher_forcing: bool=False, + spk_emb=None, + spk_id=None, + tone_id=None, + ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]: + """Generate the sequence of features given the sequences of characters. + + Args: + text(Tensor(int64)): Input sequence of characters (T,). + durations(Tensor, optional (int64)): Groundtruth of duration (T,). + pitch(Tensor, optional): Groundtruth of token-averaged pitch (T, 1). + energy(Tensor, optional): Groundtruth of token-averaged energy (T, 1). + alpha(float, optional): Alpha to control the speed. + use_teacher_forcing(bool, optional): Whether to use teacher forcing. + If true, groundtruth of duration, pitch and energy will be used. + spk_emb(Tensor, optional, optional): peaker embedding vector (spk_embed_dim,). (Default value = None) + spk_id(Tensor, optional(int64), optional): Batch of padded spk ids (1,). (Default value = None) + tone_id(Tensor, optional(int64), optional): Batch of padded tone ids (T,). (Default value = None) + + Returns: + + + """ + # input of embedding must be int64 + x = paddle.cast(text, 'int64') + d, p, e = durations, pitch, energy + # setup batch axis + ilens = paddle.shape(x)[0] + + xs = x.unsqueeze(0) + + if spk_emb is not None: + spk_emb = spk_emb.unsqueeze(0) + + if tone_id is not None: + tone_id = tone_id.unsqueeze(0) + + if use_teacher_forcing: + # use groundtruth of duration, pitch, and energy + ds = d.unsqueeze(0) if d is not None else None + ps = p.unsqueeze(0) if p is not None else None + es = e.unsqueeze(0) if e is not None else None + + # (1, L, odim) + _, outs, d_outs, p_outs, e_outs = self._forward( + xs, + ilens, + ds=ds, + ps=ps, + es=es, + spk_emb=spk_emb, + spk_id=spk_id, + tone_id=tone_id, + is_inference=True) + else: + # (1, L, odim) + _, outs, d_outs, p_outs, e_outs = self._forward( + xs, + ilens, + is_inference=True, + alpha=alpha, + spk_emb=spk_emb, + spk_id=spk_id, + tone_id=tone_id) + return outs[0], d_outs[0], p_outs[0], e_outs[0] + + def _integrate_with_spk_embed(self, hs, spk_emb): + """Integrate speaker embedding with hidden states. + + Args: + hs(Tensor): Batch of hidden state sequences (B, Tmax, adim). + spk_emb(Tensor): Batch of speaker embeddings (B, spk_embed_dim). 
+
+        Returns:
+            Tensor: Batch of integrated hidden state sequences (B, Tmax, adim).
+
+        """
+        if self.spk_embed_integration_type == "add":
+            # apply projection and then add to hidden states
+            spk_emb = self.spk_projection(F.normalize(spk_emb))
+            hs = hs + spk_emb.unsqueeze(1)
+        elif self.spk_embed_integration_type == "concat":
+            # concat hidden states with spk embeds and then apply projection
+            spk_emb = F.normalize(spk_emb).unsqueeze(1).expand(
+                shape=[-1, paddle.shape(hs)[1], -1])
+            hs = self.spk_projection(paddle.concat([hs, spk_emb], axis=-1))
+        else:
+            raise NotImplementedError("support only add or concat.")
+
+        return hs
+
+    def _integrate_with_tone_embed(self, hs, tone_embs):
+        """Integrate tone embedding with hidden states.
+
+        Args:
+            hs(Tensor): Batch of hidden state sequences (B, Tmax, adim).
+            tone_embs(Tensor): Batch of tone embeddings (B, Tmax, tone_embed_dim).
+
+        Returns:
+            Tensor: Batch of integrated hidden state sequences (B, Tmax, adim).
+
+        """
+        if self.tone_embed_integration_type == "add":
+            # apply projection and then add to hidden states
+            tone_embs = self.tone_projection(F.normalize(tone_embs))
+            hs = hs + tone_embs
+
+        elif self.tone_embed_integration_type == "concat":
+            # concat hidden states with tone embeds and then apply projection
+            tone_embs = F.normalize(tone_embs).expand(
+                shape=[-1, hs.shape[1], -1])
+            hs = self.tone_projection(paddle.concat([hs, tone_embs], axis=-1))
+        else:
+            raise NotImplementedError("support only add or concat.")
+        return hs
+
+    def _source_mask(self, ilens: paddle.Tensor) -> paddle.Tensor:
+        """Make masks for self-attention.
+
+        Args:
+            ilens(Tensor): Batch of lengths (B,).
+
+        Returns:
+            Tensor: Mask tensor for self-attention. dtype=paddle.bool
+
+        Examples:
+            >>> ilens = [5, 3]
+            >>> self._source_mask(ilens)
+            tensor([[[1, 1, 1, 1, 1],
+                     [1, 1, 1, 0, 0]]]) bool
+        """
+        x_masks = make_non_pad_mask(ilens)
+        return x_masks.unsqueeze(-2)
+
+    def _reset_parameters(self, init_enc_alpha: float, init_dec_alpha: float):
+
+        # initialize alpha in scaled positional encoding
+        if self.encoder_type == "transformer" and self.use_scaled_pos_enc:
+            init_enc_alpha = paddle.to_tensor(init_enc_alpha)
+            self.encoder.embed[-1].alpha = paddle.create_parameter(
+                shape=init_enc_alpha.shape,
+                dtype=str(init_enc_alpha.numpy().dtype),
+                default_initializer=paddle.nn.initializer.Assign(
+                    init_enc_alpha))
+        if self.decoder_type == "transformer" and self.use_scaled_pos_enc:
+            init_dec_alpha = paddle.to_tensor(init_dec_alpha)
+            self.decoder.embed[-1].alpha = paddle.create_parameter(
+                shape=init_dec_alpha.shape,
+                dtype=str(init_dec_alpha.numpy().dtype),
+                default_initializer=paddle.nn.initializer.Assign(
+                    init_dec_alpha))
+
+
+class FastSpeech2Inference(nn.Layer):
+    def __init__(self, normalizer, model):
+        super().__init__()
+        self.normalizer = normalizer
+        self.acoustic_model = model
+
+    def forward(self, text, spk_id=None, spk_emb=None):
+        normalized_mel, d_outs, p_outs, e_outs = self.acoustic_model.inference(
+            text, spk_id=spk_id, spk_emb=spk_emb)
+        logmel = self.normalizer.inverse(normalized_mel)
+        return logmel
+
+
+class StyleFastSpeech2Inference(FastSpeech2Inference):
+    def __init__(self,
+                 normalizer,
+                 model,
+                 pitch_stats_path=None,
+                 energy_stats_path=None):
+        super().__init__(normalizer, model)
+        if pitch_stats_path:
+            pitch_mean, pitch_std = np.load(pitch_stats_path)
+            self.pitch_mean = paddle.to_tensor(pitch_mean)
+            self.pitch_std = paddle.to_tensor(pitch_std)
+        if energy_stats_path:
+            energy_mean, energy_std = np.load(energy_stats_path)
+            self.energy_mean = paddle.to_tensor(energy_mean)
+            self.energy_std = paddle.to_tensor(energy_std)
+
+    def denorm(self, data, mean,
std): + return data * std + mean + + def norm(self, data, mean, std): + return (data - mean) / std + + def forward(self, + text: paddle.Tensor, + durations: Union[paddle.Tensor, np.ndarray]=None, + durations_scale: Union[int, float]=None, + durations_bias: Union[int, float]=None, + pitch: Union[paddle.Tensor, np.ndarray]=None, + pitch_scale: Union[int, float]=None, + pitch_bias: Union[int, float]=None, + energy: Union[paddle.Tensor, np.ndarray]=None, + energy_scale: Union[int, float]=None, + energy_bias: Union[int, float]=None, + robot: bool=False, + spk_emb=None, + spk_id=None): + """ + + Args: + text(Tensor(int64)): Input sequence of characters (T,). + durations(paddle.Tensor/np.ndarray, optional (int64)): Groundtruth of duration (T,), this will overwrite the set of durations_scale and durations_bias + durations_scale(int/float, optional): + durations_bias(int/float, optional): + pitch(paddle.Tensor/np.ndarray, optional): Groundtruth of token-averaged pitch (T, 1), this will overwrite the set of pitch_scale and pitch_bias + pitch_scale(int/float, optional): In denormed HZ domain. + pitch_bias(int/float, optional): In denormed HZ domain. + energy(paddle.Tensor/np.ndarray, optional): Groundtruth of token-averaged energy (T, 1), this will overwrite the set of energy_scale and energy_bias + energy_scale(int/float, optional): In denormed domain. + energy_bias(int/float, optional): In denormed domain. + robot: bool: (Default value = False) + spk_emb: (Default value = None) + spk_id: (Default value = None) + + Returns: + Tensor: logmel + + """ + normalized_mel, d_outs, p_outs, e_outs = self.acoustic_model.inference( + text, + durations=None, + pitch=None, + energy=None, + spk_emb=spk_emb, + spk_id=spk_id) + # priority: groundtruth > scale/bias > previous output + # set durations + if isinstance(durations, np.ndarray): + durations = paddle.to_tensor(durations) + elif isinstance(durations, paddle.Tensor): + durations = durations + elif durations_scale or durations_bias: + durations_scale = durations_scale if durations_scale is not None else 1 + durations_bias = durations_bias if durations_bias is not None else 0 + durations = durations_scale * d_outs + durations_bias + else: + durations = d_outs + + if robot: + # set normed pitch to zeros have the same effect with set denormd ones to mean + pitch = paddle.zeros(p_outs.shape) + + # set pitch, can overwrite robot set + if isinstance(pitch, np.ndarray): + pitch = paddle.to_tensor(pitch) + elif isinstance(pitch, paddle.Tensor): + pitch = pitch + elif pitch_scale or pitch_bias: + pitch_scale = pitch_scale if pitch_scale is not None else 1 + pitch_bias = pitch_bias if pitch_bias is not None else 0 + p_Hz = paddle.exp( + self.denorm(p_outs, self.pitch_mean, self.pitch_std)) + p_HZ = pitch_scale * p_Hz + pitch_bias + pitch = self.norm(paddle.log(p_HZ), self.pitch_mean, self.pitch_std) + else: + pitch = p_outs + + # set energy + if isinstance(energy, np.ndarray): + energy = paddle.to_tensor(energy) + elif isinstance(energy, paddle.Tensor): + energy = energy + elif energy_scale or energy_bias: + energy_scale = energy_scale if energy_scale is not None else 1 + energy_bias = energy_bias if energy_bias is not None else 0 + e_dnorm = self.denorm(e_outs, self.energy_mean, self.energy_std) + e_dnorm = energy_scale * e_dnorm + energy_bias + energy = self.norm(e_dnorm, self.energy_mean, self.energy_std) + else: + energy = e_outs + + normalized_mel, d_outs, p_outs, e_outs = self.acoustic_model.inference( + text, + durations=durations, + pitch=pitch, + 
energy=energy, + use_teacher_forcing=True, + spk_emb=spk_emb, + spk_id=spk_id) + + logmel = self.normalizer.inverse(normalized_mel) + return logmel + + +class FastSpeech2Loss(nn.Layer): + """Loss function module for FastSpeech2.""" + + def __init__(self, use_masking: bool=True, + use_weighted_masking: bool=False): + """Initialize feed-forward Transformer loss module. + Args: + use_masking (bool): Whether to apply masking for padded part in loss calculation. + use_weighted_masking (bool): Whether to weighted masking in loss calculation. + """ + assert check_argument_types() + super().__init__() + + assert (use_masking != use_weighted_masking) or not use_masking + self.use_masking = use_masking + self.use_weighted_masking = use_weighted_masking + + # define criterions + reduction = "none" if self.use_weighted_masking else "mean" + self.l1_criterion = nn.L1Loss(reduction=reduction) + self.mse_criterion = nn.MSELoss(reduction=reduction) + self.duration_criterion = DurationPredictorLoss(reduction=reduction) + + def forward( + self, + after_outs: paddle.Tensor, + before_outs: paddle.Tensor, + d_outs: paddle.Tensor, + p_outs: paddle.Tensor, + e_outs: paddle.Tensor, + ys: paddle.Tensor, + ds: paddle.Tensor, + ps: paddle.Tensor, + es: paddle.Tensor, + ilens: paddle.Tensor, + olens: paddle.Tensor, + ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor, paddle.Tensor]: + """Calculate forward propagation. + + Args: + after_outs(Tensor): Batch of outputs after postnets (B, Lmax, odim). + before_outs(Tensor): Batch of outputs before postnets (B, Lmax, odim). + d_outs(Tensor): Batch of outputs of duration predictor (B, Tmax). + p_outs(Tensor): Batch of outputs of pitch predictor (B, Tmax, 1). + e_outs(Tensor): Batch of outputs of energy predictor (B, Tmax, 1). + ys(Tensor): Batch of target features (B, Lmax, odim). + ds(Tensor): Batch of durations (B, Tmax). + ps(Tensor): Batch of target token-averaged pitch (B, Tmax, 1). + es(Tensor): Batch of target token-averaged energy (B, Tmax, 1). + ilens(Tensor): Batch of the lengths of each input (B,). + olens(Tensor): Batch of the lengths of each target (B,). 
+ + Returns: + + + """ + # apply mask to remove padded part + if self.use_masking: + out_masks = make_non_pad_mask(olens).unsqueeze(-1) + before_outs = before_outs.masked_select( + out_masks.broadcast_to(before_outs.shape)) + if after_outs is not None: + after_outs = after_outs.masked_select( + out_masks.broadcast_to(after_outs.shape)) + ys = ys.masked_select(out_masks.broadcast_to(ys.shape)) + duration_masks = make_non_pad_mask(ilens) + d_outs = d_outs.masked_select( + duration_masks.broadcast_to(d_outs.shape)) + ds = ds.masked_select(duration_masks.broadcast_to(ds.shape)) + pitch_masks = make_non_pad_mask(ilens).unsqueeze(-1) + p_outs = p_outs.masked_select( + pitch_masks.broadcast_to(p_outs.shape)) + e_outs = e_outs.masked_select( + pitch_masks.broadcast_to(e_outs.shape)) + ps = ps.masked_select(pitch_masks.broadcast_to(ps.shape)) + es = es.masked_select(pitch_masks.broadcast_to(es.shape)) + + # calculate loss + l1_loss = self.l1_criterion(before_outs, ys) + if after_outs is not None: + l1_loss += self.l1_criterion(after_outs, ys) + duration_loss = self.duration_criterion(d_outs, ds) + pitch_loss = self.mse_criterion(p_outs, ps) + energy_loss = self.mse_criterion(e_outs, es) + + # make weighted mask and apply it + if self.use_weighted_masking: + out_masks = make_non_pad_mask(olens).unsqueeze(-1) + out_weights = out_masks.cast(dtype=paddle.float32) / out_masks.cast( + dtype=paddle.float32).sum( + axis=1, keepdim=True) + out_weights /= ys.shape[0] * ys.shape[2] + duration_masks = make_non_pad_mask(ilens) + duration_weights = (duration_masks.cast(dtype=paddle.float32) / + duration_masks.cast(dtype=paddle.float32).sum( + axis=1, keepdim=True)) + duration_weights /= ds.shape[0] + + # apply weight + + l1_loss = l1_loss.multiply(out_weights) + l1_loss = l1_loss.masked_select( + out_masks.broadcast_to(l1_loss.shape)).sum() + duration_loss = (duration_loss.multiply(duration_weights) + .masked_select(duration_masks).sum()) + pitch_masks = duration_masks.unsqueeze(-1) + pitch_weights = duration_weights.unsqueeze(-1) + pitch_loss = pitch_loss.multiply(pitch_weights) + pitch_loss = pitch_loss.masked_select( + pitch_masks.broadcast_to(pitch_loss.shape)).sum() + energy_loss = energy_loss.multiply(pitch_weights) + energy_loss = energy_loss.masked_select( + pitch_masks.broadcast_to(energy_loss.shape)).sum() + + return l1_loss, duration_loss, pitch_loss, energy_loss diff --git a/ernie-sat/paddlespeech/t2s/models/fastspeech2/fastspeech2_updater.py b/ernie-sat/paddlespeech/t2s/models/fastspeech2/fastspeech2_updater.py new file mode 100644 index 0000000000000000000000000000000000000000..92aa9dfc7730beb377aa36333a55b9133f378b0f --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/models/fastspeech2/fastspeech2_updater.py @@ -0,0 +1,174 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
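+# This module defines the training-loop hooks for FastSpeech2:
+# - FastSpeech2Updater.update_core runs one optimization step: it calls the
+#   acoustic model, computes the L1 (mel), duration, pitch and energy losses
+#   with FastSpeech2Loss, sums them into a single scalar and steps the
+#   optimizer.
+# - FastSpeech2Evaluator.evaluate_core computes the same losses on validation
+#   batches without any parameter update and reports them under the eval/
+#   prefix.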
+import logging +from pathlib import Path + +from paddle import distributed as dist +from paddle.io import DataLoader +from paddle.nn import Layer +from paddle.optimizer import Optimizer + +from paddlespeech.t2s.models.fastspeech2 import FastSpeech2Loss +from paddlespeech.t2s.training.extensions.evaluator import StandardEvaluator +from paddlespeech.t2s.training.reporter import report +from paddlespeech.t2s.training.updaters.standard_updater import StandardUpdater +logging.basicConfig( + format='%(asctime)s [%(levelname)s] [%(filename)s:%(lineno)d] %(message)s', + datefmt='[%Y-%m-%d %H:%M:%S]') +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + + +class FastSpeech2Updater(StandardUpdater): + def __init__(self, + model: Layer, + optimizer: Optimizer, + dataloader: DataLoader, + init_state=None, + use_masking: bool=False, + use_weighted_masking: bool=False, + output_dir: Path=None): + super().__init__(model, optimizer, dataloader, init_state=None) + + self.criterion = FastSpeech2Loss( + use_masking=use_masking, use_weighted_masking=use_weighted_masking) + + log_file = output_dir / 'worker_{}.log'.format(dist.get_rank()) + self.filehandler = logging.FileHandler(str(log_file)) + logger.addHandler(self.filehandler) + self.logger = logger + self.msg = "" + + def update_core(self, batch): + self.msg = "Rank: {}, ".format(dist.get_rank()) + losses_dict = {} + # spk_id!=None in multiple spk fastspeech2 + spk_id = batch["spk_id"] if "spk_id" in batch else None + spk_emb = batch["spk_emb"] if "spk_emb" in batch else None + # No explicit speaker identifier labels are used during voice cloning training. + if spk_emb is not None: + spk_id = None + + before_outs, after_outs, d_outs, p_outs, e_outs, ys, olens = self.model( + text=batch["text"], + text_lengths=batch["text_lengths"], + speech=batch["speech"], + speech_lengths=batch["speech_lengths"], + durations=batch["durations"], + pitch=batch["pitch"], + energy=batch["energy"], + spk_id=spk_id, + spk_emb=spk_emb) + + l1_loss, duration_loss, pitch_loss, energy_loss = self.criterion( + after_outs=after_outs, + before_outs=before_outs, + d_outs=d_outs, + p_outs=p_outs, + e_outs=e_outs, + ys=ys, + ds=batch["durations"], + ps=batch["pitch"], + es=batch["energy"], + ilens=batch["text_lengths"], + olens=olens) + + loss = l1_loss + duration_loss + pitch_loss + energy_loss + + optimizer = self.optimizer + optimizer.clear_grad() + loss.backward() + optimizer.step() + + report("train/loss", float(loss)) + report("train/l1_loss", float(l1_loss)) + report("train/duration_loss", float(duration_loss)) + report("train/pitch_loss", float(pitch_loss)) + report("train/energy_loss", float(energy_loss)) + + losses_dict["l1_loss"] = float(l1_loss) + losses_dict["duration_loss"] = float(duration_loss) + losses_dict["pitch_loss"] = float(pitch_loss) + losses_dict["energy_loss"] = float(energy_loss) + losses_dict["loss"] = float(loss) + self.msg += ', '.join('{}: {:>.6f}'.format(k, v) + for k, v in losses_dict.items()) + + +class FastSpeech2Evaluator(StandardEvaluator): + def __init__(self, + model: Layer, + dataloader: DataLoader, + use_masking: bool=False, + use_weighted_masking: bool=False, + output_dir: Path=None): + super().__init__(model, dataloader) + + log_file = output_dir / 'worker_{}.log'.format(dist.get_rank()) + self.filehandler = logging.FileHandler(str(log_file)) + logger.addHandler(self.filehandler) + self.logger = logger + self.msg = "" + + self.criterion = FastSpeech2Loss( + use_masking=use_masking, use_weighted_masking=use_weighted_masking) 
+ + def evaluate_core(self, batch): + self.msg = "Evaluate: " + losses_dict = {} + # spk_id!=None in multiple spk fastspeech2 + spk_id = batch["spk_id"] if "spk_id" in batch else None + spk_emb = batch["spk_emb"] if "spk_emb" in batch else None + if spk_emb is not None: + spk_id = None + + before_outs, after_outs, d_outs, p_outs, e_outs, ys, olens = self.model( + text=batch["text"], + text_lengths=batch["text_lengths"], + speech=batch["speech"], + speech_lengths=batch["speech_lengths"], + durations=batch["durations"], + pitch=batch["pitch"], + energy=batch["energy"], + spk_id=spk_id, + spk_emb=spk_emb) + + l1_loss, duration_loss, pitch_loss, energy_loss = self.criterion( + after_outs=after_outs, + before_outs=before_outs, + d_outs=d_outs, + p_outs=p_outs, + e_outs=e_outs, + ys=ys, + ds=batch["durations"], + ps=batch["pitch"], + es=batch["energy"], + ilens=batch["text_lengths"], + olens=olens, ) + loss = l1_loss + duration_loss + pitch_loss + energy_loss + + report("eval/loss", float(loss)) + report("eval/l1_loss", float(l1_loss)) + report("eval/duration_loss", float(duration_loss)) + report("eval/pitch_loss", float(pitch_loss)) + report("eval/energy_loss", float(energy_loss)) + + losses_dict["l1_loss"] = float(l1_loss) + losses_dict["duration_loss"] = float(duration_loss) + losses_dict["pitch_loss"] = float(pitch_loss) + losses_dict["energy_loss"] = float(energy_loss) + losses_dict["loss"] = float(loss) + self.msg += ', '.join('{}: {:>.6f}'.format(k, v) + for k, v in losses_dict.items()) + self.logger.info(self.msg) diff --git a/ernie-sat/paddlespeech/t2s/models/hifigan/__init__.py b/ernie-sat/paddlespeech/t2s/models/hifigan/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..7aa5e9d780d252c43c0e180278c4906023b73a77 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/models/hifigan/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from .hifigan import * +from .hifigan_updater import * diff --git a/ernie-sat/paddlespeech/t2s/models/hifigan/hifigan.py b/ernie-sat/paddlespeech/t2s/models/hifigan/hifigan.py new file mode 100644 index 0000000000000000000000000000000000000000..ac5ff204fae661dbc159f53970389ce3287a7b9f --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/models/hifigan/hifigan.py @@ -0,0 +1,716 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This code is based on https://github.com/jik876/hifi-gan. 
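+# Overview of the classes defined below (summarized from their implementations):
+# - HiFiGANGenerator upsamples a mel spectrogram with transposed convolutions;
+#   after every upsampling stage the outputs of several residual blocks with
+#   different kernel sizes and dilations are averaged (multi-receptive-field
+#   fusion).
+# - HiFiGANPeriodDiscriminator reshapes the waveform to (B, C, T/P, P) and
+#   applies 2D convolutions; HiFiGANMultiPeriodDiscriminator runs one such
+#   discriminator per period.
+# - HiFiGANScaleDiscriminator stacks grouped 1D convolutions, and
+#   HiFiGANMultiScaleDiscriminator applies it at several average-pooled scales.
+# - HiFiGANMultiScaleMultiPeriodDiscriminator concatenates the outputs of both.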
+import copy +from typing import Any +from typing import Dict +from typing import List + +import paddle +import paddle.nn.functional as F +from paddle import nn + +from paddlespeech.t2s.modules.activation import get_activation +from paddlespeech.t2s.modules.nets_utils import initialize +from paddlespeech.t2s.modules.residual_block import HiFiGANResidualBlock as ResidualBlock + + +class HiFiGANGenerator(nn.Layer): + """HiFiGAN generator module.""" + + def __init__( + self, + in_channels: int=80, + out_channels: int=1, + channels: int=512, + kernel_size: int=7, + upsample_scales: List[int]=(8, 8, 2, 2), + upsample_kernel_sizes: List[int]=(16, 16, 4, 4), + resblock_kernel_sizes: List[int]=(3, 7, 11), + resblock_dilations: List[List[int]]=[(1, 3, 5), (1, 3, 5), + (1, 3, 5)], + use_additional_convs: bool=True, + bias: bool=True, + nonlinear_activation: str="leakyrelu", + nonlinear_activation_params: Dict[str, Any]={"negative_slope": 0.1}, + use_weight_norm: bool=True, + init_type: str="xavier_uniform", ): + """Initialize HiFiGANGenerator module. + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + channels (int): Number of hidden representation channels. + kernel_size (int): Kernel size of initial and final conv layer. + upsample_scales (list): List of upsampling scales. + upsample_kernel_sizes (list): List of kernel sizes for upsampling layers. + resblock_kernel_sizes (list): List of kernel sizes for residual blocks. + resblock_dilations (list): List of dilation list for residual blocks. + use_additional_convs (bool): Whether to use additional conv layers in residual blocks. + bias (bool): Whether to add bias parameter in convolution layers. + nonlinear_activation (str): Activation function module name. + nonlinear_activation_params (dict): Hyperparameters for activation function. + use_weight_norm (bool): Whether to use weight norm. + If set to true, it will be applied to all of the conv layers. + """ + super().__init__() + + # initialize parameters + initialize(self, init_type) + + # check hyperparameters are valid + assert kernel_size % 2 == 1, "Kernel size must be odd number." 
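+        # NOTE: an odd kernel size keeps the "same" padding of
+        # (kernel_size - 1) // 2 used below symmetric on both sides.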
+        assert len(upsample_scales) == len(upsample_kernel_sizes)
+        assert len(resblock_dilations) == len(resblock_kernel_sizes)
+
+        # define modules
+        self.num_upsamples = len(upsample_kernel_sizes)
+        self.num_blocks = len(resblock_kernel_sizes)
+        self.input_conv = nn.Conv1D(
+            in_channels,
+            channels,
+            kernel_size,
+            1,
+            padding=(kernel_size - 1) // 2, )
+        self.upsamples = nn.LayerList()
+        self.blocks = nn.LayerList()
+        for i in range(len(upsample_kernel_sizes)):
+            assert upsample_kernel_sizes[i] == 2 * upsample_scales[i]
+            self.upsamples.append(
+                nn.Sequential(
+                    get_activation(nonlinear_activation, **
+                                   nonlinear_activation_params),
+                    nn.Conv1DTranspose(
+                        channels // (2**i),
+                        channels // (2**(i + 1)),
+                        upsample_kernel_sizes[i],
+                        upsample_scales[i],
+                        padding=upsample_scales[i] // 2 + upsample_scales[i] % 2,
+                        output_padding=upsample_scales[i] % 2, ), ))
+            for j in range(len(resblock_kernel_sizes)):
+                self.blocks.append(
+                    ResidualBlock(
+                        kernel_size=resblock_kernel_sizes[j],
+                        channels=channels // (2**(i + 1)),
+                        dilations=resblock_dilations[j],
+                        bias=bias,
+                        use_additional_convs=use_additional_convs,
+                        nonlinear_activation=nonlinear_activation,
+                        nonlinear_activation_params=nonlinear_activation_params,
+                    ))
+        self.output_conv = nn.Sequential(
+            nn.LeakyReLU(),
+            nn.Conv1D(
+                channels // (2**(i + 1)),
+                out_channels,
+                kernel_size,
+                1,
+                padding=(kernel_size - 1) // 2, ),
+            nn.Tanh(), )
+
+        nn.initializer.set_global_initializer(None)
+
+        # apply weight norm
+        if use_weight_norm:
+            self.apply_weight_norm()
+
+        # reset parameters
+        self.reset_parameters()
+
+    def forward(self, c):
+        """Calculate forward propagation.
+
+        Args:
+            c (Tensor): Input tensor (B, in_channels, T).
+        Returns:
+            Tensor: Output tensor (B, out_channels, T).
+        """
+        c = self.input_conv(c)
+        for i in range(self.num_upsamples):
+            c = self.upsamples[i](c)
+            # initialize
+            cs = 0.0
+            for j in range(self.num_blocks):
+                cs += self.blocks[i * self.num_blocks + j](c)
+            c = cs / self.num_blocks
+        c = self.output_conv(c)
+
+        return c
+
+    def reset_parameters(self):
+        """Reset parameters.
+        This initialization follows official implementation manner.
+        https://github.com/jik876/hifi-gan/blob/master/models.py
+        """
+        # define a normal distribution with float parameters (mean 0.0, std 0.01)
+        dist = paddle.distribution.Normal(loc=0.0, scale=0.01)
+
+        def _reset_parameters(m):
+            if isinstance(m, nn.Conv1D) or isinstance(m, nn.Conv1DTranspose):
+                w = dist.sample(m.weight.shape)
+                m.weight.set_value(w)
+
+        self.apply(_reset_parameters)
+
+    def apply_weight_norm(self):
+        """Recursively apply weight normalization to all the Convolution layers
+        in the sublayers.
+        """
+
+        def _apply_weight_norm(layer):
+            if isinstance(layer, (nn.Conv1D, nn.Conv2D, nn.Conv1DTranspose)):
+                nn.utils.weight_norm(layer)
+
+        self.apply(_apply_weight_norm)
+
+    def remove_weight_norm(self):
+        """Recursively remove weight normalization from all the Convolution
+        layers in the sublayers.
+        """
+
+        def _remove_weight_norm(layer):
+            try:
+                nn.utils.remove_weight_norm(layer)
+            except ValueError:
+                pass
+
+        self.apply(_remove_weight_norm)
+
+    def inference(self, c):
+        """Perform inference.
+        Args:
+            c (Tensor): Input tensor (T, in_channels).
+        Returns:
+            Tensor:
+                Output tensor (T ** prod(upsample_scales), out_channels).
+ """ + c = self.forward(c.transpose([1, 0]).unsqueeze(0)) + return c.squeeze(0).transpose([1, 0]) + + +class HiFiGANPeriodDiscriminator(nn.Layer): + """HiFiGAN period discriminator module.""" + + def __init__( + self, + in_channels: int=1, + out_channels: int=1, + period: int=3, + kernel_sizes: List[int]=[5, 3], + channels: int=32, + downsample_scales: List[int]=[3, 3, 3, 3, 1], + max_downsample_channels: int=1024, + bias: bool=True, + nonlinear_activation: str="leakyrelu", + nonlinear_activation_params: Dict[str, Any]={"negative_slope": 0.1}, + use_weight_norm: bool=True, + use_spectral_norm: bool=False, + init_type: str="xavier_uniform", ): + """Initialize HiFiGANPeriodDiscriminator module. + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + period (int): Period. + kernel_sizes (list): Kernel sizes of initial conv layers and the final conv layer. + channels (int): Number of initial channels. + downsample_scales (list): List of downsampling scales. + max_downsample_channels (int): Number of maximum downsampling channels. + use_additional_convs (bool): Whether to use additional conv layers in residual blocks. + bias (bool): Whether to add bias parameter in convolution layers. + nonlinear_activation (str): Activation function module name. + nonlinear_activation_params (dict): Hyperparameters for activation function. + use_weight_norm (bool): Whether to use weight norm. + If set to true, it will be applied to all of the conv layers. + use_spectral_norm (bool): Whether to use spectral norm. + If set to true, it will be applied to all of the conv layers. + """ + super().__init__() + + # initialize parameters + initialize(self, init_type) + + assert len(kernel_sizes) == 2 + assert kernel_sizes[0] % 2 == 1, "Kernel size must be odd number." + assert kernel_sizes[1] % 2 == 1, "Kernel size must be odd number." + + self.period = period + self.convs = nn.LayerList() + in_chs = in_channels + out_chs = channels + for downsample_scale in downsample_scales: + self.convs.append( + nn.Sequential( + nn.Conv2D( + in_chs, + out_chs, + (kernel_sizes[0], 1), + (downsample_scale, 1), + padding=((kernel_sizes[0] - 1) // 2, 0), ), + get_activation(nonlinear_activation, ** + nonlinear_activation_params), )) + in_chs = out_chs + # NOTE: Use downsample_scale + 1? + out_chs = min(out_chs * 4, max_downsample_channels) + self.output_conv = nn.Conv2D( + out_chs, + out_channels, + (kernel_sizes[1] - 1, 1), + 1, + padding=((kernel_sizes[1] - 1) // 2, 0), ) + + if use_weight_norm and use_spectral_norm: + raise ValueError("Either use use_weight_norm or use_spectral_norm.") + + # apply weight norm + if use_weight_norm: + self.apply_weight_norm() + + # apply spectral norm + if use_spectral_norm: + self.apply_spectral_norm() + + def forward(self, x): + """Calculate forward propagation. + + Args: + c (Tensor): Input tensor (B, in_channels, T). + Returns: + list: List of each layer's tensors. + """ + # transform 1d to 2d -> (B, C, T/P, P) + b, c, t = paddle.shape(x) + if t % self.period != 0: + n_pad = self.period - (t % self.period) + x = F.pad(x, (0, n_pad), "reflect", data_format="NCL") + t += n_pad + x = x.reshape([b, c, t // self.period, self.period]) + + # forward conv + outs = [] + for layer in self.convs: + x = layer(x) + outs += [x] + x = self.output_conv(x) + x = paddle.flatten(x, 1, -1) + outs += [x] + + return outs + + def apply_weight_norm(self): + """Recursively apply weight normalization to all the Convolution layers + in the sublayers. 
+ """ + + def _apply_weight_norm(layer): + if isinstance(layer, (nn.Conv1D, nn.Conv2D, nn.Conv1DTranspose)): + nn.utils.weight_norm(layer) + + self.apply(_apply_weight_norm) + + def apply_spectral_norm(self): + """Apply spectral normalization module from all of the layers.""" + + def _apply_spectral_norm(m): + if isinstance(m, nn.Conv2D): + nn.utils.spectral_norm(m) + + self.apply(_apply_spectral_norm) + + +class HiFiGANMultiPeriodDiscriminator(nn.Layer): + """HiFiGAN multi-period discriminator module.""" + + def __init__( + self, + periods: List[int]=[2, 3, 5, 7, 11], + discriminator_params: Dict[str, Any]={ + "in_channels": 1, + "out_channels": 1, + "kernel_sizes": [5, 3], + "channels": 32, + "downsample_scales": [3, 3, 3, 3, 1], + "max_downsample_channels": 1024, + "bias": True, + "nonlinear_activation": "leakyrelu", + "nonlinear_activation_params": { + "negative_slope": 0.1 + }, + "use_weight_norm": True, + "use_spectral_norm": False, + }, + init_type: str="xavier_uniform", ): + """Initialize HiFiGANMultiPeriodDiscriminator module. + + Args: + periods (list): List of periods. + discriminator_params (dict): Parameters for hifi-gan period discriminator module. + The period parameter will be overwritten. + """ + super().__init__() + # initialize parameters + initialize(self, init_type) + + self.discriminators = nn.LayerList() + for period in periods: + params = copy.deepcopy(discriminator_params) + params["period"] = period + self.discriminators.append(HiFiGANPeriodDiscriminator(**params)) + + def forward(self, x): + """Calculate forward propagation. + + Args: + x (Tensor): Input noise signal (B, 1, T). + Returns: + List: List of list of each discriminator outputs, which consists of each layer output tensors. + """ + outs = [] + for f in self.discriminators: + outs += [f(x)] + + return outs + + +class HiFiGANScaleDiscriminator(nn.Layer): + """HiFi-GAN scale discriminator module.""" + + def __init__( + self, + in_channels: int=1, + out_channels: int=1, + kernel_sizes: List[int]=[15, 41, 5, 3], + channels: int=128, + max_downsample_channels: int=1024, + max_groups: int=16, + bias: bool=True, + downsample_scales: List[int]=[2, 2, 4, 4, 1], + nonlinear_activation: str="leakyrelu", + nonlinear_activation_params: Dict[str, Any]={"negative_slope": 0.1}, + use_weight_norm: bool=True, + use_spectral_norm: bool=False, + init_type: str="xavier_uniform", ): + """Initilize HiFiGAN scale discriminator module. + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + kernel_sizes (list): List of four kernel sizes. The first will be used for the first conv layer, + and the second is for downsampling part, and the remaining two are for output layers. + channels (int): Initial number of channels for conv layer. + max_downsample_channels (int): Maximum number of channels for downsampling layers. + bias (bool): Whether to add bias parameter in convolution layers. + downsample_scales (list): List of downsampling scales. + nonlinear_activation (str): Activation function module name. + nonlinear_activation_params (dict): Hyperparameters for activation function. + use_weight_norm (bool): Whether to use weight norm. + If set to true, it will be applied to all of the conv layers. + use_spectral_norm (bool): Whether to use spectral norm. + If set to true, it will be applied to all of the conv layers. 
+ """ + super().__init__() + + # initialize parameters + initialize(self, init_type) + + self.layers = nn.LayerList() + + # check kernel size is valid + assert len(kernel_sizes) == 4 + for ks in kernel_sizes: + assert ks % 2 == 1 + + # add first layer + self.layers.append( + nn.Sequential( + nn.Conv1D( + in_channels, + channels, + # NOTE: Use always the same kernel size + kernel_sizes[0], + bias_attr=bias, + padding=(kernel_sizes[0] - 1) // 2, ), + get_activation(nonlinear_activation, ** + nonlinear_activation_params), )) + + # add downsample layers + in_chs = channels + out_chs = channels + # NOTE(kan-bayashi): Remove hard coding? + groups = 4 + for downsample_scale in downsample_scales: + self.layers.append( + nn.Sequential( + nn.Conv1D( + in_chs, + out_chs, + kernel_size=kernel_sizes[1], + stride=downsample_scale, + padding=(kernel_sizes[1] - 1) // 2, + groups=groups, + bias_attr=bias, ), + get_activation(nonlinear_activation, ** + nonlinear_activation_params), )) + in_chs = out_chs + # NOTE: Remove hard coding? + out_chs = min(in_chs * 2, max_downsample_channels) + # NOTE: Remove hard coding? + groups = min(groups * 4, max_groups) + + # add final layers + out_chs = min(in_chs * 2, max_downsample_channels) + self.layers.append( + nn.Sequential( + nn.Conv1D( + in_chs, + out_chs, + kernel_size=kernel_sizes[2], + stride=1, + padding=(kernel_sizes[2] - 1) // 2, + bias_attr=bias, ), + get_activation(nonlinear_activation, ** + nonlinear_activation_params), )) + self.layers.append( + nn.Conv1D( + out_chs, + out_channels, + kernel_size=kernel_sizes[3], + stride=1, + padding=(kernel_sizes[3] - 1) // 2, + bias_attr=bias, ), ) + + if use_weight_norm and use_spectral_norm: + raise ValueError("Either use use_weight_norm or use_spectral_norm.") + + # apply weight norm + if use_weight_norm: + self.apply_weight_norm() + + # apply spectral norm + if use_spectral_norm: + self.apply_spectral_norm() + + def forward(self, x): + """Calculate forward propagation. + + Args: + x (Tensor): Input noise signal (B, 1, T). + Returns: + List: List of output tensors of each layer. + """ + outs = [] + for f in self.layers: + x = f(x) + outs += [x] + + return outs + + def apply_weight_norm(self): + """Recursively apply weight normalization to all the Convolution layers + in the sublayers. + """ + + def _apply_weight_norm(layer): + if isinstance(layer, (nn.Conv1D, nn.Conv2D, nn.Conv1DTranspose)): + nn.utils.weight_norm(layer) + + self.apply(_apply_weight_norm) + + def apply_spectral_norm(self): + """Apply spectral normalization module from all of the layers.""" + + def _apply_spectral_norm(m): + if isinstance(m, nn.Conv2D): + nn.utils.spectral_norm(m) + + self.apply(_apply_spectral_norm) + + +class HiFiGANMultiScaleDiscriminator(nn.Layer): + """HiFi-GAN multi-scale discriminator module.""" + + def __init__( + self, + scales: int=3, + downsample_pooling: str="AvgPool1D", + # follow the official implementation setting + downsample_pooling_params: Dict[str, Any]={ + "kernel_size": 4, + "stride": 2, + "padding": 2, + }, + discriminator_params: Dict[str, Any]={ + "in_channels": 1, + "out_channels": 1, + "kernel_sizes": [15, 41, 5, 3], + "channels": 128, + "max_downsample_channels": 1024, + "max_groups": 16, + "bias": True, + "downsample_scales": [2, 2, 4, 4, 1], + "nonlinear_activation": "leakyrelu", + "nonlinear_activation_params": { + "negative_slope": 0.1 + }, + }, + follow_official_norm: bool=False, + init_type: str="xavier_uniform", ): + """Initilize HiFiGAN multi-scale discriminator module. 
+ + Args: + scales (int): Number of multi-scales. + downsample_pooling (str): Pooling module name for downsampling of the inputs. + downsample_pooling_params (dict): Parameters for the above pooling module. + discriminator_params (dict): Parameters for hifi-gan scale discriminator module. + follow_official_norm (bool): Whether to follow the norm setting of the official + implementaion. The first discriminator uses spectral norm and the other discriminators use weight norm. + """ + super().__init__() + + # initialize parameters + initialize(self, init_type) + + self.discriminators = nn.LayerList() + + # add discriminators + for i in range(scales): + params = copy.deepcopy(discriminator_params) + if follow_official_norm: + if i == 0: + params["use_weight_norm"] = False + params["use_spectral_norm"] = True + else: + params["use_weight_norm"] = True + params["use_spectral_norm"] = False + self.discriminators.append(HiFiGANScaleDiscriminator(**params)) + self.pooling = getattr(nn, downsample_pooling)( + **downsample_pooling_params) + + def forward(self, x): + """Calculate forward propagation. + + Args: + x (Tensor): Input noise signal (B, 1, T). + Returns: + List: List of list of each discriminator outputs, which consists of each layer output tensors. + """ + outs = [] + for f in self.discriminators: + outs += [f(x)] + x = self.pooling(x) + + return outs + + +class HiFiGANMultiScaleMultiPeriodDiscriminator(nn.Layer): + """HiFi-GAN multi-scale + multi-period discriminator module.""" + + def __init__( + self, + # Multi-scale discriminator related + scales: int=3, + scale_downsample_pooling: str="AvgPool1D", + scale_downsample_pooling_params: Dict[str, Any]={ + "kernel_size": 4, + "stride": 2, + "padding": 2, + }, + scale_discriminator_params: Dict[str, Any]={ + "in_channels": 1, + "out_channels": 1, + "kernel_sizes": [15, 41, 5, 3], + "channels": 128, + "max_downsample_channels": 1024, + "max_groups": 16, + "bias": True, + "downsample_scales": [2, 2, 4, 4, 1], + "nonlinear_activation": "leakyrelu", + "nonlinear_activation_params": { + "negative_slope": 0.1 + }, + }, + follow_official_norm: bool=True, + # Multi-period discriminator related + periods: List[int]=[2, 3, 5, 7, 11], + period_discriminator_params: Dict[str, Any]={ + "in_channels": 1, + "out_channels": 1, + "kernel_sizes": [5, 3], + "channels": 32, + "downsample_scales": [3, 3, 3, 3, 1], + "max_downsample_channels": 1024, + "bias": True, + "nonlinear_activation": "leakyrelu", + "nonlinear_activation_params": { + "negative_slope": 0.1 + }, + "use_weight_norm": True, + "use_spectral_norm": False, + }, + init_type: str="xavier_uniform", ): + """Initilize HiFiGAN multi-scale + multi-period discriminator module. + + Args: + scales (int): Number of multi-scales. + scale_downsample_pooling (str): Pooling module name for downsampling of the inputs. + scale_downsample_pooling_params (dict): Parameters for the above pooling module. + scale_discriminator_params (dict): Parameters for hifi-gan scale discriminator module. + follow_official_norm (bool): Whether to follow the norm setting of the official implementaion. + The first discriminator uses spectral norm and the other discriminators use weight norm. + periods (list): List of periods. + period_discriminator_params (dict): Parameters for hifi-gan period discriminator module. + The period parameter will be overwritten. 
+ """ + super().__init__() + + # initialize parameters + initialize(self, init_type) + + self.msd = HiFiGANMultiScaleDiscriminator( + scales=scales, + downsample_pooling=scale_downsample_pooling, + downsample_pooling_params=scale_downsample_pooling_params, + discriminator_params=scale_discriminator_params, + follow_official_norm=follow_official_norm, ) + self.mpd = HiFiGANMultiPeriodDiscriminator( + periods=periods, + discriminator_params=period_discriminator_params, ) + + def forward(self, x): + """Calculate forward propagation. + + Args: + x (Tensor): Input noise signal (B, 1, T). + Returns: + List: + List of list of each discriminator outputs, + which consists of each layer output tensors. + Multi scale and multi period ones are concatenated. + """ + msd_outs = self.msd(x) + mpd_outs = self.mpd(x) + return msd_outs + mpd_outs + + +class HiFiGANInference(nn.Layer): + def __init__(self, normalizer, hifigan_generator): + super().__init__() + self.normalizer = normalizer + self.hifigan_generator = hifigan_generator + + def forward(self, logmel): + normalized_mel = self.normalizer(logmel) + wav = self.hifigan_generator.inference(normalized_mel) + return wav diff --git a/ernie-sat/paddlespeech/t2s/models/hifigan/hifigan_updater.py b/ernie-sat/paddlespeech/t2s/models/hifigan/hifigan_updater.py new file mode 100644 index 0000000000000000000000000000000000000000..f12c666fd3a3ab08fa404466ccac39affcf8f43e --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/models/hifigan/hifigan_updater.py @@ -0,0 +1,247 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
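+# Training schedule implemented by HiFiGANUpdater.update_core below: the
+# generator is optimized with the mel-spectrogram loss (weighted by lambda_aux)
+# once iteration > generator_train_start_steps; after
+# discriminator_train_start_steps the adversarial and feature-matching terms
+# (weighted by lambda_adv and lambda_feat_match) are added to the generator
+# loss and the discriminator starts receiving its own real/fake loss updates.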
+import logging +from typing import Dict + +import paddle +from paddle import distributed as dist +from paddle.io import DataLoader +from paddle.nn import Layer +from paddle.optimizer import Optimizer +from paddle.optimizer.lr import LRScheduler + +from paddlespeech.t2s.training.extensions.evaluator import StandardEvaluator +from paddlespeech.t2s.training.reporter import report +from paddlespeech.t2s.training.updaters.standard_updater import StandardUpdater +from paddlespeech.t2s.training.updaters.standard_updater import UpdaterState +logging.basicConfig( + format='%(asctime)s [%(levelname)s] [%(filename)s:%(lineno)d] %(message)s', + datefmt='[%Y-%m-%d %H:%M:%S]') +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + + +class HiFiGANUpdater(StandardUpdater): + def __init__(self, + models: Dict[str, Layer], + optimizers: Dict[str, Optimizer], + criterions: Dict[str, Layer], + schedulers: Dict[str, LRScheduler], + dataloader: DataLoader, + generator_train_start_steps: int=0, + discriminator_train_start_steps: int=100000, + lambda_adv: float=1.0, + lambda_aux: float=1.0, + lambda_feat_match: float=1.0, + output_dir=None): + self.models = models + self.generator: Layer = models['generator'] + self.discriminator: Layer = models['discriminator'] + + self.optimizers = optimizers + self.optimizer_g: Optimizer = optimizers['generator'] + self.optimizer_d: Optimizer = optimizers['discriminator'] + + self.criterions = criterions + self.criterion_feat_match = criterions['feat_match'] + self.criterion_mel = criterions['mel'] + + self.criterion_gen_adv = criterions["gen_adv"] + self.criterion_dis_adv = criterions["dis_adv"] + + self.schedulers = schedulers + self.scheduler_g = schedulers['generator'] + self.scheduler_d = schedulers['discriminator'] + + self.dataloader = dataloader + + self.generator_train_start_steps = generator_train_start_steps + self.discriminator_train_start_steps = discriminator_train_start_steps + self.lambda_adv = lambda_adv + self.lambda_aux = lambda_aux + self.lambda_feat_match = lambda_feat_match + + self.state = UpdaterState(iteration=0, epoch=0) + self.train_iterator = iter(self.dataloader) + + log_file = output_dir / 'worker_{}.log'.format(dist.get_rank()) + self.filehandler = logging.FileHandler(str(log_file)) + logger.addHandler(self.filehandler) + self.logger = logger + self.msg = "" + + def update_core(self, batch): + self.msg = "Rank: {}, ".format(dist.get_rank()) + losses_dict = {} + # parse batch + wav, mel = batch + + # Generator + if self.state.iteration > self.generator_train_start_steps: + # (B, out_channels, T ** prod(upsample_scales) + wav_ = self.generator(mel) + + # initialize + gen_loss = 0.0 + aux_loss = 0.0 + + # mel spectrogram loss + mel_loss = self.criterion_mel(wav_, wav) + aux_loss += mel_loss + report("train/mel_loss", float(mel_loss)) + losses_dict["mel_loss"] = float(mel_loss) + + gen_loss += aux_loss * self.lambda_aux + + # adversarial loss + if self.state.iteration > self.discriminator_train_start_steps: + p_ = self.discriminator(wav_) + adv_loss = self.criterion_gen_adv(p_) + report("train/adversarial_loss", float(adv_loss)) + losses_dict["adversarial_loss"] = float(adv_loss) + + # feature matching loss + # no need to track gradients + with paddle.no_grad(): + p = self.discriminator(wav) + fm_loss = self.criterion_feat_match(p_, p) + report("train/feature_matching_loss", float(fm_loss)) + losses_dict["feature_matching_loss"] = float(fm_loss) + + adv_loss += self.lambda_feat_match * fm_loss + + gen_loss += self.lambda_adv * 
adv_loss + + report("train/generator_loss", float(gen_loss)) + losses_dict["generator_loss"] = float(gen_loss) + + self.optimizer_g.clear_grad() + gen_loss.backward() + + self.optimizer_g.step() + self.scheduler_g.step() + + # Disctiminator + if self.state.iteration > self.discriminator_train_start_steps: + # re-compute wav_ which leads better quality + with paddle.no_grad(): + wav_ = self.generator(mel) + + p = self.discriminator(wav) + p_ = self.discriminator(wav_.detach()) + real_loss, fake_loss = self.criterion_dis_adv(p_, p) + dis_loss = real_loss + fake_loss + report("train/real_loss", float(real_loss)) + report("train/fake_loss", float(fake_loss)) + report("train/discriminator_loss", float(dis_loss)) + losses_dict["real_loss"] = float(real_loss) + losses_dict["fake_loss"] = float(fake_loss) + losses_dict["discriminator_loss"] = float(dis_loss) + + self.optimizer_d.clear_grad() + dis_loss.backward() + + self.optimizer_d.step() + self.scheduler_d.step() + + self.msg += ', '.join('{}: {:>.6f}'.format(k, v) + for k, v in losses_dict.items()) + + +class HiFiGANEvaluator(StandardEvaluator): + def __init__(self, + models: Dict[str, Layer], + criterions: Dict[str, Layer], + dataloader: DataLoader, + lambda_adv: float=1.0, + lambda_aux: float=1.0, + lambda_feat_match: float=1.0, + output_dir=None): + self.models = models + self.generator = models['generator'] + self.discriminator = models['discriminator'] + + self.criterions = criterions + self.criterion_feat_match = criterions['feat_match'] + self.criterion_mel = criterions['mel'] + self.criterion_gen_adv = criterions["gen_adv"] + self.criterion_dis_adv = criterions["dis_adv"] + + self.dataloader = dataloader + + self.lambda_adv = lambda_adv + self.lambda_aux = lambda_aux + self.lambda_feat_match = lambda_feat_match + + log_file = output_dir / 'worker_{}.log'.format(dist.get_rank()) + self.filehandler = logging.FileHandler(str(log_file)) + logger.addHandler(self.filehandler) + self.logger = logger + self.msg = "" + + def evaluate_core(self, batch): + # logging.debug("Evaluate: ") + self.msg = "Evaluate: " + losses_dict = {} + wav, mel = batch + + # Generator + # (B, out_channels, T ** prod(upsample_scales) + wav_ = self.generator(mel) + + # initialize + gen_loss = 0.0 + aux_loss = 0.0 + + ## Adversarial loss + p_ = self.discriminator(wav_) + adv_loss = self.criterion_gen_adv(p_) + report("eval/adversarial_loss", float(adv_loss)) + losses_dict["adversarial_loss"] = float(adv_loss) + + # feature matching loss + p = self.discriminator(wav) + fm_loss = self.criterion_feat_match(p_, p) + report("eval/feature_matching_loss", float(fm_loss)) + losses_dict["feature_matching_loss"] = float(fm_loss) + adv_loss += self.lambda_feat_match * fm_loss + + gen_loss += self.lambda_adv * adv_loss + + # mel spectrogram loss + mel_loss = self.criterion_mel(wav_, wav) + aux_loss += mel_loss + report("eval/mel_loss", float(mel_loss)) + losses_dict["mel_loss"] = float(mel_loss) + + gen_loss += aux_loss * self.lambda_aux + + report("eval/generator_loss", float(gen_loss)) + losses_dict["generator_loss"] = float(gen_loss) + + # Disctiminator + p = self.discriminator(wav) + real_loss, fake_loss = self.criterion_dis_adv(p_, p) + dis_loss = real_loss + fake_loss + report("eval/real_loss", float(real_loss)) + report("eval/fake_loss", float(fake_loss)) + report("eval/discriminator_loss", float(dis_loss)) + + losses_dict["real_loss"] = float(real_loss) + losses_dict["fake_loss"] = float(fake_loss) + losses_dict["discriminator_loss"] = float(dis_loss) + + self.msg += ', 
'.join('{}: {:>.6f}'.format(k, v) + for k, v in losses_dict.items()) + self.logger.info(self.msg) diff --git a/ernie-sat/paddlespeech/t2s/models/melgan/__init__.py b/ernie-sat/paddlespeech/t2s/models/melgan/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..df8ccd92dae127fe86734a495105de3de49eda74 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/models/melgan/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from .melgan import * +from .multi_band_melgan_updater import * +from .style_melgan import * +from .style_melgan_updater import * diff --git a/ernie-sat/paddlespeech/t2s/models/melgan/melgan.py b/ernie-sat/paddlespeech/t2s/models/melgan/melgan.py new file mode 100644 index 0000000000000000000000000000000000000000..22d8fd9e764c5c7f3c71ca1e2d17acc641a029cd --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/models/melgan/melgan.py @@ -0,0 +1,528 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from espnet(https://github.com/espnet/espnet) +"""MelGAN Modules.""" +from typing import Any +from typing import Dict +from typing import List + +import numpy as np +import paddle +from paddle import nn + +from paddlespeech.t2s.modules.activation import get_activation +from paddlespeech.t2s.modules.causal_conv import CausalConv1D +from paddlespeech.t2s.modules.causal_conv import CausalConv1DTranspose +from paddlespeech.t2s.modules.nets_utils import initialize +from paddlespeech.t2s.modules.pqmf import PQMF +from paddlespeech.t2s.modules.residual_stack import ResidualStack + + +class MelGANGenerator(nn.Layer): + """MelGAN generator module.""" + + def __init__( + self, + in_channels: int=80, + out_channels: int=1, + kernel_size: int=7, + channels: int=512, + bias: bool=True, + upsample_scales: List[int]=[8, 8, 2, 2], + stack_kernel_size: int=3, + stacks: int=3, + nonlinear_activation: str="leakyrelu", + nonlinear_activation_params: Dict[str, Any]={"negative_slope": 0.2}, + pad: str="Pad1D", + pad_params: Dict[str, Any]={"mode": "reflect"}, + use_final_nonlinear_activation: bool=True, + use_weight_norm: bool=True, + use_causal_conv: bool=False, + init_type: str="xavier_uniform", ): + """Initialize MelGANGenerator module. + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels, + the number of sub-band is out_channels in multi-band melgan. 
+ kernel_size (int): Kernel size of initial and final conv layer. + channels (int): Initial number of channels for conv layer. + bias (bool): Whether to add bias parameter in convolution layers. + upsample_scales (List[int]): List of upsampling scales. + stack_kernel_size (int): Kernel size of dilated conv layers in residual stack. + stacks (int): Number of stacks in a single residual stack. + nonlinear_activation (Optional[str], optional): Non linear activation in upsample network, by default None + nonlinear_activation_params (Dict[str, Any], optional): Parameters passed to the linear activation in the upsample network, + by default {} + pad (str): Padding function module name before dilated convolution layer. + pad_params (dict): Hyperparameters for padding function. + use_final_nonlinear_activation (nn.Layer): Activation function for the final layer. + use_weight_norm (bool): Whether to use weight norm. + If set to true, it will be applied to all of the conv layers. + use_causal_conv (bool): Whether to use causal convolution. + """ + super().__init__() + + # initialize parameters + initialize(self, init_type) + + # for compatibility + if nonlinear_activation: + nonlinear_activation = nonlinear_activation.lower() + + # check hyper parameters is valid + assert channels >= np.prod(upsample_scales) + assert channels % (2**len(upsample_scales)) == 0 + if not use_causal_conv: + assert (kernel_size - 1 + ) % 2 == 0, "Not support even number kernel size." + + layers = [] + if not use_causal_conv: + layers += [ + getattr(paddle.nn, pad)((kernel_size - 1) // 2, **pad_params), + nn.Conv1D(in_channels, channels, kernel_size, bias_attr=bias), + ] + else: + layers += [ + CausalConv1D( + in_channels, + channels, + kernel_size, + bias=bias, + pad=pad, + pad_params=pad_params, ), + ] + + for i, upsample_scale in enumerate(upsample_scales): + # add upsampling layer + layers += [ + get_activation(nonlinear_activation, + **nonlinear_activation_params) + ] + if not use_causal_conv: + layers += [ + nn.Conv1DTranspose( + channels // (2**i), + channels // (2**(i + 1)), + upsample_scale * 2, + stride=upsample_scale, + padding=upsample_scale // 2 + upsample_scale % 2, + output_padding=upsample_scale % 2, + bias_attr=bias, ) + ] + else: + layers += [ + CausalConv1DTranspose( + channels // (2**i), + channels // (2**(i + 1)), + upsample_scale * 2, + stride=upsample_scale, + bias=bias, ) + ] + + # add residual stack + for j in range(stacks): + layers += [ + ResidualStack( + kernel_size=stack_kernel_size, + channels=channels // (2**(i + 1)), + dilation=stack_kernel_size**j, + bias=bias, + nonlinear_activation=nonlinear_activation, + nonlinear_activation_params=nonlinear_activation_params, + pad=pad, + pad_params=pad_params, + use_causal_conv=use_causal_conv, ) + ] + + # add final layer + layers += [ + get_activation(nonlinear_activation, **nonlinear_activation_params) + ] + if not use_causal_conv: + layers += [ + getattr(nn, pad)((kernel_size - 1) // 2, **pad_params), + nn.Conv1D( + channels // (2**(i + 1)), + out_channels, + kernel_size, + bias_attr=bias), + ] + else: + layers += [ + CausalConv1D( + channels // (2**(i + 1)), + out_channels, + kernel_size, + bias=bias, + pad=pad, + pad_params=pad_params, ), + ] + if use_final_nonlinear_activation: + layers += [nn.Tanh()] + + # define the model as a single function + self.melgan = nn.Sequential(*layers) + nn.initializer.set_global_initializer(None) + + # apply weight norm + if use_weight_norm: + self.apply_weight_norm() + + # reset parameters + 
self.reset_parameters() + + # initialize pqmf for multi-band melgan inference + if out_channels > 1: + self.pqmf = PQMF(subbands=out_channels) + else: + self.pqmf = None + + def forward(self, c): + """Calculate forward propagation. + + Args: + c (Tensor): Input tensor (B, in_channels, T). + Returns: + Tensor: Output tensor (B, out_channels, T ** prod(upsample_scales)). + """ + out = self.melgan(c) + return out + + def apply_weight_norm(self): + """Recursively apply weight normalization to all the Convolution layers + in the sublayers. + """ + + def _apply_weight_norm(layer): + if isinstance(layer, (nn.Conv1D, nn.Conv2D, nn.Conv1DTranspose)): + nn.utils.weight_norm(layer) + + self.apply(_apply_weight_norm) + + def remove_weight_norm(self): + """Recursively remove weight normalization from all the Convolution + layers in the sublayers. + """ + + def _remove_weight_norm(layer): + try: + nn.utils.remove_weight_norm(layer) + except ValueError: + pass + + self.apply(_remove_weight_norm) + + def reset_parameters(self): + """Reset parameters. + This initialization follows official implementation manner. + https://github.com/descriptinc/melgan-neurips/blob/master/mel2wav/modules.py + """ + # 定义参数为float的正态分布。 + dist = paddle.distribution.Normal(loc=0.0, scale=0.02) + + def _reset_parameters(m): + if isinstance(m, nn.Conv1D) or isinstance(m, nn.Conv1DTranspose): + w = dist.sample(m.weight.shape) + m.weight.set_value(w) + + self.apply(_reset_parameters) + + def inference(self, c): + """Perform inference. + + Args: + c (Union[Tensor, ndarray]): Input tensor (T, in_channels). + Returns: + Tensor: Output tensor (out_channels*T ** prod(upsample_scales), 1). + """ + # pseudo batch + c = c.transpose([1, 0]).unsqueeze(0) + # (B, out_channels, T ** prod(upsample_scales) + out = self.melgan(c) + if self.pqmf is not None: + # (B, 1, out_channels * T ** prod(upsample_scales) + out = self.pqmf(out) + out = out.squeeze(0).transpose([1, 0]) + return out + + +class MelGANDiscriminator(nn.Layer): + """MelGAN discriminator module.""" + + def __init__( + self, + in_channels: int=1, + out_channels: int=1, + kernel_sizes: List[int]=[5, 3], + channels: int=16, + max_downsample_channels: int=1024, + bias: bool=True, + downsample_scales: List[int]=[4, 4, 4, 4], + nonlinear_activation: str="leakyrelu", + nonlinear_activation_params: Dict[str, Any]={"negative_slope": 0.2}, + pad: str="Pad1D", + pad_params: Dict[str, Any]={"mode": "reflect"}, + init_type: str="xavier_uniform", ): + """Initilize MelGAN discriminator module. + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + kernel_sizes (List[int]): List of two kernel sizes. The prod will be used for the first conv layer, + and the first and the second kernel sizes will be used for the last two layers. + For example if kernel_sizes = [5, 3], the first layer kernel size will be 5 * 3 = 15, + the last two layers' kernel size will be 5 and 3, respectively. + channels (int): Initial number of channels for conv layer. + max_downsample_channels (int): Maximum number of channels for downsampling layers. + bias (bool): Whether to add bias parameter in convolution layers. + downsample_scales (List[int]): List of downsampling scales. + nonlinear_activation (str): Activation function module name. + nonlinear_activation_params (dict): Hyperparameters for activation function. + pad (str): Padding function module name before dilated convolution layer. + pad_params (dict): Hyperparameters for padding function. 
+ """ + super().__init__() + + # for compatibility + if nonlinear_activation: + nonlinear_activation = nonlinear_activation.lower() + + # initialize parameters + initialize(self, init_type) + + self.layers = nn.LayerList() + + # check kernel size is valid + assert len(kernel_sizes) == 2 + assert kernel_sizes[0] % 2 == 1 + assert kernel_sizes[1] % 2 == 1 + + # add first layer + self.layers.append( + nn.Sequential( + getattr(nn, pad)((np.prod(kernel_sizes) - 1) // 2, ** + pad_params), + nn.Conv1D( + in_channels, + channels, + int(np.prod(kernel_sizes)), + bias_attr=bias), + get_activation(nonlinear_activation, ** + nonlinear_activation_params), )) + + # add downsample layers + in_chs = channels + for downsample_scale in downsample_scales: + out_chs = min(in_chs * downsample_scale, max_downsample_channels) + self.layers.append( + nn.Sequential( + nn.Conv1D( + in_chs, + out_chs, + kernel_size=downsample_scale * 10 + 1, + stride=downsample_scale, + padding=downsample_scale * 5, + groups=in_chs // 4, + bias_attr=bias, ), + get_activation(nonlinear_activation, ** + nonlinear_activation_params), )) + in_chs = out_chs + + # add final layers + out_chs = min(in_chs * 2, max_downsample_channels) + self.layers.append( + nn.Sequential( + nn.Conv1D( + in_chs, + out_chs, + kernel_sizes[0], + padding=(kernel_sizes[0] - 1) // 2, + bias_attr=bias, ), + get_activation(nonlinear_activation, ** + nonlinear_activation_params), )) + self.layers.append( + nn.Conv1D( + out_chs, + out_channels, + kernel_sizes[1], + padding=(kernel_sizes[1] - 1) // 2, + bias_attr=bias, ), ) + + def forward(self, x): + """Calculate forward propagation. + Args: + x (Tensor): Input noise signal (B, 1, T). + Returns: + List: List of output tensors of each layer (for feat_match_loss). + """ + outs = [] + for f in self.layers: + x = f(x) + outs += [x] + + return outs + + +class MelGANMultiScaleDiscriminator(nn.Layer): + """MelGAN multi-scale discriminator module.""" + + def __init__( + self, + in_channels: int=1, + out_channels: int=1, + scales: int=3, + downsample_pooling: str="AvgPool1D", + # follow the official implementation setting + downsample_pooling_params: Dict[str, Any]={ + "kernel_size": 4, + "stride": 2, + "padding": 1, + "exclusive": True, + }, + kernel_sizes: List[int]=[5, 3], + channels: int=16, + max_downsample_channels: int=1024, + bias: bool=True, + downsample_scales: List[int]=[4, 4, 4, 4], + nonlinear_activation: str="leakyrelu", + nonlinear_activation_params: Dict[str, Any]={"negative_slope": 0.2}, + pad: str="Pad1D", + pad_params: Dict[str, Any]={"mode": "reflect"}, + use_weight_norm: bool=True, + init_type: str="xavier_uniform", ): + """Initilize MelGAN multi-scale discriminator module. + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + scales (int): Number of multi-scales. + downsample_pooling (str): Pooling module name for downsampling of the inputs. + downsample_pooling_params (dict): Parameters for the above pooling module. + kernel_sizes (List[int]): List of two kernel sizes. The sum will be used for the first conv layer, + and the first and the second kernel sizes will be used for the last two layers. + channels (int): Initial number of channels for conv layer. + max_downsample_channels (int): Maximum number of channels for downsampling layers. + bias (bool): Whether to add bias parameter in convolution layers. + downsample_scales (List[int]): List of downsampling scales. + nonlinear_activation (str): Activation function module name. 
+ nonlinear_activation_params (dict): Hyperparameters for activation function. + pad (str): Padding function module name before dilated convolution layer. + pad_params (dict): Hyperparameters for padding function. + use_causal_conv (bool): Whether to use causal convolution. + """ + super().__init__() + + # initialize parameters + initialize(self, init_type) + + # for + if nonlinear_activation: + nonlinear_activation = nonlinear_activation.lower() + + self.discriminators = nn.LayerList() + + # add discriminators + for _ in range(scales): + self.discriminators.append( + MelGANDiscriminator( + in_channels=in_channels, + out_channels=out_channels, + kernel_sizes=kernel_sizes, + channels=channels, + max_downsample_channels=max_downsample_channels, + bias=bias, + downsample_scales=downsample_scales, + nonlinear_activation=nonlinear_activation, + nonlinear_activation_params=nonlinear_activation_params, + pad=pad, + pad_params=pad_params, )) + self.pooling = getattr(nn, downsample_pooling)( + **downsample_pooling_params) + + nn.initializer.set_global_initializer(None) + + # apply weight norm + if use_weight_norm: + self.apply_weight_norm() + + # reset parameters + self.reset_parameters() + + def forward(self, x): + """Calculate forward propagation. + Args: + x (Tensor): Input noise signal (B, 1, T). + Returns: + List: List of list of each discriminator outputs, which consists of each layer output tensors. + """ + outs = [] + for f in self.discriminators: + outs += [f(x)] + x = self.pooling(x) + + return outs + + def apply_weight_norm(self): + """Recursively apply weight normalization to all the Convolution layers + in the sublayers. + """ + + def _apply_weight_norm(layer): + if isinstance(layer, (nn.Conv1D, nn.Conv2D, nn.Conv1DTranspose)): + nn.utils.weight_norm(layer) + + self.apply(_apply_weight_norm) + + def remove_weight_norm(self): + """Recursively remove weight normalization from all the Convolution + layers in the sublayers. + """ + + def _remove_weight_norm(layer): + try: + nn.utils.remove_weight_norm(layer) + except ValueError: + pass + + self.apply(_remove_weight_norm) + + def reset_parameters(self): + """Reset parameters. + This initialization follows official implementation manner. + https://github.com/descriptinc/melgan-neurips/blob/master/mel2wav/modules.py + """ + + # 定义参数为float的正态分布。 + dist = paddle.distribution.Normal(loc=0.0, scale=0.02) + + def _reset_parameters(m): + if isinstance(m, nn.Conv1D) or isinstance(m, nn.Conv1DTranspose): + w = dist.sample(m.weight.shape) + m.weight.set_value(w) + + self.apply(_reset_parameters) + + +class MelGANInference(nn.Layer): + def __init__(self, normalizer, melgan_generator): + super().__init__() + self.normalizer = normalizer + self.melgan_generator = melgan_generator + + def forward(self, logmel): + normalized_mel = self.normalizer(logmel) + wav = self.melgan_generator.inference(normalized_mel) + return wav diff --git a/ernie-sat/paddlespeech/t2s/models/melgan/multi_band_melgan_updater.py b/ernie-sat/paddlespeech/t2s/models/melgan/multi_band_melgan_updater.py new file mode 100644 index 0000000000000000000000000000000000000000..1c6c34c2a7c0c24c34278c287e18bb5d4c0cf139 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/models/melgan/multi_band_melgan_updater.py @@ -0,0 +1,263 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import logging +from pathlib import Path +from typing import Dict + +import paddle +from paddle import distributed as dist +from paddle.io import DataLoader +from paddle.nn import Layer +from paddle.optimizer import Optimizer +from paddle.optimizer.lr import LRScheduler + +from paddlespeech.t2s.training.extensions.evaluator import StandardEvaluator +from paddlespeech.t2s.training.reporter import report +from paddlespeech.t2s.training.updaters.standard_updater import StandardUpdater +from paddlespeech.t2s.training.updaters.standard_updater import UpdaterState +logging.basicConfig( + format='%(asctime)s [%(levelname)s] [%(filename)s:%(lineno)d] %(message)s', + datefmt='[%Y-%m-%d %H:%M:%S]') +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + + +class MBMelGANUpdater(StandardUpdater): + def __init__(self, + models: Dict[str, Layer], + optimizers: Dict[str, Optimizer], + criterions: Dict[str, Layer], + schedulers: Dict[str, LRScheduler], + dataloader: DataLoader, + generator_train_start_steps: int=0, + discriminator_train_start_steps: int=100000, + lambda_aux: float=1.0, + lambda_adv: float=1.0, + output_dir: Path=None): + self.models = models + self.generator: Layer = models['generator'] + self.discriminator: Layer = models['discriminator'] + + self.optimizers = optimizers + self.optimizer_g: Optimizer = optimizers['generator'] + self.optimizer_d: Optimizer = optimizers['discriminator'] + + self.criterions = criterions + self.criterion_stft = criterions['stft'] + self.criterion_sub_stft = criterions['sub_stft'] + self.criterion_pqmf = criterions['pqmf'] + self.criterion_gen_adv = criterions["gen_adv"] + self.criterion_dis_adv = criterions["dis_adv"] + + self.schedulers = schedulers + self.scheduler_g = schedulers['generator'] + self.scheduler_d = schedulers['discriminator'] + + self.dataloader = dataloader + + self.generator_train_start_steps = generator_train_start_steps + self.discriminator_train_start_steps = discriminator_train_start_steps + self.lambda_adv = lambda_adv + self.lambda_aux = lambda_aux + + self.state = UpdaterState(iteration=0, epoch=0) + self.train_iterator = iter(self.dataloader) + + log_file = output_dir / 'worker_{}.log'.format(dist.get_rank()) + self.filehandler = logging.FileHandler(str(log_file)) + logger.addHandler(self.filehandler) + self.logger = logger + self.msg = "" + + def update_core(self, batch): + self.msg = "Rank: {}, ".format(dist.get_rank()) + losses_dict = {} + # parse batch + wav, mel = batch + + # Generator + if self.state.iteration > self.generator_train_start_steps: + # (B, out_channels, T ** prod(upsample_scales) + wav_ = self.generator(mel) + wav_mb_ = wav_ + # (B, 1, out_channels*T ** prod(upsample_scales) + wav_ = self.criterion_pqmf.synthesis(wav_mb_) + + # initialize + gen_loss = 0.0 + aux_loss = 0.0 + + # full band Multi-resolution stft loss + sc_loss, mag_loss = self.criterion_stft(wav_, wav) + # for balancing with subband stft loss + # Eq.(9) in paper + aux_loss += 0.5 * (sc_loss + mag_loss) + report("train/spectral_convergence_loss", float(sc_loss)) + report("train/log_stft_magnitude_loss", float(mag_loss)) + 
losses_dict["spectral_convergence_loss"] = float(sc_loss) + losses_dict["log_stft_magnitude_loss"] = float(mag_loss) + + # sub band Multi-resolution stft loss + # (B, subbands, T // subbands) + wav_mb = self.criterion_pqmf.analysis(wav) + sub_sc_loss, sub_mag_loss = self.criterion_sub_stft(wav_mb_, wav_mb) + # Eq.(9) in paper + aux_loss += 0.5 * (sub_sc_loss + sub_mag_loss) + report("train/sub_spectral_convergence_loss", float(sub_sc_loss)) + report("train/sub_log_stft_magnitude_loss", float(sub_mag_loss)) + losses_dict["sub_spectral_convergence_loss"] = float(sub_sc_loss) + losses_dict["sub_log_stft_magnitude_loss"] = float(sub_mag_loss) + + gen_loss += aux_loss * self.lambda_aux + + # adversarial loss + if self.state.iteration > self.discriminator_train_start_steps: + p_ = self.discriminator(wav_) + adv_loss = self.criterion_gen_adv(p_) + report("train/adversarial_loss", float(adv_loss)) + losses_dict["adversarial_loss"] = float(adv_loss) + + gen_loss += self.lambda_adv * adv_loss + + report("train/generator_loss", float(gen_loss)) + losses_dict["generator_loss"] = float(gen_loss) + + self.optimizer_g.clear_grad() + gen_loss.backward() + + self.optimizer_g.step() + self.scheduler_g.step() + + # Disctiminator + if self.state.iteration > self.discriminator_train_start_steps: + # re-compute wav_ which leads better quality + with paddle.no_grad(): + wav_ = self.generator(mel) + wav_ = self.criterion_pqmf.synthesis(wav_) + p = self.discriminator(wav) + p_ = self.discriminator(wav_.detach()) + real_loss, fake_loss = self.criterion_dis_adv(p_, p) + dis_loss = real_loss + fake_loss + report("train/real_loss", float(real_loss)) + report("train/fake_loss", float(fake_loss)) + report("train/discriminator_loss", float(dis_loss)) + losses_dict["real_loss"] = float(real_loss) + losses_dict["fake_loss"] = float(fake_loss) + losses_dict["discriminator_loss"] = float(dis_loss) + + self.optimizer_d.clear_grad() + dis_loss.backward() + + self.optimizer_d.step() + self.scheduler_d.step() + + self.msg += ', '.join('{}: {:>.6f}'.format(k, v) + for k, v in losses_dict.items()) + + +class MBMelGANEvaluator(StandardEvaluator): + def __init__(self, + models: Dict[str, Layer], + criterions: Dict[str, Layer], + dataloader: DataLoader, + lambda_aux: float=1.0, + lambda_adv: float=1.0, + output_dir: Path=None): + self.models = models + self.generator = models['generator'] + self.discriminator = models['discriminator'] + + self.criterions = criterions + self.criterion_stft = criterions['stft'] + self.criterion_sub_stft = criterions['sub_stft'] + self.criterion_pqmf = criterions['pqmf'] + self.criterion_gen_adv = criterions["gen_adv"] + self.criterion_dis_adv = criterions["dis_adv"] + + self.dataloader = dataloader + + self.lambda_adv = lambda_adv + self.lambda_aux = lambda_aux + + log_file = output_dir / 'worker_{}.log'.format(dist.get_rank()) + self.filehandler = logging.FileHandler(str(log_file)) + logger.addHandler(self.filehandler) + self.logger = logger + self.msg = "" + + def evaluate_core(self, batch): + # logging.debug("Evaluate: ") + self.msg = "Evaluate: " + losses_dict = {} + wav, mel = batch + + # Generator + # (B, out_channels, T ** prod(upsample_scales) + wav_ = self.generator(mel) + wav_mb_ = wav_ + # (B, 1, out_channels*T ** prod(upsample_scales) + wav_ = self.criterion_pqmf.synthesis(wav_mb_) + + # initialize + gen_loss = 0.0 + aux_loss = 0.0 + + # adversarial loss + p_ = self.discriminator(wav_) + adv_loss = self.criterion_gen_adv(p_) + report("eval/adversarial_loss", float(adv_loss)) + 
losses_dict["adversarial_loss"] = float(adv_loss) + + gen_loss += self.lambda_adv * adv_loss + + # Multi-resolution stft loss + sc_loss, mag_loss = self.criterion_stft(wav_, wav) + # Eq.(9) in paper + aux_loss += 0.5 * (sc_loss + mag_loss) + report("eval/spectral_convergence_loss", float(sc_loss)) + report("eval/log_stft_magnitude_loss", float(mag_loss)) + losses_dict["spectral_convergence_loss"] = float(sc_loss) + losses_dict["log_stft_magnitude_loss"] = float(mag_loss) + + # sub band Multi-resolution stft loss + # (B, subbands, T // subbands) + wav_mb = self.criterion_pqmf.analysis(wav) + sub_sc_loss, sub_mag_loss = self.criterion_sub_stft(wav_mb_, wav_mb) + # Eq.(9) in paper + aux_loss += 0.5 * (sub_sc_loss + sub_mag_loss) + report("eval/sub_spectral_convergence_loss", float(sub_sc_loss)) + report("eval/sub_log_stft_magnitude_loss", float(sub_mag_loss)) + losses_dict["sub_spectral_convergence_loss"] = float(sub_sc_loss) + losses_dict["sub_log_stft_magnitude_loss"] = float(sub_mag_loss) + + gen_loss += aux_loss * self.lambda_aux + + report("eval/generator_loss", float(gen_loss)) + losses_dict["generator_loss"] = float(gen_loss) + + # Disctiminator + p = self.discriminator(wav) + real_loss, fake_loss = self.criterion_dis_adv(p_, p) + dis_loss = real_loss + fake_loss + report("eval/real_loss", float(real_loss)) + report("eval/fake_loss", float(fake_loss)) + report("eval/discriminator_loss", float(dis_loss)) + + losses_dict["real_loss"] = float(real_loss) + losses_dict["fake_loss"] = float(fake_loss) + losses_dict["discriminator_loss"] = float(dis_loss) + + self.msg += ', '.join('{}: {:>.6f}'.format(k, v) + for k, v in losses_dict.items()) + self.logger.info(self.msg) diff --git a/ernie-sat/paddlespeech/t2s/models/melgan/style_melgan.py b/ernie-sat/paddlespeech/t2s/models/melgan/style_melgan.py new file mode 100644 index 0000000000000000000000000000000000000000..40a2f10096680b0dc0420c54ad0373d7f80f1912 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/models/melgan/style_melgan.py @@ -0,0 +1,375 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# Modified from espnet(https://github.com/espnet/espnet) +"""StyleMelGAN Modules.""" +import copy +from typing import Any +from typing import Dict +from typing import List + +import numpy as np +import paddle +import paddle.nn.functional as F +from paddle import nn + +from paddlespeech.t2s.models.melgan import MelGANDiscriminator as BaseDiscriminator +from paddlespeech.t2s.modules.activation import get_activation +from paddlespeech.t2s.modules.nets_utils import initialize +from paddlespeech.t2s.modules.pqmf import PQMF +from paddlespeech.t2s.modules.tade_res_block import TADEResBlock + + +class StyleMelGANGenerator(nn.Layer): + """Style MelGAN generator module.""" + + def __init__( + self, + in_channels: int=128, + aux_channels: int=80, + channels: int=64, + out_channels: int=1, + kernel_size: int=9, + dilation: int=2, + bias: bool=True, + noise_upsample_scales: List[int]=[11, 2, 2, 2], + noise_upsample_activation: str="leakyrelu", + noise_upsample_activation_params: Dict[str, + Any]={"negative_slope": 0.2}, + upsample_scales: List[int]=[2, 2, 2, 2, 2, 2, 2, 2, 1], + upsample_mode: str="linear", + gated_function: str="softmax", + use_weight_norm: bool=True, + init_type: str="xavier_uniform", ): + """Initilize Style MelGAN generator. + + Args: + in_channels (int): Number of input noise channels. + aux_channels (int): Number of auxiliary input channels. + channels (int): Number of channels for conv layer. + out_channels (int): Number of output channels. + kernel_size (int): Kernel size of conv layers. + dilation (int): Dilation factor for conv layers. + bias (bool): Whether to add bias parameter in convolution layers. + noise_upsample_scales (list): List of noise upsampling scales. + noise_upsample_activation (str): Activation function module name for noise upsampling. + noise_upsample_activation_params (dict): Hyperparameters for the above activation function. + upsample_scales (list): List of upsampling scales. + upsample_mode (str): Upsampling mode in TADE layer. + gated_function (str): Gated function in TADEResBlock ("softmax" or "sigmoid"). + use_weight_norm (bool): Whether to use weight norm. + If set to true, it will be applied to all of the conv layers. 
+ """ + super().__init__() + + # initialize parameters + initialize(self, init_type) + + self.in_channels = in_channels + noise_upsample = [] + in_chs = in_channels + for noise_upsample_scale in noise_upsample_scales: + noise_upsample.append( + nn.Conv1DTranspose( + in_chs, + channels, + noise_upsample_scale * 2, + stride=noise_upsample_scale, + padding=noise_upsample_scale // 2 + noise_upsample_scale % + 2, + output_padding=noise_upsample_scale % 2, + bias_attr=bias, )) + noise_upsample.append( + get_activation(noise_upsample_activation, ** + noise_upsample_activation_params)) + in_chs = channels + self.noise_upsample = nn.Sequential(*noise_upsample) + self.noise_upsample_factor = np.prod(noise_upsample_scales) + + self.blocks = nn.LayerList() + aux_chs = aux_channels + for upsample_scale in upsample_scales: + self.blocks.append( + TADEResBlock( + in_channels=channels, + aux_channels=aux_chs, + kernel_size=kernel_size, + dilation=dilation, + bias=bias, + upsample_factor=upsample_scale, + upsample_mode=upsample_mode, + gated_function=gated_function, ), ) + aux_chs = channels + self.upsample_factor = np.prod(upsample_scales) + + self.output_conv = nn.Sequential( + nn.Conv1D( + channels, + out_channels, + kernel_size, + 1, + bias_attr=bias, + padding=(kernel_size - 1) // 2, ), + nn.Tanh(), ) + + nn.initializer.set_global_initializer(None) + + # apply weight norm + if use_weight_norm: + self.apply_weight_norm() + + # reset parameters + self.reset_parameters() + + def forward(self, c, z=None): + """Calculate forward propagation. + + Args: + c (Tensor): Auxiliary input tensor (B, channels, T). + z (Tensor): Input noise tensor (B, in_channels, 1). + Returns: + Tensor: Output tensor (B, out_channels, T ** prod(upsample_scales)). + """ + # batch_max_steps(24000) == noise_upsample_factor(80) * upsample_factor(300) + if z is None: + z = paddle.randn([paddle.shape(c)[0], self.in_channels, 1]) + # (B, in_channels, noise_upsample_factor). + x = self.noise_upsample(z) + for block in self.blocks: + x, c = block(x, c) + x = self.output_conv(x) + return x + + def apply_weight_norm(self): + """Recursively apply weight normalization to all the Convolution layers + in the sublayers. + """ + + def _apply_weight_norm(layer): + if isinstance(layer, (nn.Conv1D, nn.Conv1DTranspose)): + nn.utils.weight_norm(layer) + + self.apply(_apply_weight_norm) + + def remove_weight_norm(self): + """Recursively remove weight normalization from all the Convolution + layers in the sublayers. + """ + + def _remove_weight_norm(layer): + try: + if layer: + nn.utils.remove_weight_norm(layer) + # add AttributeError to bypass https://github.com/PaddlePaddle/Paddle/issues/38532 temporarily + except (ValueError, AttributeError): + pass + + self.apply(_remove_weight_norm) + + def reset_parameters(self): + """Reset parameters. + This initialization follows official implementation manner. + https://github.com/descriptinc/melgan-neurips/blob/master/mel2wav/modules.py + """ + # 定义参数为float的正态分布。 + dist = paddle.distribution.Normal(loc=0.0, scale=0.02) + + def _reset_parameters(m): + if isinstance(m, nn.Conv1D) or isinstance(m, nn.Conv1DTranspose): + w = dist.sample(m.weight.shape) + m.weight.set_value(w) + + self.apply(_reset_parameters) + + def inference(self, c): + """Perform inference. + Args: + c (Tensor): Input tensor (T, in_channels). + Returns: + Tensor: Output tensor (T ** prod(upsample_scales), out_channels). 
+ """ + # (1, in_channels, T) + c = c.transpose([1, 0]).unsqueeze(0) + c_shape = paddle.shape(c) + # prepare noise input + # there is a bug in Paddle int division, we must convert a int tensor to int here + noise_T = paddle.cast( + paddle.ceil(c_shape[2] / int(self.noise_upsample_factor)), + dtype='int64') + noise_size = (1, self.in_channels, noise_T) + # (1, in_channels, T/noise_upsample_factor) + noise = paddle.randn(noise_size) + # (1, in_channels, T) + x = self.noise_upsample(noise) + x_shape = paddle.shape(x) + total_length = c_shape[2] * self.upsample_factor + # Dygraph to Static Graph bug here, 2021.12.15 + c = F.pad( + c, (0, x_shape[2] - c_shape[2]), "replicate", data_format="NCL") + # c.shape[2] == x.shape[2] here + # (1, in_channels, T*prod(upsample_scales)) + for block in self.blocks: + x, c = block(x, c) + x = self.output_conv(x)[..., :total_length] + return x.squeeze(0).transpose([1, 0]) + + +class StyleMelGANDiscriminator(nn.Layer): + """Style MelGAN disciminator module.""" + + def __init__( + self, + repeats: int=2, + window_sizes: List[int]=[512, 1024, 2048, 4096], + pqmf_params: List[List[int]]=[ + [1, None, None, None], + [2, 62, 0.26700, 9.0], + [4, 62, 0.14200, 9.0], + [8, 62, 0.07949, 9.0], + ], + discriminator_params: Dict[str, Any]={ + "out_channels": 1, + "kernel_sizes": [5, 3], + "channels": 16, + "max_downsample_channels": 512, + "bias": True, + "downsample_scales": [4, 4, 4, 1], + "nonlinear_activation": "leakyrelu", + "nonlinear_activation_params": { + "negative_slope": 0.2 + }, + "pad": "Pad1D", + "pad_params": { + "mode": "reflect" + }, + }, + use_weight_norm: bool=True, + init_type: str="xavier_uniform", ): + """Initilize Style MelGAN discriminator. + + Args: + repeats (int): Number of repititons to apply RWD. + window_sizes (list): List of random window sizes. + pqmf_params (list): List of list of Parameters for PQMF modules + discriminator_params (dict): Parameters for base discriminator module. + use_weight_nom (bool): Whether to apply weight normalization. + """ + super().__init__() + + # initialize parameters + initialize(self, init_type) + + # window size check + assert len(window_sizes) == len(pqmf_params) + sizes = [ws // p[0] for ws, p in zip(window_sizes, pqmf_params)] + assert len(window_sizes) == sum([sizes[0] == size for size in sizes]) + + self.repeats = repeats + self.window_sizes = window_sizes + self.pqmfs = nn.LayerList() + self.discriminators = nn.LayerList() + for pqmf_param in pqmf_params: + d_params = copy.deepcopy(discriminator_params) + d_params["in_channels"] = pqmf_param[0] + if pqmf_param[0] == 1: + self.pqmfs.append(nn.Identity()) + else: + self.pqmfs.append(PQMF(*pqmf_param)) + self.discriminators.append(BaseDiscriminator(**d_params)) + + nn.initializer.set_global_initializer(None) + + # apply weight norm + if use_weight_norm: + self.apply_weight_norm() + + # reset parameters + self.reset_parameters() + + def forward(self, x): + """Calculate forward propagation. + Args: + x (Tensor): Input tensor (B, 1, T). + Returns: + List: List of discriminator outputs, #items in the list will be + equal to repeats * #discriminators. 
+ """ + outs = [] + for _ in range(self.repeats): + outs += self._forward(x) + return outs + + def _forward(self, x): + outs = [] + for idx, (ws, pqmf, disc) in enumerate( + zip(self.window_sizes, self.pqmfs, self.discriminators)): + start_idx = int(np.random.randint(paddle.shape(x)[-1] - ws)) + x_ = x[:, :, start_idx:start_idx + ws] + if idx == 0: + # nn.Identity() + x_ = pqmf(x_) + else: + x_ = pqmf.analysis(x_) + outs += [disc(x_)] + return outs + + def apply_weight_norm(self): + """Recursively apply weight normalization to all the Convolution layers + in the sublayers. + """ + + def _apply_weight_norm(layer): + if isinstance(layer, (nn.Conv1D, nn.Conv1DTranspose)): + nn.utils.weight_norm(layer) + + self.apply(_apply_weight_norm) + + def remove_weight_norm(self): + """Recursively remove weight normalization from all the Convolution + layers in the sublayers. + """ + + def _remove_weight_norm(layer): + try: + nn.utils.remove_weight_norm(layer) + except ValueError: + pass + + self.apply(_remove_weight_norm) + + def reset_parameters(self): + """Reset parameters. + This initialization follows official implementation manner. + https://github.com/descriptinc/melgan-neurips/blob/master/mel2wav/modules.py + """ + # 定义参数为float的正态分布。 + dist = paddle.distribution.Normal(loc=0.0, scale=0.02) + + def _reset_parameters(m): + if isinstance(m, nn.Conv1D) or isinstance(m, nn.Conv1DTranspose): + w = dist.sample(m.weight.shape) + m.weight.set_value(w) + + self.apply(_reset_parameters) + + +class StyleMelGANInference(nn.Layer): + def __init__(self, normalizer, style_melgan_generator): + super().__init__() + self.normalizer = normalizer + self.style_melgan_generator = style_melgan_generator + + def forward(self, logmel): + normalized_mel = self.normalizer(logmel) + wav = self.style_melgan_generator.inference(normalized_mel) + return wav diff --git a/ernie-sat/paddlespeech/t2s/models/melgan/style_melgan_updater.py b/ernie-sat/paddlespeech/t2s/models/melgan/style_melgan_updater.py new file mode 100644 index 0000000000000000000000000000000000000000..b0cb4ed662a4107aee405933b0b771a4b291be2a --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/models/melgan/style_melgan_updater.py @@ -0,0 +1,227 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import logging +from pathlib import Path +from typing import Dict + +import paddle +from paddle import distributed as dist +from paddle.io import DataLoader +from paddle.nn import Layer +from paddle.optimizer import Optimizer +from paddle.optimizer.lr import LRScheduler + +from paddlespeech.t2s.training.extensions.evaluator import StandardEvaluator +from paddlespeech.t2s.training.reporter import report +from paddlespeech.t2s.training.updaters.standard_updater import StandardUpdater +from paddlespeech.t2s.training.updaters.standard_updater import UpdaterState +logging.basicConfig( + format='%(asctime)s [%(levelname)s] [%(filename)s:%(lineno)d] %(message)s', + datefmt='[%Y-%m-%d %H:%M:%S]') +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + + +class StyleMelGANUpdater(StandardUpdater): + def __init__(self, + models: Dict[str, Layer], + optimizers: Dict[str, Optimizer], + criterions: Dict[str, Layer], + schedulers: Dict[str, LRScheduler], + dataloader: DataLoader, + generator_train_start_steps: int=0, + discriminator_train_start_steps: int=100000, + lambda_adv: float=1.0, + lambda_aux: float=1.0, + output_dir: Path=None): + self.models = models + self.generator: Layer = models['generator'] + self.discriminator: Layer = models['discriminator'] + + self.optimizers = optimizers + self.optimizer_g: Optimizer = optimizers['generator'] + self.optimizer_d: Optimizer = optimizers['discriminator'] + + self.criterions = criterions + self.criterion_stft = criterions['stft'] + self.criterion_gen_adv = criterions["gen_adv"] + self.criterion_dis_adv = criterions["dis_adv"] + + self.schedulers = schedulers + self.scheduler_g = schedulers['generator'] + self.scheduler_d = schedulers['discriminator'] + + self.dataloader = dataloader + + self.generator_train_start_steps = generator_train_start_steps + self.discriminator_train_start_steps = discriminator_train_start_steps + self.lambda_adv = lambda_adv + self.lambda_aux = lambda_aux + + self.state = UpdaterState(iteration=0, epoch=0) + self.train_iterator = iter(self.dataloader) + + log_file = output_dir / 'worker_{}.log'.format(dist.get_rank()) + self.filehandler = logging.FileHandler(str(log_file)) + logger.addHandler(self.filehandler) + self.logger = logger + self.msg = "" + + def update_core(self, batch): + self.msg = "Rank: {}, ".format(dist.get_rank()) + losses_dict = {} + # parse batch + wav, mel = batch + + # Generator + if self.state.iteration > self.generator_train_start_steps: + # (B, out_channels, T ** prod(upsample_scales) + wav_ = self.generator(mel) + + # initialize + gen_loss = 0.0 + aux_loss = 0.0 + + # full band multi-resolution stft loss + sc_loss, mag_loss = self.criterion_stft(wav_, wav) + aux_loss += sc_loss + mag_loss + report("train/spectral_convergence_loss", float(sc_loss)) + report("train/log_stft_magnitude_loss", float(mag_loss)) + losses_dict["spectral_convergence_loss"] = float(sc_loss) + losses_dict["log_stft_magnitude_loss"] = float(mag_loss) + + gen_loss += aux_loss * self.lambda_aux + + # adversarial loss + if self.state.iteration > self.discriminator_train_start_steps: + p_ = self.discriminator(wav_) + adv_loss = self.criterion_gen_adv(p_) + report("train/adversarial_loss", float(adv_loss)) + losses_dict["adversarial_loss"] = float(adv_loss) + + gen_loss += self.lambda_adv * adv_loss + + report("train/generator_loss", float(gen_loss)) + losses_dict["generator_loss"] = float(gen_loss) + + self.optimizer_g.clear_grad() + gen_loss.backward() + + self.optimizer_g.step() + self.scheduler_g.step() + + # 
Disctiminator + if self.state.iteration > self.discriminator_train_start_steps: + # re-compute wav_ which leads better quality + with paddle.no_grad(): + wav_ = self.generator(mel) + + p = self.discriminator(wav) + p_ = self.discriminator(wav_.detach()) + real_loss, fake_loss = self.criterion_dis_adv(p_, p) + dis_loss = real_loss + fake_loss + report("train/real_loss", float(real_loss)) + report("train/fake_loss", float(fake_loss)) + report("train/discriminator_loss", float(dis_loss)) + losses_dict["real_loss"] = float(real_loss) + losses_dict["fake_loss"] = float(fake_loss) + losses_dict["discriminator_loss"] = float(dis_loss) + + self.optimizer_d.clear_grad() + dis_loss.backward() + + self.optimizer_d.step() + self.scheduler_d.step() + + self.msg += ', '.join('{}: {:>.6f}'.format(k, v) + for k, v in losses_dict.items()) + + +class StyleMelGANEvaluator(StandardEvaluator): + def __init__(self, + models: Dict[str, Layer], + criterions: Dict[str, Layer], + dataloader: DataLoader, + lambda_adv: float=1.0, + lambda_aux: float=1.0, + output_dir: Path=None): + self.models = models + self.generator = models['generator'] + self.discriminator = models['discriminator'] + + self.criterions = criterions + self.criterion_stft = criterions['stft'] + self.criterion_gen_adv = criterions["gen_adv"] + self.criterion_dis_adv = criterions["dis_adv"] + + self.dataloader = dataloader + + self.lambda_adv = lambda_adv + self.lambda_aux = lambda_aux + + log_file = output_dir / 'worker_{}.log'.format(dist.get_rank()) + self.filehandler = logging.FileHandler(str(log_file)) + logger.addHandler(self.filehandler) + self.logger = logger + self.msg = "" + + def evaluate_core(self, batch): + self.msg = "Evaluate: " + losses_dict = {} + wav, mel = batch + + # Generator + # (B, out_channels, T ** prod(upsample_scales) + wav_ = self.generator(mel) + + # initialize + gen_loss = 0.0 + aux_loss = 0.0 + + # adversarial loss + p_ = self.discriminator(wav_) + adv_loss = self.criterion_gen_adv(p_) + report("eval/adversarial_loss", float(adv_loss)) + losses_dict["adversarial_loss"] = float(adv_loss) + + gen_loss += self.lambda_adv * adv_loss + + # multi-resolution stft loss + sc_loss, mag_loss = self.criterion_stft(wav_, wav) + aux_loss += sc_loss + mag_loss + report("eval/spectral_convergence_loss", float(sc_loss)) + report("eval/log_stft_magnitude_loss", float(mag_loss)) + losses_dict["spectral_convergence_loss"] = float(sc_loss) + losses_dict["log_stft_magnitude_loss"] = float(mag_loss) + + gen_loss += aux_loss * self.lambda_aux + + report("eval/generator_loss", float(gen_loss)) + losses_dict["generator_loss"] = float(gen_loss) + + # Disctiminator + p = self.discriminator(wav) + real_loss, fake_loss = self.criterion_dis_adv(p_, p) + dis_loss = real_loss + fake_loss + report("eval/real_loss", float(real_loss)) + report("eval/fake_loss", float(fake_loss)) + report("eval/discriminator_loss", float(dis_loss)) + + losses_dict["real_loss"] = float(real_loss) + losses_dict["fake_loss"] = float(fake_loss) + losses_dict["discriminator_loss"] = float(dis_loss) + + self.msg += ', '.join('{}: {:>.6f}'.format(k, v) + for k, v in losses_dict.items()) + self.logger.info(self.msg) diff --git a/ernie-sat/paddlespeech/t2s/models/parallel_wavegan/__init__.py b/ernie-sat/paddlespeech/t2s/models/parallel_wavegan/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..72322735b7c5719a12a60efaace31edde5d39aaa --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/models/parallel_wavegan/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 
2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from .parallel_wavegan import * +from .parallel_wavegan_updater import * diff --git a/ernie-sat/paddlespeech/t2s/models/parallel_wavegan/parallel_wavegan.py b/ernie-sat/paddlespeech/t2s/models/parallel_wavegan/parallel_wavegan.py new file mode 100644 index 0000000000000000000000000000000000000000..cc8460e4d7131331e66d55e5119942c531923409 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/models/parallel_wavegan/parallel_wavegan.py @@ -0,0 +1,450 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from espnet(https://github.com/espnet/espnet) +import math +from typing import Any +from typing import Dict +from typing import List +from typing import Optional + +import numpy as np +import paddle +from paddle import nn + +from paddlespeech.t2s.modules.activation import get_activation +from paddlespeech.t2s.modules.nets_utils import initialize +from paddlespeech.t2s.modules.residual_block import WaveNetResidualBlock as ResidualBlock +from paddlespeech.t2s.modules.upsample import ConvInUpsampleNet + + +class PWGGenerator(nn.Layer): + """Wave Generator for Parallel WaveGAN + + Args: + in_channels (int, optional): Number of channels of the input waveform, by default 1 + out_channels (int, optional): Number of channels of the output waveform, by default 1 + kernel_size (int, optional): Kernel size of the residual blocks inside, by default 3 + layers (int, optional): Number of residual blocks inside, by default 30 + stacks (int, optional): The number of groups to split the residual blocks into, by default 3 + Within each group, the dilation of the residual block grows exponentially. + residual_channels (int, optional): Residual channel of the residual blocks, by default 64 + gate_channels (int, optional): Gate channel of the residual blocks, by default 128 + skip_channels (int, optional): Skip channel of the residual blocks, by default 64 + aux_channels (int, optional): Auxiliary channel of the residual blocks, by default 80 + aux_context_window (int, optional): The context window size of the first convolution applied to the + auxiliary input, by default 2 + dropout (float, optional): Dropout of the residual blocks, by default 0. 
+ bias (bool, optional): Whether to use bias in residual blocks, by default True + use_weight_norm (bool, optional): Whether to use weight norm in all convolutions, by default True + use_causal_conv (bool, optional): Whether to use causal padding in the upsample network and residual + blocks, by default False + upsample_scales (List[int], optional): Upsample scales of the upsample network, by default [4, 4, 4, 4] + nonlinear_activation (Optional[str], optional): Non linear activation in upsample network, by default None + nonlinear_activation_params (Dict[str, Any], optional): Parameters passed to the linear activation in the upsample network, + by default {} + interpolate_mode (str, optional): Interpolation mode of the upsample network, by default "nearest" + freq_axis_kernel_size (int, optional): Kernel size along the frequency axis of the upsample network, by default 1 + """ + + def __init__( + self, + in_channels: int=1, + out_channels: int=1, + kernel_size: int=3, + layers: int=30, + stacks: int=3, + residual_channels: int=64, + gate_channels: int=128, + skip_channels: int=64, + aux_channels: int=80, + aux_context_window: int=2, + dropout: float=0., + bias: bool=True, + use_weight_norm: bool=True, + use_causal_conv: bool=False, + upsample_scales: List[int]=[4, 4, 4, 4], + nonlinear_activation: Optional[str]=None, + nonlinear_activation_params: Dict[str, Any]={}, + interpolate_mode: str="nearest", + freq_axis_kernel_size: int=1, + init_type: str="xavier_uniform", ): + super().__init__() + + # initialize parameters + initialize(self, init_type) + + # for compatibility + if nonlinear_activation: + nonlinear_activation = nonlinear_activation.lower() + + self.in_channels = in_channels + self.out_channels = out_channels + self.aux_channels = aux_channels + self.aux_context_window = aux_context_window + self.layers = layers + self.stacks = stacks + self.kernel_size = kernel_size + + assert layers % stacks == 0 + layers_per_stack = layers // stacks + + self.first_conv = nn.Conv1D( + in_channels, residual_channels, 1, bias_attr=True) + self.upsample_net = ConvInUpsampleNet( + upsample_scales=upsample_scales, + nonlinear_activation=nonlinear_activation, + nonlinear_activation_params=nonlinear_activation_params, + interpolate_mode=interpolate_mode, + freq_axis_kernel_size=freq_axis_kernel_size, + aux_channels=aux_channels, + aux_context_window=aux_context_window, + use_causal_conv=use_causal_conv) + self.upsample_factor = np.prod(upsample_scales) + + self.conv_layers = nn.LayerList() + for layer in range(layers): + dilation = 2**(layer % layers_per_stack) + conv = ResidualBlock( + kernel_size=kernel_size, + residual_channels=residual_channels, + gate_channels=gate_channels, + skip_channels=skip_channels, + aux_channels=aux_channels, + dilation=dilation, + dropout=dropout, + bias=bias, + use_causal_conv=use_causal_conv) + self.conv_layers.append(conv) + + self.last_conv_layers = nn.Sequential(nn.ReLU(), + nn.Conv1D( + skip_channels, + skip_channels, + 1, + bias_attr=True), + nn.ReLU(), + nn.Conv1D( + skip_channels, + out_channels, + 1, + bias_attr=True)) + + if use_weight_norm: + self.apply_weight_norm() + + def forward(self, x, c): + """Generate waveform. + + Args: + x(Tensor): Shape (N, C_in, T), The input waveform. + c(Tensor): Shape (N, C_aux, T'). The auxiliary input (e.g. spectrogram). It + is upsampled to match the time resolution of the input. + + Returns: + Tensor: Shape (N, C_out, T), the generated waveform. 
+ """ + c = self.upsample_net(c) + assert c.shape[-1] == x.shape[-1] + + x = self.first_conv(x) + skips = 0 + for f in self.conv_layers: + x, s = f(x, c) + skips += s + skips *= math.sqrt(1.0 / len(self.conv_layers)) + + x = self.last_conv_layers(skips) + return x + + def apply_weight_norm(self): + """Recursively apply weight normalization to all the Convolution layers + in the sublayers. + """ + + def _apply_weight_norm(layer): + if isinstance(layer, (nn.Conv1D, nn.Conv2D)): + nn.utils.weight_norm(layer) + + self.apply(_apply_weight_norm) + + def remove_weight_norm(self): + """Recursively remove weight normalization from all the Convolution + layers in the sublayers. + """ + + def _remove_weight_norm(layer): + try: + nn.utils.remove_weight_norm(layer) + except ValueError: + pass + + self.apply(_remove_weight_norm) + + def inference(self, c=None): + """Waveform generation. This function is used for single instance inference. + + Args: + c(Tensor, optional, optional): Shape (T', C_aux), the auxiliary input, by default None + x(Tensor, optional): Shape (T, C_in), the noise waveform, by default None + + Returns: + Tensor: Shape (T, C_out), the generated waveform + """ + # when to static, can not input x, see https://github.com/PaddlePaddle/Parakeet/pull/132/files + x = paddle.randn( + [1, self.in_channels, paddle.shape(c)[0] * self.upsample_factor]) + c = paddle.transpose(c, [1, 0]).unsqueeze(0) # pseudo batch + c = nn.Pad1D(self.aux_context_window, mode='replicate')(c) + out = self(x, c).squeeze(0).transpose([1, 0]) + return out + + +class PWGDiscriminator(nn.Layer): + """A convolutional discriminator for audio. + + Args: + in_channels (int, optional): Number of channels of the input audio, by default 1 + out_channels (int, optional): Output feature size, by default 1 + kernel_size (int, optional): Kernel size of convolutional sublayers, by default 3 + layers (int, optional): Number of layers, by default 10 + conv_channels (int, optional): Feature size of the convolutional sublayers, by default 64 + dilation_factor (int, optional): The factor with which dilation of each convolutional sublayers grows + exponentially if it is greater than 1, else the dilation of each convolutional sublayers grows linearly, + by default 1 + nonlinear_activation (str, optional): The activation after each convolutional sublayer, by default "leakyrelu" + nonlinear_activation_params (Dict[str, Any], optional): The parameters passed to the activation's initializer, by default + {"negative_slope": 0.2} + bias (bool, optional): Whether to use bias in convolutional sublayers, by default True + use_weight_norm (bool, optional): Whether to use weight normalization at all convolutional sublayers, + by default True + """ + + def __init__( + self, + in_channels: int=1, + out_channels: int=1, + kernel_size: int=3, + layers: int=10, + conv_channels: int=64, + dilation_factor: int=1, + nonlinear_activation: str="leakyrelu", + nonlinear_activation_params: Dict[str, Any]={"negative_slope": 0.2}, + bias: bool=True, + use_weight_norm: bool=True, + init_type: str="xavier_uniform", ): + super().__init__() + + # initialize parameters + initialize(self, init_type) + # for compatibility + if nonlinear_activation: + nonlinear_activation = nonlinear_activation.lower() + + assert kernel_size % 2 == 1 + assert dilation_factor > 0 + conv_layers = [] + conv_in_channels = in_channels + for i in range(layers - 1): + if i == 0: + dilation = 1 + else: + dilation = i if dilation_factor == 1 else dilation_factor**i + conv_in_channels = 
conv_channels + padding = (kernel_size - 1) // 2 * dilation + conv_layer = nn.Conv1D( + conv_in_channels, + conv_channels, + kernel_size, + padding=padding, + dilation=dilation, + bias_attr=bias) + nonlinear = get_activation(nonlinear_activation, + **nonlinear_activation_params) + conv_layers.append(conv_layer) + conv_layers.append(nonlinear) + padding = (kernel_size - 1) // 2 + last_conv = nn.Conv1D( + conv_in_channels, + out_channels, + kernel_size, + padding=padding, + bias_attr=bias) + conv_layers.append(last_conv) + self.conv_layers = nn.Sequential(*conv_layers) + + if use_weight_norm: + self.apply_weight_norm() + + def forward(self, x): + """ + + Args: + x (Tensor): Shape (N, in_channels, num_samples), the input audio. + + Returns: + Tensor: Shape (N, out_channels, num_samples), the predicted logits. + """ + return self.conv_layers(x) + + def apply_weight_norm(self): + def _apply_weight_norm(layer): + if isinstance(layer, (nn.Conv1D, nn.Conv2D)): + nn.utils.weight_norm(layer) + + self.apply(_apply_weight_norm) + + def remove_weight_norm(self): + def _remove_weight_norm(layer): + try: + nn.utils.remove_weight_norm(layer) + except ValueError: + pass + + self.apply(_remove_weight_norm) + + +class ResidualPWGDiscriminator(nn.Layer): + """A wavenet-style discriminator for audio. + + Args: + in_channels (int, optional): Number of channels of the input audio, by default 1 + out_channels (int, optional): Output feature size, by default 1 + kernel_size (int, optional): Kernel size of residual blocks, by default 3 + layers (int, optional): Number of residual blocks, by default 30 + stacks (int, optional): Number of groups of residual blocks, within which the dilation + of each residual blocks grows exponentially, by default 3 + residual_channels (int, optional): Residual channels of residual blocks, by default 64 + gate_channels (int, optional): Gate channels of residual blocks, by default 128 + skip_channels (int, optional): Skip channels of residual blocks, by default 64 + dropout (float, optional): Dropout probability of residual blocks, by default 0. 
+ bias (bool, optional): Whether to use bias in residual blocks, by default True + use_weight_norm (bool, optional): Whether to use weight normalization in all convolutional layers, + by default True + use_causal_conv (bool, optional): Whether to use causal convolution in residual blocks, by default False + nonlinear_activation (str, optional): Activation after convolutions other than those in residual blocks, + by default "leakyrelu" + nonlinear_activation_params (Dict[str, Any], optional): Parameters to pass to the activation, + by default {"negative_slope": 0.2} + """ + + def __init__( + self, + in_channels: int=1, + out_channels: int=1, + kernel_size: int=3, + layers: int=30, + stacks: int=3, + residual_channels: int=64, + gate_channels: int=128, + skip_channels: int=64, + dropout: float=0., + bias: bool=True, + use_weight_norm: bool=True, + use_causal_conv: bool=False, + nonlinear_activation: str="leakyrelu", + nonlinear_activation_params: Dict[str, Any]={"negative_slope": 0.2}, + init_type: str="xavier_uniform", ): + super().__init__() + + # initialize parameters + initialize(self, init_type) + + # for compatibility + if nonlinear_activation: + nonlinear_activation = nonlinear_activation.lower() + + assert kernel_size % 2 == 1 + self.in_channels = in_channels + self.out_channels = out_channels + self.layers = layers + self.stacks = stacks + self.kernel_size = kernel_size + + assert layers % stacks == 0 + layers_per_stack = layers // stacks + + self.first_conv = nn.Sequential( + nn.Conv1D(in_channels, residual_channels, 1, bias_attr=True), + get_activation(nonlinear_activation, **nonlinear_activation_params)) + + self.conv_layers = nn.LayerList() + for layer in range(layers): + dilation = 2**(layer % layers_per_stack) + conv = ResidualBlock( + kernel_size=kernel_size, + residual_channels=residual_channels, + gate_channels=gate_channels, + skip_channels=skip_channels, + aux_channels=None, # no auxiliary input + dropout=dropout, + dilation=dilation, + bias=bias, + use_causal_conv=use_causal_conv) + self.conv_layers.append(conv) + + self.last_conv_layers = nn.Sequential( + get_activation(nonlinear_activation, **nonlinear_activation_params), + nn.Conv1D(skip_channels, skip_channels, 1, bias_attr=True), + get_activation(nonlinear_activation, **nonlinear_activation_params), + nn.Conv1D(skip_channels, out_channels, 1, bias_attr=True)) + + if use_weight_norm: + self.apply_weight_norm() + + def forward(self, x): + """ + Args: + x(Tensor): Shape (N, in_channels, num_samples), the input audio.↩ + + Returns: + Tensor: Shape (N, out_channels, num_samples), the predicted logits. 
+ """ + x = self.first_conv(x) + skip = 0 + for f in self.conv_layers: + x, h = f(x, None) + skip += h + skip *= math.sqrt(1 / len(self.conv_layers)) + + x = skip + x = self.last_conv_layers(x) + return x + + def apply_weight_norm(self): + def _apply_weight_norm(layer): + if isinstance(layer, (nn.Conv1D, nn.Conv2D)): + nn.utils.weight_norm(layer) + + self.apply(_apply_weight_norm) + + def remove_weight_norm(self): + def _remove_weight_norm(layer): + try: + nn.utils.remove_weight_norm(layer) + except ValueError: + pass + + self.apply(_remove_weight_norm) + + +class PWGInference(nn.Layer): + def __init__(self, normalizer, pwg_generator): + super().__init__() + self.normalizer = normalizer + self.pwg_generator = pwg_generator + + def forward(self, logmel): + normalized_mel = self.normalizer(logmel) + wav = self.pwg_generator.inference(normalized_mel) + return wav diff --git a/ernie-sat/paddlespeech/t2s/models/parallel_wavegan/parallel_wavegan_updater.py b/ernie-sat/paddlespeech/t2s/models/parallel_wavegan/parallel_wavegan_updater.py new file mode 100644 index 0000000000000000000000000000000000000000..40cfff5a5eedf54754a2f7ef6388964a0949f6f2 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/models/parallel_wavegan/parallel_wavegan_updater.py @@ -0,0 +1,228 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
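For orientation, the snippet below shows how the vocoder classes above are typically driven at inference time. It is a hedged sketch, not code from this diff: the generator class name `PWGGenerator` and the 80-bin mel input are assumptions based on the surrounding PaddleSpeech-style code. The `PWGInference` wrapper defined above does the same thing after first passing the mel through its normalizer.

```python
import paddle

# Hedged usage sketch (not from this diff): vocode a mel spectrogram with the
# parallel wavegan generator defined above. The class name PWGGenerator and
# the 80-bin mel shape are assumptions, not values taken from this repo.
generator = PWGGenerator(aux_channels=80, upsample_scales=[4, 4, 4, 4])
generator.remove_weight_norm()         # weight norm is usually stripped for inference
generator.eval()

mel = paddle.randn([100, 80])          # (T', C_aux) log-mel frames, placeholder data
with paddle.no_grad():
    wav = generator.inference(mel)     # (T' * prod(upsample_scales), C_out)
print(wav.shape)
```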
+import logging +from pathlib import Path +from typing import Dict + +import paddle +from paddle import distributed as dist +from paddle.io import DataLoader +from paddle.nn import Layer +from paddle.optimizer import Optimizer +from paddle.optimizer.lr import LRScheduler + +from paddlespeech.t2s.training.extensions.evaluator import StandardEvaluator +from paddlespeech.t2s.training.reporter import report +from paddlespeech.t2s.training.updaters.standard_updater import StandardUpdater +from paddlespeech.t2s.training.updaters.standard_updater import UpdaterState + +logging.basicConfig( + format='%(asctime)s [%(levelname)s] [%(filename)s:%(lineno)d] %(message)s', + datefmt='[%Y-%m-%d %H:%M:%S]') +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + + +class PWGUpdater(StandardUpdater): + def __init__(self, + models: Dict[str, Layer], + optimizers: Dict[str, Optimizer], + criterions: Dict[str, Layer], + schedulers: Dict[str, LRScheduler], + dataloader: DataLoader, + generator_train_start_steps: int=0, + discriminator_train_start_steps: int=100000, + lambda_adv: float=1.0, + lambda_aux: float=1.0, + output_dir: Path=None): + self.models = models + self.generator: Layer = models['generator'] + self.discriminator: Layer = models['discriminator'] + + self.optimizers = optimizers + self.optimizer_g: Optimizer = optimizers['generator'] + self.optimizer_d: Optimizer = optimizers['discriminator'] + + self.criterions = criterions + self.criterion_stft = criterions['stft'] + self.criterion_mse = criterions['mse'] + + self.schedulers = schedulers + self.scheduler_g = schedulers['generator'] + self.scheduler_d = schedulers['discriminator'] + + self.dataloader = dataloader + + self.generator_train_start_steps = generator_train_start_steps + self.discriminator_train_start_steps = discriminator_train_start_steps + self.lambda_adv = lambda_adv + self.lambda_aux = lambda_aux + self.state = UpdaterState(iteration=0, epoch=0) + + self.train_iterator = iter(self.dataloader) + + log_file = output_dir / 'worker_{}.log'.format(dist.get_rank()) + self.filehandler = logging.FileHandler(str(log_file)) + logger.addHandler(self.filehandler) + self.logger = logger + self.msg = "" + + def update_core(self, batch): + self.msg = "Rank: {}, ".format(dist.get_rank()) + losses_dict = {} + # parse batch + wav, mel = batch + + # Generator + if self.state.iteration > self.generator_train_start_steps: + noise = paddle.randn(wav.shape) + wav_ = self.generator(noise, mel) + + # initialize + gen_loss = 0.0 + aux_loss = 0.0 + + # multi-resolution stft loss + sc_loss, mag_loss = self.criterion_stft(wav_, wav) + aux_loss += sc_loss + mag_loss + report("train/spectral_convergence_loss", float(sc_loss)) + report("train/log_stft_magnitude_loss", float(mag_loss)) + + gen_loss += aux_loss * self.lambda_aux + + losses_dict["spectral_convergence_loss"] = float(sc_loss) + losses_dict["log_stft_magnitude_loss"] = float(mag_loss) + + # adversarial loss + if self.state.iteration > self.discriminator_train_start_steps: + p_ = self.discriminator(wav_) + adv_loss = self.criterion_mse(p_, paddle.ones_like(p_)) + report("train/adversarial_loss", float(adv_loss)) + losses_dict["adversarial_loss"] = float(adv_loss) + + gen_loss += self.lambda_adv * adv_loss + + report("train/generator_loss", float(gen_loss)) + losses_dict["generator_loss"] = float(gen_loss) + + self.optimizer_g.clear_grad() + gen_loss.backward() + + self.optimizer_g.step() + self.scheduler_g.step() + + # Disctiminator + if self.state.iteration > 
self.discriminator_train_start_steps: + with paddle.no_grad(): + wav_ = self.generator(noise, mel) + p = self.discriminator(wav) + p_ = self.discriminator(wav_.detach()) + real_loss = self.criterion_mse(p, paddle.ones_like(p)) + fake_loss = self.criterion_mse(p_, paddle.zeros_like(p_)) + dis_loss = real_loss + fake_loss + report("train/real_loss", float(real_loss)) + report("train/fake_loss", float(fake_loss)) + report("train/discriminator_loss", float(dis_loss)) + losses_dict["real_loss"] = float(real_loss) + losses_dict["fake_loss"] = float(fake_loss) + losses_dict["discriminator_loss"] = float(dis_loss) + + self.optimizer_d.clear_grad() + dis_loss.backward() + + self.optimizer_d.step() + self.scheduler_d.step() + + self.msg += ', '.join('{}: {:>.6f}'.format(k, v) + for k, v in losses_dict.items()) + + +class PWGEvaluator(StandardEvaluator): + def __init__(self, + models: Dict[str, Layer], + criterions: Dict[str, Layer], + dataloader: DataLoader, + lambda_adv: float=1.0, + lambda_aux: float=1.0, + output_dir: Path=None): + self.models = models + self.generator = models['generator'] + self.discriminator = models['discriminator'] + + self.criterions = criterions + self.criterion_stft = criterions['stft'] + self.criterion_mse = criterions['mse'] + + self.dataloader = dataloader + + self.lambda_adv = lambda_adv + self.lambda_aux = lambda_aux + + log_file = output_dir / 'worker_{}.log'.format(dist.get_rank()) + self.filehandler = logging.FileHandler(str(log_file)) + logger.addHandler(self.filehandler) + self.logger = logger + self.msg = "" + + def evaluate_core(self, batch): + # logging.debug("Evaluate: ") + self.msg = "Evaluate: " + losses_dict = {} + wav, mel = batch + noise = paddle.randn(wav.shape) + + # Generator + wav_ = self.generator(noise, mel) + + # initialize + gen_loss = 0.0 + aux_loss = 0.0 + + # adversarial loss + p_ = self.discriminator(wav_) + adv_loss = self.criterion_mse(p_, paddle.ones_like(p_)) + report("eval/adversarial_loss", float(adv_loss)) + losses_dict["adversarial_loss"] = float(adv_loss) + + gen_loss += self.lambda_adv * adv_loss + + # multi-resolution stft loss + sc_loss, mag_loss = self.criterion_stft(wav_, wav) + report("eval/spectral_convergence_loss", float(sc_loss)) + report("eval/log_stft_magnitude_loss", float(mag_loss)) + losses_dict["spectral_convergence_loss"] = float(sc_loss) + losses_dict["log_stft_magnitude_loss"] = float(mag_loss) + aux_loss += sc_loss + mag_loss + + gen_loss += aux_loss * self.lambda_aux + + report("eval/generator_loss", float(gen_loss)) + losses_dict["generator_loss"] = float(gen_loss) + + # Disctiminator + p = self.discriminator(wav) + real_loss = self.criterion_mse(p, paddle.ones_like(p)) + fake_loss = self.criterion_mse(p_, paddle.zeros_like(p_)) + dis_loss = real_loss + fake_loss + report("eval/real_loss", float(real_loss)) + report("eval/fake_loss", float(fake_loss)) + report("eval/discriminator_loss", float(dis_loss)) + + losses_dict["real_loss"] = float(real_loss) + losses_dict["fake_loss"] = float(fake_loss) + losses_dict["discriminator_loss"] = float(dis_loss) + + self.msg += ', '.join('{}: {:>.6f}'.format(k, v) + for k, v in losses_dict.items()) + self.logger.info(self.msg) diff --git a/ernie-sat/paddlespeech/t2s/models/speedyspeech/__init__.py b/ernie-sat/paddlespeech/t2s/models/speedyspeech/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..abdac8da4dfd1c55b9ed4038e17602023bc3bbc5 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/models/speedyspeech/__init__.py @@ -0,0 +1,15 @@ +# Copyright 
(c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from .speedyspeech import * +from .speedyspeech_updater import * diff --git a/ernie-sat/paddlespeech/t2s/models/speedyspeech/speedyspeech.py b/ernie-sat/paddlespeech/t2s/models/speedyspeech/speedyspeech.py new file mode 100644 index 0000000000000000000000000000000000000000..44ccfc60ff3508eaf06b450a08e53418c82bcc12 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/models/speedyspeech/speedyspeech.py @@ -0,0 +1,254 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import paddle +from paddle import nn + +from paddlespeech.t2s.modules.nets_utils import initialize +from paddlespeech.t2s.modules.positional_encoding import sinusoid_position_encoding +from paddlespeech.t2s.modules.predictor.length_regulator import LengthRegulator + + +class ResidualBlock(nn.Layer): + def __init__(self, channels, kernel_size, dilation, n=2): + super().__init__() + blocks = [ + nn.Sequential( + nn.Conv1D( + channels, + channels, + kernel_size, + dilation=dilation, + padding="same", + data_format="NLC"), + nn.ReLU(), + nn.BatchNorm1D(channels, data_format="NLC"), ) for _ in range(n) + ] + self.blocks = nn.Sequential(*blocks) + + def forward(self, x): + return x + self.blocks(x) + + +class TextEmbedding(nn.Layer): + def __init__(self, + vocab_size: int, + embedding_size: int, + tone_vocab_size: int=None, + tone_embedding_size: int=None, + padding_idx: int=None, + tone_padding_idx: int=None, + concat: bool=False): + super().__init__() + self.text_embedding = nn.Embedding(vocab_size, embedding_size, + padding_idx) + if tone_vocab_size: + tone_embedding_size = tone_embedding_size or embedding_size + if tone_embedding_size != embedding_size and not concat: + raise ValueError( + "embedding size != tone_embedding size, only conat is avaiable." 
+ ) + self.tone_embedding = nn.Embedding( + tone_vocab_size, tone_embedding_size, tone_padding_idx) + self.concat = concat + + def forward(self, text, tone=None): + text_embed = self.text_embedding(text) + if tone is None: + return text_embed + tone_embed = self.tone_embedding(tone) + if self.concat: + embed = paddle.concat([text_embed, tone_embed], -1) + else: + embed = text_embed + tone_embed + return embed + + +class SpeedySpeechEncoder(nn.Layer): + def __init__(self, + vocab_size, + tone_size, + hidden_size, + kernel_size, + dilations, + spk_num=None): + super().__init__() + self.embedding = TextEmbedding( + vocab_size, + hidden_size, + tone_size, + padding_idx=0, + tone_padding_idx=0) + + if spk_num: + self.spk_emb = nn.Embedding( + num_embeddings=spk_num, + embedding_dim=hidden_size, + padding_idx=0) + else: + self.spk_emb = None + + self.prenet = nn.Sequential( + nn.Linear(hidden_size, hidden_size), + nn.ReLU(), ) + res_blocks = [ + ResidualBlock(hidden_size, kernel_size, d, n=2) for d in dilations + ] + self.res_blocks = nn.Sequential(*res_blocks) + + self.postnet1 = nn.Sequential(nn.Linear(hidden_size, hidden_size)) + self.postnet2 = nn.Sequential( + nn.ReLU(), + nn.BatchNorm1D(hidden_size, data_format="NLC"), + nn.Linear(hidden_size, hidden_size), ) + + def forward(self, text, tones, spk_id=None): + embedding = self.embedding(text, tones) + if self.spk_emb: + embedding += self.spk_emb(spk_id).unsqueeze(1) + embedding = self.prenet(embedding) + x = self.res_blocks(embedding) + x = embedding + self.postnet1(x) + x = self.postnet2(x) + return x + + +class DurationPredictor(nn.Layer): + def __init__(self, hidden_size): + super().__init__() + self.layers = nn.Sequential( + ResidualBlock(hidden_size, 4, 1, n=1), + ResidualBlock(hidden_size, 3, 1, n=1), + ResidualBlock(hidden_size, 1, 1, n=1), nn.Linear(hidden_size, 1)) + + def forward(self, x): + return paddle.squeeze(self.layers(x), -1) + + +class SpeedySpeechDecoder(nn.Layer): + def __init__(self, hidden_size, output_size, kernel_size, dilations): + super().__init__() + res_blocks = [ + ResidualBlock(hidden_size, kernel_size, d, n=2) for d in dilations + ] + self.res_blocks = nn.Sequential(*res_blocks) + + self.postnet1 = nn.Sequential(nn.Linear(hidden_size, hidden_size)) + self.postnet2 = nn.Sequential( + ResidualBlock(hidden_size, kernel_size, 1, n=2), + nn.Linear(hidden_size, output_size)) + + def forward(self, x): + xx = self.res_blocks(x) + x = x + self.postnet1(xx) + x = self.postnet2(x) + return x + + +class SpeedySpeech(nn.Layer): + def __init__( + self, + vocab_size, + encoder_hidden_size, + encoder_kernel_size, + encoder_dilations, + duration_predictor_hidden_size, + decoder_hidden_size, + decoder_output_size, + decoder_kernel_size, + decoder_dilations, + tone_size=None, + spk_num=None, + init_type: str="xavier_uniform", ): + super().__init__() + + # initialize parameters + initialize(self, init_type) + + encoder = SpeedySpeechEncoder(vocab_size, tone_size, + encoder_hidden_size, encoder_kernel_size, + encoder_dilations, spk_num) + duration_predictor = DurationPredictor(duration_predictor_hidden_size) + decoder = SpeedySpeechDecoder(decoder_hidden_size, decoder_output_size, + decoder_kernel_size, decoder_dilations) + + self.encoder = encoder + self.duration_predictor = duration_predictor + self.decoder = decoder + # define length regulator + self.length_regulator = LengthRegulator() + + nn.initializer.set_global_initializer(None) + + def forward(self, text, tones, durations, spk_id: paddle.Tensor=None): + # input of 
embedding must be int64 + text = paddle.cast(text, 'int64') + tones = paddle.cast(tones, 'int64') + if spk_id is not None: + spk_id = paddle.cast(spk_id, 'int64') + durations = paddle.cast(durations, 'int64') + encodings = self.encoder(text, tones, spk_id) + + pred_durations = self.duration_predictor(encodings.detach()) + + # expand encodings + durations_to_expand = durations + encodings = self.length_regulator(encodings, durations_to_expand) + + # decode + # remove positional encoding here + _, t_dec, feature_size = encodings.shape + encodings += sinusoid_position_encoding(t_dec, feature_size) + decoded = self.decoder(encodings) + return decoded, pred_durations + + def inference(self, text, tones=None, durations=None, spk_id=None): + # text: [T] + # tones: [T] + # input of embedding must be int64 + text = paddle.cast(text, 'int64') + text = text.unsqueeze(0) + if tones is not None: + tones = paddle.cast(tones, 'int64') + tones = tones.unsqueeze(0) + + encodings = self.encoder(text, tones, spk_id) + + if durations is None: + # (1, T) + pred_durations = self.duration_predictor(encodings) + durations_to_expand = paddle.round(pred_durations.exp()) + durations_to_expand = durations_to_expand.astype(paddle.int64) + else: + durations_to_expand = durations + encodings = self.length_regulator( + encodings, durations_to_expand, is_inference=True) + + shape = paddle.shape(encodings) + t_dec, feature_size = shape[1], shape[2] + encodings += sinusoid_position_encoding(t_dec, feature_size) + decoded = self.decoder(encodings) + return decoded[0] + + +class SpeedySpeechInference(nn.Layer): + def __init__(self, normalizer, speedyspeech_model): + super().__init__() + self.normalizer = normalizer + self.acoustic_model = speedyspeech_model + + def forward(self, phones, tones, spk_id=None, durations=None): + normalized_mel = self.acoustic_model.inference( + phones, tones, durations=durations, spk_id=spk_id) + logmel = self.normalizer.inverse(normalized_mel) + return logmel diff --git a/ernie-sat/paddlespeech/t2s/models/speedyspeech/speedyspeech_updater.py b/ernie-sat/paddlespeech/t2s/models/speedyspeech/speedyspeech_updater.py new file mode 100644 index 0000000000000000000000000000000000000000..e30a3fe1a5947c7046501ef26fe069656c1fcb31 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/models/speedyspeech/speedyspeech_updater.py @@ -0,0 +1,172 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
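As a quick orientation for the acoustic model defined above, the sketch below runs `SpeedySpeech.inference` on a toy input. Every size here (vocab, hidden widths, dilation schedule, tone count) is an illustrative assumption, not a value from this repo; in practice they come from the training config and the phone/tone dictionaries. The `SpeedySpeechInference` wrapper above pairs this with a normalizer so the output is denormalized back to log-mel for the vocoder.

```python
import paddle

# Hedged sketch: run the SpeedySpeech model above on toy inputs.
# Every hyperparameter below is an illustrative assumption.
model = SpeedySpeech(
    vocab_size=68,
    encoder_hidden_size=128,
    encoder_kernel_size=3,
    encoder_dilations=[1, 2, 4, 1, 2, 4, 1, 2, 4, 1, 1],
    duration_predictor_hidden_size=128,
    decoder_hidden_size=128,
    decoder_output_size=80,
    decoder_kernel_size=3,
    decoder_dilations=[1, 2, 4, 1, 2, 4, 1, 2, 4, 1, 1],
    tone_size=10)
model.eval()

phones = paddle.to_tensor([2, 14, 7, 5])   # (T,) phone ids
tones = paddle.to_tensor([1, 3, 2, 0])     # (T,) tone ids
with paddle.no_grad():
    # durations are predicted internally when not provided
    mel = model.inference(phones, tones)   # (T_frames, 80) normalized mel
```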
+import logging +from pathlib import Path + +import paddle +from paddle import distributed as dist +from paddle.fluid.layers import huber_loss +from paddle.io import DataLoader +from paddle.nn import functional as F +from paddle.nn import Layer +from paddle.optimizer import Optimizer + +from paddlespeech.t2s.modules.losses import masked_l1_loss +from paddlespeech.t2s.modules.losses import ssim +from paddlespeech.t2s.modules.losses import weighted_mean +from paddlespeech.t2s.training.extensions.evaluator import StandardEvaluator +from paddlespeech.t2s.training.reporter import report +from paddlespeech.t2s.training.updaters.standard_updater import StandardUpdater +logging.basicConfig( + format='%(asctime)s [%(levelname)s] [%(filename)s:%(lineno)d] %(message)s', + datefmt='[%Y-%m-%d %H:%M:%S]') +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + + +class SpeedySpeechUpdater(StandardUpdater): + def __init__(self, + model: Layer, + optimizer: Optimizer, + dataloader: DataLoader, + init_state=None, + output_dir: Path=None): + super().__init__(model, optimizer, dataloader, init_state=None) + + log_file = output_dir / 'worker_{}.log'.format(dist.get_rank()) + self.filehandler = logging.FileHandler(str(log_file)) + logger.addHandler(self.filehandler) + self.logger = logger + self.msg = "" + + def update_core(self, batch): + self.msg = "Rank: {}, ".format(dist.get_rank()) + losses_dict = {} + + # spk_id!=None in multiple spk speedyspeech + spk_id = batch["spk_id"] if "spk_id" in batch else None + + decoded, predicted_durations = self.model( + text=batch["phones"], + tones=batch["tones"], + durations=batch["durations"], + spk_id=spk_id) + + target_mel = batch["feats"] + spec_mask = F.sequence_mask( + batch["num_frames"], dtype=target_mel.dtype).unsqueeze(-1) + text_mask = F.sequence_mask( + batch["num_phones"], dtype=predicted_durations.dtype) + + # spec loss + l1_loss = masked_l1_loss(decoded, target_mel, spec_mask) + + # duration loss + target_durations = batch["durations"] + target_durations = paddle.maximum( + target_durations.astype(predicted_durations.dtype), + paddle.to_tensor([1.0])) + duration_loss = weighted_mean( + huber_loss( + predicted_durations, paddle.log(target_durations), delta=1.0), + text_mask, ) + + # ssim loss + ssim_loss = 1.0 - ssim((decoded * spec_mask).unsqueeze(1), + (target_mel * spec_mask).unsqueeze(1)) + + loss = l1_loss + ssim_loss + duration_loss + + optimizer = self.optimizer + optimizer.clear_grad() + loss.backward() + optimizer.step() + + report("train/loss", float(loss)) + report("train/l1_loss", float(l1_loss)) + report("train/duration_loss", float(duration_loss)) + report("train/ssim_loss", float(ssim_loss)) + + losses_dict["l1_loss"] = float(l1_loss) + losses_dict["duration_loss"] = float(duration_loss) + losses_dict["ssim_loss"] = float(ssim_loss) + losses_dict["loss"] = float(loss) + self.msg += ', '.join('{}: {:>.6f}'.format(k, v) + for k, v in losses_dict.items()) + + +class SpeedySpeechEvaluator(StandardEvaluator): + def __init__(self, + model: Layer, + dataloader: DataLoader, + output_dir: Path=None): + super().__init__(model, dataloader) + + log_file = output_dir / 'worker_{}.log'.format(dist.get_rank()) + self.filehandler = logging.FileHandler(str(log_file)) + logger.addHandler(self.filehandler) + self.logger = logger + self.msg = "" + + def evaluate_core(self, batch): + self.msg = "Evaluate: " + losses_dict = {} + + spk_id = batch["spk_id"] if "spk_id" in batch else None + + decoded, predicted_durations = self.model( + 
text=batch["phones"], + tones=batch["tones"], + durations=batch["durations"], + spk_id=spk_id) + + target_mel = batch["feats"] + spec_mask = F.sequence_mask( + batch["num_frames"], dtype=target_mel.dtype).unsqueeze(-1) + text_mask = F.sequence_mask( + batch["num_phones"], dtype=predicted_durations.dtype) + + # spec loss + l1_loss = masked_l1_loss(decoded, target_mel, spec_mask) + + # duration loss + target_durations = batch["durations"] + target_durations = paddle.maximum( + target_durations.astype(predicted_durations.dtype), + paddle.to_tensor([1.0])) + duration_loss = weighted_mean( + huber_loss( + predicted_durations, paddle.log(target_durations), delta=1.0), + text_mask, ) + + # ssim loss + ssim_loss = 1.0 - ssim((decoded * spec_mask).unsqueeze(1), + (target_mel * spec_mask).unsqueeze(1)) + + loss = l1_loss + ssim_loss + duration_loss + + # import pdb; pdb.set_trace() + + report("eval/loss", float(loss)) + report("eval/l1_loss", float(l1_loss)) + report("eval/duration_loss", float(duration_loss)) + report("eval/ssim_loss", float(ssim_loss)) + + losses_dict["l1_loss"] = float(l1_loss) + losses_dict["duration_loss"] = float(duration_loss) + losses_dict["ssim_loss"] = float(ssim_loss) + losses_dict["loss"] = float(loss) + self.msg += ', '.join('{}: {:>.6f}'.format(k, v) + for k, v in losses_dict.items()) + self.logger.info(self.msg) diff --git a/ernie-sat/paddlespeech/t2s/models/tacotron2/__init__.py b/ernie-sat/paddlespeech/t2s/models/tacotron2/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ea63257c80d10cf16f34b027ad190edc15bfc815 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/models/tacotron2/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from .tacotron2 import * +from .tacotron2_updater import * diff --git a/ernie-sat/paddlespeech/t2s/models/tacotron2/tacotron2.py b/ernie-sat/paddlespeech/t2s/models/tacotron2/tacotron2.py new file mode 100644 index 0000000000000000000000000000000000000000..7b306e4820de10db9ae8551fffe62ab50d055905 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/models/tacotron2/tacotron2.py @@ -0,0 +1,441 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
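To make the duration term of the loss computed by `SpeedySpeechUpdater` / `SpeedySpeechEvaluator` above concrete: target durations are clipped to at least one frame and compared with the prediction in the log domain using a Huber penalty (the real code additionally takes a masked, weighted mean over valid phones). The standalone sketch below uses only public Paddle ops and illustrative values; it is not code from this diff.

```python
import paddle
import paddle.nn.functional as F

# Illustrative values, not taken from this repo.
pred_log_dur = paddle.to_tensor([[0.1, 1.2, 0.8, 0.0]])   # model output (log frames)
target_dur = paddle.to_tensor([[0.0, 3.0, 2.0, 1.0]])     # ground-truth frames per phone

# clip to >= 1 frame so log() is well defined, then Huber / smooth-L1 with delta=1.0
target_dur = paddle.maximum(target_dur, paddle.to_tensor([1.0]))
dur_loss = F.smooth_l1_loss(pred_log_dur, paddle.log(target_dur), delta=1.0)
print(float(dur_loss))
```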
+# Modified from espnet(https://github.com/espnet/espnet) +"""Tacotron 2 related modules for paddle""" +import logging +from typing import Dict +from typing import Optional +from typing import Tuple + +import paddle +import paddle.nn.functional as F +from paddle import nn +from typeguard import check_argument_types + +from paddlespeech.t2s.modules.nets_utils import initialize +from paddlespeech.t2s.modules.nets_utils import make_pad_mask +from paddlespeech.t2s.modules.tacotron2.attentions import AttForward +from paddlespeech.t2s.modules.tacotron2.attentions import AttForwardTA +from paddlespeech.t2s.modules.tacotron2.attentions import AttLoc +from paddlespeech.t2s.modules.tacotron2.decoder import Decoder +from paddlespeech.t2s.modules.tacotron2.encoder import Encoder + + +class Tacotron2(nn.Layer): + """Tacotron2 module for end-to-end text-to-speech. + + This is a module of Spectrogram prediction network in Tacotron2 described + in `Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions`_, + which converts the sequence of characters into the sequence of Mel-filterbanks. + + .. _`Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions`: + https://arxiv.org/abs/1712.05884 + + """ + + def __init__( + self, + # network structure related + idim: int, + odim: int, + embed_dim: int=512, + elayers: int=1, + eunits: int=512, + econv_layers: int=3, + econv_chans: int=512, + econv_filts: int=5, + atype: str="location", + adim: int=512, + aconv_chans: int=32, + aconv_filts: int=15, + cumulate_att_w: bool=True, + dlayers: int=2, + dunits: int=1024, + prenet_layers: int=2, + prenet_units: int=256, + postnet_layers: int=5, + postnet_chans: int=512, + postnet_filts: int=5, + output_activation: str=None, + use_batch_norm: bool=True, + use_concate: bool=True, + use_residual: bool=False, + reduction_factor: int=1, + # extra embedding related + spk_num: Optional[int]=None, + lang_num: Optional[int]=None, + spk_embed_dim: Optional[int]=None, + spk_embed_integration_type: str="concat", + dropout_rate: float=0.5, + zoneout_rate: float=0.1, + # training related + init_type: str="xavier_uniform", ): + """Initialize Tacotron2 module. + Args: + idim (int): Dimension of the inputs. + odim (int): Dimension of the outputs. + embed_dim (int): Dimension of the token embedding. + elayers (int): Number of encoder blstm layers. + eunits (int): Number of encoder blstm units. + econv_layers (int): Number of encoder conv layers. + econv_filts (int): Number of encoder conv filter size. + econv_chans (int): Number of encoder conv filter channels. + dlayers (int): Number of decoder lstm layers. + dunits (int): Number of decoder lstm units. + prenet_layers (int): Number of prenet layers. + prenet_units (int): Number of prenet units. + postnet_layers (int): Number of postnet layers. + postnet_filts (int): Number of postnet filter size. + postnet_chans (int): Number of postnet filter channels. + output_activation (str): Name of activation function for outputs. + adim (int): Number of dimension of mlp in attention. + aconv_chans (int): Number of attention conv filter channels. + aconv_filts (int): Number of attention conv filter size. + cumulate_att_w (bool): Whether to cumulate previous attention weight. + use_batch_norm (bool): Whether to use batch normalization. + use_concate (bool): Whether to concat enc outputs w/ dec lstm outputs. + reduction_factor (int): Reduction factor. + spk_num (Optional[int]): Number of speakers. 
If set to > 1, assume that the + sids will be provided as the input and use sid embedding layer. + lang_num (Optional[int]): Number of languages. If set to > 1, assume that the + lids will be provided as the input and use sid embedding layer. + spk_embed_dim (Optional[int]): Speaker embedding dimension. If set to > 0, + assume that spk_emb will be provided as the input. + spk_embed_integration_type (str): How to integrate speaker embedding. + dropout_rate (float): Dropout rate. + zoneout_rate (float): Zoneout rate. + """ + assert check_argument_types() + super().__init__() + + # store hyperparameters + self.idim = idim + self.odim = odim + self.eos = idim - 1 + self.cumulate_att_w = cumulate_att_w + self.reduction_factor = reduction_factor + + # define activation function for the final output + if output_activation is None: + self.output_activation_fn = None + elif hasattr(F, output_activation): + self.output_activation_fn = getattr(F, output_activation) + else: + raise ValueError(f"there is no such an activation function. " + f"({output_activation})") + + # set padding idx + padding_idx = 0 + self.padding_idx = padding_idx + + # initialize parameters + initialize(self, init_type) + + # define network modules + self.enc = Encoder( + idim=idim, + embed_dim=embed_dim, + elayers=elayers, + eunits=eunits, + econv_layers=econv_layers, + econv_chans=econv_chans, + econv_filts=econv_filts, + use_batch_norm=use_batch_norm, + use_residual=use_residual, + dropout_rate=dropout_rate, + padding_idx=padding_idx, ) + + self.spk_num = None + if spk_num is not None and spk_num > 1: + self.spk_num = spk_num + self.sid_emb = nn.Embedding(spk_num, eunits) + self.lang_num = None + if lang_num is not None and lang_num > 1: + self.lang_num = lang_num + self.lid_emb = nn.Embedding(lang_num, eunits) + + self.spk_embed_dim = None + if spk_embed_dim is not None and spk_embed_dim > 0: + self.spk_embed_dim = spk_embed_dim + self.spk_embed_integration_type = spk_embed_integration_type + if self.spk_embed_dim is None: + dec_idim = eunits + elif self.spk_embed_integration_type == "concat": + dec_idim = eunits + spk_embed_dim + elif self.spk_embed_integration_type == "add": + dec_idim = eunits + self.projection = nn.Linear(self.spk_embed_dim, eunits) + else: + raise ValueError(f"{spk_embed_integration_type} is not supported.") + + if atype == "location": + att = AttLoc(dec_idim, dunits, adim, aconv_chans, aconv_filts) + elif atype == "forward": + att = AttForward(dec_idim, dunits, adim, aconv_chans, aconv_filts) + if self.cumulate_att_w: + logging.warning("cumulation of attention weights is disabled " + "in forward attention.") + self.cumulate_att_w = False + elif atype == "forward_ta": + att = AttForwardTA(dec_idim, dunits, adim, aconv_chans, aconv_filts, + odim) + if self.cumulate_att_w: + logging.warning("cumulation of attention weights is disabled " + "in forward attention.") + self.cumulate_att_w = False + else: + raise NotImplementedError("Support only location or forward") + self.dec = Decoder( + idim=dec_idim, + odim=odim, + att=att, + dlayers=dlayers, + dunits=dunits, + prenet_layers=prenet_layers, + prenet_units=prenet_units, + postnet_layers=postnet_layers, + postnet_chans=postnet_chans, + postnet_filts=postnet_filts, + output_activation_fn=self.output_activation_fn, + cumulate_att_w=self.cumulate_att_w, + use_batch_norm=use_batch_norm, + use_concate=use_concate, + dropout_rate=dropout_rate, + zoneout_rate=zoneout_rate, + reduction_factor=reduction_factor, ) + + nn.initializer.set_global_initializer(None) + + 
def forward( + self, + text: paddle.Tensor, + text_lengths: paddle.Tensor, + speech: paddle.Tensor, + speech_lengths: paddle.Tensor, + spk_emb: Optional[paddle.Tensor]=None, + spk_id: Optional[paddle.Tensor]=None, + lang_id: Optional[paddle.Tensor]=None + ) -> Tuple[paddle.Tensor, Dict[str, paddle.Tensor], paddle.Tensor]: + """Calculate forward propagation. + + Args: + text (Tensor(int64)): Batch of padded character ids (B, T_text). + text_lengths (Tensor(int64)): Batch of lengths of each input batch (B,). + speech (Tensor): Batch of padded target features (B, T_feats, odim). + speech_lengths (Tensor(int64)): Batch of the lengths of each target (B,). + spk_emb (Optional[Tensor]): Batch of speaker embeddings (B, spk_embed_dim). + spk_id (Optional[Tensor]): Batch of speaker IDs (B, 1). + lang_id (Optional[Tensor]): Batch of language IDs (B, 1). + + Returns: + Tensor: Loss scalar value. + Dict: Statistics to be monitored. + Tensor: Weight value if not joint training else model outputs. + + """ + text = text[:, :text_lengths.max()] + speech = speech[:, :speech_lengths.max()] + + batch_size = paddle.shape(text)[0] + + # Add eos at the last of sequence + xs = F.pad(text, [0, 0, 0, 1], "constant", self.padding_idx) + for i, l in enumerate(text_lengths): + xs[i, l] = self.eos + ilens = text_lengths + 1 + + ys = speech + olens = speech_lengths + + # make labels for stop prediction + stop_labels = make_pad_mask(olens - 1) + # bool 类型无法切片 + stop_labels = paddle.cast(stop_labels, dtype='float32') + stop_labels = F.pad(stop_labels, [0, 0, 0, 1], "constant", 1.0) + + # calculate tacotron2 outputs + after_outs, before_outs, logits, att_ws = self._forward( + xs=xs, + ilens=ilens, + ys=ys, + olens=olens, + spk_emb=spk_emb, + spk_id=spk_id, + lang_id=lang_id, ) + + # modify mod part of groundtruth + if self.reduction_factor > 1: + assert olens.ge(self.reduction_factor).all( + ), "Output length must be greater than or equal to reduction factor." + olens = olens - olens % self.reduction_factor + max_out = max(olens) + ys = ys[:, :max_out] + stop_labels = stop_labels[:, :max_out] + stop_labels = paddle.scatter(stop_labels, 1, + (olens - 1).unsqueeze(1), 1.0) + olens_in = olens // self.reduction_factor + else: + olens_in = olens + return after_outs, before_outs, logits, ys, stop_labels, olens, att_ws, olens_in + + def _forward( + self, + xs: paddle.Tensor, + ilens: paddle.Tensor, + ys: paddle.Tensor, + olens: paddle.Tensor, + spk_emb: paddle.Tensor, + spk_id: paddle.Tensor, + lang_id: paddle.Tensor, + ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]: + + hs, hlens = self.enc(xs, ilens) + if self.spk_num is not None: + sid_embs = self.sid_emb(spk_id.reshape([-1])) + hs = hs + sid_embs.unsqueeze(1) + if self.lang_num is not None: + lid_embs = self.lid_emb(lang_id.reshape([-1])) + hs = hs + lid_embs.unsqueeze(1) + if self.spk_embed_dim is not None: + hs = self._integrate_with_spk_embed(hs, spk_emb) + + return self.dec(hs, hlens, ys) + + def inference( + self, + text: paddle.Tensor, + speech: Optional[paddle.Tensor]=None, + spk_emb: Optional[paddle.Tensor]=None, + spk_id: Optional[paddle.Tensor]=None, + lang_id: Optional[paddle.Tensor]=None, + threshold: float=0.5, + minlenratio: float=0.0, + maxlenratio: float=10.0, + use_att_constraint: bool=False, + backward_window: int=1, + forward_window: int=3, + use_teacher_forcing: bool=False, ) -> Dict[str, paddle.Tensor]: + """Generate the sequence of features given the sequences of characters. 
+ + Args: + text (Tensor(int64)): Input sequence of characters (T_text,). + speech (Optional[Tensor]): Feature sequence to extract style (N, idim). + spk_emb (ptional[Tensor]): Speaker embedding (spk_embed_dim,). + spk_id (Optional[Tensor]): Speaker ID (1,). + lang_id (Optional[Tensor]): Language ID (1,). + threshold (float): Threshold in inference. + minlenratio (float): Minimum length ratio in inference. + maxlenratio (float): Maximum length ratio in inference. + use_att_constraint (bool): Whether to apply attention constraint. + backward_window (int): Backward window in attention constraint. + forward_window (int): Forward window in attention constraint. + use_teacher_forcing (bool): Whether to use teacher forcing. + + Returns: + Dict[str, Tensor] + Output dict including the following items: + * feat_gen (Tensor): Output sequence of features (T_feats, odim). + * prob (Tensor): Output sequence of stop probabilities (T_feats,). + * att_w (Tensor): Attention weights (T_feats, T). + + """ + x = text + y = speech + + # add eos at the last of sequence + x = F.pad(x, [0, 1], "constant", self.eos) + + # inference with teacher forcing + if use_teacher_forcing: + assert speech is not None, "speech must be provided with teacher forcing." + + xs, ys = x.unsqueeze(0), y.unsqueeze(0) + spk_emb = None if spk_emb is None else spk_emb.unsqueeze(0) + ilens = paddle.shape(xs)[1] + olens = paddle.shape(ys)[1] + outs, _, _, att_ws = self._forward( + xs=xs, + ilens=ilens, + ys=ys, + olens=olens, + spk_emb=spk_emb, + spk_id=spk_id, + lang_id=lang_id, ) + + return dict(feat_gen=outs[0], att_w=att_ws[0]) + + # inference + h = self.enc.inference(x) + + if self.spk_num is not None: + sid_emb = self.sid_emb(spk_id.reshape([-1])) + h = h + sid_emb + if self.lang_num is not None: + lid_emb = self.lid_emb(lang_id.reshape([-1])) + h = h + lid_emb + if self.spk_embed_dim is not None: + hs, spk_emb = h.unsqueeze(0), spk_emb.unsqueeze(0) + h = self._integrate_with_spk_embed(hs, spk_emb)[0] + out, prob, att_w = self.dec.inference( + h, + threshold=threshold, + minlenratio=minlenratio, + maxlenratio=maxlenratio, + use_att_constraint=use_att_constraint, + backward_window=backward_window, + forward_window=forward_window, ) + + return dict(feat_gen=out, prob=prob, att_w=att_w) + + def _integrate_with_spk_embed(self, + hs: paddle.Tensor, + spk_emb: paddle.Tensor) -> paddle.Tensor: + """Integrate speaker embedding with hidden states. + + Args: + hs (Tensor): Batch of hidden state sequences (B, Tmax, eunits). + spk_emb (Tensor): Batch of speaker embeddings (B, spk_embed_dim). + + Returns: + Tensor: Batch of integrated hidden state sequences (B, Tmax, eunits) if + integration_type is "add" else (B, Tmax, eunits + spk_embed_dim). 
+ + """ + if self.spk_embed_integration_type == "add": + # apply projection and then add to hidden states + spk_emb = self.projection(F.normalize(spk_emb)) + hs = hs + spk_emb.unsqueeze(1) + elif self.spk_embed_integration_type == "concat": + # concat hidden states with spk embeds + spk_emb = F.normalize(spk_emb).unsqueeze(1).expand( + shape=[-1, paddle.shape(hs)[1], -1]) + hs = paddle.concat([hs, spk_emb], axis=-1) + else: + raise NotImplementedError("support only add or concat.") + + return hs + + +class Tacotron2Inference(nn.Layer): + def __init__(self, normalizer, model): + super().__init__() + self.normalizer = normalizer + self.acoustic_model = model + + def forward(self, text, spk_id=None, spk_emb=None): + out = self.acoustic_model.inference( + text, spk_id=spk_id, spk_emb=spk_emb) + normalized_mel = out["feat_gen"] + logmel = self.normalizer.inverse(normalized_mel) + return logmel diff --git a/ernie-sat/paddlespeech/t2s/models/tacotron2/tacotron2_updater.py b/ernie-sat/paddlespeech/t2s/models/tacotron2/tacotron2_updater.py new file mode 100644 index 0000000000000000000000000000000000000000..09e6827d04ec97da2d37cc1393480fda3abc234e --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/models/tacotron2/tacotron2_updater.py @@ -0,0 +1,219 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
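For orientation, the sketch below drives `Tacotron2.inference` defined above on a toy input. `idim`/`odim` and the phone ids are placeholder assumptions; real values come from the phone dictionary and the feature-extraction config. The `Tacotron2Inference` wrapper above then denormalizes `feat_gen` so it can be fed to the vocoder.

```python
import paddle

# Hedged sketch: run the Tacotron2 module above in free-running inference.
# idim/odim and the ids below are placeholders, not values from this repo.
model = Tacotron2(idim=40, odim=80)
model.eval()

text = paddle.to_tensor([3, 7, 12, 5])     # (T_text,) phone ids
with paddle.no_grad():
    out = model.inference(text, threshold=0.5, maxlenratio=10.0)

mel = out["feat_gen"]    # (T_feats, 80) mel features
prob = out["prob"]       # (T_feats,) stop-token probabilities
att_w = out["att_w"]     # (T_feats, T_text + 1) attention weights (eos is appended)
```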
+import logging +from pathlib import Path + +from paddle import distributed as dist +from paddle.io import DataLoader +from paddle.nn import Layer +from paddle.optimizer import Optimizer + +from paddlespeech.t2s.modules.losses import GuidedAttentionLoss +from paddlespeech.t2s.modules.losses import Tacotron2Loss +from paddlespeech.t2s.training.extensions.evaluator import StandardEvaluator +from paddlespeech.t2s.training.reporter import report +from paddlespeech.t2s.training.updaters.standard_updater import StandardUpdater +logging.basicConfig( + format='%(asctime)s [%(levelname)s] [%(filename)s:%(lineno)d] %(message)s', + datefmt='[%Y-%m-%d %H:%M:%S]') +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + + +class Tacotron2Updater(StandardUpdater): + def __init__(self, + model: Layer, + optimizer: Optimizer, + dataloader: DataLoader, + init_state=None, + use_masking: bool=True, + use_weighted_masking: bool=False, + bce_pos_weight: float=5.0, + loss_type: str="L1+L2", + use_guided_attn_loss: bool=True, + guided_attn_loss_sigma: float=0.4, + guided_attn_loss_lambda: float=1.0, + output_dir: Path=None): + super().__init__(model, optimizer, dataloader, init_state=None) + + self.loss_type = loss_type + self.use_guided_attn_loss = use_guided_attn_loss + + self.taco2_loss = Tacotron2Loss( + use_masking=use_masking, + use_weighted_masking=use_weighted_masking, + bce_pos_weight=bce_pos_weight, ) + if self.use_guided_attn_loss: + self.attn_loss = GuidedAttentionLoss( + sigma=guided_attn_loss_sigma, + alpha=guided_attn_loss_lambda, ) + + log_file = output_dir / 'worker_{}.log'.format(dist.get_rank()) + self.filehandler = logging.FileHandler(str(log_file)) + logger.addHandler(self.filehandler) + self.logger = logger + self.msg = "" + + def update_core(self, batch): + self.msg = "Rank: {}, ".format(dist.get_rank()) + losses_dict = {} + # spk_id!=None in multiple spk fastspeech2 + spk_id = batch["spk_id"] if "spk_id" in batch else None + spk_emb = batch["spk_emb"] if "spk_emb" in batch else None + if spk_emb is not None: + spk_id = None + + after_outs, before_outs, logits, ys, stop_labels, olens, att_ws, olens_in = self.model( + text=batch["text"], + text_lengths=batch["text_lengths"], + speech=batch["speech"], + speech_lengths=batch["speech_lengths"], + spk_id=spk_id, + spk_emb=spk_emb) + + # calculate taco2 loss + l1_loss, mse_loss, bce_loss = self.taco2_loss( + after_outs=after_outs, + before_outs=before_outs, + logits=logits, + ys=ys, + stop_labels=stop_labels, + olens=olens) + + if self.loss_type == "L1+L2": + loss = l1_loss + mse_loss + bce_loss + elif self.loss_type == "L1": + loss = l1_loss + bce_loss + elif self.loss_type == "L2": + loss = mse_loss + bce_loss + else: + raise ValueError(f"unknown --loss-type {self.loss_type}") + + # calculate attention loss + if self.use_guided_attn_loss: + # NOTE: length of output for auto-regressive + # input will be changed when r > 1 + attn_loss = self.attn_loss( + att_ws=att_ws, ilens=batch["text_lengths"] + 1, olens=olens_in) + loss = loss + attn_loss + + optimizer = self.optimizer + optimizer.clear_grad() + loss.backward() + optimizer.step() + + report("train/l1_loss", float(l1_loss)) + report("train/mse_loss", float(mse_loss)) + report("train/bce_loss", float(bce_loss)) + report("train/attn_loss", float(attn_loss)) + report("train/loss", float(loss)) + + losses_dict["l1_loss"] = float(l1_loss) + losses_dict["mse_loss"] = float(mse_loss) + losses_dict["bce_loss"] = float(bce_loss) + losses_dict["attn_loss"] = float(attn_loss) + 
losses_dict["loss"] = float(loss) + self.msg += ', '.join('{}: {:>.6f}'.format(k, v) + for k, v in losses_dict.items()) + + +class Tacotron2Evaluator(StandardEvaluator): + def __init__(self, + model: Layer, + dataloader: DataLoader, + use_masking: bool=True, + use_weighted_masking: bool=False, + bce_pos_weight: float=5.0, + loss_type: str="L1+L2", + use_guided_attn_loss: bool=True, + guided_attn_loss_sigma: float=0.4, + guided_attn_loss_lambda: float=1.0, + output_dir=None): + super().__init__(model, dataloader) + + self.loss_type = loss_type + self.use_guided_attn_loss = use_guided_attn_loss + + self.taco2_loss = Tacotron2Loss( + use_masking=use_masking, + use_weighted_masking=use_weighted_masking, + bce_pos_weight=bce_pos_weight, ) + if self.use_guided_attn_loss: + self.attn_loss = GuidedAttentionLoss( + sigma=guided_attn_loss_sigma, + alpha=guided_attn_loss_lambda, ) + + log_file = output_dir / 'worker_{}.log'.format(dist.get_rank()) + self.filehandler = logging.FileHandler(str(log_file)) + logger.addHandler(self.filehandler) + self.logger = logger + self.msg = "" + + def evaluate_core(self, batch): + self.msg = "Evaluate: " + losses_dict = {} + # spk_id!=None in multiple spk fastspeech2 + spk_id = batch["spk_id"] if "spk_id" in batch else None + spk_emb = batch["spk_emb"] if "spk_emb" in batch else None + if spk_emb is not None: + spk_id = None + + after_outs, before_outs, logits, ys, stop_labels, olens, att_ws, olens_in = self.model( + text=batch["text"], + text_lengths=batch["text_lengths"], + speech=batch["speech"], + speech_lengths=batch["speech_lengths"], + spk_id=spk_id, + spk_emb=spk_emb) + + # calculate taco2 loss + l1_loss, mse_loss, bce_loss = self.taco2_loss( + after_outs=after_outs, + before_outs=before_outs, + logits=logits, + ys=ys, + stop_labels=stop_labels, + olens=olens) + + if self.loss_type == "L1+L2": + loss = l1_loss + mse_loss + bce_loss + elif self.loss_type == "L1": + loss = l1_loss + bce_loss + elif self.loss_type == "L2": + loss = mse_loss + bce_loss + else: + raise ValueError(f"unknown --loss-type {self.loss_type}") + + # calculate attention loss + if self.use_guided_attn_loss: + # NOTE: length of output for auto-regressive + # input will be changed when r > 1 + attn_loss = self.attn_loss( + att_ws=att_ws, ilens=batch["text_lengths"] + 1, olens=olens_in) + loss = loss + attn_loss + + report("eval/l1_loss", float(l1_loss)) + report("eval/mse_loss", float(mse_loss)) + report("eval/bce_loss", float(bce_loss)) + report("eval/attn_loss", float(attn_loss)) + report("eval/loss", float(loss)) + + losses_dict["l1_loss"] = float(l1_loss) + losses_dict["mse_loss"] = float(mse_loss) + losses_dict["bce_loss"] = float(bce_loss) + losses_dict["attn_loss"] = float(attn_loss) + losses_dict["loss"] = float(loss) + self.msg += ', '.join('{}: {:>.6f}'.format(k, v) + for k, v in losses_dict.items()) + self.logger.info(self.msg) diff --git a/ernie-sat/paddlespeech/t2s/models/transformer_tts/__init__.py b/ernie-sat/paddlespeech/t2s/models/transformer_tts/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..80a151ecaf28e7bfb54bb3ec62bf2f6c6480514b --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/models/transformer_tts/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from .transformer_tts import * +from .transformer_tts_updater import * diff --git a/ernie-sat/paddlespeech/t2s/models/transformer_tts/transformer_tts.py b/ernie-sat/paddlespeech/t2s/models/transformer_tts/transformer_tts.py new file mode 100644 index 0000000000000000000000000000000000000000..92754c30a47e9619643b5f780205a5b47d971841 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/models/transformer_tts/transformer_tts.py @@ -0,0 +1,674 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from espnet(https://github.com/espnet/espnet) +"""Fastspeech2 related modules for paddle""" +from typing import Dict +from typing import Sequence +from typing import Tuple + +import numpy +import paddle +import paddle.nn.functional as F +from paddle import nn +from typeguard import check_argument_types + +from paddlespeech.t2s.modules.nets_utils import initialize +from paddlespeech.t2s.modules.nets_utils import make_non_pad_mask +from paddlespeech.t2s.modules.nets_utils import make_pad_mask +from paddlespeech.t2s.modules.style_encoder import StyleEncoder +from paddlespeech.t2s.modules.tacotron2.decoder import Postnet +from paddlespeech.t2s.modules.tacotron2.decoder import Prenet as DecoderPrenet +from paddlespeech.t2s.modules.tacotron2.encoder import Encoder as EncoderPrenet +from paddlespeech.t2s.modules.transformer.attention import MultiHeadedAttention +from paddlespeech.t2s.modules.transformer.decoder import Decoder +from paddlespeech.t2s.modules.transformer.embedding import PositionalEncoding +from paddlespeech.t2s.modules.transformer.embedding import ScaledPositionalEncoding +from paddlespeech.t2s.modules.transformer.encoder import TransformerEncoder +from paddlespeech.t2s.modules.transformer.mask import subsequent_mask + + +class TransformerTTS(nn.Layer): + """TTS-Transformer module. + + This is a module of text-to-speech Transformer described in `Neural Speech Synthesis + with Transformer Network`_, which convert the sequence of tokens into the sequence + of Mel-filterbanks. + + .. _`Neural Speech Synthesis with Transformer Network`: + https://arxiv.org/pdf/1809.08895.pdf + + Args: + idim (int): Dimension of the inputs. + odim (int): Dimension of the outputs. + embed_dim (int, optional): Dimension of character embedding. + eprenet_conv_layers (int, optional): Number of encoder prenet convolution layers. + eprenet_conv_chans (int, optional): Number of encoder prenet convolution channels. + eprenet_conv_filts (int, optional): Filter size of encoder prenet convolution. 
+ dprenet_layers (int, optional): Number of decoder prenet layers. + dprenet_units (int, optional): Number of decoder prenet hidden units. + elayers (int, optional): Number of encoder layers. + eunits (int, optional): Number of encoder hidden units. + adim (int, optional): Number of attention transformation dimensions. + aheads (int, optional): Number of heads for multi head attention. + dlayers (int, optional): Number of decoder layers. + dunits (int, optional): Number of decoder hidden units. + postnet_layers (int, optional): Number of postnet layers. + postnet_chans (int, optional): Number of postnet channels. + postnet_filts (int, optional): Filter size of postnet. + use_scaled_pos_enc (pool, optional): Whether to use trainable scaled positional encoding. + use_batch_norm (bool, optional): Whether to use batch normalization in encoder prenet. + encoder_normalize_before (bool, optional): Whether to perform layer normalization before encoder block. + decoder_normalize_before (bool, optional): Whether to perform layer normalization before decoder block. + encoder_concat_after (bool, optional): Whether to concatenate attention layer's input and output in encoder. + decoder_concat_after (bool, optional): Whether to concatenate attention layer's input and output in decoder. + positionwise_layer_type (str, optional): Position-wise operation type. + positionwise_conv_kernel_size (int, optional): Kernel size in position wise conv 1d. + reduction_factor (int, optional): Reduction factor. + spk_embed_dim (int, optional): Number of speaker embedding dimenstions. + spk_embed_integration_type (str, optional): How to integrate speaker embedding. + use_gst (str, optional): Whether to use global style token. + gst_tokens (int, optional): The number of GST embeddings. + gst_heads (int, optional): The number of heads in GST multihead attention. + gst_conv_layers (int, optional): The number of conv layers in GST. + gst_conv_chans_list (Sequence[int], optional): List of the number of channels of conv layers in GST. + gst_conv_kernel_size (int, optional): Kernal size of conv layers in GST. + gst_conv_stride (int, optional): Stride size of conv layers in GST. + gst_gru_layers (int, optional): The number of GRU layers in GST. + gst_gru_units (int, optional): The number of GRU units in GST. + transformer_lr (float, optional): Initial value of learning rate. + transformer_warmup_steps (int, optional): Optimizer warmup steps. + transformer_enc_dropout_rate (float, optional): Dropout rate in encoder except attention and positional encoding. + transformer_enc_positional_dropout_rate (float, optional): Dropout rate after encoder positional encoding. + transformer_enc_attn_dropout_rate (float, optional): Dropout rate in encoder self-attention module. + transformer_dec_dropout_rate (float, optional): Dropout rate in decoder except attention & positional encoding. + transformer_dec_positional_dropout_rate (float, optional): Dropout rate after decoder positional encoding. + transformer_dec_attn_dropout_rate (float, optional): Dropout rate in deocoder self-attention module. + transformer_enc_dec_attn_dropout_rate (float, optional): Dropout rate in encoder-deocoder attention module. + init_type (str, optional): How to initialize transformer parameters. + init_enc_alpha (float, optional): Initial value of alpha in scaled pos encoding of the encoder. + init_dec_alpha (float, optional): Initial value of alpha in scaled pos encoding of the decoder. + eprenet_dropout_rate (float, optional): Dropout rate in encoder prenet. 
+ dprenet_dropout_rate (float, optional): Dropout rate in decoder prenet. + postnet_dropout_rate (float, optional): Dropout rate in postnet. + use_masking (bool, optional): Whether to apply masking for padded part in loss calculation. + use_weighted_masking (bool, optional): Whether to apply weighted masking in loss calculation. + bce_pos_weight (float, optional): Positive sample weight in bce calculation (only for use_masking=true). + loss_type (str, optional): How to calculate loss. + use_guided_attn_loss (bool, optional): Whether to use guided attention loss. + num_heads_applied_guided_attn (int, optional): Number of heads in each layer to apply guided attention loss. + num_layers_applied_guided_attn (int, optional): Number of layers to apply guided attention loss. + List of module names to apply guided attention loss. + """ + + def __init__( + self, + # network structure related + idim: int, + odim: int, + embed_dim: int=512, + eprenet_conv_layers: int=3, + eprenet_conv_chans: int=256, + eprenet_conv_filts: int=5, + dprenet_layers: int=2, + dprenet_units: int=256, + elayers: int=6, + eunits: int=1024, + adim: int=512, + aheads: int=4, + dlayers: int=6, + dunits: int=1024, + postnet_layers: int=5, + postnet_chans: int=256, + postnet_filts: int=5, + positionwise_layer_type: str="conv1d", + positionwise_conv_kernel_size: int=1, + use_scaled_pos_enc: bool=True, + use_batch_norm: bool=True, + encoder_normalize_before: bool=True, + decoder_normalize_before: bool=True, + encoder_concat_after: bool=False, + decoder_concat_after: bool=False, + reduction_factor: int=1, + spk_embed_dim: int=None, + spk_embed_integration_type: str="add", + use_gst: bool=False, + gst_tokens: int=10, + gst_heads: int=4, + gst_conv_layers: int=6, + gst_conv_chans_list: Sequence[int]=(32, 32, 64, 64, 128, 128), + gst_conv_kernel_size: int=3, + gst_conv_stride: int=2, + gst_gru_layers: int=1, + gst_gru_units: int=128, + # training related + transformer_enc_dropout_rate: float=0.1, + transformer_enc_positional_dropout_rate: float=0.1, + transformer_enc_attn_dropout_rate: float=0.1, + transformer_dec_dropout_rate: float=0.1, + transformer_dec_positional_dropout_rate: float=0.1, + transformer_dec_attn_dropout_rate: float=0.1, + transformer_enc_dec_attn_dropout_rate: float=0.1, + eprenet_dropout_rate: float=0.5, + dprenet_dropout_rate: float=0.5, + postnet_dropout_rate: float=0.5, + init_type: str="xavier_uniform", + init_enc_alpha: float=1.0, + init_dec_alpha: float=1.0, + use_guided_attn_loss: bool=True, + num_heads_applied_guided_attn: int=2, + num_layers_applied_guided_attn: int=2, ): + """Initialize Transformer module.""" + assert check_argument_types() + super().__init__() + + # store hyperparameters + self.idim = idim + self.odim = odim + self.eos = idim - 1 + self.spk_embed_dim = spk_embed_dim + self.reduction_factor = reduction_factor + self.use_gst = use_gst + self.use_scaled_pos_enc = use_scaled_pos_enc + self.use_guided_attn_loss = use_guided_attn_loss + if self.use_guided_attn_loss: + if num_layers_applied_guided_attn == -1: + self.num_layers_applied_guided_attn = elayers + else: + self.num_layers_applied_guided_attn = num_layers_applied_guided_attn + if num_heads_applied_guided_attn == -1: + self.num_heads_applied_guided_attn = aheads + else: + self.num_heads_applied_guided_attn = num_heads_applied_guided_attn + if self.spk_embed_dim is not None: + self.spk_embed_integration_type = spk_embed_integration_type + + # use idx 0 as padding idx + self.padding_idx = 0 + # set_global_initializer 会影响后面的全局,包括 
create_parameter + initialize(self, init_type) + + # get positional encoding layer type + transformer_pos_enc_layer_type = "scaled_abs_pos" if self.use_scaled_pos_enc else "abs_pos" + + # define transformer encoder + if eprenet_conv_layers != 0: + # encoder prenet + encoder_input_layer = nn.Sequential( + EncoderPrenet( + idim=idim, + embed_dim=embed_dim, + elayers=0, + econv_layers=eprenet_conv_layers, + econv_chans=eprenet_conv_chans, + econv_filts=eprenet_conv_filts, + use_batch_norm=use_batch_norm, + dropout_rate=eprenet_dropout_rate, + padding_idx=self.padding_idx, ), + nn.Linear(eprenet_conv_chans, adim), ) + else: + encoder_input_layer = nn.Embedding( + num_embeddings=idim, + embedding_dim=adim, + padding_idx=self.padding_idx) + self.encoder = TransformerEncoder( + idim=idim, + attention_dim=adim, + attention_heads=aheads, + linear_units=eunits, + num_blocks=elayers, + input_layer=encoder_input_layer, + dropout_rate=transformer_enc_dropout_rate, + positional_dropout_rate=transformer_enc_positional_dropout_rate, + attention_dropout_rate=transformer_enc_attn_dropout_rate, + pos_enc_layer_type=transformer_pos_enc_layer_type, + normalize_before=encoder_normalize_before, + concat_after=encoder_concat_after, + positionwise_layer_type=positionwise_layer_type, + positionwise_conv_kernel_size=positionwise_conv_kernel_size, ) + + # define GST + if self.use_gst: + self.gst = StyleEncoder( + idim=odim, # the input is mel-spectrogram + gst_tokens=gst_tokens, + gst_token_dim=adim, + gst_heads=gst_heads, + conv_layers=gst_conv_layers, + conv_chans_list=gst_conv_chans_list, + conv_kernel_size=gst_conv_kernel_size, + conv_stride=gst_conv_stride, + gru_layers=gst_gru_layers, + gru_units=gst_gru_units, ) + + # define projection layer + if self.spk_embed_dim is not None: + if self.spk_embed_integration_type == "add": + self.projection = nn.Linear(self.spk_embed_dim, adim) + else: + self.projection = nn.Linear(adim + self.spk_embed_dim, adim) + + # define transformer decoder + if dprenet_layers != 0: + # decoder prenet + decoder_input_layer = nn.Sequential( + DecoderPrenet( + idim=odim, + n_layers=dprenet_layers, + n_units=dprenet_units, + dropout_rate=dprenet_dropout_rate, ), + nn.Linear(dprenet_units, adim), ) + else: + decoder_input_layer = "linear" + # get positional encoding class + pos_enc_class = (ScaledPositionalEncoding + if self.use_scaled_pos_enc else PositionalEncoding) + self.decoder = Decoder( + odim=odim, # odim is needed when no prenet is used + attention_dim=adim, + attention_heads=aheads, + linear_units=dunits, + num_blocks=dlayers, + dropout_rate=transformer_dec_dropout_rate, + positional_dropout_rate=transformer_dec_positional_dropout_rate, + self_attention_dropout_rate=transformer_dec_attn_dropout_rate, + src_attention_dropout_rate=transformer_enc_dec_attn_dropout_rate, + input_layer=decoder_input_layer, + use_output_layer=False, + pos_enc_class=pos_enc_class, + normalize_before=decoder_normalize_before, + concat_after=decoder_concat_after, ) + + # define final projection + self.feat_out = nn.Linear(adim, odim * reduction_factor) + self.prob_out = nn.Linear(adim, reduction_factor) + + # define postnet + self.postnet = (None if postnet_layers == 0 else Postnet( + idim=idim, + odim=odim, + n_layers=postnet_layers, + n_chans=postnet_chans, + n_filts=postnet_filts, + use_batch_norm=use_batch_norm, + dropout_rate=postnet_dropout_rate, )) + + # 闭合的 initialize() 中的 set_global_initializer 的作用域,防止其影响到 self._reset_parameters() + nn.initializer.set_global_initializer(None) + + 
self._reset_parameters( + init_enc_alpha=init_enc_alpha, + init_dec_alpha=init_dec_alpha, ) + + def _reset_parameters(self, init_enc_alpha: float, init_dec_alpha: float): + + # initialize alpha in scaled positional encoding + if self.use_scaled_pos_enc: + init_enc_alpha = paddle.to_tensor(init_enc_alpha) + self.encoder.embed[-1].alpha = paddle.create_parameter( + shape=init_enc_alpha.shape, + dtype=str(init_enc_alpha.numpy().dtype), + default_initializer=paddle.nn.initializer.Assign( + init_enc_alpha)) + + init_dec_alpha = paddle.to_tensor(init_dec_alpha) + self.decoder.embed[-1].alpha = paddle.create_parameter( + shape=init_dec_alpha.shape, + dtype=str(init_dec_alpha.numpy().dtype), + default_initializer=paddle.nn.initializer.Assign( + init_dec_alpha)) + + def forward( + self, + text: paddle.Tensor, + text_lengths: paddle.Tensor, + speech: paddle.Tensor, + speech_lengths: paddle.Tensor, + spk_emb: paddle.Tensor=None, + ) -> Tuple[paddle.Tensor, Dict[str, paddle.Tensor], paddle.Tensor]: + """Calculate forward propagation. + + Args: + text(Tensor(int64)): Batch of padded character ids (B, Tmax). + text_lengths(Tensor(int64)): Batch of lengths of each input batch (B,). + speech(Tensor): Batch of padded target features (B, Lmax, odim). + speech_lengths(Tensor(int64)): Batch of the lengths of each target (B,). + spk_emb(Tensor, optional): Batch of speaker embeddings (B, spk_embed_dim). + + Returns: + Tensor: Loss scalar value. + Dict: Statistics to be monitored. + + """ + # input of embedding must be int64 + text_lengths = paddle.cast(text_lengths, 'int64') + + # Add eos at the last of sequence + text = numpy.pad(text.numpy(), ((0, 0), (0, 1)), 'constant') + xs = paddle.to_tensor(text, dtype='int64') + for i, l in enumerate(text_lengths): + xs[i, l] = self.eos + ilens = text_lengths + 1 + + ys = speech + olens = paddle.cast(speech_lengths, 'int64') + + # make labels for stop prediction + stop_labels = make_pad_mask(olens - 1) + # bool 类型无法切片 + stop_labels = paddle.cast(stop_labels, dtype='float32') + stop_labels = F.pad(stop_labels, [0, 0, 0, 1], "constant", 1.0) + + # calculate transformer outputs + after_outs, before_outs, logits = self._forward(xs, ilens, ys, olens, + spk_emb) + + # modifiy mod part of groundtruth + + if self.reduction_factor > 1: + olens = olens - olens % self.reduction_factor + max_olen = max(olens) + ys = ys[:, :max_olen] + stop_labels = stop_labels[:, :max_olen] + stop_labels[:, -1] = 1.0 # make sure at least one frame has 1 + olens_in = olens // self.reduction_factor + else: + olens_in = olens + + need_dict = {} + need_dict['encoder'] = self.encoder + need_dict['decoder'] = self.decoder + need_dict[ + 'num_heads_applied_guided_attn'] = self.num_heads_applied_guided_attn + need_dict[ + 'num_layers_applied_guided_attn'] = self.num_layers_applied_guided_attn + need_dict['use_scaled_pos_enc'] = self.use_scaled_pos_enc + + return after_outs, before_outs, logits, ys, stop_labels, olens, olens_in, need_dict + + def _forward( + self, + xs: paddle.Tensor, + ilens: paddle.Tensor, + ys: paddle.Tensor, + olens: paddle.Tensor, + spk_emb: paddle.Tensor, + ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]: + # forward encoder + x_masks = self._source_mask(ilens) + hs, h_masks = self.encoder(xs, x_masks) + + # integrate with GST + if self.use_gst: + style_embs = self.gst(ys) + hs = hs + style_embs.unsqueeze(1) + + # integrate speaker embedding + if self.spk_embed_dim is not None: + hs = self._integrate_with_spk_embed(hs, spk_emb) + + # thin out frames for reduction factor (B, 
Lmax, odim) -> (B, Lmax//r, odim) + if self.reduction_factor > 1: + ys_in = ys[:, self.reduction_factor - 1::self.reduction_factor] + olens_in = olens // self.reduction_factor + else: + ys_in, olens_in = ys, olens + + # add first zero frame and remove last frame for auto-regressive + ys_in = self._add_first_frame_and_remove_last_frame(ys_in) + + # forward decoder + y_masks = self._target_mask(olens_in) + zs, _ = self.decoder(ys_in, y_masks, hs, h_masks) + # (B, Lmax//r, odim * r) -> (B, Lmax//r * r, odim) + before_outs = self.feat_out(zs).reshape([zs.shape[0], -1, self.odim]) + # (B, Lmax//r, r) -> (B, Lmax//r * r) + logits = self.prob_out(zs).reshape([zs.shape[0], -1]) + + # postnet -> (B, Lmax//r * r, odim) + if self.postnet is None: + after_outs = before_outs + else: + after_outs = before_outs + self.postnet( + before_outs.transpose([0, 2, 1])).transpose([0, 2, 1]) + + return after_outs, before_outs, logits + + def inference( + self, + text: paddle.Tensor, + speech: paddle.Tensor=None, + spk_emb: paddle.Tensor=None, + threshold: float=0.5, + minlenratio: float=0.0, + maxlenratio: float=10.0, + use_teacher_forcing: bool=False, + ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]: + """Generate the sequence of features given the sequences of characters. + + Args: + text(Tensor(int64)): Input sequence of characters (T,). + speech(Tensor, optional): Feature sequence to extract style (N, idim). + spk_emb(Tensor, optional): Speaker embedding vector (spk_embed_dim,). + threshold(float, optional): Threshold in inference. + minlenratio(float, optional): Minimum length ratio in inference. + maxlenratio(float, optional): Maximum length ratio in inference. + use_teacher_forcing(bool, optional): Whether to use teacher forcing. + + Returns: + Tensor: Output sequence of features (L, odim). + Tensor: Output sequence of stop probabilities (L,). + Tensor: Encoder-decoder (source) attention weights (#layers, #heads, L, T). + + """ + # input of embedding must be int64 + y = speech + + # add eos at the last of sequence + text = numpy.pad( + text.numpy(), (0, 1), 'constant', constant_values=self.eos) + x = paddle.to_tensor(text, dtype='int64') + + # inference with teacher forcing + if use_teacher_forcing: + assert speech is not None, "speech must be provided with teacher forcing." 
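+            # With teacher forcing, the ground-truth mels in `speech` are run
+            # through the decoder in a single pass via `_forward`, and the
+            # encoder-decoder attention weights are collected from every decoder
+            # layer; otherwise frames are generated autoregressively in the
+            # step-by-step loop further below.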
+ + # get teacher forcing outputs + xs, ys = x.unsqueeze(0), y.unsqueeze(0) + spk_emb = None if spk_emb is None else spk_emb.unsqueeze(0) + ilens = paddle.to_tensor( + [xs.shape[1]], dtype=paddle.int64, place=xs.place) + olens = paddle.to_tensor( + [ys.shape[1]], dtype=paddle.int64, place=ys.place) + outs, *_ = self._forward(xs, ilens, ys, olens, spk_emb) + + # get attention weights + att_ws = [] + for i in range(len(self.decoder.decoders)): + att_ws += [self.decoder.decoders[i].src_attn.attn] + # (B, L, H, T_out, T_in) + att_ws = paddle.stack(att_ws, axis=1) + + return outs[0], None, att_ws[0] + + # forward encoder + xs = x.unsqueeze(0) + hs, _ = self.encoder(xs, None) + + # integrate GST + if self.use_gst: + style_embs = self.gst(y.unsqueeze(0)) + hs = hs + style_embs.unsqueeze(1) + + # integrate speaker embedding + if spk_emb is not None: + spk_emb = spk_emb.unsqueeze(0) + hs = self._integrate_with_spk_embed(hs, spk_emb) + + # set limits of length + maxlen = int(hs.shape[1] * maxlenratio / self.reduction_factor) + minlen = int(hs.shape[1] * minlenratio / self.reduction_factor) + + # initialize + idx = 0 + ys = paddle.zeros([1, 1, self.odim]) + outs, probs = [], [] + + # forward decoder step-by-step + z_cache = None + while True: + # update index + idx += 1 + + # calculate output and stop prob at idx-th step + y_masks = subsequent_mask(idx).unsqueeze(0) + z, z_cache = self.decoder.forward_one_step( + ys, y_masks, hs, cache=z_cache) # (B, adim) + outs += [ + self.feat_out(z).reshape([self.reduction_factor, self.odim]) + ] # [(r, odim), ...] + probs += [F.sigmoid(self.prob_out(z))[0]] # [(r), ...] + + # update next inputs + ys = paddle.concat( + (ys, outs[-1][-1].reshape([1, 1, self.odim])), + axis=1) # (1, idx + 1, odim) + + # get attention weights + att_ws_ = [] + for name, m in self.named_sublayers(): + if isinstance(m, MultiHeadedAttention) and "src" in name: + # [(#heads, 1, T),...] + att_ws_ += [m.attn[0, :, -1].unsqueeze(1)] + if idx == 1: + att_ws = att_ws_ + else: + # [(#heads, l, T), ...] + att_ws = [ + paddle.concat([att_w, att_w_], axis=1) + for att_w, att_w_ in zip(att_ws, att_ws_) + ] + + # check whether to finish generation + if sum(paddle.cast(probs[-1] >= threshold, + 'int64')) > 0 or idx >= maxlen: + # check mininum length + if idx < minlen: + continue + # (L, odim) -> (1, L, odim) -> (1, odim, L) + outs = (paddle.concat(outs, axis=0).unsqueeze(0).transpose( + [0, 2, 1])) + if self.postnet is not None: + # (1, odim, L) + outs = outs + self.postnet(outs) + # (L, odim) + outs = outs.transpose([0, 2, 1]).squeeze(0) + probs = paddle.concat(probs, axis=0) + break + + # concatenate attention weights -> (#layers, #heads, L, T) + att_ws = paddle.stack(att_ws, axis=0) + + return outs, probs, att_ws + + def _add_first_frame_and_remove_last_frame( + self, ys: paddle.Tensor) -> paddle.Tensor: + ys_in = paddle.concat( + [paddle.zeros((ys.shape[0], 1, ys.shape[2])), ys[:, :-1]], axis=1) + return ys_in + + def _source_mask(self, ilens: paddle.Tensor) -> paddle.Tensor: + """Make masks for self-attention. + + Args: + ilens(Tensor): Batch of lengths (B,). + + Returns: + Tensor: Mask tensor for self-attention. dtype=paddle.bool + + Examples: + >>> ilens = [5, 3] + >>> self._source_mask(ilens) + tensor([[[1, 1, 1, 1, 1], + [1, 1, 1, 0, 0]]]) bool + + """ + x_masks = make_non_pad_mask(ilens) + return x_masks.unsqueeze(-2) + + def _target_mask(self, olens: paddle.Tensor) -> paddle.Tensor: + """Make masks for masked self-attention. + + Args: + olens (Tensor(int64)): Batch of lengths (B,). 
+ + Returns: + Tensor: Mask tensor for masked self-attention. + + Examples: + >>> olens = [5, 3] + >>> self._target_mask(olens) + tensor([[[1, 0, 0, 0, 0], + [1, 1, 0, 0, 0], + [1, 1, 1, 0, 0], + [1, 1, 1, 1, 0], + [1, 1, 1, 1, 1]], + [[1, 0, 0, 0, 0], + [1, 1, 0, 0, 0], + [1, 1, 1, 0, 0], + [1, 1, 1, 0, 0], + [1, 1, 1, 0, 0]]], dtype=paddle.uint8) + + """ + y_masks = make_non_pad_mask(olens) + s_masks = subsequent_mask(y_masks.shape[-1]).unsqueeze(0) + return paddle.logical_and(y_masks.unsqueeze(-2), s_masks) + + def _integrate_with_spk_embed(self, + hs: paddle.Tensor, + spk_emb: paddle.Tensor) -> paddle.Tensor: + """Integrate speaker embedding with hidden states. + + Args: + hs(Tensor): Batch of hidden state sequences (B, Tmax, adim). + spk_emb(Tensor): Batch of speaker embeddings (B, spk_embed_dim). + + Returns: + Tensor: Batch of integrated hidden state sequences (B, Tmax, adim). + + """ + if self.spk_embed_integration_type == "add": + # apply projection and then add to hidden states + spk_emb = self.projection(F.normalize(spk_emb)) + hs = hs + spk_emb.unsqueeze(1) + elif self.spk_embed_integration_type == "concat": + # concat hidden states with spk embeds and then apply projection + spk_emb = F.normalize(spk_emb).unsqueeze(1).expand(-1, hs.shape[1], + -1) + hs = self.projection(paddle.concat([hs, spk_emb], axis=-1)) + else: + raise NotImplementedError("support only add or concat.") + + return hs + + +class TransformerTTSInference(nn.Layer): + def __init__(self, normalizer, model): + super().__init__() + self.normalizer = normalizer + self.acoustic_model = model + + def forward(self, text, spk_id=None): + normalized_mel = self.acoustic_model.inference(text)[0] + logmel = self.normalizer.inverse(normalized_mel) + return logmel diff --git a/ernie-sat/paddlespeech/t2s/models/transformer_tts/transformer_tts_updater.py b/ernie-sat/paddlespeech/t2s/models/transformer_tts/transformer_tts_updater.py new file mode 100644 index 0000000000000000000000000000000000000000..dff908e05bf01d181352fc6ebd28113f0a106923 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/models/transformer_tts/transformer_tts_updater.py @@ -0,0 +1,333 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
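+# The updater and evaluator in this file follow a Tacotron2-style objective: an
+# L1/L2 loss on the mel outputs (before and after the postnet) plus a BCE
+# stop-token loss, optionally augmented with guided attention losses that pull
+# the attention maps towards a (near-)diagonal alignment.
+#
+# A minimal sketch of the per-element penalty, assuming the ESPnet-style
+# formulation behind GuidedMultiHeadAttentionLoss (`sigma` corresponds to
+# `guided_attn_loss_sigma` below):
+#
+#     import numpy as np
+#
+#     def guided_attention_weight(ilen: int, olen: int, sigma: float = 0.4) -> np.ndarray:
+#         # weight[t, n] grows as attention drifts away from the diagonal n/ilen ~ t/olen
+#         grid_t, grid_n = np.meshgrid(np.arange(olen), np.arange(ilen), indexing="ij")
+#         return 1.0 - np.exp(-((grid_n / ilen - grid_t / olen) ** 2) / (2 * sigma ** 2))
+#
+# The guided attention loss is the mean of `attention * weight` over the
+# non-padded region, so a well-aligned attention map incurs almost no penalty.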
+import logging +from pathlib import Path +from typing import Sequence + +import paddle +from paddle import distributed as dist +from paddle.io import DataLoader +from paddle.nn import Layer +from paddle.optimizer import Optimizer + +from paddlespeech.t2s.modules.losses import GuidedMultiHeadAttentionLoss +from paddlespeech.t2s.modules.losses import Tacotron2Loss as TransformerTTSLoss +from paddlespeech.t2s.training.extensions.evaluator import StandardEvaluator +from paddlespeech.t2s.training.reporter import report +from paddlespeech.t2s.training.updaters.standard_updater import StandardUpdater +logging.basicConfig( + format='%(asctime)s [%(levelname)s] [%(filename)s:%(lineno)d] %(message)s', + datefmt='[%Y-%m-%d %H:%M:%S]') +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + + +class TransformerTTSUpdater(StandardUpdater): + def __init__( + self, + model: Layer, + optimizer: Optimizer, + dataloader: DataLoader, + init_state=None, + use_masking: bool=False, + use_weighted_masking: bool=False, + output_dir: Path=None, + bce_pos_weight: float=5.0, + loss_type: str="L1", + use_guided_attn_loss: bool=True, + modules_applied_guided_attn: Sequence[str]=("encoder-decoder"), + guided_attn_loss_sigma: float=0.4, + guided_attn_loss_lambda: float=1.0, ): + super().__init__(model, optimizer, dataloader, init_state=None) + + self.loss_type = loss_type + self.use_guided_attn_loss = use_guided_attn_loss + self.modules_applied_guided_attn = modules_applied_guided_attn + + self.criterion = TransformerTTSLoss( + use_masking=use_masking, + use_weighted_masking=use_weighted_masking, + bce_pos_weight=bce_pos_weight) + + if self.use_guided_attn_loss: + self.attn_criterion = GuidedMultiHeadAttentionLoss( + sigma=guided_attn_loss_sigma, + alpha=guided_attn_loss_lambda, ) + + log_file = output_dir / 'worker_{}.log'.format(dist.get_rank()) + self.filehandler = logging.FileHandler(str(log_file)) + logger.addHandler(self.filehandler) + self.logger = logger + self.msg = "" + + def update_core(self, batch): + self.msg = "Rank: {}, ".format(dist.get_rank()) + losses_dict = {} + + after_outs, before_outs, logits, ys, stop_labels, olens, olens_in, need_dict = self.model( + text=batch["text"], + text_lengths=batch["text_lengths"], + speech=batch["speech"], + speech_lengths=batch["speech_lengths"], ) + + l1_loss, l2_loss, bce_loss = self.criterion( + after_outs=after_outs, + before_outs=before_outs, + logits=logits, + ys=ys, + stop_labels=stop_labels, + olens=olens) + + report("train/bce_loss", float(bce_loss)) + report("train/l1_loss", float(l1_loss)) + report("train/l2_loss", float(l2_loss)) + losses_dict["bce_loss"] = float(bce_loss) + losses_dict["l1_loss"] = float(l1_loss) + losses_dict["l2_loss"] = float(l2_loss) + # caluculate loss values + if self.loss_type == "L1": + loss = l1_loss + bce_loss + elif self.loss_type == "L2": + loss = l2_loss + bce_loss + elif self.loss_type == "L1+L2": + loss = l1_loss + l2_loss + bce_loss + else: + raise ValueError("unknown --loss-type " + self.loss_type) + + # calculate guided attention loss + if self.use_guided_attn_loss: + # calculate for encoder + if "encoder" in self.modules_applied_guided_attn: + att_ws = [] + for idx, layer_idx in enumerate( + reversed(range(len(need_dict['encoder'].encoders)))): + att_ws += [ + need_dict['encoder'].encoders[layer_idx].self_attn. 
+ attn[:, :need_dict['num_heads_applied_guided_attn']] + ] + if idx + 1 == need_dict['num_layers_applied_guided_attn']: + break + # (B, H*L, T_in, T_in) + att_ws = paddle.concat(att_ws, axis=1) + enc_attn_loss = self.attn_criterion( + att_ws=att_ws, + ilens=batch["text_lengths"] + 1, + olens=batch["text_lengths"] + 1) + loss = loss + enc_attn_loss + report("train/enc_attn_loss", float(enc_attn_loss)) + losses_dict["enc_attn_loss"] = float(enc_attn_loss) + # calculate for decoder + if "decoder" in self.modules_applied_guided_attn: + att_ws = [] + for idx, layer_idx in enumerate( + reversed(range(len(need_dict['decoder'].decoders)))): + att_ws += [ + need_dict['decoder'].decoders[layer_idx].self_attn. + attn[:, :need_dict['num_heads_applied_guided_attn']] + ] + if idx + 1 == need_dict['num_layers_applied_guided_attn']: + break + # (B, H*L, T_out, T_out) + att_ws = paddle.concat(att_ws, axis=1) + dec_attn_loss = self.attn_criterion( + att_ws=att_ws, ilens=olens_in, olens=olens_in) + report("train/dec_attn_loss", float(dec_attn_loss)) + losses_dict["dec_attn_loss"] = float(dec_attn_loss) + loss = loss + dec_attn_loss + # calculate for encoder-decoder + if "encoder-decoder" in self.modules_applied_guided_attn: + att_ws = [] + for idx, layer_idx in enumerate( + reversed(range(len(need_dict['decoder'].decoders)))): + att_ws += [ + need_dict['decoder'].decoders[layer_idx].src_attn. + attn[:, :need_dict['num_heads_applied_guided_attn']] + ] + if idx + 1 == need_dict['num_layers_applied_guided_attn']: + break + # (B, H*L, T_out, T_in) + att_ws = paddle.concat(att_ws, axis=1) + enc_dec_attn_loss = self.attn_criterion( + att_ws=att_ws, + ilens=batch["text_lengths"] + 1, + olens=olens_in) + report("train/enc_dec_attn_loss", float(enc_dec_attn_loss)) + losses_dict["enc_dec_attn_loss"] = float(enc_dec_attn_loss) + loss = loss + enc_dec_attn_loss + if need_dict['use_scaled_pos_enc']: + report("train/encoder_alpha", + float(need_dict['encoder'].embed[-1].alpha)) + report("train/decoder_alpha", + float(need_dict['decoder'].embed[-1].alpha)) + losses_dict["encoder_alpha"] = float( + need_dict['encoder'].embed[-1].alpha) + losses_dict["decoder_alpha"] = float( + need_dict['decoder'].embed[-1].alpha) + + optimizer = self.optimizer + optimizer.clear_grad() + loss.backward() + optimizer.step() + + report("train/loss", float(loss)) + losses_dict["loss"] = float(loss) + self.msg += ', '.join('{}: {:>.6f}'.format(k, v) + for k, v in losses_dict.items()) + + +class TransformerTTSEvaluator(StandardEvaluator): + def __init__( + self, + model: Layer, + dataloader: DataLoader, + init_state=None, + use_masking: bool=False, + use_weighted_masking: bool=False, + output_dir: Path=None, + bce_pos_weight: float=5.0, + loss_type: str="L1", + use_guided_attn_loss: bool=True, + modules_applied_guided_attn: Sequence[str]=("encoder-decoder"), + guided_attn_loss_sigma: float=0.4, + guided_attn_loss_lambda: float=1.0, ): + super().__init__(model, dataloader) + + self.loss_type = loss_type + self.use_guided_attn_loss = use_guided_attn_loss + self.modules_applied_guided_attn = modules_applied_guided_attn + + self.criterion = TransformerTTSLoss( + use_masking=use_masking, + use_weighted_masking=use_weighted_masking, + bce_pos_weight=bce_pos_weight) + + if self.use_guided_attn_loss: + self.attn_criterion = GuidedMultiHeadAttentionLoss( + sigma=guided_attn_loss_sigma, + alpha=guided_attn_loss_lambda, ) + + log_file = output_dir / 'worker_{}.log'.format(dist.get_rank()) + self.filehandler = logging.FileHandler(str(log_file)) + 
logger.addHandler(self.filehandler) + self.logger = logger + self.msg = "" + + def evaluate_core(self, batch): + self.msg = "Evaluate: " + losses_dict = {} + after_outs, before_outs, logits, ys, stop_labels, olens, olens_in, need_dict = self.model( + text=batch["text"], + text_lengths=batch["text_lengths"], + speech=batch["speech"], + speech_lengths=batch["speech_lengths"]) + + l1_loss, l2_loss, bce_loss = self.criterion( + after_outs=after_outs, + before_outs=before_outs, + logits=logits, + ys=ys, + stop_labels=stop_labels, + olens=olens) + + report("eval/bce_loss", float(bce_loss)) + report("eval/l1_loss", float(l1_loss)) + report("eval/l2_loss", float(l2_loss)) + losses_dict["bce_loss"] = float(bce_loss) + losses_dict["l1_loss"] = float(l1_loss) + losses_dict["l2_loss"] = float(l2_loss) + # caluculate loss values + if self.loss_type == "L1": + loss = l1_loss + bce_loss + elif self.loss_type == "L2": + loss = l2_loss + bce_loss + elif self.loss_type == "L1+L2": + loss = l1_loss + l2_loss + bce_loss + else: + raise ValueError("unknown --loss-type " + self.loss_type) + + # calculate guided attention loss + if self.use_guided_attn_loss: + # calculate for encoder + if "encoder" in self.modules_applied_guided_attn: + att_ws = [] + for idx, layer_idx in enumerate( + reversed(range(len(need_dict['encoder'].encoders)))): + att_ws += [ + need_dict['encoder'].encoders[layer_idx].self_attn. + attn[:, :need_dict['num_heads_applied_guided_attn']] + ] + if idx + 1 == need_dict['num_layers_applied_guided_attn']: + break + # (B, H*L, T_in, T_in) + att_ws = paddle.concat(att_ws, axis=1) + enc_attn_loss = self.attn_criterion( + att_ws=att_ws, + ilens=batch["text_lengths"] + 1, + olens=batch["text_lengths"] + 1) + loss = loss + enc_attn_loss + report("train/enc_attn_loss", float(enc_attn_loss)) + losses_dict["enc_attn_loss"] = float(enc_attn_loss) + # calculate for decoder + if "decoder" in self.modules_applied_guided_attn: + att_ws = [] + for idx, layer_idx in enumerate( + reversed(range(len(need_dict['decoder'].decoders)))): + att_ws += [ + need_dict['decoder'].decoders[layer_idx].self_attn. + attn[:, :need_dict['num_heads_applied_guided_attn']] + ] + if idx + 1 == need_dict['num_layers_applied_guided_attn']: + break + # (B, H*L, T_out, T_out) + att_ws = paddle.concat(att_ws, axis=1) + dec_attn_loss = self.attn_criterion( + att_ws=att_ws, ilens=olens_in, olens=olens_in) + report("eval/dec_attn_loss", float(dec_attn_loss)) + losses_dict["dec_attn_loss"] = float(dec_attn_loss) + loss = loss + dec_attn_loss + # calculate for encoder-decoder + if "encoder-decoder" in self.modules_applied_guided_attn: + + att_ws = [] + for idx, layer_idx in enumerate( + reversed(range(len(need_dict['decoder'].decoders)))): + att_ws += [ + need_dict['decoder'].decoders[layer_idx].src_attn. 
+ attn[:, :need_dict['num_heads_applied_guided_attn']] + ] + if idx + 1 == need_dict['num_layers_applied_guided_attn']: + break + # (B, H*L, T_out, T_in) + att_ws = paddle.concat(att_ws, axis=1) + enc_dec_attn_loss = self.attn_criterion( + att_ws=att_ws, + ilens=batch["text_lengths"] + 1, + olens=olens_in) + report("eval/enc_dec_attn_loss", float(enc_dec_attn_loss)) + losses_dict["enc_dec_attn_loss"] = float(enc_dec_attn_loss) + loss = loss + enc_dec_attn_loss + if need_dict['use_scaled_pos_enc']: + report("eval/encoder_alpha", + float(need_dict['encoder'].embed[-1].alpha)) + report("eval/decoder_alpha", + float(need_dict['decoder'].embed[-1].alpha)) + losses_dict["encoder_alpha"] = float( + need_dict['encoder'].embed[-1].alpha) + losses_dict["decoder_alpha"] = float( + need_dict['decoder'].embed[-1].alpha) + report("eval/loss", float(loss)) + losses_dict["loss"] = float(loss) + self.msg += ', '.join('{}: {:>.6f}'.format(k, v) + for k, v in losses_dict.items()) + self.logger.info(self.msg) diff --git a/ernie-sat/paddlespeech/t2s/models/waveflow.py b/ernie-sat/paddlespeech/t2s/models/waveflow.py new file mode 100644 index 0000000000000000000000000000000000000000..52e6005be3969e1ad89c0d634efbbb62dfc1a68e --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/models/waveflow.py @@ -0,0 +1,736 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import math +import time +from typing import List +from typing import Tuple +from typing import Union + +import numpy as np +import paddle +from paddle import nn +from paddle.nn import functional as F +from paddle.nn import initializer as I + +from paddlespeech.t2s.modules import geometry as geo +from paddlespeech.t2s.utils import checkpoint + +__all__ = ["WaveFlow", "ConditionalWaveFlow", "WaveFlowLoss"] + + +def fold(x, n_group): + """Fold audio or spectrogram's temporal dimension in to groups. + + Args: + x(Tensor): The input tensor. shape=(*, time_steps) + n_group(int): The size of a group. + + Returns: + Tensor: Folded tensor. shape=(*, time_steps // n_group, group) + """ + spatial_shape = list(x.shape[:-1]) + time_steps = paddle.shape(x)[-1] + new_shape = spatial_shape + [time_steps // n_group, n_group] + return paddle.reshape(x, new_shape) + + +class UpsampleNet(nn.LayerList): + """Layer to upsample mel spectrogram to the same temporal resolution with + the corresponding waveform. + + It consists of several conv2dtranspose layers which perform deconvolution + on mel and time dimension. + + Args: + upscale_factors(List[int], optional): Time upsampling factors for each Conv2DTranspose Layer. + The ``UpsampleNet`` contains ``len(upscale_factor)`` Conv2DTranspose + Layers. Each upscale_factor is used as the ``stride`` for the + corresponding Conv2DTranspose. Defaults to [16, 16], this the default + upsampling factor is 256. + + Notes: + ``np.prod(upscale_factors)`` should equals the ``hop_length`` of the stft + transformation used to extract spectrogram features from audio. 
+ + For example, ``16 * 16 = 256``, then the spectrogram extracted with a stft + transformation whose ``hop_length`` equals 256 is suitable. + + See Also + + ``librosa.core.stft`` + """ + + def __init__(self, upsample_factors): + super().__init__() + for factor in upsample_factors: + std = math.sqrt(1 / (3 * 2 * factor)) + init = I.Uniform(-std, std) + self.append( + nn.utils.weight_norm( + nn.Conv2DTranspose( + 1, + 1, (3, 2 * factor), + padding=(1, factor // 2), + stride=(1, factor), + weight_attr=init, + bias_attr=init))) + + # upsample factors + self.upsample_factor = np.prod(upsample_factors) + self.upsample_factors = upsample_factors + + def forward(self, x, trim_conv_artifact=False): + """Forward pass of the ``UpsampleNet`` + + Args: + x(Tensor): The input spectrogram. shape=(batch_size, input_channels, time_steps) + trim_conv_artifact(bool, optional, optional): Trim deconvolution artifact at each layer. Defaults to False. + + Returns: + Tensor: The upsampled spectrogram. shape=(batch_size, input_channels, time_steps * upsample_factor) + + Notes: + If trim_conv_artifact is ``True``, the output time steps is less + than ``time_steps * upsample_factors``. + """ + x = paddle.unsqueeze(x, 1) # (B, C, T) -> (B, 1, C, T) + for layer in self: + x = layer(x) + if trim_conv_artifact: + time_cutoff = layer._kernel_size[1] - layer._stride[1] + x = x[:, :, :, :-time_cutoff] + x = F.leaky_relu(x, 0.4) + x = paddle.squeeze(x, 1) # back to (B, C, T) + return x + + +class ResidualBlock(nn.Layer): + """ResidualBlock, the basic unit of ResidualNet used in WaveFlow. + + It has a conv2d layer, which has causal padding in height dimension and + same paddign in width dimension. It also has projection for the condition + and output. + + Args: + channels (int): Feature size of the input. + cond_channels (int): Featuer size of the condition. + kernel_size (Tuple[int]): Kernel size of the Convolution2d applied to the input. + dilations (int): Dilations of the Convolution2d applied to the input. + """ + + def __init__(self, channels, cond_channels, kernel_size, dilations): + super().__init__() + # input conv + std = math.sqrt(1 / channels * np.prod(kernel_size)) + init = I.Uniform(-std, std) + receptive_field = [ + 1 + (k - 1) * d for (k, d) in zip(kernel_size, dilations) + ] + rh, rw = receptive_field + paddings = [rh - 1, 0, rw // 2, (rw - 1) // 2] # causal & same + conv = nn.Conv2D( + channels, + 2 * channels, + kernel_size, + padding=paddings, + dilation=dilations, + weight_attr=init, + bias_attr=init) + self.conv = nn.utils.weight_norm(conv) + self.rh = rh + self.rw = rw + self.dilations = dilations + + # condition projection + std = math.sqrt(1 / cond_channels) + init = I.Uniform(-std, std) + condition_proj = nn.Conv2D( + cond_channels, + 2 * channels, (1, 1), + weight_attr=init, + bias_attr=init) + self.condition_proj = nn.utils.weight_norm(condition_proj) + + # parametric residual & skip connection + std = math.sqrt(1 / channels) + init = I.Uniform(-std, std) + out_proj = nn.Conv2D( + channels, 2 * channels, (1, 1), weight_attr=init, bias_attr=init) + self.out_proj = nn.utils.weight_norm(out_proj) + + def forward(self, x, condition): + """Compute output for a whole folded sequence. + + Args: + x (Tensor): The input. [shape=(batch_size, channel, height, width)] + condition (Tensor [shape=(batch_size, condition_channel, height, width)]): The local condition. + + Returns: + res (Tensor): The residual output. [shape=(batch_size, channel, height, width)] + skip (Tensor): The skip output. 
[shape=(batch_size, channel, height, width)] + """ + x_in = x + x = self.conv(x) + x += self.condition_proj(condition) + + content, gate = paddle.chunk(x, 2, axis=1) + x = paddle.tanh(content) * F.sigmoid(gate) + + x = self.out_proj(x) + res, skip = paddle.chunk(x, 2, axis=1) + res = x_in + res + return res, skip + + def start_sequence(self): + """Prepare the layer for incremental computation of causal + convolution. Reset the buffer for causal convolution. + + Raises: + ValueError: If not in evaluation mode. + """ + if self.training: + raise ValueError("Only use start sequence at evaluation mode.") + self._conv_buffer = paddle.zeros([1]) + + # NOTE: call self.conv's weight norm hook expliccitly since + # its weight will be visited directly in `add_input` without + # calling its `__call__` method. If we do not trigger the weight + # norm hook, the weight may be outdated. e.g. after loading from + # a saved checkpoint + # see also: https://github.com/pytorch/pytorch/issues/47588 + for hook in self.conv._forward_pre_hooks.values(): + hook(self.conv, None) + + def add_input(self, x_row, condition_row): + """Compute the output for a row and update the buffer. + + Args: + x_row (Tensor): A row of the input. shape=(batch_size, channel, 1, width) + condition_row (Tensor): A row of the condition. shape=(batch_size, condition_channel, 1, width) + + Returns: + res (Tensor): A row of the the residual output. shape=(batch_size, channel, 1, width) + skip (Tensor): A row of the skip output. shape=(batch_size, channel, 1, width) + + """ + x_row_in = x_row + if len(paddle.shape(self._conv_buffer)) == 1: + self._init_buffer(x_row) + self._update_buffer(x_row) + rw = self.rw + x_row = F.conv2d( + self._conv_buffer, + self.conv.weight, + self.conv.bias, + padding=[0, 0, rw // 2, (rw - 1) // 2], + dilation=self.dilations) + x_row += self.condition_proj(condition_row) + content, gate = paddle.chunk(x_row, 2, axis=1) + x_row = paddle.tanh(content) * F.sigmoid(gate) + + x_row = self.out_proj(x_row) + res, skip = paddle.chunk(x_row, 2, axis=1) + res = x_row_in + res + return res, skip + + def _init_buffer(self, input): + batch_size, channels, _, width = input.shape + self._conv_buffer = paddle.zeros( + [batch_size, channels, self.rh, width], dtype=input.dtype) + + def _update_buffer(self, input): + self._conv_buffer = paddle.concat( + [self._conv_buffer[:, :, 1:, :], input], axis=2) + + +class ResidualNet(nn.LayerList): + """A stack of several ResidualBlocks. It merges condition at each layer. + + Args: + n_layer (int): Number of ResidualBlocks in the ResidualNet. + residual_channels (int): Feature size of each ResidualBlocks. + condition_channels (int): Feature size of the condition. + kernel_size (Tuple[int]): Kernel size of each ResidualBlock. + dilations_h (List[int]): Dilation in height dimension of every ResidualBlock. + + Raises: + ValueError: If the length of dilations_h does not equals n_layers. + """ + + def __init__(self, + n_layer: int, + residual_channels: int, + condition_channels: int, + kernel_size: Tuple[int], + dilations_h: List[int]): + if len(dilations_h) != n_layer: + raise ValueError( + "number of dilations_h should equals num of layers") + super().__init__() + for i in range(n_layer): + dilation = (dilations_h[i], 2**i) + layer = ResidualBlock(residual_channels, condition_channels, + kernel_size, dilation) + self.append(layer) + + def forward(self, x, condition): + """Comput the output of given the input and the condition. + + Args: + x (Tensor): The input. 
shape=(batch_size, channel, height, width) + condition (Tensor): The local condition. shape=(batch_size, condition_channel, height, width) + + Returns: + Tensor : The output, which is an aggregation of all the skip outputs. shape=(batch_size, channel, height, width) + + """ + skip_connections = [] + for layer in self: + x, skip = layer(x, condition) + skip_connections.append(skip) + out = paddle.sum(paddle.stack(skip_connections, 0), 0) + return out + + def start_sequence(self): + """Prepare the layer for incremental computation. + """ + for layer in self: + layer.start_sequence() + + def add_input(self, x_row, condition_row): + """Compute the output for a row and update the buffers. + + Args: + x_row (Tensor): A row of the input. shape=(batch_size, channel, 1, width) + condition_row (Tensor): A row of the condition. shape=(batch_size, condition_channel, 1, width) + + Returns: + res (Tensor): A row of the the residual output. shape=(batch_size, channel, 1, width) + skip (Tensor): A row of the skip output. shape=(batch_size, channel, 1, width) + + """ + skip_connections = [] + for layer in self: + x_row, skip = layer.add_input(x_row, condition_row) + skip_connections.append(skip) + out = paddle.sum(paddle.stack(skip_connections, 0), 0) + return out + + +class Flow(nn.Layer): + """A bijection (Reversable layer) that transform a density of latent + variables p(Z) into a complex data distribution p(X). + + It's an auto regressive flow. The ``forward`` method implements the + probability density estimation. The ``inverse`` method implements the + sampling. + + Args: + n_layers (int): Number of ResidualBlocks in the Flow. + channels (int): Feature size of the ResidualBlocks. + mel_bands (int): Feature size of the mel spectrogram (mel bands). + kernel_size (Tuple[int]): Kernel size of each ResisualBlocks in the Flow. + n_group (int): Number of timesteps to the folded into a group. + """ + dilations_dict = { + 8: [1, 1, 1, 1, 1, 1, 1, 1], + 16: [1, 1, 1, 1, 1, 1, 1, 1], + 32: [1, 2, 4, 1, 2, 4, 1, 2], + 64: [1, 2, 4, 8, 16, 1, 2, 4], + 128: [1, 2, 4, 8, 16, 32, 64, 1] + } + + def __init__(self, n_layers, channels, mel_bands, kernel_size, n_group): + super().__init__() + # input projection + self.input_proj = nn.utils.weight_norm( + nn.Conv2D( + 1, + channels, (1, 1), + weight_attr=I.Uniform(-1., 1.), + bias_attr=I.Uniform(-1., 1.))) + + # residual net + self.resnet = ResidualNet(n_layers, channels, mel_bands, kernel_size, + self.dilations_dict[n_group]) + + # output projection + self.output_proj = nn.Conv2D( + channels, + 2, (1, 1), + weight_attr=I.Constant(0.), + bias_attr=I.Constant(0.)) + + # specs + self.n_group = n_group + + def _predict_parameters(self, x, condition): + x = self.input_proj(x) + x = self.resnet(x, condition) + bijection_params = self.output_proj(x) + logs, b = paddle.chunk(bijection_params, 2, axis=1) + return logs, b + + def _transform(self, x, logs, b): + z_0 = x[:, :, :1, :] # the first row, just copy it + z_out = x[:, :, 1:, :] * paddle.exp(logs) + b + z_out = paddle.concat([z_0, z_out], axis=2) + return z_out + + def forward(self, x, condition): + """Probability density estimation. It is done by inversely transform + a sample from p(X) into a sample from p(Z). + + Args: + x (Tensor): A input sample of the distribution p(X). shape=(batch, 1, height, width) + condition (Tensor): The local condition. shape=(batch, condition_channel, height, width) + + Returns: + z (Tensor): shape(batch, 1, height, width), the transformed sample. 
+ Tuple[Tensor, Tensor]: + The parameter of the transformation. + logs (Tensor): shape(batch, 1, height - 1, width), the log scale of the transformation from x to z. + b (Tensor): shape(batch, 1, height - 1, width), the shift of the transformation from x to z. + """ + # (B, C, H-1, W) + logs, b = self._predict_parameters(x[:, :, :-1, :], + condition[:, :, 1:, :]) + z = self._transform(x, logs, b) + return z, (logs, b) + + def _predict_row_parameters(self, x_row, condition_row): + x_row = self.input_proj(x_row) + x_row = self.resnet.add_input(x_row, condition_row) + bijection_params = self.output_proj(x_row) + logs, b = paddle.chunk(bijection_params, 2, axis=1) + return logs, b + + def _inverse_transform_row(self, z_row, logs, b): + x_row = (z_row - b) * paddle.exp(-logs) + return x_row + + def _inverse_row(self, z_row, x_row, condition_row): + logs, b = self._predict_row_parameters(x_row, condition_row) + x_next_row = self._inverse_transform_row(z_row, logs, b) + return x_next_row, (logs, b) + + def _start_sequence(self): + self.resnet.start_sequence() + + def inverse(self, z, condition): + """Sampling from the the distrition p(X). It is done by sample form + p(Z) and transform the sample. It is a auto regressive transformation. + + Args: + z(Tensor): A sample of the distribution p(Z). shape=(batch, 1, time_steps + condition(Tensor): The local condition. shape=(batch, condition_channel, time_steps) + Returns: + Tensor: + The transformed sample. shape=(batch, 1, height, width) + """ + z_0 = z[:, :, :1, :] + x = paddle.zeros_like(z) + x[:, :, :1, :] = z_0 + + self._start_sequence() + + num_step = paddle.ones([1], dtype='int32') * (self.n_group) + for i in range(1, num_step): + x_row = x[:, :, i - 1:i, :] + z_row = z[:, :, i:i + 1, :] + condition_row = condition[:, :, i:i + 1, :] + x_next_row, (logs, b) = self._inverse_row(z_row, x_row, + condition_row) + x[:, :, i:i + 1, :] = x_next_row + + return x + + +class WaveFlow(nn.LayerList): + """An Deep Reversible layer that is composed of severel auto regressive + flows. + + Args: + n_flows (int): Number of flows in the WaveFlow model. + n_layers (int): Number of ResidualBlocks in each Flow. + n_group (int): Number of timesteps to fold as a group. + channels (int): Feature size of each ResidualBlock. + mel_bands (int): Feature size of mel spectrogram (mel bands). + kernel_size (Union[int, List[int]]): Kernel size of the convolution layer in each ResidualBlock. 
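+
+    Notes:
+        ``n_group`` must be one of the keys of ``dilations_dict`` (8, 16, 32,
+        64 or 128), and both ``n_group`` and ``n_flows`` are required to be
+        even: the first ``n_flows // 2`` flows reverse the whole group
+        dimension, while the remaining flows reverse each half of it
+        separately (see ``_create_perm``).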
+ """ + + def __init__(self, n_flows, n_layers, n_group, channels, mel_bands, + kernel_size): + if n_group % 2 or n_flows % 2: + raise ValueError( + "number of flows and number of group must be even " + "since a permutation along group among flows is used.") + super().__init__() + for _ in range(n_flows): + self.append( + Flow(n_layers, channels, mel_bands, kernel_size, n_group)) + + # permutations in h + self.perms = self._create_perm(n_group, n_flows) + + # specs + self.n_group = n_group + self.n_flows = n_flows + + def _create_perm(self, n_group, n_flows): + indices = list(range(n_group)) + half = n_group // 2 + perms = [] + for i in range(n_flows): + if i < n_flows // 2: + perm = indices[::-1] + else: + perm = list(reversed(indices[:half])) + list( + reversed(indices[half:])) + perm = paddle.to_tensor(perm) + self.register_buffer(perm.name, perm) + perms.append(perm) + return perms + + def _trim(self, x, condition): + assert condition.shape[-1] >= x.shape[-1] + pruned_len = int(paddle.shape(x)[-1] // self.n_group * self.n_group) + + if x.shape[-1] > pruned_len: + x = x[:, :pruned_len] + if condition.shape[-1] > pruned_len: + condition = condition[:, :, :pruned_len] + return x, condition + + def forward(self, x, condition): + """Probability density estimation of random variable x given the + condition. + + Args: + x (Tensor): The audio. shape=(batch_size, time_steps) + condition (Tensor): The local condition (mel spectrogram here). shape=(batch_size, condition channel, time_steps) + + Returns: + Tensor: The transformed random variable. shape=(batch_size, time_steps) + Tensor: The log determinant of the jacobian of the transformation from x to z. shape=(1,) + """ + # x: (B, T) + # condition: (B, C, T) upsampled condition + x, condition = self._trim(x, condition) + + # to (B, C, h, T//h) layout + x = paddle.unsqueeze( + paddle.transpose(fold(x, self.n_group), [0, 2, 1]), 1) + condition = paddle.transpose( + fold(condition, self.n_group), [0, 1, 3, 2]) + + # flows + logs_list = [] + for i, layer in enumerate(self): + x, (logs, b) = layer(x, condition) + logs_list.append(logs) + # permute paddle has no shuffle dim + x = geo.shuffle_dim(x, 2, perm=self.perms[i]) + condition = geo.shuffle_dim(condition, 2, perm=self.perms[i]) + + z = paddle.squeeze(x, 1) # (B, H, W) + batch_size = z.shape[0] + z = paddle.reshape(paddle.transpose(z, [0, 2, 1]), [batch_size, -1]) + + log_det_jacobian = paddle.sum(paddle.stack(logs_list)) + return z, log_det_jacobian + + def inverse(self, z, condition): + """Sampling from the the distrition p(X). + + It is done by sample a ``z`` form p(Z) and transform it into ``x``. + Each Flow transform .. math:: `z_{i-1}` to .. math:: `z_{i}` in an + autoregressive manner. + + Args: + z (Tensor): A sample of the distribution p(Z). shape=(batch, 1, time_steps + condition (Tensor): The local condition. shape=(batch, condition_channel, time_steps) + + Returns: + Tensor: The transformed sample (audio here). 
shape=(batch_size, time_steps) + + """ + + z, condition = self._trim(z, condition) + # to (B, C, h, T//h) layout + z = paddle.unsqueeze( + paddle.transpose(fold(z, self.n_group), [0, 2, 1]), 1) + condition = paddle.transpose( + fold(condition, self.n_group), [0, 1, 3, 2]) + + # reverse it flow by flow + for i in reversed(range(self.n_flows)): + z = geo.shuffle_dim(z, 2, perm=self.perms[i]) + condition = geo.shuffle_dim(condition, 2, perm=self.perms[i]) + z = self[i].inverse(z, condition) + + x = paddle.squeeze(z, 1) # (B, H, W) + batch_size = x.shape[0] + x = paddle.reshape(paddle.transpose(x, [0, 2, 1]), [batch_size, -1]) + return x + + +class ConditionalWaveFlow(nn.LayerList): + """ConditionalWaveFlow, a UpsampleNet with a WaveFlow model. + + Args: + upsample_factors (List[int]): Upsample factors for the upsample net. + n_flows (int): Number of flows in the WaveFlow model. + n_layers (int): Number of ResidualBlocks in each Flow. + n_group (int): Number of timesteps to fold as a group. + channels (int): Feature size of each ResidualBlock. + n_mels (int): Feature size of mel spectrogram (mel bands). + kernel_size (Union[int, List[int]]): Kernel size of the convolution layer in each ResidualBlock. + """ + + def __init__(self, + upsample_factors: List[int], + n_flows: int, + n_layers: int, + n_group: int, + channels: int, + n_mels: int, + kernel_size: Union[int, List[int]]): + super().__init__() + self.encoder = UpsampleNet(upsample_factors) + self.decoder = WaveFlow( + n_flows=n_flows, + n_layers=n_layers, + n_group=n_group, + channels=channels, + mel_bands=n_mels, + kernel_size=kernel_size) + + def forward(self, audio, mel): + """Compute the transformed random variable z (x to z) and the log of + the determinant of the jacobian of the transformation from x to z. + + Args: + audio(Tensor): The audio. shape=(B, T) + mel(Tensor): The mel spectrogram. shape=(B, C_mel, T_mel) + + Returns: + Tensor: The inversely transformed random variable z (x to z). shape=(B, T) + Tensor: the log of the determinant of the jacobian of the transformation from x to z. shape=(1,) + """ + condition = self.encoder(mel) + z, log_det_jacobian = self.decoder(audio, condition) + return z, log_det_jacobian + + @paddle.no_grad() + def infer(self, mel): + """Generate raw audio given mel spectrogram. + + Args: + mel(np.ndarray): Mel spectrogram of an utterance(in log-magnitude). shape=(C_mel, T_mel) + + Returns: + Tensor: The synthesized audio, where``T <= T_mel * upsample_factors``. shape=(B, T) + """ + start = time.time() + condition = self.encoder(mel, trim_conv_artifact=True) # (B, C, T) + batch_size, _, time_steps = condition.shape + z = paddle.randn([batch_size, time_steps], dtype=mel.dtype) + x = self.decoder.inverse(z, condition) + end = time.time() + print("time: {}s".format(end - start)) + return x + + @paddle.no_grad() + def predict(self, mel): + """Generate raw audio given mel spectrogram. + + Args: + mel(np.ndarray): Mel spectrogram of an utterance(in log-magnitude). shape=(C_mel, T_mel) + + Returns: + np.ndarray: The synthesized audio. shape=(T,) + """ + mel = paddle.to_tensor(mel) + mel = paddle.unsqueeze(mel, 0) + audio = self.infer(mel) + audio = audio[0].numpy() + return audio + + @classmethod + def from_pretrained(cls, config, checkpoint_path): + """Build a ConditionalWaveFlow model from a pretrained model. 
+ + Args: + config(yacs.config.CfgNode): model configs + checkpoint_path(Path or str): the path of pretrained model checkpoint, without extension name + + Returns: + ConditionalWaveFlow The model built from pretrained result. + """ + model = cls(upsample_factors=config.model.upsample_factors, + n_flows=config.model.n_flows, + n_layers=config.model.n_layers, + n_group=config.model.n_group, + channels=config.model.channels, + n_mels=config.data.n_mels, + kernel_size=config.model.kernel_size) + checkpoint.load_parameters(model, checkpoint_path=checkpoint_path) + return model + + +class WaveFlowLoss(nn.Layer): + """Criterion of a WaveFlow model. + + Args: + sigma (float): The standard deviation of the gaussian noise used in WaveFlow, + by default 1.0. + """ + + def __init__(self, sigma=1.0): + super().__init__() + self.sigma = sigma + self.const = 0.5 * np.log(2 * np.pi) + np.log(self.sigma) + + def forward(self, z, log_det_jacobian): + """Compute the loss given the transformed random variable z and the + log_det_jacobian of transformation from x to z. + + Args: + z(Tensor): The transformed random variable (x to z). shape=(B, T) + log_det_jacobian(Tensor): The log of the determinant of the jacobian matrix of the + transformation from x to z. shape=(1,) + + Returns: + Tensor: The loss. shape=(1,) + """ + loss = paddle.sum(z * z) / (2 * self.sigma * self.sigma + ) - log_det_jacobian + loss = loss / np.prod(z.shape) + return loss + self.const + + +class ConditionalWaveFlow2Infer(ConditionalWaveFlow): + def forward(self, mel): + """Generate raw audio given mel spectrogram. + + Args: + mel (np.ndarray): Mel spectrogram of an utterance(in log-magnitude). shape=(C_mel, T_mel) + + Returns: + np.ndarray: The synthesized audio. shape=(T,) + + """ + audio = self.predict(mel) + return audio diff --git a/ernie-sat/paddlespeech/t2s/models/wavernn/__init__.py b/ernie-sat/paddlespeech/t2s/models/wavernn/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..80ffd0688bc4339a5e6d34d5a5220133dbc69a9b --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/models/wavernn/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from .wavernn import * +from .wavernn_updater import * diff --git a/ernie-sat/paddlespeech/t2s/models/wavernn/wavernn.py b/ernie-sat/paddlespeech/t2s/models/wavernn/wavernn.py new file mode 100644 index 0000000000000000000000000000000000000000..b4b8b48091ff2ad4f2774af2662afd5e0a48b79c --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/models/wavernn/wavernn.py @@ -0,0 +1,582 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from https://github.com/fatchord/WaveRNN +import sys +import time +from typing import List + +import numpy as np +import paddle +from paddle import nn +from paddle.nn import functional as F + +from paddlespeech.t2s.audio.codec import decode_mu_law +from paddlespeech.t2s.modules.losses import sample_from_discretized_mix_logistic +from paddlespeech.t2s.modules.nets_utils import initialize +from paddlespeech.t2s.modules.upsample import Stretch2D + + +class ResBlock(nn.Layer): + def __init__(self, dims): + super().__init__() + self.conv1 = nn.Conv1D(dims, dims, kernel_size=1, bias_attr=False) + self.conv2 = nn.Conv1D(dims, dims, kernel_size=1, bias_attr=False) + self.batch_norm1 = nn.BatchNorm1D(dims) + self.batch_norm2 = nn.BatchNorm1D(dims) + + def forward(self, x): + ''' + conv -> bn -> relu -> conv -> bn + residual connection + ''' + residual = x + x = self.conv1(x) + x = self.batch_norm1(x) + x = F.relu(x) + x = self.conv2(x) + x = self.batch_norm2(x) + return x + residual + + +class MelResNet(nn.Layer): + def __init__(self, + res_blocks: int=10, + compute_dims: int=128, + res_out_dims: int=128, + aux_channels: int=80, + aux_context_window: int=0): + super().__init__() + k_size = aux_context_window * 2 + 1 + # pay attention here, the dim reduces aux_context_window * 2 + self.conv_in = nn.Conv1D( + aux_channels, compute_dims, kernel_size=k_size, bias_attr=False) + self.batch_norm = nn.BatchNorm1D(compute_dims) + self.layers = nn.LayerList() + for _ in range(res_blocks): + self.layers.append(ResBlock(compute_dims)) + self.conv_out = nn.Conv1D(compute_dims, res_out_dims, kernel_size=1) + + def forward(self, x): + ''' + Args: + x (Tensor): Input tensor (B, in_dims, T). + Returns: + Tensor: Output tensor (B, res_out_dims, T). + ''' + + x = self.conv_in(x) + x = self.batch_norm(x) + x = F.relu(x) + for f in self.layers: + x = f(x) + x = self.conv_out(x) + return x + + +class UpsampleNetwork(nn.Layer): + def __init__(self, + aux_channels: int=80, + upsample_scales: List[int]=[4, 5, 3, 5], + compute_dims: int=128, + res_blocks: int=10, + res_out_dims: int=128, + aux_context_window: int=2): + super().__init__() + # total_scale is the total Up sampling multiple + total_scale = np.prod(upsample_scales) + # TODO pad*total_scale is numpy.int64 + self.indent = int(aux_context_window * total_scale) + self.resnet = MelResNet( + res_blocks=res_blocks, + aux_channels=aux_channels, + compute_dims=compute_dims, + res_out_dims=res_out_dims, + aux_context_window=aux_context_window) + self.resnet_stretch = Stretch2D(total_scale, 1) + self.up_layers = nn.LayerList() + for scale in upsample_scales: + k_size = (1, scale * 2 + 1) + padding = (0, scale) + stretch = Stretch2D(scale, 1) + + conv = nn.Conv2D( + 1, 1, kernel_size=k_size, padding=padding, bias_attr=False) + weight_ = paddle.full_like(conv.weight, 1. / k_size[1]) + conv.weight.set_value(weight_) + self.up_layers.append(stretch) + self.up_layers.append(conv) + + def forward(self, m): + ''' + Args: + c (Tensor): Input tensor (B, C_aux, T). + Returns: + Tensor: Output tensor (B, (T - 2 * pad) * prob(upsample_scales), C_aux). 
+ Tensor: Output tensor (B, (T - 2 * pad) * prob(upsample_scales), res_out_dims). + ''' + # aux: [B, C_aux, T] + # -> [B, res_out_dims, T - 2 * aux_context_window] + # -> [B, 1, res_out_dims, T - 2 * aux_context_window] + aux = self.resnet(m).unsqueeze(1) + # aux: [B, 1, res_out_dims, T - 2 * aux_context_window] + # -> [B, 1, res_out_dims, (T - 2 * pad) * prob(upsample_scales)] + aux = self.resnet_stretch(aux) + # aux: [B, 1, res_out_dims, T * prob(upsample_scales)] + # -> [B, res_out_dims, T * prob(upsample_scales)] + aux = aux.squeeze(1) + # m: [B, C_aux, T] -> [B, 1, C_aux, T] + m = m.unsqueeze(1) + for f in self.up_layers: + m = f(m) + # m: [B, 1, C_aux, T*prob(upsample_scales)] + # -> [B, C_aux, T * prob(upsample_scales)] + # -> [B, C_aux, (T - 2 * pad) * prob(upsample_scales)] + m = m.squeeze(1)[:, :, self.indent:-self.indent] + # m: [B, (T - 2 * pad) * prob(upsample_scales), C_aux] + # aux: [B, (T - 2 * pad) * prob(upsample_scales), res_out_dims] + return m.transpose([0, 2, 1]), aux.transpose([0, 2, 1]) + + +class WaveRNN(nn.Layer): + def __init__( + self, + rnn_dims: int=512, + fc_dims: int=512, + bits: int=9, + aux_context_window: int=2, + upsample_scales: List[int]=[4, 5, 3, 5], + aux_channels: int=80, + compute_dims: int=128, + res_out_dims: int=128, + res_blocks: int=10, + hop_length: int=300, + sample_rate: int=24000, + mode='RAW', + init_type: str="xavier_uniform", ): + ''' + Args: + rnn_dims (int, optional): Hidden dims of RNN Layers. + fc_dims (int, optional): Dims of FC Layers. + bits (int, optional): bit depth of signal. + aux_context_window (int, optional): The context window size of the first convolution applied to the + auxiliary input, by default 2 + upsample_scales (List[int], optional): Upsample scales of the upsample network. + aux_channels (int, optional): Auxiliary channel of the residual blocks. + compute_dims (int, optional): Dims of Conv1D in MelResNet. + res_out_dims (int, optional): Dims of output in MelResNet. + res_blocks (int, optional): Number of residual blocks. + mode (str, optional): Output mode of the WaveRNN vocoder. + `MOL` for Mixture of Logistic Distribution, and `RAW` for quantized bits as the model's output. + init_type (str): How to initialize parameters. 
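+            hop_length (int, optional): Number of audio samples per mel frame, by default 300.
+            sample_rate (int, optional): Sampling rate of the generated audio, by default 24000.
+
+        Note:
+            In ``RAW`` mode the model predicts a categorical distribution over
+            ``2 ** bits`` quantized values, while in ``MOL`` mode it predicts the
+            30 parameters of a 10-component mixture of logistics.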
+ ''' + super().__init__() + self.mode = mode + self.aux_context_window = aux_context_window + if self.mode == 'RAW': + self.n_classes = 2**bits + elif self.mode == 'MOL': + self.n_classes = 10 * 3 + else: + RuntimeError('Unknown model mode value - ', self.mode) + + # List of rnns to call 'flatten_parameters()' on + self._to_flatten = [] + + self.rnn_dims = rnn_dims + self.aux_dims = res_out_dims // 4 + self.hop_length = hop_length + self.sample_rate = sample_rate + + # initialize parameters + initialize(self, init_type) + + self.upsample = UpsampleNetwork( + aux_channels=aux_channels, + upsample_scales=upsample_scales, + compute_dims=compute_dims, + res_blocks=res_blocks, + res_out_dims=res_out_dims, + aux_context_window=aux_context_window) + self.I = nn.Linear(aux_channels + self.aux_dims + 1, rnn_dims) + + self.rnn1 = nn.GRU(rnn_dims, rnn_dims) + self.rnn2 = nn.GRU(rnn_dims + self.aux_dims, rnn_dims) + + self._to_flatten += [self.rnn1, self.rnn2] + + self.fc1 = nn.Linear(rnn_dims + self.aux_dims, fc_dims) + self.fc2 = nn.Linear(fc_dims + self.aux_dims, fc_dims) + self.fc3 = nn.Linear(fc_dims, self.n_classes) + + # Avoid fragmentation of RNN parameters and associated warning + self._flatten_parameters() + + nn.initializer.set_global_initializer(None) + + def forward(self, x, c): + ''' + Args: + x (Tensor): wav sequence, [B, T] + c (Tensor): mel spectrogram [B, C_aux, T'] + + T = (T' - 2 * aux_context_window ) * hop_length + Returns: + Tensor: [B, T, n_classes] + ''' + # Although we `_flatten_parameters()` on init, when using DataParallel + # the model gets replicated, making it no longer guaranteed that the + # weights are contiguous in GPU memory. Hence, we must call it again + self._flatten_parameters() + + bsize = paddle.shape(x)[0] + h1 = paddle.zeros([1, bsize, self.rnn_dims]) + h2 = paddle.zeros([1, bsize, self.rnn_dims]) + # c: [B, T, C_aux] + # aux: [B, T, res_out_dims] + c, aux = self.upsample(c) + + aux_idx = [self.aux_dims * i for i in range(5)] + a1 = aux[:, :, aux_idx[0]:aux_idx[1]] + a2 = aux[:, :, aux_idx[1]:aux_idx[2]] + a3 = aux[:, :, aux_idx[2]:aux_idx[3]] + a4 = aux[:, :, aux_idx[3]:aux_idx[4]] + + x = paddle.concat([x.unsqueeze(-1), c, a1], axis=2) + x = self.I(x) + res = x + x, _ = self.rnn1(x, h1) + + x = x + res + res = x + x = paddle.concat([x, a2], axis=2) + x, _ = self.rnn2(x, h2) + + x = x + res + x = paddle.concat([x, a3], axis=2) + x = F.relu(self.fc1(x)) + + x = paddle.concat([x, a4], axis=2) + x = F.relu(self.fc2(x)) + + return self.fc3(x) + + @paddle.no_grad() + def generate(self, + c, + batched: bool=True, + target: int=12000, + overlap: int=600, + mu_law: bool=True, + gen_display: bool=False): + """ + Args: + c(Tensor): input mels, (T', C_aux) + batched(bool): generate in batch or not + target(int): target number of samples to be generated in each batch entry + overlap(int): number of samples for crossfading between batches + mu_law(bool) + Returns: + wav sequence: Output (T' * prod(upsample_scales), out_channels, C_out). 
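+
+        Examples:
+            A minimal sketch (assumed names and shapes, not part of the API
+            contract): ``model`` is a trained WaveRNN and ``mel`` a log-mel
+            tensor of shape (T', 80).
+
+            >>> wav = model.generate(mel, batched=True, target=12000,
+            ...                      overlap=600, mu_law=True)
+            >>> # roughly (T' * hop_length, 1) samples in [-1, 1]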
+ """ + + self.eval() + + mu_law = mu_law if self.mode == 'RAW' else False + + output = [] + start = time.time() + + # pseudo batch + # (T, C_aux) -> (1, C_aux, T) + c = paddle.transpose(c, [1, 0]).unsqueeze(0) + T = paddle.shape(c)[-1] + wave_len = T * self.hop_length + # TODO remove two transpose op by modifying function pad_tensor + c = self.pad_tensor( + c.transpose([0, 2, 1]), pad=self.aux_context_window, + side='both').transpose([0, 2, 1]) + + c, aux = self.upsample(c) + + if batched: + # (num_folds, target + 2 * overlap, features) + c = self.fold_with_overlap(c, target, overlap) + aux = self.fold_with_overlap(aux, target, overlap) + + # for dygraph to static graph, if use seq_len of `b_size, seq_len, _ = paddle.shape(c)` in for + # will not get TensorArray + # see https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/04_dygraph_to_static/case_analysis_cn.html#list-lodtensorarray + # b_size, seq_len, _ = paddle.shape(c) + b_size = paddle.shape(c)[0] + seq_len = paddle.shape(c)[1] + + h1 = paddle.zeros([b_size, self.rnn_dims]) + h2 = paddle.zeros([b_size, self.rnn_dims]) + x = paddle.zeros([b_size, 1]) + + d = self.aux_dims + aux_split = [aux[:, :, d * i:d * (i + 1)] for i in range(4)] + + for i in range(seq_len): + m_t = c[:, i, :] + # for dygraph to static graph + # a1_t, a2_t, a3_t, a4_t = (a[:, i, :] for a in aux_split) + a1_t = aux_split[0][:, i, :] + a2_t = aux_split[1][:, i, :] + a3_t = aux_split[2][:, i, :] + a4_t = aux_split[3][:, i, :] + x = paddle.concat([x, m_t, a1_t], axis=1) + x = self.I(x) + # use GRUCell here + h1, _ = self.rnn1[0].cell(x, h1) + x = x + h1 + inp = paddle.concat([x, a2_t], axis=1) + # use GRUCell here + h2, _ = self.rnn2[0].cell(inp, h2) + + x = x + h2 + x = paddle.concat([x, a3_t], axis=1) + x = F.relu(self.fc1(x)) + + x = paddle.concat([x, a4_t], axis=1) + x = F.relu(self.fc2(x)) + + logits = self.fc3(x) + + if self.mode == 'MOL': + sample = sample_from_discretized_mix_logistic( + logits.unsqueeze(0).transpose([0, 2, 1])) + output.append(sample.reshape([-1])) + x = sample.transpose([1, 0, 2]) + + elif self.mode == 'RAW': + posterior = F.softmax(logits, axis=1) + distrib = paddle.distribution.Categorical(posterior) + # corresponding operate [np.floor((fx + 1) / 2 * mu + 0.5)] in enocde_mu_law + # distrib.sample([1])[0].cast('float32'): [0, 2**bits-1] + # sample: [-1, 1] + sample = 2 * distrib.sample([1])[0].cast('float32') / ( + self.n_classes - 1.) - 1. 
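+                    # NOTE: when `mu_law` is True these samples are still in the
+                    # companded domain; decode_mu_law() expands them back to
+                    # linear amplitude after the generation loop.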
+ output.append(sample) + x = sample.unsqueeze(-1) + else: + raise RuntimeError('Unknown model mode value - ', self.mode) + + if gen_display: + if i % 1000 == 0: + self.gen_display(i, int(seq_len), int(b_size), start) + + output = paddle.stack(output).transpose([1, 0]) + + if mu_law: + output = decode_mu_law(output, self.n_classes, False) + + if batched: + output = self.xfade_and_unfold(output, target, overlap) + else: + output = output[0] + + # Fade-out at the end to avoid signal cutting out suddenly + fade_out = paddle.linspace(1, 0, 10 * self.hop_length) + output = output[:wave_len] + output[-10 * self.hop_length:] *= fade_out + + self.train() + + # 增加 C_out 维度 + return output.unsqueeze(-1) + + def _flatten_parameters(self): + [m.flatten_parameters() for m in self._to_flatten] + + def pad_tensor(self, x, pad, side='both'): + ''' + Args: + x(Tensor): mel, [1, n_frames, 80] + pad(int): + side(str, optional): (Default value = 'both') + + Returns: + Tensor + ''' + b, t, _ = paddle.shape(x) + # for dygraph to static graph + c = x.shape[-1] + total = t + 2 * pad if side == 'both' else t + pad + padded = paddle.zeros([b, total, c]) + if side == 'before' or side == 'both': + padded[:, pad:pad + t, :] = x + elif side == 'after': + padded[:, :t, :] = x + return padded + + def fold_with_overlap(self, x, target, overlap): + ''' + Fold the tensor with overlap for quick batched inference. + Overlap will be used for crossfading in xfade_and_unfold() + + Args: + x(Tensor): Upsampled conditioning features. mels or aux + shape=(1, T, features) + mels: [1, T, 80] + aux: [1, T, 128] + target(int): Target timesteps for each index of batch + overlap(int): Timesteps for both xfade and rnn warmup + + Returns: + Tensor: + shape=(num_folds, target + 2 * overlap, features) + num_flods = (time_seq - overlap) // (target + overlap) + mel: [num_folds, target + 2 * overlap, 80] + aux: [num_folds, target + 2 * overlap, 128] + + Details: + x = [[h1, h2, ... hn]] + Where each h is a vector of conditioning features + Eg: target=2, overlap=1 with x.size(1)=10 + + folded = [[h1, h2, h3, h4], + [h4, h5, h6, h7], + [h7, h8, h9, h10]] + ''' + + _, total_len, features = paddle.shape(x) + + # Calculate variables needed + num_folds = (total_len - overlap) // (target + overlap) + extended_len = num_folds * (overlap + target) + overlap + remaining = total_len - extended_len + + # Pad if some time steps poking out + if remaining != 0: + num_folds += 1 + padding = target + 2 * overlap - remaining + x = self.pad_tensor(x, padding, side='after') + + folded = paddle.zeros([num_folds, target + 2 * overlap, features]) + + # Get the values for the folded tensor + for i in range(num_folds): + start = i * (target + overlap) + end = start + target + 2 * overlap + folded[i] = x[0][start:end, :] + return folded + + def xfade_and_unfold(self, y, target: int=12000, overlap: int=600): + ''' Applies a crossfade and unfolds into a 1d array. + + Args: + y (Tensor): + Batched sequences of audio samples + shape=(num_folds, target + 2 * overlap) + dtype=paddle.float32 + overlap (int): Timesteps for both xfade and rnn warmup + + Returns: + Tensor + audio samples in a 1d array + shape=(total_len) + dtype=paddle.float32 + + Details: + y = [[seq1], + [seq2], + [seq3]] + + Apply a gain envelope at both ends of the sequences + + y = [[seq1_in, seq1_target, seq1_out], + [seq2_in, seq2_target, seq2_out], + [seq3_in, seq3_target, seq3_out]] + + Stagger and add up the groups of samples: + + [seq1_in, seq1_target, (seq1_out + seq2_in), seq2_target, ...] 
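+
+            Worked example: with num_folds=3, target=2 and overlap=1, the three
+            folds overlap-add back into total_len = 3 * (2 + 1) + 1 = 10
+            samples, matching the folding example in fold_with_overlap().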
+ + ''' + # num_folds = (total_len - overlap) // (target + overlap) + num_folds, length = paddle.shape(y) + target = length - 2 * overlap + total_len = num_folds * (target + overlap) + overlap + + # Need some silence for the run warmup + slience_len = 0 + linear_len = slience_len + fade_len = overlap - slience_len + slience = paddle.zeros([slience_len], dtype=paddle.float32) + linear = paddle.ones([linear_len], dtype=paddle.float32) + + # Equal power crossfade + # fade_in increase from 0 to 1, fade_out reduces from 1 to 0 + sigmoid_scale = 2.3 + t = paddle.linspace( + -sigmoid_scale, sigmoid_scale, fade_len, dtype=paddle.float32) + # sigmoid 曲线应该更好 + fade_in = paddle.nn.functional.sigmoid(t) + fade_out = 1 - paddle.nn.functional.sigmoid(t) + # Concat the silence to the fades + fade_out = paddle.concat([linear, fade_out]) + fade_in = paddle.concat([slience, fade_in]) + + # Apply the gain to the overlap samples + y[:, :overlap] *= fade_in + y[:, -overlap:] *= fade_out + + unfolded = paddle.zeros([total_len], dtype=paddle.float32) + + # Loop to add up all the samples + for i in range(num_folds): + start = i * (target + overlap) + end = start + target + 2 * overlap + unfolded[start:end] += y[i] + + return unfolded + + def gen_display(self, i, seq_len, b_size, start): + gen_rate = (i + 1) / (time.time() - start) * b_size / 1000 + pbar = self.progbar(i, seq_len) + msg = f'| {pbar} {i*b_size}/{seq_len*b_size} | Batch Size: {b_size} | Gen Rate: {gen_rate:.1f}kHz | ' + sys.stdout.write(f"\r{msg}") + + def progbar(self, i, n, size=16): + done = int(i * size) // n + bar = '' + for i in range(size): + bar += '█' if i <= done else '░' + return bar + + +class WaveRNNInference(nn.Layer): + def __init__(self, normalizer, wavernn): + super().__init__() + self.normalizer = normalizer + self.wavernn = wavernn + + def forward(self, + logmel, + batched: bool=True, + target: int=12000, + overlap: int=600, + mu_law: bool=True, + gen_display: bool=False): + normalized_mel = self.normalizer(logmel) + + wav = self.wavernn.generate( + normalized_mel, ) + # batched=batched, + # target=target, + # overlap=overlap, + # mu_law=mu_law, + # gen_display=gen_display) + + return wav diff --git a/ernie-sat/paddlespeech/t2s/models/wavernn/wavernn_updater.py b/ernie-sat/paddlespeech/t2s/models/wavernn/wavernn_updater.py new file mode 100644 index 0000000000000000000000000000000000000000..b2756d00c8f83d9b96bdc1a36f755a6c6f50ca09 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/models/wavernn/wavernn_updater.py @@ -0,0 +1,201 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
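+# Training-loop pieces for WaveRNN: ``WaveRNNUpdater`` runs one optimization
+# step per batch (forward, loss, backward, gradient-norm reporting) and
+# ``WaveRNNEvaluator`` computes validation loss and periodically synthesizes
+# audio samples from held-out mel spectrograms.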
+import logging +from pathlib import Path + +import paddle +import soundfile as sf +from paddle import distributed as dist +from paddle.io import DataLoader +from paddle.nn import Layer +from paddle.optimizer import Optimizer + +from paddlespeech.t2s.training.extensions.evaluator import StandardEvaluator +from paddlespeech.t2s.training.reporter import report +from paddlespeech.t2s.training.updaters.standard_updater import StandardUpdater +logging.basicConfig( + format='%(asctime)s [%(levelname)s] [%(filename)s:%(lineno)d] %(message)s', + datefmt='[%Y-%m-%d %H:%M:%S]') +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + + +def calculate_grad_norm(parameters, norm_type: str=2): + ''' + calculate grad norm of mdoel's parameters + parameters: + model's parameters + norm_type: str + Returns + ------------ + Tensor + grad_norm + ''' + + grad_list = [ + paddle.to_tensor(p.grad) for p in parameters if p.grad is not None + ] + norm_list = paddle.stack( + [paddle.norm(grad, norm_type) for grad in grad_list]) + total_norm = paddle.norm(norm_list) + return total_norm + + +# for save name in gen_valid_samples() +ITERATION = 0 + + +class WaveRNNUpdater(StandardUpdater): + def __init__(self, + model: Layer, + optimizer: Optimizer, + criterion: Layer, + dataloader: DataLoader, + init_state=None, + output_dir: Path=None, + mode='RAW'): + super().__init__(model, optimizer, dataloader, init_state=None) + + self.criterion = criterion + # self.scheduler = scheduler + + log_file = output_dir / 'worker_{}.log'.format(dist.get_rank()) + self.filehandler = logging.FileHandler(str(log_file)) + logger.addHandler(self.filehandler) + self.logger = logger + self.msg = "" + self.mode = mode + + def update_core(self, batch): + + self.msg = "Rank: {}, ".format(dist.get_rank()) + losses_dict = {} + # parse batch + self.model.train() + self.optimizer.clear_grad() + + wav, y, mel = batch + + y_hat = self.model(wav, mel) + if self.mode == 'RAW': + y_hat = y_hat.transpose([0, 2, 1]).unsqueeze(-1) + elif self.mode == 'MOL': + y_hat = paddle.cast(y, dtype='float32') + + y = y.unsqueeze(-1) + loss = self.criterion(y_hat, y) + loss.backward() + grad_norm = float( + calculate_grad_norm(self.model.parameters(), norm_type=2)) + + self.optimizer.step() + + report("train/loss", float(loss)) + report("train/grad_norm", float(grad_norm)) + + losses_dict["loss"] = float(loss) + losses_dict["grad_norm"] = float(grad_norm) + self.msg += ', '.join('{}: {:>.6f}'.format(k, v) + for k, v in losses_dict.items()) + global ITERATION + ITERATION = self.state.iteration + 1 + + +class WaveRNNEvaluator(StandardEvaluator): + def __init__(self, + model: Layer, + criterion: Layer, + dataloader: Optimizer, + output_dir: Path=None, + valid_generate_loader=None, + config=None): + super().__init__(model, dataloader) + + log_file = output_dir / 'worker_{}.log'.format(dist.get_rank()) + self.filehandler = logging.FileHandler(str(log_file)) + logger.addHandler(self.filehandler) + self.logger = logger + self.msg = "" + + self.criterion = criterion + self.valid_generate_loader = valid_generate_loader + self.config = config + self.mode = config.model.mode + + self.valid_samples_dir = output_dir / "valid_samples" + self.valid_samples_dir.mkdir(parents=True, exist_ok=True) + + def evaluate_core(self, batch): + self.msg = "Evaluate: " + losses_dict = {} + # parse batch + wav, y, mel = batch + y_hat = self.model(wav, mel) + + if self.mode == 'RAW': + y_hat = y_hat.transpose([0, 2, 1]).unsqueeze(-1) + elif self.mode == 'MOL': + y_hat = paddle.cast(y, 
dtype='float32') + + y = y.unsqueeze(-1) + loss = self.criterion(y_hat, y) + report("eval/loss", float(loss)) + + losses_dict["loss"] = float(loss) + + self.msg += ', '.join('{}: {:>.6f}'.format(k, v) + for k, v in losses_dict.items()) + self.logger.info(self.msg) + + def gen_valid_samples(self): + + for i, item in enumerate(self.valid_generate_loader): + if i >= self.config.generate_num: + break + print( + '\n| Generating: {}/{}'.format(i + 1, self.config.generate_num)) + + mel = item['feats'] + wav = item['wave'] + wav = wav.squeeze(0) + + origin_save_path = self.valid_samples_dir / '{}_steps_{}_target.wav'.format( + self.iteration, i) + sf.write(origin_save_path, wav.numpy(), samplerate=self.config.fs) + + if self.config.inference.gen_batched: + batch_str = 'gen_batched_target{}_overlap{}'.format( + self.config.inference.target, self.config.inference.overlap) + else: + batch_str = 'gen_not_batched' + gen_save_path = str(self.valid_samples_dir / + '{}_steps_{}_{}.wav'.format(self.iteration, i, + batch_str)) + # (1, T, C_aux) -> (T, C_aux) + mel = mel.squeeze(0) + gen_sample = self.model.generate( + mel, self.config.inference.gen_batched, + self.config.inference.target, self.config.inference.overlap, + self.config.mu_law) + sf.write( + gen_save_path, gen_sample.numpy(), samplerate=self.config.fs) + + def __call__(self, trainer=None): + summary = self.evaluate() + for k, v in summary.items(): + report(k, v) + # gen samples at then end of evaluate + self.iteration = ITERATION + if self.iteration % self.config.gen_eval_samples_interval_steps == 0: + self.gen_valid_samples() diff --git a/ernie-sat/paddlespeech/t2s/modules/__init__.py b/ernie-sat/paddlespeech/t2s/modules/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1e3312002cada4503b6c4d43f2ea5b30ba9d7efb --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/modules/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from .conv import * +from .geometry import * +from .losses import * +from .positional_encoding import * diff --git a/ernie-sat/paddlespeech/t2s/modules/activation.py b/ernie-sat/paddlespeech/t2s/modules/activation.py new file mode 100644 index 0000000000000000000000000000000000000000..8d8cd62ef4b232bbd32d2a0d3a050a7876e66b5c --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/modules/activation.py @@ -0,0 +1,43 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
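+# Activation helpers for t2s modules: a GLU layer and ``get_activation``, a
+# small factory that maps activation names (e.g. "relu", "swish", "glu") to
+# the corresponding paddle layers.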
+import paddle +import paddle.nn.functional as F +from paddle import nn + + +class GLU(nn.Layer): + """Gated Linear Units (GLU) Layer""" + + def __init__(self, dim: int=-1): + super().__init__() + self.dim = dim + + def forward(self, xs): + return F.glu(xs, axis=self.dim) + + +def get_activation(act, **kwargs): + """Return activation function.""" + + activation_funcs = { + "hardtanh": paddle.nn.Hardtanh, + "tanh": paddle.nn.Tanh, + "relu": paddle.nn.ReLU, + "selu": paddle.nn.SELU, + "leakyrelu": paddle.nn.LeakyReLU, + "swish": paddle.nn.Swish, + "glu": GLU + } + + return activation_funcs[act](**kwargs) diff --git a/ernie-sat/paddlespeech/t2s/modules/causal_conv.py b/ernie-sat/paddlespeech/t2s/modules/causal_conv.py new file mode 100644 index 0000000000000000000000000000000000000000..3abccc15f45e0911f18535efe8575177f735c66b --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/modules/causal_conv.py @@ -0,0 +1,74 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Causal convolusion layer modules.""" +import paddle +from paddle import nn + + +class CausalConv1D(nn.Layer): + """CausalConv1D module with customized initialization.""" + + def __init__( + self, + in_channels, + out_channels, + kernel_size, + dilation=1, + bias=True, + pad="Pad1D", + pad_params={"value": 0.0}, ): + """Initialize CausalConv1d module.""" + super().__init__() + self.pad = getattr(paddle.nn, pad)((kernel_size - 1) * dilation, + **pad_params) + self.conv = nn.Conv1D( + in_channels, + out_channels, + kernel_size, + dilation=dilation, + bias_attr=bias) + + def forward(self, x): + """Calculate forward propagation. + Args: + x (Tensor): Input tensor (B, in_channels, T). + Returns: + Tensor: Output tensor (B, out_channels, T). + """ + return self.conv(self.pad(x))[:, :, :x.shape[2]] + + +class CausalConv1DTranspose(nn.Layer): + """CausalConv1DTranspose module with customized initialization.""" + + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride, + bias=True): + """Initialize CausalConvTranspose1d module.""" + super().__init__() + self.deconv = nn.Conv1DTranspose( + in_channels, out_channels, kernel_size, stride, bias_attr=bias) + self.stride = stride + + def forward(self, x): + """Calculate forward propagation. + Args: + x (Tensor): Input tensor (B, in_channels, T_in). + Returns: + Tensor: Output tensor (B, out_channels, T_out). + """ + return self.deconv(x)[:, :, :-self.stride] diff --git a/ernie-sat/paddlespeech/t2s/modules/conformer/__init__.py b/ernie-sat/paddlespeech/t2s/modules/conformer/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..abf198b97e6e818e1fbe59006f98492640bcee54 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/modules/conformer/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/ernie-sat/paddlespeech/t2s/modules/conformer/convolution.py b/ernie-sat/paddlespeech/t2s/modules/conformer/convolution.py new file mode 100644 index 0000000000000000000000000000000000000000..185c62fb3c804f9ce495323f590878072d8bafa6 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/modules/conformer/convolution.py @@ -0,0 +1,81 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from espnet(https://github.com/espnet/espnet) +"""ConvolutionModule definition.""" +from paddle import nn + + +class ConvolutionModule(nn.Layer): + """ConvolutionModule in Conformer model. + + Args: + channels (int): The number of channels of conv layers. + kernel_size (int): Kernerl size of conv layers. + """ + + def __init__(self, channels, kernel_size, activation=nn.ReLU(), bias=True): + """Construct an ConvolutionModule object.""" + super().__init__() + # kernerl_size should be a odd number for 'SAME' padding + assert (kernel_size - 1) % 2 == 0 + + self.pointwise_conv1 = nn.Conv1D( + channels, + 2 * channels, + kernel_size=1, + stride=1, + padding=0, + bias_attr=bias, ) + self.depthwise_conv = nn.Conv1D( + channels, + channels, + kernel_size, + stride=1, + padding=(kernel_size - 1) // 2, + groups=channels, + bias_attr=bias, ) + self.norm = nn.BatchNorm1D(channels) + self.pointwise_conv2 = nn.Conv1D( + channels, + channels, + kernel_size=1, + stride=1, + padding=0, + bias_attr=bias, ) + self.activation = activation + + def forward(self, x): + """Compute convolution module. + + Args: + x (Tensor): Input tensor (#batch, time, channels). + Returns: + Tensor: Output tensor (#batch, time, channels). + """ + # exchange the temporal dimension and the feature dimension + x = x.transpose([0, 2, 1]) + + # GLU mechanism + # (batch, 2*channel, time) + x = self.pointwise_conv1(x) + # (batch, channel, time) + x = nn.functional.glu(x, axis=1) + + # 1D Depthwise Conv + x = self.depthwise_conv(x) + x = self.activation(self.norm(x)) + + x = self.pointwise_conv2(x) + + return x.transpose([0, 2, 1]) diff --git a/ernie-sat/paddlespeech/t2s/modules/conformer/encoder_layer.py b/ernie-sat/paddlespeech/t2s/modules/conformer/encoder_layer.py new file mode 100644 index 0000000000000000000000000000000000000000..61c32612527630ec66941b882335a208a50d1b11 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/modules/conformer/encoder_layer.py @@ -0,0 +1,182 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from espnet(https://github.com/espnet/espnet) +"""Encoder self-attention layer definition.""" +import paddle +from paddle import nn + +from paddlespeech.t2s.modules.layer_norm import LayerNorm + + +class EncoderLayer(nn.Layer): + """Encoder layer module. + + Args: + size (int): Input dimension. + self_attn (nn.Layer): Self-attention module instance. + `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` instance + can be used as the argument. + feed_forward (nn.Layer): Feed-forward module instance. + `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance + can be used as the argument. + feed_forward_macaron (nn.Layer): Additional feed-forward module instance. + `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance + can be used as the argument. + conv_module (nn.Layer): Convolution module instance. + `ConvlutionModule` instance can be used as the argument. + dropout_rate (float): Dropout rate. + normalize_before (bool): Whether to use layer_norm before the first block. + concat_after (bool): Whether to concat attention layer's input and output. + if True, additional linear will be applied. + i.e. x -> x + linear(concat(x, att(x))) + if False, no additional linear will be applied. i.e. x -> x + att(x) + stochastic_depth_rate (float): Proability to skip this layer. + During training, the layer may skip residual computation and return input + as-is with given probability. + """ + + def __init__( + self, + size, + self_attn, + feed_forward, + feed_forward_macaron, + conv_module, + dropout_rate, + normalize_before=True, + concat_after=False, + stochastic_depth_rate=0.0, ): + """Construct an EncoderLayer object.""" + super().__init__() + self.self_attn = self_attn + self.feed_forward = feed_forward + self.feed_forward_macaron = feed_forward_macaron + self.conv_module = conv_module + self.norm_ff = LayerNorm(size) # for the FNN module + self.norm_mha = LayerNorm(size) # for the MHA module + if feed_forward_macaron is not None: + self.norm_ff_macaron = LayerNorm(size) + self.ff_scale = 0.5 + else: + self.ff_scale = 1.0 + if self.conv_module is not None: + self.norm_conv = LayerNorm(size) # for the CNN module + self.norm_final = LayerNorm( + size) # for the final output of the block + self.dropout = nn.Dropout(dropout_rate) + self.size = size + self.normalize_before = normalize_before + self.concat_after = concat_after + if self.concat_after: + self.concat_linear = nn.Linear(size + size, size) + self.stochastic_depth_rate = stochastic_depth_rate + + def forward(self, x_input, mask, cache=None): + """Compute encoded features. + + Args: + x_input(Union[Tuple, Tensor]): Input tensor w/ or w/o pos emb. + - w/ pos emb: Tuple of tensors [(#batch, time, size), (1, time, size)]. + - w/o pos emb: Tensor (#batch, time, size). + mask(Tensor): Mask tensor for the input (#batch, time). + cache (Tensor): + + Returns: + Tensor: Output tensor (#batch, time, size). + Tensor: Mask tensor (#batch, time). 
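+
+        Note:
+            When ``cache`` is given, self-attention uses only the last frame of
+            ``x`` as the query, the remaining blocks operate on that single
+            frame, and ``cache`` is concatenated back onto the output along the
+            time axis.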
+ """ + if isinstance(x_input, tuple): + x, pos_emb = x_input[0], x_input[1] + else: + x, pos_emb = x_input, None + + skip_layer = False + # with stochastic depth, residual connection `x + f(x)` becomes + # `x <- x + 1 / (1 - p) * f(x)` at training time. + stoch_layer_coeff = 1.0 + if self.training and self.stochastic_depth_rate > 0: + skip_layer = paddle.rand(1).item() < self.stochastic_depth_rate + stoch_layer_coeff = 1.0 / (1 - self.stochastic_depth_rate) + + if skip_layer: + if cache is not None: + x = paddle.concat([cache, x], axis=1) + if pos_emb is not None: + return (x, pos_emb), mask + return x, mask + + # whether to use macaron style + if self.feed_forward_macaron is not None: + residual = x + if self.normalize_before: + x = self.norm_ff_macaron(x) + x = residual + stoch_layer_coeff * self.ff_scale * self.dropout( + self.feed_forward_macaron(x)) + if not self.normalize_before: + x = self.norm_ff_macaron(x) + + # multi-headed self-attention module + residual = x + if self.normalize_before: + x = self.norm_mha(x) + + if cache is None: + x_q = x + else: + assert cache.shape == (x.shape[0], x.shape[1] - 1, self.size) + x_q = x[:, -1:, :] + residual = residual[:, -1:, :] + mask = None if mask is None else mask[:, -1:, :] + + if pos_emb is not None: + x_att = self.self_attn(x_q, x, x, pos_emb, mask) + else: + x_att = self.self_attn(x_q, x, x, mask) + + if self.concat_after: + x_concat = paddle.concat((x, x_att), axis=-1) + x = residual + stoch_layer_coeff * self.concat_linear(x_concat) + else: + x = residual + stoch_layer_coeff * self.dropout(x_att) + if not self.normalize_before: + x = self.norm_mha(x) + + # convolution module + if self.conv_module is not None: + residual = x + if self.normalize_before: + x = self.norm_conv(x) + x = residual + stoch_layer_coeff * self.dropout(self.conv_module(x)) + if not self.normalize_before: + x = self.norm_conv(x) + + # feed forward module + residual = x + if self.normalize_before: + x = self.norm_ff(x) + x = residual + stoch_layer_coeff * self.ff_scale * self.dropout( + self.feed_forward(x)) + if not self.normalize_before: + x = self.norm_ff(x) + + if self.conv_module is not None: + x = self.norm_final(x) + + if cache is not None: + x = paddle.concat([cache, x], axis=1) + + if pos_emb is not None: + return (x, pos_emb), mask + + return x, mask diff --git a/ernie-sat/paddlespeech/t2s/modules/conv.py b/ernie-sat/paddlespeech/t2s/modules/conv.py new file mode 100644 index 0000000000000000000000000000000000000000..aa875bd500124e5bd3d3807b10f63ed8442d3800 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/modules/conv.py @@ -0,0 +1,238 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import paddle +from paddle import nn + +__all__ = [ + "Conv1dCell", + "Conv1dBatchNorm", +] + + +class Conv1dCell(nn.Conv1D): + """A subclass of Conv1D layer, which can be used in an autoregressive + decoder like an RNN cell. 
+ + When used in autoregressive decoding, it performs causal temporal + convolution incrementally. At each time step, it takes a step input and + returns a step output. + + Notes + ------ + It is done by caching an internal buffer of length ``receptive_file - 1``. + when adding a step input, the buffer is shited by one step, the latest + input is added to be buffer and the oldest step is discarded. And it + returns a step output. For single step case, convolution is equivalent to a + linear transformation. + That it can be used as a cell depends on several restrictions: + 1. stride must be 1; + 2. padding must be a causal padding (recpetive_field - 1, 0). + Thus, these arguments are removed from the ``__init__`` method of this + class. + + Args: + in_channels (int): The feature size of the input. + out_channels (int): The feature size of the output. + kernel_size (int or Tuple[int]): The size of the kernel. + dilation (int or Tuple[int]): The dilation of the convolution, by default 1 + weight_attr (ParamAttr, Initializer, str or bool, optional) : The parameter attribute of the convolution kernel, + by default None. + bias_attr (ParamAttr, Initializer, str or bool, optional):The parameter attribute of the bias. + If ``False``, this layer does not have a bias, by default None. + + Examples: + >>> cell = Conv1dCell(3, 4, kernel_size=5) + >>> inputs = [paddle.randn([4, 3]) for _ in range(16)] + >>> outputs = [] + >>> cell.eval() + >>> cell.start_sequence() + >>> for xt in inputs: + >>> outputs.append(cell.add_input(xt)) + >>> len(outputs)) + 16 + >>> outputs[0].shape + [4, 4] + """ + + def __init__(self, + in_channels, + out_channels, + kernel_size, + dilation=1, + weight_attr=None, + bias_attr=None): + _dilation = dilation[0] if isinstance(dilation, + (tuple, list)) else dilation + _kernel_size = kernel_size[0] if isinstance(kernel_size, ( + tuple, list)) else kernel_size + self._r = 1 + (_kernel_size - 1) * _dilation + super().__init__( + in_channels, + out_channels, + kernel_size, + padding=(self._r - 1, 0), + dilation=dilation, + weight_attr=weight_attr, + bias_attr=bias_attr, + data_format="NCL") + + @property + def receptive_field(self): + """The receptive field of the Conv1dCell. + """ + return self._r + + def start_sequence(self): + """Prepare the layer for a series of incremental forward. + + Warnings: + This method should be called before a sequence of calls to + ``add_input``. + + Raises: + Exception + If this method is called when the layer is in training mode. + """ + if self.training: + raise Exception("only use start_sequence in evaluation") + self._buffer = None + + # NOTE: call self's weight norm hook expliccitly since self.weight + # is visited directly in this method without calling self.__call__ + # method. If we do not trigger the weight norm hook, the weight + # may be outdated. e.g. after loading from a saved checkpoint + # see also: https://github.com/pytorch/pytorch/issues/47588 + for hook in self._forward_pre_hooks.values(): + hook(self, None) + self._reshaped_weight = paddle.reshape(self.weight, + (self._out_channels, -1)) + + def initialize_buffer(self, x_t): + """Initialize the buffer for the step input. + + Args: + x_t (Tensor): The step input. shape=(batch_size, in_channels) + + """ + batch_size, _ = x_t.shape + self._buffer = paddle.zeros( + (batch_size, self._in_channels, self.receptive_field), + dtype=x_t.dtype) + + def update_buffer(self, x_t): + """Shift the buffer by one step. + + Args: + x_t (Tensor): The step input. 
shape=(batch_size, in_channels) + + """ + self._buffer = paddle.concat( + [self._buffer[:, :, 1:], paddle.unsqueeze(x_t, -1)], -1) + + def add_input(self, x_t): + """Add step input and compute step output. + + Args: + x_t (Tensor): The step input. shape=(batch_size, in_channels) + + Returns: + y_t (Tensor): The step output. shape=(batch_size, out_channels) + + """ + batch_size = x_t.shape[0] + if self.receptive_field > 1: + if self._buffer is None: + self.initialize_buffer(x_t) + + # update buffer + self.update_buffer(x_t) + if self._dilation[0] > 1: + input = self._buffer[:, :, ::self._dilation[0]] + else: + input = self._buffer + input = paddle.reshape(input, (batch_size, -1)) + else: + input = x_t + y_t = paddle.matmul(input, self._reshaped_weight, transpose_y=True) + y_t = y_t + self.bias + return y_t + + +class Conv1dBatchNorm(nn.Layer): + """A Conv1D Layer followed by a BatchNorm1D. + + Args: + in_channels (int): The feature size of the input. + out_channels (int): The feature size of the output. + kernel_size (int): The size of the convolution kernel. + stride (int, optional): The stride of the convolution, by default 1. + padding (int, str or Tuple[int], optional): + The padding of the convolution. + If int, a symmetrical padding is applied before convolution; + If str, it should be "same" or "valid"; + If Tuple[int], its length should be 2, meaning + ``(pad_before, pad_after)``, by default 0. + weight_attr (ParamAttr, Initializer, str or bool, optional): + The parameter attribute of the convolution kernel, + by default None. + bias_attr (ParamAttr, Initializer, str or bool, optional): + The parameter attribute of the bias of the convolution, + by defaultNone. + data_format (str ["NCL" or "NLC"], optional): The data layout of the input, by default "NCL" + momentum (float, optional): The momentum of the BatchNorm1D layer, by default 0.9 + epsilon (float, optional): The epsilon of the BatchNorm1D layer, by default 1e-05 + """ + + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + weight_attr=None, + bias_attr=None, + data_format="NCL", + momentum=0.9, + epsilon=1e-05): + super().__init__() + self.conv = nn.Conv1D( + in_channels, + out_channels, + kernel_size, + stride, + padding=padding, + weight_attr=weight_attr, + bias_attr=bias_attr, + data_format=data_format) + self.bn = nn.BatchNorm1D( + out_channels, + momentum=momentum, + epsilon=epsilon, + data_format=data_format) + + def forward(self, x): + """Forward pass of the Conv1dBatchNorm layer. + + Args: + x (Tensor): The input tensor. Its data layout depends on ``data_format``. + shape=(B, C_in, T_in) or (B, T_in, C_in) + + Returns: + Tensor: The output tensor. + shape=(B, C_out, T_out) or (B, T_out, C_out) + + """ + x = self.conv(x) + x = self.bn(x) + return x diff --git a/ernie-sat/paddlespeech/t2s/modules/geometry.py b/ernie-sat/paddlespeech/t2s/modules/geometry.py new file mode 100644 index 0000000000000000000000000000000000000000..01eb5ad0ab2479cff21d210c3b2f1aa5742fbd4c --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/modules/geometry.py @@ -0,0 +1,44 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import numpy as np +import paddle + + +def shuffle_dim(x, axis, perm=None): + """Permute input tensor along aixs given the permutation or randomly. + + Args: + x (Tensor): The input tensor. + axis (int): The axis to shuffle. + perm (List[int], ndarray, optional): + The order to reorder the tensor along the ``axis``-th dimension. + It is a permutation of ``[0, d)``, where d is the size of the + ``axis``-th dimension of the input tensor. If not provided, + a random permutation is used. Defaults to None. + + Returns: + Tensor: The shuffled tensor, which has the same shape as x does. + """ + size = x.shape[axis] + if perm is not None and len(perm) != size: + raise ValueError("length of permutation should equals the input " + "tensor's axis-th dimension's size") + if perm is not None: + perm = np.array(perm) + else: + perm = np.random.permutation(size) + + perm = paddle.to_tensor(perm) + out = paddle.gather(x, perm, axis) + return out diff --git a/ernie-sat/paddlespeech/t2s/modules/layer_norm.py b/ernie-sat/paddlespeech/t2s/modules/layer_norm.py new file mode 100644 index 0000000000000000000000000000000000000000..088b98e02cf3fc987da54b881cf8060dfe15ecf2 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/modules/layer_norm.py @@ -0,0 +1,60 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Layer normalization module.""" +import paddle +from paddle import nn + + +class LayerNorm(nn.LayerNorm): + """Layer normalization module. + Args: + nout (int): Output dim size. + dim (int): Dimension to be normalized. + """ + + def __init__(self, nout, dim=-1): + """Construct an LayerNorm object.""" + super().__init__(nout) + self.dim = dim + + def forward(self, x): + """Apply layer normalization. + + Args: + x (Tensor):Input tensor. + + Returns: + Tensor: Normalized tensor. + """ + + if self.dim == -1: + return super(LayerNorm, self).forward(x) + else: + len_dim = len(x.shape) + if self.dim < 0: + self.dim = len_dim + self.dim + assert self.dim >= 0 + + orig_perm = list(range(len_dim)) + new_perm = orig_perm[:] + # Python style item change is not able when converting dygraph to static graph. 
+ # new_perm[self.dim], new_perm[len_dim -1] = new_perm[len_dim -1], new_perm[self.dim] + # use C++ style item change here + temp = new_perm[self.dim] + new_perm[self.dim] = new_perm[len_dim - 1] + new_perm[len_dim - 1] = temp + + return paddle.transpose( + super(LayerNorm, self).forward(paddle.transpose(x, new_perm)), + new_perm) diff --git a/ernie-sat/paddlespeech/t2s/modules/losses.py b/ernie-sat/paddlespeech/t2s/modules/losses.py new file mode 100644 index 0000000000000000000000000000000000000000..db31bcfbb4361281df49d3afeb00dfb97c59d7f9 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/modules/losses.py @@ -0,0 +1,1008 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import math + +import librosa +import numpy as np +import paddle +from paddle import nn +from paddle.fluid.layers import sequence_mask +from paddle.nn import functional as F +from scipy import signal + +from paddlespeech.t2s.modules.nets_utils import make_non_pad_mask + + +# Losses for WaveRNN +def log_sum_exp(x): + """ numerically stable log_sum_exp implementation that prevents overflow """ + # TF ordering + axis = len(x.shape) - 1 + m = paddle.max(x, axis=axis) + m2 = paddle.max(x, axis=axis, keepdim=True) + return m + paddle.log(paddle.sum(paddle.exp(x - m2), axis=axis)) + + +# It is adapted from https://github.com/r9y9/wavenet_vocoder/blob/master/wavenet_vocoder/mixture.py +def discretized_mix_logistic_loss(y_hat, + y, + num_classes=65536, + log_scale_min=None, + reduce=True): + if log_scale_min is None: + log_scale_min = float(np.log(1e-14)) + y_hat = y_hat.transpose([0, 2, 1]) + assert y_hat.dim() == 3 + assert y_hat.shape[1] % 3 == 0 + nr_mix = y_hat.shape[1] // 3 + + # (B x T x C) + y_hat = y_hat.transpose([0, 2, 1]) + + # unpack parameters. (B, T, num_mixtures) x 3 + logit_probs = y_hat[:, :, :nr_mix] + means = y_hat[:, :, nr_mix:2 * nr_mix] + log_scales = paddle.clip( + y_hat[:, :, 2 * nr_mix:3 * nr_mix], min=log_scale_min) + + # B x T x 1 -> B x T x num_mixtures + y = y.expand_as(means) + centered_y = paddle.cast(y, dtype=paddle.get_default_dtype()) - means + inv_stdv = paddle.exp(-log_scales) + plus_in = inv_stdv * (centered_y + 1. / (num_classes - 1)) + cdf_plus = F.sigmoid(plus_in) + min_in = inv_stdv * (centered_y - 1. / (num_classes - 1)) + cdf_min = F.sigmoid(min_in) + + # log probability for edge case of 0 (before scaling) + # equivalent: torch.log(F.sigmoid(plus_in)) + # softplus: log(1+ e^{-x}) + log_cdf_plus = plus_in - F.softplus(plus_in) + + # log probability for edge case of 255 (before scaling) + # equivalent: (1 - F.sigmoid(min_in)).log() + log_one_minus_cdf_min = -F.softplus(min_in) + + # probability for all other cases + cdf_delta = cdf_plus - cdf_min + + mid_in = inv_stdv * centered_y + # log probability in the center of the bin, to be used in extreme cases + # (not actually used in our code) + log_pdf_mid = mid_in - log_scales - 2. * F.softplus(mid_in) + + # TODO: cdf_delta <= 1e-5 actually can happen. 
How can we choose the value + # for num_classes=65536 case? 1e-7? not sure.. + inner_inner_cond = cdf_delta > 1e-5 + + inner_inner_cond = paddle.cast( + inner_inner_cond, dtype=paddle.get_default_dtype()) + + # inner_inner_out = inner_inner_cond * \ + # paddle.log(paddle.clip(cdf_delta, min=1e-12)) + \ + # (1. - inner_inner_cond) * (log_pdf_mid - np.log((num_classes - 1) / 2)) + + inner_inner_out = inner_inner_cond * paddle.log( + paddle.clip(cdf_delta, min=1e-12)) + (1. - inner_inner_cond) * ( + log_pdf_mid - np.log((num_classes - 1) / 2)) + + inner_cond = y > 0.999 + + inner_cond = paddle.cast(inner_cond, dtype=paddle.get_default_dtype()) + + inner_out = inner_cond * log_one_minus_cdf_min + (1. - inner_cond + ) * inner_inner_out + cond = y < -0.999 + cond = paddle.cast(cond, dtype=paddle.get_default_dtype()) + + log_probs = cond * log_cdf_plus + (1. - cond) * inner_out + log_probs = log_probs + F.log_softmax(logit_probs, -1) + + if reduce: + return -paddle.mean(log_sum_exp(log_probs)) + else: + return -log_sum_exp(log_probs).unsqueeze(-1) + + +def sample_from_discretized_mix_logistic(y, log_scale_min=None): + """ + Sample from discretized mixture of logistic distributions + + Args: + y(Tensor): (B, C, T) + log_scale_min(float, optional): (Default value = None) + + Returns: + Tensor: sample in range of [-1, 1]. + """ + if log_scale_min is None: + log_scale_min = float(np.log(1e-14)) + + assert y.shape[1] % 3 == 0 + nr_mix = y.shape[1] // 3 + + # (B, T, C) + y = y.transpose([0, 2, 1]) + logit_probs = y[:, :, :nr_mix] + + # sample mixture indicator from softmax + temp = paddle.uniform( + logit_probs.shape, dtype=logit_probs.dtype, min=1e-5, max=1.0 - 1e-5) + temp = logit_probs - paddle.log(-paddle.log(temp)) + argmax = paddle.argmax(temp, axis=-1) + + # (B, T) -> (B, T, nr_mix) + one_hot = F.one_hot(argmax, nr_mix) + one_hot = paddle.cast(one_hot, dtype=paddle.get_default_dtype()) + + # select logistic parameters + means = paddle.sum(y[:, :, nr_mix:2 * nr_mix] * one_hot, axis=-1) + log_scales = paddle.clip( + paddle.sum(y[:, :, 2 * nr_mix:3 * nr_mix] * one_hot, axis=-1), + min=log_scale_min) + # sample from logistic & clip to interval + # we don't actually round to the nearest 8bit value when sampling + u = paddle.uniform(means.shape, min=1e-5, max=1.0 - 1e-5) + x = means + paddle.exp(log_scales) * (paddle.log(u) - paddle.log(1. - u)) + x = paddle.clip(x, min=-1., max=-1.) + + return x + + +# Loss for new Tacotron2 +class GuidedAttentionLoss(nn.Layer): + """Guided attention loss function module. + + This module calculates the guided attention loss described + in `Efficiently Trainable Text-to-Speech System Based + on Deep Convolutional Networks with Guided Attention`_, + which forces the attention to be diagonal. + + .. _`Efficiently Trainable Text-to-Speech System + Based on Deep Convolutional Networks with Guided Attention`: + https://arxiv.org/abs/1710.08969 + + """ + + def __init__(self, sigma=0.4, alpha=1.0, reset_always=True): + """Initialize guided attention loss module. + + Args: + sigma (float, optional): Standard deviation to control how close attention to a diagonal. + alpha (float, optional): Scaling coefficient (lambda). + reset_always (bool, optional): Whether to always reset masks. 
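+
+        Note:
+            The soft diagonal mask used by this loss is
+            ``W[n, t] = 1 - exp(-((t / T_in - n / T_out) ** 2) / (2 * sigma ** 2))``,
+            so smaller values of ``sigma`` penalize off-diagonal attention more
+            sharply.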
+ + """ + super().__init__() + self.sigma = sigma + self.alpha = alpha + self.reset_always = reset_always + self.guided_attn_masks = None + self.masks = None + + def _reset_masks(self): + self.guided_attn_masks = None + self.masks = None + + def forward(self, att_ws, ilens, olens): + """Calculate forward propagation. + + Args: + att_ws(Tensor): Batch of attention weights (B, T_max_out, T_max_in). + ilens(Tensor(int64)): Batch of input lenghts (B,). + olens(Tensor(int64)): Batch of output lenghts (B,). + + Returns: + Tensor: Guided attention loss value. + + """ + if self.guided_attn_masks is None: + self.guided_attn_masks = self._make_guided_attention_masks(ilens, + olens) + if self.masks is None: + self.masks = self._make_masks(ilens, olens) + losses = self.guided_attn_masks * att_ws + loss = paddle.mean( + losses.masked_select(self.masks.broadcast_to(losses.shape))) + if self.reset_always: + self._reset_masks() + return self.alpha * loss + + def _make_guided_attention_masks(self, ilens, olens): + n_batches = len(ilens) + max_ilen = max(ilens) + max_olen = max(olens) + guided_attn_masks = paddle.zeros((n_batches, max_olen, max_ilen)) + + for idx, (ilen, olen) in enumerate(zip(ilens, olens)): + guided_attn_masks[idx, :olen, : + ilen] = self._make_guided_attention_mask( + ilen, olen, self.sigma) + return guided_attn_masks + + @staticmethod + def _make_guided_attention_mask(ilen, olen, sigma): + """Make guided attention mask. + + Examples + ---------- + >>> guided_attn_mask =_make_guided_attention(5, 5, 0.4) + >>> guided_attn_mask.shape + [5, 5] + >>> guided_attn_mask + tensor([[0.0000, 0.1175, 0.3935, 0.6753, 0.8647], + [0.1175, 0.0000, 0.1175, 0.3935, 0.6753], + [0.3935, 0.1175, 0.0000, 0.1175, 0.3935], + [0.6753, 0.3935, 0.1175, 0.0000, 0.1175], + [0.8647, 0.6753, 0.3935, 0.1175, 0.0000]]) + >>> guided_attn_mask =_make_guided_attention(3, 6, 0.4) + >>> guided_attn_mask.shape + [6, 3] + >>> guided_attn_mask + tensor([[0.0000, 0.2934, 0.7506], + [0.0831, 0.0831, 0.5422], + [0.2934, 0.0000, 0.2934], + [0.5422, 0.0831, 0.0831], + [0.7506, 0.2934, 0.0000], + [0.8858, 0.5422, 0.0831]]) + + """ + grid_x, grid_y = paddle.meshgrid( + paddle.arange(olen), paddle.arange(ilen)) + grid_x = grid_x.cast(dtype=paddle.float32) + grid_y = grid_y.cast(dtype=paddle.float32) + return 1.0 - paddle.exp(-( + (grid_y / ilen - grid_x / olen)**2) / (2 * (sigma**2))) + + @staticmethod + def _make_masks(ilens, olens): + """Make masks indicating non-padded part. + + Args: + ilens(Tensor(int64) or List): Batch of lengths (B,). + olens(Tensor(int64) or List): Batch of lengths (B,). + + Returns: + Tensor: Mask tensor indicating non-padded part. + + Examples: + >>> ilens, olens = [5, 2], [8, 5] + >>> _make_mask(ilens, olens) + tensor([[[1, 1, 1, 1, 1], + [1, 1, 1, 1, 1], + [1, 1, 1, 1, 1], + [1, 1, 1, 1, 1], + [1, 1, 1, 1, 1], + [1, 1, 1, 1, 1], + [1, 1, 1, 1, 1], + [1, 1, 1, 1, 1]], + + [[1, 1, 0, 0, 0], + [1, 1, 0, 0, 0], + [1, 1, 0, 0, 0], + [1, 1, 0, 0, 0], + [1, 1, 0, 0, 0], + [0, 0, 0, 0, 0], + [0, 0, 0, 0, 0], + [0, 0, 0, 0, 0]]], dtype=paddle.uint8) + + """ + # (B, T_in) + in_masks = make_non_pad_mask(ilens) + # (B, T_out) + out_masks = make_non_pad_mask(olens) + # (B, T_out, T_in) + + return paddle.logical_and( + out_masks.unsqueeze(-1), in_masks.unsqueeze(-2)) + + +class GuidedMultiHeadAttentionLoss(GuidedAttentionLoss): + """Guided attention loss function module for multi head attention. + + Args: + sigma (float, optional): Standard deviation to controlGuidedAttentionLoss + how close attention to a diagonal. 
+ alpha (float, optional): Scaling coefficient (lambda). + reset_always (bool, optional): Whether to always reset masks. + + """ + + def forward(self, att_ws, ilens, olens): + """Calculate forward propagation. + + Args: + att_ws(Tensor): Batch of multi head attention weights (B, H, T_max_out, T_max_in). + ilens(Tensor): Batch of input lenghts (B,). + olens(Tensor): Batch of output lenghts (B,). + + Returns: + Tensor: Guided attention loss value. + + """ + if self.guided_attn_masks is None: + self.guided_attn_masks = ( + self._make_guided_attention_masks(ilens, olens).unsqueeze(1)) + if self.masks is None: + self.masks = self._make_masks(ilens, olens).unsqueeze(1) + losses = self.guided_attn_masks * att_ws + loss = paddle.mean( + losses.masked_select(self.masks.broadcast_to(losses.shape))) + if self.reset_always: + self._reset_masks() + + return self.alpha * loss + + +class Tacotron2Loss(nn.Layer): + """Loss function module for Tacotron2.""" + + def __init__(self, + use_masking=True, + use_weighted_masking=False, + bce_pos_weight=20.0): + """Initialize Tactoron2 loss module. + + Args: + use_masking (bool): Whether to apply masking for padded part in loss calculation. + use_weighted_masking (bool): Whether to apply weighted masking in loss calculation. + bce_pos_weight (float): Weight of positive sample of stop token. + """ + super().__init__() + assert (use_masking != use_weighted_masking) or not use_masking + self.use_masking = use_masking + self.use_weighted_masking = use_weighted_masking + + # define criterions + reduction = "none" if self.use_weighted_masking else "mean" + self.l1_criterion = nn.L1Loss(reduction=reduction) + self.mse_criterion = nn.MSELoss(reduction=reduction) + self.bce_criterion = nn.BCEWithLogitsLoss( + reduction=reduction, pos_weight=paddle.to_tensor(bce_pos_weight)) + + def forward(self, after_outs, before_outs, logits, ys, stop_labels, olens): + """Calculate forward propagation. + + Args: + after_outs(Tensor): Batch of outputs after postnets (B, Lmax, odim). + before_outs(Tensor): Batch of outputs before postnets (B, Lmax, odim). + logits(Tensor): Batch of stop logits (B, Lmax). + ys(Tensor): Batch of padded target features (B, Lmax, odim). + stop_labels(Tensor(int64)): Batch of the sequences of stop token labels (B, Lmax). + olens(Tensor(int64)): + + Returns: + Tensor: L1 loss value. + Tensor: Mean square error loss value. + Tensor: Binary cross entropy loss value. 
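+
+        Note:
+            With ``use_masking=True`` the padded frames are dropped via
+            masked_select before the losses are computed; with
+            ``use_weighted_masking=True`` the losses are instead reweighted so
+            that every utterance contributes equally regardless of its length.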
+ """ + # make mask and apply it + if self.use_masking: + masks = make_non_pad_mask(olens).unsqueeze(-1) + ys = ys.masked_select(masks.broadcast_to(ys.shape)) + after_outs = after_outs.masked_select( + masks.broadcast_to(after_outs.shape)) + before_outs = before_outs.masked_select( + masks.broadcast_to(before_outs.shape)) + stop_labels = stop_labels.masked_select( + masks[:, :, 0].broadcast_to(stop_labels.shape)) + logits = logits.masked_select( + masks[:, :, 0].broadcast_to(logits.shape)) + + # calculate loss + l1_loss = self.l1_criterion(after_outs, ys) + self.l1_criterion( + before_outs, ys) + mse_loss = self.mse_criterion(after_outs, ys) + self.mse_criterion( + before_outs, ys) + bce_loss = self.bce_criterion(logits, stop_labels) + + # make weighted mask and apply it + if self.use_weighted_masking: + masks = make_non_pad_mask(olens).unsqueeze(-1) + weights = masks.float() / masks.sum(axis=1, keepdim=True).float() + out_weights = weights.divide( + paddle.shape(ys)[0] * paddle.shape(ys)[2]) + logit_weights = weights.divide(paddle.shape(ys)[0]) + + # apply weight + l1_loss = l1_loss.multiply(out_weights) + l1_loss = l1_loss.masked_select(masks.broadcast_to(l1_loss)).sum() + mse_loss = mse_loss.multiply(out_weights) + mse_loss = mse_loss.masked_select( + masks.broadcast_to(mse_loss)).sum() + bce_loss = bce_loss.multiply(logit_weights.squeeze(-1)) + bce_loss = bce_loss.masked_select( + masks.squeeze(-1).broadcast_to(bce_loss)).sum() + + return l1_loss, mse_loss, bce_loss + + +# Loss for Tacotron2 +def attention_guide(dec_lens, enc_lens, N, T, g, dtype=None): + """Build that W matrix. shape(B, T_dec, T_enc) + W[i, n, t] = 1 - exp(-(n/dec_lens[i] - t/enc_lens[i])**2 / (2g**2)) + + See also: + Tachibana, Hideyuki, Katsuya Uenoyama, and Shunsuke Aihara. 2017. “Efficiently Trainable Text-to-Speech System Based on Deep Convolutional Networks with Guided Attention.” ArXiv:1710.08969 [Cs, Eess], October. http://arxiv.org/abs/1710.08969. + """ + dtype = dtype or paddle.get_default_dtype() + dec_pos = paddle.arange(0, N).astype(dtype) / dec_lens.unsqueeze( + -1) # n/N # shape(B, T_dec) + enc_pos = paddle.arange(0, T).astype(dtype) / enc_lens.unsqueeze( + -1) # t/T # shape(B, T_enc) + W = 1 - paddle.exp(-(dec_pos.unsqueeze(-1) - enc_pos.unsqueeze(1))**2 / + (2 * g**2)) + + dec_mask = sequence_mask(dec_lens, maxlen=N) + enc_mask = sequence_mask(enc_lens, maxlen=T) + mask = dec_mask.unsqueeze(-1) * enc_mask.unsqueeze(1) + mask = paddle.cast(mask, W.dtype) + + W *= mask + return W + + +def guided_attention_loss(attention_weight, dec_lens, enc_lens, g): + """Guided attention loss, masked to excluded padding parts.""" + _, N, T = attention_weight.shape + W = attention_guide(dec_lens, enc_lens, N, T, g, attention_weight.dtype) + + total_tokens = (dec_lens * enc_lens).astype(W.dtype) + loss = paddle.mean(paddle.sum(W * attention_weight, [1, 2]) / total_tokens) + return loss + + +# Losses for GAN Vocoder +def stft(x, + fft_size, + hop_length=None, + win_length=None, + window='hann', + center=True, + pad_mode='reflect'): + """Perform STFT and convert to magnitude spectrogram. + Args: + x(Tensor): Input signal tensor (B, T). + fft_size(int): FFT size. + hop_size(int): Hop size. + win_length(int, optional): window : str, optional (Default value = None) + window(str, optional): Name of window function, see `scipy.signal.get_window` for more + details. Defaults to "hann". 
+ center(bool, optional, optional): center (bool, optional): Whether to pad `x` to make that the + :math:`t \times hop\\_length` at the center of :math:`t`-th frame. Default: `True`. + pad_mode(str, optional, optional): (Default value = 'reflect') + hop_length: (Default value = None) + + Returns: + Tensor: Magnitude spectrogram (B, #frames, fft_size // 2 + 1). + """ + # calculate window + window = signal.get_window(window, win_length, fftbins=True) + window = paddle.to_tensor(window, dtype=x.dtype) + x_stft = paddle.signal.stft( + x, + fft_size, + hop_length, + win_length, + window=window, + center=center, + pad_mode=pad_mode) + + real = x_stft.real() + imag = x_stft.imag() + + return paddle.sqrt(paddle.clip(real**2 + imag**2, min=1e-7)).transpose( + [0, 2, 1]) + + +class SpectralConvergenceLoss(nn.Layer): + """Spectral convergence loss module.""" + + def __init__(self): + """Initilize spectral convergence loss module.""" + super().__init__() + + def forward(self, x_mag, y_mag): + """Calculate forward propagation. + Args: + x_mag (Tensor): Magnitude spectrogram of predicted signal (B, #frames, #freq_bins). + y_mag (Tensor): Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins). + Returns: + Tensor: Spectral convergence loss value. + """ + return paddle.norm( + y_mag - x_mag, p="fro") / paddle.clip( + paddle.norm(y_mag, p="fro"), min=1e-10) + + +class LogSTFTMagnitudeLoss(nn.Layer): + """Log STFT magnitude loss module.""" + + def __init__(self, epsilon=1e-7): + """Initilize los STFT magnitude loss module.""" + super().__init__() + self.epsilon = epsilon + + def forward(self, x_mag, y_mag): + """Calculate forward propagation. + Args: + x_mag (Tensor): Magnitude spectrogram of predicted signal (B, #frames, #freq_bins). + y_mag (Tensor): Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins). + Returns: + Tensor: Log STFT magnitude loss value. + """ + return F.l1_loss( + paddle.log(paddle.clip(y_mag, min=self.epsilon)), + paddle.log(paddle.clip(x_mag, min=self.epsilon))) + + +class STFTLoss(nn.Layer): + """STFT loss module.""" + + def __init__(self, + fft_size=1024, + shift_size=120, + win_length=600, + window="hann"): + """Initialize STFT loss module.""" + super().__init__() + self.fft_size = fft_size + self.shift_size = shift_size + self.win_length = win_length + self.window = window + self.spectral_convergence_loss = SpectralConvergenceLoss() + self.log_stft_magnitude_loss = LogSTFTMagnitudeLoss() + + def forward(self, x, y): + """Calculate forward propagation. + Args: + x (Tensor): Predicted signal (B, T). + y (Tensor): Groundtruth signal (B, T). + Returns: + Tensor: Spectral convergence loss value. + Tensor: Log STFT magnitude loss value. + """ + x_mag = stft(x, self.fft_size, self.shift_size, self.win_length, + self.window) + y_mag = stft(y, self.fft_size, self.shift_size, self.win_length, + self.window) + sc_loss = self.spectral_convergence_loss(x_mag, y_mag) + mag_loss = self.log_stft_magnitude_loss(x_mag, y_mag) + + return sc_loss, mag_loss + + +class MultiResolutionSTFTLoss(nn.Layer): + """Multi resolution STFT loss module.""" + + def __init__( + self, + fft_sizes=[1024, 2048, 512], + hop_sizes=[120, 240, 50], + win_lengths=[600, 1200, 240], + window="hann", ): + """Initialize Multi resolution STFT loss module. + Args: + fft_sizes (list): List of FFT sizes. + hop_sizes (list): List of hop sizes. + win_lengths (list): List of window lengths. + window (str): Window function type. 
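+
+        Note:
+            The ``stft`` helper above looks up ``signal.get_window``, but
+            ``signal`` does not appear to be imported at the top of this file,
+            so ``from scipy import signal`` may be needed before the following
+            illustrative sketch (random waveforms, arbitrary lengths) will run:
+
+            >>> import paddle
+            >>> criterion = MultiResolutionSTFTLoss()
+            >>> x = paddle.randn([4, 16000])  # predicted waveform batch
+            >>> y = paddle.randn([4, 16000])  # reference waveform batch
+            >>> sc_loss, mag_loss = criterion(x, y)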
+ """ + super().__init__() + assert len(fft_sizes) == len(hop_sizes) == len(win_lengths) + self.stft_losses = nn.LayerList() + for fs, ss, wl in zip(fft_sizes, hop_sizes, win_lengths): + self.stft_losses.append(STFTLoss(fs, ss, wl, window)) + + def forward(self, x, y): + """Calculate forward propagation. + + Args: + x (Tensor): Predicted signal (B, T) or (B, #subband, T). + y (Tensor): Groundtruth signal (B, T) or (B, #subband, T). + Returns: + Tensor: Multi resolution spectral convergence loss value. + Tensor: Multi resolution log STFT magnitude loss value. + """ + if len(x.shape) == 3: + # (B, C, T) -> (B x C, T) + x = x.reshape([-1, x.shape[2]]) + # (B, C, T) -> (B x C, T) + y = y.reshape([-1, y.shape[2]]) + sc_loss = 0.0 + mag_loss = 0.0 + for f in self.stft_losses: + sc_l, mag_l = f(x, y) + sc_loss += sc_l + mag_loss += mag_l + sc_loss /= len(self.stft_losses) + mag_loss /= len(self.stft_losses) + + return sc_loss, mag_loss + + +class GeneratorAdversarialLoss(nn.Layer): + """Generator adversarial loss module.""" + + def __init__( + self, + average_by_discriminators=True, + loss_type="mse", ): + """Initialize GeneratorAversarialLoss module.""" + super().__init__() + self.average_by_discriminators = average_by_discriminators + assert loss_type in ["mse", "hinge"], f"{loss_type} is not supported." + if loss_type == "mse": + self.criterion = self._mse_loss + else: + self.criterion = self._hinge_loss + + def forward(self, outputs): + """Calcualate generator adversarial loss. + Args: + outputs (Tensor or List): Discriminator outputs or list of discriminator outputs. + Returns: + Tensor: Generator adversarial loss value. + """ + if isinstance(outputs, (tuple, list)): + adv_loss = 0.0 + for i, outputs_ in enumerate(outputs): + if isinstance(outputs_, (tuple, list)): + # case including feature maps + outputs_ = outputs_[-1] + adv_loss += self.criterion(outputs_) + if self.average_by_discriminators: + adv_loss /= i + 1 + else: + adv_loss = self.criterion(outputs) + + return adv_loss + + def _mse_loss(self, x): + return F.mse_loss(x, paddle.ones_like(x)) + + def _hinge_loss(self, x): + return -x.mean() + + +class DiscriminatorAdversarialLoss(nn.Layer): + """Discriminator adversarial loss module.""" + + def __init__( + self, + average_by_discriminators=True, + loss_type="mse", ): + """Initialize DiscriminatorAversarialLoss module.""" + super().__init__() + self.average_by_discriminators = average_by_discriminators + assert loss_type in ["mse"], f"{loss_type} is not supported." + if loss_type == "mse": + self.fake_criterion = self._mse_fake_loss + self.real_criterion = self._mse_real_loss + + def forward(self, outputs_hat, outputs): + """Calcualate discriminator adversarial loss. + + Args: + outputs_hat (Tensor or list): Discriminator outputs or list of + discriminator outputs calculated from generator outputs. + outputs (Tensor or list): Discriminator outputs or list of + discriminator outputs calculated from groundtruth. + Returns: + Tensor: Discriminator real loss value. + Tensor: Discriminator fake loss value. 
+ """ + if isinstance(outputs, (tuple, list)): + real_loss = 0.0 + fake_loss = 0.0 + for i, (outputs_hat_, + outputs_) in enumerate(zip(outputs_hat, outputs)): + if isinstance(outputs_hat_, (tuple, list)): + # case including feature maps + outputs_hat_ = outputs_hat_[-1] + outputs_ = outputs_[-1] + real_loss += self.real_criterion(outputs_) + fake_loss += self.fake_criterion(outputs_hat_) + if self.average_by_discriminators: + fake_loss /= i + 1 + real_loss /= i + 1 + else: + real_loss = self.real_criterion(outputs) + fake_loss = self.fake_criterion(outputs_hat) + + return real_loss, fake_loss + + def _mse_real_loss(self, x): + return F.mse_loss(x, paddle.ones_like(x)) + + def _mse_fake_loss(self, x): + return F.mse_loss(x, paddle.zeros_like(x)) + + +# Losses for SpeedySpeech +# Structural Similarity Index Measure (SSIM) +def gaussian(window_size, sigma): + gauss = paddle.to_tensor([ + math.exp(-(x - window_size // 2)**2 / float(2 * sigma**2)) + for x in range(window_size) + ]) + return gauss / gauss.sum() + + +def create_window(window_size, channel): + _1D_window = gaussian(window_size, 1.5).unsqueeze(1) + _2D_window = paddle.matmul(_1D_window, paddle.transpose( + _1D_window, [1, 0])).unsqueeze([0, 1]) + window = paddle.expand(_2D_window, [channel, 1, window_size, window_size]) + return window + + +def _ssim(img1, img2, window, window_size, channel, size_average=True): + mu1 = F.conv2d(img1, window, padding=window_size // 2, groups=channel) + mu2 = F.conv2d(img2, window, padding=window_size // 2, groups=channel) + + mu1_sq = mu1.pow(2) + mu2_sq = mu2.pow(2) + mu1_mu2 = mu1 * mu2 + + sigma1_sq = F.conv2d( + img1 * img1, window, padding=window_size // 2, groups=channel) - mu1_sq + sigma2_sq = F.conv2d( + img2 * img2, window, padding=window_size // 2, groups=channel) - mu2_sq + sigma12 = F.conv2d( + img1 * img2, window, padding=window_size // 2, groups=channel) - mu1_mu2 + + C1 = 0.01**2 + C2 = 0.03**2 + + ssim_map = ((2 * mu1_mu2 + C1) * (2 * sigma12 + C2)) \ + / ((mu1_sq + mu2_sq + C1) * (sigma1_sq + sigma2_sq + C2)) + + if size_average: + return ssim_map.mean() + else: + return ssim_map.mean(1).mean(1).mean(1) + + +def ssim(img1, img2, window_size=11, size_average=True): + (_, channel, _, _) = img1.shape + window = create_window(window_size, channel) + return _ssim(img1, img2, window, window_size, channel, size_average) + + +def weighted_mean(input, weight): + """Weighted mean. It can also be used as masked mean. + + Args: + input(Tensor): The input tensor. + weight(Tensor): The weight tensor with broadcastable shape with the input. + + Returns: + Tensor: Weighted mean tensor with the same dtype as input. shape=(1,) + + """ + weight = paddle.cast(weight, input.dtype) + # paddle.Tensor.size is different with torch.size() and has been overrided in s2t.__init__ + broadcast_ratio = input.numel() / weight.numel() + return paddle.sum(input * weight) / (paddle.sum(weight) * broadcast_ratio) + + +def masked_l1_loss(prediction, target, mask): + """Compute maksed L1 loss. + + Args: + prediction(Tensor): The prediction. + target(Tensor): The target. The shape should be broadcastable to ``prediction``. + mask(Tensor): The mask. The shape should be broadcatable to the broadcasted shape of + ``prediction`` and ``target``. + + Returns: + Tensor: The masked L1 loss. 
shape=(1,) + + """ + abs_error = F.l1_loss(prediction, target, reduction='none') + loss = weighted_mean(abs_error, mask) + return loss + + +class MelSpectrogram(nn.Layer): + """Calculate Mel-spectrogram.""" + + def __init__( + self, + fs=22050, + fft_size=1024, + hop_size=256, + win_length=None, + window="hann", + num_mels=80, + fmin=80, + fmax=7600, + center=True, + normalized=False, + onesided=True, + eps=1e-10, + log_base=10.0, ): + """Initialize MelSpectrogram module.""" + super().__init__() + self.fft_size = fft_size + if win_length is None: + self.win_length = fft_size + else: + self.win_length = win_length + self.hop_size = hop_size + self.center = center + self.normalized = normalized + self.onesided = onesided + + if window is not None and not hasattr(signal.windows, f"{window}"): + raise ValueError(f"{window} window is not implemented") + self.window = window + self.eps = eps + + fmin = 0 if fmin is None else fmin + fmax = fs / 2 if fmax is None else fmax + melmat = librosa.filters.mel( + sr=fs, + n_fft=fft_size, + n_mels=num_mels, + fmin=fmin, + fmax=fmax, ) + + self.melmat = paddle.to_tensor(melmat.T) + self.stft_params = { + "n_fft": self.fft_size, + "win_length": self.win_length, + "hop_length": self.hop_size, + "center": self.center, + "normalized": self.normalized, + "onesided": self.onesided, + } + + self.log_base = log_base + if self.log_base is None: + self.log = paddle.log + elif self.log_base == 2.0: + self.log = paddle.log2 + elif self.log_base == 10.0: + self.log = paddle.log10 + else: + raise ValueError(f"log_base: {log_base} is not supported.") + + def forward(self, x): + """Calculate Mel-spectrogram. + Args: + + x (Tensor): Input waveform tensor (B, T) or (B, 1, T). + Returns: + Tensor: Mel-spectrogram (B, #mels, #frames). + """ + if len(x.shape) == 3: + # (B, C, T) -> (B*C, T) + x = x.reshape([-1, paddle.shape(x)[2]]) + + if self.window is not None: + # calculate window + window = signal.get_window( + self.window, self.win_length, fftbins=True) + window = paddle.to_tensor(window, dtype=x.dtype) + else: + window = None + + x_stft = paddle.signal.stft(x, window=window, **self.stft_params) + real = x_stft.real() + imag = x_stft.imag() + # (B, #freqs, #frames) -> (B, $frames, #freqs) + real = real.transpose([0, 2, 1]) + imag = imag.transpose([0, 2, 1]) + x_power = real**2 + imag**2 + x_amp = paddle.sqrt(paddle.clip(x_power, min=self.eps)) + x_mel = paddle.matmul(x_amp, self.melmat) + x_mel = paddle.clip(x_mel, min=self.eps) + + return self.log(x_mel).transpose([0, 2, 1]) + + +class MelSpectrogramLoss(nn.Layer): + """Mel-spectrogram loss.""" + + def __init__( + self, + fs=22050, + fft_size=1024, + hop_size=256, + win_length=None, + window="hann", + num_mels=80, + fmin=80, + fmax=7600, + center=True, + normalized=False, + onesided=True, + eps=1e-10, + log_base=10.0, ): + """Initialize Mel-spectrogram loss.""" + super().__init__() + self.mel_spectrogram = MelSpectrogram( + fs=fs, + fft_size=fft_size, + hop_size=hop_size, + win_length=win_length, + window=window, + num_mels=num_mels, + fmin=fmin, + fmax=fmax, + center=center, + normalized=normalized, + onesided=onesided, + eps=eps, + log_base=log_base, ) + + def forward(self, y_hat, y): + """Calculate Mel-spectrogram loss. + Args: + y_hat(Tensor): Generated single tensor (B, 1, T). + y(Tensor): Groundtruth single tensor (B, 1, T). + + Returns: + Tensor: Mel-spectrogram loss value. 
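+
+        Note:
+            ``MelSpectrogram`` above references a module-level ``signal`` name
+            (as the ``stft`` helper does), so ``from scipy import signal`` may
+            be required before this illustrative sketch runs:
+
+            >>> import paddle
+            >>> criterion = MelSpectrogramLoss(fs=22050)
+            >>> y_hat = paddle.randn([2, 1, 22050])  # generated waveform
+            >>> y = paddle.randn([2, 1, 22050])      # reference waveform
+            >>> loss = criterion(y_hat, y)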
+ """ + mel_hat = self.mel_spectrogram(y_hat) + mel = self.mel_spectrogram(y) + mel_loss = F.l1_loss(mel_hat, mel) + + return mel_loss + + +class FeatureMatchLoss(nn.Layer): + """Feature matching loss module.""" + + def __init__( + self, + average_by_layers=True, + average_by_discriminators=True, + include_final_outputs=False, ): + """Initialize FeatureMatchLoss module.""" + super().__init__() + self.average_by_layers = average_by_layers + self.average_by_discriminators = average_by_discriminators + self.include_final_outputs = include_final_outputs + + def forward(self, feats_hat, feats): + """Calcualate feature matching loss. + + Args: + feats_hat(list): List of list of discriminator outputs + calcuated from generater outputs. + feats(list): List of list of discriminator outputs + + Returns: + Tensor: Feature matching loss value. + + """ + feat_match_loss = 0.0 + for i, (feats_hat_, feats_) in enumerate(zip(feats_hat, feats)): + feat_match_loss_ = 0.0 + if not self.include_final_outputs: + feats_hat_ = feats_hat_[:-1] + feats_ = feats_[:-1] + for j, (feat_hat_, feat_) in enumerate(zip(feats_hat_, feats_)): + feat_match_loss_ += F.l1_loss(feat_hat_, feat_.detach()) + if self.average_by_layers: + feat_match_loss_ /= j + 1 + feat_match_loss += feat_match_loss_ + if self.average_by_discriminators: + feat_match_loss /= i + 1 + + return feat_match_loss diff --git a/ernie-sat/paddlespeech/t2s/modules/masked_fill.py b/ernie-sat/paddlespeech/t2s/modules/masked_fill.py new file mode 100644 index 0000000000000000000000000000000000000000..b322225479c843672073ec5567fa4403137bdb26 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/modules/masked_fill.py @@ -0,0 +1,49 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Union + +import paddle + + +def is_broadcastable(shp1, shp2): + for a, b in zip(shp1[::-1], shp2[::-1]): + if a == 1 or b == 1 or a == b: + pass + else: + return False + return True + + +# assume that len(shp1) == len(shp2) +def broadcast_shape(shp1, shp2): + result = [] + for a, b in zip(shp1[::-1], shp2[::-1]): + result.append(max(a, b)) + return result[::-1] + + +def masked_fill(xs: paddle.Tensor, + mask: paddle.Tensor, + value: Union[float, int]): + # comment following line for converting dygraph to static graph. + # assert is_broadcastable(xs.shape, mask.shape) is True + # bshape = paddle.broadcast_shape(xs.shape, mask.shape) + bshape = broadcast_shape(xs.shape, mask.shape) + mask.stop_gradient = True + mask = mask.broadcast_to(bshape) + + trues = paddle.ones_like(xs) * value + mask = mask.cast(dtype=paddle.bool) + xs = paddle.where(mask, trues, xs) + return xs diff --git a/ernie-sat/paddlespeech/t2s/modules/nets_utils.py b/ernie-sat/paddlespeech/t2s/modules/nets_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..4207d316c4d07922924a649b0cb5ae45f6032450 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/modules/nets_utils.py @@ -0,0 +1,131 @@ +# Copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from espnet(https://github.com/espnet/espnet) +import paddle +from paddle import nn +from typeguard import check_argument_types + + +def pad_list(xs, pad_value): + """Perform padding for the list of tensors. + + Args: + xs (List[Tensor]): List of Tensors [(T_1, `*`), (T_2, `*`), ..., (T_B, `*`)]. + pad_value (float): Value for padding. + + Returns: + Tensor: Padded tensor (B, Tmax, `*`). + + Examples: + >>> x = [paddle.ones([4]), paddle.ones([2]), paddle.ones([1])] + >>> x + [tensor([1., 1., 1., 1.]), tensor([1., 1.]), tensor([1.])] + >>> pad_list(x, 0) + tensor([[1., 1., 1., 1.], + [1., 1., 0., 0.], + [1., 0., 0., 0.]]) + """ + n_batch = len(xs) + max_len = max(x.shape[0] for x in xs) + pad = paddle.full([n_batch, max_len, *xs[0].shape[1:]], pad_value) + + for i in range(n_batch): + pad[i, :xs[i].shape[0]] = xs[i] + + return pad + + +def make_pad_mask(lengths, length_dim=-1): + """Make mask tensor containing indices of padded part. + + Args: + lengths (Tensor(int64)): Batch of lengths (B,). + + Returns: + Tensor(bool): Mask tensor containing indices of padded part bool. + + Examples: + With only lengths. + + >>> lengths = [5, 3, 2] + >>> make_non_pad_mask(lengths) + masks = [[0, 0, 0, 0 ,0], + [0, 0, 0, 1, 1], + [0, 0, 1, 1, 1]] + """ + if length_dim == 0: + raise ValueError("length_dim cannot be 0: {}".format(length_dim)) + + bs = paddle.shape(lengths)[0] + maxlen = lengths.max() + seq_range = paddle.arange(0, maxlen, dtype=paddle.int64) + seq_range_expand = seq_range.unsqueeze(0).expand([bs, maxlen]) + seq_length_expand = lengths.unsqueeze(-1) + mask = seq_range_expand >= seq_length_expand + + return mask + + +def make_non_pad_mask(lengths, length_dim=-1): + """Make mask tensor containing indices of non-padded part. + + Args: + lengths (Tensor(int64) or List): Batch of lengths (B,). + xs (Tensor, optional): The reference tensor. + If set, masks will be the same shape as this tensor. + length_dim (int, optional): Dimension indicator of the above tensor. + See the example. + + Returns: + Tensor(bool): mask tensor containing indices of padded part bool. + + Examples: + With only lengths. + + >>> lengths = [5, 3, 2] + >>> make_non_pad_mask(lengths) + masks = [[1, 1, 1, 1 ,1], + [1, 1, 1, 0, 0], + [1, 1, 0, 0, 0]] + """ + return paddle.logical_not(make_pad_mask(lengths, length_dim)) + + +def initialize(model: nn.Layer, init: str): + """Initialize weights of a neural network module. + + Parameters are initialized using the given method or distribution. + + Custom initialization routines can be implemented into submodules + + Args: + model (nn.Layer): Target. + init (str): Method of initialization. 
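+
+    Note:
+        This calls ``nn.initializer.set_global_initializer``, which sets
+        Paddle's global default weight/bias initializers; it takes effect for
+        parameters created after the call. Illustrative sketch only:
+
+        >>> from paddle import nn
+        >>> initialize(nn.Layer(), "xavier_uniform")
+        >>> linear = nn.Linear(4, 4)  # weight: Xavier uniform, bias: constant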
+ """ + assert check_argument_types() + + if init == "xavier_uniform": + nn.initializer.set_global_initializer(nn.initializer.XavierUniform(), + nn.initializer.Constant()) + elif init == "xavier_normal": + nn.initializer.set_global_initializer(nn.initializer.XavierNormal(), + nn.initializer.Constant()) + elif init == "kaiming_uniform": + nn.initializer.set_global_initializer(nn.initializer.KaimingUniform(), + nn.initializer.Constant()) + elif init == "kaiming_normal": + nn.initializer.set_global_initializer(nn.initializer.KaimingNormal(), + nn.initializer.Constant()) + else: + raise ValueError("Unknown initialization: " + init) diff --git a/ernie-sat/paddlespeech/t2s/modules/normalizer.py b/ernie-sat/paddlespeech/t2s/modules/normalizer.py new file mode 100644 index 0000000000000000000000000000000000000000..a4fc598c2a55759d5ac0be6077a54bf9402b0ea8 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/modules/normalizer.py @@ -0,0 +1,33 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import paddle +from paddle import nn + + +class ZScore(nn.Layer): + # feature last + def __init__(self, mu, sigma): + super().__init__() + self.register_buffer("mu", mu) + self.register_buffer("sigma", sigma) + + def forward(self, x): + # NOTE: to be compatible with paddle's to_static, we must explicitly + # call multiply, or add, etc, instead of +-*/, etc. + return paddle.divide(paddle.subtract(x, self.mu), self.sigma) + + def inverse(self, x): + # NOTE: to be compatible with paddle's to_static, we must explicitly + # call multiply, or add, etc, instead of +-*/, etc. + return paddle.add(paddle.multiply(x, self.sigma), self.mu) diff --git a/ernie-sat/paddlespeech/t2s/modules/positional_encoding.py b/ernie-sat/paddlespeech/t2s/modules/positional_encoding.py new file mode 100644 index 0000000000000000000000000000000000000000..715c576f52ba586992b77e66b398f1a56e8a0fc7 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/modules/positional_encoding.py @@ -0,0 +1,67 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import paddle +from paddle import Tensor + +__all__ = ["sinusoid_position_encoding", "scaled_position_encoding"] + + +def sinusoid_position_encoding(num_positions: int, + feature_size: int, + omega: float=1.0, + start_pos: int=0, + dtype=None) -> paddle.Tensor: + # return tensor shape (num_positions, feature_size) + # NOTE: to be compatible with paddle's to_static, we cannnot raise + # an exception here, take care of it by yourself + # if (feature_size % 2 != 0): + # raise ValueError("size should be divisible by 2") + dtype = dtype or paddle.get_default_dtype() + + channel = paddle.arange(0, feature_size, 2, dtype=dtype) + index = paddle.arange(start_pos, start_pos + num_positions, 1, dtype=dtype) + denominator = channel / float(feature_size) + denominator = paddle.to_tensor([10000.0], dtype='float32')**denominator + p = (paddle.unsqueeze(index, -1) * omega) / denominator + encodings = paddle.zeros([num_positions, feature_size], dtype=dtype) + encodings[:, 0::2] = paddle.sin(p) + encodings[:, 1::2] = paddle.cos(p) + return encodings + + +def scaled_position_encoding(num_positions: int, + feature_size: int, + omega: Tensor, + start_pos: int=0, + dtype=None) -> Tensor: + # omega: Tensor (batch_size, ) + # return tensor shape (batch_size, num_positions, feature_size) + # consider renaming this as batched positioning encoding + if (feature_size % 2 != 0): + raise ValueError("size should be divisible by 2") + dtype = dtype or paddle.get_default_dtype() + + channel = paddle.arange(0, feature_size, 2, dtype=dtype) + index = paddle.arange( + start_pos, start_pos + num_positions, 1, dtype=omega.dtype) + batch_size = omega.shape[0] + omega = paddle.unsqueeze(omega, [1, 2]) + p = (paddle.unsqueeze(index, -1) * + omega) / (10000.0**(channel / float(feature_size))) + encodings = paddle.zeros( + [batch_size, num_positions, feature_size], dtype=dtype) + # it is nice to have fancy indexing and inplace operations + encodings[:, :, 0::2] = paddle.sin(p) + encodings[:, :, 1::2] = paddle.cos(p) + return encodings diff --git a/ernie-sat/paddlespeech/t2s/modules/pqmf.py b/ernie-sat/paddlespeech/t2s/modules/pqmf.py new file mode 100644 index 0000000000000000000000000000000000000000..9860da906094ad930a7791ca527b44cc2a3e51d1 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/modules/pqmf.py @@ -0,0 +1,127 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from espnet(https://github.com/espnet/espnet) +"""Pseudo QMF modules.""" +import numpy as np +import paddle +import paddle.nn.functional as F +from paddle import nn +from scipy.signal import kaiser + + +def design_prototype_filter(taps=62, cutoff_ratio=0.142, beta=9.0): + """Design prototype filter for PQMF. + This method is based on `A Kaiser window approach for the design of prototype + filters of cosine modulated filterbanks`_. + + Args: + taps (int): The number of filter taps. + cutoff_ratio (float): Cut-off frequency ratio. + beta (float): Beta coefficient for kaiser window. 
+ Returns: + ndarray: + Impluse response of prototype filter (taps + 1,). + .. _`A Kaiser window approach for the design of prototype filters of cosine modulated filterbanks`: + https://ieeexplore.ieee.org/abstract/document/681427 + """ + # check the arguments are valid + assert taps % 2 == 0, "The number of taps mush be even number." + assert 0.0 < cutoff_ratio < 1.0, "Cutoff ratio must be > 0.0 and < 1.0." + # make initial filter + omega_c = np.pi * cutoff_ratio + with np.errstate(invalid="ignore"): + h_i = np.sin(omega_c * (np.arange(taps + 1) - 0.5 * taps)) / ( + np.pi * (np.arange(taps + 1) - 0.5 * taps)) + h_i[taps // + 2] = np.cos(0) * cutoff_ratio # fix nan due to indeterminate form + + # apply kaiser window + w = kaiser(taps + 1, beta) + h = h_i * w + + return h + + +class PQMF(nn.Layer): + """PQMF module. + This module is based on `Near-perfect-reconstruction pseudo-QMF banks`_. + .. _`Near-perfect-reconstruction pseudo-QMF banks`: + https://ieeexplore.ieee.org/document/258122 + """ + + def __init__(self, subbands=4, taps=62, cutoff_ratio=0.142, beta=9.0): + """Initilize PQMF module. + The cutoff_ratio and beta parameters are optimized for #subbands = 4. + See dicussion in https://github.com/kan-bayashi/ParallelWaveGAN/issues/195. + + Args: + subbands (int): The number of subbands. + taps (int): The number of filter taps. + cutoff_ratio (float): Cut-off frequency ratio. + beta (float): Beta coefficient for kaiser window. + """ + super().__init__() + + h_proto = design_prototype_filter(taps, cutoff_ratio, beta) + h_analysis = np.zeros((subbands, len(h_proto))) + h_synthesis = np.zeros((subbands, len(h_proto))) + for k in range(subbands): + h_analysis[k] = ( + 2 * h_proto * np.cos((2 * k + 1) * (np.pi / (2 * subbands)) * ( + np.arange(taps + 1) - (taps / 2)) + (-1)**k * np.pi / 4)) + h_synthesis[k] = ( + 2 * h_proto * np.cos((2 * k + 1) * (np.pi / (2 * subbands)) * ( + np.arange(taps + 1) - (taps / 2)) - (-1)**k * np.pi / 4)) + + # convert to tensor + self.analysis_filter = paddle.to_tensor( + h_analysis, dtype="float32").unsqueeze(1) + self.synthesis_filter = paddle.to_tensor( + h_synthesis, dtype="float32").unsqueeze(0) + + # filter for downsampling & upsampling + updown_filter = paddle.zeros( + (subbands, subbands, subbands), dtype="float32") + for k in range(subbands): + updown_filter[k, k, 0] = 1.0 + self.updown_filter = updown_filter + self.subbands = subbands + # keep padding info + self.pad_fn = nn.Pad1D(taps // 2, mode='constant', value=0.0) + + def analysis(self, x): + """Analysis with PQMF. + Args: + x (Tensor): Input tensor (B, 1, T). + Returns: + Tensor: Output tensor (B, subbands, T // subbands). + """ + x = F.conv1d(self.pad_fn(x), self.analysis_filter) + return F.conv1d(x, self.updown_filter, stride=self.subbands) + + def synthesis(self, x): + """Synthesis with PQMF. + Args: + x (Tensor): Input tensor (B, subbands, T // subbands). + Returns: + Tensor: Output tensor (B, 1, T). 
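+
+        Example:
+            An illustrative analysis/synthesis round trip (the signal length
+            here is arbitrary but should be a multiple of ``subbands``):
+
+            >>> import paddle
+            >>> pqmf = PQMF(subbands=4)
+            >>> x = paddle.randn([1, 1, 16000])
+            >>> subband_x = pqmf.analysis(x)       # (1, 4, 4000)
+            >>> x_hat = pqmf.synthesis(subband_x)  # (1, 1, 16000)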
+ """ + x = F.conv1d_transpose( + x, self.updown_filter * self.subbands, stride=self.subbands) + + return F.conv1d(self.pad_fn(x), self.synthesis_filter) + + # when converting dygraph to static graph, can not use self.pqmf.synthesis directly + def forward(self, x): + return self.synthesis(x) diff --git a/ernie-sat/paddlespeech/t2s/modules/predictor/__init__.py b/ernie-sat/paddlespeech/t2s/modules/predictor/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..185a92b8d94d3426d616c0624f0f2ee04339349e --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/modules/predictor/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/ernie-sat/paddlespeech/t2s/modules/predictor/duration_predictor.py b/ernie-sat/paddlespeech/t2s/modules/predictor/duration_predictor.py new file mode 100644 index 0000000000000000000000000000000000000000..33ed575b4245506438e439fff5d5b8a6ff1b238a --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/modules/predictor/duration_predictor.py @@ -0,0 +1,156 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from espnet(https://github.com/espnet/espnet) +"""Duration predictor related modules.""" +import paddle +from paddle import nn + +from paddlespeech.t2s.modules.layer_norm import LayerNorm +from paddlespeech.t2s.modules.masked_fill import masked_fill + + +class DurationPredictor(nn.Layer): + """Duration predictor module. + + This is a module of duration predictor described + in `FastSpeech: Fast, Robust and Controllable Text to Speech`_. + The duration predictor predicts a duration of each frame in log domain + from the hidden embeddings of encoder. + + .. _`FastSpeech: Fast, Robust and Controllable Text to Speech`: + https://arxiv.org/pdf/1905.09263.pdf + + Note + ---------- + The calculation domain of outputs is different + between in `forward` and in `inference`. In `forward`, + the outputs are calculated in log domain but in `inference`, + those are calculated in linear domain. + + """ + + def __init__(self, + idim, + n_layers=2, + n_chans=384, + kernel_size=3, + dropout_rate=0.1, + offset=1.0): + """Initilize duration predictor module. + + Args: + idim (int):Input dimension. + n_layers (int, optional): Number of convolutional layers. + n_chans (int, optional): Number of channels of convolutional layers. + kernel_size (int, optional): Kernel size of convolutional layers. 
+ dropout_rate (float, optional): Dropout rate. + offset (float, optional): Offset value to avoid nan in log domain. + + """ + super().__init__() + self.offset = offset + self.conv = nn.LayerList() + for idx in range(n_layers): + in_chans = idim if idx == 0 else n_chans + self.conv.append( + nn.Sequential( + nn.Conv1D( + in_chans, + n_chans, + kernel_size, + stride=1, + padding=(kernel_size - 1) // 2, ), + nn.ReLU(), + LayerNorm(n_chans, dim=1), + nn.Dropout(dropout_rate), )) + self.linear = nn.Linear(n_chans, 1, bias_attr=True) + + def _forward(self, xs, x_masks=None, is_inference=False): + # (B, idim, Tmax) + xs = xs.transpose([0, 2, 1]) + # (B, C, Tmax) + for f in self.conv: + xs = f(xs) + + # NOTE: calculate in log domain + # (B, Tmax) + xs = self.linear(xs.transpose([0, 2, 1])).squeeze(-1) + + if is_inference: + # NOTE: calculate in linear domain + xs = paddle.clip(paddle.round(xs.exp() - self.offset), min=0) + + if x_masks is not None: + xs = masked_fill(xs, x_masks, 0.0) + + return xs + + def forward(self, xs, x_masks=None): + """Calculate forward propagation. + Args: + xs(Tensor): Batch of input sequences (B, Tmax, idim). + x_masks(ByteTensor, optional, optional): Batch of masks indicating padded part (B, Tmax). (Default value = None) + + Returns: + Tensor: Batch of predicted durations in log domain (B, Tmax). + """ + return self._forward(xs, x_masks, False) + + def inference(self, xs, x_masks=None): + """Inference duration. + Args: + xs(Tensor): Batch of input sequences (B, Tmax, idim). + x_masks(Tensor(bool), optional, optional): Batch of masks indicating padded part (B, Tmax). (Default value = None) + + Returns: + Tensor: Batch of predicted durations in linear domain int64 (B, Tmax). + """ + return self._forward(xs, x_masks, True) + + +class DurationPredictorLoss(nn.Layer): + """Loss function module for duration predictor. + + The loss value is Calculated in log domain to make it Gaussian. + + """ + + def __init__(self, offset=1.0, reduction="mean"): + """Initilize duration predictor loss module. + Args: + offset (float, optional): Offset value to avoid nan in log domain. + reduction (str): Reduction type in loss calculation. + """ + super().__init__() + self.criterion = nn.MSELoss(reduction=reduction) + self.offset = offset + + def forward(self, outputs, targets): + """Calculate forward propagation. + + Args: + outputs(Tensor): Batch of prediction durations in log domain (B, T) + targets(Tensor): Batch of groundtruth durations in linear domain (B, T) + + Returns: + Tensor: Mean squared error loss value. + + Note: + `outputs` is in log domain but `targets` is in linear domain. + """ + # NOTE: outputs is in log domain while targets in linear + targets = paddle.log(targets.cast(dtype='float32') + self.offset) + loss = self.criterion(outputs, targets) + + return loss diff --git a/ernie-sat/paddlespeech/t2s/modules/predictor/length_regulator.py b/ernie-sat/paddlespeech/t2s/modules/predictor/length_regulator.py new file mode 100644 index 0000000000000000000000000000000000000000..be788e6ed214af92aec0da31582193f56bcfb419 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/modules/predictor/length_regulator.py @@ -0,0 +1,123 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from espnet(https://github.com/espnet/espnet) +"""Length regulator related modules.""" +import numpy as np +import paddle +from paddle import nn + + +class LengthRegulator(nn.Layer): + """Length regulator module for feed-forward Transformer. + + This is a module of length regulator described in + `FastSpeech: Fast, Robust and Controllable Text to Speech`_. + The length regulator expands char or + phoneme-level embedding features to frame-level by repeating each + feature based on the corresponding predicted durations. + + .. _`FastSpeech: Fast, Robust and Controllable Text to Speech`: + https://arxiv.org/pdf/1905.09263.pdf + + """ + + def __init__(self, pad_value=0.0): + """Initilize length regulator module. + + Args: + pad_value (float, optional): Value used for padding. + + """ + super().__init__() + self.pad_value = pad_value + + # expand_numpy is faster than expand + def expand_numpy(self, encodings: paddle.Tensor, + durations: paddle.Tensor) -> paddle.Tensor: + """ + encodings: (B, T, C) + durations: (B, T) + """ + batch_size, t_enc = durations.shape + durations = durations.numpy() + slens = np.sum(durations, -1) + t_dec = np.max(slens) + M = np.zeros([batch_size, t_dec, t_enc]) + for i in range(batch_size): + k = 0 + for j in range(t_enc): + d = durations[i, j] + M[i, k:k + d, j] = 1 + k += d + M = paddle.to_tensor(M, dtype=encodings.dtype) + encodings = paddle.matmul(M, encodings) + return encodings + + def expand(self, encodings: paddle.Tensor, + durations: paddle.Tensor) -> paddle.Tensor: + """ + encodings: (B, T, C) + durations: (B, T) + """ + batch_size, t_enc = paddle.shape(durations) + slens = paddle.sum(durations, -1) + t_dec = paddle.max(slens) + t_dec_1 = t_dec + 1 + flatten_duration = paddle.cumsum( + paddle.reshape(durations, [batch_size * t_enc])) + 1 + init = paddle.zeros(t_dec_1) + m_batch = batch_size * t_enc + M = paddle.zeros([t_dec_1, m_batch]) + for i in range(m_batch): + d = flatten_duration[i] + m = paddle.concat( + [paddle.ones(d), paddle.zeros(t_dec_1 - d)], axis=0) + M[:, i] = m - init + init = m + M = paddle.reshape(M, shape=[t_dec_1, batch_size, t_enc]) + M = M[1:, :, :] + M = paddle.transpose(M, (1, 0, 2)) + encodings = paddle.matmul(M, encodings) + return encodings + + def forward(self, xs, ds, alpha=1.0, is_inference=False): + """Calculate forward propagation. + + Args: + xs (Tensor): Batch of sequences of char or phoneme embeddings (B, Tmax, D). + ds (Tensor(int64)): Batch of durations of each frame (B, T). + alpha (float, optional): Alpha value to control speed of speech. + + Returns: + Tensor: replicated input tensor based on durations (B, T*, D). 
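+
+        Examples:
+            An illustrative call on the training path (``is_inference=False``);
+            the shapes and durations are made up for the sketch:
+
+            >>> import paddle
+            >>> length_regulator = LengthRegulator()
+            >>> xs = paddle.randn([1, 3, 4])                       # 3 phonemes, dim 4
+            >>> ds = paddle.to_tensor([[2, 1, 3]], dtype='int64')  # durations in frames
+            >>> hs = length_regulator(xs, ds)                      # shape: [1, 6, 4]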
+ """ + + if alpha != 1.0: + assert alpha > 0 + ds = paddle.round(ds.cast(dtype=paddle.float32) * alpha) + ds = ds.cast(dtype=paddle.int64) + ''' + from distutils.version import LooseVersion + from paddlespeech.t2s.modules.nets_utils import pad_list + # 这里在 paddle 2.2.2 的动转静是不通的 + # if LooseVersion(paddle.__version__) >= "2.3.0" or hasattr(paddle, 'repeat_interleave'): + # if LooseVersion(paddle.__version__) >= "2.3.0": + if hasattr(paddle, 'repeat_interleave'): + repeat = [paddle.repeat_interleave(x, d, axis=0) for x, d in zip(xs, ds)] + return pad_list(repeat, self.pad_value) + ''' + if is_inference: + return self.expand(xs, ds) + else: + return self.expand_numpy(xs, ds) diff --git a/ernie-sat/paddlespeech/t2s/modules/predictor/variance_predictor.py b/ernie-sat/paddlespeech/t2s/modules/predictor/variance_predictor.py new file mode 100644 index 0000000000000000000000000000000000000000..8afbf2576d158c9df7a56800f7fdea386bb0ae2b --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/modules/predictor/variance_predictor.py @@ -0,0 +1,94 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from espnet(https://github.com/espnet/espnet) +"""Variance predictor related modules.""" +import paddle +from paddle import nn +from typeguard import check_argument_types + +from paddlespeech.t2s.modules.layer_norm import LayerNorm +from paddlespeech.t2s.modules.masked_fill import masked_fill + + +class VariancePredictor(nn.Layer): + """Variance predictor module. + + This is a module of variacne predictor described in `FastSpeech 2: + Fast and High-Quality End-to-End Text to Speech`_. + + .. _`FastSpeech 2: Fast and High-Quality End-to-End Text to Speech`: + https://arxiv.org/abs/2006.04558 + + """ + + def __init__( + self, + idim: int, + n_layers: int=2, + n_chans: int=384, + kernel_size: int=3, + bias: bool=True, + dropout_rate: float=0.5, ): + """Initilize duration predictor module. + + Args: + idim (int): Input dimension. + n_layers (int, optional): Number of convolutional layers. + n_chans (int, optional): Number of channels of convolutional layers. + kernel_size (int, optional): Kernel size of convolutional layers. + dropout_rate (float, optional): Dropout rate. + """ + assert check_argument_types() + super().__init__() + self.conv = nn.LayerList() + for idx in range(n_layers): + in_chans = idim if idx == 0 else n_chans + self.conv.append( + nn.Sequential( + nn.Conv1D( + in_chans, + n_chans, + kernel_size, + stride=1, + padding=(kernel_size - 1) // 2, + bias_attr=True, ), + nn.ReLU(), + LayerNorm(n_chans, dim=1), + nn.Dropout(dropout_rate), )) + + self.linear = nn.Linear(n_chans, 1, bias_attr=True) + + def forward(self, xs: paddle.Tensor, + x_masks: paddle.Tensor=None) -> paddle.Tensor: + """Calculate forward propagation. + + Args: + xs (Tensor): Batch of input sequences (B, Tmax, idim). + x_masks (Tensor(bool), optional): Batch of masks indicating padded part (B, Tmax, 1). + + Returns: + Tensor: Batch of predicted sequences (B, Tmax, 1). 
+ """ + # (B, idim, Tmax) + xs = xs.transpose([0, 2, 1]) + # (B, C, Tmax) + for f in self.conv: + # (B, C, Tmax) + xs = f(xs) + # (B, Tmax, 1) + xs = self.linear(xs.transpose([0, 2, 1])) + + if x_masks is not None: + xs = masked_fill(xs, x_masks, 0.0) + return xs diff --git a/ernie-sat/paddlespeech/t2s/modules/residual_block.py b/ernie-sat/paddlespeech/t2s/modules/residual_block.py new file mode 100644 index 0000000000000000000000000000000000000000..efbfce27ad69e9aace787e1ed6ab0ab945e88ffb --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/modules/residual_block.py @@ -0,0 +1,179 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import math +from typing import Any +from typing import Dict +from typing import List + +import paddle +from paddle import nn +from paddle.nn import functional as F + +from paddlespeech.t2s.modules.activation import get_activation + + +class WaveNetResidualBlock(nn.Layer): + """A gated activation unit composed of an 1D convolution, a gated tanh + unit and parametric redidual and skip connections. For more details, + refer to `WaveNet: A Generative Model for Raw Audio `_. + + Args: + kernel_size (int, optional): Kernel size of the 1D convolution, by default 3 + residual_channels (int, optional): Feature size of the resiaudl output(and also the input), by default 64 + gate_channels (int, optional): Output feature size of the 1D convolution, by default 128 + skip_channels (int, optional): Feature size of the skip output, by default 64 + aux_channels (int, optional): Feature size of the auxiliary input (e.g. spectrogram), by default 80 + dropout (float, optional): Probability of the dropout before the 1D convolution, by default 0. 
+ dilation (int, optional): Dilation of the 1D convolution, by default 1 + bias (bool, optional): Whether to use bias in the 1D convolution, by default True + use_causal_conv (bool, optional): Whether to use causal padding for the 1D convolution, by default False + """ + + def __init__(self, + kernel_size: int=3, + residual_channels: int=64, + gate_channels: int=128, + skip_channels: int=64, + aux_channels: int=80, + dropout: float=0., + dilation: int=1, + bias: bool=True, + use_causal_conv: bool=False): + super().__init__() + self.dropout = dropout + if use_causal_conv: + padding = (kernel_size - 1) * dilation + else: + assert kernel_size % 2 == 1 + padding = (kernel_size - 1) // 2 * dilation + self.use_causal_conv = use_causal_conv + + self.conv = nn.Conv1D( + residual_channels, + gate_channels, + kernel_size, + padding=padding, + dilation=dilation, + bias_attr=bias) + if aux_channels is not None: + self.conv1x1_aux = nn.Conv1D( + aux_channels, gate_channels, kernel_size=1, bias_attr=False) + else: + self.conv1x1_aux = None + + gate_out_channels = gate_channels // 2 + self.conv1x1_out = nn.Conv1D( + gate_out_channels, residual_channels, kernel_size=1, bias_attr=bias) + self.conv1x1_skip = nn.Conv1D( + gate_out_channels, skip_channels, kernel_size=1, bias_attr=bias) + + def forward(self, x, c): + """ + Args: + x (Tensor): the input features. Shape (N, C_res, T) + c (Tensor): the auxiliary input. Shape (N, C_aux, T) + + Returns: + res (Tensor): Shape (N, C_res, T), the residual output, which is used as the + input of the next ResidualBlock in a stack of ResidualBlocks. + skip (Tensor): Shape (N, C_skip, T), the skip output, which is collected among + each layer in a stack of ResidualBlocks. + """ + x_input = x + x = F.dropout(x, self.dropout, training=self.training) + x = self.conv(x) + x = x[:, :, x_input.shape[-1]] if self.use_causal_conv else x + if c is not None: + c = self.conv1x1_aux(c) + x += c + + a, b = paddle.chunk(x, 2, axis=1) + x = paddle.tanh(a) * F.sigmoid(b) + + skip = self.conv1x1_skip(x) + res = (self.conv1x1_out(x) + x_input) * math.sqrt(0.5) + return res, skip + + +class HiFiGANResidualBlock(nn.Layer): + """Residual block module in HiFiGAN.""" + + def __init__( + self, + kernel_size: int=3, + channels: int=512, + dilations: List[int]=(1, 3, 5), + bias: bool=True, + use_additional_convs: bool=True, + nonlinear_activation: str="leakyrelu", + nonlinear_activation_params: Dict[str, Any]={"negative_slope": 0.1}, + ): + """Initialize HiFiGANResidualBlock module. + Args: + kernel_size (int): Kernel size of dilation convolution layer. + channels (int): Number of channels for convolution layer. + dilations (List[int]): List of dilation factors. + use_additional_convs (bool): Whether to use additional convolution layers. + bias (bool): Whether to add bias parameter in convolution layers. + nonlinear_activation (str): Activation function module name. + nonlinear_activation_params (dict): Hyperparameters for activation function. + """ + super().__init__() + + self.use_additional_convs = use_additional_convs + self.convs1 = nn.LayerList() + if use_additional_convs: + self.convs2 = nn.LayerList() + assert kernel_size % 2 == 1, "Kernel size must be odd number." 
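+        # one (activation -> dilated conv) branch per dilation factor; the
+        # optional second conv stack (convs2) keeps dilation=1, so the
+        # receptive-field growth comes from convs1 only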
+ + for dilation in dilations: + self.convs1.append( + nn.Sequential( + get_activation(nonlinear_activation, ** + nonlinear_activation_params), + nn.Conv1D( + channels, + channels, + kernel_size, + 1, + dilation=dilation, + bias_attr=bias, + padding=(kernel_size - 1) // 2 * dilation, ), )) + if use_additional_convs: + self.convs2.append( + nn.Sequential( + get_activation(nonlinear_activation, ** + nonlinear_activation_params), + nn.Conv1D( + channels, + channels, + kernel_size, + 1, + dilation=1, + bias_attr=bias, + padding=(kernel_size - 1) // 2, ), )) + + def forward(self, x): + """Calculate forward propagation. + Args: + x (Tensor): Input tensor (B, channels, T). + Returns: + Tensor: Output tensor (B, channels, T). + """ + for idx in range(len(self.convs1)): + xt = self.convs1[idx](x) + if self.use_additional_convs: + xt = self.convs2[idx](xt) + x = xt + x + return x diff --git a/ernie-sat/paddlespeech/t2s/modules/residual_stack.py b/ernie-sat/paddlespeech/t2s/modules/residual_stack.py new file mode 100644 index 0000000000000000000000000000000000000000..0d949b5635329819a613a748e34015964d2fed5c --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/modules/residual_stack.py @@ -0,0 +1,102 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from espnet(https://github.com/espnet/espnet) +"""Residual stack module in MelGAN.""" +from typing import Any +from typing import Dict + +from paddle import nn + +from paddlespeech.t2s.modules.activation import get_activation +from paddlespeech.t2s.modules.causal_conv import CausalConv1D + + +class ResidualStack(nn.Layer): + """Residual stack module introduced in MelGAN.""" + + def __init__( + self, + kernel_size: int=3, + channels: int=32, + dilation: int=1, + bias: bool=True, + nonlinear_activation: str="leakyrelu", + nonlinear_activation_params: Dict[str, Any]={"negative_slope": 0.2}, + pad: str="Pad1D", + pad_params: Dict[str, Any]={"mode": "reflect"}, + use_causal_conv: bool=False, ): + """Initialize ResidualStack module. + + Args: + kernel_size (int): Kernel size of dilation convolution layer. + channels (int): Number of channels of convolution layers. + dilation (int): Dilation factor. + bias (bool): Whether to add bias parameter in convolution layers. + nonlinear_activation (str): Activation function module name. + nonlinear_activation_params (Dict[str,Any]): Hyperparameters for activation function. + pad (str): Padding function module name before dilated convolution layer. + pad_params (Dict[str, Any]): Hyperparameters for padding function. + use_causal_conv (bool): Whether to use causal convolution. + """ + super().__init__() + # for compatibility + if nonlinear_activation: + nonlinear_activation = nonlinear_activation.lower() + + # defile residual stack part + if not use_causal_conv: + assert (kernel_size - 1 + ) % 2 == 0, "Not support even number kernel size." 
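+            # non-causal variant: symmetric padding before the dilated conv
+            # keeps the output length equal to the input length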
+ self.stack = nn.Sequential( + get_activation(nonlinear_activation, + **nonlinear_activation_params), + getattr(nn, pad)((kernel_size - 1) // 2 * dilation, + **pad_params), + nn.Conv1D( + channels, + channels, + kernel_size, + dilation=dilation, + bias_attr=bias), + get_activation(nonlinear_activation, + **nonlinear_activation_params), + nn.Conv1D(channels, channels, 1, bias_attr=bias), ) + else: + self.stack = nn.Sequential( + get_activation(nonlinear_activation, + **nonlinear_activation_params), + CausalConv1D( + channels, + channels, + kernel_size, + dilation=dilation, + bias=bias, + pad=pad, + pad_params=pad_params, ), + get_activation(nonlinear_activation, + **nonlinear_activation_params), + nn.Conv1D(channels, channels, 1, bias_attr=bias), ) + + # defile extra layer for skip connection + self.skip_layer = nn.Conv1D(channels, channels, 1, bias_attr=bias) + + def forward(self, c): + """Calculate forward propagation. + + Args: + c (Tensor): Input tensor (B, channels, T). + Returns: + Tensor: Output tensor (B, chennels, T). + """ + return self.stack(c) + self.skip_layer(c) diff --git a/ernie-sat/paddlespeech/t2s/modules/style_encoder.py b/ernie-sat/paddlespeech/t2s/modules/style_encoder.py new file mode 100644 index 0000000000000000000000000000000000000000..49091eac8215898d1428b937a353adb037f774c6 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/modules/style_encoder.py @@ -0,0 +1,273 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from espnet(https://github.com/espnet/espnet) +"""Style encoder of GST-Tacotron.""" +from typing import Sequence + +import paddle +from paddle import nn +from typeguard import check_argument_types + +from paddlespeech.t2s.modules.transformer.attention import MultiHeadedAttention as BaseMultiHeadedAttention + + +class StyleEncoder(nn.Layer): + """Style encoder. + + This module is style encoder introduced in `Style Tokens: Unsupervised Style + Modeling, Control and Transfer in End-to-End Speech Synthesis`. + + .. _`Style Tokens: Unsupervised Style Modeling, Control and Transfer in End-to-End + Speech Synthesis`: https://arxiv.org/abs/1803.09017 + + Args: + idim (int, optional): Dimension of the input mel-spectrogram. + gst_tokens (int, optional): The number of GST embeddings. + gst_token_dim (int, optional): Dimension of each GST embedding. + gst_heads (int, optional): The number of heads in GST multihead attention. + conv_layers (int, optional): The number of conv layers in the reference encoder. + conv_chans_list (Sequence[int], optional): List of the number of channels of conv layers in the referece encoder. + conv_kernel_size (int, optional): Kernal size of conv layers in the reference encoder. + conv_stride (int, optional): Stride size of conv layers in the reference encoder. + gru_layers (int, optional): The number of GRU layers in the reference encoder. + gru_units (int, optional):The number of GRU units in the reference encoder. 
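+
+    Examples:
+        An illustrative forward pass on a random mel input with enough frames
+        for the six stride-2 reference-encoder conv layers (values are made up):
+
+        >>> import paddle
+        >>> style_encoder = StyleEncoder(idim=80)
+        >>> speech = paddle.randn([2, 128, 80])
+        >>> style_embs = style_encoder(speech)   # shape: [2, 256]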
+ + Todo: + * Support manual weight specification in inference. + + """ + + def __init__( + self, + idim: int=80, + gst_tokens: int=10, + gst_token_dim: int=256, + gst_heads: int=4, + conv_layers: int=6, + conv_chans_list: Sequence[int]=(32, 32, 64, 64, 128, 128), + conv_kernel_size: int=3, + conv_stride: int=2, + gru_layers: int=1, + gru_units: int=128, ): + """Initilize global style encoder module.""" + assert check_argument_types() + super().__init__() + + self.ref_enc = ReferenceEncoder( + idim=idim, + conv_layers=conv_layers, + conv_chans_list=conv_chans_list, + conv_kernel_size=conv_kernel_size, + conv_stride=conv_stride, + gru_layers=gru_layers, + gru_units=gru_units, ) + self.stl = StyleTokenLayer( + ref_embed_dim=gru_units, + gst_tokens=gst_tokens, + gst_token_dim=gst_token_dim, + gst_heads=gst_heads, ) + + def forward(self, speech: paddle.Tensor) -> paddle.Tensor: + """Calculate forward propagation. + + Args: + speech (Tensor): Batch of padded target features (B, Lmax, odim). + + Returns: + Tensor: Style token embeddings (B, token_dim). + + """ + ref_embs = self.ref_enc(speech) + style_embs = self.stl(ref_embs) + + return style_embs + + +class ReferenceEncoder(nn.Layer): + """Reference encoder module. + + This module is refernece encoder introduced in `Style Tokens: Unsupervised Style + Modeling, Control and Transfer in End-to-End Speech Synthesis`. + + .. _`Style Tokens: Unsupervised Style Modeling, Control and Transfer in End-to-End + Speech Synthesis`: https://arxiv.org/abs/1803.09017 + + Args: + idim (int, optional): Dimension of the input mel-spectrogram. + conv_layers (int, optional): The number of conv layers in the reference encoder. + conv_chans_list: (Sequence[int], optional): List of the number of channels of conv layers in the referece encoder. + conv_kernel_size (int, optional): Kernal size of conv layers in the reference encoder. + conv_stride (int, optional): Stride size of conv layers in the reference encoder. + gru_layers (int, optional): The number of GRU layers in the reference encoder. + gru_units (int, optional): The number of GRU units in the reference encoder. + + """ + + def __init__( + self, + idim=80, + conv_layers: int=6, + conv_chans_list: Sequence[int]=(32, 32, 64, 64, 128, 128), + conv_kernel_size: int=3, + conv_stride: int=2, + gru_layers: int=1, + gru_units: int=128, ): + """Initilize reference encoder module.""" + assert check_argument_types() + super().__init__() + + # check hyperparameters are valid + assert conv_kernel_size % 2 == 1, "kernel size must be odd." + assert ( + len(conv_chans_list) == conv_layers + ), "the number of conv layers and length of channels list must be the same." 
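+        # Each stride-2 Conv2D roughly halves both the time axis and the mel axis;
+        # gru_in_units below recomputes the mel dimension that survives the conv
+        # stack and multiplies it by the channel count of the last conv layer.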
+ + convs = [] + padding = (conv_kernel_size - 1) // 2 + for i in range(conv_layers): + conv_in_chans = 1 if i == 0 else conv_chans_list[i - 1] + conv_out_chans = conv_chans_list[i] + convs += [ + nn.Conv2D( + conv_in_chans, + conv_out_chans, + kernel_size=conv_kernel_size, + stride=conv_stride, + padding=padding, + # Do not use bias due to the following batch norm + bias_attr=False, ), + nn.BatchNorm2D(conv_out_chans), + nn.ReLU(), + ] + self.convs = nn.Sequential(*convs) + + self.conv_layers = conv_layers + self.kernel_size = conv_kernel_size + self.stride = conv_stride + self.padding = padding + + # get the number of GRU input units + gru_in_units = idim + for i in range(conv_layers): + gru_in_units = (gru_in_units - conv_kernel_size + 2 * padding + ) // conv_stride + 1 + gru_in_units *= conv_out_chans + self.gru = nn.GRU(gru_in_units, gru_units, gru_layers, time_major=False) + + def forward(self, speech: paddle.Tensor) -> paddle.Tensor: + """Calculate forward propagation. + Args: + speech (Tensor): Batch of padded target features (B, Lmax, idim). + + Returns: + Tensor: Reference embedding (B, gru_units) + + """ + batch_size = speech.shape[0] + # (B, 1, Lmax, idim) + xs = speech.unsqueeze(1) + # (B, Lmax', conv_out_chans, idim') + hs = self.convs(xs).transpose([0, 2, 1, 3]) + time_length = hs.shape[1] + # (B, Lmax', gru_units) + hs = hs.reshape(shape=[batch_size, time_length, -1]) + self.gru.flatten_parameters() + # (gru_layers, batch_size, gru_units) + _, ref_embs = self.gru(hs) + # (batch_size, gru_units) + ref_embs = ref_embs[-1] + + return ref_embs + + +class StyleTokenLayer(nn.Layer): + """Style token layer module. + + This module is style token layer introduced in `Style Tokens: Unsupervised Style + Modeling, Control and Transfer in End-to-End Speech Synthesis`. + + .. _`Style Tokens: Unsupervised Style Modeling, Control and Transfer in End-to-End + Speech Synthesis`: https://arxiv.org/abs/1803.09017 + Args: + ref_embed_dim (int, optional): Dimension of the input reference embedding. + gst_tokens (int, optional): The number of GST embeddings. + gst_token_dim (int, optional): Dimension of each GST embedding. + gst_heads (int, optional): The number of heads in GST multihead attention. + dropout_rate (float, optional): Dropout rate in multi-head attention. + + """ + + def __init__( + self, + ref_embed_dim: int=128, + gst_tokens: int=10, + gst_token_dim: int=256, + gst_heads: int=4, + dropout_rate: float=0.0, ): + """Initilize style token layer module.""" + assert check_argument_types() + super().__init__() + + gst_embs = paddle.randn(shape=[gst_tokens, gst_token_dim // gst_heads]) + self.gst_embs = paddle.create_parameter( + shape=gst_embs.shape, + dtype=str(gst_embs.numpy().dtype), + default_initializer=paddle.nn.initializer.Assign(gst_embs)) + self.mha = MultiHeadedAttention( + q_dim=ref_embed_dim, + k_dim=gst_token_dim // gst_heads, + v_dim=gst_token_dim // gst_heads, + n_head=gst_heads, + n_feat=gst_token_dim, + dropout_rate=dropout_rate, ) + + def forward(self, ref_embs: paddle.Tensor) -> paddle.Tensor: + """Calculate forward propagation. + + Args: + ref_embs (Tensor): Reference embeddings (B, ref_embed_dim). + + Returns: + Tensor: Style token embeddings (B, gst_token_dim). 
+ + """ + batch_size = ref_embs.shape[0] + # (num_tokens, token_dim) -> (batch_size, num_tokens, token_dim) + gst_embs = paddle.tanh(self.gst_embs).unsqueeze(0).expand( + [batch_size, -1, -1]) + # (batch_size, 1 ,ref_embed_dim) + ref_embs = ref_embs.unsqueeze(1) + style_embs = self.mha(ref_embs, gst_embs, gst_embs, None) + + return style_embs.squeeze(1) + + +class MultiHeadedAttention(BaseMultiHeadedAttention): + """Multi head attention module with different input dimension.""" + + def __init__(self, q_dim, k_dim, v_dim, n_head, n_feat, dropout_rate=0.0): + """Initialize multi head attention module.""" + # Do not use super().__init__() here since we want to + # overwrite BaseMultiHeadedAttention.__init__() method. + nn.Layer.__init__(self) + assert n_feat % n_head == 0 + # We assume d_v always equals d_k + self.d_k = n_feat // n_head + self.h = n_head + self.linear_q = nn.Linear(q_dim, n_feat) + self.linear_k = nn.Linear(k_dim, n_feat) + self.linear_v = nn.Linear(v_dim, n_feat) + self.linear_out = nn.Linear(n_feat, n_feat) + self.attn = None + self.dropout = nn.Dropout(p=dropout_rate) diff --git a/ernie-sat/paddlespeech/t2s/modules/tacotron2/__init__.py b/ernie-sat/paddlespeech/t2s/modules/tacotron2/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..abf198b97e6e818e1fbe59006f98492640bcee54 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/modules/tacotron2/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/ernie-sat/paddlespeech/t2s/modules/tacotron2/attentions.py b/ernie-sat/paddlespeech/t2s/modules/tacotron2/attentions.py new file mode 100644 index 0000000000000000000000000000000000000000..a6fde742d98f90d4db06f734e5f7f4508848d989 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/modules/tacotron2/attentions.py @@ -0,0 +1,454 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Attention modules for RNN.""" +import paddle +import paddle.nn.functional as F +from paddle import nn + +from paddlespeech.t2s.modules.masked_fill import masked_fill +from paddlespeech.t2s.modules.nets_utils import make_pad_mask + + +def _apply_attention_constraint(e, + last_attended_idx, + backward_window=1, + forward_window=3): + """Apply monotonic attention constraint. + + This function apply the monotonic attention constraint + introduced in `Deep Voice 3: Scaling + Text-to-Speech with Convolutional Sequence Learning`_. 
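+    Energies at positions earlier than last_attended_idx - backward_window, or at and
+    beyond last_attended_idx + forward_window, are set to -inf so that the following
+    softmax assigns them zero weight.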
+ + Args: + e(Tensor): Attention energy before applying softmax (1, T). + last_attended_idx(int): The index of the inputs of the last attended [0, T]. + backward_window(int, optional, optional): Backward window size in attention constraint. (Default value = 1) + forward_window(int, optional, optional): Forward window size in attetion constraint. (Default value = 3) + + Returns: + Tensor: Monotonic constrained attention energy (1, T). + + .. _`Deep Voice 3: Scaling Text-to-Speech with Convolutional Sequence Learning`: + https://arxiv.org/abs/1710.07654 + + """ + if paddle.shape(e)[0] != 1: + raise NotImplementedError( + "Batch attention constraining is not yet supported.") + backward_idx = last_attended_idx - backward_window + forward_idx = last_attended_idx + forward_window + if backward_idx > 0: + e[:, :backward_idx] = -float("inf") + if forward_idx < paddle.shape(e)[1]: + e[:, forward_idx:] = -float("inf") + return e + + +class AttLoc(nn.Layer): + """location-aware attention module. + + Reference: Attention-Based Models for Speech Recognition + (https://arxiv.org/pdf/1506.07503.pdf) + + Args: + eprojs (int): projection-units of encoder + dunits (int): units of decoder + att_dim (int): attention dimension + aconv_chans (int): channels of attention convolution + aconv_filts (int): filter size of attention convolution + han_mode (bool): flag to swith on mode of hierarchical attention and not store pre_compute_enc_h + """ + + def __init__(self, + eprojs, + dunits, + att_dim, + aconv_chans, + aconv_filts, + han_mode=False): + super().__init__() + self.mlp_enc = nn.Linear(eprojs, att_dim) + self.mlp_dec = nn.Linear(dunits, att_dim, bias_attr=False) + self.mlp_att = nn.Linear(aconv_chans, att_dim, bias_attr=False) + self.loc_conv = nn.Conv2D( + 1, + aconv_chans, + (1, 2 * aconv_filts + 1), + padding=(0, aconv_filts), + bias_attr=False, ) + self.gvec = nn.Linear(att_dim, 1) + + self.dunits = dunits + self.eprojs = eprojs + self.att_dim = att_dim + self.h_length = None + self.enc_h = None + self.pre_compute_enc_h = None + self.mask = None + self.han_mode = han_mode + + def reset(self): + """reset states""" + self.h_length = None + self.enc_h = None + self.pre_compute_enc_h = None + self.mask = None + + def forward( + self, + enc_hs_pad, + enc_hs_len, + dec_z, + att_prev, + scaling=2.0, + last_attended_idx=None, + backward_window=1, + forward_window=3, ): + """Calculate AttLoc forward propagation. 
+ Args: + enc_hs_pad(Tensor): padded encoder hidden state (B, T_max, D_enc) + enc_hs_len(Tensor): padded encoder hidden state length (B) + dec_z(Tensor dec_z): decoder hidden state (B, D_dec) + att_prev(Tensor): previous attention weight (B, T_max) + scaling(float, optional): scaling parameter before applying softmax (Default value = 2.0) + forward_window(Tensor, optional): forward window size when constraining attention (Default value = 3) + last_attended_idx(int, optional): index of the inputs of the last attended (Default value = None) + backward_window(int, optional): backward window size in attention constraint (Default value = 1) + forward_window(int, optional): forward window size in attetion constraint (Default value = 3) + Returns: + Tensor: attention weighted encoder state (B, D_enc) + Tensor: previous attention weights (B, T_max) + """ + batch = paddle.shape(enc_hs_pad)[0] + # pre-compute all h outside the decoder loop + if self.pre_compute_enc_h is None or self.han_mode: + # (utt, frame, hdim) + self.enc_h = enc_hs_pad + self.h_length = paddle.shape(self.enc_h)[1] + # (utt, frame, att_dim) + self.pre_compute_enc_h = self.mlp_enc(self.enc_h) + + if dec_z is None: + dec_z = paddle.zeros([batch, self.dunits]) + else: + dec_z = dec_z.reshape([batch, self.dunits]) + + # initialize attention weight with uniform dist. + if paddle.sum(att_prev) == 0: + # if no bias, 0 0-pad goes 0 + att_prev = 1.0 - make_pad_mask(enc_hs_len) + att_prev = att_prev / enc_hs_len.unsqueeze(-1) + + # att_prev: (utt, frame) -> (utt, 1, 1, frame) + # -> (utt, att_conv_chans, 1, frame) + att_conv = self.loc_conv(att_prev.reshape([batch, 1, 1, self.h_length])) + # att_conv: (utt, att_conv_chans, 1, frame) -> (utt, frame, att_conv_chans) + att_conv = att_conv.squeeze(2).transpose([0, 2, 1]) + # att_conv: (utt, frame, att_conv_chans) -> (utt, frame, att_dim) + att_conv = self.mlp_att(att_conv) + # dec_z_tiled: (utt, frame, att_dim) + dec_z_tiled = self.mlp_dec(dec_z).reshape([batch, 1, self.att_dim]) + + # dot with gvec + # (utt, frame, att_dim) -> (utt, frame) + e = paddle.tanh(att_conv + self.pre_compute_enc_h + dec_z_tiled) + e = self.gvec(e).squeeze(2) + + # NOTE: consider zero padding when compute w. + if self.mask is None: + self.mask = make_pad_mask(enc_hs_len) + + e = masked_fill(e, self.mask, -float("inf")) + # apply monotonic attention constraint (mainly for TTS) + if last_attended_idx is not None: + e = _apply_attention_constraint(e, last_attended_idx, + backward_window, forward_window) + + w = F.softmax(scaling * e, axis=1) + + # weighted sum over frames + # utt x hdim + c = paddle.sum( + self.enc_h * w.reshape([batch, self.h_length, 1]), axis=1) + return c, w + + +class AttForward(nn.Layer): + """Forward attention module. 
+ Reference + ---------- + Forward attention in sequence-to-sequence acoustic modeling for speech synthesis + (https://arxiv.org/pdf/1807.06736.pdf) + + Args: + eprojs (int): projection-units of encoder + dunits (int): units of decoder + att_dim (int): attention dimension + aconv_chans (int): channels of attention convolution + aconv_filts (int): filter size of attention convolution + """ + + def __init__(self, eprojs, dunits, att_dim, aconv_chans, aconv_filts): + super().__init__() + self.mlp_enc = nn.Linear(eprojs, att_dim) + self.mlp_dec = nn.Linear(dunits, att_dim, bias_attr=False) + self.mlp_att = nn.Linear(aconv_chans, att_dim, bias_attr=False) + self.loc_conv = nn.Conv2D( + 1, + aconv_chans, + (1, 2 * aconv_filts + 1), + padding=(0, aconv_filts), + bias_attr=False, ) + self.gvec = nn.Linear(att_dim, 1) + self.dunits = dunits + self.eprojs = eprojs + self.att_dim = att_dim + self.h_length = None + self.enc_h = None + self.pre_compute_enc_h = None + self.mask = None + + def reset(self): + """reset states""" + self.h_length = None + self.enc_h = None + self.pre_compute_enc_h = None + self.mask = None + + def forward( + self, + enc_hs_pad, + enc_hs_len, + dec_z, + att_prev, + scaling=1.0, + last_attended_idx=None, + backward_window=1, + forward_window=3, ): + """Calculate AttForward forward propagation. + + Args: + enc_hs_pad(Tensor): padded encoder hidden state (B, T_max, D_enc) + enc_hs_len(list): padded encoder hidden state length (B,) + dec_z(Tensor): decoder hidden state (B, D_dec) + att_prev(Tensor): attention weights of previous step (B, T_max) + scaling(float, optional): scaling parameter before applying softmax (Default value = 1.0) + last_attended_idx(int, optional): index of the inputs of the last attended (Default value = None) + backward_window(int, optional): backward window size in attention constraint (Default value = 1) + forward_window(int, optional): (Default value = 3) + + Returns: + Tensor: attention weighted encoder state (B, D_enc) + Tensor: previous attention weights (B, T_max) + """ + batch = len(enc_hs_pad) + # pre-compute all h outside the decoder loop + if self.pre_compute_enc_h is None: + self.enc_h = enc_hs_pad # utt x frame x hdim + self.h_length = paddle.shape(self.enc_h)[1] + # utt x frame x att_dim + self.pre_compute_enc_h = self.mlp_enc(self.enc_h) + + if dec_z is None: + dec_z = paddle.zeros([batch, self.dunits]) + else: + dec_z = dec_z.reshape([batch, self.dunits]) + + if att_prev is None: + # initial attention will be [1, 0, 0, ...] + att_prev = paddle.zeros([*paddle.shape(enc_hs_pad)[:2]]) + att_prev[:, 0] = 1.0 + + # att_prev: utt x frame -> utt x 1 x 1 x frame + # -> utt x att_conv_chans x 1 x frame + att_conv = self.loc_conv(att_prev.reshape([batch, 1, 1, self.h_length])) + # att_conv: utt x att_conv_chans x 1 x frame -> utt x frame x att_conv_chans + att_conv = att_conv.squeeze(2).transpose([0, 2, 1]) + # att_conv: utt x frame x att_conv_chans -> utt x frame x att_dim + att_conv = self.mlp_att(att_conv) + + # dec_z_tiled: utt x frame x att_dim + dec_z_tiled = self.mlp_dec(dec_z).unsqueeze(1) + + # dot with gvec + # utt x frame x att_dim -> utt x frame + e = self.gvec( + paddle.tanh(self.pre_compute_enc_h + dec_z_tiled + + att_conv)).squeeze(2) + + # NOTE: consider zero padding when compute w. 
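+        # After masking and the softmax, w = (att_prev + att_prev_shift) * w below is the
+        # forward-attention recursion alpha_t(n) ~ (alpha_{t-1}(n) + alpha_{t-1}(n-1)) * y_t(n)
+        # from https://arxiv.org/pdf/1807.06736.pdf; the clip + normalize step renormalizes
+        # it and avoids NaN gradients.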
+ if self.mask is None: + self.mask = make_pad_mask(enc_hs_len) + e = masked_fill(e, self.mask, -float("inf")) + + # apply monotonic attention constraint (mainly for TTS) + if last_attended_idx is not None: + e = _apply_attention_constraint(e, last_attended_idx, + backward_window, forward_window) + + w = F.softmax(scaling * e, axis=1) + + # forward attention + att_prev_shift = F.pad(att_prev, (0, 0, 1, 0))[:, :-1] + + w = (att_prev + att_prev_shift) * w + # NOTE: clip is needed to avoid nan gradient + w = F.normalize(paddle.clip(w, 1e-6), p=1, axis=1) + + # weighted sum over flames + # utt x hdim + # NOTE use bmm instead of sum(*) + c = paddle.sum(self.enc_h * w.unsqueeze(-1), axis=1) + + return c, w + + +class AttForwardTA(nn.Layer): + """Forward attention with transition agent module. + Reference: + Forward attention in sequence-to-sequence acoustic modeling for speech synthesis + (https://arxiv.org/pdf/1807.06736.pdf) + + Args: + eunits (int): units of encoder + dunits (int): units of decoder + att_dim (int): attention dimension + aconv_chans (int): channels of attention convolution + aconv_filts (int): filter size of attention convolution + odim (int): output dimension + """ + + def __init__(self, eunits, dunits, att_dim, aconv_chans, aconv_filts, odim): + super().__init__() + self.mlp_enc = nn.Linear(eunits, att_dim) + self.mlp_dec = nn.Linear(dunits, att_dim, bias_attr=False) + self.mlp_ta = nn.Linear(eunits + dunits + odim, 1) + self.mlp_att = nn.Linear(aconv_chans, att_dim, bias_attr=False) + self.loc_conv = nn.Conv2D( + 1, + aconv_chans, + (1, 2 * aconv_filts + 1), + padding=(0, aconv_filts), + bias_attr=False, ) + self.gvec = nn.Linear(att_dim, 1) + self.dunits = dunits + self.eunits = eunits + self.att_dim = att_dim + self.h_length = None + self.enc_h = None + self.pre_compute_enc_h = None + self.mask = None + self.trans_agent_prob = 0.5 + + def reset(self): + self.h_length = None + self.enc_h = None + self.pre_compute_enc_h = None + self.mask = None + self.trans_agent_prob = 0.5 + + def forward( + self, + enc_hs_pad, + enc_hs_len, + dec_z, + att_prev, + out_prev, + scaling=1.0, + last_attended_idx=None, + backward_window=1, + forward_window=3, ): + """Calculate AttForwardTA forward propagation. + + Args: + enc_hs_pad(Tensor): padded encoder hidden state (B, Tmax, eunits) + enc_hs_len(list Tensor): padded encoder hidden state length (B,) + dec_z(Tensor): decoder hidden state (B, dunits) + att_prev(Tensor): attention weights of previous step (B, T_max) + out_prev(Tensor): decoder outputs of previous step (B, odim) + scaling(float, optional): scaling parameter before applying softmax (Default value = 1.0) + last_attended_idx(int, optional): index of the inputs of the last attended (Default value = None) + backward_window(int, optional): backward window size in attention constraint (Default value = 1) + forward_window(int, optional): (Default value = 3) + + Returns: + Tensor: attention weighted encoder state (B, dunits) + Tensor: previous attention weights (B, Tmax) + """ + batch = len(enc_hs_pad) + # pre-compute all h outside the decoder loop + if self.pre_compute_enc_h is None: + self.enc_h = enc_hs_pad # utt x frame x hdim + self.h_length = paddle.shape(self.enc_h)[1] + # utt x frame x att_dim + self.pre_compute_enc_h = self.mlp_enc(self.enc_h) + + if dec_z is None: + dec_z = paddle.zeros([batch, self.dunits]) + else: + dec_z = dec_z.reshape([batch, self.dunits]) + + if att_prev is None: + # initial attention will be [1, 0, 0, ...] 
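+            # (all probability mass on the first encoder frame)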
+ att_prev = paddle.zeros([*paddle.shape(enc_hs_pad)[:2]]) + att_prev[:, 0] = 1.0 + + # att_prev: utt x frame -> utt x 1 x 1 x frame + # -> utt x att_conv_chans x 1 x frame + att_conv = self.loc_conv(att_prev.reshape([batch, 1, 1, self.h_length])) + # att_conv: utt x att_conv_chans x 1 x frame -> utt x frame x att_conv_chans + att_conv = att_conv.squeeze(2).transpose([0, 2, 1]) + # att_conv: utt x frame x att_conv_chans -> utt x frame x att_dim + att_conv = self.mlp_att(att_conv) + + # dec_z_tiled: utt x frame x att_dim + dec_z_tiled = self.mlp_dec(dec_z).reshape([batch, 1, self.att_dim]) + + # dot with gvec + # utt x frame x att_dim -> utt x frame + e = self.gvec( + paddle.tanh(att_conv + self.pre_compute_enc_h + + dec_z_tiled)).squeeze(2) + + # NOTE consider zero padding when compute w. + if self.mask is None: + self.mask = make_pad_mask(enc_hs_len) + e = masked_fill(e, self.mask, -float("inf")) + + # apply monotonic attention constraint (mainly for TTS) + if last_attended_idx is not None: + e = _apply_attention_constraint(e, last_attended_idx, + backward_window, forward_window) + + w = F.softmax(scaling * e, axis=1) + + # forward attention + # att_prev_shift = F.pad(att_prev.unsqueeze(0), (1, 0), data_format='NCL').squeeze(0)[:, :-1] + att_prev_shift = F.pad(att_prev, (0, 0, 1, 0))[:, :-1] + w = (self.trans_agent_prob * att_prev + + (1 - self.trans_agent_prob) * att_prev_shift) * w + # NOTE: clip is needed to avoid nan gradient + w = F.normalize(paddle.clip(w, 1e-6), p=1, axis=1) + + # weighted sum over flames + # utt x hdim + # NOTE use bmm instead of sum(*) + c = paddle.sum( + self.enc_h * w.reshape([batch, self.h_length, 1]), axis=1) + + # update transition agent prob + self.trans_agent_prob = F.sigmoid( + self.mlp_ta(paddle.concat([c, out_prev, dec_z], axis=1))) + + return c, w diff --git a/ernie-sat/paddlespeech/t2s/modules/tacotron2/decoder.py b/ernie-sat/paddlespeech/t2s/modules/tacotron2/decoder.py new file mode 100644 index 0000000000000000000000000000000000000000..ebdfa387989828eb4c92df8a1d6bbf215a50b775 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/modules/tacotron2/decoder.py @@ -0,0 +1,686 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from espnet(https://github.com/espnet/espnet) +"""Tacotron2 decoder related modules.""" +import paddle +import paddle.nn.functional as F +from paddle import nn + +from paddlespeech.t2s.modules.tacotron2.attentions import AttForwardTA + + +class Prenet(nn.Layer): + """Prenet module for decoder of Spectrogram prediction network. + + This is a module of Prenet in the decoder of Spectrogram prediction network, + which described in `Natural TTS + Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions`_. + The Prenet preforms nonlinear conversion + of inputs before input to auto-regressive lstm, + which helps to learn diagonal attentions. + + Notes + ---------- + This module alway applies dropout even in evaluation. 
+ See the detail in `Natural TTS Synthesis by + Conditioning WaveNet on Mel Spectrogram Predictions`_. + + .. _`Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions`: + https://arxiv.org/abs/1712.05884 + + """ + + def __init__(self, idim, n_layers=2, n_units=256, dropout_rate=0.5): + """Initialize prenet module. + + Args: + idim (int): Dimension of the inputs. + odim (int): Dimension of the outputs. + n_layers (int, optional): The number of prenet layers. + n_units (int, optional): The number of prenet units. + """ + super().__init__() + self.dropout_rate = dropout_rate + self.prenet = nn.LayerList() + for layer in range(n_layers): + n_inputs = idim if layer == 0 else n_units + self.prenet.append( + nn.Sequential(nn.Linear(n_inputs, n_units), nn.ReLU())) + + def forward(self, x): + """Calculate forward propagation. + + Args: + x (Tensor): Batch of input tensors (B, ..., idim). + + Returns: + Tensor: Batch of output tensors (B, ..., odim). + + """ + for i in range(len(self.prenet)): + # F.dropout 引入了随机, tacotron2 的 dropout 是不能去掉的 + x = F.dropout(self.prenet[i](x)) + return x + + +class Postnet(nn.Layer): + """Postnet module for Spectrogram prediction network. + + This is a module of Postnet in Spectrogram prediction network, + which described in `Natural TTS Synthesis by + Conditioning WaveNet on Mel Spectrogram Predictions`_. + The Postnet predicts refines the predicted + Mel-filterbank of the decoder, + which helps to compensate the detail sturcture of spectrogram. + + .. _`Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions`: + https://arxiv.org/abs/1712.05884 + + """ + + def __init__( + self, + idim, + odim, + n_layers=5, + n_chans=512, + n_filts=5, + dropout_rate=0.5, + use_batch_norm=True, ): + """Initialize postnet module. + + Args: + idim (int): Dimension of the inputs. + odim (int): Dimension of the outputs. + n_layers (int, optional): The number of layers. + n_filts (int, optional): The number of filter size. + n_units (int, optional): The number of filter channels. + use_batch_norm (bool, optional): Whether to use batch normalization.. + dropout_rate (float, optional): Dropout rate.. + """ + super().__init__() + self.postnet = nn.LayerList() + for layer in range(n_layers - 1): + ichans = odim if layer == 0 else n_chans + ochans = odim if layer == n_layers - 1 else n_chans + if use_batch_norm: + self.postnet.append( + nn.Sequential( + nn.Conv1D( + ichans, + ochans, + n_filts, + stride=1, + padding=(n_filts - 1) // 2, + bias_attr=False, ), + nn.BatchNorm1D(ochans), + nn.Tanh(), + nn.Dropout(dropout_rate), )) + else: + self.postnet.append( + nn.Sequential( + nn.Conv1D( + ichans, + ochans, + n_filts, + stride=1, + padding=(n_filts - 1) // 2, + bias_attr=False, ), + nn.Tanh(), + nn.Dropout(dropout_rate), )) + ichans = n_chans if n_layers != 1 else odim + if use_batch_norm: + self.postnet.append( + nn.Sequential( + nn.Conv1D( + ichans, + odim, + n_filts, + stride=1, + padding=(n_filts - 1) // 2, + bias_attr=False, ), + nn.BatchNorm1D(odim), + nn.Dropout(dropout_rate), )) + else: + self.postnet.append( + nn.Sequential( + nn.Conv1D( + ichans, + odim, + n_filts, + stride=1, + padding=(n_filts - 1) // 2, + bias_attr=False, ), + nn.Dropout(dropout_rate), )) + + def forward(self, xs): + """Calculate forward propagation. + + Args: + xs (Tensor): Batch of the sequences of padded input tensors (B, idim, Tmax). + Returns: + Tensor: Batch of padded output tensor. (B, odim, Tmax). 
+ """ + for i in range(len(self.postnet)): + xs = self.postnet[i](xs) + return xs + + +class ZoneOutCell(nn.Layer): + """ZoneOut Cell module. + This is a module of zoneout described in + `Zoneout: Regularizing RNNs by Randomly Preserving Hidden Activations`_. + This code is modified from `eladhoffer/seq2seq.pytorch`_. + Examples + ---------- + >>> lstm = paddle.nn.LSTMCell(16, 32) + >>> lstm = ZoneOutCell(lstm, 0.5) + .. _`Zoneout: Regularizing RNNs by Randomly Preserving Hidden Activations`: + https://arxiv.org/abs/1606.01305 + .. _`eladhoffer/seq2seq.pytorch`: + https://github.com/eladhoffer/seq2seq.pytorch + """ + + def __init__(self, cell, zoneout_rate=0.1): + """Initialize zone out cell module. + + Args: + cell (nn.Layer): Paddle recurrent cell module + e.g. `paddle.nn.LSTMCell`. + zoneout_rate (float, optional): Probability of zoneout from 0.0 to 1.0. + """ + super().__init__() + self.cell = cell + self.hidden_size = cell.hidden_size + self.zoneout_rate = zoneout_rate + if zoneout_rate > 1.0 or zoneout_rate < 0.0: + raise ValueError( + "zoneout probability must be in the range from 0.0 to 1.0.") + + def forward(self, inputs, hidden): + """Calculate forward propagation. + + Args: + inputs (Tensor): Batch of input tensor (B, input_size). + hidden (tuple): + - Tensor: Batch of initial hidden states (B, hidden_size). + - Tensor: Batch of initial cell states (B, hidden_size). + Returns: + Tensor: + Batch of next hidden states (B, hidden_size). + tuple: + - Tensor: Batch of next hidden states (B, hidden_size). + - Tensor: Batch of next cell states (B, hidden_size). + """ + # we only use the second output of LSTMCell in paddle + _, next_hidden = self.cell(inputs, hidden) + next_hidden = self._zoneout(hidden, next_hidden, self.zoneout_rate) + # to have the same output format with LSTMCell in paddle + return next_hidden[0], next_hidden + + def _zoneout(self, h, next_h, prob): + # apply recursively + if isinstance(h, tuple): + num_h = len(h) + if not isinstance(prob, tuple): + prob = tuple([prob] * num_h) + return tuple( + [self._zoneout(h[i], next_h[i], prob[i]) for i in range(num_h)]) + if self.training: + mask = paddle.bernoulli(paddle.ones([*paddle.shape(h)]) * prob) + return mask * h + (1 - mask) * next_h + else: + return prob * h + (1 - prob) * next_h + + +class Decoder(nn.Layer): + """Decoder module of Spectrogram prediction network. + This is a module of decoder of Spectrogram prediction network in Tacotron2, + which described in `Natural TTS + Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions`_. + The decoder generates the sequence of + features from the sequence of the hidden states. + .. _`Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions`: + https://arxiv.org/abs/1712.05884 + """ + + def __init__( + self, + idim, + odim, + att, + dlayers=2, + dunits=1024, + prenet_layers=2, + prenet_units=256, + postnet_layers=5, + postnet_chans=512, + postnet_filts=5, + output_activation_fn=None, + cumulate_att_w=True, + use_batch_norm=True, + use_concate=True, + dropout_rate=0.5, + zoneout_rate=0.1, + reduction_factor=1, ): + """Initialize Tacotron2 decoder module. + + Args: + idim (int): Dimension of the inputs. + odim (int): Dimension of the outputs. + att (nn.Layer): Instance of attention class. + dlayers (int, optional): The number of decoder lstm layers. + dunits (int, optional): The number of decoder lstm units. + prenet_layers (int, optional): The number of prenet layers. + prenet_units (int, optional): The number of prenet units. 
+ postnet_layers (int, optional): The number of postnet layers. + postnet_filts (int, optional): The number of postnet filter size. + postnet_chans (int, optional): The number of postnet filter channels. + output_activation_fn (nn.Layer, optional): Activation function for outputs. + cumulate_att_w (bool, optional): Whether to cumulate previous attention weight. + use_batch_norm (bool, optional): Whether to use batch normalization. + use_concate : bool, optional + Whether to concatenate encoder embedding with decoder lstm outputs. + dropout_rate : float, optional + Dropout rate. + zoneout_rate : float, optional + Zoneout rate. + reduction_factor : int, optional + Reduction factor. + """ + super().__init__() + + # store the hyperparameters + self.idim = idim + self.odim = odim + self.att = att + self.output_activation_fn = output_activation_fn + self.cumulate_att_w = cumulate_att_w + self.use_concate = use_concate + self.reduction_factor = reduction_factor + + # check attention type + if isinstance(self.att, AttForwardTA): + self.use_att_extra_inputs = True + else: + self.use_att_extra_inputs = False + + # define lstm network + prenet_units = prenet_units if prenet_layers != 0 else odim + self.lstm = nn.LayerList() + for layer in range(dlayers): + iunits = idim + prenet_units if layer == 0 else dunits + lstm = nn.LSTMCell(iunits, dunits) + if zoneout_rate > 0.0: + lstm = ZoneOutCell(lstm, zoneout_rate) + self.lstm.append(lstm) + + # define prenet + if prenet_layers > 0: + self.prenet = Prenet( + idim=odim, + n_layers=prenet_layers, + n_units=prenet_units, + dropout_rate=dropout_rate, ) + else: + self.prenet = None + + # define postnet + if postnet_layers > 0: + self.postnet = Postnet( + idim=idim, + odim=odim, + n_layers=postnet_layers, + n_chans=postnet_chans, + n_filts=postnet_filts, + use_batch_norm=use_batch_norm, + dropout_rate=dropout_rate, ) + else: + self.postnet = None + + # define projection layers + iunits = idim + dunits if use_concate else dunits + self.feat_out = nn.Linear( + iunits, odim * reduction_factor, bias_attr=False) + self.prob_out = nn.Linear(iunits, reduction_factor) + + def _zero_state(self, hs): + init_hs = paddle.zeros([paddle.shape(hs)[0], self.lstm[0].hidden_size]) + return init_hs + + def forward(self, hs, hlens, ys): + """Calculate forward propagation. + + Args: + hs (Tensor): Batch of the sequences of padded hidden states (B, Tmax, idim). + hlens (Tensor(int64) padded): Batch of lengths of each input batch (B,). + ys (Tensor): Batch of the sequences of padded target features (B, Lmax, odim). + + Returns: + Tensor: Batch of output tensors after postnet (B, Lmax, odim). + Tensor: Batch of output tensors before postnet (B, Lmax, odim). + Tensor: Batch of logits of stop prediction (B, Lmax). + Tensor: Batch of attention weights (B, Lmax, Tmax). + + Note: + This computation is performed in teacher-forcing manner. 
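+            At every step the ground-truth frame from ys is fed back as the previous
+            output instead of the model's own prediction.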
+ """ + # thin out frames (B, Lmax, odim) -> (B, Lmax/r, odim) + if self.reduction_factor > 1: + ys = ys[:, self.reduction_factor - 1::self.reduction_factor] + + # length list should be list of int + # hlens = list(map(int, hlens)) + + # initialize hidden states of decoder + c_list = [self._zero_state(hs)] + z_list = [self._zero_state(hs)] + for _ in range(1, len(self.lstm)): + c_list.append(self._zero_state(hs)) + z_list.append(self._zero_state(hs)) + prev_out = paddle.zeros([paddle.shape(hs)[0], self.odim]) + + # initialize attention + prev_att_ws = [] + prev_att_w = paddle.zeros(paddle.shape(hlens)) + prev_att_ws.append(prev_att_w) + self.att.reset() + + # loop for an output sequence + outs, logits, att_ws = [], [], [] + for y in ys.transpose([1, 0, 2]): + if self.use_att_extra_inputs: + att_c, att_w = self.att(hs, hlens, z_list[0], prev_att_ws[-1], + prev_out) + else: + att_c, att_w = self.att(hs, hlens, z_list[0], prev_att_ws[-1]) + prenet_out = self.prenet( + prev_out) if self.prenet is not None else prev_out + xs = paddle.concat([att_c, prenet_out], axis=1) + # we only use the second output of LSTMCell in paddle + _, next_hidden = self.lstm[0](xs, (z_list[0], c_list[0])) + z_list[0], c_list[0] = next_hidden + for i in range(1, len(self.lstm)): + # we only use the second output of LSTMCell in paddle + _, next_hidden = self.lstm[i](z_list[i - 1], + (z_list[i], c_list[i])) + z_list[i], c_list[i] = next_hidden + zcs = (paddle.concat([z_list[-1], att_c], axis=1) + if self.use_concate else z_list[-1]) + outs.append( + self.feat_out(zcs).reshape([paddle.shape(hs)[0], self.odim, -1 + ])) + logits.append(self.prob_out(zcs)) + att_ws.append(att_w) + # teacher forcing + prev_out = y + if self.cumulate_att_w and paddle.sum(prev_att_w) != 0: + prev_att_w = prev_att_w + att_w # Note: error when use += + else: + prev_att_w = att_w + prev_att_ws.append(prev_att_w) + # (B, Lmax) + logits = paddle.concat(logits, axis=1) + # (B, odim, Lmax) + before_outs = paddle.concat(outs, axis=2) + # (B, Lmax, Tmax) + att_ws = paddle.stack(att_ws, axis=1) + + if self.reduction_factor > 1: + # (B, odim, Lmax) + before_outs = before_outs.reshape( + [paddle.shape(before_outs)[0], self.odim, -1]) + + if self.postnet is not None: + # (B, odim, Lmax) + after_outs = before_outs + self.postnet(before_outs) + else: + after_outs = before_outs + # (B, Lmax, odim) + before_outs = before_outs.transpose([0, 2, 1]) + # (B, Lmax, odim) + after_outs = after_outs.transpose([0, 2, 1]) + logits = logits + + # apply activation function for scaling + if self.output_activation_fn is not None: + before_outs = self.output_activation_fn(before_outs) + after_outs = self.output_activation_fn(after_outs) + + return after_outs, before_outs, logits, att_ws + + def inference( + self, + h, + threshold=0.5, + minlenratio=0.0, + maxlenratio=10.0, + use_att_constraint=False, + backward_window=None, + forward_window=None, ): + """Generate the sequence of features given the sequences of characters. + Args: + h(Tensor): Input sequence of encoder hidden states (T, C). + threshold(float, optional, optional): Threshold to stop generation. (Default value = 0.5) + minlenratio(float, optional, optional): Minimum length ratio. If set to 1.0 and the length of input is 10, + the minimum length of outputs will be 10 * 1 = 10. (Default value = 0.0) + maxlenratio(float, optional, optional): Minimum length ratio. If set to 10 and the length of input is 10, + the maximum length of outputs will be 10 * 10 = 100. 
(Default value = 10.0)
+ use_att_constraint(bool, optional): Whether to apply attention constraint introduced in `Deep Voice 3`_. (Default value = False)
+ backward_window(int, optional): Backward window size in attention constraint. (Default value = None)
+ forward_window(int, optional): Forward window size in attention constraint. (Default value = None)
+
+ Returns:
+ Tensor: Output sequence of features (L, odim).
+ Tensor: Output sequence of stop probabilities (L,).
+ Tensor: Attention weights (L, T).
+
+ Note:
+ This computation is performed in auto-regressive manner.
+ .. _`Deep Voice 3`: https://arxiv.org/abs/1710.07654
+ """
+ # setup
+
+ assert len(paddle.shape(h)) == 2
+ hs = h.unsqueeze(0)
+ ilens = paddle.shape(h)[0]
+ # maxlen and minlen were originally wrapped in int(); the cast is removed here to avoid dygraph-to-static conversion issues
+ maxlen = paddle.shape(h)[0] * maxlenratio
+ minlen = paddle.shape(h)[0] * minlenratio
+ # threshold was originally used as a plain float; it is turned into a tensor here to avoid dygraph-to-static conversion issues
+ threshold = paddle.ones([1]) * threshold
+
+ # initialize hidden states of decoder
+ c_list = [self._zero_state(hs)]
+ z_list = [self._zero_state(hs)]
+ for _ in range(1, len(self.lstm)):
+ c_list.append(self._zero_state(hs))
+ z_list.append(self._zero_state(hs))
+ prev_out = paddle.zeros([1, self.odim])
+
+ # initialize attention
+ prev_att_ws = []
+ prev_att_w = paddle.zeros([ilens])
+ prev_att_ws.append(prev_att_w)
+
+ self.att.reset()
+
+ # setup for attention constraint
+ if use_att_constraint:
+ last_attended_idx = 0
+ else:
+ last_attended_idx = None
+
+ # loop for an output sequence
+ idx = 0
+ outs, att_ws, probs = [], [], []
+ prob = paddle.zeros([1])
+ while True:
+ # updated index
+ idx += self.reduction_factor
+
+ # decoder calculation
+ if self.use_att_extra_inputs:
+ att_c, att_w = self.att(
+ hs,
+ ilens,
+ z_list[0],
+ prev_att_ws[-1],
+ prev_out,
+ last_attended_idx=last_attended_idx,
+ backward_window=backward_window,
+ forward_window=forward_window, )
+ else:
+ att_c, att_w = self.att(
+ hs,
+ ilens,
+ z_list[0],
+ prev_att_ws[-1],
+ last_attended_idx=last_attended_idx,
+ backward_window=backward_window,
+ forward_window=forward_window, )
+
+ att_ws.append(att_w)
+ prenet_out = self.prenet(
+ prev_out) if self.prenet is not None else prev_out
+ xs = paddle.concat([att_c, prenet_out], axis=1)
+ # we only use the second output of LSTMCell in paddle
+ _, next_hidden = self.lstm[0](xs, (z_list[0], c_list[0]))
+
+ z_list[0], c_list[0] = next_hidden
+ for i in range(1, len(self.lstm)):
+ # we only use the second output of LSTMCell in paddle
+ _, next_hidden = self.lstm[i](z_list[i - 1],
+ (z_list[i], c_list[i]))
+ z_list[i], c_list[i] = next_hidden
+ zcs = (paddle.concat([z_list[-1], att_c], axis=1)
+ if self.use_concate else z_list[-1])
+ # [(1, odim, r), ...]
+ outs.append(self.feat_out(zcs).reshape([1, self.odim, -1]))
+
+ prob = F.sigmoid(self.prob_out(zcs))[0]
+ probs.append(prob)
+
+ if self.output_activation_fn is not None:
+ prev_out = self.output_activation_fn(
+ outs[-1][:, :, -1]) # (1, odim)
+ else:
+ prev_out = outs[-1][:, :, -1] # (1, odim)
+ if self.cumulate_att_w and paddle.sum(prev_att_w) != 0:
+ prev_att_w = prev_att_w + att_w # Note: error when use +=
+ else:
+ prev_att_w = att_w
+ prev_att_ws.append(prev_att_w)
+ if use_att_constraint:
+ last_attended_idx = int(att_w.argmax())
+
+ # the tacotron2 ljspeech dygraph-to-static issue is most likely caused by prob >= threshold not being evaluated correctly here
+ if prob >= threshold or idx >= maxlen:
+ # check minimum length
+ if idx < minlen:
+ continue
+ break
+ """
+ Uncommenting only the block at lines 665~667 makes dygraph-to-static conversion hang, while the dynamic graph still generates audio correctly, which shows the model itself is fine.
+ Uncommenting both the blocks at lines 665~667 and 668~670 avoids the hang during dygraph-to-static conversion, but the generated audio ends with extra noise.
+ This shows that the converted model never enters the prob >= threshold branch, while the static graph can enter prob >= threshold and leave the loop.
+ After conversion the loop only exits via idx >= maxlen (so without that logic it loops forever, i.e. hangs):
+ it does not stop when the model decides it should, but only when the maximum length is exceeded, so the synthesized audio ends with a long stretch of extra predicted noise.
+ Using prob <= threshold as the condition does exit the loop after conversion (although the result is wrong), which shows the type of the condition operands is fine; the problem is probably with prob itself.
+ """
+ # if prob >= threshold:
+ # print("prob >= threshold")
+ # break
+ # elif idx >= maxlen:
+ # print("idx >= maxlen")
+ # break
+
+ # (1, odim, L)
+ outs = paddle.concat(outs, axis=2)
+ if self.postnet is not None:
+ # (1, odim, L)
+ outs = outs + self.postnet(outs)
+ # (L, odim)
+ outs = outs.transpose([0, 2, 1]).squeeze(0)
+ probs = paddle.concat(probs, axis=0)
+ att_ws = paddle.concat(att_ws, axis=0)
+
+ if self.output_activation_fn is not None:
+ outs = self.output_activation_fn(outs)
+
+ return outs, probs, att_ws
+
+ def calculate_all_attentions(self, hs, hlens, ys):
+ """Calculate all of the attention weights.
+
+ Args:
+ hs (Tensor): Batch of the sequences of padded hidden states (B, Tmax, idim).
+ hlens (Tensor(int64)): Batch of lengths of each input batch (B,).
+ ys (Tensor): Batch of the sequences of padded target features (B, Lmax, odim).
+
+ Returns:
+ numpy.ndarray:
+ Batch of attention weights (B, Lmax, Tmax).
+
+ Note:
+ This computation is performed in teacher-forcing manner.
+ """ + # thin out frames (B, Lmax, odim) -> (B, Lmax/r, odim) + if self.reduction_factor > 1: + ys = ys[:, self.reduction_factor - 1::self.reduction_factor] + + # length list should be list of int + hlens = list(map(int, hlens)) + + # initialize hidden states of decoder + c_list = [self._zero_state(hs)] + z_list = [self._zero_state(hs)] + for _ in range(1, len(self.lstm)): + c_list.append(self._zero_state(hs)) + z_list.append(self._zero_state(hs)) + prev_out = paddle.zeros([paddle.shape(hs)[0], self.odim]) + + # initialize attention + prev_att_w = None + self.att.reset() + + # loop for an output sequence + att_ws = [] + for y in ys.transpose([1, 0, 2]): + if self.use_att_extra_inputs: + att_c, att_w = self.att(hs, hlens, z_list[0], prev_att_w, + prev_out) + else: + att_c, att_w = self.att(hs, hlens, z_list[0], prev_att_w) + att_ws.append(att_w) + prenet_out = self.prenet( + prev_out) if self.prenet is not None else prev_out + xs = paddle.concat([att_c, prenet_out], axis=1) + # we only use the second output of LSTMCell in paddle + _, next_hidden = self.lstm[0](xs, (z_list[0], c_list[0])) + z_list[0], c_list[0] = next_hidden + for i in range(1, len(self.lstm)): + z_list[i], c_list[i] = self.lstm[i](z_list[i - 1], + (z_list[i], c_list[i])) + # teacher forcing + prev_out = y + if self.cumulate_att_w and prev_att_w is not None: + # Note: error when use += + prev_att_w = prev_att_w + att_w + else: + prev_att_w = att_w + # (B, Lmax, Tmax) + att_ws = paddle.stack(att_ws, axis=1) + + return att_ws diff --git a/ernie-sat/paddlespeech/t2s/modules/tacotron2/encoder.py b/ernie-sat/paddlespeech/t2s/modules/tacotron2/encoder.py new file mode 100644 index 0000000000000000000000000000000000000000..db102a115a067a0c9872cf0bebceb355711da482 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/modules/tacotron2/encoder.py @@ -0,0 +1,174 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from espnet(https://github.com/espnet/espnet) +"""Tacotron2 encoder related modules.""" +import paddle +from paddle import nn + + +class Encoder(nn.Layer): + """Encoder module of Spectrogram prediction network. + + This is a module of encoder of Spectrogram prediction network in Tacotron2, + which described in `Natural TTS Synthesis by Conditioning WaveNet on Mel + Spectrogram Predictions`_. This is the encoder which converts either a sequence + of characters or acoustic features into the sequence of hidden states. + + .. _`Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions`: + https://arxiv.org/abs/1712.05884 + + """ + + def __init__( + self, + idim, + input_layer="embed", + embed_dim=512, + elayers=1, + eunits=512, + econv_layers=3, + econv_chans=512, + econv_filts=5, + use_batch_norm=True, + use_residual=False, + dropout_rate=0.5, + padding_idx=0, ): + """Initialize Tacotron2 encoder module. + Args: + idim (int): Dimension of the inputs. + input_layer (str): Input layer type. 
+ embed_dim (int, optional): Dimension of character embedding. + elayers (int, optional): The number of encoder blstm layers. + eunits (int, optional): The number of encoder blstm units. + econv_layers (int, optional): The number of encoder conv layers. + econv_filts (int, optional): The number of encoder conv filter size. + econv_chans (int, optional): The number of encoder conv filter channels. + use_batch_norm (bool, optional): Whether to use batch normalization. + use_residual (bool, optional): Whether to use residual connection. + dropout_rate (float, optional): Dropout rate. + + """ + super().__init__() + # store the hyperparameters + self.idim = idim + self.use_residual = use_residual + + # define network layer modules + if input_layer == "linear": + self.embed = nn.Linear(idim, econv_chans) + elif input_layer == "embed": + self.embed = nn.Embedding(idim, embed_dim, padding_idx=padding_idx) + else: + raise ValueError("unknown input_layer: " + input_layer) + + if econv_layers > 0: + self.convs = nn.LayerList() + for layer in range(econv_layers): + ichans = (embed_dim if layer == 0 and input_layer == "embed" + else econv_chans) + if use_batch_norm: + self.convs.append( + nn.Sequential( + nn.Conv1D( + ichans, + econv_chans, + econv_filts, + stride=1, + padding=(econv_filts - 1) // 2, + bias_attr=False, ), + nn.BatchNorm1D(econv_chans), + nn.ReLU(), + nn.Dropout(dropout_rate), )) + else: + self.convs += [ + nn.Sequential( + nn.Conv1D( + ichans, + econv_chans, + econv_filts, + stride=1, + padding=(econv_filts - 1) // 2, + bias_attr=False, ), + nn.ReLU(), + nn.Dropout(dropout_rate), ) + ] + else: + self.convs = None + if elayers > 0: + iunits = econv_chans if econv_layers != 0 else embed_dim + # batch_first=True, bidirectional=True + self.blstm = nn.LSTM( + iunits, + eunits // 2, + elayers, + time_major=False, + direction='bidirectional', + bias_ih_attr=True, + bias_hh_attr=True) + self.blstm.flatten_parameters() + else: + self.blstm = None + + # # initialize + # self.apply(encoder_init) + + def forward(self, xs, ilens=None): + """Calculate forward propagation. + + Args: + xs (Tensor): Batch of the padded sequence. Either character ids (B, Tmax) + or acoustic feature (B, Tmax, idim * encoder_reduction_factor). + Padded value should be 0. + ilens (Tensor(int64)): Batch of lengths of each input batch (B,). + + Returns: + Tensor: Batch of the sequences of encoder states(B, Tmax, eunits). + Tensor(int64): Batch of lengths of each sequence (B,) + """ + xs = self.embed(xs).transpose([0, 2, 1]) + if self.convs is not None: + for i in range(len(self.convs)): + if self.use_residual: + xs += self.convs[i](xs) + else: + xs = self.convs[i](xs) + if self.blstm is None: + return xs.transpose([0, 2, 1]) + if not isinstance(ilens, paddle.Tensor): + ilens = paddle.to_tensor(ilens) + xs = xs.transpose([0, 2, 1]) + # for dygraph to static graph + # self.blstm.flatten_parameters() + # (B, Tmax, C) + # see https://www.paddlepaddle.org.cn/documentation/docs/zh/faq/train_cn.html#paddletorch-nn-utils-rnn-pack-padded-sequencetorch-nn-utils-rnn-pad-packed-sequenceapi + xs, _ = self.blstm(xs, sequence_length=ilens) + hlens = ilens + + return xs, hlens + + def inference(self, x): + """Inference. + + Args: + x (Tensor): The sequeunce of character ids (T,) + or acoustic feature (T, idim * encoder_reduction_factor). + + Returns: + Tensor: The sequences of encoder states(T, eunits). 
+ + """ + xs = x.unsqueeze(0) + ilens = paddle.shape(x)[0] + + return self.forward(xs, ilens)[0][0] diff --git a/ernie-sat/paddlespeech/t2s/modules/tade_res_block.py b/ernie-sat/paddlespeech/t2s/modules/tade_res_block.py new file mode 100644 index 0000000000000000000000000000000000000000..b2275e2361405c81542042d92ea161c5dc6bb4bf --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/modules/tade_res_block.py @@ -0,0 +1,157 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from espnet(https://github.com/espnet/espnet) +"""StyleMelGAN's TADEResBlock Modules.""" +from functools import partial + +import paddle.nn.functional as F +from paddle import nn + + +class TADELayer(nn.Layer): + """TADE Layer module.""" + + def __init__( + self, + in_channels: int=64, + aux_channels: int=80, + kernel_size: int=9, + bias: bool=True, + upsample_factor: int=2, + upsample_mode: str="nearest", ): + """Initilize TADE layer.""" + super().__init__() + self.norm = nn.InstanceNorm1D( + in_channels, + momentum=0.1, + data_format="NCL", + weight_attr=False, + bias_attr=False) + self.aux_conv = nn.Sequential( + nn.Conv1D( + aux_channels, + in_channels, + kernel_size, + 1, + bias_attr=bias, + padding=(kernel_size - 1) // 2, ), ) + self.gated_conv = nn.Sequential( + nn.Conv1D( + in_channels, + in_channels * 2, + kernel_size, + 1, + bias_attr=bias, + padding=(kernel_size - 1) // 2, ), ) + self.upsample = nn.Upsample( + scale_factor=upsample_factor, mode=upsample_mode) + + def forward(self, x, c): + """Calculate forward propagation. + Args: + x (Tensor): Input tensor (B, in_channels, T). + c (Tensor): Auxiliary input tensor (B, aux_channels, T). + Returns: + Tensor: Output tensor (B, in_channels, T * upsample_factor). + Tensor: Upsampled aux tensor (B, in_channels, T * upsample_factor). + """ + + x = self.norm(x) + # 'bilinear', 'bicubic' and 'nearest' only support 4-D tensor. + c = self.upsample(c.unsqueeze(-1)) + c = c[:, :, :, 0] + + c = self.aux_conv(c) + cg = self.gated_conv(c) + cg1, cg2 = cg.split(2, axis=1) + # 'bilinear', 'bicubic' and 'nearest' only support 4-D tensor. 
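+        # cg1 and cg2 act as the scale and shift of the TADE modulation: the
+        # instance-normalized (and upsampled) x is multiplied by cg1 and shifted by cg2,
+        # both predicted from the auxiliary features c.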
+ y = cg1 * self.upsample(x.unsqueeze(-1))[:, :, :, 0] + cg2 + return y, c + + +class TADEResBlock(nn.Layer): + """TADEResBlock module.""" + + def __init__( + self, + in_channels: int=64, + aux_channels: int=80, + kernel_size: int=9, + dilation: int=2, + bias: bool=True, + upsample_factor: int=2, + # this is a diff in paddle, the mode only can be "linear" when input is 3D + upsample_mode: str="nearest", + gated_function: str="softmax", ): + """Initialize TADEResBlock module.""" + super().__init__() + self.tade1 = TADELayer( + in_channels=in_channels, + aux_channels=aux_channels, + kernel_size=kernel_size, + bias=bias, + upsample_factor=1, + upsample_mode=upsample_mode, ) + self.gated_conv1 = nn.Conv1D( + in_channels, + in_channels * 2, + kernel_size, + 1, + bias_attr=bias, + padding=(kernel_size - 1) // 2, ) + self.tade2 = TADELayer( + in_channels=in_channels, + aux_channels=in_channels, + kernel_size=kernel_size, + bias=bias, + upsample_factor=upsample_factor, + upsample_mode=upsample_mode, ) + self.gated_conv2 = nn.Conv1D( + in_channels, + in_channels * 2, + kernel_size, + 1, + bias_attr=bias, + dilation=dilation, + padding=(kernel_size - 1) // 2 * dilation, ) + self.upsample = nn.Upsample( + scale_factor=upsample_factor, mode=upsample_mode) + if gated_function == "softmax": + self.gated_function = partial(F.softmax, axis=1) + elif gated_function == "sigmoid": + self.gated_function = F.sigmoid + else: + raise ValueError(f"{gated_function} is not supported.") + + def forward(self, x, c): + """Calculate forward propagation. + Args: + + x (Tensor): Input tensor (B, in_channels, T). + c (Tensor): Auxiliary input tensor (B, aux_channels, T). + Returns: + Tensor: Output tensor (B, in_channels, T * upsample_factor). + Tensor: Upsampled auxirialy tensor (B, in_channels, T * upsample_factor). + """ + residual = x + x, c = self.tade1(x, c) + x = self.gated_conv1(x) + xa, xb = x.split(2, axis=1) + x = self.gated_function(xa) * F.tanh(xb) + x, c = self.tade2(x, c) + x = self.gated_conv2(x) + xa, xb = x.split(2, axis=1) + x = self.gated_function(xa) * F.tanh(xb) + # 'bilinear', 'bicubic' and 'nearest' only support 4-D tensor. + return self.upsample(residual.unsqueeze(-1))[:, :, :, 0] + x, c diff --git a/ernie-sat/paddlespeech/t2s/modules/transformer/__init__.py b/ernie-sat/paddlespeech/t2s/modules/transformer/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..abf198b97e6e818e1fbe59006f98492640bcee54 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/modules/transformer/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
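A minimal shape-check sketch for the TADEResBlock above, assuming the repository layout in this patch makes the module importable as paddlespeech.t2s.modules.tade_res_block (names and shapes here are illustrative, not part of the patch):

```python
import paddle

from paddlespeech.t2s.modules.tade_res_block import TADEResBlock

# default block: 64 input channels, 80 auxiliary (mel) channels, 2x upsampling
block = TADEResBlock(in_channels=64, aux_channels=80, upsample_factor=2)

x = paddle.randn([2, 64, 100])  # (B, in_channels, T)
c = paddle.randn([2, 80, 100])  # (B, aux_channels, T)
y, c_up = block(x, c)

print(y.shape)     # [2, 64, 200] -> T upsampled by upsample_factor
print(c_up.shape)  # [2, 64, 200] -> aux features mapped to in_channels and upsampled
```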
diff --git a/ernie-sat/paddlespeech/t2s/modules/transformer/attention.py b/ernie-sat/paddlespeech/t2s/modules/transformer/attention.py new file mode 100644 index 0000000000000000000000000000000000000000..cdb95b211ab9a60fad27c64fad6bb4dca86ffb3a --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/modules/transformer/attention.py @@ -0,0 +1,222 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from espnet(https://github.com/espnet/espnet) +"""Multi-Head Attention layer definition.""" +import math + +import numpy +import paddle +from paddle import nn + +from paddlespeech.t2s.modules.masked_fill import masked_fill + + +class MultiHeadedAttention(nn.Layer): + """Multi-Head Attention layer. + Args: + n_head (int): The number of heads. + n_feat (int): The number of features. + dropout_rate (float): Dropout rate. + """ + + def __init__(self, n_head, n_feat, dropout_rate): + """Construct an MultiHeadedAttention object.""" + super().__init__() + assert n_feat % n_head == 0 + # We assume d_v always equals d_k + self.d_k = n_feat // n_head + self.h = n_head + self.linear_q = nn.Linear(n_feat, n_feat, bias_attr=True) + self.linear_k = nn.Linear(n_feat, n_feat, bias_attr=True) + self.linear_v = nn.Linear(n_feat, n_feat, bias_attr=True) + self.linear_out = nn.Linear(n_feat, n_feat, bias_attr=True) + self.attn = None + self.dropout = nn.Dropout(p=dropout_rate) + + def forward_qkv(self, query, key, value): + """Transform query, key and value. + + Args: + query(Tensor): query tensor (#batch, time1, size). + key(Tensor): Key tensor (#batch, time2, size). + value(Tensor): Value tensor (#batch, time2, size). + + Returns: + Tensor: Transformed query tensor (#batch, n_head, time1, d_k). + Tensor: Transformed key tensor (#batch, n_head, time2, d_k). + Tensor: Transformed value tensor (#batch, n_head, time2, d_k). + """ + n_batch = paddle.shape(query)[0] + + q = paddle.reshape( + self.linear_q(query), [n_batch, -1, self.h, self.d_k]) + k = paddle.reshape(self.linear_k(key), [n_batch, -1, self.h, self.d_k]) + v = paddle.reshape( + self.linear_v(value), [n_batch, -1, self.h, self.d_k]) + + # (batch, head, time1, d_k) + q = q.transpose((0, 2, 1, 3)) + # (batch, head, time2, d_k) + k = k.transpose((0, 2, 1, 3)) + # (batch, head, time2, d_k) + v = v.transpose((0, 2, 1, 3)) + return q, k, v + + def forward_attention(self, value, scores, mask=None): + """Compute attention context vector. + + Args: + value(Tensor): Transformed value (#batch, n_head, time2, d_k). + scores(Tensor): Attention score (#batch, n_head, time1, time2). + mask(Tensor, optional): Mask (#batch, 1, time2) or (#batch, time1, time2). (Default value = None) + + Returns: + Tensor: Transformed value (#batch, time1, d_model) weighted by the attention score (#batch, time1, time2). 
+ """ + n_batch = paddle.shape(value)[0] + softmax = paddle.nn.Softmax(axis=-1) + if mask is not None: + mask = mask.unsqueeze(1) + mask = paddle.logical_not(mask) + # assume scores.dtype==paddle.float32, we only use "float32" here + dtype = str(scores.dtype).split(".")[-1] + min_value = numpy.finfo(dtype).min + scores = masked_fill(scores, mask, min_value) + # (batch, head, time1, time2) + self.attn = softmax(scores) + self.attn = masked_fill(self.attn, mask, 0.0) + else: + # (batch, head, time1, time2) + self.attn = softmax(scores) + # (batch, head, time1, time2) + p_attn = self.dropout(self.attn) + # (batch, head, time1, time2) * (batch, head, time2, d_k) -> # (batch, head, time1, d_k) + x = paddle.matmul(p_attn, value) + # (batch, time1, d_model) + x = (paddle.reshape( + x.transpose((0, 2, 1, 3)), (n_batch, -1, self.h * self.d_k))) + # (batch, time1, d_model) + return self.linear_out(x) + + def forward(self, query, key, value, mask=None): + """Compute scaled dot product attention. + + Args: + query(Tensor): Query tensor (#batch, time1, size). + key(Tensor): Key tensor (#batch, time2, size). + value(Tensor): Value tensor (#batch, time2, size). + mask(Tensor, optional): Mask tensor (#batch, 1, time2) or (#batch, time1, time2). (Default value = None) + + Returns: + Tensor: Output tensor (#batch, time1, d_model). + """ + q, k, v = self.forward_qkv(query, key, value) + scores = paddle.matmul(q, k.transpose( + (0, 1, 3, 2))) / math.sqrt(self.d_k) + + return self.forward_attention(v, scores, mask) + + +class RelPositionMultiHeadedAttention(MultiHeadedAttention): + """Multi-Head Attention layer with relative position encoding (new implementation). + Details can be found in https://github.com/espnet/espnet/pull/2816. + Paper: https://arxiv.org/abs/1901.02860 + + Args: + n_head (int): The number of heads. + n_feat (int): The number of features. + dropout_rate (float): Dropout rate. + zero_triu (bool): Whether to zero the upper triangular part of attention matrix. + """ + + def __init__(self, n_head, n_feat, dropout_rate, zero_triu=False): + """Construct an RelPositionMultiHeadedAttention object.""" + super().__init__(n_head, n_feat, dropout_rate) + self.zero_triu = zero_triu + # linear transformation for positional encoding + self.linear_pos = nn.Linear(n_feat, n_feat, bias_attr=False) + # these two learnable bias are used in matrix c and matrix d + # as described in https://arxiv.org/abs/1901.02860 Section 3.3 + + self.pos_bias_u = paddle.create_parameter( + shape=(self.h, self.d_k), + dtype='float32', + default_initializer=paddle.nn.initializer.XavierUniform()) + self.pos_bias_v = paddle.create_parameter( + shape=(self.h, self.d_k), + dtype='float32', + default_initializer=paddle.nn.initializer.XavierUniform()) + + def rel_shift(self, x): + """Compute relative positional encoding. + Args: + x(Tensor): Input tensor (batch, head, time1, 2*time1-1). + + Returns: + Tensor:Output tensor. + """ + b, h, t1, t2 = paddle.shape(x) + zero_pad = paddle.zeros((b, h, t1, 1)) + x_padded = paddle.concat([zero_pad, x], axis=-1) + x_padded = x_padded.reshape([b, h, t2 + 1, t1]) + # only keep the positions from 0 to time2 + x = x_padded[:, :, 1:].reshape([b, h, t1, t2])[:, :, :, :t2 // 2 + 1] + + if self.zero_triu: + ones = paddle.ones((t1, t2)) + x = x * paddle.tril(ones, t2 - 1)[None, None, :, :] + + return x + + def forward(self, query, key, value, pos_emb, mask): + """Compute 'Scaled Dot Product Attention' with rel. positional encoding. + + Args: + query(Tensor): Query tensor (#batch, time1, size). 
+ key(Tensor): Key tensor (#batch, time2, size). + value(Tensor): Value tensor (#batch, time2, size). + pos_emb(Tensor): Positional embedding tensor (#batch, 2*time1-1, size). + mask(Tensor): Mask tensor (#batch, 1, time2) or (#batch, time1, time2). + + Returns: + Tensor: Output tensor (#batch, time1, d_model). + """ + q, k, v = self.forward_qkv(query, key, value) + # (batch, time1, head, d_k) + q = q.transpose([0, 2, 1, 3]) + + n_batch_pos = paddle.shape(pos_emb)[0] + p = self.linear_pos(pos_emb).reshape( + [n_batch_pos, -1, self.h, self.d_k]) + # (batch, head, 2*time1-1, d_k) + p = p.transpose([0, 2, 1, 3]) + # (batch, head, time1, d_k) + q_with_bias_u = (q + self.pos_bias_u).transpose([0, 2, 1, 3]) + # (batch, head, time1, d_k) + q_with_bias_v = (q + self.pos_bias_v).transpose([0, 2, 1, 3]) + + # compute attention score + # first compute matrix a and matrix c + # as described in https://arxiv.org/abs/1901.02860 Section 3.3 + # (batch, head, time1, time2) + matrix_ac = paddle.matmul(q_with_bias_u, k.transpose([0, 1, 3, 2])) + + # compute matrix b and matrix d + # (batch, head, time1, 2*time1-1) + matrix_bd = paddle.matmul(q_with_bias_v, p.transpose([0, 1, 3, 2])) + matrix_bd = self.rel_shift(matrix_bd) + # (batch, head, time1, time2) + scores = (matrix_ac + matrix_bd) / math.sqrt(self.d_k) + + return self.forward_attention(v, scores, mask) diff --git a/ernie-sat/paddlespeech/t2s/modules/transformer/decoder.py b/ernie-sat/paddlespeech/t2s/modules/transformer/decoder.py new file mode 100644 index 0000000000000000000000000000000000000000..a8db7345ad07b336debee14ff692cfe4a363a1dd --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/modules/transformer/decoder.py @@ -0,0 +1,250 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from espnet(https://github.com/espnet/espnet) +# 暂时删除了 dyminic conv +"""Decoder definition.""" +import logging +from typing import Any +from typing import List +from typing import Tuple + +import paddle +import paddle.nn.functional as F +from paddle import nn + +from paddlespeech.t2s.modules.layer_norm import LayerNorm +from paddlespeech.t2s.modules.transformer.attention import MultiHeadedAttention +from paddlespeech.t2s.modules.transformer.decoder_layer import DecoderLayer +from paddlespeech.t2s.modules.transformer.embedding import PositionalEncoding +from paddlespeech.t2s.modules.transformer.lightconv import LightweightConvolution +from paddlespeech.t2s.modules.transformer.mask import subsequent_mask +from paddlespeech.t2s.modules.transformer.positionwise_feed_forward import PositionwiseFeedForward +from paddlespeech.t2s.modules.transformer.repeat import repeat + + +class Decoder(nn.Layer): + """Transfomer decoder module. + + Args: + odim (int): Output diminsion. + self_attention_layer_type (str): Self-attention layer type. + attention_dim (int): Dimention of attention. + attention_heads (int): The number of heads of multi head attention. + conv_wshare (int): The number of kernel of convolution. 
Only used in + self_attention_layer_type == "lightconv*" or "dynamiconv*". + conv_kernel_length (Union[int, str]):Kernel size str of convolution + (e.g. 71_71_71_71_71_71). Only used in self_attention_layer_type == "lightconv*" or "dynamiconv*". + conv_usebias (bool): Whether to use bias in convolution. Only used in + self_attention_layer_type == "lightconv*" or "dynamiconv*". + linear_units(int): The number of units of position-wise feed forward. + num_blocks (int): The number of decoder blocks. + dropout_rate (float): Dropout rate. + positional_dropout_rate (float): Dropout rate after adding positional encoding. + self_attention_dropout_rate (float): Dropout rate in self-attention. + src_attention_dropout_rate (float): Dropout rate in source-attention. + input_layer (Union[str, nn.Layer]): Input layer type. + use_output_layer (bool): Whether to use output layer. + pos_enc_class (nn.Layer): Positional encoding module class. + `PositionalEncoding `or `ScaledPositionalEncoding` + normalize_before (bool): Whether to use layer_norm before the first block. + concat_after (bool): Whether to concat attention layer's input and output. + if True, additional linear will be applied. + i.e. x -> x + linear(concat(x, att(x))) + if False, no additional linear will be applied. i.e. x -> x + att(x) + + """ + + def __init__( + self, + odim, + selfattention_layer_type="selfattn", + attention_dim=256, + attention_heads=4, + conv_wshare=4, + conv_kernel_length=11, + conv_usebias=False, + linear_units=2048, + num_blocks=6, + dropout_rate=0.1, + positional_dropout_rate=0.1, + self_attention_dropout_rate=0.0, + src_attention_dropout_rate=0.0, + input_layer="embed", + use_output_layer=True, + pos_enc_class=PositionalEncoding, + normalize_before=True, + concat_after=False, ): + """Construct an Decoder object.""" + nn.Layer.__init__(self) + if input_layer == "embed": + self.embed = nn.Sequential( + nn.Embedding(odim, attention_dim), + pos_enc_class(attention_dim, positional_dropout_rate), ) + elif input_layer == "linear": + self.embed = nn.Sequential( + nn.Linear(odim, attention_dim), + nn.LayerNorm(attention_dim), + nn.Dropout(dropout_rate), + nn.ReLU(), + pos_enc_class(attention_dim, positional_dropout_rate), ) + elif isinstance(input_layer, nn.Layer): + self.embed = nn.Sequential( + input_layer, + pos_enc_class(attention_dim, positional_dropout_rate)) + else: + raise NotImplementedError("only `embed` or nn.Layer is supported.") + self.normalize_before = normalize_before + + # self-attention module definition + if selfattention_layer_type == "selfattn": + logging.info("decoder self-attention layer type = self-attention") + decoder_selfattn_layer = MultiHeadedAttention + decoder_selfattn_layer_args = [ + (attention_heads, attention_dim, self_attention_dropout_rate, ) + ] * num_blocks + elif selfattention_layer_type == "lightconv": + logging.info( + "decoder self-attention layer type = lightweight convolution") + decoder_selfattn_layer = LightweightConvolution + decoder_selfattn_layer_args = [( + conv_wshare, attention_dim, self_attention_dropout_rate, + int(conv_kernel_length.split("_")[lnum]), True, conv_usebias, ) + for lnum in range(num_blocks)] + + self.decoders = repeat( + num_blocks, + lambda lnum: DecoderLayer( + attention_dim, + decoder_selfattn_layer(*decoder_selfattn_layer_args[lnum]), + MultiHeadedAttention(attention_heads, attention_dim, src_attention_dropout_rate), + PositionwiseFeedForward(attention_dim, linear_units, dropout_rate), + dropout_rate, + normalize_before, + concat_after, ), ) + 
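+        # NB: `repeat(num_blocks, ...)` (defined in transformer/repeat.py below)
+        # returns a MultiSequential holding `num_blocks` DecoderLayer instances,
+        # so calling self.decoders(x, tgt_mask, memory, memory_mask) threads all
+        # four arguments through every decoder block in order.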
self.selfattention_layer_type = selfattention_layer_type + if self.normalize_before: + self.after_norm = LayerNorm(attention_dim) + if use_output_layer: + self.output_layer = nn.Linear(attention_dim, odim) + else: + self.output_layer = None + + def forward(self, tgt, tgt_mask, memory, memory_mask): + """Forward decoder. + Args: + tgt(Tensor): Input token ids, int64 (#batch, maxlen_out) if input_layer == "embed". + In the other case, input tensor (#batch, maxlen_out, odim). + tgt_mask(Tensor): Input token mask (#batch, maxlen_out). + memory(Tensor): Encoded memory, float32 (#batch, maxlen_in, feat). + memory_mask(Tensor): Encoded memory mask (#batch, maxlen_in). + + Returns: + Tensor: + Decoded token score before softmax (#batch, maxlen_out, odim) if use_output_layer is True. + In the other case,final block outputs (#batch, maxlen_out, attention_dim). + Tensor: Score mask before softmax (#batch, maxlen_out). + + """ + x = self.embed(tgt) + x, tgt_mask, memory, memory_mask = self.decoders(x, tgt_mask, memory, + memory_mask) + if self.normalize_before: + x = self.after_norm(x) + if self.output_layer is not None: + x = self.output_layer(x) + return x, tgt_mask + + def forward_one_step(self, tgt, tgt_mask, memory, cache=None): + """Forward one step. + + Args: + tgt(Tensor): Input token ids, int64 (#batch, maxlen_out). + tgt_mask(Tensor): Input token mask (#batch, maxlen_out). + memory(Tensor): Encoded memory, float32 (#batch, maxlen_in, feat). + cache((List[Tensor]), optional): List of cached tensors. (Default value = None) + + Returns: + Tensor: Output tensor (batch, maxlen_out, odim). + List[Tensor]: List of cache tensors of each decoder layer. + + """ + x = self.embed(tgt) + if cache is None: + cache = [None] * len(self.decoders) + new_cache = [] + for c, decoder in zip(cache, self.decoders): + x, tgt_mask, memory, memory_mask = decoder( + x, tgt_mask, memory, None, cache=c) + new_cache.append(x) + + if self.normalize_before: + y = self.after_norm(x[:, -1]) + else: + y = x[:, -1] + if self.output_layer is not None: + y = F.log_softmax(self.output_layer(y), axis=-1) + + return y, new_cache + + # beam search API (see ScorerInterface) + def score(self, ys, state, x): + """Score.""" + ys_mask = subsequent_mask(len(ys)).unsqueeze(0) + if self.selfattention_layer_type != "selfattn": + # TODO(karita): implement cache + logging.warning( + f"{self.selfattention_layer_type} does not support cached decoding." + ) + state = None + logp, state = self.forward_one_step( + ys.unsqueeze(0), ys_mask, x.unsqueeze(0), cache=state) + return logp.squeeze(0), state + + # batch beam search API (see BatchScorerInterface) + def batch_score(self, + ys: paddle.Tensor, + states: List[Any], + xs: paddle.Tensor) -> Tuple[paddle.Tensor, List[Any]]: + """Score new token batch (required). + + Args: + ys(Tensor): paddle.int64 prefix tokens (n_batch, ylen). + states(List[Any]): Scorer states for prefix tokens. + xs(Tensor): The encoder feature that generates ys (n_batch, xlen, n_feat). + + Returns: + tuple[Tensor, List[Any]]: + Tuple ofbatchfied scores for next token with shape of `(n_batch, n_vocab)` and next state list for ys. 
+ + """ + # merge states + n_batch = len(ys) + n_layers = len(self.decoders) + if states[0] is None: + batch_state = None + else: + # transpose state of [batch, layer] into [layer, batch] + batch_state = [ + paddle.stack([states[b][i] for b in range(n_batch)]) + for i in range(n_layers) + ] + + # batch decoding + ys_mask = subsequent_mask(ys.shape[-1]).unsqueeze(0) + logp, states = self.forward_one_step(ys, ys_mask, xs, cache=batch_state) + + # transpose state of [layer, batch] into [batch, layer] + state_list = [[states[i][b] for i in range(n_layers)] + for b in range(n_batch)] + return logp, state_list diff --git a/ernie-sat/paddlespeech/t2s/modules/transformer/decoder_layer.py b/ernie-sat/paddlespeech/t2s/modules/transformer/decoder_layer.py new file mode 100644 index 0000000000000000000000000000000000000000..9a13cd794c52cdfab8e7e5ae4cc3aa7842a71688 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/modules/transformer/decoder_layer.py @@ -0,0 +1,144 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from espnet(https://github.com/espnet/espnet) +"""Decoder self-attention layer definition.""" +import paddle +from paddle import nn + +from paddlespeech.t2s.modules.layer_norm import LayerNorm + + +class DecoderLayer(nn.Layer): + """Single decoder layer module. + + + Args: + size (int): Input dimension. + self_attn (nn.Layer): Self-attention module instance. + `MultiHeadedAttention` instance can be used as the argument. + src_attn (nn.Layer): Self-attention module instance. + `MultiHeadedAttention` instance can be used as the argument. + feed_forward (nn.Layer): Feed-forward module instance. + `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance can be used as the argument. + dropout_rate (float): Dropout rate. + normalize_before (bool): Whether to use layer_norm before the first block. + concat_after (bool): Whether to concat attention layer's input and output. + if True, additional linear will be applied. + i.e. x -> x + linear(concat(x, att(x))) + if False, no additional linear will be applied. i.e. x -> x + att(x) + + """ + + def __init__( + self, + size, + self_attn, + src_attn, + feed_forward, + dropout_rate, + normalize_before=True, + concat_after=False, ): + """Construct an DecoderLayer object.""" + super().__init__() + self.size = size + self.self_attn = self_attn + self.src_attn = src_attn + self.feed_forward = feed_forward + self.norm1 = LayerNorm(size) + self.norm2 = LayerNorm(size) + self.norm3 = LayerNorm(size) + self.dropout = nn.Dropout(dropout_rate) + self.normalize_before = normalize_before + self.concat_after = concat_after + if self.concat_after: + self.concat_linear1 = nn.Linear(size + size, size) + self.concat_linear2 = nn.Linear(size + size, size) + + def forward(self, tgt, tgt_mask, memory, memory_mask, cache=None): + """Compute decoded features. + + Args: + tgt(Tensor): Input tensor (#batch, maxlen_out, size). + tgt_mask(Tensor): Mask for input tensor (#batch, maxlen_out). 
+ memory(Tensor): Encoded memory, float32 (#batch, maxlen_in, size). + memory_mask(Tensor): Encoded memory mask (#batch, maxlen_in). + cache(List[Tensor], optional): List of cached tensors. + Each tensor shape should be (#batch, maxlen_out - 1, size). (Default value = None) + Returns: + Tensor + Output tensor(#batch, maxlen_out, size). + Tensor + Mask for output tensor (#batch, maxlen_out). + Tensor + Encoded memory (#batch, maxlen_in, size). + Tensor + Encoded memory mask (#batch, maxlen_in). + + """ + residual = tgt + if self.normalize_before: + tgt = self.norm1(tgt) + + if cache is None: + tgt_q = tgt + tgt_q_mask = tgt_mask + else: + # compute only the last frame query keeping dim: max_time_out -> 1 + assert cache.shape == [ + tgt.shape[0], + tgt.shape[1] - 1, + self.size, + ], f"{cache.shape} == {(tgt.shape[0], tgt.shape[1] - 1, self.size)}" + tgt_q = tgt[:, -1:, :] + residual = residual[:, -1:, :] + tgt_q_mask = None + if tgt_mask is not None: + tgt_mask = paddle.cast(tgt_mask, dtype="int64") + tgt_q_mask = tgt_mask[:, -1:, :] + tgt_q_mask = paddle.cast(tgt_q_mask, dtype="bool") + + if self.concat_after: + tgt_concat = paddle.concat( + (tgt_q, self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)), axis=-1) + x = residual + self.concat_linear1(tgt_concat) + else: + x = residual + self.dropout( + self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)) + if not self.normalize_before: + x = self.norm1(x) + + residual = x + if self.normalize_before: + x = self.norm2(x) + if self.concat_after: + x_concat = paddle.concat( + (x, self.src_attn(x, memory, memory, memory_mask)), axis=-1) + x = residual + self.concat_linear2(x_concat) + else: + x = residual + self.dropout( + self.src_attn(x, memory, memory, memory_mask)) + if not self.normalize_before: + x = self.norm2(x) + + residual = x + if self.normalize_before: + x = self.norm3(x) + x = residual + self.dropout(self.feed_forward(x)) + if not self.normalize_before: + x = self.norm3(x) + + if cache is not None: + x = paddle.concat([cache, x], axis=1) + + return x, tgt_mask, memory, memory_mask diff --git a/ernie-sat/paddlespeech/t2s/modules/transformer/embedding.py b/ernie-sat/paddlespeech/t2s/modules/transformer/embedding.py new file mode 100644 index 0000000000000000000000000000000000000000..d9339d20bb79c1732e453b5b17f9ef127b8a687a --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/modules/transformer/embedding.py @@ -0,0 +1,187 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from espnet(https://github.com/espnet/espnet) +"""Positional Encoding Module.""" +import math + +import paddle +from paddle import nn + + +class PositionalEncoding(nn.Layer): + """Positional encoding. + + Args: + d_model (int): Embedding dimension. + dropout_rate (float): Dropout rate. + max_len (int): Maximum input length. + reverse (bool): Whether to reverse the input position. 
+ type (str): dtype of param + """ + + def __init__(self, + d_model, + dropout_rate, + max_len=5000, + dtype="float32", + reverse=False): + """Construct an PositionalEncoding object.""" + super().__init__() + self.d_model = d_model + self.reverse = reverse + self.xscale = math.sqrt(self.d_model) + self.dropout = nn.Dropout(p=dropout_rate) + self.pe = None + self.dtype = dtype + self.extend_pe(paddle.expand(paddle.zeros([1]), (1, max_len))) + + def extend_pe(self, x): + """Reset the positional encodings.""" + x_shape = paddle.shape(x) + pe = paddle.zeros([x_shape[1], self.d_model]) + if self.reverse: + position = paddle.arange( + x_shape[1] - 1, -1, -1.0, dtype=self.dtype).unsqueeze(1) + else: + position = paddle.arange( + 0, x_shape[1], dtype=self.dtype).unsqueeze(1) + div_term = paddle.exp( + paddle.arange(0, self.d_model, 2, dtype=self.dtype) * + -(math.log(10000.0) / self.d_model)) + pe[:, 0::2] = paddle.sin(position * div_term) + pe[:, 1::2] = paddle.cos(position * div_term) + pe = pe.unsqueeze(0) + self.pe = pe + + def forward(self, x: paddle.Tensor): + """Add positional encoding. + + Args: + x (Tensor): Input tensor (batch, time, `*`). + + Returns: + Tensor: Encoded tensor (batch, time, `*`). + """ + self.extend_pe(x) + T = paddle.shape(x)[1] + x = x * self.xscale + self.pe[:, :T] + return self.dropout(x) + + +class ScaledPositionalEncoding(PositionalEncoding): + """Scaled positional encoding module. + See Sec. 3.2 https://arxiv.org/abs/1809.08895 + + Args: + d_model (int): Embedding dimension. + dropout_rate (float): Dropout rate. + max_len (int): Maximum input length. + dtype (str): dtype of param + """ + + def __init__(self, d_model, dropout_rate, max_len=5000, dtype="float32"): + """Initialize class.""" + super().__init__( + d_model=d_model, + dropout_rate=dropout_rate, + max_len=max_len, + dtype=dtype) + x = paddle.ones([1], dtype=self.dtype) + self.alpha = paddle.create_parameter( + shape=x.shape, + dtype=self.dtype, + default_initializer=nn.initializer.Assign(x)) + + def reset_parameters(self): + """Reset parameters.""" + self.alpha = paddle.ones([1]) + + def forward(self, x): + """Add positional encoding. + + Args: + x (Tensor): Input tensor (batch, time, `*`). + Returns: + Tensor: Encoded tensor (batch, time, `*`). + """ + self.extend_pe(x) + T = paddle.shape(x)[1] + x = x + self.alpha * self.pe[:, :T] + return self.dropout(x) + + +class RelPositionalEncoding(nn.Layer): + """Relative positional encoding module (new implementation). + Details can be found in https://github.com/espnet/espnet/pull/2816. + See : Appendix B in https://arxiv.org/abs/1901.02860 + + Args: + d_model (int): Embedding dimension. + dropout_rate (float): Dropout rate. + max_len (int): Maximum input length. + """ + + def __init__(self, d_model, dropout_rate, max_len=5000, dtype="float32"): + """Construct an PositionalEncoding object.""" + super().__init__() + self.d_model = d_model + self.xscale = math.sqrt(self.d_model) + self.dropout = nn.Dropout(p=dropout_rate) + self.pe = None + self.dtype = dtype + self.extend_pe(paddle.expand(paddle.zeros([1]), (1, max_len))) + + def extend_pe(self, x): + """Reset the positional encodings.""" + if self.pe is not None: + # self.pe contains both positive and negative parts + # the length of self.pe is 2 * input_len - 1 + if paddle.shape(self.pe)[1] >= paddle.shape(x)[1] * 2 - 1: + return + # Suppose `i` means to the position of query vecotr and `j` means the + # position of key vector. 
We use position relative positions when keys + # are to the left (i>j) and negative relative positions otherwise (i x + linear(concat(x, att(x))) + if False, no additional linear will be applied. i.e. x -> x + att(x) + positionwise_layer_type (str): "linear", "conv1d", or "conv1d-linear". + positionwise_conv_kernel_size (int): Kernel size of positionwise conv1d layer. + macaron_style (bool): Whether to use macaron style for positionwise layer. + pos_enc_layer_type (str): Encoder positional encoding layer type. + selfattention_layer_type (str): Encoder attention layer type. + activation_type (str): Encoder activation function type. + use_cnn_module (bool): Whether to use convolution module. + zero_triu (bool): Whether to zero the upper triangular part of attention matrix. + cnn_module_kernel (int): Kernerl size of convolution module. + padding_idx (int): Padding idx for input_layer=embed. + stochastic_depth_rate (float): Maximum probability to skip the encoder layer. + intermediate_layers (Union[List[int], None]): indices of intermediate CTC layer. + indices start from 1. + if not None, intermediate outputs are returned (which changes return type + signature.) + encoder_type (str): "transformer", or "conformer". + """ + + def __init__(self, + idim: int, + attention_dim: int=256, + attention_heads: int=4, + linear_units: int=2048, + num_blocks: int=6, + dropout_rate: float=0.1, + positional_dropout_rate: float=0.1, + attention_dropout_rate: float=0.0, + input_layer: str="conv2d", + normalize_before: bool=True, + concat_after: bool=False, + positionwise_layer_type: str="linear", + positionwise_conv_kernel_size: int=1, + macaron_style: bool=False, + pos_enc_layer_type: str="abs_pos", + selfattention_layer_type: str="selfattn", + activation_type: str="swish", + use_cnn_module: bool=False, + zero_triu: bool=False, + cnn_module_kernel: int=31, + padding_idx: int=-1, + stochastic_depth_rate: float=0.0, + intermediate_layers: Union[List[int], None]=None, + encoder_type: str="transformer"): + """Construct an Base Encoder object.""" + super().__init__() + activation = get_activation(activation_type) + pos_enc_class = self.get_pos_enc_class(pos_enc_layer_type, + selfattention_layer_type) + self.encoder_type = encoder_type + + self.conv_subsampling_factor = 1 + self.embed = self.get_embed( + idim=idim, + input_layer=input_layer, + attention_dim=attention_dim, + pos_enc_class=pos_enc_class, + dropout_rate=dropout_rate, + positional_dropout_rate=positional_dropout_rate, + padding_idx=padding_idx) + + self.normalize_before = normalize_before + + # self-attention module definition + encoder_selfattn_layer, encoder_selfattn_layer_args = self.get_encoder_selfattn_layer( + selfattention_layer_type=selfattention_layer_type, + attention_heads=attention_heads, + attention_dim=attention_dim, + attention_dropout_rate=attention_dropout_rate, + zero_triu=zero_triu, + pos_enc_layer_type=pos_enc_layer_type) + # feed-forward module definition + positionwise_layer, positionwise_layer_args = self.get_positionwise_layer( + positionwise_layer_type, attention_dim, linear_units, dropout_rate, + positionwise_conv_kernel_size, activation) + + # convolution module definition + convolution_layer = ConvolutionModule + convolution_layer_args = (attention_dim, cnn_module_kernel, activation) + + if self.encoder_type == "transformer": + self.encoders = repeat( + num_blocks, + lambda lnum: EncoderLayer( + attention_dim, + encoder_selfattn_layer(*encoder_selfattn_layer_args), + positionwise_layer(*positionwise_layer_args), + 
dropout_rate, + normalize_before, + concat_after, ), ) + + elif self.encoder_type == "conformer": + self.encoders = repeat( + num_blocks, + lambda lnum: ConformerEncoderLayer( + attention_dim, + encoder_selfattn_layer(*encoder_selfattn_layer_args), + positionwise_layer(*positionwise_layer_args), + positionwise_layer(*positionwise_layer_args) if macaron_style else None, + convolution_layer(*convolution_layer_args) if use_cnn_module else None, + dropout_rate, + normalize_before, + concat_after, + stochastic_depth_rate * float(1 + lnum) / num_blocks, ), ) + self.intermediate_layers = intermediate_layers + else: + raise NotImplementedError("Support only linear or conv1d.") + + if self.normalize_before: + self.after_norm = LayerNorm(attention_dim) + + def get_positionwise_layer(self, + positionwise_layer_type: str="linear", + attention_dim: int=256, + linear_units: int=2048, + dropout_rate: float=0.1, + positionwise_conv_kernel_size: int=1, + activation: nn.Layer=nn.ReLU()): + """Define positionwise layer.""" + if positionwise_layer_type == "linear": + positionwise_layer = PositionwiseFeedForward + positionwise_layer_args = (attention_dim, linear_units, + dropout_rate, activation) + elif positionwise_layer_type == "conv1d": + positionwise_layer = MultiLayeredConv1d + positionwise_layer_args = (attention_dim, linear_units, + positionwise_conv_kernel_size, + dropout_rate, ) + elif positionwise_layer_type == "conv1d-linear": + positionwise_layer = Conv1dLinear + positionwise_layer_args = (attention_dim, linear_units, + positionwise_conv_kernel_size, + dropout_rate, ) + else: + raise NotImplementedError("Support only linear or conv1d.") + return positionwise_layer, positionwise_layer_args + + def get_encoder_selfattn_layer(self, + selfattention_layer_type: str="selfattn", + attention_heads: int=4, + attention_dim: int=256, + attention_dropout_rate: float=0.0, + zero_triu: bool=False, + pos_enc_layer_type: str="abs_pos"): + if selfattention_layer_type == "selfattn": + encoder_selfattn_layer = MultiHeadedAttention + encoder_selfattn_layer_args = (attention_heads, attention_dim, + attention_dropout_rate, ) + elif selfattention_layer_type == "rel_selfattn": + assert pos_enc_layer_type == "rel_pos" + encoder_selfattn_layer = RelPositionMultiHeadedAttention + encoder_selfattn_layer_args = (attention_heads, attention_dim, + attention_dropout_rate, zero_triu, ) + else: + raise ValueError("unknown encoder_attn_layer: " + + selfattention_layer_type) + return encoder_selfattn_layer, encoder_selfattn_layer_args + + def get_pos_enc_class(self, + pos_enc_layer_type: str="abs_pos", + selfattention_layer_type: str="selfattn"): + if pos_enc_layer_type == "abs_pos": + pos_enc_class = PositionalEncoding + elif pos_enc_layer_type == "scaled_abs_pos": + pos_enc_class = ScaledPositionalEncoding + elif pos_enc_layer_type == "rel_pos": + assert selfattention_layer_type == "rel_selfattn" + pos_enc_class = RelPositionalEncoding + else: + raise ValueError("unknown pos_enc_layer: " + pos_enc_layer_type) + return pos_enc_class + + def get_embed(self, + idim, + input_layer="conv2d", + attention_dim: int=256, + pos_enc_class=PositionalEncoding, + dropout_rate: int=0.1, + positional_dropout_rate: int=0.1, + padding_idx: int=-1): + + if input_layer == "linear": + embed = nn.Sequential( + nn.Linear(idim, attention_dim), + nn.LayerNorm(attention_dim), + nn.Dropout(dropout_rate), + nn.ReLU(), + pos_enc_class(attention_dim, positional_dropout_rate), ) + elif input_layer == "conv2d": + embed = Conv2dSubsampling( + idim, + 
attention_dim, + dropout_rate, + pos_enc_class(attention_dim, positional_dropout_rate), ) + self.conv_subsampling_factor = 4 + elif input_layer == "embed": + embed = nn.Sequential( + nn.Embedding(idim, attention_dim, padding_idx=padding_idx), + pos_enc_class(attention_dim, positional_dropout_rate), ) + elif isinstance(input_layer, nn.Layer): + embed = nn.Sequential( + input_layer, + pos_enc_class(attention_dim, positional_dropout_rate), ) + elif input_layer is None: + embed = nn.Sequential( + pos_enc_class(attention_dim, positional_dropout_rate)) + else: + raise ValueError("unknown input_layer: " + input_layer) + + return embed + + def forward(self, xs, masks): + """Encode input sequence. + + Args: + xs (Tensor): Input tensor (#batch, time, idim). + masks (Tensor): Mask tensor (#batch, 1, time). + + Returns: + Tensor: Output tensor (#batch, time, attention_dim). + Tensor: Mask tensor (#batch, 1, time). + """ + xs = self.embed(xs) + xs, masks = self.encoders(xs, masks) + if self.normalize_before: + xs = self.after_norm(xs) + return xs, masks + + +class TransformerEncoder(BaseEncoder): + """Transformer encoder module. + + Args: + idim (int): Input dimension. + attention_dim (int): Dimention of attention. + attention_heads (int): The number of heads of multi head attention. + linear_units (int): The number of units of position-wise feed forward. + num_blocks (int): The number of decoder blocks. + dropout_rate (float): Dropout rate. + positional_dropout_rate (float): Dropout rate after adding positional encoding. + attention_dropout_rate (float): Dropout rate in attention. + input_layer (Union[str, paddle.nn.Layer]): Input layer type. + pos_enc_layer_type (str): Encoder positional encoding layer type. + normalize_before (bool): Whether to use layer_norm before the first block. + concat_after (bool): Whether to concat attention layer's input and output. + if True, additional linear will be applied. + i.e. x -> x + linear(concat(x, att(x))) + if False, no additional linear will be applied. i.e. x -> x + att(x) + positionwise_layer_type (str): "linear", "conv1d", or "conv1d-linear". + positionwise_conv_kernel_size (int): Kernel size of positionwise conv1d layer. + selfattention_layer_type (str): Encoder attention layer type. + activation_type (str): Encoder activation function type. + padding_idx (int): Padding idx for input_layer=embed. 
+ """ + + def __init__( + self, + idim, + attention_dim: int=256, + attention_heads: int=4, + linear_units: int=2048, + num_blocks: int=6, + dropout_rate: float=0.1, + positional_dropout_rate: float=0.1, + attention_dropout_rate: float=0.0, + input_layer: str="conv2d", + pos_enc_layer_type: str="abs_pos", + normalize_before: bool=True, + concat_after: bool=False, + positionwise_layer_type: str="linear", + positionwise_conv_kernel_size: int=1, + selfattention_layer_type: str="selfattn", + activation_type: str="relu", + padding_idx: int=-1, ): + """Construct an Transformer Encoder object.""" + super().__init__( + idim, + attention_dim=attention_dim, + attention_heads=attention_heads, + linear_units=linear_units, + num_blocks=num_blocks, + dropout_rate=dropout_rate, + positional_dropout_rate=positional_dropout_rate, + attention_dropout_rate=attention_dropout_rate, + input_layer=input_layer, + pos_enc_layer_type=pos_enc_layer_type, + normalize_before=normalize_before, + concat_after=concat_after, + positionwise_layer_type=positionwise_layer_type, + positionwise_conv_kernel_size=positionwise_conv_kernel_size, + selfattention_layer_type=selfattention_layer_type, + activation_type=activation_type, + padding_idx=padding_idx, + encoder_type="transformer") + + def forward(self, xs, masks): + """Encode input sequence. + + Args: + xs(Tensor): Input tensor (#batch, time, idim). + masks(Tensor): Mask tensor (#batch, 1, time). + + Returns: + Tensor: Output tensor (#batch, time, attention_dim). + Tensor:Mask tensor (#batch, 1, time). + """ + xs = self.embed(xs) + xs, masks = self.encoders(xs, masks) + if self.normalize_before: + xs = self.after_norm(xs) + return xs, masks + + def forward_one_step(self, xs, masks, cache=None): + """Encode input frame. + + Args: + xs (Tensor): Input tensor. + masks (Tensor): Mask tensor. + cache (List[Tensor]): List of cache tensors. + + Returns: + Tensor: Output tensor. + Tensor: Mask tensor. + List[Tensor]: List of new cache tensors. + """ + + xs = self.embed(xs) + if cache is None: + cache = [None for _ in range(len(self.encoders))] + new_cache = [] + for c, e in zip(cache, self.encoders): + xs, masks = e(xs, masks, cache=c) + new_cache.append(xs) + if self.normalize_before: + xs = self.after_norm(xs) + return xs, masks, new_cache + + +class ConformerEncoder(BaseEncoder): + """Conformer encoder module. + + Args: + idim (int): Input dimension. + attention_dim (int): Dimention of attention. + attention_heads (int): The number of heads of multi head attention. + linear_units (int): The number of units of position-wise feed forward. + num_blocks (int): The number of decoder blocks. + dropout_rate (float): Dropout rate. + positional_dropout_rate (float): Dropout rate after adding positional encoding. + attention_dropout_rate (float): Dropout rate in attention. + input_layer (Union[str, nn.Layer]): Input layer type. + normalize_before (bool): Whether to use layer_norm before the first block. + concat_after (bool):Whether to concat attention layer's input and output. + if True, additional linear will be applied. + i.e. x -> x + linear(concat(x, att(x))) + if False, no additional linear will be applied. i.e. x -> x + att(x) + positionwise_layer_type (str): "linear", "conv1d", or "conv1d-linear". + positionwise_conv_kernel_size (int): Kernel size of positionwise conv1d layer. + macaron_style (bool): Whether to use macaron style for positionwise layer. + pos_enc_layer_type (str): Encoder positional encoding layer type. 
+ selfattention_layer_type (str): Encoder attention layer type. + activation_type (str): Encoder activation function type. + use_cnn_module (bool): Whether to use convolution module. + zero_triu (bool): Whether to zero the upper triangular part of attention matrix. + cnn_module_kernel (int): Kernerl size of convolution module. + padding_idx (int): Padding idx for input_layer=embed. + stochastic_depth_rate (float): Maximum probability to skip the encoder layer. + intermediate_layers (Union[List[int], None]):indices of intermediate CTC layer. indices start from 1. + if not None, intermediate outputs are returned (which changes return type signature.) + """ + + def __init__( + self, + idim: int, + attention_dim: int=256, + attention_heads: int=4, + linear_units: int=2048, + num_blocks: int=6, + dropout_rate: float=0.1, + positional_dropout_rate: float=0.1, + attention_dropout_rate: float=0.0, + input_layer: str="conv2d", + normalize_before: bool=True, + concat_after: bool=False, + positionwise_layer_type: str="linear", + positionwise_conv_kernel_size: int=1, + macaron_style: bool=False, + pos_enc_layer_type: str="rel_pos", + selfattention_layer_type: str="rel_selfattn", + activation_type: str="swish", + use_cnn_module: bool=False, + zero_triu: bool=False, + cnn_module_kernel: int=31, + padding_idx: int=-1, + stochastic_depth_rate: float=0.0, + intermediate_layers: Union[List[int], None]=None, ): + """Construct an Conformer Encoder object.""" + super().__init__( + idim=idim, + attention_dim=attention_dim, + attention_heads=attention_heads, + linear_units=linear_units, + num_blocks=num_blocks, + dropout_rate=dropout_rate, + positional_dropout_rate=positional_dropout_rate, + attention_dropout_rate=attention_dropout_rate, + input_layer=input_layer, + normalize_before=normalize_before, + concat_after=concat_after, + positionwise_layer_type=positionwise_layer_type, + positionwise_conv_kernel_size=positionwise_conv_kernel_size, + macaron_style=macaron_style, + pos_enc_layer_type=pos_enc_layer_type, + selfattention_layer_type=selfattention_layer_type, + activation_type=activation_type, + use_cnn_module=use_cnn_module, + zero_triu=zero_triu, + cnn_module_kernel=cnn_module_kernel, + padding_idx=padding_idx, + stochastic_depth_rate=stochastic_depth_rate, + intermediate_layers=intermediate_layers, + encoder_type="conformer") + + def forward(self, xs, masks): + """Encode input sequence. + + Args: + xs (Tensor): Input tensor (#batch, time, idim). + masks (Tensor): Mask tensor (#batch, 1, time). + Returns: + Tensor: Output tensor (#batch, time, attention_dim). + Tensor: Mask tensor (#batch, 1, time). + """ + if isinstance(self.embed, (Conv2dSubsampling)): + xs, masks = self.embed(xs, masks) + else: + xs = self.embed(xs) + + if self.intermediate_layers is None: + xs, masks = self.encoders(xs, masks) + else: + intermediate_outputs = [] + for layer_idx, encoder_layer in enumerate(self.encoders): + xs, masks = encoder_layer(xs, masks) + + if (self.intermediate_layers is not None and + layer_idx + 1 in self.intermediate_layers): + # intermediate branches also require normalization. 
+ encoder_output = xs + if isinstance(encoder_output, tuple): + encoder_output = encoder_output[0] + if self.normalize_before: + encoder_output = self.after_norm(encoder_output) + intermediate_outputs.append(encoder_output) + + if isinstance(xs, tuple): + xs = xs[0] + + if self.normalize_before: + xs = self.after_norm(xs) + + if self.intermediate_layers is not None: + return xs, masks, intermediate_outputs + return xs, masks + + +class Conv1dResidualBlock(nn.Layer): + """ + Special module for simplified version of Encoder class. + """ + + def __init__(self, + idim: int=256, + odim: int=256, + kernel_size: int=5, + dropout_rate: float=0.2): + super().__init__() + self.main_block = nn.Sequential( + nn.Conv1D( + idim, odim, kernel_size=kernel_size, padding=kernel_size // 2), + nn.ReLU(), + nn.BatchNorm1D(odim), + nn.Dropout(p=dropout_rate)) + self.conv1d_residual = nn.Conv1D(idim, odim, kernel_size=1) + + def forward(self, xs): + """Encode input sequence. + Args: + xs (Tensor): Input tensor (#batch, idim, T). + Returns: + Tensor: Output tensor (#batch, odim, T). + """ + outputs = self.main_block(xs) + outputs = self.conv1d_residual(xs) + outputs + return outputs + + +class CNNDecoder(nn.Layer): + """ + Much simplified decoder than the original one with Prenet. + """ + + def __init__( + self, + emb_dim: int=256, + odim: int=80, + kernel_size: int=5, + dropout_rate: float=0.2, + resblock_kernel_sizes: List[int]=[256, 256], ): + + super().__init__() + + input_shape = emb_dim + out_sizes = resblock_kernel_sizes + out_sizes.append(out_sizes[-1]) + + in_sizes = [input_shape] + out_sizes[:-1] + self.residual_blocks = nn.LayerList([ + Conv1dResidualBlock( + idim=in_channels, + odim=out_channels, + kernel_size=kernel_size, + dropout_rate=dropout_rate, ) + for in_channels, out_channels in zip(in_sizes, out_sizes) + ]) + self.conv1d = nn.Conv1D( + in_channels=out_sizes[-1], out_channels=odim, kernel_size=1) + + def forward(self, xs, masks=None): + """Encode input sequence. + Args: + xs (Tensor): Input tensor (#batch, time, idim). + masks (Tensor): Mask tensor (#batch, 1, time). + Returns: + Tensor: Output tensor (#batch, time, odim). + """ + # exchange the temporal dimension and the feature dimension + xs = xs.transpose([0, 2, 1]) + if masks is not None: + xs = xs * masks + + for layer in self.residual_blocks: + outputs = layer(xs) + if masks is not None: + # input_mask B * 1 * T + outputs = outputs * masks + xs = outputs + outputs = self.conv1d(outputs) + if masks is not None: + outputs = outputs * masks + outputs = outputs.transpose([0, 2, 1]) + return outputs, masks + + +class CNNPostnet(nn.Layer): + def __init__( + self, + odim: int=80, + kernel_size: int=5, + dropout_rate: float=0.2, + resblock_kernel_sizes: List[int]=[256, 256], ): + super().__init__() + out_sizes = resblock_kernel_sizes + in_sizes = [odim] + out_sizes[:-1] + self.residual_blocks = nn.LayerList([ + Conv1dResidualBlock( + idim=in_channels, + odim=out_channels, + kernel_size=kernel_size, + dropout_rate=dropout_rate) + for in_channels, out_channels in zip(in_sizes, out_sizes) + ]) + self.conv1d = nn.Conv1D( + in_channels=out_sizes[-1], out_channels=odim, kernel_size=1) + + def forward(self, xs, masks=None): + """Encode input sequence. + Args: + xs (Tensor): Input tensor (#batch, odim, time). + masks (Tensor): Mask tensor (#batch, 1, time). + Returns: + Tensor: Output tensor (#batch, odim, time). 
+ """ + for layer in self.residual_blocks: + outputs = layer(xs) + if masks is not None: + # input_mask B * 1 * T + outputs = outputs * masks + xs = outputs + outputs = self.conv1d(outputs) + if masks is not None: + outputs = outputs * masks + return outputs diff --git a/ernie-sat/paddlespeech/t2s/modules/transformer/encoder_layer.py b/ernie-sat/paddlespeech/t2s/modules/transformer/encoder_layer.py new file mode 100644 index 0000000000000000000000000000000000000000..72372b69b92bcae4dab8485498f56d0ad639f91f --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/modules/transformer/encoder_layer.py @@ -0,0 +1,102 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from espnet(https://github.com/espnet/espnet) +"""Encoder self-attention layer definition.""" +import paddle +from paddle import nn + + +class EncoderLayer(nn.Layer): + """Encoder layer module. + + Args: + size (int): Input dimension. + self_attn (nn.Layer): Self-attention module instance. + `MultiHeadedAttention` instance can be used as the argument. + feed_forward (nn.Layer): Feed-forward module instance. + `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance can be used as the argument. + dropout_rate (float): Dropout rate. + normalize_before (bool): Whether to use layer_norm before the first block. + concat_after (bool): Whether to concat attention layer's input and output. + if True, additional linear will be applied. + i.e. x -> x + linear(concat(x, att(x))) + if False, no additional linear will be applied. i.e. x -> x + att(x) + """ + + def __init__( + self, + size, + self_attn, + feed_forward, + dropout_rate, + normalize_before=True, + concat_after=False, ): + """Construct an EncoderLayer object.""" + super().__init__() + self.self_attn = self_attn + self.feed_forward = feed_forward + self.norm1 = nn.LayerNorm(size) + self.norm2 = nn.LayerNorm(size) + self.dropout = nn.Dropout(dropout_rate) + self.size = size + self.normalize_before = normalize_before + self.concat_after = concat_after + if self.concat_after: + self.concat_linear = nn.Linear(size + size, size, bias_attr=True) + + def forward(self, x, mask, cache=None): + """Compute encoded features. + + Args: + x(Tensor): Input tensor (#batch, time, size). + mask(Tensor): Mask tensor for the input (#batch, time). + cache(Tensor, optional): Cache tensor of the input (#batch, time - 1, size). + + Returns: + Tensor: Output tensor (#batch, time, size). + Tensor: Mask tensor (#batch, time). 
+ """ + residual = x + if self.normalize_before: + x = self.norm1(x) + + if cache is None: + x_q = x + else: + assert cache.shape == (x.shape[0], x.shape[1] - 1, self.size) + x_q = x[:, -1:, :] + residual = residual[:, -1:, :] + mask = None if mask is None else mask[:, -1:, :] + + if self.concat_after: + x_concat = paddle.concat( + (x, self.self_attn(x_q, x, x, mask)), axis=-1) + x = residual + self.concat_linear(x_concat) + else: + + x = residual + self.dropout(self.self_attn(x_q, x, x, mask)) + if not self.normalize_before: + x = self.norm1(x) + + residual = x + if self.normalize_before: + x = self.norm2(x) + x = residual + self.dropout(self.feed_forward(x)) + if not self.normalize_before: + x = self.norm2(x) + + if cache is not None: + x = paddle.concat([cache, x], axis=1) + + return x, mask diff --git a/ernie-sat/paddlespeech/t2s/modules/transformer/lightconv.py b/ernie-sat/paddlespeech/t2s/modules/transformer/lightconv.py new file mode 100644 index 0000000000000000000000000000000000000000..9bcc1acfba021d91bec798f4eea41461cfffb81e --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/modules/transformer/lightconv.py @@ -0,0 +1,141 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from espnet(https://github.com/espnet/espnet) +"""Lightweight Convolution Module.""" +import numpy +import paddle +import paddle.nn.functional as F +from paddle import nn + +from paddlespeech.t2s.modules.activation import get_activation +from paddlespeech.t2s.modules.masked_fill import masked_fill + +MIN_VALUE = float(numpy.finfo(numpy.float32).min) + + +class LightweightConvolution(nn.Layer): + """Lightweight Convolution layer. + + This implementation is based on + https://github.com/pytorch/fairseq/tree/master/fairseq + + Args: + wshare (int): the number of kernel of convolution + n_feat (int): the number of features + dropout_rate (float): dropout_rate + kernel_size (int): kernel size (length) + use_kernel_mask (bool): Use causal mask or not for convolution kernel + use_bias (bool): Use bias term or not. 
+ + """ + + def __init__( + self, + wshare, + n_feat, + dropout_rate, + kernel_size, + use_kernel_mask=False, + use_bias=False, ): + """Construct Lightweight Convolution layer.""" + super().__init__() + + assert n_feat % wshare == 0 + self.wshare = wshare + self.use_kernel_mask = use_kernel_mask + self.dropout_rate = dropout_rate + self.kernel_size = kernel_size + self.padding_size = int(kernel_size / 2) + + # linear -> GLU -> lightconv -> linear + self.linear1 = nn.Linear(n_feat, n_feat * 2) + self.linear2 = nn.Linear(n_feat, n_feat) + self.act = get_activation("glu") + + # lightconv related + self.uniform_ = nn.initializer.Uniform() + self.weight = paddle.to_tensor( + numpy.random.uniform(0, 1, size=[self.wshare, 1, kernel_size]), + dtype="float32") + self.uniform_(self.weight) + self.weight = paddle.create_parameter( + shape=self.weight.shape, + dtype=str(self.weight.numpy().dtype), + default_initializer=paddle.nn.initializer.Assign(self.weight)) + self.use_bias = use_bias + if self.use_bias: + self.bias = paddle.Tensor(n_feat) + self.bias = paddle.create_parameter( + shape=self.bias.shape, + dtype=str(self.bias.numpy().dtype), + default_initializer=paddle.nn.initializer.Assign(self.bias)) + + # mask of kernel + kernel_mask0 = paddle.zeros([self.wshare, int(kernel_size / 2)]) + kernel_mask1 = paddle.ones([self.wshare, int(kernel_size / 2 + 1)]) + self.kernel_mask = paddle.concat( + (kernel_mask1, kernel_mask0), axis=-1).unsqueeze(1) + + def forward(self, query, key, value, mask): + """Forward of 'Lightweight Convolution'. + + This function takes query, key and value but uses only query. + This is just for compatibility with self-attention layer (attention.py) + + Args: + query (Tensor): input tensor. (batch, time1, d_model) + key (Tensor): NOT USED. (batch, time2, d_model) + value (Tensor): NOT USED. (batch, time2, d_model) + mask : (Tensor): (batch, time1, time2) mask + + Return: + Tensor: ouput. (batch, time1, d_model) + + """ + # linear -> GLU -> lightconv -> linear + x = query + B, T, C = x.shape + H = self.wshare + + # first liner layer + x = self.linear1(x) + + # GLU activation + x = self.act(x) + + # lightconv + # B x C x T + x = x.transpose([0, 2, 1]).reshape([-1, H, T]) + weight = F.dropout( + self.weight, self.dropout_rate, training=self.training) + if self.use_kernel_mask: + weight = masked_fill(weight, self.kernel_mask == 0.0, float("-inf")) + # weight = weight.masked_fill(self.kernel_mask == 0.0, float("-inf")) + weight = F.softmax(weight, axis=-1) + x = F.conv1d( + x, weight, padding=self.padding_size, + groups=self.wshare).reshape([B, C, T]) + if self.use_bias: + x = x + self.bias.reshape([1, -1, 1]) + # B x T x C + x = x.transpose([0, 2, 1]) + + if mask is not None and not self.use_kernel_mask: + mask = mask.transpose([0, 2, 1]) + # x = x.masked_fill(mask == 0, 0.0) + x = masked_fill(x, mask == 0, 0.0) + + # second linear layer + x = self.linear2(x) + return x diff --git a/ernie-sat/paddlespeech/t2s/modules/transformer/mask.py b/ernie-sat/paddlespeech/t2s/modules/transformer/mask.py new file mode 100644 index 0000000000000000000000000000000000000000..c10e6add2a0c37e052a9df9d3ae6a8535d03e942 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/modules/transformer/mask.py @@ -0,0 +1,47 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Mask module.""" +import paddle + + +def subsequent_mask(size, dtype=paddle.bool): + """Create mask for subsequent steps (size, size). + + Args: + size (int): size of mask + dtype (paddle.dtype): result dtype + Return: + Tensor: + >>> subsequent_mask(3) + [[1, 0, 0], + [1, 1, 0], + [1, 1, 1]] + """ + ret = paddle.ones([size, size], dtype=dtype) + return paddle.tril(ret) + + +def target_mask(ys_in_pad, ignore_id, dtype=paddle.bool): + """Create mask for decoder self-attention. + + Args: + ys_pad (Tensor): batch of padded target sequences (B, Lmax) + ignore_id (int): index of padding + dtype (paddle.dtype): result dtype + Return: + Tensor: (B, Lmax, Lmax) + """ + ys_mask = ys_in_pad != ignore_id + m = subsequent_mask(ys_mask.shape[-1]).unsqueeze(0) + return ys_mask.unsqueeze(-2) & m diff --git a/ernie-sat/paddlespeech/t2s/modules/transformer/multi_layer_conv.py b/ernie-sat/paddlespeech/t2s/modules/transformer/multi_layer_conv.py new file mode 100644 index 0000000000000000000000000000000000000000..d3285b65f3113c4aaa844d5ccb35d0399e3f6331 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/modules/transformer/multi_layer_conv.py @@ -0,0 +1,110 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Layer modules for FFT block in FastSpeech (Feed-forward Transformer).""" +from paddle import nn + + +class MultiLayeredConv1d(nn.Layer): + """Multi-layered conv1d for Transformer block. + + This is a module of multi-leyered conv1d designed + to replace positionwise feed-forward network + in Transforner block, which is introduced in + `FastSpeech: Fast, Robust and Controllable Text to Speech`_. + + .. _`FastSpeech: Fast, Robust and Controllable Text to Speech`: + https://arxiv.org/pdf/1905.09263.pdf + + """ + + def __init__(self, in_chans, hidden_chans, kernel_size, dropout_rate): + """Initialize MultiLayeredConv1d module. + + Args: + in_chans (int): Number of input channels. + hidden_chans (int): Number of hidden channels. + kernel_size (int): Kernel size of conv1d. + dropout_rate (float): Dropout rate. + + """ + super().__init__() + self.w_1 = nn.Conv1D( + in_chans, + hidden_chans, + kernel_size, + stride=1, + padding=(kernel_size - 1) // 2, ) + self.w_2 = nn.Conv1D( + hidden_chans, + in_chans, + kernel_size, + stride=1, + padding=(kernel_size - 1) // 2, ) + self.dropout = nn.Dropout(dropout_rate) + self.relu = nn.ReLU() + + def forward(self, x): + """Calculate forward propagation. + + Args: + x (Tensor): Batch of input tensors (B, T, in_chans). + + Returns: + Tensor: Batch of output tensors (B, T, in_chans). 
+ """ + x = self.relu(self.w_1(x.transpose([0, 2, 1]))).transpose([0, 2, 1]) + return self.w_2(self.dropout(x).transpose([0, 2, 1])).transpose( + [0, 2, 1]) + + +class Conv1dLinear(nn.Layer): + """Conv1D + Linear for Transformer block. + + A variant of MultiLayeredConv1d, which replaces second conv-layer to linear. + + """ + + def __init__(self, in_chans, hidden_chans, kernel_size, dropout_rate): + """Initialize Conv1dLinear module. + + Args: + in_chans (int): Number of input channels. + hidden_chans (int): Number of hidden channels. + kernel_size (int): Kernel size of conv1d. + dropout_rate (float): Dropout rate. + """ + super().__init__() + self.w_1 = nn.Conv1D( + in_chans, + hidden_chans, + kernel_size, + stride=1, + padding=(kernel_size - 1) // 2, ) + self.w_2 = nn.Linear(hidden_chans, in_chans, bias_attr=True) + self.dropout = nn.Dropout(dropout_rate) + self.relu = nn.ReLU() + + def forward(self, x): + """Calculate forward propagation. + + Args: + x (Tensor): Batch of input tensors (B, T, in_chans). + + Returns: + Tensor: Batch of output tensors (B, T, in_chans). + + """ + x = self.relu(self.w_1(x.transpose([0, 2, 1]))).transpose([0, 2, 1]) + + return self.w_2(self.dropout(x)) diff --git a/ernie-sat/paddlespeech/t2s/modules/transformer/positionwise_feed_forward.py b/ernie-sat/paddlespeech/t2s/modules/transformer/positionwise_feed_forward.py new file mode 100644 index 0000000000000000000000000000000000000000..92af6851c402b969a5e590be287ba5e7f9c5a262 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/modules/transformer/positionwise_feed_forward.py @@ -0,0 +1,43 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from espnet(https://github.com/espnet/espnet) +"""Positionwise feed forward layer definition.""" +import paddle +from paddle import nn + + +class PositionwiseFeedForward(nn.Layer): + """Positionwise feed forward layer. + + Args: + idim (int): Input dimenstion. + hidden_units (int): The number of hidden units. + dropout_rate (float): Dropout rate. + """ + + def __init__(self, + idim, + hidden_units, + dropout_rate, + activation=paddle.nn.ReLU()): + """Construct an PositionwiseFeedForward object.""" + super().__init__() + self.w_1 = paddle.nn.Linear(idim, hidden_units, bias_attr=True) + self.w_2 = paddle.nn.Linear(hidden_units, idim, bias_attr=True) + self.dropout = paddle.nn.Dropout(dropout_rate) + self.activation = activation + + def forward(self, x): + """Forward funciton.""" + return self.w_2(self.dropout(self.activation(self.w_1(x)))) diff --git a/ernie-sat/paddlespeech/t2s/modules/transformer/repeat.py b/ernie-sat/paddlespeech/t2s/modules/transformer/repeat.py new file mode 100644 index 0000000000000000000000000000000000000000..1e946adf7e469fd6c05c2a8c8d9e6f16f638524e --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/modules/transformer/repeat.py @@ -0,0 +1,39 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from espnet(https://github.com/espnet/espnet) +"""Repeat the same layer definition.""" +import paddle + + +class MultiSequential(paddle.nn.Sequential): + """Multi-input multi-output paddle.nn.Sequential.""" + + def forward(self, *args): + """Repeat.""" + for m in self: + args = m(*args) + return args + + +def repeat(N, fn): + """Repeat module N times. + + Args: + N (int): Number of repeat time. + fn (Callable): Function to generate module. + + Returns: + MultiSequential: Repeated model instance. + """ + return MultiSequential(* [fn(n) for n in range(N)]) diff --git a/ernie-sat/paddlespeech/t2s/modules/transformer/subsampling.py b/ernie-sat/paddlespeech/t2s/modules/transformer/subsampling.py new file mode 100644 index 0000000000000000000000000000000000000000..07439705a66cb6bc683bfa5a977aef0db379516c --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/modules/transformer/subsampling.py @@ -0,0 +1,71 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from espnet(https://github.com/espnet/espnet) +"""Subsampling layer definition.""" +import paddle +from paddle import nn + +from paddlespeech.t2s.modules.transformer.embedding import PositionalEncoding + + +class Conv2dSubsampling(nn.Layer): + """Convolutional 2D subsampling (to 1/4 length). + + Args: + idim (int): Input dimension. + odim (int): Output dimension. + dropout_rate (float): Dropout rate. + pos_enc (nn.Layer): Custom position encoding layer. + """ + + def __init__(self, idim, odim, dropout_rate, pos_enc=None): + """Construct an Conv2dSubsampling object.""" + super().__init__() + self.conv = nn.Sequential( + nn.Conv2D(1, odim, 3, 2), + nn.ReLU(), + nn.Conv2D(odim, odim, 3, 2), + nn.ReLU(), ) + self.out = nn.Sequential( + nn.Linear(odim * (((idim - 1) // 2 - 1) // 2), odim), + pos_enc if pos_enc is not None else + PositionalEncoding(odim, dropout_rate), ) + + def forward(self, x, x_mask): + """Subsample x. + Args: + x (Tensor): Input tensor (#batch, time, idim). + x_mask (Tensor): Input mask (#batch, 1, time). + Returns: + Tensor: Subsampled tensor (#batch, time', odim), where time' = time // 4. + Tensor: Subsampled mask (#batch, 1, time'), where time' = time // 4. 
+ """ + # (b, c, t, f) + x = x.unsqueeze(1) + x = self.conv(x) + b, c, t, f = paddle.shape(x) + x = self.out(x.transpose([0, 2, 1, 3]).reshape([b, t, c * f])) + if x_mask is None: + return x, None + return x, x_mask[:, :, :-2:2][:, :, :-2:2] + + def __getitem__(self, key): + """Get item. + When reset_parameters() is called, if use_scaled_pos_enc is used, + return the positioning encoding. + """ + if key != -1: + raise NotImplementedError( + "Support only `-1` (for `reset_parameters`).") + return self.out[key] diff --git a/ernie-sat/paddlespeech/t2s/modules/upsample.py b/ernie-sat/paddlespeech/t2s/modules/upsample.py new file mode 100644 index 0000000000000000000000000000000000000000..65e78a8928adcab69379c883a00bd1ab90bccbc0 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/modules/upsample.py @@ -0,0 +1,181 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from espnet(https://github.com/espnet/espnet) +from typing import Any +from typing import Dict +from typing import List +from typing import Optional + +from paddle import nn +from paddle.nn import functional as F + +from paddlespeech.t2s.modules.activation import get_activation + + +class Stretch2D(nn.Layer): + def __init__(self, w_scale: int, h_scale: int, mode: str="nearest"): + """Strech an image (or image-like object) with some interpolation. + + Args: + w_scale (int): Scalar of width. + h_scale (int): Scalar of the height. + mode (str, optional): Interpolation mode, modes suppored are "nearest", "bilinear", + "trilinear", "bicubic", "linear" and "area",by default "nearest" + For more details about interpolation, see + `paddle.nn.functional.interpolate `_. + """ + super().__init__() + self.w_scale = w_scale + self.h_scale = h_scale + self.mode = mode + + def forward(self, x): + """ + + Args: + x (Tensor): Shape (N, C, H, W) + + Returns: + Tensor: The stretched image. + Shape (N, C, H', W'), where ``H'=h_scale * H``, ``W'=w_scale * W``. + + """ + out = F.interpolate( + x, scale_factor=(self.h_scale, self.w_scale), mode=self.mode) + return out + + +class UpsampleNet(nn.Layer): + """A Layer to upsample spectrogram by applying consecutive stretch and + convolutions. + + Args: + upsample_scales (List[int]): Upsampling factors for each strech. + nonlinear_activation (Optional[str], optional): Activation after each convolution, by default None + nonlinear_activation_params (Dict[str, Any], optional): Parameters passed to construct the activation, by default {} + interpolate_mode (str, optional): Interpolation mode of the strech, by default "nearest" + freq_axis_kernel_size (int, optional): Convolution kernel size along the frequency axis, by default 1 + use_causal_conv (bool, optional): Whether to use causal padding before convolution, by default False + If True, Causal padding is used along the time axis, + i.e. padding amount is ``receptive field - 1`` and 0 for before and after, respectively. + If False, "same" padding is used along the time axis. 
+    """
+
+    def __init__(self,
+                 upsample_scales: List[int],
+                 nonlinear_activation: Optional[str]=None,
+                 nonlinear_activation_params: Dict[str, Any]={},
+                 interpolate_mode: str="nearest",
+                 freq_axis_kernel_size: int=1,
+                 use_causal_conv: bool=False):
+        super().__init__()
+        self.use_causal_conv = use_causal_conv
+        self.up_layers = nn.LayerList()
+
+        for scale in upsample_scales:
+            stretch = Stretch2D(scale, 1, interpolate_mode)
+            assert freq_axis_kernel_size % 2 == 1
+            freq_axis_padding = (freq_axis_kernel_size - 1) // 2
+            kernel_size = (freq_axis_kernel_size, scale * 2 + 1)
+            if use_causal_conv:
+                padding = (freq_axis_padding, scale * 2)
+            else:
+                padding = (freq_axis_padding, scale)
+            conv = nn.Conv2D(
+                1, 1, kernel_size, padding=padding, bias_attr=False)
+            self.up_layers.extend([stretch, conv])
+            if nonlinear_activation is not None:
+                # for compatibility
+                nonlinear_activation = nonlinear_activation.lower()
+
+                nonlinear = get_activation(nonlinear_activation,
+                                           **nonlinear_activation_params)
+                self.up_layers.append(nonlinear)
+
+    def forward(self, c):
+        """
+        Args:
+            c (Tensor): spectrogram. Shape (N, F, T)
+
+        Returns:
+            Tensor: upsampled spectrogram.
+                Shape (N, F, T'), where ``T' = upsample_factor * T``.
+        """
+        c = c.unsqueeze(1)
+        for f in self.up_layers:
+            if self.use_causal_conv and isinstance(f, nn.Conv2D):
+                # trim the extra frames introduced by causal padding
+                c = f(c)[:, :, :, :c.shape[-1]]
+            else:
+                c = f(c)
+        return c.squeeze(1)
+
+
+class ConvInUpsampleNet(nn.Layer):
+    """A Layer to upsample spectrogram composed of a convolution and an
+    UpsampleNet.
+
+    Args:
+        upsample_scales (List[int]): Upsampling factors for each stretch.
+        nonlinear_activation (Optional[str], optional): Activation after each convolution, by default None
+        nonlinear_activation_params (Dict[str, Any], optional): Parameters passed to construct the activation, by default {}
+        interpolate_mode (str, optional): Interpolation mode of the stretch, by default "nearest"
+        freq_axis_kernel_size (int, optional): Convolution kernel size along the frequency axis, by default 1
+        aux_channels (int, optional): Feature size of the input, by default 80
+        aux_context_window (int, optional): Context window of the first 1D convolution applied to the input. It is
+            related to the kernel size of the convolution, by default 0.
+            If causal convolution is used, the kernel size is ``window + 1``,
+            otherwise the kernel size is ``2 * window + 1``.
+        use_causal_conv (bool, optional): Whether to use causal padding before convolution, by default False
+            If True, causal padding is used along the time axis, i.e. padding
+            amount is ``receptive field - 1`` and 0 for before and after, respectively.
+            If False, "same" padding is used along the time axis.
+ """ + + def __init__(self, + upsample_scales: List[int], + nonlinear_activation: Optional[str]=None, + nonlinear_activation_params: Dict[str, Any]={}, + interpolate_mode: str="nearest", + freq_axis_kernel_size: int=1, + aux_channels: int=80, + aux_context_window: int=0, + use_causal_conv: bool=False): + super().__init__() + self.aux_context_window = aux_context_window + self.use_causal_conv = use_causal_conv and aux_context_window > 0 + kernel_size = aux_context_window + 1 if use_causal_conv else 2 * aux_context_window + 1 + self.conv_in = nn.Conv1D( + aux_channels, + aux_channels, + kernel_size=kernel_size, + bias_attr=False) + self.upsample = UpsampleNet( + upsample_scales=upsample_scales, + nonlinear_activation=nonlinear_activation, + nonlinear_activation_params=nonlinear_activation_params, + interpolate_mode=interpolate_mode, + freq_axis_kernel_size=freq_axis_kernel_size, + use_causal_conv=use_causal_conv) + + def forward(self, c): + """ + Args: + c (Tensor): spectrogram. Shape (N, F, T) + + Returns: + Tensors: upsampled spectrogram. Shape (N, F, T'), where ``T' = upsample_factor * T``, + """ + c_ = self.conv_in(c) + c = c_[:, :, :-self.aux_context_window] if self.use_causal_conv else c_ + return self.upsample(c) diff --git a/ernie-sat/paddlespeech/t2s/training/__init__.py b/ernie-sat/paddlespeech/t2s/training/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..719e8445db528373bf3999e81e54b00ac41a1935 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/training/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from .cli import * +from .experiment import * diff --git a/ernie-sat/paddlespeech/t2s/training/cli.py b/ernie-sat/paddlespeech/t2s/training/cli.py new file mode 100644 index 0000000000000000000000000000000000000000..83dae11776e39f224ad6c02d46920bc21fdc9d7c --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/training/cli.py @@ -0,0 +1,62 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse + + +def default_argument_parser(): + r"""A simple yet genral argument parser for experiments with t2s. + + This is used in examples with t2s. And it is intended to be used by + other experiments with t2s. It requires a minimal set of command line + arguments to start a training script. + + The ``--config`` and ``--opts`` are used for overwrite the deault + configuration. 
+
+    The ``--data`` and ``--output`` options specify the data path and the output path.
+    Resuming training from existing progress at the output directory is the
+    intended default behavior.
+
+    The ``--checkpoint_path`` specifies the checkpoint to load from.
+
+    The ``--ngpu`` specifies how to run the training.
+
+    See Also
+    --------
+    paddlespeech.t2s.training.experiment
+
+    Returns
+    -------
+    argparse.ArgumentParser
+        the parser
+    """
+    parser = argparse.ArgumentParser()
+
+    # yapf: disable
+    # data and output
+    parser.add_argument("--config", metavar="FILE", help="path of the config file used to override the default config.")
+    parser.add_argument("--data", metavar="DATA_DIR", help="path to the dataset.")
+    parser.add_argument("--output", metavar="OUTPUT_DIR", help="path to save checkpoints and logs.")
+
+    # load from saved checkpoint
+    parser.add_argument("--checkpoint_path", type=str, help="path of the checkpoint to load")
+
+    # running
+    parser.add_argument("--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.")
+
+    # overwrite extra config and default config
+    parser.add_argument("--opts", nargs=argparse.REMAINDER, help="options to override the --config file and the default config, given as KEY VALUE pairs")
+    # yapf: enable
+
+    return parser
diff --git a/ernie-sat/paddlespeech/t2s/training/default_config.py b/ernie-sat/paddlespeech/t2s/training/default_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..7deb795ac9ef57f95b48dce7804c3e64b37bffe8
--- /dev/null
+++ b/ernie-sat/paddlespeech/t2s/training/default_config.py
@@ -0,0 +1,25 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from yacs.config import CfgNode
+
+_C = CfgNode(
+    dict(
+        valid_interval=1000,  # validation
+        save_interval=10000,  # checkpoint
+        max_iteration=900000,  # max iteration to train
+    ))
+
+
+def get_default_training_config():
+    return _C.clone()
diff --git a/ernie-sat/paddlespeech/t2s/training/experiment.py b/ernie-sat/paddlespeech/t2s/training/experiment.py
new file mode 100644
index 0000000000000000000000000000000000000000..05a363ff204511ae5c18390277612ad69496732f
--- /dev/null
+++ b/ernie-sat/paddlespeech/t2s/training/experiment.py
@@ -0,0 +1,303 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
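Editor's note: a minimal sketch of how the two helpers above (`default_argument_parser` and `get_default_training_config`) are meant to be combined; the command-line values and the entry-point shape are illustrative assumptions, not part of the diff.

```python
# Hypothetical wiring of default_argument_parser() and get_default_training_config();
# the CLI values below are examples only.
from paddlespeech.t2s.training.cli import default_argument_parser
from paddlespeech.t2s.training.default_config import get_default_training_config

config = get_default_training_config()   # valid_interval / save_interval / max_iteration
parser = default_argument_parser()
args = parser.parse_args(
    ["--data", "dump/", "--output", "exp/run1", "--ngpu", "1"])

if args.config:
    config.merge_from_file(args.config)   # YAML overrides
if args.opts:
    config.merge_from_list(args.opts)     # KEY VALUE overrides
config.freeze()
```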
+import logging +import sys +from pathlib import Path + +import paddle +from paddle import distributed as dist +from paddle.io import DistributedBatchSampler +from visualdl import LogWriter + +from paddlespeech.t2s.utils import checkpoint +from paddlespeech.t2s.utils import mp_tools + +__all__ = ["ExperimentBase"] + + +class ExperimentBase(object): + """ + An experiment template in order to structure the training code and take + care of saving, loading, logging, visualization stuffs. It's intended to + be flexible and simple. + + So it only handles output directory (create directory for the output, + create a checkpoint directory, dump the config in use and create + visualizer and logger) in a standard way without enforcing any + input-output protocols to the model and dataloader. It leaves the main + part for the user to implement their own (setup the model, criterion, + optimizer, define a training step, define a validation function and + customize all the text and visual logs). + + It does not save too much boilerplate code. The users still have to write + the forward/backward/update mannually, but they are free to add + non-standard behaviors if needed. + + We have some conventions to follow. + 1. Experiment should have ``model``, ``optimizer``, ``train_loader`` and + ``valid_loader``, ``config`` and ``args`` attributes. + 2. The config should have a ``training`` field, which has + ``valid_interval``, ``save_interval`` and ``max_iteration`` keys. It is + used as the trigger to invoke validation, checkpointing and stop of the + experiment. + 3. There are four methods, namely ``train_batch``, ``valid``, + ``setup_model`` and ``setup_dataloader`` that should be implemented. + + Feel free to add/overwrite other methods and standalone functions if you + need. + + Args: + config (yacs.config.CfgNode): The configuration used for the experiment. + args (argparse.Namespace): The parsed command line arguments. + + Examples: + >>> def main_sp(config, args): + >>> exp = Experiment(config, args) + >>> exp.setup() + >>> exe.resume_or_load() + >>> exp.run() + >>> + >>> config = get_cfg_defaults() + >>> parser = default_argument_parser() + >>> args = parser.parse_args() + >>> if args.config: + >>> config.merge_from_file(args.config) + >>> if args.opts: + >>> config.merge_from_list(args.opts) + >>> config.freeze() + >>> + >>> if args.ngpu > 1: + >>> dist.spawn(main_sp, args=(config, args), nprocs=args.ngpu) + >>> else: + >>> main_sp(config, args) + """ + + def __init__(self, config, args): + self.config = config + self.args = args + + self.model = None + self.optimizer = None + self.iteration = 0 + self.epoch = 0 + self.train_loader = None + self.valid_loader = None + self.iterator = None + self.logger = None + self.visualizer = None + self.output_dir = None + self.checkpoint_dir = None + + def setup(self): + """Setup the experiment. + """ + if self.args.ngpu == 0: + paddle.set_device("cpu") + elif self.args.ngpu > 0: + paddle.set_device("gpu") + else: + print("ngpu should >= 0 !") + if self.parallel: + self.init_parallel() + + self.setup_output_dir() + self.dump_config() + self.setup_visualizer() + self.setup_logger() + self.setup_checkpointer() + + self.setup_dataloader() + self.setup_model() + + self.iteration = 0 + self.epoch = 0 + + @property + def parallel(self): + """A flag indicating whether the experiment should run with + multiprocessing. + """ + return self.args.ngpu > 1 + + def init_parallel(self): + """Init environment for multiprocess training. 
+ """ + dist.init_parallel_env() + + @mp_tools.rank_zero_only + def save(self): + """Save checkpoint (model parameters and optimizer states). + """ + checkpoint.save_parameters(self.checkpoint_dir, self.iteration, + self.model, self.optimizer) + + def resume_or_load(self): + """Resume from latest checkpoint at checkpoints in the output + directory or load a specified checkpoint. + + If ``args.checkpoint_path`` is not None, load the checkpoint, else + resume training. + """ + iteration = checkpoint.load_parameters( + self.model, + self.optimizer, + checkpoint_dir=self.checkpoint_dir, + checkpoint_path=self.args.checkpoint_path) + self.iteration = iteration + + def read_batch(self): + """Read a batch from the train_loader. + + Returns + ------- + List[Tensor] + A batch. + """ + try: + batch = next(self.iterator) + except StopIteration: + self.new_epoch() + batch = next(self.iterator) + return batch + + def new_epoch(self): + """Reset the train loader and increment ``epoch``. + """ + self.epoch += 1 + if self.parallel and isinstance(self.train_loader.batch_sampler, + DistributedBatchSampler): + self.train_loader.batch_sampler.set_epoch(self.epoch) + self.iterator = iter(self.train_loader) + + def train(self): + """The training process. + + It includes forward/backward/update and periodical validation and + saving. + """ + self.new_epoch() + while self.iteration < self.config.training.max_iteration: + self.iteration += 1 + self.train_batch() + + if self.iteration % self.config.training.valid_interval == 0: + self.valid() + + if self.iteration % self.config.training.save_interval == 0: + self.save() + + def run(self): + """The routine of the experiment after setup. This method is intended + to be used by the user. + """ + try: + self.train() + except KeyboardInterrupt as exception: + # delete this, because it can not save a complete model + # self.save() + self.close() + sys.exit(exception) + finally: + self.close() + + def setup_output_dir(self): + """Create a directory used for output. + """ + # output dir + output_dir = Path(self.args.output).expanduser() + output_dir.mkdir(parents=True, exist_ok=True) + + self.output_dir = output_dir + + def setup_checkpointer(self): + """Create a directory used to save checkpoints into. + + It is "checkpoints" inside the output directory. + """ + # checkpoint dir + checkpoint_dir = self.output_dir / "checkpoints" + checkpoint_dir.mkdir(exist_ok=True) + + self.checkpoint_dir = checkpoint_dir + + @mp_tools.rank_zero_only + def close(self): + """Close visualizer to avoid hanging after training""" + # https://github.com/pytorch/fairseq/issues/2357 + self.visualizer.close() + + @mp_tools.rank_zero_only + def setup_visualizer(self): + """Initialize a visualizer to log the experiment. + + The visual log is saved in the output directory. + + Notes + ------ + Only the main process has a visualizer with it. Use multiple + visualizers in multiprocess to write to a same log file may cause + unexpected behaviors. + """ + # visualizer + visualizer = LogWriter(logdir=str(self.output_dir)) + + self.visualizer = visualizer + + def setup_logger(self): + """Initialize a text logger to log the experiment. + + Each process has its own text logger. The logging message is write to + the standard output and a text file named ``worker_n.log`` in the + output directory, where ``n`` means the rank of the process. 
+ """ + logger = logging.getLogger(__name__) + logger.setLevel("INFO") + log_file = self.output_dir / 'worker_{}.log'.format(dist.get_rank()) + logger.addHandler(logging.FileHandler(str(log_file))) + + self.logger = logger + + @mp_tools.rank_zero_only + def dump_config(self): + """Save the configuration used for this experiment. + + It is saved in to ``config.yaml`` in the output directory at the + beginning of the experiment. + """ + with open(self.output_dir / "config.yaml", 'wt') as f: + print(self.config, file=f) + + def train_batch(self): + """The training loop. A subclass should implement this method. + """ + raise NotImplementedError("train_batch should be implemented.") + + @mp_tools.rank_zero_only + @paddle.no_grad() + def valid(self): + """The validation. A subclass should implement this method. + """ + raise NotImplementedError("valid should be implemented.") + + def setup_model(self): + """Setup model, criterion and optimizer, etc. A subclass should + implement this method. + """ + raise NotImplementedError("setup_model should be implemented.") + + def setup_dataloader(self): + """Setup training dataloader and validation dataloader. A subclass + should implement this method. + """ + raise NotImplementedError("setup_dataloader should be implemented.") diff --git a/ernie-sat/paddlespeech/t2s/training/extension.py b/ernie-sat/paddlespeech/t2s/training/extension.py new file mode 100644 index 0000000000000000000000000000000000000000..3f755a76a5e3ef89a41f14b6631eb5f1d345ad4c --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/training/extension.py @@ -0,0 +1,80 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from chainer(https://github.com/chainer/chainer) +from typing import Callable + +PRIORITY_WRITER = 300 +PRIORITY_EDITOR = 200 +PRIORITY_READER = 100 + + +class Extension(object): + """Extension to customize the behavior of Trainer.""" + trigger = (1, 'iteration') + priority = PRIORITY_READER + name = None + + @property + def default_name(self): + """Default name of the extension, class name by default.""" + return type(self).__name__ + + def __call__(self, trainer): + """Main action of the extention. After each update, it is executed + when the trigger fires.""" + raise NotImplementedError( + 'Extension implementation must override __call__.') + + def initialize(self, trainer): + """Action that is executed once to get the corect trainer state. + It is called before training normally, but if the trainer restores + states with an Snapshot extension, this method should also be called.g + """ + pass + + def on_error(self, trainer, exc, tb): + """Handles the error raised during training before finalization. + """ + pass + + def finalize(self, trainer): + """Action that is executed when training is done. + For example, visualizers would need to be closed. 
+ """ + pass + + +def make_extension(trigger: Callable=None, + default_name: str=None, + priority: int=None, + finalizer: Callable=None, + initializer: Callable=None, + on_error: Callable=None): + """Make an Extension-like object by injecting required attributes to it. + """ + if trigger is None: + trigger = Extension.trigger + if priority is None: + priority = Extension.priority + + def decorator(ext): + ext.trigger = trigger + ext.default_name = default_name or ext.__name__ + ext.priority = priority + ext.finalize = finalizer + ext.on_error = on_error + ext.initialize = initializer + return ext + + return decorator diff --git a/ernie-sat/paddlespeech/t2s/training/extensions/__init__.py b/ernie-sat/paddlespeech/t2s/training/extensions/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..abf198b97e6e818e1fbe59006f98492640bcee54 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/training/extensions/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/ernie-sat/paddlespeech/t2s/training/extensions/evaluator.py b/ernie-sat/paddlespeech/t2s/training/extensions/evaluator.py new file mode 100644 index 0000000000000000000000000000000000000000..3940dffe17c6312ff9fee0552a6df4e9903ee7fa --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/training/extensions/evaluator.py @@ -0,0 +1,72 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
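Editor's note: `make_extension` above turns a plain function into an Extension-like object by attaching the required attributes; a minimal sketch (the function body is purely illustrative):

```python
# Hypothetical extension built with make_extension().
from paddlespeech.t2s.training.extension import make_extension


@make_extension(trigger=(100, 'iteration'), priority=100)
def print_progress(trainer):
    # runs every 100 iterations once registered with trainer.extend(print_progress)
    print("iteration:", trainer.updater.state.iteration)
```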
+# Modified from chainer(https://github.com/chainer/chainer) +from typing import Dict + +import paddle +from paddle.io import DataLoader +from paddle.nn import Layer + +from paddlespeech.t2s.training import extension +from paddlespeech.t2s.training.reporter import DictSummary +from paddlespeech.t2s.training.reporter import report +from paddlespeech.t2s.training.reporter import scope + + +class StandardEvaluator(extension.Extension): + + trigger = (1, 'epoch') + default_name = 'validation' + priority = extension.PRIORITY_WRITER + + name = None + + def __init__(self, model: Layer, dataloader: DataLoader): + # it is designed to hold multiple models + models = {"main": model} + self.models: Dict[str, Layer] = models + self.model = model + + # dataloaders + self.dataloader = dataloader + + def evaluate_core(self, batch): + # compute + self.model(batch) # you may report here + + def evaluate(self): + # switch to eval mode + for layer in self.models.values(): + layer.eval() + + # to average evaluation metrics + summary = DictSummary() + for batch in self.dataloader: + observation = {} + with scope(observation): + # main evaluation computation here. + with paddle.no_grad(): + self.evaluate_core(batch) + summary.add(observation) + summary = summary.compute_mean() + return summary + + def __call__(self, trainer=None): + # evaluate and report the averaged metric to current observation + # if it is used to extend a trainer, the metrics is reported to + # to observation of the trainer + # or otherwise, you can use your own observation + summary = self.evaluate() + for k, v in summary.items(): + report(k, v) diff --git a/ernie-sat/paddlespeech/t2s/training/extensions/snapshot.py b/ernie-sat/paddlespeech/t2s/training/extensions/snapshot.py new file mode 100644 index 0000000000000000000000000000000000000000..5f8d3c45c8a3838c5e0df45ebdedaf43edabf537 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/training/extensions/snapshot.py @@ -0,0 +1,110 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from chainer(https://github.com/chainer/chainer) +import logging +import os +from datetime import datetime +from pathlib import Path +from typing import Any +from typing import Dict +from typing import List + +import jsonlines + +from paddlespeech.t2s.training import extension +from paddlespeech.t2s.training.trainer import Trainer +from paddlespeech.t2s.utils.mp_tools import rank_zero_only + + +def load_records(records_fp): + """Load record files (json lines.)""" + with jsonlines.open(records_fp, 'r') as reader: + records = list(reader) + return records + + +class Snapshot(extension.Extension): + """An extension to make snapshot of the updater object inside + the trainer. It is done by calling the updater's `save` method. + + An Updater save its state_dict by default, which contains the + updater state, (i.e. epoch and iteration) and all the model + parameters and optimizer states. 
If the updater inside the trainer + subclasses StandardUpdater, everything is good to go. + + Arsg: + checkpoint_dir (Union[str, Path]): The directory to save checkpoints into. + """ + + trigger = (1, 'epoch') + priority = -100 + default_name = "snapshot" + + def __init__(self, max_size: int=5, snapshot_on_error: bool=False): + self.records: List[Dict[str, Any]] = [] + self.max_size = max_size + self._snapshot_on_error = snapshot_on_error + self._save_all = (max_size == -1) + self.checkpoint_dir = None + + def initialize(self, trainer: Trainer): + """Setting up this extention.""" + self.checkpoint_dir = trainer.out / "checkpoints" + + # load existing records + record_path: Path = self.checkpoint_dir / "records.jsonl" + if record_path.exists(): + logging.debug("Loading from an existing checkpoint dir") + self.records = load_records(record_path) + trainer.updater.load(self.records[-1]['path']) + + def on_error(self, trainer, exc, tb): + if self._snapshot_on_error: + self.save_checkpoint_and_update(trainer) + + def __call__(self, trainer: Trainer): + self.save_checkpoint_and_update(trainer) + + def full(self): + """Whether the number of snapshots it keeps track of is greater + than the max_size.""" + return (not self._save_all) and len(self.records) > self.max_size + + @rank_zero_only + def save_checkpoint_and_update(self, trainer: Trainer): + """Saving new snapshot and remove the oldest snapshot if needed.""" + iteration = trainer.updater.state.iteration + path = self.checkpoint_dir / f"snapshot_iter_{iteration}.pdz" + + # add the new one + trainer.updater.save(path) + record = { + "time": str(datetime.now()), + 'path': str(path.resolve()), # use absolute path + 'iteration': iteration + } + self.records.append(record) + + # remove the earist + if self.full(): + eariest_record = self.records[0] + os.remove(eariest_record["path"]) + self.records.pop(0) + + # update the record file + record_path = self.checkpoint_dir / "records.jsonl" + with jsonlines.open(record_path, 'w') as writer: + for record in self.records: + # jsonlines.open may return a Writer or a Reader + writer.write(record) # pylint: disable=no-member diff --git a/ernie-sat/paddlespeech/t2s/training/extensions/visualizer.py b/ernie-sat/paddlespeech/t2s/training/extensions/visualizer.py new file mode 100644 index 0000000000000000000000000000000000000000..748a7c48f1b45841976a782ef1b941a83a929e8d --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/training/extensions/visualizer.py @@ -0,0 +1,39 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from visualdl import LogWriter + +from paddlespeech.t2s.training import extension +from paddlespeech.t2s.training.trainer import Trainer + + +class VisualDL(extension.Extension): + """A wrapper of visualdl log writer. It assumes that the metrics to be visualized + are all scalars which are recorded into the `.observation` dictionary of the + trainer object. 
The dictionary is created for each step, thus the visualdl log
+    writer uses the iteration from the updater's `iteration` as the global step to
+    add records.
+    """
+    trigger = (1, 'iteration')
+    default_name = 'visualdl'
+    priority = extension.PRIORITY_READER
+
+    def __init__(self, logdir):
+        self.writer = LogWriter(str(logdir))
+
+    def __call__(self, trainer: Trainer):
+        for k, v in trainer.observation.items():
+            self.writer.add_scalar(k, v, step=trainer.updater.state.iteration)
+
+    def finalize(self, trainer):
+        self.writer.close()
diff --git a/ernie-sat/paddlespeech/t2s/training/optimizer.py b/ernie-sat/paddlespeech/t2s/training/optimizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..64274d5380bffcdd483fbaca2a5448a77f6611ee
--- /dev/null
+++ b/ernie-sat/paddlespeech/t2s/training/optimizer.py
@@ -0,0 +1,52 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import paddle
+from paddle import nn
+
+optim_classes = dict(
+    adadelta=paddle.optimizer.Adadelta,
+    adagrad=paddle.optimizer.Adagrad,
+    adam=paddle.optimizer.Adam,
+    adamax=paddle.optimizer.Adamax,
+    adamw=paddle.optimizer.AdamW,
+    lamb=paddle.optimizer.Lamb,
+    momentum=paddle.optimizer.Momentum,
+    rmsprop=paddle.optimizer.RMSProp,
+    sgd=paddle.optimizer.SGD, )
+
+
+def build_optimizers(
+        model: nn.Layer,
+        optim='adadelta',
+        max_grad_norm=None,
+        learning_rate=0.01,
+        weight_decay=None,
+        epsilon=1.0e-6, ) -> paddle.optimizer.Optimizer:
+    optim_class = optim_classes.get(optim)
+    if optim_class is None:
+        raise ValueError(f"must be one of {list(optim_classes)}: {optim}")
+    else:
+        grad_clip = None
+        if max_grad_norm:
+            grad_clip = paddle.nn.ClipGradByGlobalNorm(max_grad_norm)
+        optim_dict = {}
+        optim_dict['parameters'] = model.parameters()
+        optim_dict['learning_rate'] = learning_rate
+        optim_dict['grad_clip'] = grad_clip
+        optim_dict['weight_decay'] = weight_decay
+        # compare the name, not the class: momentum and sgd do not accept `epsilon`
+        if optim not in {'momentum', 'sgd'}:
+            optim_dict['epsilon'] = epsilon
+        optimizers = optim_class(**optim_dict)
+
+    return optimizers
diff --git a/ernie-sat/paddlespeech/t2s/training/reporter.py b/ernie-sat/paddlespeech/t2s/training/reporter.py
new file mode 100644
index 0000000000000000000000000000000000000000..a61506daf04c3b74b02cf4805b6720fdfbd41dc3
--- /dev/null
+++ b/ernie-sat/paddlespeech/t2s/training/reporter.py
@@ -0,0 +1,159 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
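Editor's note: a minimal usage sketch for `build_optimizers` above; the linear layer stands in for a real model and the hyper-parameters are examples only.

```python
# Minimal sketch: build an AdamW optimizer with global-norm gradient clipping.
from paddle import nn
from paddlespeech.t2s.training.optimizer import build_optimizers

model = nn.Linear(80, 80)             # stand-in for a real acoustic model
optimizer = build_optimizers(
    model,
    optim='adamw',
    learning_rate=1e-3,
    max_grad_norm=1.0,                # enables ClipGradByGlobalNorm
    weight_decay=1e-6)
```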
+# Modified from chainer(https://github.com/chainer/chainer) +import contextlib +import math +from collections import defaultdict + +OBSERVATIONS = None + + +@contextlib.contextmanager +def scope(observations): + # make `observation` the target to report to. + # it is basically a dictionary that stores temporary observations + global OBSERVATIONS + old = OBSERVATIONS + OBSERVATIONS = observations + + try: + yield + finally: + OBSERVATIONS = old + + +def get_observations(): + global OBSERVATIONS + return OBSERVATIONS + + +def report(name, value): + # a simple function to report named value + # you can use it everywhere, it will get the default target and writ to it + # you can think of it as std.out + observations = get_observations() + if observations is None: + return + else: + observations[name] = value + + +class Summary(object): + """Online summarization of a sequence of scalars. + Summary computes the statistics of given scalars online. + """ + + def __init__(self): + self._x = 0.0 + self._x2 = 0.0 + self._n = 0 + + def add(self, value, weight=1): + """Adds a scalar value. + + Args: + value: Scalar value to accumulate. It is either a NumPy scalar or + a zero-dimensional array (on CPU or GPU). + weight: An optional weight for the value. It is a NumPy scalar or + a zero-dimensional array (on CPU or GPU). + Default is 1 (integer). + + """ + self._x += weight * value + self._x2 += weight * value * value + self._n += weight + + def compute_mean(self): + """Computes the mean.""" + x, n = self._x, self._n + return x / n + + def make_statistics(self): + """Computes and returns the mean and standard deviation values. + + Returns: + tuple: Mean and standard deviation values. + + """ + x, n = self._x, self._n + mean = x / n + var = self._x2 / n - mean * mean + std = math.sqrt(var) + return mean, std + + +class DictSummary(object): + """Online summarization of a sequence of dictionaries. + + ``DictSummary`` computes the statistics of a given set of scalars online. + It only computes the statistics for scalar values and variables of scalar + values in the dictionaries. + + """ + + def __init__(self): + self._summaries = defaultdict(Summary) + + def add(self, d): + """Adds a dictionary of scalars. + + Args: + d (dict): Dictionary of scalars to accumulate. Only elements of + scalars, zero-dimensional arrays, and variables of + zero-dimensional arrays are accumulated. When the value + is a tuple, the second element is interpreted as a weight. + + """ + summaries = self._summaries + for k, v in d.items(): + w = 1 + if isinstance(v, tuple): + w = v[1] + v = v[0] + summaries[k].add(v, weight=w) + + def compute_mean(self): + """Creates a dictionary of mean values. + + It returns a single dictionary that holds a mean value for each entry + added to the summary. + + Returns: + dict: Dictionary of mean values. + + """ + return { + name: summary.compute_mean() + for name, summary in self._summaries.items() + } + + def make_statistics(self): + """Creates a dictionary of statistics. + + It returns a single dictionary that holds mean and standard deviation + values for every entry added to the summary. For an entry of name + ``'key'``, these values are added to the dictionary by names ``'key'`` + and ``'key.std'``, respectively. + + Returns: + dict: Dictionary of statistics of all entries. 
+ + """ + stats = {} + for name, summary in self._summaries.items(): + mean, std = summary.make_statistics() + stats[name] = mean + stats[name + '.std'] = std + + return stats diff --git a/ernie-sat/paddlespeech/t2s/training/seeding.py b/ernie-sat/paddlespeech/t2s/training/seeding.py new file mode 100644 index 0000000000000000000000000000000000000000..8ca30fd3009a99fa551a6b053f1103ee597977a6 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/training/seeding.py @@ -0,0 +1,26 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import logging +import random + +import numpy as np +import paddle + + +def seed_everything(seed: int): + """Seed paddle, random and np.random to help reproductivity.""" + paddle.seed(seed) + random.seed(seed) + np.random.seed(seed) + logging.debug(f"Set the seed of paddle, random, np.random to {seed}.") diff --git a/ernie-sat/paddlespeech/t2s/training/trainer.py b/ernie-sat/paddlespeech/t2s/training/trainer.py new file mode 100644 index 0000000000000000000000000000000000000000..9a32bca8dba548cf9a52a06d7c2e45b8bd881898 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/training/trainer.py @@ -0,0 +1,202 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
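Editor's note: a short sketch of the reporter and seeding utilities defined above; the loss values are dummy numbers used only to show how `scope()`, `report()` and `DictSummary` fit together.

```python
# Minimal sketch: report values inside a scope() and average them with DictSummary.
from paddlespeech.t2s.training.reporter import DictSummary, report, scope
from paddlespeech.t2s.training.seeding import seed_everything

seed_everything(42)                    # seeds paddle, random and np.random

summary = DictSummary()
for loss in [0.9, 0.7, 0.5]:           # dummy per-batch losses
    observation = {}
    with scope(observation):
        report("train/loss", loss)     # written into `observation`
    summary.add(observation)

print(summary.compute_mean())          # {'train/loss': 0.7} (approximately)
```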
+import sys +import traceback +from collections import OrderedDict +from pathlib import Path +from typing import Callable +from typing import List +from typing import Union + +import six + +from paddlespeech.t2s.training.extension import Extension +from paddlespeech.t2s.training.extension import PRIORITY_READER +from paddlespeech.t2s.training.reporter import scope +from paddlespeech.t2s.training.trigger import get_trigger +from paddlespeech.t2s.training.triggers.limit_trigger import LimitTrigger +from paddlespeech.t2s.training.updater import UpdaterBase +from paddlespeech.t2s.utils import profiler + + +class _ExtensionEntry(object): + def __init__(self, extension, trigger, priority): + self.extension = extension + self.trigger = trigger + self.priority = priority + + +class Trainer(object): + def __init__(self, + updater: UpdaterBase, + stop_trigger: Callable=None, + out: Union[str, Path]='result', + extensions: List[Extension]=None, + profiler_options: str=None): + self.updater = updater + self.extensions = OrderedDict() + self.stop_trigger = LimitTrigger(*stop_trigger) + self.out = Path(out) + self.observation = None + self.profiler_options = profiler_options + self._done = False + if extensions: + for ext in extensions: + self.extend(ext) + + @property + def is_before_training(self): + return self.updater.state.iteration == 0 + + def extend(self, extension, name=None, trigger=None, priority=None): + # get name for the extension + # argument \ + # -> extention's name \ + # -> default_name (class name, when it is an object) \ + # -> function name when it is a function \ + # -> error + + if name is None: + name = getattr(extension, 'name', None) + if name is None: + name = getattr(extension, 'default_name', None) + if name is None: + name = getattr(extension, '__name__', None) + if name is None: + raise ValueError("Name is not given for the extension.") + if name == 'training': + raise ValueError("training is a reserved name.") + + if trigger is None: + trigger = getattr(extension, 'trigger', (1, 'iteration')) + trigger = get_trigger(trigger) + + if priority is None: + priority = getattr(extension, 'priority', PRIORITY_READER) + + # add suffix to avoid nameing conflict + ordinal = 0 + modified_name = name + while modified_name in self.extensions: + ordinal += 1 + modified_name = f"{name}_{ordinal}" + extension.name = modified_name + + self.extensions[modified_name] = _ExtensionEntry(extension, trigger, + priority) + + def get_extension(self, name): + """get extension by name.""" + extensions = self.extensions + if name in extensions: + return extensions[name].extension + else: + raise ValueError(f'extension {name} not found') + + def run(self): + if self._done: + raise RuntimeError("Training is already done!.") + + self.out.mkdir(parents=True, exist_ok=True) + + # sort extensions by priorities once + extension_order = sorted( + self.extensions.keys(), + key=lambda name: self.extensions[name].priority, + reverse=True) + extensions = [(name, self.extensions[name]) for name in extension_order] + + # initializing all extensions + for name, entry in extensions: + if hasattr(entry.extension, "initialize"): + entry.extension.initialize(self) + + update = self.updater.update # training step + + stop_trigger = self.stop_trigger + + # display only one progress bar + max_iteration = None + if isinstance(stop_trigger, LimitTrigger): + if stop_trigger.unit == 'epoch': + max_epoch = self.stop_trigger.limit + updates_per_epoch = getattr(self.updater, "updates_per_epoch", + None) + max_iteration = 
max_epoch * updates_per_epoch if updates_per_epoch else None + else: + max_iteration = self.stop_trigger.limit + + try: + while not stop_trigger(self): + self.observation = {} + # set observation as the report target + # you can use report freely in Updater.update() + + # updating parameters and state + with scope(self.observation): + + update() + if self.profiler_options: + profiler.add_profiler_step(self.profiler_options) + batch_read_time = self.updater.batch_read_time + batch_time = self.updater.batch_time + avg_batch_cost = batch_read_time + batch_time + logger = self.updater.logger + logger.removeHandler(self.updater.filehandler) + msg = self.updater.msg + msg = " iter: {}/{}, ".format(self.updater.state.iteration, + max_iteration) + msg + msg += ", avg_reader_cost: {:.5f} sec, ".format( + batch_read_time + ) + "avg_batch_cost: {:.5f} sec, ".format(avg_batch_cost) + msg += "avg_samples: {}, ".format( + self.updater. + batch_size) + "avg_ips: {:.5f} sequences/sec".format( + self.updater.batch_size / avg_batch_cost) + logger.info(msg) + + # execute extension when necessary + for name, entry in extensions: + if entry.trigger(self): + entry.extension(self) + + # print("###", self.observation) + except Exception as e: + f = sys.stderr + f.write(f"Exception in main training loop: {e}\n") + f.write("Traceback (most recent call last):\n") + traceback.print_tb(sys.exc_info()[2]) + f.write( + "Trainer extensions will try to handle the extension. Then all extensions will finalize." + ) + + # capture the exception in the mian training loop + exc_info = sys.exc_info() + + # try to handle it + for name, entry in extensions: + if hasattr(entry.extension, "on_error"): + try: + entry.extension.on_error(self, e, sys.exc_info()[2]) + except Exception as ee: + f.write(f"Exception in error handler: {ee}\n") + f.write('Traceback (most recent call last):\n') + traceback.print_tb(sys.exc_info()[2]) + + # raise exception in main training loop + six.reraise(*exc_info) + finally: + for name, entry in extensions: + if hasattr(entry.extension, "finalize"): + entry.extension.finalize(self) diff --git a/ernie-sat/paddlespeech/t2s/training/trigger.py b/ernie-sat/paddlespeech/t2s/training/trigger.py new file mode 100644 index 0000000000000000000000000000000000000000..2899562397fdce5a9e6ec88bf61931a52aa202f0 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/training/trigger.py @@ -0,0 +1,28 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
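Editor's note: a pseudocode-level sketch of how the Trainer above is assembled; `my_updater` and `evaluator` are placeholders (an `UpdaterBase` subclass and a `StandardEvaluator`), and the output path is an example, so this illustrates the API rather than being a runnable script.

```python
# Hypothetical training assembly; my_updater / evaluator are placeholders.
from paddlespeech.t2s.training.trainer import Trainer
from paddlespeech.t2s.training.extensions.snapshot import Snapshot
from paddlespeech.t2s.training.extensions.visualizer import VisualDL

trainer = Trainer(
    updater=my_updater,                   # an UpdaterBase subclass
    stop_trigger=(10, 'epoch'),           # wrapped into a LimitTrigger
    out='exp/run1')
trainer.extend(evaluator, trigger=(1, 'epoch'))           # StandardEvaluator
trainer.extend(VisualDL('exp/run1'), trigger=(1, 'iteration'))
trainer.extend(Snapshot(max_size=5), trigger=(1, 'epoch'), priority=-100)
trainer.run()
```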
+from paddlespeech.t2s.training.triggers.interval_trigger import IntervalTrigger + + +def never_fail_trigger(trainer): + return False + + +def get_trigger(trigger): + if trigger is None: + return never_fail_trigger + if callable(trigger): + return trigger + else: + trigger = IntervalTrigger(*trigger) + return trigger diff --git a/ernie-sat/paddlespeech/t2s/training/triggers/__init__.py b/ernie-sat/paddlespeech/t2s/training/triggers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..abf198b97e6e818e1fbe59006f98492640bcee54 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/training/triggers/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/ernie-sat/paddlespeech/t2s/training/triggers/interval_trigger.py b/ernie-sat/paddlespeech/t2s/training/triggers/interval_trigger.py new file mode 100644 index 0000000000000000000000000000000000000000..a83139ba9e1e6560ac2841aecafae8713cfff370 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/training/triggers/interval_trigger.py @@ -0,0 +1,39 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Reference chainer MIT (https://opensource.org/licenses/MIT) + + +class IntervalTrigger(object): + """A Predicate to do something every N cycle.""" + + def __init__(self, period: int, unit: str): + if unit not in ("iteration", "epoch"): + raise ValueError("unit should be 'iteration' or 'epoch'") + if period <= 0: + raise ValueError("period should be a positive integer.") + self.period = period + self.unit = unit + self.last_index = None + + def __call__(self, trainer): + if self.last_index is None: + last_index = getattr(trainer.updater.state, self.unit) + self.last_index = last_index + + last_index = self.last_index + index = getattr(trainer.updater.state, self.unit) + fire = index // self.period != last_index // self.period + + self.last_index = index + return fire diff --git a/ernie-sat/paddlespeech/t2s/training/triggers/limit_trigger.py b/ernie-sat/paddlespeech/t2s/training/triggers/limit_trigger.py new file mode 100644 index 0000000000000000000000000000000000000000..db1db774e9cadf15aa0a5625bbce5e3966c1f7b3 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/training/triggers/limit_trigger.py @@ -0,0 +1,32 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Reference chainer MIT (https://opensource.org/licenses/MIT) + + +class LimitTrigger(object): + """A Predicate to decide whether to stop.""" + + def __init__(self, limit: int, unit: str): + if unit not in ("iteration", "epoch"): + raise ValueError("unit should be 'iteration' or 'epoch'") + if limit <= 0: + raise ValueError("limit should be a positive integer.") + self.limit = limit + self.unit = unit + + def __call__(self, trainer): + state = trainer.updater.state + index = getattr(state, self.unit) + fire = index >= self.limit + return fire diff --git a/ernie-sat/paddlespeech/t2s/training/triggers/time_trigger.py b/ernie-sat/paddlespeech/t2s/training/triggers/time_trigger.py new file mode 100644 index 0000000000000000000000000000000000000000..d7123524b30645935571739c8cac3d78f7d22079 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/training/triggers/time_trigger.py @@ -0,0 +1,36 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Reference chainer MIT (https://opensource.org/licenses/MIT) + + +class TimeTrigger(object): + """Trigger based on a fixed time interval. + + This trigger accepts iterations with a given interval time. + + Args: + period (float): Interval time. It is given in seconds. + + """ + + def __init__(self, period): + self._period = period + self._next_time = self._period + + def __call__(self, trainer): + if self._next_time < trainer.elapsed_time: + self._next_time += self._period + return True + else: + return False diff --git a/ernie-sat/paddlespeech/t2s/training/updater.py b/ernie-sat/paddlespeech/t2s/training/updater.py new file mode 100644 index 0000000000000000000000000000000000000000..a70550317e1ae6e0014db1a849ce3139eb19c542 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/training/updater.py @@ -0,0 +1,86 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
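Editor's note: the trigger helpers above reduce to simple predicates over the trainer; a small sketch of how trigger tuples become callables.

```python
# Sketch: trigger tuples become callables via get_trigger().
from paddlespeech.t2s.training.trigger import get_trigger
from paddlespeech.t2s.training.triggers.limit_trigger import LimitTrigger

every_100_iters = get_trigger((100, 'iteration'))   # -> IntervalTrigger(100, 'iteration')
stop_at_10_epochs = LimitTrigger(10, 'epoch')

# Both are predicates over a trainer: they read trainer.updater.state.iteration
# (or .epoch) and return True when they should fire.
```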
+# Modified from chainer(https://github.com/chainer/chainer)
+import logging
+from dataclasses import dataclass
+
+import paddle
+
+
+@dataclass
+class UpdaterState:
+    iteration: int = 0
+    epoch: int = 0
+
+
+class UpdaterBase(object):
+    """An updater is the abstraction of how a model is trained given the
+    dataloader and the optimizer.
+
+    The `update_core` method is a step in the training loop with only the
+    necessary operations (get a batch, run forward and backward, update the
+    parameters).
+
+    Everything else is implemented as extensions: visualization, saving,
+    loading and periodic validation and evaluation are not considered here.
+
+    Even in such a simple case, things are not that simple. There are
+    attempts to standardize this process so that only the model and the
+    dataset are required and everything else is done automatically, but that
+    hurts flexibility.
+
+    If we assume that a batch yielded by the dataloader is just the input to
+    the model, we find that some models require extra positional or keyword
+    arguments, which prevents us from over-simplifying the interface.
+
+    From another perspective, the batch may include not only the input but
+    also the target, while the model's forward method may need only the
+    input. We could pass a dict or a very long tuple to the model and let it
+    pick what it really needs, but that is an abuse of a lazy interface.
+
+    After all, we care about how a model is trained, not only about how it is
+    used for inference, and we do not want that concern to be tangled up with
+    auxiliary code.
+
+    So the best practice is to define a model and define an updater for it.
+    """
+
+    def __init__(self, init_state=None):
+        if init_state is None:
+            self.state = UpdaterState()
+        else:
+            self.state = init_state
+
+    def update(self, batch):
+        raise NotImplementedError(
+            "Implement your own `update` method for training a step.")
+
+    def state_dict(self):
+        state_dict = {
+            "epoch": self.state.epoch,
+            "iteration": self.state.iteration,
+        }
+        return state_dict
+
+    def set_state_dict(self, state_dict):
+        self.state.epoch = state_dict["epoch"]
+        self.state.iteration = state_dict["iteration"]
+
+    def save(self, path):
+        logging.debug(f"Saving to {path}.")
+        archive = self.state_dict()
+        paddle.save(archive, str(path))
+
+    def load(self, path):
+        logging.debug(f"Loading from {path}.")
+        archive = paddle.load(str(path))
+        self.set_state_dict(archive)
diff --git a/ernie-sat/paddlespeech/t2s/training/updaters/__init__.py b/ernie-sat/paddlespeech/t2s/training/updaters/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..abf198b97e6e818e1fbe59006f98492640bcee54
--- /dev/null
+++ b/ernie-sat/paddlespeech/t2s/training/updaters/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
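To make the contract described in the `UpdaterBase` docstring concrete, here is a minimal sketch (not part of this patch) of a custom updater: it owns a toy model and optimizer, implements `update`, and keeps `UpdaterState` current so that `save`/`load` round-trip the training progress. The model, optimizer and file name below are placeholders.

```python
# Minimal sketch of subclassing UpdaterBase; everything below is illustrative.
import paddle
from paddle import nn

from paddlespeech.t2s.training.updater import UpdaterBase


class ToyUpdater(UpdaterBase):
    def __init__(self, model, optimizer, init_state=None):
        super().__init__(init_state)
        self.model = model
        self.optimizer = optimizer

    def update(self, batch):
        x, y = batch                          # assume (input, target) pairs
        loss = nn.functional.mse_loss(self.model(x), y)
        loss.backward()
        self.optimizer.step()
        self.optimizer.clear_grad()
        self.state.iteration += 1             # keep UpdaterState in sync


model = nn.Linear(4, 1)
optimizer = paddle.optimizer.SGD(
    learning_rate=1e-3, parameters=model.parameters())
updater = ToyUpdater(model, optimizer)
updater.update((paddle.randn([8, 4]), paddle.randn([8, 1])))
updater.save("toy_updater.pdz")               # persists {"epoch", "iteration"}
```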
diff --git a/ernie-sat/paddlespeech/t2s/training/updaters/standard_updater.py b/ernie-sat/paddlespeech/t2s/training/updaters/standard_updater.py new file mode 100644 index 0000000000000000000000000000000000000000..b1c48620ee64b3879e8d3175ca3507b656b79a5f --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/training/updaters/standard_updater.py @@ -0,0 +1,200 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from chainer(https://github.com/chainer/chainer) +import logging +import time +from typing import Dict +from typing import Optional + +from paddle import Tensor +from paddle.io import DataLoader +from paddle.io import DistributedBatchSampler +from paddle.nn import Layer +from paddle.optimizer import Optimizer +from timer import timer + +from paddlespeech.t2s.training.reporter import report +from paddlespeech.t2s.training.updater import UpdaterBase +from paddlespeech.t2s.training.updater import UpdaterState + + +class StandardUpdater(UpdaterBase): + """An example of over-simplification. Things may not be that simple, but + you can subclass it to fit your need. + """ + + def __init__(self, + model: Layer, + optimizer: Optimizer, + dataloader: DataLoader, + init_state: Optional[UpdaterState]=None): + # it is designed to hold multiple models + models = {"main": model} + self.models: Dict[str, Layer] = models + self.model = model + + # it is designed to hold multiple optimizers + optimizers = {"main": optimizer} + self.optimizer = optimizer + self.optimizers: Dict[str, Optimizer] = optimizers + + # dataloaders + self.dataloader = dataloader + + # init state + if init_state is None: + self.state = UpdaterState() + else: + self.state = init_state + + self.train_iterator = iter(dataloader) + self.batch_read_time = 0 + self.batch_time = 0 + + def update(self): + # We increase the iteration index after updating and before extension. + # Here are the reasons. + + # 0. Snapshotting(as well as other extensions, like visualizer) is + # executed after a step of updating; + # 1. We decide to increase the iteration index after updating and + # before any all extension is executed. + # 3. We do not increase the iteration after extension because we + # prefer a consistent resume behavior, when load from a + # `snapshot_iter_100.pdz` then the next step to train is `101`, + # naturally. But if iteration is increased increased after + # extension(including snapshot), then, a `snapshot_iter_99` is + # loaded. You would need a extra increasing of the iteration idex + # before training to avoid another iteration `99`, which has been + # done before snapshotting. + # 4. Thus iteration index represrnts "currently how mant epochs has + # been done." + # NOTE: use report to capture the correctly value. If you want to + # report the learning rate used for a step, you must report it before + # the learning rate scheduler's step() has been called. In paddle's + # convention, we do not use an extension to change the learning rate. 
+ # so if you want to report it, do it in the updater. + + # Then here comes the next question. When is the proper time to + # increase the epoch index? Since all extensions are executed after + # updating, it is the time that after updating is the proper time to + # increase epoch index. + # 1. If we increase the epoch index before updating, then an extension + # based ot epoch would miss the correct timing. It could only be + # triggerd after an extra updating. + # 2. Theoretically, when an epoch is done, the epoch index should be + # increased. So it would be increase after updating. + # 3. Thus, eppoch index represents "currently how many epochs has been + # done." So it starts from 0. + + # switch to training mode + for layer in self.models.values(): + layer.train() + + # training for a step is implemented here + time_before_read = time.time() + batch = self.read_batch() + time_before_core = time.time() + self.update_core(batch) + self.batch_time = time.time() - time_before_core + self.batch_read_time = time_before_core - time_before_read + if isinstance(batch, dict): + self.batch_size = len(list(batch.items())[0][-1]) + # for pwg + elif isinstance(batch, list): + self.batch_size = batch[0].shape[0] + + self.state.iteration += 1 + if self.updates_per_epoch is not None: + if self.state.iteration % self.updates_per_epoch == 0: + self.state.epoch += 1 + + def update_core(self, batch): + """A simple case for a training step. Basic assumptions are: + Single model; + Single optimizer; + A batch from the dataloader is just the input of the model; + The model return a single loss, or a dict containing serval losses. + Parameters updates at every batch, no gradient accumulation. + """ + loss = self.model(*batch) + + if isinstance(loss, Tensor): + loss_dict = {"main": loss} + else: + # Dict[str, Tensor] + loss_dict = loss + if "main" not in loss_dict: + main_loss = 0 + for loss_item in loss.values(): + main_loss += loss_item + loss_dict["main"] = main_loss + + for name, loss_item in loss_dict.items(): + report(name, float(loss_item)) + + self.optimizer.clear_gradient() + loss_dict["main"].backward() + self.optimizer.update() + + @property + def updates_per_epoch(self): + """Number of updater per epoch, determined by the length of the + dataloader.""" + length_of_dataloader = None + try: + length_of_dataloader = len(self.dataloader) + except TypeError: + logging.debug("This dataloader has no __len__.") + finally: + return length_of_dataloader + + def new_epoch(self): + """Start a new epoch.""" + # NOTE: all batch sampler for distributed training should + # subclass DistributedBatchSampler and implement `set_epoch` method + batch_sampler = self.dataloader.batch_sampler + if isinstance(batch_sampler, DistributedBatchSampler): + batch_sampler.set_epoch(self.state.epoch) + self.train_iterator = iter(self.dataloader) + + def read_batch(self): + """Read a batch from the data loader, auto renew when data is exhausted.""" + with timer() as t: + try: + batch = next(self.train_iterator) + except StopIteration: + self.new_epoch() + batch = next(self.train_iterator) + logging.debug( + f"Read a batch takes {t.elapse}s.") # replace it with logging + return batch + + def state_dict(self): + """State dict of a Updater, model, optimizer and updater state are included.""" + state_dict = super().state_dict() + for name, layer in self.models.items(): + state_dict[f"{name}_params"] = layer.state_dict() + for name, optim in self.optimizers.items(): + state_dict[f"{name}_optimizer"] = optim.state_dict() + return 
state_dict + + def set_state_dict(self, state_dict): + """Set state dict for a Updater. Parameters of models, states for + optimizers and UpdaterState are restored.""" + for name, layer in self.models.items(): + layer.set_state_dict(state_dict[f"{name}_params"]) + for name, optim in self.optimizers.items(): + optim.set_state_dict(state_dict[f"{name}_optimizer"]) + super().set_state_dict(state_dict) diff --git a/ernie-sat/paddlespeech/t2s/utils/__init__.py b/ernie-sat/paddlespeech/t2s/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..520c81a26e275eb090b5816e4af584b52036aa6b --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/utils/__init__.py @@ -0,0 +1,22 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from . import checkpoint +from . import display +from . import layer_tools +from . import mp_tools +from . import scheduler + + +def str2bool(str): + return True if str.lower() == 'true' else False diff --git a/ernie-sat/paddlespeech/t2s/utils/checkpoint.py b/ernie-sat/paddlespeech/t2s/utils/checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..1e222c50c12790f3ef5b63d24a6ebd1483122b1b --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/utils/checkpoint.py @@ -0,0 +1,138 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os + +import paddle +from paddle import distributed as dist + +from paddlespeech.t2s.utils import mp_tools + +__all__ = ["load_parameters", "save_parameters"] + + +def _load_latest_checkpoint(checkpoint_dir: str) -> int: + """Get the iteration number corresponding to the latest saved checkpoint. + + Args: + checkpoint_dir (str): the directory where checkpoint is saved. + + Returns: + int: the latest iteration number. + """ + checkpoint_record = os.path.join(checkpoint_dir, "checkpoint") + if (not os.path.isfile(checkpoint_record)): + return 0 + + # Fetch the latest checkpoint index. + with open(checkpoint_record, "rt") as handle: + latest_checkpoint = handle.readline().split()[-1] + iteration = int(latest_checkpoint.split("-")[-1]) + + return iteration + + +def _save_checkpoint(checkpoint_dir: str, iteration: int): + """Save the iteration number of the latest model to be checkpointed. + + Args: + checkpoint_dir (str): the directory where checkpoint is saved. + iteration (int): the latest iteration number. 
+ + Returns: + None + """ + checkpoint_record = os.path.join(checkpoint_dir, "checkpoint") + # Update the latest checkpoint index. + with open(checkpoint_record, "wt") as handle: + handle.write("model_checkpoint_path: step-{}".format(iteration)) + + +def load_parameters(model, + optimizer=None, + checkpoint_dir=None, + checkpoint_path=None): + """Load a specific model checkpoint from disk. + + Args: + model (Layer): model to load parameters. + optimizer (Optimizer, optional): optimizer to load states if needed. + Defaults to None. + checkpoint_dir (str, optional): the directory where checkpoint is saved. + checkpoint_path (str, optional): if specified, load the checkpoint + stored in the checkpoint_path and the argument 'checkpoint_dir' will + be ignored. Defaults to None. + + Returns: + iteration (int): number of iterations that the loaded checkpoint has + been trained. + """ + if checkpoint_path is not None: + iteration = int(os.path.basename(checkpoint_path).split("-")[-1]) + elif checkpoint_dir is not None: + iteration = _load_latest_checkpoint(checkpoint_dir) + if iteration == 0: + return iteration + checkpoint_path = os.path.join(checkpoint_dir, + "step-{}".format(iteration)) + else: + raise ValueError( + "At least one of 'checkpoint_dir' and 'checkpoint_path' should be specified!" + ) + + local_rank = dist.get_rank() + + params_path = checkpoint_path + ".pdparams" + model_dict = paddle.load(params_path) + model.set_state_dict(model_dict) + print("[checkpoint] Rank {}: loaded model from {}".format(local_rank, + params_path)) + + optimizer_path = checkpoint_path + ".pdopt" + if optimizer and os.path.isfile(optimizer_path): + optimizer_dict = paddle.load(optimizer_path) + optimizer.set_state_dict(optimizer_dict) + print("[checkpoint] Rank {}: loaded optimizer state from {}".format( + local_rank, optimizer_path)) + + return iteration + + +@mp_tools.rank_zero_only +def save_parameters(checkpoint_dir, iteration, model, optimizer=None): + """Checkpoint the latest trained model parameters. + + Args: + checkpoint_dir (str): the directory where checkpoint is saved. + iteration (int): the latest iteration number. + model (Layer): model to be checkpointed. + optimizer (Optimizer, optional): optimizer to be checkpointed. + Defaults to None. + + Returns: + None + """ + checkpoint_path = os.path.join(checkpoint_dir, "step-{}".format(iteration)) + + model_dict = model.state_dict() + params_path = checkpoint_path + ".pdparams" + paddle.save(model_dict, params_path) + print("[checkpoint] Saved model to {}".format(params_path)) + + if optimizer: + opt_dict = optimizer.state_dict() + optimizer_path = checkpoint_path + ".pdopt" + paddle.save(opt_dict, optimizer_path) + print("[checkpoint] Saved optimzier state to {}".format(optimizer_path)) + + _save_checkpoint(checkpoint_dir, iteration) diff --git a/ernie-sat/paddlespeech/t2s/utils/display.py b/ernie-sat/paddlespeech/t2s/utils/display.py new file mode 100644 index 0000000000000000000000000000000000000000..af7d44ea42754954175a7e69b487aca92308b645 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/utils/display.py @@ -0,0 +1,110 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import librosa.display +import matplotlib.pylab as plt + +__all__ = [ + "plot_alignment", + "plot_spectrogram", + "plot_waveform", + "plot_multihead_alignments", + "plot_multilayer_multihead_alignments", +] + + +def plot_alignment(alignment, title=None): + # alignment: [encoder_steps, decoder_steps) + fig, ax = plt.subplots(figsize=(6, 4)) + im = ax.imshow( + alignment, aspect='auto', origin='lower', interpolation='none') + fig.colorbar(im, ax=ax) + xlabel = 'Decoder timestep' + if title is not None: + xlabel += '\n\n' + title + plt.xlabel(xlabel) + plt.ylabel('Encoder timestep') + plt.tight_layout() + return fig + + +def plot_multihead_alignments(alignments, title=None): + # alignments: [N, encoder_steps, decoder_steps) + num_subplots = alignments.shape[0] + + fig, axes = plt.subplots( + figsize=(6 * num_subplots, 4), + ncols=num_subplots, + sharey=True, + squeeze=True) + for i, ax in enumerate(axes): + im = ax.imshow( + alignments[i], aspect='auto', origin='lower', interpolation='none') + fig.colorbar(im, ax=ax) + xlabel = 'Decoder timestep' + if title is not None: + xlabel += '\n\n' + title + ax.set_xlabel(xlabel) + if i == 0: + ax.set_ylabel('Encoder timestep') + plt.tight_layout() + return fig + + +def plot_multilayer_multihead_alignments(alignments, title=None): + # alignments: [num_layers, num_heads, encoder_steps, decoder_steps) + num_layers, num_heads, *_ = alignments.shape + + fig, axes = plt.subplots( + figsize=(6 * num_heads, 4 * num_layers), + nrows=num_layers, + ncols=num_heads, + sharex=True, + sharey=True, + squeeze=True) + for i, row in enumerate(axes): + for j, ax in enumerate(row): + im = ax.imshow( + alignments[i, j], + aspect='auto', + origin='lower', + interpolation='none') + fig.colorbar(im, ax=ax) + xlabel = 'Decoder timestep' + if title is not None: + xlabel += '\n\n' + title + if i == num_layers - 1: + ax.set_xlabel(xlabel) + if j == 0: + ax.set_ylabel('Encoder timestep') + plt.tight_layout() + return fig + + +def plot_spectrogram(spec): + # spec: [C, T] librosa convention + fig, ax = plt.subplots(figsize=(12, 3)) + im = ax.imshow(spec, aspect="auto", origin="lower", interpolation='none') + plt.colorbar(im, ax=ax) + plt.xlabel("Frames") + plt.ylabel("Channels") + plt.tight_layout() + return fig + + +def plot_waveform(wav, sr=22050): + fig, ax = plt.subplots(figsize=(12, 3)) + im = librosa.display.waveplot(wav, sr=22050) + plt.colorbar(im, ax=ax) + plt.tight_layout() + return fig diff --git a/ernie-sat/paddlespeech/t2s/utils/error_rate.py b/ernie-sat/paddlespeech/t2s/utils/error_rate.py new file mode 100644 index 0000000000000000000000000000000000000000..41b13b75f06eceefa1c35492fece64864037adc7 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/utils/error_rate.py @@ -0,0 +1,206 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""This module provides functions to calculate error rate in different level. +e.g. wer for word-level, cer for char-level. +""" +import numpy as np + +__all__ = ['word_errors', 'char_errors', 'wer', 'cer'] + + +def _levenshtein_distance(ref, hyp): + """Levenshtein distance is a string metric for measuring the difference + between two sequences. Informally, the levenshtein disctance is defined as + the minimum number of single-character edits (substitutions, insertions or + deletions) required to change one word into the other. We can naturally + extend the edits to word level when calculate levenshtein disctance for + two sentences. + """ + m = len(ref) + n = len(hyp) + + # special case + if ref == hyp: + return 0 + if m == 0: + return n + if n == 0: + return m + + if m < n: + ref, hyp = hyp, ref + m, n = n, m + + # use O(min(m, n)) space + distance = np.zeros((2, n + 1), dtype=np.int32) + + # initialize distance matrix + for j in range(n + 1): + distance[0][j] = j + + # calculate levenshtein distance + for i in range(1, m + 1): + prev_row_idx = (i - 1) % 2 + cur_row_idx = i % 2 + distance[cur_row_idx][0] = i + for j in range(1, n + 1): + if ref[i - 1] == hyp[j - 1]: + distance[cur_row_idx][j] = distance[prev_row_idx][j - 1] + else: + s_num = distance[prev_row_idx][j - 1] + 1 + i_num = distance[cur_row_idx][j - 1] + 1 + d_num = distance[prev_row_idx][j] + 1 + distance[cur_row_idx][j] = min(s_num, i_num, d_num) + + return distance[m % 2][n] + + +def word_errors(reference, hypothesis, ignore_case=False, delimiter=' '): + """Compute the levenshtein distance between reference sequence and + hypothesis sequence in word-level. + + Args: + reference (str): The reference sentence. + hypothesis (str): The hypothesis sentence. + ignore_case (bool): Whether case-sensitive or not. + delimiter (char(str)): Delimiter of input sentences. + + Returns: + list: Levenshtein distance and word number of reference sentence. + """ + if ignore_case: + reference = reference.lower() + hypothesis = hypothesis.lower() + + ref_words = list(filter(None, reference.split(delimiter))) + hyp_words = list(filter(None, hypothesis.split(delimiter))) + + edit_distance = _levenshtein_distance(ref_words, hyp_words) + return float(edit_distance), len(ref_words) + + +def char_errors(reference, hypothesis, ignore_case=False, remove_space=False): + """Compute the levenshtein distance between reference sequence and + hypothesis sequence in char-level. + + Args: + reference (str): The reference sentence. + hypothesis (str): The hypothesis sentence. + ignore_case (bool): Whether case-sensitive or not. + remove_space (bool): Whether remove internal space characters + + Returns: + list: Levenshtein distance and length of reference sentence. 
+ """ + if ignore_case: + reference = reference.lower() + hypothesis = hypothesis.lower() + + join_char = ' ' + if remove_space: + join_char = '' + + reference = join_char.join(list(filter(None, reference.split(' ')))) + hypothesis = join_char.join(list(filter(None, hypothesis.split(' ')))) + + edit_distance = _levenshtein_distance(reference, hypothesis) + return float(edit_distance), len(reference) + + +def wer(reference, hypothesis, ignore_case=False, delimiter=' '): + """Calculate word error rate (WER). WER compares reference text and + hypothesis text in word-level. WER is defined as: + .. math:: + WER = (Sw + Dw + Iw) / Nw + where + .. code-block:: text + Sw is the number of words subsituted, + Dw is the number of words deleted, + Iw is the number of words inserted, + Nw is the number of words in the reference + We can use levenshtein distance to calculate WER. Please draw an attention + that empty items will be removed when splitting sentences by delimiter. + + Args: + reference (str): The reference sentence. + hypothesis (str): The hypothesis sentence. + ignore_case (bool): Whether case-sensitive or not. + delimiter (char): Delimiter of input sentences. + + Returns: + float: Word error rate. + + Raises: + ValueError: If word number of reference is zero. + """ + edit_distance, ref_len = word_errors(reference, hypothesis, ignore_case, + delimiter) + + if ref_len == 0: + raise ValueError("Reference's word number should be greater than 0.") + + wer = float(edit_distance) / ref_len + return wer + + +def cer(reference, hypothesis, ignore_case=False, remove_space=False): + """Calculate charactor error rate (CER). CER compares reference text and + hypothesis text in char-level. CER is defined as: + .. math:: + CER = (Sc + Dc + Ic) / Nc + where + .. code-block:: text + Sc is the number of characters substituted, + Dc is the number of characters deleted, + Ic is the number of characters inserted + Nc is the number of characters in the reference + We can use levenshtein distance to calculate CER. Chinese input should be + encoded to unicode. Please draw an attention that the leading and tailing + space characters will be truncated and multiple consecutive space + characters in a sentence will be replaced by one space character. + + Args: + reference (str): The reference sentence. + hypothesis (str): The hypothesis sentence. + ignore_case (bool): Whether case-sensitive or not. + remove_space (bool): Whether remove internal space characters + + Returns: + float: Character error rate. + + Raises: + ValueError: If the reference length is zero. 
+ """ + edit_distance, ref_len = char_errors(reference, hypothesis, ignore_case, + remove_space) + + if ref_len == 0: + raise ValueError("Length of reference should be greater than 0.") + + cer = float(edit_distance) / ref_len + return cer + + +if __name__ == "__main__": + reference = [ + 'j', 'iou4', 'zh', 'e4', 'iang5', 'x', 'v2', 'b', 'o1', 'k', 'ai1', + 'sh', 'iii3', 'l', 'e5', 'b', 'ei3', 'p', 'iao1', 'sh', 'eng1', 'ia2' + ] + hypothesis = [ + 'j', 'iou4', 'zh', 'e4', 'iang4', 'x', 'v2', 'b', 'o1', 'k', 'ai1', + 'sh', 'iii3', 'l', 'e5', 'b', 'ei3', 'p', 'iao1', 'sh', 'eng1', 'ia2' + ] + reference = " ".join(reference) + hypothesis = " ".join(hypothesis) + print(wer(reference, hypothesis)) diff --git a/ernie-sat/paddlespeech/t2s/utils/h5_utils.py b/ernie-sat/paddlespeech/t2s/utils/h5_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..75c2e448820da8a6dc183e69e5b1e7683f258b28 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/utils/h5_utils.py @@ -0,0 +1,93 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import logging +import sys +from pathlib import Path +from typing import Any +from typing import Union + +import h5py +import numpy as np + + +def read_hdf5(filename: Union[Path, str], dataset_name: str) -> Any: + """Read a dataset from a HDF5 file. + Args: + filename (Union[Path, str]): Path of the HDF5 file. + dataset_name (str): Name of the dataset to read. + + Returns: + Any: The retrieved dataset. + """ + filename = Path(filename) + + if not filename.exists(): + logging.error(f"There is no such a hdf5 file ({filename}).") + sys.exit(1) + + hdf5_file = h5py.File(filename, "r") + + if dataset_name not in hdf5_file: + logging.error(f"There is no such a data in hdf5 file. ({dataset_name})") + sys.exit(1) + + # [()]: a special syntax of h5py to get the dataset as-is + hdf5_data = hdf5_file[dataset_name][()] + hdf5_file.close() + + return hdf5_data + + +def write_hdf5(filename: Union[Path, str], + dataset_name: str, + write_data: np.ndarray, + is_overwrite: bool=True) -> None: + """Write dataset to HDF5 file. + Args: + filename (Union[Path, str]): Path of the HDF5 file. + dataset_name (str): Name of the dataset to write to. + write_data (np.ndarrays): The data to write. + is_overwrite (bool, optional): Whether to overwrite, by default True + """ + # convert to numpy array + filename = Path(filename) + write_data = np.array(write_data) + + # check folder existence + filename.parent.mkdir(parents=True, exist_ok=True) + + # check hdf5 existence + if filename.exists(): + # if already exists, open with r+ mode + hdf5_file = h5py.File(filename, "r+") + # check dataset existence + if dataset_name in hdf5_file: + if is_overwrite: + logging.warning("Dataset in hdf5 file already exists. " + "recreate dataset in hdf5.") + hdf5_file.__delitem__(dataset_name) + else: + logging.error( + "Dataset in hdf5 file already exists. 
" + "if you want to overwrite, please set is_overwrite = True.") + hdf5_file.close() + sys.exit(1) + else: + # if not exists, open with w mode + hdf5_file = h5py.File(filename, "w") + + # write data to hdf5 + hdf5_file.create_dataset(dataset_name, data=write_data) + hdf5_file.flush() + hdf5_file.close() diff --git a/ernie-sat/paddlespeech/t2s/utils/internals.py b/ernie-sat/paddlespeech/t2s/utils/internals.py new file mode 100644 index 0000000000000000000000000000000000000000..6c10bd2d53ebb944e065ab8fac4fc1ffdfadd994 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/utils/internals.py @@ -0,0 +1,52 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import numpy as np +from paddle.framework import core + +__all__ = ["convert_dtype_to_np_dtype_"] + + +def convert_dtype_to_np_dtype_(dtype): + """ + Convert paddle's data type to corrsponding numpy data type. + + Args: + dtype(np.dtype): the data type in paddle. + + Returns: + type: the data type in numpy. + + """ + if dtype is core.VarDesc.VarType.FP32: + return np.float32 + elif dtype is core.VarDesc.VarType.FP64: + return np.float64 + elif dtype is core.VarDesc.VarType.FP16: + return np.float16 + elif dtype is core.VarDesc.VarType.BOOL: + return np.bool + elif dtype is core.VarDesc.VarType.INT32: + return np.int32 + elif dtype is core.VarDesc.VarType.INT64: + return np.int64 + elif dtype is core.VarDesc.VarType.INT16: + return np.int16 + elif dtype is core.VarDesc.VarType.INT8: + return np.int8 + elif dtype is core.VarDesc.VarType.UINT8: + return np.uint8 + elif dtype is core.VarDesc.VarType.BF16: + return np.uint16 + else: + raise ValueError("Not supported dtype %s" % dtype) diff --git a/ernie-sat/paddlespeech/t2s/utils/layer_tools.py b/ernie-sat/paddlespeech/t2s/utils/layer_tools.py new file mode 100644 index 0000000000000000000000000000000000000000..6e971f9863c43420858efa8bd4c06d34d86651c7 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/utils/layer_tools.py @@ -0,0 +1,56 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import numpy as np +from paddle import nn + +__all__ = ["summary", "gradient_norm", "freeze", "unfreeze"] + + +def summary(layer: nn.Layer): + num_params = num_elements = 0 + print("layer summary:") + for name, param in layer.state_dict().items(): + print("{}|{}|{}".format(name, param.shape, np.prod(param.shape))) + num_elements += np.prod(param.shape) + num_params += 1 + print("layer has {} parameters, {} elements.".format(num_params, + num_elements)) + + +def gradient_norm(layer: nn.Layer): + grad_norm_dict = {} + for name, param in layer.state_dict().items(): + if param.trainable: + grad = param.gradient() + grad_norm_dict[name] = np.linalg.norm(grad) / grad.size + return grad_norm_dict + + +def recursively_remove_weight_norm(layer: nn.Layer): + for layer in layer.sublayers(): + try: + nn.utils.remove_weight_norm(layer) + except Exception as e: + # ther is not weight norm hoom in this layer + pass + + +def freeze(layer: nn.Layer): + for param in layer.parameters(): + param.trainable = False + + +def unfreeze(layer: nn.Layer): + for param in layer.parameters(): + param.trainable = True diff --git a/ernie-sat/paddlespeech/t2s/utils/mp_tools.py b/ernie-sat/paddlespeech/t2s/utils/mp_tools.py new file mode 100644 index 0000000000000000000000000000000000000000..ed8c83ee5880edc04fa1f68fe63d0478bf87ace2 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/utils/mp_tools.py @@ -0,0 +1,29 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from functools import wraps + +from paddle import distributed as dist + +__all__ = ["rank_zero_only"] + + +def rank_zero_only(func): + @wraps(func) + def wrapper(*args, **kwargs): + if dist.get_rank() != 0: + return + result = func(*args, **kwargs) + return result + + return wrapper diff --git a/ernie-sat/paddlespeech/t2s/utils/profile.py b/ernie-sat/paddlespeech/t2s/utils/profile.py new file mode 100644 index 0000000000000000000000000000000000000000..5f9b49526c6f1ef036724efdb0deab73ebab9c16 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/utils/profile.py @@ -0,0 +1,34 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
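A short sketch (not part of the patch) of the two utilities defined just above: `freeze`/`unfreeze` from layer_tools.py toggle the `trainable` flag of a layer's parameters, and `rank_zero_only` from mp_tools.py turns a function into a no-op on non-zero ranks during distributed training. The layer and message here are placeholders.

```python
from paddle import nn

from paddlespeech.t2s.utils.layer_tools import freeze, unfreeze
from paddlespeech.t2s.utils.mp_tools import rank_zero_only


@rank_zero_only
def log_from_rank0(msg):
    # Executes on rank 0 only; other ranks return None immediately.
    print(msg)


encoder = nn.Linear(8, 8)
freeze(encoder)                  # all parameters: trainable = False
log_from_rank0("encoder frozen for warm-up")
unfreeze(encoder)                # re-enable training
```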
+from contextlib import contextmanager + +import paddle +from paddle.framework import core +from paddle.framework import CUDAPlace + + +def synchronize(): + """Trigger cuda synchronization for better timing.""" + place = paddle.fluid.framework._current_expected_place() + if isinstance(place, CUDAPlace): + paddle.fluid.core._cuda_synchronize(place) + + +@contextmanager +def nvtx_span(name): + try: + core.nvprof_nvtx_push(name) + yield + finally: + core.nvprof_nvtx_pop() diff --git a/ernie-sat/paddlespeech/t2s/utils/profiler.py b/ernie-sat/paddlespeech/t2s/utils/profiler.py new file mode 100644 index 0000000000000000000000000000000000000000..2bbeb02d19f2c865f43477433cd7870a22bd3779 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/utils/profiler.py @@ -0,0 +1,110 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import sys + +import paddle + +# A global variable to record the number of calling times for profiler +# functions. It is used to specify the tracing range of training steps. +_profiler_step_id = 0 + +# A global variable to avoid parsing from string every time. +_profiler_options = None + + +class ProfilerOptions(object): + ''' + Use a string to initialize a ProfilerOptions. + The string should be in the format: "key1=value1;key2=value;key3=value3". + For example: + "profile_path=model.profile" + "batch_range=[50, 60]; profile_path=model.profile" + "batch_range=[50, 60]; tracer_option=OpDetail; profile_path=model.profile" + ProfilerOptions supports following key-value pair: + batch_range - a integer list, e.g. [100, 110]. + state - a string, the optional values are 'CPU', 'GPU' or 'All'. + sorted_key - a string, the optional values are 'calls', 'total', + 'max', 'min' or 'ave. + tracer_option - a string, the optional values are 'Default', 'OpDetail', + 'AllOpDetail'. + profile_path - a string, the path to save the serialized profile data, + which can be used to generate a timeline. + exit_on_finished - a boolean. 
+ ''' + + def __init__(self, options_str): + assert isinstance(options_str, str) + + self._options = { + 'batch_range': [10, 20], + 'state': 'All', + 'sorted_key': 'total', + 'tracer_option': 'Default', + 'profile_path': '/tmp/profile', + 'exit_on_finished': True + } + self._parse_from_string(options_str) + + def _parse_from_string(self, options_str): + for kv in options_str.replace(' ', '').split(';'): + key, value = kv.split('=') + if key == 'batch_range': + value_list = value.replace('[', '').replace(']', '').split(',') + value_list = list(map(int, value_list)) + if len(value_list) >= 2 and value_list[0] >= 0 and value_list[ + 1] > value_list[0]: + self._options[key] = value_list + elif key == 'exit_on_finished': + self._options[key] = value.lower() in ("yes", "true", "t", "1") + elif key in [ + 'state', 'sorted_key', 'tracer_option', 'profile_path' + ]: + self._options[key] = value + + def __getitem__(self, name): + if self._options.get(name, None) is None: + raise ValueError( + "ProfilerOptions does not have an option named %s." % name) + return self._options[name] + + +def add_profiler_step(options_str=None): + ''' + Enable the operator-level timing using PaddlePaddle's profiler. + The profiler uses a independent variable to count the profiler steps. + One call of this function is treated as a profiler step. + + Args: + profiler_options - a string to initialize the ProfilerOptions. + Default is None, and the profiler is disabled. + ''' + if options_str is None: + return + + global _profiler_step_id + global _profiler_options + + if _profiler_options is None: + _profiler_options = ProfilerOptions(options_str) + + if _profiler_step_id == _profiler_options['batch_range'][0]: + paddle.utils.profiler.start_profiler(_profiler_options['state'], + _profiler_options['tracer_option']) + elif _profiler_step_id == _profiler_options['batch_range'][1]: + paddle.utils.profiler.stop_profiler(_profiler_options['sorted_key'], + _profiler_options['profile_path']) + if _profiler_options['exit_on_finished']: + sys.exit(0) + + _profiler_step_id += 1 diff --git a/ernie-sat/paddlespeech/t2s/utils/scheduler.py b/ernie-sat/paddlespeech/t2s/utils/scheduler.py new file mode 100644 index 0000000000000000000000000000000000000000..9338995a7f2615a4d2d8508c10914df6e5552c91 --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/utils/scheduler.py @@ -0,0 +1,73 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
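For reference, this is how `add_profiler_step` above is typically driven from a training loop (illustrative only, not part of the patch; the options string follows the `key=value;key=value` format documented in `ProfilerOptions`, and the loop body is a placeholder):

```python
from paddlespeech.t2s.utils.profiler import add_profiler_step

options = ("batch_range=[50, 60]; state=GPU; "
           "profile_path=model.profile; exit_on_finished=false")

for step in range(1000):
    # ... forward / backward / optimizer step would go here ...
    # Each call counts as one profiler step: profiling starts when the
    # counter reaches 50 and stops (dumping model.profile) at 60.
    add_profiler_step(options)
```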
+ +__all__ = ["SchedulerBase", "Constant", "PieceWise", "StepWise"] + + +class SchedulerBase(object): + def __call__(self, step): + raise NotImplementedError("You should implement the __call__ method.") + + +class Constant(SchedulerBase): + def __init__(self, value): + self.value = value + + def __call__(self, step): + return self.value + + +class PieceWise(SchedulerBase): + def __init__(self, anchors): + anchors = list(anchors) + anchors = sorted(anchors, key=lambda x: x[0]) + assert anchors[0][0] == 0, "it must start from zero" + self.xs = [item[0] for item in anchors] + self.ys = [item[1] for item in anchors] + self.num_anchors = len(self.xs) + + def __call__(self, step): + i = 0 + for x in self.xs: + if step >= x: + i += 1 + if i == 0: + return self.ys[0] + if i == self.num_anchors: + return self.ys[-1] + k = (self.ys[i] - self.ys[i - 1]) / (self.xs[i] - self.xs[i - 1]) + out = self.ys[i - 1] + (step - self.xs[i - 1]) * k + return out + + +class StepWise(SchedulerBase): + def __init__(self, anchors): + anchors = list(anchors) + anchors = sorted(anchors, key=lambda x: x[0]) + assert anchors[0][0] == 0, "it must start from zero" + self.xs = [item[0] for item in anchors] + self.ys = [item[1] for item in anchors] + self.num_anchors = len(self.xs) + + def __call__(self, step): + i = 0 + for x in self.xs: + if step >= x: + i += 1 + + if i == self.num_anchors: + return self.ys[-1] + if i == 0: + return self.ys[0] + return self.ys[i - 1] diff --git a/ernie-sat/paddlespeech/t2s/utils/timeline.py b/ernie-sat/paddlespeech/t2s/utils/timeline.py new file mode 100644 index 0000000000000000000000000000000000000000..0a5509dbe4530536df81313f7104afcd528a646f --- /dev/null +++ b/ernie-sat/paddlespeech/t2s/utils/timeline.py @@ -0,0 +1,315 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse +import json + +import paddle.fluid.proto.profiler.profiler_pb2 as profiler_pb2 +import six + +parser = argparse.ArgumentParser(description=__doc__) +parser.add_argument( + '--profile_path', + type=str, + default='', + help='Input profile file name. If there are multiple file, the format ' + 'should be trainer1=file1,trainer2=file2,ps=file3') +parser.add_argument( + '--timeline_path', type=str, default='', help='Output timeline file name.') +args = parser.parse_args() + + +class _ChromeTraceFormatter(object): + def __init__(self): + self._events = [] + self._metadata = [] + + def _create_event(self, ph, category, name, pid, tid, timestamp): + """Creates a new Chrome Trace event. + + For details of the file format, see: + https://github.com/catapult-project/catapult/blob/master/tracing/README.md + + Args: + ph: The type of event - usually a single character. + category: The event category as a string. + name: The event name as a string. + pid: Identifier of the process generating this event as an integer. + tid: Identifier of the thread generating this event as an integer. + timestamp: The timestamp of this event as a long integer. 
+ + Returns: + A JSON compatible event object. + """ + event = {} + event['ph'] = ph + event['cat'] = category + event['name'] = name.replace("ParallelExecutor::Run/", "") + event['pid'] = pid + event['tid'] = tid + event['ts'] = timestamp + return event + + def emit_pid(self, name, pid): + """Adds a process metadata event to the trace. + + Args: + name: The process name as a string. + pid: Identifier of the process as an integer. + """ + event = {} + event['name'] = 'process_name' + event['ph'] = 'M' + event['pid'] = pid + event['args'] = {'name': name} + self._metadata.append(event) + + def emit_region(self, timestamp, duration, pid, tid, category, name, args): + """Adds a region event to the trace. + + Args: + timestamp: The start timestamp of this region as a long integer. + duration: The duration of this region as a long integer. + pid: Identifier of the process generating this event as an integer. + tid: Identifier of the thread generating this event as an integer. + category: The event category as a string. + name: The event name as a string. + args: A JSON-compatible dictionary of event arguments. + """ + event = self._create_event('X', category, name, pid, tid, timestamp) + event['dur'] = duration + event['args'] = args + self._events.append(event) + + def emit_counter(self, category, name, pid, timestamp, counter, value): + """Emits a record for a single counter. + + Args: + category: The event category as string + name: The event name as string + pid: Identifier of the process generating this event as integer + timestamp: The timestamps of this event as long integer + counter: Name of the counter as string + value: Value of the counter as integer + tid: Thread id of the allocation as integer + """ + event = self._create_event('C', category, name, pid, 0, timestamp) + event['args'] = {counter: value} + self._events.append(event) + + def format_to_string(self, pretty=False): + """Formats the chrome trace to a string. + + Args: + pretty: (Optional.) If True, produce human-readable JSON output. + + Returns: + A JSON-formatted string in Chrome Trace format. + """ + trace = {} + trace['traceEvents'] = self._metadata + self._events + if pretty: + return json.dumps(trace, indent=4, separators=(',', ': ')) + else: + return json.dumps(trace, separators=(',', ':')) + + +class Timeline(object): + def __init__(self, profile_dict): + self._profile_dict = profile_dict + self._pid = 0 + self._devices = dict() + self._mem_devices = dict() + self._chrome_trace = _ChromeTraceFormatter() + + def _allocate_pid(self): + cur_pid = self._pid + self._pid += 1 + return cur_pid + + def _allocate_pids(self): + for k, profile_pb in six.iteritems(self._profile_dict): + for event in profile_pb.events: + if event.type == profiler_pb2.Event.CPU: + if (k, event.device_id, "CPU") not in self._devices: + pid = self._allocate_pid() + self._devices[(k, event.device_id, "CPU")] = pid + # -1 device id represents CUDA API(RunTime) call.(e.g. 
cudaLaunch, cudaMemcpy) + if event.device_id == -1: + self._chrome_trace.emit_pid("%s:cuda_api" % k, pid) + else: + self._chrome_trace.emit_pid( + "%s:cpu:block:%d" % (k, event.device_id), pid) + elif event.type == profiler_pb2.Event.GPUKernel: + if (k, event.device_id, "GPUKernel") not in self._devices: + pid = self._allocate_pid() + self._devices[(k, event.device_id, "GPUKernel")] = pid + self._chrome_trace.emit_pid("%s:gpu:%d" % + (k, event.device_id), pid) + if not hasattr(profile_pb, "mem_events"): + continue + for mevent in profile_pb.mem_events: + if mevent.place == profiler_pb2.MemEvent.CUDAPlace: + if (k, mevent.device_id, "GPU") not in self._mem_devices: + pid = self._allocate_pid() + self._mem_devices[(k, mevent.device_id, "GPU")] = pid + self._chrome_trace.emit_pid( + "memory usage on %s:gpu:%d" % (k, mevent.device_id), + pid) + elif mevent.place == profiler_pb2.MemEvent.CPUPlace: + if (k, mevent.device_id, "CPU") not in self._mem_devices: + pid = self._allocate_pid() + self._mem_devices[(k, mevent.device_id, "CPU")] = pid + self._chrome_trace.emit_pid( + "memory usage on %s:cpu:%d" % (k, mevent.device_id), + pid) + elif mevent.place == profiler_pb2.MemEvent.CUDAPinnedPlace: + if (k, mevent.device_id, + "CUDAPinnedPlace") not in self._mem_devices: + pid = self._allocate_pid() + self._mem_devices[(k, mevent.device_id, + "CUDAPinnedPlace")] = pid + self._chrome_trace.emit_pid( + "memory usage on %s:cudapinnedplace:%d" % + (k, mevent.device_id), pid) + elif mevent.place == profiler_pb2.MemEvent.NPUPlace: + if (k, mevent.device_id, "NPU") not in self._mem_devices: + pid = self._allocate_pid() + self._mem_devices[(k, mevent.device_id, "NPU")] = pid + self._chrome_trace.emit_pid( + "memory usage on %s:npu:%d" % (k, mevent.device_id), + pid) + if (k, 0, "CPU") not in self._mem_devices: + pid = self._allocate_pid() + self._mem_devices[(k, 0, "CPU")] = pid + self._chrome_trace.emit_pid("memory usage on %s:cpu:%d" % + (k, 0), pid) + if (k, 0, "GPU") not in self._mem_devices: + pid = self._allocate_pid() + self._mem_devices[(k, 0, "GPU")] = pid + self._chrome_trace.emit_pid("memory usage on %s:gpu:%d" % + (k, 0), pid) + if (k, 0, "CUDAPinnedPlace") not in self._mem_devices: + pid = self._allocate_pid() + self._mem_devices[(k, 0, "CUDAPinnedPlace")] = pid + self._chrome_trace.emit_pid( + "memory usage on %s:cudapinnedplace:%d" % (k, 0), pid) + if (k, 0, "NPU") not in self._mem_devices: + pid = self._allocate_pid() + self._mem_devices[(k, 0, "NPU")] = pid + self._chrome_trace.emit_pid("memory usage on %s:npu:%d" % + (k, 0), pid) + + def _allocate_events(self): + for k, profile_pb in six.iteritems(self._profile_dict): + for event in profile_pb.events: + if event.type == profiler_pb2.Event.CPU: + type = "CPU" + elif event.type == profiler_pb2.Event.GPUKernel: + type = "GPUKernel" + pid = self._devices[(k, event.device_id, type)] + args = {'name': event.name} + if event.memcopy.bytes > 0: + args['mem_bytes'] = event.memcopy.bytes + if hasattr(event, "detail_info") and event.detail_info: + args['detail_info'] = event.detail_info + # TODO(panyx0718): Chrome tracing only handles ms. However, some + # ops takes micro-seconds. Hence, we keep the ns here. 
+ self._chrome_trace.emit_region( + event.start_ns, (event.end_ns - event.start_ns) / 1.0, pid, + event.sub_device_id, 'Op', event.name, args) + + def _allocate_memory_event(self): + if not hasattr(profiler_pb2, "MemEvent"): + return + place_to_str = { + profiler_pb2.MemEvent.CPUPlace: "CPU", + profiler_pb2.MemEvent.CUDAPlace: "GPU", + profiler_pb2.MemEvent.CUDAPinnedPlace: "CUDAPinnedPlace", + profiler_pb2.MemEvent.NPUPlace: "NPU" + } + for k, profile_pb in six.iteritems(self._profile_dict): + mem_list = [] + end_profiler = 0 + for mevent in profile_pb.mem_events: + crt_info = dict() + crt_info['time'] = mevent.start_ns + crt_info['size'] = mevent.bytes + if mevent.place in place_to_str: + place = place_to_str[mevent.place] + else: + place = "UnDefine" + crt_info['place'] = place + pid = self._mem_devices[(k, mevent.device_id, place)] + crt_info['pid'] = pid + crt_info['thread_id'] = mevent.thread_id + crt_info['device_id'] = mevent.device_id + mem_list.append(crt_info) + crt_info = dict() + crt_info['place'] = place + crt_info['pid'] = pid + crt_info['thread_id'] = mevent.thread_id + crt_info['device_id'] = mevent.device_id + crt_info['time'] = mevent.end_ns + crt_info['size'] = -mevent.bytes + mem_list.append(crt_info) + end_profiler = max(end_profiler, crt_info['time']) + mem_list.sort(key=lambda tmp: (tmp.get('time', 0))) + i = 0 + total_size = 0 + while i < len(mem_list): + total_size += mem_list[i]['size'] + while i < len(mem_list) - 1 and mem_list[i]['time'] == mem_list[ + i + 1]['time']: + total_size += mem_list[i + 1]['size'] + i += 1 + + self._chrome_trace.emit_counter( + "Memory", "Memory", mem_list[i]['pid'], mem_list[i]['time'], + 0, total_size) + i += 1 + + def generate_chrome_trace(self): + self._allocate_pids() + self._allocate_events() + self._allocate_memory_event() + return self._chrome_trace.format_to_string() + + +profile_path = '/tmp/profile' +if args.profile_path: + profile_path = args.profile_path +timeline_path = '/tmp/timeline' +if args.timeline_path: + timeline_path = args.timeline_path + +profile_paths = profile_path.split(',') +profile_dict = dict() +if len(profile_paths) == 1: + with open(profile_path, 'rb') as f: + profile_s = f.read() + profile_pb = profiler_pb2.Profile() + profile_pb.ParseFromString(profile_s) + profile_dict['trainer'] = profile_pb +else: + for profile_path in profile_paths: + k, v = profile_path.split('=') + with open(v, 'rb') as f: + profile_s = f.read() + profile_pb = profiler_pb2.Profile() + profile_pb.ParseFromString(profile_s) + profile_dict[k] = profile_pb + +tl = Timeline(profile_dict) +with open(timeline_path, 'w') as f: + f.write(tl.generate_chrome_trace()) diff --git a/ernie-sat/paddlespeech/text/__init__.py b/ernie-sat/paddlespeech/text/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..185a92b8d94d3426d616c0624f0f2ee04339349e --- /dev/null +++ b/ernie-sat/paddlespeech/text/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/ernie-sat/paddlespeech/text/exps/__init__.py b/ernie-sat/paddlespeech/text/exps/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..abf198b97e6e818e1fbe59006f98492640bcee54 --- /dev/null +++ b/ernie-sat/paddlespeech/text/exps/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/ernie-sat/paddlespeech/text/exps/ernie_linear/__init__.py b/ernie-sat/paddlespeech/text/exps/ernie_linear/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..abf198b97e6e818e1fbe59006f98492640bcee54 --- /dev/null +++ b/ernie-sat/paddlespeech/text/exps/ernie_linear/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/ernie-sat/paddlespeech/text/exps/ernie_linear/avg_model.py b/ernie-sat/paddlespeech/text/exps/ernie_linear/avg_model.py new file mode 100644 index 0000000000000000000000000000000000000000..036ca14eb6cb17e68d124670bbe8c051c50bb628 --- /dev/null +++ b/ernie-sat/paddlespeech/text/exps/ernie_linear/avg_model.py @@ -0,0 +1,112 @@ +#!/usr/bin/env python3 +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
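Looking back at scheduler.py earlier in this patch, the difference between `PieceWise` and `StepWise` is easiest to see with numbers: `PieceWise` interpolates linearly between `(step, value)` anchors, while `StepWise` holds each value until the next anchor. A small sketch, not part of the patch, with arbitrary anchor points:

```python
from paddlespeech.t2s.utils.scheduler import PieceWise, StepWise

warmup = PieceWise([(0, 0.0), (1000, 1e-3), (5000, 1e-4)])
print(warmup(0))      # 0.0
print(warmup(500))    # 5e-4, halfway up the linear ramp to 1e-3
print(warmup(10000))  # 1e-4, clamped at the last anchor

decay = StepWise([(0, 1e-3), (2000, 5e-4), (4000, 1e-4)])
print(decay(1999))    # 1e-3
print(decay(2000))    # 5e-4, the value switches exactly at the anchor
```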
+import argparse +import glob +import json +import os + +import numpy as np +import paddle + + +def main(args): + paddle.set_device('cpu') + + val_scores = [] + beat_val_scores = [] + selected_epochs = [] + if args.val_best: + jsons = glob.glob(f'{args.ckpt_dir}/[!train]*.json') + for y in jsons: + with open(y, 'r') as f: + dict_json = json.load(f) + loss = dict_json['F1'] + epoch = dict_json['epoch'] + if epoch >= args.min_epoch and epoch <= args.max_epoch: + val_scores.append((epoch, loss)) + + val_scores = np.array(val_scores) + sort_idx = np.argsort(-val_scores[:, 1]) + sorted_val_scores = val_scores[sort_idx] + path_list = [ + args.ckpt_dir + '/{}.pdparams'.format(int(epoch)) + for epoch in sorted_val_scores[:args.num, 0] + ] + + beat_val_scores = sorted_val_scores[:args.num, 1] + selected_epochs = sorted_val_scores[:args.num, 0].astype(np.int64) + print("best val scores = " + str(beat_val_scores)) + print("selected epochs = " + str(selected_epochs)) + else: + path_list = glob.glob(f'{args.ckpt_dir}/[!avg][!final]*.pdparams') + path_list = sorted(path_list, key=os.path.getmtime) + path_list = path_list[-args.num:] + + print(path_list) + + avg = None + num = args.num + assert num == len(path_list) + for path in path_list: + print(f'Processing {path}') + states = paddle.load(path) + if avg is None: + avg = states + else: + for k in avg.keys(): + avg[k] += states[k] + # average + for k in avg.keys(): + if avg[k] is not None: + avg[k] /= num + + paddle.save(avg, args.dst_model) + print(f'Saving to {args.dst_model}') + + meta_path = os.path.splitext(args.dst_model)[0] + '.avg.json' + with open(meta_path, 'w') as f: + data = json.dumps({ + "avg_ckpt": args.dst_model, + "ckpt": path_list, + "epoch": selected_epochs.tolist(), + "val_loss": beat_val_scores.tolist(), + }) + f.write(data + "\n") + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='average model') + parser.add_argument('--dst_model', required=True, help='averaged model') + parser.add_argument( + '--ckpt_dir', required=True, help='ckpt model dir for average') + parser.add_argument( + '--val_best', action="store_true", help='averaged model') + parser.add_argument( + '--num', default=5, type=int, help='nums for averaged model') + parser.add_argument( + '--min_epoch', + default=0, + type=int, + help='min epoch used for averaging model') + parser.add_argument( + '--max_epoch', + default=65536, # Big enough + type=int, + help='max epoch used for averaging model') + + args = parser.parse_args() + print(args) + + main(args) diff --git a/ernie-sat/paddlespeech/text/exps/ernie_linear/punc_restore.py b/ernie-sat/paddlespeech/text/exps/ernie_linear/punc_restore.py new file mode 100644 index 0000000000000000000000000000000000000000..2cb4d07199d790ba600834c836d383f6b6f19238 --- /dev/null +++ b/ernie-sat/paddlespeech/text/exps/ernie_linear/punc_restore.py @@ -0,0 +1,110 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
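A usage sketch for the checkpoint-averaging script above (`avg_model.py`). The invocation below is illustrative only: the checkpoint directory layout (per-epoch `{epoch}.pdparams` files alongside per-epoch JSON metrics containing `F1` and `epoch` fields, which is what the script reads) and all paths are assumptions, not files shipped in this diff.

```python
# Hypothetical invocation of avg_model.py above; every path here is a placeholder.
import subprocess

subprocess.run(
    [
        "python", "avg_model.py",
        "--ckpt_dir", "exp/ernie_linear/checkpoints",     # {epoch}.pdparams plus per-epoch JSON with "F1"/"epoch"
        "--dst_model", "exp/ernie_linear/avg_5.pdparams",
        "--num", "5",
        "--val_best",                                      # average the 5 epochs with the highest validation F1
    ],
    check=True)
# The script also writes exp/ernie_linear/avg_5.avg.json recording which checkpoints went into the average.
```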
+import argparse +import re + +import paddle +import yaml +from paddlenlp.transformers import ErnieTokenizer +from yacs.config import CfgNode + +from paddlespeech.text.models.ernie_linear import ErnieLinear + +DefinedClassifier = { + 'ErnieLinear': ErnieLinear, +} + +tokenizer = ErnieTokenizer.from_pretrained('ernie-1.0') + + +def _clean_text(text, punc_list): + text = text.lower() + text = re.sub('[^A-Za-z0-9\u4e00-\u9fa5]', '', text) + text = re.sub(f'[{"".join([p for p in punc_list][1:])}]', '', text) + return text + + +def preprocess(text, punc_list): + clean_text = _clean_text(text, punc_list) + assert len(clean_text) > 0, f'Invalid input string: {text}' + tokenized_input = tokenizer( + list(clean_text), return_length=True, is_split_into_words=True) + _inputs = dict() + _inputs['input_ids'] = tokenized_input['input_ids'] + _inputs['seg_ids'] = tokenized_input['token_type_ids'] + _inputs['seq_len'] = tokenized_input['seq_len'] + return _inputs + + +def test(args): + with open(args.config) as f: + config = CfgNode(yaml.safe_load(f)) + print("========Args========") + print(yaml.safe_dump(vars(args))) + print("========Config========") + print(config) + + punc_list = [] + with open(config["data_params"]["punc_path"], 'r') as f: + for line in f: + punc_list.append(line.strip()) + + model = DefinedClassifier[config["model_type"]](**config["model"]) + state_dict = paddle.load(args.checkpoint) + model.set_state_dict(state_dict["main_params"]) + model.eval() + _inputs = preprocess(args.text, punc_list) + seq_len = _inputs['seq_len'] + input_ids = paddle.to_tensor(_inputs['input_ids']).unsqueeze(0) + seg_ids = paddle.to_tensor(_inputs['seg_ids']).unsqueeze(0) + logits, _ = model(input_ids, seg_ids) + preds = paddle.argmax(logits, axis=-1).squeeze(0) + tokens = tokenizer.convert_ids_to_tokens( + _inputs['input_ids'][1:seq_len - 1]) + labels = preds[1:seq_len - 1].tolist() + assert len(tokens) == len(labels) + # add 0 for non punc + punc_list = [0] + punc_list + text = '' + for t, l in zip(tokens, labels): + text += t + if l != 0: # Non punc. + text += punc_list[l] + print("Punctuation Restoration Result:", text) + return text + + +def main(): + # parse args and config and redirect to train_sp + parser = argparse.ArgumentParser(description="Run Punctuation Restoration.") + parser.add_argument("--config", type=str, help="ErnieLinear config file.") + parser.add_argument("--checkpoint", type=str, help="snapshot to load.") + parser.add_argument("--text", type=str, help="raw text to be restored.") + parser.add_argument( + "--ngpu", type=int, default=1, help="if ngpu=0, use cpu.") + + args = parser.parse_args() + + if args.ngpu == 0: + paddle.set_device("cpu") + elif args.ngpu > 0: + paddle.set_device("gpu") + else: + print("ngpu should >= 0 !") + + test(args) + + +if __name__ == "__main__": + main() diff --git a/ernie-sat/paddlespeech/text/exps/ernie_linear/test.py b/ernie-sat/paddlespeech/text/exps/ernie_linear/test.py new file mode 100644 index 0000000000000000000000000000000000000000..4302a1a3bfa80e8e8417fc7d5ec2786eda8417df --- /dev/null +++ b/ernie-sat/paddlespeech/text/exps/ernie_linear/test.py @@ -0,0 +1,120 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse + +import numpy as np +import paddle +import pandas as pd +import yaml +from paddle import nn +from paddle.io import DataLoader +from sklearn.metrics import classification_report +from sklearn.metrics import precision_recall_fscore_support +from yacs.config import CfgNode + +from paddlespeech.text.models.ernie_linear import ErnieLinear +from paddlespeech.text.models.ernie_linear import PuncDataset +from paddlespeech.text.models.ernie_linear import PuncDatasetFromErnieTokenizer + +DefinedClassifier = { + 'ErnieLinear': ErnieLinear, +} + +DefinedLoss = { + "ce": nn.CrossEntropyLoss, +} + +DefinedDataset = { + 'Punc': PuncDataset, + 'Ernie': PuncDatasetFromErnieTokenizer, +} + + +def evaluation(y_pred, y_test): + precision, recall, f1, _ = precision_recall_fscore_support( + y_test, y_pred, average=None, labels=[1, 2, 3]) + overall = precision_recall_fscore_support( + y_test, y_pred, average='macro', labels=[1, 2, 3]) + result = pd.DataFrame( + np.array([precision, recall, f1]), + columns=list(['O', 'COMMA', 'PERIOD', 'QUESTION'])[1:], + index=['Precision', 'Recall', 'F1']) + result['OVERALL'] = overall[:3] + return result + + +def test(args): + with open(args.config) as f: + config = CfgNode(yaml.safe_load(f)) + print("========Args========") + print(yaml.safe_dump(vars(args))) + print("========Config========") + print(config) + + test_dataset = DefinedDataset[config["dataset_type"]]( + train_path=config["test_path"], **config["data_params"]) + test_loader = DataLoader( + test_dataset, + batch_size=config.batch_size, + shuffle=False, + drop_last=False) + model = DefinedClassifier[config["model_type"]](**config["model"]) + state_dict = paddle.load(args.checkpoint) + model.set_state_dict(state_dict["main_params"]) + model.eval() + + punc_list = [] + for i in range(len(test_loader.dataset.id2punc)): + punc_list.append(test_loader.dataset.id2punc[i]) + + test_total_label = [] + test_total_predict = [] + + for i, batch in enumerate(test_loader): + input, label = batch + label = paddle.reshape(label, shape=[-1]) + y, logit = model(input) + pred = paddle.argmax(logit, axis=1) + test_total_label.extend(label.numpy().tolist()) + test_total_predict.extend(pred.numpy().tolist()) + t = classification_report( + test_total_label, test_total_predict, target_names=punc_list) + print(t) + t2 = evaluation(test_total_label, test_total_predict) + print('=========================================================') + print(t2) + + +def main(): + # parse args and config and redirect to train_sp + parser = argparse.ArgumentParser(description="Test a ErnieLinear model.") + parser.add_argument("--config", type=str, help="ErnieLinear config file.") + parser.add_argument("--checkpoint", type=str, help="snapshot to load.") + parser.add_argument( + "--ngpu", type=int, default=1, help="if ngpu=0, use cpu.") + + args = parser.parse_args() + + if args.ngpu == 0: + paddle.set_device("cpu") + elif args.ngpu > 0: + paddle.set_device("gpu") + else: + print("ngpu should >= 0 !") + + test(args) + + +if __name__ == "__main__": + main() diff --git a/ernie-sat/paddlespeech/text/exps/ernie_linear/train.py 
b/ernie-sat/paddlespeech/text/exps/ernie_linear/train.py new file mode 100644 index 0000000000000000000000000000000000000000..22c25e172ce9a1b3fadf4a59b9d27f02cd5332c7 --- /dev/null +++ b/ernie-sat/paddlespeech/text/exps/ernie_linear/train.py @@ -0,0 +1,172 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse +import logging +import os +import shutil +from pathlib import Path + +import paddle +import yaml +from paddle import DataParallel +from paddle import distributed as dist +from paddle import nn +from paddle.io import DataLoader +from paddle.optimizer import Adam +from paddle.optimizer.lr import ExponentialDecay +from yacs.config import CfgNode + +from paddlespeech.t2s.training.extensions.snapshot import Snapshot +from paddlespeech.t2s.training.extensions.visualizer import VisualDL +from paddlespeech.t2s.training.seeding import seed_everything +from paddlespeech.t2s.training.trainer import Trainer +from paddlespeech.text.models.ernie_linear import ErnieLinear +from paddlespeech.text.models.ernie_linear import ErnieLinearEvaluator +from paddlespeech.text.models.ernie_linear import ErnieLinearUpdater +from paddlespeech.text.models.ernie_linear import PuncDataset +from paddlespeech.text.models.ernie_linear import PuncDatasetFromErnieTokenizer + +DefinedClassifier = { + 'ErnieLinear': ErnieLinear, +} + +DefinedLoss = { + "ce": nn.CrossEntropyLoss, +} + +DefinedDataset = { + 'Punc': PuncDataset, + 'Ernie': PuncDatasetFromErnieTokenizer, +} + + +def train_sp(args, config): + # decides device type and whether to run in parallel + # setup running environment correctly + if (not paddle.is_compiled_with_cuda()) or args.ngpu == 0: + paddle.set_device("cpu") + else: + paddle.set_device("gpu") + world_size = paddle.distributed.get_world_size() + if world_size > 1: + paddle.distributed.init_parallel_env() + + # set the random seed, it is a must for multiprocess training + seed_everything(config.seed) + + print( + f"rank: {dist.get_rank()}, pid: {os.getpid()}, parent_pid: {os.getppid()}", + ) + # dataloader has been too verbose + logging.getLogger("DataLoader").disabled = True + train_dataset = DefinedDataset[config["dataset_type"]]( + train_path=config["train_path"], **config["data_params"]) + dev_dataset = DefinedDataset[config["dataset_type"]]( + train_path=config["dev_path"], **config["data_params"]) + train_dataloader = DataLoader( + train_dataset, + shuffle=True, + num_workers=config.num_workers, + batch_size=config.batch_size) + + dev_dataloader = DataLoader( + dev_dataset, + batch_size=config.batch_size, + shuffle=False, + drop_last=False, + num_workers=config.num_workers) + + print("dataloaders done!") + + model = DefinedClassifier[config["model_type"]](**config["model"]) + + if world_size > 1: + model = DataParallel(model) + print("model done!") + + criterion = DefinedLoss[config["loss_type"]]( + **config["loss"]) if "loss_type" in config else DefinedLoss["ce"]() + + print("criterions done!") + + lr_schedule = 
ExponentialDecay(**config["scheduler_params"]) + optimizer = Adam( + learning_rate=lr_schedule, + parameters=model.parameters(), + weight_decay=paddle.regularizer.L2Decay( + config["optimizer_params"]["weight_decay"])) + + print("optimizer done!") + + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + if dist.get_rank() == 0: + config_name = args.config.split("/")[-1] + # copy conf to output_dir + shutil.copyfile(args.config, output_dir / config_name) + + updater = ErnieLinearUpdater( + model=model, + criterion=criterion, + scheduler=lr_schedule, + optimizer=optimizer, + dataloader=train_dataloader, + output_dir=output_dir) + + trainer = Trainer(updater, (config.max_epoch, 'epoch'), output_dir) + + evaluator = ErnieLinearEvaluator( + model=model, + criterion=criterion, + dataloader=dev_dataloader, + output_dir=output_dir) + + if dist.get_rank() == 0: + trainer.extend(evaluator, trigger=(1, "epoch")) + trainer.extend(VisualDL(output_dir), trigger=(1, "iteration")) + trainer.extend( + Snapshot(max_size=config.num_snapshots), trigger=(1, 'epoch')) + trainer.run() + + +def main(): + # parse args and config and redirect to train_sp + parser = argparse.ArgumentParser(description="Train a ErnieLinear model.") + parser.add_argument("--config", type=str, help="ErnieLinear config file.") + parser.add_argument("--output-dir", type=str, help="output dir.") + parser.add_argument( + "--ngpu", type=int, default=1, help="if ngpu=0, use cpu.") + + args = parser.parse_args() + + with open(args.config) as f: + config = CfgNode(yaml.safe_load(f)) + + print("========Args========") + print(yaml.safe_dump(vars(args))) + print("========Config========") + print(config) + print( + f"master see the word size: {dist.get_world_size()}, from pid: {os.getpid()}" + ) + + # dispatch + if args.ngpu > 1: + dist.spawn(train_sp, (args, config), nprocs=args.ngpu) + else: + train_sp(args, config) + + +if __name__ == "__main__": + main() diff --git a/ernie-sat/paddlespeech/text/models/__init__.py b/ernie-sat/paddlespeech/text/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..3828e1363cbfb5dd258053c439905662380c6ba7 --- /dev/null +++ b/ernie-sat/paddlespeech/text/models/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from .ernie_crf import ErnieCrf +from .ernie_linear import ErnieLinear diff --git a/ernie-sat/paddlespeech/text/models/ernie_crf/__init__.py b/ernie-sat/paddlespeech/text/models/ernie_crf/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..bbe467ab374f1b9cae51c0073842d5ad1db6912a --- /dev/null +++ b/ernie-sat/paddlespeech/text/models/ernie_crf/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from .model import ErnieCrf diff --git a/ernie-sat/paddlespeech/text/models/ernie_crf/model.py b/ernie-sat/paddlespeech/text/models/ernie_crf/model.py new file mode 100644 index 0000000000000000000000000000000000000000..d1ce8099e123c229ecbcfc5b9b16002d4e034d27 --- /dev/null +++ b/ernie-sat/paddlespeech/text/models/ernie_crf/model.py @@ -0,0 +1,65 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import paddle +import paddle.nn as nn +from paddlenlp.layers.crf import LinearChainCrf +from paddlenlp.layers.crf import LinearChainCrfLoss +from paddlenlp.layers.crf import ViterbiDecoder +from paddlenlp.transformers import ErnieForTokenClassification + + +class ErnieCrf(nn.Layer): + def __init__(self, + num_classes, + pretrained_token='ernie-1.0', + crf_lr=100, + **kwargs): + super().__init__() + self.ernie = ErnieForTokenClassification.from_pretrained( + pretrained_token, num_classes=num_classes, **kwargs) + self.num_classes = num_classes + self.crf = LinearChainCrf( + self.num_classes, crf_lr=crf_lr, with_start_stop_tag=False) + self.crf_loss = LinearChainCrfLoss(self.crf) + self.viterbi_decoder = ViterbiDecoder( + self.crf.transitions, with_start_stop_tag=False) + + def forward(self, + input_ids, + token_type_ids=None, + position_ids=None, + attention_mask=None, + lengths=None, + labels=None): + logits = self.ernie( + input_ids, + token_type_ids=token_type_ids, + attention_mask=attention_mask, + position_ids=position_ids) + + if lengths is None: + lengths = paddle.ones( + shape=[input_ids.shape[0]], + dtype=paddle.int64) * input_ids.shape[1] + + _, prediction = self.viterbi_decoder(logits, lengths) + prediction = prediction.reshape([-1]) + + if labels is not None: + labels = labels.reshape([input_ids.shape[0], -1]) + loss = self.crf_loss(logits, lengths, labels) + avg_loss = paddle.mean(loss) + return avg_loss, prediction + else: + return prediction diff --git a/ernie-sat/paddlespeech/text/models/ernie_linear/__init__.py b/ernie-sat/paddlespeech/text/models/ernie_linear/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..0a10a6eb2d3119b89a7e88d98254571735b29cfa --- /dev/null +++ b/ernie-sat/paddlespeech/text/models/ernie_linear/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from .dataset import * +from .ernie_linear import * +from .ernie_linear_updater import * diff --git a/ernie-sat/paddlespeech/text/models/ernie_linear/dataset.py b/ernie-sat/paddlespeech/text/models/ernie_linear/dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..64c8d0bdfd34d32d016b539f147bdfa145b604f0 --- /dev/null +++ b/ernie-sat/paddlespeech/text/models/ernie_linear/dataset.py @@ -0,0 +1,154 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import numpy as np +import paddle +from paddle.io import Dataset +from paddlenlp.transformers import ErnieTokenizer + +__all__ = ["PuncDataset", "PuncDatasetFromErnieTokenizer"] + + +class PuncDataset(Dataset): + def __init__(self, train_path, vocab_path, punc_path, seq_len=100): + self.seq_len = seq_len + + self.word2id = self.load_vocab( + vocab_path, extra_word_list=['', '']) + self.id2word = {v: k for k, v in self.word2id.items()} + self.punc2id = self.load_vocab(punc_path, extra_word_list=[" "]) + self.id2punc = {k: v for (v, k) in self.punc2id.items()} + + tmp_seqs = open(train_path, encoding='utf-8').readlines() + self.txt_seqs = [i for seq in tmp_seqs for i in seq.split()] + self.preprocess(self.txt_seqs) + + def __len__(self): + """return the sentence nums in .txt + """ + return self.in_len + + def __getitem__(self, index): + return self.input_data[index], self.label[index] + + def load_vocab(self, vocab_path, extra_word_list=[], encoding='utf-8'): + n = len(extra_word_list) + with open(vocab_path, encoding='utf-8') as vf: + vocab = {word.strip(): i + n for i, word in enumerate(vf)} + for i, word in enumerate(extra_word_list): + vocab[word] = i + return vocab + + def preprocess(self, txt_seqs: list): + input_data = [] + label = [] + input_r = [] + label_r = [] + + count = 0 + length = len(txt_seqs) + for token in txt_seqs: + count += 1 + if count == length: + break + if token in self.punc2id: + continue + punc = txt_seqs[count] + if punc not in self.punc2id: + label.append(self.punc2id[" "]) + input_data.append( + self.word2id.get(token, self.word2id[""])) + input_r.append(token) + label_r.append(' ') + else: + label.append(self.punc2id[punc]) + input_data.append( + self.word2id.get(token, self.word2id[""])) + input_r.append(token) + label_r.append(punc) + if len(input_data) != len(label): + assert 'error: length input_data != label' + + self.in_len = len(input_data) // self.seq_len + len_tmp = self.in_len * self.seq_len + input_data = input_data[:len_tmp] + label = label[:len_tmp] + + self.input_data = paddle.to_tensor( + np.array(input_data, 
dtype='int64').reshape(-1, self.seq_len)) + self.label = paddle.to_tensor( + np.array(label, dtype='int64').reshape(-1, self.seq_len)) + + +class PuncDatasetFromErnieTokenizer(Dataset): + def __init__(self, + train_path, + punc_path, + pretrained_token='ernie-1.0', + seq_len=100): + self.tokenizer = ErnieTokenizer.from_pretrained(pretrained_token) + self.paddingID = self.tokenizer.pad_token_id + self.seq_len = seq_len + self.punc2id = self.load_vocab(punc_path, extra_word_list=[" "]) + self.id2punc = {k: v for (v, k) in self.punc2id.items()} + tmp_seqs = open(train_path, encoding='utf-8').readlines() + self.txt_seqs = [i for seq in tmp_seqs for i in seq.split()] + self.preprocess(self.txt_seqs) + + def __len__(self): + return self.in_len + + def __getitem__(self, index): + return self.input_data[index], self.label[index] + + def load_vocab(self, vocab_path, extra_word_list=[], encoding='utf-8'): + n = len(extra_word_list) + with open(vocab_path, encoding='utf-8') as vf: + vocab = {word.strip(): i + n for i, word in enumerate(vf)} + for i, word in enumerate(extra_word_list): + vocab[word] = i + return vocab + + def preprocess(self, txt_seqs: list): + input_data = [] + label = [] + count = 0 + print("Preprocessing in PuncDatasetFromErnieTokenizer...") + for i in range(len(txt_seqs) - 1): + word = txt_seqs[i] + punc = txt_seqs[i + 1] + if word in self.punc2id: + continue + + token = self.tokenizer(word) + x = token["input_ids"][1:-1] + input_data.extend(x) + + for i in range(len(x) - 1): + label.append(self.punc2id[" "]) + + if punc not in self.punc2id: + label.append(self.punc2id[" "]) + else: + label.append(self.punc2id[punc]) + + if len(input_data) != len(label): + assert 'error: length input_data != label' + + self.in_len = len(input_data) // self.seq_len + len_tmp = self.in_len * self.seq_len + input_data = input_data[:len_tmp] + label = label[:len_tmp] + self.input_data = np.array( + input_data, dtype='int64').reshape(-1, self.seq_len) + self.label = np.array(label, dtype='int64').reshape(-1, self.seq_len) diff --git a/ernie-sat/paddlespeech/text/models/ernie_linear/ernie_linear.py b/ernie-sat/paddlespeech/text/models/ernie_linear/ernie_linear.py new file mode 100644 index 0000000000000000000000000000000000000000..c450a904cc2a82c5f945fa08e3d837a85e381bb7 --- /dev/null +++ b/ernie-sat/paddlespeech/text/models/ernie_linear/ernie_linear.py @@ -0,0 +1,65 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
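Before the classifier definition that follows, a minimal sketch of how the punctuation datasets above are consumed. The two data files are placeholders, not part of this diff: `data/train.txt` holds whitespace-separated words interleaved with punctuation marks, and `data/punc_vocab` lists one punctuation symbol per line.

```python
# Minimal sketch: wrap PuncDatasetFromErnieTokenizer (defined above) in a DataLoader.
from paddle.io import DataLoader

from paddlespeech.text.models.ernie_linear import PuncDatasetFromErnieTokenizer

dataset = PuncDatasetFromErnieTokenizer(
    train_path="data/train.txt",   # placeholder corpus
    punc_path="data/punc_vocab",   # placeholder punctuation vocabulary
    pretrained_token="ernie-1.0",
    seq_len=100)
loader = DataLoader(dataset, batch_size=32, shuffle=True)

for input_ids, labels in loader:
    # input_ids / labels: int64 tensors of shape (batch_size, seq_len);
    # labels[i, j] is the id of the punctuation following token j (0 means no punctuation).
    break
```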
+import os + +import paddle +import paddle.nn as nn +from paddlenlp.transformers import ErnieForTokenClassification + + +class ErnieLinear(nn.Layer): + def __init__(self, + num_classes=None, + pretrained_token='ernie-1.0', + cfg_path=None, + ckpt_path=None, + **kwargs): + super(ErnieLinear, self).__init__() + + if cfg_path is not None and ckpt_path is not None: + cfg_path = os.path.abspath(os.path.expanduser(cfg_path)) + ckpt_path = os.path.abspath(os.path.expanduser(ckpt_path)) + + assert os.path.isfile( + cfg_path), 'Config file is not valid: {}'.format(cfg_path) + assert os.path.isfile( + ckpt_path), 'Checkpoint file is not valid: {}'.format(ckpt_path) + + self.ernie = ErnieForTokenClassification.from_pretrained( + os.path.dirname(cfg_path)) + else: + assert isinstance( + num_classes, int + ) and num_classes > 0, 'Argument `num_classes` must be an integer.' + self.ernie = ErnieForTokenClassification.from_pretrained( + pretrained_token, num_classes=num_classes, **kwargs) + + self.num_classes = self.ernie.num_classes + self.softmax = nn.Softmax() + + def forward(self, + input_ids, + token_type_ids=None, + position_ids=None, + attention_mask=None): + y = self.ernie( + input_ids, + token_type_ids=token_type_ids, + attention_mask=attention_mask, + position_ids=position_ids) + + y = paddle.reshape(y, shape=[-1, self.num_classes]) + logits = self.softmax(y) + + return y, logits diff --git a/ernie-sat/paddlespeech/text/models/ernie_linear/ernie_linear_updater.py b/ernie-sat/paddlespeech/text/models/ernie_linear/ernie_linear_updater.py new file mode 100644 index 0000000000000000000000000000000000000000..8b3d7410e04afff43ea16a821473fe05eb4f195d --- /dev/null +++ b/ernie-sat/paddlespeech/text/models/ernie_linear/ernie_linear_updater.py @@ -0,0 +1,123 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
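Before the training updater below, an illustrative forward pass through the `ErnieLinear` classifier defined above. The token ids come from the matching `ErnieTokenizer`, and `num_classes=4` is an assumption mirroring the four-class setup (no punctuation, comma, period, question mark) used by the evaluation code earlier in this diff.

```python
# Illustrative only: shows the shapes of the two outputs returned by ErnieLinear.forward.
import paddle
from paddlenlp.transformers import ErnieTokenizer

from paddlespeech.text.models.ernie_linear import ErnieLinear

tokenizer = ErnieTokenizer.from_pretrained("ernie-1.0")
model = ErnieLinear(num_classes=4, pretrained_token="ernie-1.0")  # num_classes=4 is assumed for illustration
model.eval()

encoded = tokenizer(list("今天天气很好"), is_split_into_words=True)
input_ids = paddle.to_tensor(encoded["input_ids"]).unsqueeze(0)  # (1, seq_len)

y, logits = model(input_ids)            # both flattened to (seq_len, num_classes)
pred = paddle.argmax(logits, axis=-1)   # predicted punctuation id per token
```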
+import logging + +import paddle +from paddle import distributed as dist +from paddle.io import DataLoader +from paddle.nn import Layer +from paddle.optimizer import Optimizer +from paddle.optimizer.lr import LRScheduler +from sklearn.metrics import f1_score + +from paddlespeech.t2s.training.extensions.evaluator import StandardEvaluator +from paddlespeech.t2s.training.reporter import report +from paddlespeech.t2s.training.updaters.standard_updater import StandardUpdater +logging.basicConfig( + format='%(asctime)s [%(levelname)s] [%(filename)s:%(lineno)d] %(message)s', + datefmt='[%Y-%m-%d %H:%M:%S]') +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + + +class ErnieLinearUpdater(StandardUpdater): + def __init__(self, + model: Layer, + criterion: Layer, + scheduler: LRScheduler, + optimizer: Optimizer, + dataloader: DataLoader, + output_dir=None): + super().__init__(model, optimizer, dataloader, init_state=None) + self.model = model + self.dataloader = dataloader + + self.criterion = criterion + self.scheduler = scheduler + self.optimizer = optimizer + + log_file = output_dir / 'worker_{}.log'.format(dist.get_rank()) + self.filehandler = logging.FileHandler(str(log_file)) + logger.addHandler(self.filehandler) + self.logger = logger + self.msg = "" + + def update_core(self, batch): + self.msg = "Rank: {}, ".format(dist.get_rank()) + losses_dict = {} + + input, label = batch + label = paddle.reshape(label, shape=[-1]) + y, logit = self.model(input) + pred = paddle.argmax(logit, axis=1) + + loss = self.criterion(y, label) + + self.optimizer.clear_grad() + loss.backward() + + self.optimizer.step() + self.scheduler.step() + + F1_score = f1_score( + label.numpy().tolist(), pred.numpy().tolist(), average="macro") + + report("train/loss", float(loss)) + losses_dict["loss"] = float(loss) + report("train/F1_score", float(F1_score)) + losses_dict["F1_score"] = float(F1_score) + + self.msg += ', '.join('{}: {:>.6f}'.format(k, v) + for k, v in losses_dict.items()) + + +class ErnieLinearEvaluator(StandardEvaluator): + def __init__(self, + model: Layer, + criterion: Layer, + dataloader: DataLoader, + output_dir=None): + super().__init__(model, dataloader) + self.model = model + self.criterion = criterion + self.dataloader = dataloader + + log_file = output_dir / 'worker_{}.log'.format(dist.get_rank()) + self.filehandler = logging.FileHandler(str(log_file)) + logger.addHandler(self.filehandler) + self.logger = logger + self.msg = "" + + def evaluate_core(self, batch): + self.msg = "Evaluate: " + losses_dict = {} + + input, label = batch + label = paddle.reshape(label, shape=[-1]) + y, logit = self.model(input) + pred = paddle.argmax(logit, axis=1) + + loss = self.criterion(y, label) + + F1_score = f1_score( + label.numpy().tolist(), pred.numpy().tolist(), average="macro") + + report("eval/loss", float(loss)) + losses_dict["loss"] = float(loss) + report("eval/F1_score", float(F1_score)) + losses_dict["F1_score"] = float(F1_score) + + self.msg += ', '.join('{}: {:>.6f}'.format(k, v) + for k, v in losses_dict.items()) + self.logger.info(self.msg) diff --git a/ernie-sat/paddlespeech/vector/__init__.py b/ernie-sat/paddlespeech/vector/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..185a92b8d94d3426d616c0624f0f2ee04339349e --- /dev/null +++ b/ernie-sat/paddlespeech/vector/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/ernie-sat/paddlespeech/vector/cluster/__init__.py b/ernie-sat/paddlespeech/vector/cluster/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..97043fd7ba6885aac81cad5a49924c23c67d4d47 --- /dev/null +++ b/ernie-sat/paddlespeech/vector/cluster/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/ernie-sat/paddlespeech/vector/cluster/diarization.py b/ernie-sat/paddlespeech/vector/cluster/diarization.py new file mode 100644 index 0000000000000000000000000000000000000000..597aa48070eb6bf8845ccb323e7353a87799029b --- /dev/null +++ b/ernie-sat/paddlespeech/vector/cluster/diarization.py @@ -0,0 +1,1080 @@ +# Copyright (c) 2022 SpeechBrain Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This script contains basic functions used for speaker diarization. +This script has an optional dependency on open source sklearn library. +A few sklearn functions are modified in this script as per requirement. +""" +import argparse +import warnings + +import numpy as np +import scipy +import sklearn +from distutils.util import strtobool +from scipy import sparse +from scipy.sparse.csgraph import connected_components +from scipy.sparse.csgraph import laplacian as csgraph_laplacian +from scipy.sparse.linalg import eigsh +from sklearn.cluster import SpectralClustering +from sklearn.cluster._kmeans import k_means +from sklearn.neighbors import kneighbors_graph + + +def _graph_connected_component(graph, node_id): + """ + Find the largest graph connected components that contains one + given node. + + Arguments + --------- + graph : array-like, shape: (n_samples, n_samples) + Adjacency matrix of the graph, non-zero weight means an edge + between the nodes. + node_id : int + The index of the query node of the graph. + + Returns + ------- + connected_components_matrix : array-like + shape - (n_samples,). 
+ An array of bool value indicating the indexes of the nodes belonging + to the largest connected components of the given query node. + """ + + n_node = graph.shape[0] + if sparse.issparse(graph): + # speed up row-wise access to boolean connection mask + graph = graph.tocsr() + connected_nodes = np.zeros(n_node, dtype=bool) + nodes_to_explore = np.zeros(n_node, dtype=bool) + nodes_to_explore[node_id] = True + for _ in range(n_node): + last_num_component = connected_nodes.sum() + np.logical_or(connected_nodes, nodes_to_explore, out=connected_nodes) + if last_num_component >= connected_nodes.sum(): + break + indices = np.where(nodes_to_explore)[0] + nodes_to_explore.fill(False) + for i in indices: + if sparse.issparse(graph): + neighbors = graph[i].toarray().ravel() + else: + neighbors = graph[i] + np.logical_or(nodes_to_explore, neighbors, out=nodes_to_explore) + return connected_nodes + + +def _graph_is_connected(graph): + """ + Return whether the graph is connected (True) or Not (False) + + Arguments + --------- + graph : array-like or sparse matrix, shape: (n_samples, n_samples) + Adjacency matrix of the graph, non-zero weight means an edge between the nodes. + + Returns + ------- + is_connected : bool + True means the graph is fully connected and False means not. + """ + + if sparse.isspmatrix(graph): + # sparse graph, find all the connected components + n_connected_components, _ = connected_components(graph) + return n_connected_components == 1 + else: + # dense graph, find all connected components start from node 0 + return _graph_connected_component(graph, 0).sum() == graph.shape[0] + + +def _set_diag(laplacian, value, norm_laplacian): + """ + Set the diagonal of the laplacian matrix and convert it to a sparse + format well suited for eigenvalue decomposition. + + Arguments + --------- + laplacian : array or sparse matrix + The graph laplacian. + value : float + The value of the diagonal. + norm_laplacian : bool + Whether the value of the diagonal should be changed or not. + + Returns + ------- + laplacian : array or sparse matrix + An array of matrix in a form that is well suited to fast eigenvalue + decomposition, depending on the bandwidth of the matrix. + """ + + n_nodes = laplacian.shape[0] + # We need all entries in the diagonal to values + if not sparse.isspmatrix(laplacian): + if norm_laplacian: + laplacian.flat[::n_nodes + 1] = value + else: + laplacian = laplacian.tocoo() + if norm_laplacian: + diag_idx = laplacian.row == laplacian.col + laplacian.data[diag_idx] = value + # If the matrix has a small number of diagonals (as in the + # case of structured matrices coming from images), the + # dia format might be best suited for matvec products: + n_diags = np.unique(laplacian.row - laplacian.col).size + if n_diags <= 7: + # 3 or less outer diagonals on each side + laplacian = laplacian.todia() + else: + # csr has the fastest matvec and is thus best suited to + # arpack + laplacian = laplacian.tocsr() + return laplacian + + +def _deterministic_vector_sign_flip(u): + """ + Modify the sign of vectors for reproducibility. Flips the sign of + elements of all the vectors (rows of u) such that the absolute + maximum element of each vector is positive. + + Arguments + --------- + u : ndarray + Array with vectors as its rows. + + Returns + ------- + u_flipped : ndarray + Array with the sign flipped vectors as its rows. The same shape as `u`. 
+ """ + + max_abs_rows = np.argmax(np.abs(u), axis=1) + signs = np.sign(u[range(u.shape[0]), max_abs_rows]) + u *= signs[:, np.newaxis] + return u + + +def _check_random_state(seed): + """ + Turn seed into a np.random.RandomState instance. + + Arguments + --------- + seed : None | int | instance of RandomState + If seed is None, return the RandomState singleton used by np.random. + If seed is an int, return a new RandomState instance seeded with seed. + If seed is already a RandomState instance, return it. + Otherwise raise ValueError. + """ + + if seed is None or seed is np.random: + return np.random.mtrand._rand + if isinstance(seed, numbers.Integral): + return np.random.RandomState(seed) + if isinstance(seed, np.random.RandomState): + return seed + raise ValueError("%r cannot be used to seed a np.random.RandomState" + " instance" % seed) + + +def spectral_embedding( + adjacency, + n_components=8, + norm_laplacian=True, + drop_first=True, ): + """ + Returns spectral embeddings. + + Arguments + --------- + adjacency : array-like or sparse graph + shape - (n_samples, n_samples) + The adjacency matrix of the graph to embed. + n_components : int + The dimension of the projection subspace. + norm_laplacian : bool + If True, then compute normalized Laplacian. + drop_first : bool + Whether to drop the first eigenvector. + + Returns + ------- + embedding : array + Spectral embeddings for each sample. + + Example + ------- + >>> import numpy as np + >>> import diarization as diar + >>> affinity = np.array([[1, 1, 1, 0.5, 0, 0, 0, 0, 0, 0.5], + ... [1, 1, 1, 0, 0, 0, 0, 0, 0, 0], + ... [1, 1, 1, 0, 0, 0, 0, 0, 0, 0], + ... [0.5, 0, 0, 1, 1, 1, 0, 0, 0, 0], + ... [0, 0, 0, 1, 1, 1, 0, 0, 0, 0], + ... [0, 0, 0, 1, 1, 1, 0, 0, 0, 0], + ... [0, 0, 0, 0, 0, 0, 1, 1, 1, 1], + ... [0, 0, 0, 0, 0, 0, 1, 1, 1, 1], + ... [0, 0, 0, 0, 0, 0, 1, 1, 1, 1], + ... [0.5, 0, 0, 0, 0, 0, 1, 1, 1, 1]]) + >>> embs = diar.spectral_embedding(affinity, 3) + >>> # Notice similar embeddings + >>> print(np.around(embs , decimals=3)) + [[ 0.075 0.244 0.285] + [ 0.083 0.356 -0.203] + [ 0.083 0.356 -0.203] + [ 0.26 -0.149 0.154] + [ 0.29 -0.218 -0.11 ] + [ 0.29 -0.218 -0.11 ] + [-0.198 -0.084 -0.122] + [-0.198 -0.084 -0.122] + [-0.198 -0.084 -0.122] + [-0.167 -0.044 0.316]] + """ + + # Whether to drop the first eigenvector + if drop_first: + n_components = n_components + 1 + + if not _graph_is_connected(adjacency): + warnings.warn("Graph is not fully connected, spectral embedding" + " may not work as expected.") + + laplacian, dd = csgraph_laplacian( + adjacency, normed=norm_laplacian, return_diag=True) + + laplacian = _set_diag(laplacian, 1, norm_laplacian) + + laplacian *= -1 + + vals, diffusion_map = eigsh( + laplacian, + k=n_components, + sigma=1.0, + which="LM", ) + + embedding = diffusion_map.T[n_components::-1] + + if norm_laplacian: + embedding = embedding / dd + + embedding = _deterministic_vector_sign_flip(embedding) + if drop_first: + return embedding[1:n_components].T + else: + return embedding[:n_components].T + + +def spectral_clustering( + affinity, + n_clusters=8, + n_components=None, + random_state=None, + n_init=10, ): + """ + Performs spectral clustering. + + Arguments + --------- + affinity : matrix + Affinity matrix. + n_clusters : int + Number of clusters for kmeans. + n_components : int + Number of components to retain while estimating spectral embeddings. + random_state : int + A pseudo random number generator used by kmeans. 
+ n_init : int + Number of time the k-means algorithm will be run with different centroid seeds. + + Returns + ------- + labels : array + Cluster label for each sample. + + Example + ------- + >>> import numpy as np + >>> diarization as diar + >>> affinity = np.array([[1, 1, 1, 0.5, 0, 0, 0, 0, 0, 0.5], + ... [1, 1, 1, 0, 0, 0, 0, 0, 0, 0], + ... [1, 1, 1, 0, 0, 0, 0, 0, 0, 0], + ... [0.5, 0, 0, 1, 1, 1, 0, 0, 0, 0], + ... [0, 0, 0, 1, 1, 1, 0, 0, 0, 0], + ... [0, 0, 0, 1, 1, 1, 0, 0, 0, 0], + ... [0, 0, 0, 0, 0, 0, 1, 1, 1, 1], + ... [0, 0, 0, 0, 0, 0, 1, 1, 1, 1], + ... [0, 0, 0, 0, 0, 0, 1, 1, 1, 1], + ... [0.5, 0, 0, 0, 0, 0, 1, 1, 1, 1]]) + >>> labs = diar.spectral_clustering(affinity, 3) + >>> # print (labs) # [2 2 2 1 1 1 0 0 0 0] + """ + + random_state = _check_random_state(random_state) + n_components = n_clusters if n_components is None else n_components + + maps = spectral_embedding( + affinity, + n_components=n_components, + drop_first=False, ) + + _, labels, _ = k_means( + maps, n_clusters, random_state=random_state, n_init=n_init) + + return labels + + +class EmbeddingMeta: + """ + A utility class to pack deep embeddings and meta-information in one object. + + Arguments + --------- + segset : list + List of session IDs as an array of strings. + stats : tensor + An ndarray of float64. Each line contains embedding + from the corresponding session. + """ + + def __init__( + self, + segset=None, + stats=None, ): + + if segset is None: + self.segset = numpy.empty(0, dtype="|O") + self.stats = numpy.array([], dtype=np.float64) + else: + self.segset = segset + self.stats = stats + + def norm_stats(self): + """ + Divide all first-order statistics by their Euclidean norm. + """ + + vect_norm = np.clip(np.linalg.norm(self.stats, axis=1), 1e-08, np.inf) + self.stats = (self.stats.transpose() / vect_norm).transpose() + + +class SpecClustUnorm: + """ + This class implements the spectral clustering with unnormalized affinity matrix. + Useful when affinity matrix is based on cosine similarities. + + Reference + --------- + Von Luxburg, U. A tutorial on spectral clustering. Stat Comput 17, 395–416 (2007). + https://doi.org/10.1007/s11222-007-9033-z + + Example + ------- + >>> import diarization as diar + >>> clust = diar.SpecClustUnorm(min_num_spkrs=2, max_num_spkrs=10) + >>> emb = [[ 2.1, 3.1, 4.1, 4.2, 3.1], + ... [ 2.2, 3.1, 4.2, 4.2, 3.2], + ... [ 2.0, 3.0, 4.0, 4.1, 3.0], + ... [ 8.0, 7.0, 7.0, 8.1, 9.0], + ... [ 8.1, 7.1, 7.2, 8.1, 9.2], + ... [ 8.3, 7.4, 7.0, 8.4, 9.0], + ... [ 0.3, 0.4, 0.4, 0.5, 0.8], + ... [ 0.4, 0.3, 0.6, 0.7, 0.8], + ... [ 0.2, 0.3, 0.2, 0.3, 0.7], + ... [ 0.3, 0.4, 0.4, 0.4, 0.7],] + >>> # Estimating similarity matrix + >>> sim_mat = clust.get_sim_mat(emb) + >>> print (np.around(sim_mat[5:,5:], decimals=3)) + [[1. 0.957 0.961 0.904 0.966] + [0.957 1. 0.977 0.982 0.997] + [0.961 0.977 1. 0.928 0.972] + [0.904 0.982 0.928 1. 0.976] + [0.966 0.997 0.972 0.976 1. ]] + >>> # Prunning + >>> prunned_sim_mat = clust.p_pruning(sim_mat, 0.3) + >>> print (np.around(prunned_sim_mat[5:,5:], decimals=3)) + [[1. 0. 0. 0. 0. ] + [0. 1. 0. 0.982 0.997] + [0. 0.977 1. 0. 0.972] + [0. 0.982 0. 1. 0.976] + [0. 0.997 0. 0.976 1. ]] + >>> # Symmetrization + >>> sym_prund_sim_mat = 0.5 * (prunned_sim_mat + prunned_sim_mat.T) + >>> print (np.around(sym_prund_sim_mat[5:,5:], decimals=3)) + [[1. 0. 0. 0. 0. ] + [0. 1. 0.489 0.982 0.997] + [0. 0.489 1. 0. 0.486] + [0. 0.982 0. 1. 0.976] + [0. 0.997 0.486 0.976 1. 
]] + >>> # Laplacian + >>> laplacian = clust.get_laplacian(sym_prund_sim_mat) + >>> print (np.around(laplacian[5:,5:], decimals=3)) + [[ 1.999 0. 0. 0. 0. ] + [ 0. 2.468 -0.489 -0.982 -0.997] + [ 0. -0.489 0.975 0. -0.486] + [ 0. -0.982 0. 1.958 -0.976] + [ 0. -0.997 -0.486 -0.976 2.458]] + >>> # Spectral Embeddings + >>> spec_emb, num_of_spk = clust.get_spec_embs(laplacian, 3) + >>> print(num_of_spk) + 3 + >>> # Clustering + >>> clust.cluster_embs(spec_emb, num_of_spk) + >>> # print (clust.labels_) # [0 0 0 2 2 2 1 1 1 1] + >>> # Complete spectral clustering + >>> clust.do_spec_clust(emb, k_oracle=3, p_val=0.3) + >>> # print(clust.labels_) # [0 0 0 2 2 2 1 1 1 1] + """ + + def __init__(self, min_num_spkrs=2, max_num_spkrs=10): + + self.min_num_spkrs = min_num_spkrs + self.max_num_spkrs = max_num_spkrs + + def do_spec_clust(self, X, k_oracle, p_val): + """ + Function for spectral clustering. + + Arguments + --------- + X : array + (n_samples, n_features). + Embeddings extracted from the model. + k_oracle : int + Number of speakers (when oracle number of speakers). + p_val : float + p percent value to prune the affinity matrix. + """ + + # Similarity matrix computation + sim_mat = self.get_sim_mat(X) + + # Refining similarity matrix with p_val + prunned_sim_mat = self.p_pruning(sim_mat, p_val) + + # Symmetrization + sym_prund_sim_mat = 0.5 * (prunned_sim_mat + prunned_sim_mat.T) + + # Laplacian calculation + laplacian = self.get_laplacian(sym_prund_sim_mat) + + # Get Spectral Embeddings + emb, num_of_spk = self.get_spec_embs(laplacian, k_oracle) + + # Perform clustering + self.cluster_embs(emb, num_of_spk) + + def get_sim_mat(self, X): + """ + Returns the similarity matrix based on cosine similarities. + + Arguments + --------- + X : array + (n_samples, n_features). + Embeddings extracted from the model. + + Returns + ------- + M : array + (n_samples, n_samples). + Similarity matrix with cosine similarities between each pair of embedding. + """ + + # Cosine similarities + M = sklearn.metrics.pairwise.cosine_similarity(X, X) + return M + + def p_pruning(self, A, pval): + """ + Refine the affinity matrix by zeroing less similar values. + + Arguments + --------- + A : array + (n_samples, n_samples). + Affinity matrix. + pval : float + p-value to be retained in each row of the affinity matrix. + + Returns + ------- + A : array + (n_samples, n_samples). + Prunned affinity matrix based on p_val. + """ + + n_elems = int((1 - pval) * A.shape[0]) + + # For each row in a affinity matrix + for i in range(A.shape[0]): + low_indexes = np.argsort(A[i, :]) + low_indexes = low_indexes[0:n_elems] + + # Replace smaller similarity values by 0s + A[i, low_indexes] = 0 + + return A + + def get_laplacian(self, M): + """ + Returns the un-normalized laplacian for the given affinity matrix. + + Arguments + --------- + M : array + (n_samples, n_samples) + Affinity matrix. + + Returns + ------- + L : array + (n_samples, n_samples) + Laplacian matrix. + """ + + M[np.diag_indices(M.shape[0])] = 0 + D = np.sum(np.abs(M), axis=1) + D = np.diag(D) + L = D - M + return L + + def get_spec_embs(self, L, k_oracle=4): + """ + Returns spectral embeddings and estimates the number of speakers + using maximum Eigen gap. + + Arguments + --------- + L : array (n_samples, n_samples) + Laplacian matrix. + k_oracle : int + Number of speakers when the condition is oracle number of speakers, + else None. + + Returns + ------- + emb : array (n_samples, n_components) + Spectral embedding for each sample with n Eigen components. 
+ num_of_spk : int + Estimated number of speakers. If the condition is set to the oracle + number of speakers then returns k_oracle. + """ + + lambdas, eig_vecs = scipy.linalg.eigh(L) + + # if params["oracle_n_spkrs"] is True: + if k_oracle is not None: + num_of_spk = k_oracle + else: + lambda_gap_list = self.get_eigen_gaps(lambdas[1:self.max_num_spkrs]) + + num_of_spk = (np.argmax( + lambda_gap_list[:min(self.max_num_spkrs, len(lambda_gap_list))]) + + 2) + + if num_of_spk < self.min_num_spkrs: + num_of_spk = self.min_num_spkrs + + emb = eig_vecs[:, 0:num_of_spk] + + return emb, num_of_spk + + def cluster_embs(self, emb, k): + """ + Clusters the embeddings using kmeans. + + Arguments + --------- + emb : array (n_samples, n_components) + Spectral embedding for each sample with n Eigen components. + k : int + Number of clusters to kmeans. + + Returns + ------- + self.labels_ : self + Labels for each sample embedding. + """ + _, self.labels_, _ = k_means(emb, k) + + def get_eigen_gaps(self, eig_vals): + """ + Returns the difference (gaps) between the Eigen values. + + Arguments + --------- + eig_vals : list + List of eigen values + + Returns + ------- + eig_vals_gap_list : list + List of differences (gaps) between adjacent Eigen values. + """ + + eig_vals_gap_list = [] + for i in range(len(eig_vals) - 1): + gap = float(eig_vals[i + 1]) - float(eig_vals[i]) + eig_vals_gap_list.append(gap) + + return eig_vals_gap_list + + +class SpecCluster(SpectralClustering): + def perform_sc(self, X, n_neighbors=10): + """ + Performs spectral clustering using sklearn on embeddings. + + Arguments + --------- + X : array (n_samples, n_features) + Embeddings to be clustered. + n_neighbors : int + Number of neighbors in estimating affinity matrix. + """ + + # Computation of affinity matrix + connectivity = kneighbors_graph( + X, + n_neighbors=n_neighbors, + include_self=True, ) + self.affinity_matrix_ = 0.5 * (connectivity + connectivity.T) + + # Perform spectral clustering on affinity matrix + self.labels_ = spectral_clustering( + self.affinity_matrix_, + n_clusters=self.n_clusters, ) + return self + + +def is_overlapped(end1, start2): + """ + Returns True if segments are overlapping. + + Arguments + --------- + end1 : float + End time of the first segment. + start2 : float + Start time of the second segment. + + Returns + ------- + overlapped : bool + True of segments overlapped else False. + + Example + ------- + >>> import diarization as diar + >>> diar.is_overlapped(5.5, 3.4) + True + >>> diar.is_overlapped(5.5, 6.4) + False + """ + + if start2 > end1: + return False + else: + return True + + +def merge_ssegs_same_speaker(lol): + """ + Merge adjacent sub-segs from the same speaker. + + Arguments + --------- + lol : list of list + Each list contains [rec_id, seg_start, seg_end, spkr_id]. + + Returns + ------- + new_lol : list of list + new_lol contains adjacent segments merged from the same speaker ID. + + Example + ------- + >>> import diarization as diar + >>> lol=[['r1', 5.5, 7.0, 's1'], + ... ['r1', 6.5, 9.0, 's1'], + ... ['r1', 8.0, 11.0, 's1'], + ... ['r1', 11.5, 13.0, 's2'], + ... ['r1', 14.0, 15.0, 's2'], + ... 
['r1', 14.5, 15.0, 's1']] + >>> diar.merge_ssegs_same_speaker(lol) + [['r1', 5.5, 11.0, 's1'], ['r1', 11.5, 13.0, 's2'], ['r1', 14.0, 15.0, 's2'], ['r1', 14.5, 15.0, 's1']] + """ + + new_lol = [] + + # Start from the first sub-seg + sseg = lol[0] + flag = False + for i in range(1, len(lol)): + next_sseg = lol[i] + + # IF sub-segments overlap AND has same speaker THEN merge + if is_overlapped(sseg[2], next_sseg[1]) and sseg[3] == next_sseg[3]: + sseg[2] = next_sseg[2] # just update the end time + # This is important. For the last sseg, if it is the same speaker the merge + # Make sure we don't append the last segment once more. Hence, set FLAG=True + if i == len(lol) - 1: + flag = True + new_lol.append(sseg) + else: + new_lol.append(sseg) + sseg = next_sseg + + # Add last segment only when it was skipped earlier. + if flag is False: + new_lol.append(lol[-1]) + + return new_lol + + +def distribute_overlap(lol): + """ + Distributes the overlapped speech equally among the adjacent segments + with different speakers. + + Arguments + --------- + lol : list of list + It has each list structure as [rec_id, seg_start, seg_end, spkr_id]. + + Returns + ------- + new_lol : list of list + It contains the overlapped part equally divided among the adjacent + segments with different speaker IDs. + + Example + ------- + >>> import diarization as diar + >>> lol = [['r1', 5.5, 9.0, 's1'], + ... ['r1', 8.0, 11.0, 's2'], + ... ['r1', 11.5, 13.0, 's2'], + ... ['r1', 12.0, 15.0, 's1']] + >>> diar.distribute_overlap(lol) + [['r1', 5.5, 8.5, 's1'], ['r1', 8.5, 11.0, 's2'], ['r1', 11.5, 12.5, 's2'], ['r1', 12.5, 15.0, 's1']] + """ + + new_lol = [] + sseg = lol[0] + + # Add first sub-segment here to avoid error at: "if new_lol[-1] != sseg:" when new_lol is empty + # new_lol.append(sseg) + + for i in range(1, len(lol)): + next_sseg = lol[i] + # No need to check if they are different speakers. + # Because if segments are overlapped then they always have different speakers. + # This is because similar speaker's adjacent sub-segments are already merged by "merge_ssegs_same_speaker()" + + if is_overlapped(sseg[2], next_sseg[1]): + + # Get overlap duration. + # Now this overlap will be divided equally between adjacent segments. + overlap = sseg[2] - next_sseg[1] + + # Update end time of old seg + sseg[2] = sseg[2] - (overlap / 2.0) + + # Update start time of next seg + next_sseg[1] = next_sseg[1] + (overlap / 2.0) + + if len(new_lol) == 0: + # For first sub-segment entry + new_lol.append(sseg) + else: + # To avoid duplicate entries + if new_lol[-1] != sseg: + new_lol.append(sseg) + + # Current sub-segment is next sub-segment + sseg = next_sseg + + else: + # For the first sseg + if len(new_lol) == 0: + new_lol.append(sseg) + else: + # To avoid duplicate entries + if new_lol[-1] != sseg: + new_lol.append(sseg) + + # Update the current sub-segment + sseg = next_sseg + + # Add the remaining last sub-segment + new_lol.append(next_sseg) + + return new_lol + + +def write_rttm(segs_list, out_rttm_file): + """ + Writes the segment list in RTTM format (A standard NIST format). + + Arguments + --------- + segs_list : list of list + Each list contains [rec_id, seg_start, seg_end, spkr_id]. + out_rttm_file : str + Path of the output RTTM file. 
+ """ + + rttm = [] + rec_id = segs_list[0][0] + + for seg in segs_list: + new_row = [ + "SPEAKER", + rec_id, + "0", + str(round(seg[1], 4)), + str(round(seg[2] - seg[1], 4)), + "", + "", + seg[3], + "", + "", + ] + rttm.append(new_row) + + with open(out_rttm_file, "w") as f: + for row in rttm: + line_str = " ".join(row) + f.write("%s\n" % line_str) + + +def do_AHC(diary_obj, out_rttm_file, rec_id, k_oracle=4, p_val=0.3): + """ + Performs Agglomerative Hierarchical Clustering on embeddings. + + Arguments + --------- + diary_obj : EmbeddingMeta type + Contains embeddings in diary_obj.stats and segment IDs in diary_obj.segset. + out_rttm_file : str + Path of the output RTTM file. + rec_id : str + Recording ID for the recording under processing. + k : int + Number of speaker (None, if it has to be estimated). + pval : float + `pval` for prunning affinity matrix. Used only when number of speakers + are unknown. Note that this is just for experiment. Prefer Spectral clustering + for better clustering results. + """ + + from sklearn.cluster import AgglomerativeClustering + + # p_val is the threshold_val (for AHC) + diary_obj.norm_stats() + + # processing + if k_oracle is not None: + num_of_spk = k_oracle + + clustering = AgglomerativeClustering( + n_clusters=num_of_spk, + affinity="cosine", + linkage="average", ).fit(diary_obj.stats) + labels = clustering.labels_ + + else: + # Estimate num of using max eigen gap with `cos` affinity matrix. + # This is just for experimentation. + clustering = AgglomerativeClustering( + n_clusters=None, + affinity="cosine", + linkage="average", + distance_threshold=p_val, ).fit(diary_obj.stats) + labels = clustering.labels_ + + # Convert labels to speaker boundaries + subseg_ids = diary_obj.segset + lol = [] + + for i in range(labels.shape[0]): + spkr_id = rec_id + "_" + str(labels[i]) + + sub_seg = subseg_ids[i] + + splitted = sub_seg.rsplit("_", 2) + rec_id = str(splitted[0]) + sseg_start = float(splitted[1]) + sseg_end = float(splitted[2]) + + a = [rec_id, sseg_start, sseg_end, spkr_id] + lol.append(a) + + # Sorting based on start time of sub-segment + lol.sort(key=lambda x: float(x[1])) + + # Merge and split in 2 simple steps: (i) Merge sseg of same speakers then (ii) split different speakers + # Step 1: Merge adjacent sub-segments that belong to same speaker (or cluster) + lol = merge_ssegs_same_speaker(lol) + + # Step 2: Distribute duration of adjacent overlapping sub-segments belonging to different speakers (or cluster) + # Taking mid-point as the splitting time location. + lol = distribute_overlap(lol) + + # logger.info("Completed diarizing " + rec_id) + write_rttm(lol, out_rttm_file) + + +def do_spec_clustering(diary_obj, out_rttm_file, rec_id, k, pval, affinity_type, + n_neighbors): + """ + Performs spectral clustering on embeddings. This function calls specific + clustering algorithms as per affinity. + + Arguments + --------- + diary_obj : EmbeddingMeta type + Contains embeddings in diary_obj.stats and segment IDs in diary_obj.segset. + out_rttm_file : str + Path of the output RTTM file. + rec_id : str + Recording ID for the recording under processing. + k : int + Number of speaker (None, if it has to be estimated). + pval : float + `pval` for prunning affinity matrix. + affinity_type : str + Type of similarity to be used to get affinity matrix (cos or nn). 
+ """ + + if affinity_type == "cos": + clust_obj = SpecClustUnorm(min_num_spkrs=2, max_num_spkrs=10) + k_oracle = k # use it only when oracle num of speakers + clust_obj.do_spec_clust(diary_obj.stats, k_oracle, pval) + labels = clust_obj.labels_ + else: + clust_obj = SpecCluster( + n_clusters=k, + assign_labels="kmeans", + random_state=1234, + affinity="nearest_neighbors", ) + clust_obj.perform_sc(diary_obj.stats, n_neighbors) + labels = clust_obj.labels_ + + # Convert labels to speaker boundaries + subseg_ids = diary_obj.segset + lol = [] + + for i in range(labels.shape[0]): + spkr_id = rec_id + "_" + str(labels[i]) + + sub_seg = subseg_ids[i] + + splitted = sub_seg.rsplit("_", 2) + rec_id = str(splitted[0]) + sseg_start = float(splitted[1]) + sseg_end = float(splitted[2]) + + a = [rec_id, sseg_start, sseg_end, spkr_id] + lol.append(a) + + # Sorting based on start time of sub-segment + lol.sort(key=lambda x: float(x[1])) + + # Merge and split in 2 simple steps: (i) Merge sseg of same speakers then (ii) split different speakers + # Step 1: Merge adjacent sub-segments that belong to same speaker (or cluster) + lol = merge_ssegs_same_speaker(lol) + + # Step 2: Distribute duration of adjacent overlapping sub-segments belonging to different speakers (or cluster) + # Taking mid-point as the splitting time location. + lol = distribute_overlap(lol) + + # logger.info("Completed diarizing " + rec_id) + write_rttm(lol, out_rttm_file) + + +if __name__ == '__main__': + + parser = argparse.ArgumentParser( + prog='python diarization.py --backend AHC', description='diarizing') + parser.add_argument( + '--sys_rttm_dir', + required=False, + help='Directory to store system RTTM files') + parser.add_argument( + '--ref_rttm_dir', + required=False, + help='Directory to store reference RTTM files') + parser.add_argument( + '--backend', default="AHC", help='type of backend, AHC or SC or kmeans') + parser.add_argument( + '--oracle_n_spkrs', + default=True, + type=strtobool, + help='Oracle num of speakers') + parser.add_argument( + '--mic_type', + default="Mix-Headset", + help='Type of microphone to be used') + parser.add_argument( + '--affinity', default="cos", help='affinity matrix, cos or nn') + parser.add_argument( + '--max_subseg_dur', + default=3.0, + type=float, + help='Duration in seconds of a subsegments to be prepared from larger segments' + ) + parser.add_argument( + '--overlap', + default=1.5, + type=float, + help='Overlap duration in seconds between adjacent subsegments') + + args = parser.parse_args() + + pval = 0.3 + rec_id = "utt0001" + n_neighbors = 10 + out_rttm_file = "./out.rttm" + + embeddings = np.empty(shape=[0, 32], dtype=np.float64) + segset = [] + + for i in range(10): + seg = [rec_id + "_" + str(i) + "_" + str(i + 1)] + segset = segset + seg + emb = np.random.rand(1, 32) + embeddings = np.concatenate((embeddings, emb), axis=0) + + segset = np.array(segset, dtype="|O") + stat_obj = EmbeddingMeta(segset, embeddings) + if args.oracle_n_spkrs is True: + num_spkrs = 2 + + if args.backend == "SC": + print("begin SC ") + do_spec_clustering( + stat_obj, + out_rttm_file, + rec_id, + num_spkrs, + pval, + args.affinity, + n_neighbors, ) + if args.backend == "AHC": + print("begin AHC ") + do_AHC(stat_obj, out_rttm_file, rec_id, num_spkrs, pval) diff --git a/ernie-sat/paddlespeech/vector/exps/__init__.py b/ernie-sat/paddlespeech/vector/exps/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..abf198b97e6e818e1fbe59006f98492640bcee54 --- /dev/null +++ 
b/ernie-sat/paddlespeech/vector/exps/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/ernie-sat/paddlespeech/vector/exps/ecapa_tdnn/extract_emb.py b/ernie-sat/paddlespeech/vector/exps/ecapa_tdnn/extract_emb.py new file mode 100644 index 0000000000000000000000000000000000000000..686de9363e82f121e59348441c09bb150984d218 --- /dev/null +++ b/ernie-sat/paddlespeech/vector/exps/ecapa_tdnn/extract_emb.py @@ -0,0 +1,119 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse +import os +import time + +import paddle +from yacs.config import CfgNode + +from paddleaudio.backends import load as load_audio +from paddleaudio.compliance.librosa import melspectrogram +from paddlespeech.s2t.utils.log import Log +from paddlespeech.vector.io.batch import feature_normalize +from paddlespeech.vector.models.ecapa_tdnn import EcapaTdnn +from paddlespeech.vector.modules.sid_model import SpeakerIdetification +from paddlespeech.vector.training.seeding import seed_everything + +logger = Log(__name__).getlog() + + +def extract_audio_embedding(args, config): + # stage 0: set the training device, cpu or gpu + paddle.set_device(args.device) + # set the random seed, it is a must for multiprocess training + seed_everything(config.seed) + + # stage 1: build the dnn backbone model network + ecapa_tdnn = EcapaTdnn(**config.model) + + # stage4: build the speaker verification train instance with backbone model + model = SpeakerIdetification( + backbone=ecapa_tdnn, num_class=config.num_speakers) + # stage 2: load the pre-trained model + args.load_checkpoint = os.path.abspath( + os.path.expanduser(args.load_checkpoint)) + + # load model checkpoint to sid model + state_dict = paddle.load( + os.path.join(args.load_checkpoint, 'model.pdparams')) + model.set_state_dict(state_dict) + logger.info(f'Checkpoint loaded from {args.load_checkpoint}') + + # stage 3: we must set the model to eval mode + model.eval() + + # stage 4: read the audio data and extract the embedding + # wavform is one dimension numpy array + waveform, sr = load_audio(args.audio_path) + + # feat type is numpy array, whose shape is [dim, time] + # we need convert the audio feat to one-batch shape [batch, dim, time], where the batch is one + # so the final shape is [1, dim, time] + start_time = time.time() + feat = melspectrogram( + x=waveform, + sr=config.sr, + n_mels=config.n_mels, + 
window_size=config.window_size, + hop_length=config.hop_size) + feat = paddle.to_tensor(feat).unsqueeze(0) + + # in inference period, the lengths is all one without padding + lengths = paddle.ones([1]) + feat = feature_normalize(feat, mean_norm=True, std_norm=False) + + # model backbone network forward the feats and get the embedding + embedding = model.backbone( + feat, lengths).squeeze().numpy() # (1, emb_size, 1) -> (emb_size) + elapsed_time = time.time() - start_time + audio_length = waveform.shape[0] / sr + + # stage 5: do global norm with external mean and std + rtf = elapsed_time / audio_length + logger.info(f"{args.device} rft={rtf}") + + return embedding + + +if __name__ == "__main__": + # yapf: disable + parser = argparse.ArgumentParser(__doc__) + parser.add_argument('--device', + choices=['cpu', 'gpu'], + default="cpu", + help="Select which device to train model, defaults to gpu.") + parser.add_argument("--config", + default=None, + type=str, + help="configuration file") + parser.add_argument("--load-checkpoint", + type=str, + default='', + help="Directory to load model checkpoint to contiune trainning.") + parser.add_argument("--audio-path", + default="./data/demo.wav", + type=str, + help="Single audio file path") + args = parser.parse_args() + # yapf: enable + # https://yaml.org/type/float.html + config = CfgNode(new_allowed=True) + if args.config: + config.merge_from_file(args.config) + + config.freeze() + print(config) + + extract_audio_embedding(args, config) diff --git a/ernie-sat/paddlespeech/vector/exps/ecapa_tdnn/test.py b/ernie-sat/paddlespeech/vector/exps/ecapa_tdnn/test.py new file mode 100644 index 0000000000000000000000000000000000000000..d0de6dc51a8829c8bc11cb2ae943a3f164f1fd1d --- /dev/null +++ b/ernie-sat/paddlespeech/vector/exps/ecapa_tdnn/test.py @@ -0,0 +1,203 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
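Embeddings produced by extract_emb.py above are compared with cosine similarity, which is also how the verification script below scores its enroll/test trials. The following is a minimal sketch of that scoring step; the vectors and the embedding size (192) are arbitrary stand-ins, not values taken from this patch.

```python
# Minimal sketch of scoring two speaker embeddings with cosine similarity,
# as done for the enroll/test trials in test.py below. The vectors here are
# random stand-ins for embeddings returned by model.backbone(...).
import numpy as np
import paddle

emb_enroll = paddle.to_tensor(np.random.rand(1, 192).astype("float32"))
emb_test = paddle.to_tensor(np.random.rand(1, 192).astype("float32"))

cos_sim = paddle.nn.CosineSimilarity(axis=1)
score = cos_sim(emb_enroll, emb_test)  # shape [1], value in [-1, 1]
print(float(score))
```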
+import argparse +import os + +import numpy as np +import paddle +from paddle.io import BatchSampler +from paddle.io import DataLoader +from tqdm import tqdm +from yacs.config import CfgNode + +from paddleaudio.datasets import VoxCeleb +from paddleaudio.metric import compute_eer +from paddlespeech.s2t.utils.log import Log +from paddlespeech.vector.io.batch import batch_feature_normalize +from paddlespeech.vector.models.ecapa_tdnn import EcapaTdnn +from paddlespeech.vector.modules.sid_model import SpeakerIdetification +from paddlespeech.vector.training.seeding import seed_everything + +logger = Log(__name__).getlog() + + +def main(args, config): + # stage0: set the training device, cpu or gpu + paddle.set_device(args.device) + # set the random seed, it is a must for multiprocess training + seed_everything(config.seed) + + # stage1: build the dnn backbone model network + ecapa_tdnn = EcapaTdnn(**config.model) + + # stage2: build the speaker verification eval instance with backbone model + model = SpeakerIdetification( + backbone=ecapa_tdnn, num_class=config.num_speakers) + + # stage3: load the pre-trained model + # we get the last model from the epoch and save_interval + args.load_checkpoint = os.path.abspath( + os.path.expanduser(args.load_checkpoint)) + + # load model checkpoint to sid model + state_dict = paddle.load( + os.path.join(args.load_checkpoint, 'model.pdparams')) + model.set_state_dict(state_dict) + logger.info(f'Checkpoint loaded from {args.load_checkpoint}') + + # stage4: construct the enroll and test dataloader + + enroll_dataset = VoxCeleb( + subset='enroll', + target_dir=args.data_dir, + feat_type='melspectrogram', + random_chunk=False, + n_mels=config.n_mels, + window_size=config.window_size, + hop_length=config.hop_size) + enroll_sampler = BatchSampler( + enroll_dataset, batch_size=config.batch_size, + shuffle=True) # Shuffle to make embedding normalization more robust. + enrol_loader = DataLoader(enroll_dataset, + batch_sampler=enroll_sampler, + collate_fn=lambda x: batch_feature_normalize( + x, mean_norm=True, std_norm=False), + num_workers=config.num_workers, + return_list=True,) + test_dataset = VoxCeleb( + subset='test', + target_dir=args.data_dir, + feat_type='melspectrogram', + random_chunk=False, + n_mels=config.n_mels, + window_size=config.window_size, + hop_length=config.hop_size) + + test_sampler = BatchSampler( + test_dataset, batch_size=config.batch_size, shuffle=True) + test_loader = DataLoader(test_dataset, + batch_sampler=test_sampler, + collate_fn=lambda x: batch_feature_normalize( + x, mean_norm=True, std_norm=False), + num_workers=config.num_workers, + return_list=True,) + # stage5: we must set the model to eval mode + model.eval() + + # stage6: global embedding norm to imporve the performance + logger.info(f"global embedding norm: {config.global_embedding_norm}") + if config.global_embedding_norm: + global_embedding_mean = None + global_embedding_std = None + mean_norm_flag = config.embedding_mean_norm + std_norm_flag = config.embedding_std_norm + batch_count = 0 + + # stage7: Compute embeddings of audios in enrol and test dataset from model. + id2embedding = {} + # Run multi times to make embedding normalization more stable. 
+ for i in range(2): + for dl in [enrol_loader, test_loader]: + logger.info( + f'Loop {[i+1]}: Computing embeddings on {dl.dataset.subset} dataset' + ) + with paddle.no_grad(): + for batch_idx, batch in enumerate(tqdm(dl)): + + # stage 8-1: extrac the audio embedding + ids, feats, lengths = batch['ids'], batch['feats'], batch[ + 'lengths'] + embeddings = model.backbone(feats, lengths).squeeze( + -1).numpy() # (N, emb_size, 1) -> (N, emb_size) + + # Global embedding normalization. + # if we use the global embedding norm + # eer can reduece about relative 10% + if config.global_embedding_norm: + batch_count += 1 + current_mean = embeddings.mean( + axis=0) if mean_norm_flag else 0 + current_std = embeddings.std( + axis=0) if std_norm_flag else 1 + # Update global mean and std. + if global_embedding_mean is None and global_embedding_std is None: + global_embedding_mean, global_embedding_std = current_mean, current_std + else: + weight = 1 / batch_count # Weight decay by batches. + global_embedding_mean = ( + 1 - weight + ) * global_embedding_mean + weight * current_mean + global_embedding_std = ( + 1 - weight + ) * global_embedding_std + weight * current_std + # Apply global embedding normalization. + embeddings = (embeddings - global_embedding_mean + ) / global_embedding_std + + # Update embedding dict. + id2embedding.update(dict(zip(ids, embeddings))) + + # stage 8: Compute cosine scores. + labels = [] + enroll_ids = [] + test_ids = [] + logger.info(f"read the trial from {VoxCeleb.veri_test_file}") + with open(VoxCeleb.veri_test_file, 'r') as f: + for line in f.readlines(): + label, enroll_id, test_id = line.strip().split(' ') + labels.append(int(label)) + enroll_ids.append(enroll_id.split('.')[0].replace('/', '-')) + test_ids.append(test_id.split('.')[0].replace('/', '-')) + + cos_sim_func = paddle.nn.CosineSimilarity(axis=1) + enrol_embeddings, test_embeddings = map(lambda ids: paddle.to_tensor( + np.asarray([id2embedding[uttid] for uttid in ids], dtype='float32')), + [enroll_ids, test_ids + ]) # (N, emb_size) + scores = cos_sim_func(enrol_embeddings, test_embeddings) + EER, threshold = compute_eer(np.asarray(labels), scores.numpy()) + logger.info( + f'EER of verification test: {EER*100:.4f}%, score threshold: {threshold:.5f}' + ) + + +if __name__ == "__main__": + # yapf: disable + parser = argparse.ArgumentParser(__doc__) + parser.add_argument('--device', + choices=['cpu', 'gpu'], + default="gpu", + help="Select which device to train model, defaults to gpu.") + parser.add_argument("--config", + default=None, + type=str, + help="configuration file") + parser.add_argument("--data-dir", + default="./data/", + type=str, + help="data directory") + parser.add_argument("--load-checkpoint", + type=str, + default='', + help="Directory to load model checkpoint to contiune trainning.") + args = parser.parse_args() + # yapf: enable + # https://yaml.org/type/float.html + config = CfgNode(new_allowed=True) + if args.config: + config.merge_from_file(args.config) + + config.freeze() + print(config) + main(args, config) diff --git a/ernie-sat/paddlespeech/vector/exps/ecapa_tdnn/train.py b/ernie-sat/paddlespeech/vector/exps/ecapa_tdnn/train.py new file mode 100644 index 0000000000000000000000000000000000000000..257b97abed7f8ca2edebf2d85e75f26370851116 --- /dev/null +++ b/ernie-sat/paddlespeech/vector/exps/ecapa_tdnn/train.py @@ -0,0 +1,351 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse +import os +import time + +import numpy as np +import paddle +from paddle.io import BatchSampler +from paddle.io import DataLoader +from paddle.io import DistributedBatchSampler +from yacs.config import CfgNode + +from paddleaudio.compliance.librosa import melspectrogram +from paddleaudio.datasets.voxceleb import VoxCeleb +from paddlespeech.s2t.utils.log import Log +from paddlespeech.vector.io.augment import build_augment_pipeline +from paddlespeech.vector.io.augment import waveform_augment +from paddlespeech.vector.io.batch import batch_pad_right +from paddlespeech.vector.io.batch import feature_normalize +from paddlespeech.vector.io.batch import waveform_collate_fn +from paddlespeech.vector.models.ecapa_tdnn import EcapaTdnn +from paddlespeech.vector.modules.loss import AdditiveAngularMargin +from paddlespeech.vector.modules.loss import LogSoftmaxWrapper +from paddlespeech.vector.modules.sid_model import SpeakerIdetification +from paddlespeech.vector.training.scheduler import CyclicLRScheduler +from paddlespeech.vector.training.seeding import seed_everything +from paddlespeech.vector.utils.time import Timer + +logger = Log(__name__).getlog() + + +def main(args, config): + # stage0: set the training device, cpu or gpu + paddle.set_device(args.device) + + # stage1: we must call the paddle.distributed.init_parallel_env() api at the begining + paddle.distributed.init_parallel_env() + nranks = paddle.distributed.get_world_size() + local_rank = paddle.distributed.get_rank() + # set the random seed, it is a must for multiprocess training + seed_everything(config.seed) + + # stage2: data prepare, such vox1 and vox2 data, and augment noise data and pipline + # note: some cmd must do in rank==0, so wo will refactor the data prepare code + train_dataset = VoxCeleb('train', target_dir=args.data_dir) + dev_dataset = VoxCeleb('dev', target_dir=args.data_dir) + + if config.augment: + augment_pipeline = build_augment_pipeline(target_dir=args.data_dir) + else: + augment_pipeline = [] + + # stage3: build the dnn backbone model network + ecapa_tdnn = EcapaTdnn(**config.model) + + # stage4: build the speaker verification train instance with backbone model + model = SpeakerIdetification( + backbone=ecapa_tdnn, num_class=VoxCeleb.num_speakers) + + # stage5: build the optimizer, we now only construct the AdamW optimizer + # 140000 is single gpu steps + # so, in multi-gpu mode, wo reduce the step_size to 140000//nranks to enable CyclicLRScheduler + lr_schedule = CyclicLRScheduler( + base_lr=config.learning_rate, max_lr=1e-3, step_size=140000 // nranks) + optimizer = paddle.optimizer.AdamW( + learning_rate=lr_schedule, parameters=model.parameters()) + + # stage6: build the loss function, we now only support LogSoftmaxWrapper + criterion = LogSoftmaxWrapper( + loss_fn=AdditiveAngularMargin(margin=0.2, scale=30)) + + # stage7: confirm training start epoch + # if pre-trained model exists, start epoch confirmed by the pre-trained model + start_epoch = 0 + 
if args.load_checkpoint: + logger.info("load the check point") + args.load_checkpoint = os.path.abspath( + os.path.expanduser(args.load_checkpoint)) + try: + # load model checkpoint + state_dict = paddle.load( + os.path.join(args.load_checkpoint, 'model.pdparams')) + model.set_state_dict(state_dict) + + # load optimizer checkpoint + state_dict = paddle.load( + os.path.join(args.load_checkpoint, 'model.pdopt')) + optimizer.set_state_dict(state_dict) + if local_rank == 0: + logger.info(f'Checkpoint loaded from {args.load_checkpoint}') + except FileExistsError: + if local_rank == 0: + logger.info('Train from scratch.') + + try: + start_epoch = int(args.load_checkpoint[-1]) + logger.info(f'Restore training from epoch {start_epoch}.') + except ValueError: + pass + + # stage8: we build the batch sampler for paddle.DataLoader + train_sampler = DistributedBatchSampler( + train_dataset, + batch_size=config.batch_size, + shuffle=True, + drop_last=False) + train_loader = DataLoader( + train_dataset, + batch_sampler=train_sampler, + num_workers=config.num_workers, + collate_fn=waveform_collate_fn, + return_list=True, + use_buffer_reader=True, ) + + # stage9: start to train + # we will comment the training process + steps_per_epoch = len(train_sampler) + timer = Timer(steps_per_epoch * config.epochs) + last_saved_epoch = "" + timer.start() + + for epoch in range(start_epoch + 1, config.epochs + 1): + # at the begining, model must set to train mode + model.train() + + avg_loss = 0 + num_corrects = 0 + num_samples = 0 + train_reader_cost = 0.0 + train_feat_cost = 0.0 + train_run_cost = 0.0 + + reader_start = time.time() + for batch_idx, batch in enumerate(train_loader): + train_reader_cost += time.time() - reader_start + + # stage 9-1: batch data is audio sample points and speaker id label + feat_start = time.time() + waveforms, labels = batch['waveforms'], batch['labels'] + waveforms, lengths = batch_pad_right(waveforms.numpy()) + waveforms = paddle.to_tensor(waveforms) + + # stage 9-2: audio sample augment method, which is done on the audio sample point + # the original wavefrom and the augmented waveform is concatented in a batch + # eg. 
five augment method in the augment pipeline + # the final data nums is batch_size * [five + one] + # -> five augmented waveform batch plus one original batch waveform + if len(augment_pipeline) != 0: + waveforms = waveform_augment(waveforms, augment_pipeline) + labels = paddle.concat( + [labels for i in range(len(augment_pipeline) + 1)]) + + # stage 9-3: extract the audio feats,such fbank, mfcc, spectrogram + feats = [] + for waveform in waveforms.numpy(): + feat = melspectrogram( + x=waveform, + sr=config.sr, + n_mels=config.n_mels, + window_size=config.window_size, + hop_length=config.hop_size) + feats.append(feat) + feats = paddle.to_tensor(np.asarray(feats)) + + # stage 9-4: feature normalize, which help converge and imporve the performance + feats = feature_normalize( + feats, mean_norm=True, std_norm=False) # Features normalization + train_feat_cost += time.time() - feat_start + + # stage 9-5: model forward, such ecapa-tdnn, x-vector + train_start = time.time() + logits = model(feats) + + # stage 9-6: loss function criterion, such AngularMargin, AdditiveAngularMargin + loss = criterion(logits, labels) + + # stage 9-7: update the gradient and clear the gradient cache + loss.backward() + optimizer.step() + if isinstance(optimizer._learning_rate, + paddle.optimizer.lr.LRScheduler): + optimizer._learning_rate.step() + optimizer.clear_grad() + train_run_cost += time.time() - train_start + + # stage 9-8: Calculate average loss per batch + avg_loss += loss.numpy()[0] + + # stage 9-9: Calculate metrics, which is one-best accuracy + preds = paddle.argmax(logits, axis=1) + num_corrects += (preds == labels).numpy().sum() + num_samples += feats.shape[0] + timer.count() # step plus one in timer + + # stage 9-10: print the log information only on 0-rank per log-freq batchs + if (batch_idx + 1) % config.log_interval == 0 and local_rank == 0: + lr = optimizer.get_lr() + avg_loss /= config.log_interval + avg_acc = num_corrects / num_samples + + print_msg = 'Train Epoch={}/{}, Step={}/{}'.format( + epoch, config.epochs, batch_idx + 1, steps_per_epoch) + print_msg += ' loss={:.4f}'.format(avg_loss) + print_msg += ' acc={:.4f}'.format(avg_acc) + print_msg += ' avg_reader_cost: {:.5f} sec,'.format( + train_reader_cost / config.log_interval) + print_msg += ' avg_feat_cost: {:.5f} sec,'.format( + train_feat_cost / config.log_interval) + print_msg += ' avg_train_cost: {:.5f} sec,'.format( + train_run_cost / config.log_interval) + print_msg += ' lr={:.4E} step/sec={:.2f} | ETA {}'.format( + lr, timer.timing, timer.eta) + logger.info(print_msg) + + avg_loss = 0 + num_corrects = 0 + num_samples = 0 + train_reader_cost = 0.0 + train_feat_cost = 0.0 + train_run_cost = 0.0 + + reader_start = time.time() + + # stage 9-11: save the model parameters only on 0-rank per save-freq batchs + if epoch % config.save_interval == 0 and batch_idx + 1 == steps_per_epoch: + if local_rank != 0: + paddle.distributed.barrier( + ) # Wait for valid step in main process + continue # Resume trainning on other process + + # stage 9-12: construct the valid dataset dataloader + dev_sampler = BatchSampler( + dev_dataset, + batch_size=config.batch_size, + shuffle=False, + drop_last=False) + dev_loader = DataLoader( + dev_dataset, + batch_sampler=dev_sampler, + collate_fn=waveform_collate_fn, + num_workers=config.num_workers, + return_list=True, ) + + # set the model to eval mode + model.eval() + num_corrects = 0 + num_samples = 0 + + # stage 9-13: evaluation the valid dataset batch data + logger.info('Evaluate on validation dataset') + 
with paddle.no_grad(): + for batch_idx, batch in enumerate(dev_loader): + waveforms, labels = batch['waveforms'], batch['labels'] + + feats = [] + for waveform in waveforms.numpy(): + feat = melspectrogram( + x=waveform, + sr=config.sr, + n_mels=config.n_mels, + window_size=config.window_size, + hop_length=config.hop_size) + feats.append(feat) + + feats = paddle.to_tensor(np.asarray(feats)) + feats = feature_normalize( + feats, mean_norm=True, std_norm=False) + logits = model(feats) + + preds = paddle.argmax(logits, axis=1) + num_corrects += (preds == labels).numpy().sum() + num_samples += feats.shape[0] + + print_msg = '[Evaluation result]' + print_msg += ' dev_acc={:.4f}'.format(num_corrects / num_samples) + logger.info(print_msg) + + # stage 9-14: Save model parameters + save_dir = os.path.join(args.checkpoint_dir, + 'epoch_{}'.format(epoch)) + last_saved_epoch = os.path.join('epoch_{}'.format(epoch), + "model.pdparams") + logger.info('Saving model checkpoint to {}'.format(save_dir)) + paddle.save(model.state_dict(), + os.path.join(save_dir, 'model.pdparams')) + paddle.save(optimizer.state_dict(), + os.path.join(save_dir, 'model.pdopt')) + + if nranks > 1: + paddle.distributed.barrier() # Main process + + # stage 10: create the final trained model.pdparams with soft link + if local_rank == 0: + final_model = os.path.join(args.checkpoint_dir, "model.pdparams") + logger.info(f"we will create the final model: {final_model}") + if os.path.islink(final_model): + logger.info( + f"An {final_model} already exists, we will rm is and create it again" + ) + os.unlink(final_model) + os.symlink(last_saved_epoch, final_model) + + +if __name__ == "__main__": + # yapf: disable + parser = argparse.ArgumentParser(__doc__) + parser.add_argument('--device', + choices=['cpu', 'gpu'], + default="cpu", + help="Select which device to train model, defaults to gpu.") + parser.add_argument("--config", + default=None, + type=str, + help="configuration file") + parser.add_argument("--data-dir", + default="./data/", + type=str, + help="data directory") + parser.add_argument("--load-checkpoint", + type=str, + default=None, + help="Directory to load model checkpoint to contiune trainning.") + parser.add_argument("--checkpoint-dir", + type=str, + default='./checkpoint', + help="Directory to save model checkpoints.") + + args = parser.parse_args() + # yapf: enable + + # https://yaml.org/type/float.html + config = CfgNode(new_allowed=True) + if args.config: + config.merge_from_file(args.config) + + config.freeze() + print(config) + + main(args, config) diff --git a/ernie-sat/paddlespeech/vector/exps/ge2e/__init__.py b/ernie-sat/paddlespeech/vector/exps/ge2e/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..abf198b97e6e818e1fbe59006f98492640bcee54 --- /dev/null +++ b/ernie-sat/paddlespeech/vector/exps/ge2e/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
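One detail worth spelling out from the training loop in train.py above (stage 9-2): when the waveform augmentation pipeline is enabled, the original batch and its augmented copies are concatenated along the batch dimension, so the label tensor has to be tiled the same number of times. The sketch below restates that bookkeeping with made-up sizes; it is only an illustration of the layout, not code from the patch.

```python
# Sketch of the batch layout after waveform augmentation (stage 9-2 above).
# With N augmentation methods, the original batch plus N augmented copies are
# concatenated, so labels are tiled N + 1 times. Sizes here are made up.
import paddle

batch_size, num_augments = 4, 5
labels = paddle.arange(batch_size)                   # [B]
labels_tiled = paddle.concat(
    [labels for _ in range(num_augments + 1)])       # [(N + 1) * B]
assert labels_tiled.shape[0] == batch_size * (num_augments + 1)
```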
diff --git a/ernie-sat/paddlespeech/vector/exps/ge2e/audio_processor.py b/ernie-sat/paddlespeech/vector/exps/ge2e/audio_processor.py new file mode 100644 index 0000000000000000000000000000000000000000..1ab0419e118e20cab11e52be12983f7cc951514a --- /dev/null +++ b/ernie-sat/paddlespeech/vector/exps/ge2e/audio_processor.py @@ -0,0 +1,246 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import struct +from pathlib import Path +from warnings import warn + +import librosa +import numpy as np +from scipy.ndimage.morphology import binary_dilation + +try: + import webrtcvad +except ModuleNotFoundError: + warn("Unable to import 'webrtcvad'." + "This package enables noise removal and is recommended.") + webrtcvad = None + +INT16_MAX = (2**15) - 1 + + +def normalize_volume(wav, target_dBFS, increase_only=False, + decrease_only=False): + # this function implements Loudness normalization, instead of peak + # normalization, See https://en.wikipedia.org/wiki/Audio_normalization + # dBFS: Decibels relative to full scale + # See https://en.wikipedia.org/wiki/DBFS for more details + # for 16Bit PCM audio, minimal level is -96dB + # compute the mean dBFS and adjust to target dBFS, with by increasing + # or decreasing + if increase_only and decrease_only: + raise ValueError("Both increase only and decrease only are set") + dBFS_change = target_dBFS - 10 * np.log10(np.mean(wav**2)) + if dBFS_change < 0 and increase_only: + return wav + if dBFS_change > 0 and decrease_only: + return wav + gain = 10**(dBFS_change / 20) + return wav * gain + + +def trim_long_silences(wav, + vad_window_length: int, + vad_moving_average_width: int, + vad_max_silence_length: int, + sampling_rate: int): + """ + Ensures that segments without voice in the waveform remain no longer than a + threshold determined by the VAD parameters in params.py. 
+ Parameters + ---------- + wav : np.array + the raw waveform as a numpy array of floats + Returns + ---------- + np.array + the same waveform with silences trimmed away (length <= original wav length) + """ + # Compute the voice detection window size + samples_per_window = (vad_window_length * sampling_rate) // 1000 + + # Trim the end of the audio to have a multiple of the window size + wav = wav[:len(wav) - (len(wav) % samples_per_window)] + + # Convert the float waveform to 16-bit mono PCM + pcm_wave = struct.pack("%dh" % len(wav), + *(np.round(wav * INT16_MAX)).astype(np.int16)) + + # Perform voice activation detection + voice_flags = [] + vad = webrtcvad.Vad(mode=3) + for window_start in range(0, len(wav), samples_per_window): + window_end = window_start + samples_per_window + voice_flags.append( + vad.is_speech( + pcm_wave[window_start * 2:window_end * 2], + sample_rate=sampling_rate)) + voice_flags = np.array(voice_flags) + + # Smooth the voice detection with a moving average + def moving_average(array, width): + array_padded = np.concatenate((np.zeros((width - 1) // 2), array, + np.zeros(width // 2))) + ret = np.cumsum(array_padded, dtype=float) + ret[width:] = ret[width:] - ret[:-width] + return ret[width - 1:] / width + + audio_mask = moving_average(voice_flags, vad_moving_average_width) + audio_mask = np.round(audio_mask).astype(bool) + + # Dilate the voiced regions + audio_mask = binary_dilation(audio_mask, + np.ones(vad_max_silence_length + 1)) + audio_mask = np.repeat(audio_mask, samples_per_window) + + return wav[audio_mask] + + +def compute_partial_slices(n_samples: int, + partial_utterance_n_frames: int, + hop_length: int, + min_pad_coverage: float=0.75, + overlap: float=0.5): + """ + Computes where to split an utterance waveform and its corresponding mel spectrogram to obtain + partial utterances of each. Both the waveform and the mel + spectrogram slices are returned, so as to make each partial utterance waveform correspond to + its spectrogram. This function assumes that the mel spectrogram parameters used are those + defined in params_data.py. + + The returned ranges may be indexing further than the length of the waveform. It is + recommended that you pad the waveform with zeros up to wave_slices[-1].stop. + Parameters + ---------- + n_samples : int + the number of samples in the waveform. + partial_utterance_n_frames : int + the number of mel spectrogram frames in each partial utterance. + + min_pad_coverage : int + when reaching the last partial utterance, it may or may not have enough frames. + If at least of are present, + then the last partial utterance will be considered, as if we padded the audio. Otherwise, + it will be discarded, as if we trimmed the audio. If there aren't enough frames for 1 partial + utterance, this parameter is ignored so that the function always returns at least 1 slice. + overlap : float + by how much the partial utterance should overlap. If set to 0, the partial utterances are entirely disjoint. + Returns + ---------- + the waveform slices and mel spectrogram slices as lists of array slices. + Index respectively the waveform and the mel spectrogram with these slices to obtain the partialutterances. 
+ """ + assert 0 <= overlap < 1 + assert 0 < min_pad_coverage <= 1 + + # librosa's function to compute num_frames from num_samples + n_frames = int(np.ceil((n_samples + 1) / hop_length)) + # frame shift between ajacent partials + frame_step = max(1, + int(np.round(partial_utterance_n_frames * (1 - overlap)))) + + # Compute the slices + wav_slices, mel_slices = [], [] + steps = max(1, n_frames - partial_utterance_n_frames + frame_step + 1) + for i in range(0, steps, frame_step): + mel_range = np.array([i, i + partial_utterance_n_frames]) + wav_range = mel_range * hop_length + mel_slices.append(slice(*mel_range)) + wav_slices.append(slice(*wav_range)) + + # Evaluate whether extra padding is warranted or not + last_wav_range = wav_slices[-1] + coverage = (n_samples - last_wav_range.start) / ( + last_wav_range.stop - last_wav_range.start) + if coverage < min_pad_coverage and len(mel_slices) > 1: + mel_slices = mel_slices[:-1] + wav_slices = wav_slices[:-1] + + return wav_slices, mel_slices + + +class SpeakerVerificationPreprocessor(object): + def __init__(self, + sampling_rate: int, + audio_norm_target_dBFS: float, + vad_window_length, + vad_moving_average_width, + vad_max_silence_length, + mel_window_length, + mel_window_step, + n_mels, + partial_n_frames: int, + min_pad_coverage: float=0.75, + partial_overlap_ratio: float=0.5): + self.sampling_rate = sampling_rate + self.audio_norm_target_dBFS = audio_norm_target_dBFS + + self.vad_window_length = vad_window_length + self.vad_moving_average_width = vad_moving_average_width + self.vad_max_silence_length = vad_max_silence_length + + self.n_fft = int(mel_window_length * sampling_rate / 1000) + self.hop_length = int(mel_window_step * sampling_rate / 1000) + self.n_mels = n_mels + + self.partial_n_frames = partial_n_frames + self.min_pad_coverage = min_pad_coverage + self.partial_overlap_ratio = partial_overlap_ratio + + def preprocess_wav(self, fpath_or_wav, source_sr=None): + # Load the wav from disk if needed + if isinstance(fpath_or_wav, (str, Path)): + wav, source_sr = librosa.load(str(fpath_or_wav), sr=None) + else: + wav = fpath_or_wav + + # Resample if numpy.array is passed and sr does not match + if source_sr is not None and source_sr != self.sampling_rate: + wav = librosa.resample( + wav, orig_sr=source_sr, target_sr=self.sampling_rate) + + # loudness normalization + wav = normalize_volume( + wav, self.audio_norm_target_dBFS, increase_only=True) + + # trim long silence + if webrtcvad: + wav = trim_long_silences( + wav, self.vad_window_length, self.vad_moving_average_width, + self.vad_max_silence_length, self.sampling_rate) + return wav + + def melspectrogram(self, wav): + mel = librosa.feature.melspectrogram( + y=wav, + sr=self.sampling_rate, + n_fft=self.n_fft, + hop_length=self.hop_length, + n_mels=self.n_mels) + mel = mel.astype(np.float32).T + return mel + + def extract_mel_partials(self, wav): + wav_slices, mel_slices = compute_partial_slices( + len(wav), self.partial_n_frames, self.hop_length, + self.min_pad_coverage, self.partial_overlap_ratio) + + # pad audio if needed + max_wave_length = wav_slices[-1].stop + if max_wave_length >= len(wav): + wav = np.pad(wav, (0, max_wave_length - len(wav)), "constant") + + # Split the utterance into partials + frames = self.melspectrogram(wav) + frames_batch = np.array([frames[s] for s in mel_slices]) + return frames_batch # [B, T, C] diff --git a/ernie-sat/paddlespeech/vector/exps/ge2e/config.py b/ernie-sat/paddlespeech/vector/exps/ge2e/config.py new file mode 100644 index 
0000000000000000000000000000000000000000..3e114291647e5c7869c3f50c556cbae3c382bd92 --- /dev/null +++ b/ernie-sat/paddlespeech/vector/exps/ge2e/config.py @@ -0,0 +1,61 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from yacs.config import CfgNode + +_C = CfgNode() + +data_config = _C.data = CfgNode() + +## Audio volume normalization +data_config.audio_norm_target_dBFS = -30 + +## Audio sample rate +data_config.sampling_rate = 16000 # Hz + +## Voice Activation Detection +# Window size of the VAD. Must be either 10, 20 or 30 milliseconds. +# This sets the granularity of the VAD. Should not need to be changed. +data_config.vad_window_length = 30 # In milliseconds +# Number of frames to average together when performing the moving average smoothing. +# The larger this value, the larger the VAD variations must be to not get smoothed out. +data_config.vad_moving_average_width = 8 +# Maximum number of consecutive silent frames a segment can have. +data_config.vad_max_silence_length = 6 + +## Mel-filterbank +data_config.mel_window_length = 25 # In milliseconds +data_config.mel_window_step = 10 # In milliseconds +data_config.n_mels = 40 # mel bands + +# Number of spectrogram frames in a partial utterance +data_config.partial_n_frames = 160 # 1600 ms +data_config.min_pad_coverage = 0.75 # at least 75% of the audio is valid in a partial +data_config.partial_overlap_ratio = 0.5 # overlap ratio between ajancent partials + +model_config = _C.model = CfgNode() +model_config.num_layers = 3 +model_config.hidden_size = 256 +model_config.embedding_size = 256 # output size + +training_config = _C.training = CfgNode() +training_config.learning_rate_init = 1e-4 +training_config.speakers_per_batch = 64 +training_config.utterances_per_speaker = 10 +training_config.max_iteration = 1560000 +training_config.save_interval = 10000 +training_config.valid_interval = 10000 + + +def get_cfg_defaults(): + return _C.clone() diff --git a/ernie-sat/paddlespeech/vector/exps/ge2e/dataset_processors.py b/ernie-sat/paddlespeech/vector/exps/ge2e/dataset_processors.py new file mode 100644 index 0000000000000000000000000000000000000000..908c852b2ec8121838f249ed04310f714776cffb --- /dev/null +++ b/ernie-sat/paddlespeech/vector/exps/ge2e/dataset_processors.py @@ -0,0 +1,173 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
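To make the partial-utterance slicing in audio_processor.py above concrete, the sketch below calls compute_partial_slices() with the defaults from config.py (16 kHz audio, 10 ms hop, 160-frame partials). The 3-second duration is an arbitrary example, and the zero waveform stands in for real audio; padding of the last slice follows what SpeakerVerificationPreprocessor.extract_mel_partials() does.

```python
# Sketch of slicing an utterance into fixed-length partials with
# compute_partial_slices() from audio_processor.py above, using the defaults
# in config.py (16 kHz audio, 10 ms hop, 160-frame partials).
import numpy as np

from paddlespeech.vector.exps.ge2e.audio_processor import compute_partial_slices

sampling_rate = 16000
hop_length = int(10 * sampling_rate / 1000)      # 160 samples per frame
n_samples = 3 * sampling_rate                    # hypothetical 3 s utterance
wav_slices, mel_slices = compute_partial_slices(
    n_samples, partial_utterance_n_frames=160, hop_length=hop_length)

# The last wav slice may run past the signal; pad before indexing, as
# SpeakerVerificationPreprocessor.extract_mel_partials() does.
wav = np.zeros(n_samples, dtype=np.float32)
if wav_slices[-1].stop > len(wav):
    wav = np.pad(wav, (0, wav_slices[-1].stop - len(wav)), "constant")
partial_wavs = [wav[s] for s in wav_slices]
```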
+import multiprocessing as mp +from functools import partial +from pathlib import Path +from typing import List + +import numpy as np +from tqdm import tqdm + +from paddlespeech.vector.exps.ge2e.audio_processor import SpeakerVerificationPreprocessor + + +def _process_utterance(path_pair, processor: SpeakerVerificationPreprocessor): + # Load and preprocess the waveform + input_path, output_path = path_pair + wav = processor.preprocess_wav(input_path) + if len(wav) == 0: + return + + # Create the mel spectrogram, discard those that are too short + frames = processor.melspectrogram(wav) + if len(frames) < processor.partial_n_frames: + return + + np.save(output_path, frames) + + +def _process_speaker(speaker_dir: Path, + processor: SpeakerVerificationPreprocessor, + datasets_root: Path, + output_dir: Path, + pattern: str, + skip_existing: bool=False): + # datastes root: a reference path to compute speaker_name + # we prepand dataset name to speaker_id becase we are mixing serveal + # multispeaker datasets together + speaker_name = "_".join(speaker_dir.relative_to(datasets_root).parts) + speaker_output_dir = output_dir / speaker_name + speaker_output_dir.mkdir(parents=True, exist_ok=True) + + # load exsiting file set + sources_fpath = speaker_output_dir / "_sources.txt" + if sources_fpath.exists(): + try: + with sources_fpath.open("rt") as sources_file: + existing_names = {line.split(",")[0] for line in sources_file} + except Exception as e: + existing_names = {} + else: + existing_names = {} + + sources_file = sources_fpath.open("at" if skip_existing else "wt") + for in_fpath in speaker_dir.rglob(pattern): + out_name = "_".join( + in_fpath.relative_to(speaker_dir).with_suffix(".npy").parts) + if skip_existing and out_name in existing_names: + continue + out_fpath = speaker_output_dir / out_name + _process_utterance((in_fpath, out_fpath), processor) + sources_file.write(f"{out_name},{in_fpath}\n") + + sources_file.close() + + +def _process_dataset(processor: SpeakerVerificationPreprocessor, + datasets_root: Path, + speaker_dirs: List[Path], + dataset_name: str, + output_dir: Path, + pattern: str, + skip_existing: bool=False): + print( + f"{dataset_name}: Preprocessing data for {len(speaker_dirs)} speakers.") + + _func = partial( + _process_speaker, + processor=processor, + datasets_root=datasets_root, + output_dir=output_dir, + pattern=pattern, + skip_existing=skip_existing) + + with mp.Pool(16) as pool: + list( + tqdm( + pool.imap(_func, speaker_dirs), + dataset_name, + len(speaker_dirs), + unit="speakers")) + print(f"Done preprocessing {dataset_name}.") + + +def process_librispeech(processor, + datasets_root, + output_dir, + skip_existing=False): + dataset_name = "LibriSpeech/train-other-500" + dataset_root = datasets_root / dataset_name + speaker_dirs = list(dataset_root.glob("*")) + _process_dataset(processor, datasets_root, speaker_dirs, dataset_name, + output_dir, "*.flac", skip_existing) + + +def process_voxceleb1(processor, datasets_root, output_dir, + skip_existing=False): + dataset_name = "VoxCeleb1" + dataset_root = datasets_root / dataset_name + + anglophone_nationalites = ["australia", "canada", "ireland", "uk", "usa"] + with dataset_root.joinpath("vox1_meta.csv").open("rt") as metafile: + metadata = [line.strip().split("\t") for line in metafile][1:] + + # speaker id -> nationality + nationalities = {line[0]: line[3] for line in metadata if line[-1] == "dev"} + keep_speaker_ids = [ + speaker_id for speaker_id, nationality in nationalities.items() + if nationality.lower() in 
anglophone_nationalites + ] + print( + "VoxCeleb1: using samples from {} (presumed anglophone) speakers out of {}." + .format(len(keep_speaker_ids), len(nationalities))) + + speaker_dirs = list((dataset_root / "wav").glob("*")) + speaker_dirs = [ + speaker_dir for speaker_dir in speaker_dirs + if speaker_dir.name in keep_speaker_ids + ] + _process_dataset(processor, datasets_root, speaker_dirs, dataset_name, + output_dir, "*.wav", skip_existing) + + +def process_voxceleb2(processor, datasets_root, output_dir, + skip_existing=False): + dataset_name = "VoxCeleb2" + dataset_root = datasets_root / dataset_name + # There is no nationality in meta data for VoxCeleb2 + speaker_dirs = list((dataset_root / "wav").glob("*")) + _process_dataset(processor, datasets_root, speaker_dirs, dataset_name, + output_dir, "*.wav", skip_existing) + + +def process_aidatatang_200zh(processor, + datasets_root, + output_dir, + skip_existing=False): + dataset_name = "aidatatang_200zh/train" + dataset_root = datasets_root / dataset_name + + speaker_dirs = list((dataset_root).glob("*")) + _process_dataset(processor, datasets_root, speaker_dirs, dataset_name, + output_dir, "*.wav", skip_existing) + + +def process_magicdata(processor, datasets_root, output_dir, + skip_existing=False): + dataset_name = "magicdata/train" + dataset_root = datasets_root / dataset_name + + speaker_dirs = list((dataset_root).glob("*")) + _process_dataset(processor, datasets_root, speaker_dirs, dataset_name, + output_dir, "*.wav", skip_existing) diff --git a/ernie-sat/paddlespeech/vector/exps/ge2e/inference.py b/ernie-sat/paddlespeech/vector/exps/ge2e/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..7660de5e876529448b0e8f0e2a3f6185d15e9322 --- /dev/null +++ b/ernie-sat/paddlespeech/vector/exps/ge2e/inference.py @@ -0,0 +1,140 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
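The dataset processors above write each speaker's mel spectrograms under a folder named after the speaker directory's path relative to datasets_root, which keeps utterances from different corpora from colliding. The sketch below only illustrates that naming rule from _process_speaker(); the paths are placeholders, not defaults of this patch.

```python
# Sketch of the naming scheme used by _process_speaker() above: the speaker
# directory's path relative to datasets_root becomes the output folder name.
from pathlib import Path

datasets_root = Path("/data/datasets")                  # placeholder root
speaker_dir = datasets_root / "LibriSpeech" / "train-other-500" / "19"
speaker_name = "_".join(speaker_dir.relative_to(datasets_root).parts)
print(speaker_name)  # LibriSpeech_train-other-500_19
```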
+import argparse +from pathlib import Path + +import numpy as np +import paddle +import tqdm + +from paddlespeech.vector.exps.ge2e.audio_processor import SpeakerVerificationPreprocessor +from paddlespeech.vector.exps.ge2e.config import get_cfg_defaults +from paddlespeech.vector.models.lstm_speaker_encoder import LSTMSpeakerEncoder + + +def embed_utterance(processor, model, fpath_or_wav): + # audio processor + wav = processor.preprocess_wav(fpath_or_wav) + mel_partials = processor.extract_mel_partials(wav) + + model.eval() + # speaker encoder + with paddle.no_grad(): + mel_partials = paddle.to_tensor(mel_partials) + with paddle.no_grad(): + embed = model.embed_utterance(mel_partials) + embed = embed.numpy() + return embed + + +def _process_utterance(ifpath: Path, + input_dir: Path, + output_dir: Path, + processor: SpeakerVerificationPreprocessor, + model: LSTMSpeakerEncoder): + rel_path = ifpath.relative_to(input_dir) + ofpath = (output_dir / rel_path).with_suffix(".npy") + ofpath.parent.mkdir(parents=True, exist_ok=True) + embed = embed_utterance(processor, model, ifpath) + np.save(ofpath, embed) + + +def main(config, args): + + if args.ngpu == 0: + paddle.set_device("cpu") + elif args.ngpu > 0: + paddle.set_device("gpu") + else: + print("ngpu should >= 0 !") + + # load model + model = LSTMSpeakerEncoder(config.data.n_mels, config.model.num_layers, + config.model.hidden_size, + config.model.embedding_size) + weights_fpath = str(Path(args.checkpoint_path).expanduser()) + model_state_dict = paddle.load(weights_fpath + ".pdparams") + model.set_state_dict(model_state_dict) + model.eval() + print(f"Loaded encoder {weights_fpath}") + + # create audio processor + c = config.data + processor = SpeakerVerificationPreprocessor( + sampling_rate=c.sampling_rate, + audio_norm_target_dBFS=c.audio_norm_target_dBFS, + vad_window_length=c.vad_window_length, + vad_moving_average_width=c.vad_moving_average_width, + vad_max_silence_length=c.vad_max_silence_length, + mel_window_length=c.mel_window_length, + mel_window_step=c.mel_window_step, + n_mels=c.n_mels, + partial_n_frames=c.partial_n_frames, + min_pad_coverage=c.min_pad_coverage, + partial_overlap_ratio=c.min_pad_coverage, ) + + # input output preparation + input_dir = Path(args.input).expanduser() + ifpaths = list(input_dir.rglob(args.pattern)) + print(f"{len(ifpaths)} utterances in total") + output_dir = Path(args.output).expanduser() + output_dir.mkdir(parents=True, exist_ok=True) + + for ifpath in tqdm.tqdm(ifpaths, unit="utterance"): + _process_utterance(ifpath, input_dir, output_dir, processor, model) + + +if __name__ == "__main__": + config = get_cfg_defaults() + parser = argparse.ArgumentParser(description="compute utterance embed.") + parser.add_argument( + "--config", + metavar="FILE", + help="path of the config file to overwrite to default config with.") + parser.add_argument( + "--input", type=str, help="path of the audio_file folder.") + parser.add_argument( + "--pattern", + type=str, + default="*.wav", + help="pattern to filter audio files.") + parser.add_argument( + "--output", + metavar="OUTPUT_DIR", + help="path to save checkpoint and logs.") + + # load from saved checkpoint + parser.add_argument( + "--checkpoint_path", type=str, help="path of the checkpoint to load") + + # overwrite extra config and default config + parser.add_argument( + "--opts", + nargs=argparse.REMAINDER, + help="options to overwrite --config file and the default config, passing in KEY VALUE pairs" + ) + + parser.add_argument( + "--ngpu", type=int, default=1, 
help="if ngpu=0, use cpu.") + + args = parser.parse_args() + if args.config: + config.merge_from_file(args.config) + if args.opts: + config.merge_from_list(args.opts) + config.freeze() + print(config) + print(args) + + main(config, args) diff --git a/ernie-sat/paddlespeech/vector/exps/ge2e/preprocess.py b/ernie-sat/paddlespeech/vector/exps/ge2e/preprocess.py new file mode 100644 index 0000000000000000000000000000000000000000..dabe0ce7694547ed197a4d570bcec0399e9ac54e --- /dev/null +++ b/ernie-sat/paddlespeech/vector/exps/ge2e/preprocess.py @@ -0,0 +1,103 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse +from pathlib import Path + +from paddlespeech.vector.exps.ge2e.audio_processor import SpeakerVerificationPreprocessor +from paddlespeech.vector.exps.ge2e.config import get_cfg_defaults +from paddlespeech.vector.exps.ge2e.dataset_processors import process_aidatatang_200zh +from paddlespeech.vector.exps.ge2e.dataset_processors import process_librispeech +from paddlespeech.vector.exps.ge2e.dataset_processors import process_magicdata +from paddlespeech.vector.exps.ge2e.dataset_processors import process_voxceleb1 +from paddlespeech.vector.exps.ge2e.dataset_processors import process_voxceleb2 + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="preprocess dataset for speaker verification task") + parser.add_argument( + "--datasets_root", + type=Path, + help="Path to the directory containing your LibriSpeech, LibriTTS and VoxCeleb datasets." + ) + parser.add_argument( + "--output_dir", type=Path, help="Path to save processed dataset.") + parser.add_argument( + "--dataset_names", + type=str, + default="librispeech_other,voxceleb1,voxceleb2", + help="comma-separated list of names of the datasets you want to preprocess. only " + "the train set of these datastes will be used. Possible names: librispeech_other, " + "voxceleb1, voxceleb2, aidatatang_200zh, magicdata.") + parser.add_argument( + "--skip_existing", + action="store_true", + help="Whether to skip ouput files with the same name. Useful if this script was interrupted." + ) + parser.add_argument( + "--no_trim", + action="store_true", + help="Preprocess audio without trimming silences (not recommended).") + + args = parser.parse_args() + + if not args.no_trim: + try: + import webrtcvad + print(webrtcvad.__version__) + except Exception as e: + raise ModuleNotFoundError( + "Package 'webrtcvad' not found. This package enables " + "noise removal and is recommended. Please install and " + "try again. 
If installation fails, " + "use --no_trim to disable this error message.") + del args.no_trim + + args.datasets = [item.strip() for item in args.dataset_names.split(",")] + if not hasattr(args, "output_dir"): + args.output_dir = args.dataset_root / "SV2TTS" / "encoder" + + args.output_dir = args.output_dir.expanduser() + args.datasets_root = args.datasets_root.expanduser() + assert args.datasets_root.exists() + args.output_dir.mkdir(exist_ok=True, parents=True) + + config = get_cfg_defaults() + print(args) + + c = config.data + processor = SpeakerVerificationPreprocessor( + sampling_rate=c.sampling_rate, + audio_norm_target_dBFS=c.audio_norm_target_dBFS, + vad_window_length=c.vad_window_length, + vad_moving_average_width=c.vad_moving_average_width, + vad_max_silence_length=c.vad_max_silence_length, + mel_window_length=c.mel_window_length, + mel_window_step=c.mel_window_step, + n_mels=c.n_mels, + partial_n_frames=c.partial_n_frames, + min_pad_coverage=c.min_pad_coverage, + partial_overlap_ratio=c.min_pad_coverage, ) + + preprocess_func = { + "librispeech_other": process_librispeech, + "voxceleb1": process_voxceleb1, + "voxceleb2": process_voxceleb2, + "aidatatang_200zh": process_aidatatang_200zh, + "magicdata": process_magicdata, + } + + for dataset in args.datasets: + print("Preprocessing %s" % dataset) + preprocess_func[dataset](processor, args.datasets_root, args.output_dir, + args.skip_existing) diff --git a/ernie-sat/paddlespeech/vector/exps/ge2e/random_cycle.py b/ernie-sat/paddlespeech/vector/exps/ge2e/random_cycle.py new file mode 100644 index 0000000000000000000000000000000000000000..290fd2fa274b66f7802cb0ab529d04099118f624 --- /dev/null +++ b/ernie-sat/paddlespeech/vector/exps/ge2e/random_cycle.py @@ -0,0 +1,38 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import random + + +def cycle(iterable): + # cycle('ABCD') --> A B C D A B C D A B C D ... + saved = [] + for element in iterable: + yield element + saved.append(element) + while saved: + for element in saved: + yield element + + +def random_cycle(iterable): + # cycle('ABCD') --> A B C D B C D A A D B C ... + saved = [] + for element in iterable: + yield element + saved.append(element) + random.shuffle(saved) + while saved: + for element in saved: + yield element + random.shuffle(saved) diff --git a/ernie-sat/paddlespeech/vector/exps/ge2e/speaker_verification_dataset.py b/ernie-sat/paddlespeech/vector/exps/ge2e/speaker_verification_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..ae6f6ad974a8cad5795bda99b55befd25559f5fa --- /dev/null +++ b/ernie-sat/paddlespeech/vector/exps/ge2e/speaker_verification_dataset.py @@ -0,0 +1,125 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import random +from pathlib import Path + +import numpy as np +from paddle.io import BatchSampler +from paddle.io import Dataset + +from paddlespeech.vector.exps.ge2e.random_cycle import random_cycle + + +class MultiSpeakerMelDataset(Dataset): + """A 2 layer directory thatn contains mel spectrograms in *.npy format. + An Example file structure tree is shown below. We prefer to preprocess + raw datasets and organized them like this. + + dataset_root/ + speaker1/ + utterance1.npy + utterance2.npy + utterance3.npy + speaker2/ + utterance1.npy + utterance2.npy + utterance3.npy + """ + + def __init__(self, dataset_root: Path): + self.root = Path(dataset_root).expanduser() + speaker_dirs = [f for f in self.root.glob("*") if f.is_dir()] + + speaker_utterances = { + speaker_dir: list(speaker_dir.glob("*.npy")) + for speaker_dir in speaker_dirs + } + + self.speaker_dirs = speaker_dirs + self.speaker_to_utterances = speaker_utterances + + # meta data + self.num_speakers = len(self.speaker_dirs) + self.num_utterances = np.sum( + len(utterances) + for speaker, utterances in self.speaker_to_utterances.items()) + + def get_example_by_index(self, speaker_index, utterance_index): + speaker_dir = self.speaker_dirs[speaker_index] + fpath = self.speaker_to_utterances[speaker_dir][utterance_index] + return self[fpath] + + def __getitem__(self, fpath): + return np.load(fpath) + + def __len__(self): + return int(self.num_utterances) + + +class MultiSpeakerSampler(BatchSampler): + """A multi-stratal sampler designed for speaker verification task. + First, N speakers from all speakers are sampled randomly. Then, for each + speaker, randomly sample M utterances from their corresponding utterances. 
+ """ + + def __init__(self, + dataset: MultiSpeakerMelDataset, + speakers_per_batch: int, + utterances_per_speaker: int): + self._speakers = list(dataset.speaker_dirs) + self._speaker_to_utterances = dataset.speaker_to_utterances + + self.speakers_per_batch = speakers_per_batch + self.utterances_per_speaker = utterances_per_speaker + + def __iter__(self): + # yield list of Paths + speaker_generator = iter(random_cycle(self._speakers)) + speaker_utterances_generator = { + s: iter(random_cycle(us)) + for s, us in self._speaker_to_utterances.items() + } + + while True: + speakers = [] + for _ in range(self.speakers_per_batch): + speakers.append(next(speaker_generator)) + + utterances = [] + for s in speakers: + us = speaker_utterances_generator[s] + for _ in range(self.utterances_per_speaker): + utterances.append(next(us)) + yield utterances + + +class RandomClip(object): + def __init__(self, frames): + self.frames = frames + + def __call__(self, spec): + # spec [T, C] + T = spec.shape[0] + start = random.randint(0, T - self.frames) + return spec[start:start + self.frames, :] + + +class Collate(object): + def __init__(self, num_frames): + self.random_crop = RandomClip(num_frames) + + def __call__(self, examples): + frame_clips = [self.random_crop(mel) for mel in examples] + batced_clips = np.stack(frame_clips) + return batced_clips diff --git a/ernie-sat/paddlespeech/vector/exps/ge2e/train.py b/ernie-sat/paddlespeech/vector/exps/ge2e/train.py new file mode 100644 index 0000000000000000000000000000000000000000..bf1cf1074b5dec41f2287f5113b6facef9909283 --- /dev/null +++ b/ernie-sat/paddlespeech/vector/exps/ge2e/train.py @@ -0,0 +1,123 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import time + +from paddle import DataParallel +from paddle import distributed as dist +from paddle.io import DataLoader +from paddle.nn.clip import ClipGradByGlobalNorm +from paddle.optimizer import Adam + +from paddlespeech.t2s.training import default_argument_parser +from paddlespeech.t2s.training import ExperimentBase +from paddlespeech.vector.exps.ge2e.config import get_cfg_defaults +from paddlespeech.vector.exps.ge2e.speaker_verification_dataset import Collate +from paddlespeech.vector.exps.ge2e.speaker_verification_dataset import MultiSpeakerMelDataset +from paddlespeech.vector.exps.ge2e.speaker_verification_dataset import MultiSpeakerSampler +from paddlespeech.vector.models.lstm_speaker_encoder import LSTMSpeakerEncoder + + +class Ge2eExperiment(ExperimentBase): + def setup_model(self): + config = self.config + model = LSTMSpeakerEncoder(config.data.n_mels, config.model.num_layers, + config.model.hidden_size, + config.model.embedding_size) + optimizer = Adam( + config.training.learning_rate_init, + parameters=model.parameters(), + grad_clip=ClipGradByGlobalNorm(3)) + self.model = DataParallel(model) if self.parallel else model + self.model_core = model + self.optimizer = optimizer + + def setup_dataloader(self): + config = self.config + train_dataset = MultiSpeakerMelDataset(self.args.data) + sampler = MultiSpeakerSampler(train_dataset, + config.training.speakers_per_batch, + config.training.utterances_per_speaker) + train_loader = DataLoader( + train_dataset, + batch_sampler=sampler, + collate_fn=Collate(config.data.partial_n_frames), + num_workers=16) + + self.train_dataset = train_dataset + self.train_loader = train_loader + + def train_batch(self): + start = time.time() + batch = self.read_batch() + data_loader_time = time.time() - start + + self.optimizer.clear_grad() + self.model.train() + specs = batch + loss, eer = self.model(specs, self.config.training.speakers_per_batch) + loss.backward() + self.model_core.do_gradient_ops() + self.optimizer.step() + iteration_time = time.time() - start + + # logging + loss_value = float(loss) + msg = "Rank: {}, ".format(dist.get_rank()) + msg += "step: {}, ".format(self.iteration) + msg += "time: {:>.3f}s/{:>.3f}s, ".format(data_loader_time, + iteration_time) + msg += 'loss: {:>.6f} err: {:>.6f}'.format(loss_value, eer) + self.logger.info(msg) + + if dist.get_rank() == 0: + self.visualizer.add_scalar("train/loss", loss_value, self.iteration) + self.visualizer.add_scalar("train/eer", eer, self.iteration) + self.visualizer.add_scalar("param/w", + float(self.model_core.similarity_weight), + self.iteration) + self.visualizer.add_scalar("param/b", + float(self.model_core.similarity_bias), + self.iteration) + + def valid(self): + pass + + +def main_sp(config, args): + exp = Ge2eExperiment(config, args) + exp.setup() + exp.resume_or_load() + exp.run() + + +def main(config, args): + if args.ngpu > 1: + dist.spawn(main_sp, args=(config, args), nprocs=args.ngpu) + else: + main_sp(config, args) + + +if __name__ == "__main__": + config = get_cfg_defaults() + parser = default_argument_parser() + args = parser.parse_args() + if args.config: + config.merge_from_file(args.config) + if args.opts: + config.merge_from_list(args.opts) + config.freeze() + print(config) + print(args) + + main(config, args) diff --git a/ernie-sat/paddlespeech/vector/io/__init__.py b/ernie-sat/paddlespeech/vector/io/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..97043fd7ba6885aac81cad5a49924c23c67d4d47 --- /dev/null +++ 
b/ernie-sat/paddlespeech/vector/io/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/ernie-sat/paddlespeech/vector/io/augment.py b/ernie-sat/paddlespeech/vector/io/augment.py new file mode 100644 index 0000000000000000000000000000000000000000..3baace13977d0ba4ce324597b1d821850a79119b --- /dev/null +++ b/ernie-sat/paddlespeech/vector/io/augment.py @@ -0,0 +1,906 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# this is modified from SpeechBrain +# https://github.com/speechbrain/speechbrain/blob/085be635c07f16d42cd1295045bc46c407f1e15b/speechbrain/lobes/augment.py +import math +from typing import List + +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from paddleaudio.datasets.rirs_noises import OpenRIRNoise +from paddlespeech.s2t.utils.log import Log +from paddlespeech.vector.io.signal_processing import compute_amplitude +from paddlespeech.vector.io.signal_processing import convolve1d +from paddlespeech.vector.io.signal_processing import dB_to_amplitude +from paddlespeech.vector.io.signal_processing import notch_filter +from paddlespeech.vector.io.signal_processing import reverberate + +logger = Log(__name__).getlog() + + +# TODO: Complete type-hint and doc string. 
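The layers defined below (`DropFreq`, `DropChunk`, `Resample`, `SpeedPerturb`, `AddNoise`, `AddReverb`, `AddBabble`) are composed by `TimeDomainSpecAugment`, `EnvCorrupt` and the `build_augment_pipeline` / `waveform_augment` helpers at the end of this file; per the note in `build_augment_pipeline`, they are applied to already-batched waveforms rather than inside a `paddle.io.DataLoader`. A minimal sketch of that call pattern, where the batch size, clip duration and `target_dir` value are illustrative assumptions (the OpenRIR/noise data must be available under `target_dir`, and may be downloaded on first use):

```python
import paddle

from paddlespeech.vector.io.augment import build_augment_pipeline
from paddlespeech.vector.io.augment import waveform_augment

# assumed input: a batch of four 5-second utterances sampled at 16 kHz
waveforms = paddle.randn([4, 5 * 16000])  # (N, L)

# five stages: wavedrop, speed perturb, additive noise, reverb, noise + reverb
pipeline = build_augment_pipeline(target_dir="./data")

# the original batch is kept and one augmented copy per stage is appended,
# each trimmed or padded back to the original length L
augmented = waveform_augment(waveforms, pipeline)  # shape: (4 * 6, 5 * 16000)
```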
+class DropFreq(nn.Layer): + def __init__( + self, + drop_freq_low=1e-14, + drop_freq_high=1, + drop_count_low=1, + drop_count_high=2, + drop_width=0.05, + drop_prob=1, ): + super(DropFreq, self).__init__() + self.drop_freq_low = drop_freq_low + self.drop_freq_high = drop_freq_high + self.drop_count_low = drop_count_low + self.drop_count_high = drop_count_high + self.drop_width = drop_width + self.drop_prob = drop_prob + + def forward(self, waveforms): + # Don't drop (return early) 1-`drop_prob` portion of the batches + dropped_waveform = waveforms.clone() + if paddle.rand([1]) > self.drop_prob: + return dropped_waveform + + # Add channels dimension + if len(waveforms.shape) == 2: + dropped_waveform = dropped_waveform.unsqueeze(-1) + + # Pick number of frequencies to drop + drop_count = paddle.randint( + low=self.drop_count_low, high=self.drop_count_high + 1, shape=[1]) + + # Pick a frequency to drop + drop_range = self.drop_freq_high - self.drop_freq_low + drop_frequency = ( + paddle.rand([drop_count]) * drop_range + self.drop_freq_low) + + # Filter parameters + filter_length = 101 + pad = filter_length // 2 + + # Start with delta function + drop_filter = paddle.zeros([1, filter_length, 1]) + drop_filter[0, pad, 0] = 1 + + # Subtract each frequency + for frequency in drop_frequency: + notch_kernel = notch_filter(frequency, filter_length, + self.drop_width) + drop_filter = convolve1d(drop_filter, notch_kernel, pad) + + # Apply filter + dropped_waveform = convolve1d(dropped_waveform, drop_filter, pad) + + # Remove channels dimension if added + return dropped_waveform.squeeze(-1) + + +class DropChunk(nn.Layer): + def __init__( + self, + drop_length_low=100, + drop_length_high=1000, + drop_count_low=1, + drop_count_high=10, + drop_start=0, + drop_end=None, + drop_prob=1, + noise_factor=0.0, ): + super(DropChunk, self).__init__() + self.drop_length_low = drop_length_low + self.drop_length_high = drop_length_high + self.drop_count_low = drop_count_low + self.drop_count_high = drop_count_high + self.drop_start = drop_start + self.drop_end = drop_end + self.drop_prob = drop_prob + self.noise_factor = noise_factor + + # Validate low < high + if drop_length_low > drop_length_high: + raise ValueError("Low limit must not be more than high limit") + if drop_count_low > drop_count_high: + raise ValueError("Low limit must not be more than high limit") + + # Make sure the length doesn't exceed end - start + if drop_end is not None and drop_end >= 0: + if drop_start > drop_end: + raise ValueError("Low limit must not be more than high limit") + + drop_range = drop_end - drop_start + self.drop_length_low = min(drop_length_low, drop_range) + self.drop_length_high = min(drop_length_high, drop_range) + + def forward(self, waveforms, lengths): + # Reading input list + lengths = (lengths * waveforms.shape[1]).astype('int64') + batch_size = waveforms.shape[0] + dropped_waveform = waveforms.clone() + + # Don't drop (return early) 1-`drop_prob` portion of the batches + if paddle.rand([1]) > self.drop_prob: + return dropped_waveform + + # Store original amplitude for computing white noise amplitude + clean_amplitude = compute_amplitude(waveforms, lengths.unsqueeze(1)) + + # Pick a number of times to drop + drop_times = paddle.randint( + low=self.drop_count_low, + high=self.drop_count_high + 1, + shape=[batch_size], ) + + # Iterate batch to set mask + for i in range(batch_size): + if drop_times[i] == 0: + continue + + # Pick lengths + length = paddle.randint( + low=self.drop_length_low, + high=self.drop_length_high + 
1, + shape=[drop_times[i]], ) + + # Compute range of starting locations + start_min = self.drop_start + if start_min < 0: + start_min += lengths[i] + start_max = self.drop_end + if start_max is None: + start_max = lengths[i] + if start_max < 0: + start_max += lengths[i] + start_max = max(0, start_max - length.max()) + + # Pick starting locations + start = paddle.randint( + low=start_min, + high=start_max + 1, + shape=[drop_times[i]], ) + + end = start + length + + # Update waveform + if not self.noise_factor: + for j in range(drop_times[i]): + if start[j] < end[j]: + dropped_waveform[i, start[j]:end[j]] = 0.0 + else: + # Uniform distribution of -2 to +2 * avg amplitude should + # preserve the average for normalization + noise_max = 2 * clean_amplitude[i] * self.noise_factor + for j in range(drop_times[i]): + # zero-center the noise distribution + noise_vec = paddle.rand([length[j]], dtype='float32') + + noise_vec = 2 * noise_max * noise_vec - noise_max + dropped_waveform[i, int(start[j]):int(end[j])] = noise_vec + + return dropped_waveform + + +class Resample(nn.Layer): + def __init__( + self, + orig_freq=16000, + new_freq=16000, + lowpass_filter_width=6, ): + super(Resample, self).__init__() + self.orig_freq = orig_freq + self.new_freq = new_freq + self.lowpass_filter_width = lowpass_filter_width + + # Compute rate for striding + self._compute_strides() + assert self.orig_freq % self.conv_stride == 0 + assert self.new_freq % self.conv_transpose_stride == 0 + + def _compute_strides(self): + # Compute new unit based on ratio of in/out frequencies + base_freq = math.gcd(self.orig_freq, self.new_freq) + input_samples_in_unit = self.orig_freq // base_freq + self.output_samples = self.new_freq // base_freq + + # Store the appropriate stride based on the new units + self.conv_stride = input_samples_in_unit + self.conv_transpose_stride = self.output_samples + + def forward(self, waveforms): + if not hasattr(self, "first_indices"): + self._indices_and_weights(waveforms) + + # Don't do anything if the frequencies are the same + if self.orig_freq == self.new_freq: + return waveforms + + unsqueezed = False + if len(waveforms.shape) == 2: + waveforms = waveforms.unsqueeze(1) + unsqueezed = True + elif len(waveforms.shape) == 3: + waveforms = waveforms.transpose([0, 2, 1]) + else: + raise ValueError("Input must be 2 or 3 dimensions") + + # Do resampling + resampled_waveform = self._perform_resample(waveforms) + + if unsqueezed: + resampled_waveform = resampled_waveform.squeeze(1) + else: + resampled_waveform = resampled_waveform.transpose([0, 2, 1]) + + return resampled_waveform + + def _perform_resample(self, waveforms): + # Compute output size and initialize + batch_size, num_channels, wave_len = waveforms.shape + window_size = self.weights.shape[1] + tot_output_samp = self._output_samples(wave_len) + resampled_waveform = paddle.zeros((batch_size, num_channels, + tot_output_samp)) + + # eye size: (num_channels, num_channels, 1) + eye = paddle.eye(num_channels).unsqueeze(2) + + # Iterate over the phases in the polyphase filter + for i in range(self.first_indices.shape[0]): + wave_to_conv = waveforms + first_index = int(self.first_indices[i].item()) + if first_index >= 0: + # trim the signal as the filter will not be applied + # before the first_index + wave_to_conv = wave_to_conv[:, :, first_index:] + + # pad the right of the signal to allow partial convolutions + # meaning compute values for partial windows (e.g. 
end of the + # window is outside the signal length) + max_index = (tot_output_samp - 1) // self.output_samples + end_index = max_index * self.conv_stride + window_size + current_wave_len = wave_len - first_index + right_padding = max(0, end_index + 1 - current_wave_len) + left_padding = max(0, -first_index) + wave_to_conv = paddle.nn.functional.pad( + wave_to_conv, [left_padding, right_padding], data_format='NCL') + conv_wave = paddle.nn.functional.conv1d( + x=wave_to_conv, + # weight=self.weights[i].repeat(num_channels, 1, 1), + weight=self.weights[i].expand((num_channels, 1, -1)), + stride=self.conv_stride, + groups=num_channels, ) + + # we want conv_wave[:, i] to be at + # output[:, i + n*conv_transpose_stride] + dilated_conv_wave = paddle.nn.functional.conv1d_transpose( + conv_wave, eye, stride=self.conv_transpose_stride) + + # pad dilated_conv_wave so it reaches the output length if needed. + left_padding = i + previous_padding = left_padding + dilated_conv_wave.shape[-1] + right_padding = max(0, tot_output_samp - previous_padding) + dilated_conv_wave = paddle.nn.functional.pad( + dilated_conv_wave, [left_padding, right_padding], + data_format='NCL') + dilated_conv_wave = dilated_conv_wave[:, :, :tot_output_samp] + + resampled_waveform += dilated_conv_wave + + return resampled_waveform + + def _output_samples(self, input_num_samp): + samp_in = int(self.orig_freq) + samp_out = int(self.new_freq) + + tick_freq = abs(samp_in * samp_out) // math.gcd(samp_in, samp_out) + ticks_per_input_period = tick_freq // samp_in + + # work out the number of ticks in the time interval + # [ 0, input_num_samp/samp_in ). + interval_length = input_num_samp * ticks_per_input_period + if interval_length <= 0: + return 0 + ticks_per_output_period = tick_freq // samp_out + + # Get the last output-sample in the closed interval, + # i.e. replacing [ ) with [ ]. Note: integer division rounds down. + # See http://en.wikipedia.org/wiki/Interval_(mathematics) for an + # explanation of the notation. + last_output_samp = interval_length // ticks_per_output_period + + # We need the last output-sample in the open interval, so if it + # takes us to the end of the interval exactly, subtract one. + if last_output_samp * ticks_per_output_period == interval_length: + last_output_samp -= 1 + + # First output-sample index is zero, so the number of output samples + # is the last output-sample plus one. 
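+        # For example (illustrative): resampling input_num_samp=100 from
+        # 16 kHz to 8 kHz gives tick_freq=16000, interval_length=100 and
+        # ticks_per_output_period=2, so last_output_samp=100//2-1=49 and
+        # num_output_samp=50, i.e. exactly half of the input samples.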
+ num_output_samp = last_output_samp + 1 + + return num_output_samp + + def _indices_and_weights(self, waveforms): + # Lowpass filter frequency depends on smaller of two frequencies + min_freq = min(self.orig_freq, self.new_freq) + lowpass_cutoff = 0.99 * 0.5 * min_freq + + assert lowpass_cutoff * 2 <= min_freq + window_width = self.lowpass_filter_width / (2.0 * lowpass_cutoff) + + assert lowpass_cutoff < min(self.orig_freq, self.new_freq) / 2 + output_t = paddle.arange(start=0.0, end=self.output_samples) + output_t /= self.new_freq + min_t = output_t - window_width + max_t = output_t + window_width + + min_input_index = paddle.ceil(min_t * self.orig_freq) + max_input_index = paddle.floor(max_t * self.orig_freq) + num_indices = max_input_index - min_input_index + 1 + + max_weight_width = num_indices.max() + j = paddle.arange(max_weight_width, dtype='float32') + input_index = min_input_index.unsqueeze(1) + j.unsqueeze(0) + delta_t = (input_index / self.orig_freq) - output_t.unsqueeze(1) + + weights = paddle.zeros_like(delta_t) + inside_window_indices = delta_t.abs().less_than( + paddle.to_tensor(window_width)) + + # raised-cosine (Hanning) window with width `window_width` + weights[inside_window_indices] = 0.5 * (1 + paddle.cos( + 2 * math.pi * lowpass_cutoff / self.lowpass_filter_width * + delta_t.masked_select(inside_window_indices))) + + t_eq_zero_indices = delta_t.equal(paddle.zeros_like(delta_t)) + t_not_eq_zero_indices = delta_t.not_equal(paddle.zeros_like(delta_t)) + + # sinc filter function + weights = paddle.where( + t_not_eq_zero_indices, + weights * paddle.sin(2 * math.pi * lowpass_cutoff * delta_t) / + (math.pi * delta_t), weights) + + # limit of the function at t = 0 + weights = paddle.where(t_eq_zero_indices, weights * 2 * lowpass_cutoff, + weights) + + # size (output_samples, max_weight_width) + weights /= self.orig_freq + + self.first_indices = min_input_index + self.weights = weights + + +class SpeedPerturb(nn.Layer): + def __init__( + self, + orig_freq, + speeds=[90, 100, 110], + perturb_prob=1.0, ): + super(SpeedPerturb, self).__init__() + self.orig_freq = orig_freq + self.speeds = speeds + self.perturb_prob = perturb_prob + + # Initialize index of perturbation + self.samp_index = 0 + + # Initialize resamplers + self.resamplers = [] + for speed in self.speeds: + config = { + "orig_freq": self.orig_freq, + "new_freq": self.orig_freq * speed // 100, + } + self.resamplers.append(Resample(**config)) + + def forward(self, waveform): + # Don't perturb (return early) 1-`perturb_prob` portion of the batches + if paddle.rand([1]) > self.perturb_prob: + return waveform.clone() + + # Perform a random perturbation + self.samp_index = paddle.randint(len(self.speeds), shape=[1]).item() + perturbed_waveform = self.resamplers[self.samp_index](waveform) + + return perturbed_waveform + + +class AddNoise(nn.Layer): + def __init__( + self, + noise_dataset=None, # None for white noise + num_workers=0, + snr_low=0, + snr_high=0, + mix_prob=1.0, + start_index=None, + normalize=False, ): + super(AddNoise, self).__init__() + + self.num_workers = num_workers + self.snr_low = snr_low + self.snr_high = snr_high + self.mix_prob = mix_prob + self.start_index = start_index + self.normalize = normalize + self.noise_dataset = noise_dataset + self.noise_dataloader = None + + def forward(self, waveforms, lengths=None): + if lengths is None: + lengths = paddle.ones([len(waveforms)]) + + # Copy clean waveform to initialize noisy waveform + noisy_waveform = waveforms.clone() + lengths = (lengths * 
waveforms.shape[1]).astype('int64').unsqueeze(1) + + # Don't add noise (return early) 1-`mix_prob` portion of the batches + if paddle.rand([1]) > self.mix_prob: + return noisy_waveform + + # Compute the average amplitude of the clean waveforms + clean_amplitude = compute_amplitude(waveforms, lengths) + + # Pick an SNR and use it to compute the mixture amplitude factors + SNR = paddle.rand((len(waveforms), 1)) + SNR = SNR * (self.snr_high - self.snr_low) + self.snr_low + noise_amplitude_factor = 1 / (dB_to_amplitude(SNR) + 1) + new_noise_amplitude = noise_amplitude_factor * clean_amplitude + + # Scale clean signal appropriately + noisy_waveform *= 1 - noise_amplitude_factor + + # Loop through clean samples and create mixture + if self.noise_dataset is None: + white_noise = paddle.normal(shape=waveforms.shape) + noisy_waveform += new_noise_amplitude * white_noise + else: + tensor_length = waveforms.shape[1] + noise_waveform, noise_length = self._load_noise( + lengths, + tensor_length, ) + + # Rescale and add + noise_amplitude = compute_amplitude(noise_waveform, noise_length) + noise_waveform *= new_noise_amplitude / (noise_amplitude + 1e-14) + noisy_waveform += noise_waveform + + # Normalizing to prevent clipping + if self.normalize: + abs_max, _ = paddle.max( + paddle.abs(noisy_waveform), axis=1, keepdim=True) + noisy_waveform = noisy_waveform / abs_max.clip(min=1.0) + + return noisy_waveform + + def _load_noise(self, lengths, max_length): + """ + Load a batch of noises + + args + lengths(Paddle.Tensor): Num samples of waveforms with shape (N, 1). + max_length(int): Width of a batch. + """ + lengths = lengths.squeeze(1) + batch_size = len(lengths) + + # Load a noise batch + if self.noise_dataloader is None: + + def noise_collate_fn(batch): + def pad(x, target_length, mode='constant', **kwargs): + x = np.asarray(x) + w = target_length - x.shape[0] + assert w >= 0, f'Target length {target_length} is less than origin length {x.shape[0]}' + return np.pad(x, [0, w], mode=mode, **kwargs) + + ids = [item['id'] for item in batch] + lengths = np.asarray([item['feat'].shape[0] for item in batch]) + waveforms = list( + map(lambda x: pad(x, max(max_length, lengths.max().item())), + [item['feat'] for item in batch])) + waveforms = np.stack(waveforms) + return {'ids': ids, 'feats': waveforms, 'lengths': lengths} + + # Create noise data loader. 
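+            # The loader is built lazily on the first forward pass and then
+            # cycled indefinitely by _load_noise_batch(); noise_collate_fn pads
+            # every noise clip to at least the clean waveform length.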
+ self.noise_dataloader = paddle.io.DataLoader( + self.noise_dataset, + batch_size=batch_size, + shuffle=True, + num_workers=self.num_workers, + collate_fn=noise_collate_fn, + return_list=True, ) + self.noise_data = iter(self.noise_dataloader) + + noise_batch, noise_len = self._load_noise_batch_of_size(batch_size) + + # Select a random starting location in the waveform + start_index = self.start_index + if self.start_index is None: + start_index = 0 + max_chop = (noise_len - lengths).min().clip(min=1) + start_index = paddle.randint(high=max_chop, shape=[1]) + + # Truncate noise_batch to max_length + noise_batch = noise_batch[:, start_index:start_index + max_length] + noise_len = (noise_len - start_index).clip(max=max_length).unsqueeze(1) + return noise_batch, noise_len + + def _load_noise_batch_of_size(self, batch_size): + """Concatenate noise batches, then chop to correct size""" + noise_batch, noise_lens = self._load_noise_batch() + + # Expand + while len(noise_batch) < batch_size: + noise_batch = paddle.concat((noise_batch, noise_batch)) + noise_lens = paddle.concat((noise_lens, noise_lens)) + + # Contract + if len(noise_batch) > batch_size: + noise_batch = noise_batch[:batch_size] + noise_lens = noise_lens[:batch_size] + + return noise_batch, noise_lens + + def _load_noise_batch(self): + """Load a batch of noises, restarting iteration if necessary.""" + try: + batch = next(self.noise_data) + except StopIteration: + self.noise_data = iter(self.noise_dataloader) + batch = next(self.noise_data) + + noises, lens = batch['feats'], batch['lengths'] + return noises, lens + + +class AddReverb(nn.Layer): + def __init__( + self, + rir_dataset, + reverb_prob=1.0, + rir_scale_factor=1.0, + num_workers=0, ): + super(AddReverb, self).__init__() + self.rir_dataset = rir_dataset + self.reverb_prob = reverb_prob + self.rir_scale_factor = rir_scale_factor + + # Create rir data loader. + def rir_collate_fn(batch): + def pad(x, target_length, mode='constant', **kwargs): + x = np.asarray(x) + w = target_length - x.shape[0] + assert w >= 0, f'Target length {target_length} is less than origin length {x.shape[0]}' + return np.pad(x, [0, w], mode=mode, **kwargs) + + ids = [item['id'] for item in batch] + lengths = np.asarray([item['feat'].shape[0] for item in batch]) + waveforms = list( + map(lambda x: pad(x, lengths.max().item()), + [item['feat'] for item in batch])) + waveforms = np.stack(waveforms) + return {'ids': ids, 'feats': waveforms, 'lengths': lengths} + + self.rir_dataloader = paddle.io.DataLoader( + self.rir_dataset, + collate_fn=rir_collate_fn, + num_workers=num_workers, + shuffle=True, + return_list=True, ) + + self.rir_data = iter(self.rir_dataloader) + + def forward(self, waveforms, lengths=None): + """ + Arguments + --------- + waveforms : tensor + Shape should be `[batch, time]` or `[batch, time, channels]`. + lengths : tensor + Shape should be a single dimension, `[batch]`. + + Returns + ------- + Tensor of shape `[batch, time]` or `[batch, time, channels]`. 
+ """ + + if lengths is None: + lengths = paddle.ones([len(waveforms)]) + + # Don't add reverb (return early) 1-`reverb_prob` portion of the time + if paddle.rand([1]) > self.reverb_prob: + return waveforms.clone() + + # Add channels dimension if necessary + channel_added = False + if len(waveforms.shape) == 2: + waveforms = waveforms.unsqueeze(-1) + channel_added = True + + # Load and prepare RIR + rir_waveform = self._load_rir() + + # Compress or dilate RIR + if self.rir_scale_factor != 1: + rir_waveform = F.interpolate( + rir_waveform.transpose([0, 2, 1]), + scale_factor=self.rir_scale_factor, + mode="linear", + align_corners=False, + data_format='NCW', ) + # (N, C, L) -> (N, L, C) + rir_waveform = rir_waveform.transpose([0, 2, 1]) + + rev_waveform = reverberate( + waveforms, + rir_waveform, + self.rir_dataset.sample_rate, + rescale_amp="avg") + + # Remove channels dimension if added + if channel_added: + return rev_waveform.squeeze(-1) + + return rev_waveform + + def _load_rir(self): + try: + batch = next(self.rir_data) + except StopIteration: + self.rir_data = iter(self.rir_dataloader) + batch = next(self.rir_data) + + rir_waveform = batch['feats'] + + # Make sure RIR has correct channels + if len(rir_waveform.shape) == 2: + rir_waveform = rir_waveform.unsqueeze(-1) + + return rir_waveform + + +class AddBabble(nn.Layer): + def __init__( + self, + speaker_count=3, + snr_low=0, + snr_high=0, + mix_prob=1, ): + super(AddBabble, self).__init__() + self.speaker_count = speaker_count + self.snr_low = snr_low + self.snr_high = snr_high + self.mix_prob = mix_prob + + def forward(self, waveforms, lengths=None): + if lengths is None: + lengths = paddle.ones([len(waveforms)]) + + babbled_waveform = waveforms.clone() + lengths = (lengths * waveforms.shape[1]).unsqueeze(1) + batch_size = len(waveforms) + + # Don't mix (return early) 1-`mix_prob` portion of the batches + if paddle.rand([1]) > self.mix_prob: + return babbled_waveform + + # Pick an SNR and use it to compute the mixture amplitude factors + clean_amplitude = compute_amplitude(waveforms, lengths) + SNR = paddle.rand((batch_size, 1)) + SNR = SNR * (self.snr_high - self.snr_low) + self.snr_low + noise_amplitude_factor = 1 / (dB_to_amplitude(SNR) + 1) + new_noise_amplitude = noise_amplitude_factor * clean_amplitude + + # Scale clean signal appropriately + babbled_waveform *= 1 - noise_amplitude_factor + + # For each speaker in the mixture, roll and add + babble_waveform = waveforms.roll((1, ), axis=0) + babble_len = lengths.roll((1, ), axis=0) + for i in range(1, self.speaker_count): + babble_waveform += waveforms.roll((1 + i, ), axis=0) + babble_len = paddle.concat( + [babble_len, babble_len.roll((1, ), axis=0)], axis=-1).max( + axis=-1, keepdim=True) + + # Rescale and add to mixture + babble_amplitude = compute_amplitude(babble_waveform, babble_len) + babble_waveform *= new_noise_amplitude / (babble_amplitude + 1e-14) + babbled_waveform += babble_waveform + + return babbled_waveform + + +class TimeDomainSpecAugment(nn.Layer): + def __init__( + self, + perturb_prob=1.0, + drop_freq_prob=1.0, + drop_chunk_prob=1.0, + speeds=[95, 100, 105], + sample_rate=16000, + drop_freq_count_low=0, + drop_freq_count_high=3, + drop_chunk_count_low=0, + drop_chunk_count_high=5, + drop_chunk_length_low=1000, + drop_chunk_length_high=2000, + drop_chunk_noise_factor=0, ): + super(TimeDomainSpecAugment, self).__init__() + self.speed_perturb = SpeedPerturb( + perturb_prob=perturb_prob, + orig_freq=sample_rate, + speeds=speeds, ) + self.drop_freq = DropFreq( + 
drop_prob=drop_freq_prob, + drop_count_low=drop_freq_count_low, + drop_count_high=drop_freq_count_high, ) + self.drop_chunk = DropChunk( + drop_prob=drop_chunk_prob, + drop_count_low=drop_chunk_count_low, + drop_count_high=drop_chunk_count_high, + drop_length_low=drop_chunk_length_low, + drop_length_high=drop_chunk_length_high, + noise_factor=drop_chunk_noise_factor, ) + + def forward(self, waveforms, lengths=None): + if lengths is None: + lengths = paddle.ones([len(waveforms)]) + + with paddle.no_grad(): + # Augmentation + waveforms = self.speed_perturb(waveforms) + waveforms = self.drop_freq(waveforms) + waveforms = self.drop_chunk(waveforms, lengths) + + return waveforms + + +class EnvCorrupt(nn.Layer): + def __init__( + self, + reverb_prob=1.0, + babble_prob=1.0, + noise_prob=1.0, + rir_dataset=None, + noise_dataset=None, + num_workers=0, + babble_speaker_count=0, + babble_snr_low=0, + babble_snr_high=0, + noise_snr_low=0, + noise_snr_high=0, + rir_scale_factor=1.0, ): + super(EnvCorrupt, self).__init__() + + # Initialize corrupters + if rir_dataset is not None and reverb_prob > 0.0: + self.add_reverb = AddReverb( + rir_dataset=rir_dataset, + num_workers=num_workers, + reverb_prob=reverb_prob, + rir_scale_factor=rir_scale_factor, ) + + if babble_speaker_count > 0 and babble_prob > 0.0: + self.add_babble = AddBabble( + speaker_count=babble_speaker_count, + snr_low=babble_snr_low, + snr_high=babble_snr_high, + mix_prob=babble_prob, ) + + if noise_dataset is not None and noise_prob > 0.0: + self.add_noise = AddNoise( + noise_dataset=noise_dataset, + num_workers=num_workers, + snr_low=noise_snr_low, + snr_high=noise_snr_high, + mix_prob=noise_prob, ) + + def forward(self, waveforms, lengths=None): + if lengths is None: + lengths = paddle.ones([len(waveforms)]) + + # Augmentation + with paddle.no_grad(): + if hasattr(self, "add_reverb"): + try: + waveforms = self.add_reverb(waveforms, lengths) + except Exception: + pass + if hasattr(self, "add_babble"): + waveforms = self.add_babble(waveforms, lengths) + if hasattr(self, "add_noise"): + waveforms = self.add_noise(waveforms, lengths) + + return waveforms + + +def build_augment_pipeline(target_dir=None) -> List[paddle.nn.Layer]: + """build augment pipeline + Note: this pipeline cannot be used in the paddle.DataLoader + + Returns: + List[paddle.nn.Layer]: all augment process + """ + logger.info("start to build the augment pipeline") + noise_dataset = OpenRIRNoise('noise', target_dir=target_dir) + rir_dataset = OpenRIRNoise('rir', target_dir=target_dir) + + wavedrop = TimeDomainSpecAugment( + sample_rate=16000, + speeds=[100], ) + speed_perturb = TimeDomainSpecAugment( + sample_rate=16000, + speeds=[95, 100, 105], ) + add_noise = EnvCorrupt( + noise_dataset=noise_dataset, + reverb_prob=0.0, + noise_prob=1.0, + noise_snr_low=0, + noise_snr_high=15, + rir_scale_factor=1.0, ) + add_rev = EnvCorrupt( + rir_dataset=rir_dataset, + reverb_prob=1.0, + noise_prob=0.0, + rir_scale_factor=1.0, ) + add_rev_noise = EnvCorrupt( + noise_dataset=noise_dataset, + rir_dataset=rir_dataset, + reverb_prob=1.0, + noise_prob=1.0, + noise_snr_low=0, + noise_snr_high=15, + rir_scale_factor=1.0, ) + + return [wavedrop, speed_perturb, add_noise, add_rev, add_rev_noise] + + +def waveform_augment(waveforms: paddle.Tensor, + augment_pipeline: List[paddle.nn.Layer]) -> paddle.Tensor: + """process the augment pipeline and return all the waveforms + + Args: + waveforms (paddle.Tensor): original batch waveform + augment_pipeline (List[paddle.nn.Layer]): agument pipeline process 
+ + Returns: + paddle.Tensor: all the audio waveform including the original waveform and augmented waveform + """ + # stage 0: store the original waveforms + waveforms_aug_list = [waveforms] + + # augment the original batch waveform + for aug in augment_pipeline: + # stage 1: augment the data + waveforms_aug = aug(waveforms) # (N, L) + if waveforms_aug.shape[1] >= waveforms.shape[1]: + # Trunc + waveforms_aug = waveforms_aug[:, :waveforms.shape[1]] + else: + # Pad + lengths_to_pad = waveforms.shape[1] - waveforms_aug.shape[1] + waveforms_aug = F.pad( + waveforms_aug.unsqueeze(-1), [0, lengths_to_pad], + data_format='NLC').squeeze(-1) + # stage 2: append the augmented waveform into the list + waveforms_aug_list.append(waveforms_aug) + + # get the all the waveforms + return paddle.concat(waveforms_aug_list, axis=0) diff --git a/ernie-sat/paddlespeech/vector/io/batch.py b/ernie-sat/paddlespeech/vector/io/batch.py new file mode 100644 index 0000000000000000000000000000000000000000..92ca990cf2dd83f6a22127e15b50885e6809c21f --- /dev/null +++ b/ernie-sat/paddlespeech/vector/io/batch.py @@ -0,0 +1,166 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import numpy +import numpy as np +import paddle + + +def waveform_collate_fn(batch): + waveforms = np.stack([item['feat'] for item in batch]) + labels = np.stack([item['label'] for item in batch]) + + return {'waveforms': waveforms, 'labels': labels} + + +def feature_normalize(feats: paddle.Tensor, + mean_norm: bool=True, + std_norm: bool=True, + convert_to_numpy: bool=False): + # Features normalization if needed + # numpy.mean is a little with paddle.mean about 1e-6 + if convert_to_numpy: + feats_np = feats.numpy() + mean = feats_np.mean(axis=-1, keepdims=True) if mean_norm else 0 + std = feats_np.std(axis=-1, keepdims=True) if std_norm else 1 + feats_np = (feats_np - mean) / std + feats = paddle.to_tensor(feats_np, dtype=feats.dtype) + else: + mean = feats.mean(axis=-1, keepdim=True) if mean_norm else 0 + std = feats.std(axis=-1, keepdim=True) if std_norm else 1 + feats = (feats - mean) / std + + return feats + + +def pad_right_2d(x, target_length, axis=-1, mode='constant', **kwargs): + x = np.asarray(x) + assert len( + x.shape) == 2, f'Only 2D arrays supported, but got shape: {x.shape}' + + w = target_length - x.shape[axis] + assert w >= 0, f'Target length {target_length} is less than origin length {x.shape[axis]}' + + if axis == 0: + pad_width = [[0, w], [0, 0]] + else: + pad_width = [[0, 0], [0, w]] + + return np.pad(x, pad_width, mode=mode, **kwargs) + + +def batch_feature_normalize(batch, mean_norm: bool=True, std_norm: bool=True): + ids = [item['id'] for item in batch] + lengths = np.asarray([item['feat'].shape[1] for item in batch]) + feats = list( + map(lambda x: pad_right_2d(x, lengths.max()), + [item['feat'] for item in batch])) + feats = np.stack(feats) + + # Features normalization if needed + for i in range(len(feats)): + feat = feats[i][:, :lengths[i]] # Excluding pad 
values. + mean = feat.mean(axis=-1, keepdims=True) if mean_norm else 0 + std = feat.std(axis=-1, keepdims=True) if std_norm else 1 + feats[i][:, :lengths[i]] = (feat - mean) / std + assert feats[i][:, lengths[ + i]:].sum() == 0 # Padding valus should all be 0. + + # Converts into ratios. + # the utterance of the max length doesn't need to padding + # the remaining utterances need to padding and all of them will be padded to max length + # we convert the original length of each utterance to the ratio of the max length + lengths = (lengths / lengths.max()).astype(np.float32) + + return {'ids': ids, 'feats': feats, 'lengths': lengths} + + +def pad_right_to(array, target_shape, mode="constant", value=0): + """ + This function takes a numpy array of arbitrary shape and pads it to target + shape by appending values on the right. + + Args: + array: input numpy array. Input array whose dimension we need to pad. + target_shape : (list, tuple). Target shape we want for the target array its len must be equal to array.ndim + mode : str. Pad mode, please refer to numpy.pad documentation. + value : float. Pad value, please refer to numpy.pad documentation. + + Returns: + array: numpy.array. Padded array. + valid_vals : list. List containing proportion for each dimension of original, non-padded values. + """ + assert len(target_shape) == array.ndim + pads = [] # this contains the abs length of the padding for each dimension. + valid_vals = [] # this contains the relative lengths for each dimension. + i = 0 # iterating over target_shape ndims + while i < len(target_shape): + assert (target_shape[i] >= array.shape[i] + ), "Target shape must be >= original shape for every dim" + pads.append([0, target_shape[i] - array.shape[i]]) + valid_vals.append(array.shape[i] / target_shape[i]) + i += 1 + + array = numpy.pad(array, pads, mode=mode, constant_values=value) + + return array, valid_vals + + +def batch_pad_right(arrays, mode="constant", value=0): + """Given a list of numpy arrays it batches them together by padding to the right + on each dimension in order to get same length for all. + + Args: + arrays : list. List of array we wish to pad together. + mode : str. Padding mode see numpy.pad documentation. + value : float. Padding value see numpy.pad documentation. + + Returns: + array : numpy.array. Padded array. + valid_vals : list. List containing proportion for each dimension of original, non-padded values. + """ + + if not len(arrays): + raise IndexError("arrays list must not be empty") + + if len(arrays) == 1: + # if there is only one array in the batch we simply unsqueeze it. + return numpy.expand_dims(arrays[0], axis=0), numpy.array([1.0]) + + if not (any( + [arrays[i].ndim == arrays[0].ndim for i in range(1, len(arrays))])): + raise IndexError("All arrays must have same number of dimensions") + + # FIXME we limit the support here: we allow padding of only the last dimension + # need to remove this when feat extraction is updated to handle multichannel. 
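+    # e.g. feature arrays of shape (80, 120) and (80, 95) are both padded on
+    # the last axis to (80, 120), stacked into a (2, 80, 120) batch, and the
+    # returned valid ratios are [1.0, 95/120].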
+ max_shape = [] + for dim in range(arrays[0].ndim): + if dim != (arrays[0].ndim - 1): + if not all( + [x.shape[dim] == arrays[0].shape[dim] for x in arrays[1:]]): + raise EnvironmentError( + "arrays should have same dimensions except for last one") + max_shape.append(max([x.shape[dim] for x in arrays])) + + batched = [] + valid = [] + for t in arrays: + # for each array we apply pad_right_to + padded, valid_percent = pad_right_to( + t, max_shape, mode=mode, value=value) + batched.append(padded) + valid.append(valid_percent[-1]) + + batched = numpy.stack(batched) + + return batched, numpy.array(valid) diff --git a/ernie-sat/paddlespeech/vector/io/signal_processing.py b/ernie-sat/paddlespeech/vector/io/signal_processing.py new file mode 100644 index 0000000000000000000000000000000000000000..ee939bdb1042d14a653d3bbf1496c20f0336d2ce --- /dev/null +++ b/ernie-sat/paddlespeech/vector/io/signal_processing.py @@ -0,0 +1,217 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import numpy as np +import paddle + +# TODO: Complete type-hint and doc string. + + +def blackman_window(win_len, dtype=np.float32): + arcs = np.pi * np.arange(win_len) / float(win_len) + win = np.asarray( + [0.42 - 0.5 * np.cos(2 * arc) + 0.08 * np.cos(4 * arc) for arc in arcs], + dtype=dtype) + return paddle.to_tensor(win) + + +def compute_amplitude(waveforms, lengths=None, amp_type="avg", scale="linear"): + if len(waveforms.shape) == 1: + waveforms = waveforms.unsqueeze(0) + + assert amp_type in ["avg", "peak"] + assert scale in ["linear", "dB"] + + if amp_type == "avg": + if lengths is None: + out = paddle.mean(paddle.abs(waveforms), axis=1, keepdim=True) + else: + wav_sum = paddle.sum(paddle.abs(waveforms), axis=1, keepdim=True) + out = wav_sum / lengths + elif amp_type == "peak": + out = paddle.max(paddle.abs(waveforms), axis=1, keepdim=True) + else: + raise NotImplementedError + + if scale == "linear": + return out + elif scale == "dB": + return paddle.clip(20 * paddle.log10(out), min=-80) + else: + raise NotImplementedError + + +def dB_to_amplitude(SNR): + return 10**(SNR / 20) + + +def convolve1d( + waveform, + kernel, + padding=0, + pad_type="constant", + stride=1, + groups=1, ): + if len(waveform.shape) != 3: + raise ValueError("Convolve1D expects a 3-dimensional tensor") + + # Padding can be a tuple (left_pad, right_pad) or an int + if isinstance(padding, list): + waveform = paddle.nn.functional.pad( + x=waveform, + pad=padding, + mode=pad_type, + data_format='NLC', ) + + # Move time dimension last, which pad and fft and conv expect. + # (N, L, C) -> (N, C, L) + waveform = waveform.transpose([0, 2, 1]) + kernel = kernel.transpose([0, 2, 1]) + + convolved = paddle.nn.functional.conv1d( + x=waveform, + weight=kernel, + stride=stride, + groups=groups, + padding=padding if not isinstance(padding, list) else 0, ) + + # Return time dimension to the second dimension. 
+ return convolved.transpose([0, 2, 1]) + + +def notch_filter(notch_freq, filter_width=101, notch_width=0.05): + # Check inputs + assert 0 < notch_freq <= 1 + assert filter_width % 2 != 0 + pad = filter_width // 2 + inputs = paddle.arange(filter_width, dtype='float32') - pad + + # Avoid frequencies that are too low + notch_freq += notch_width + + # Define sinc function, avoiding division by zero + def sinc(x): + def _sinc(x): + return paddle.sin(x) / x + + # The zero is at the middle index + res = paddle.concat( + [_sinc(x[:pad]), paddle.ones([1]), _sinc(x[pad + 1:])]) + return res + + # Compute a low-pass filter with cutoff frequency notch_freq. + hlpf = sinc(3 * (notch_freq - notch_width) * inputs) + # import torch + # hlpf *= paddle.to_tensor(torch.blackman_window(filter_width).detach().numpy()) + hlpf *= blackman_window(filter_width) + hlpf /= paddle.sum(hlpf) + + # Compute a high-pass filter with cutoff frequency notch_freq. + hhpf = sinc(3 * (notch_freq + notch_width) * inputs) + # hhpf *= paddle.to_tensor(torch.blackman_window(filter_width).detach().numpy()) + hhpf *= blackman_window(filter_width) + hhpf /= -paddle.sum(hhpf) + hhpf[pad] += 1 + + # Adding filters creates notch filter + return (hlpf + hhpf).reshape([1, -1, 1]) + + +def reverberate(waveforms, + rir_waveform, + sample_rate, + impulse_duration=0.3, + rescale_amp="avg"): + orig_shape = waveforms.shape + + if len(waveforms.shape) > 3 or len(rir_waveform.shape) > 3: + raise NotImplementedError + + # if inputs are mono tensors we reshape to 1, samples + if len(waveforms.shape) == 1: + waveforms = waveforms.unsqueeze(0).unsqueeze(-1) + elif len(waveforms.shape) == 2: + waveforms = waveforms.unsqueeze(-1) + + if len(rir_waveform.shape) == 1: # convolve1d expects a 3d tensor ! + rir_waveform = rir_waveform.unsqueeze(0).unsqueeze(-1) + elif len(rir_waveform.shape) == 2: + rir_waveform = rir_waveform.unsqueeze(-1) + + # Compute the average amplitude of the clean + orig_amplitude = compute_amplitude(waveforms, waveforms.shape[1], + rescale_amp) + + # Compute index of the direct signal, so we can preserve alignment + impulse_index_start = rir_waveform.abs().argmax(axis=1).item() + impulse_index_end = min( + impulse_index_start + int(sample_rate * impulse_duration), + rir_waveform.shape[1]) + rir_waveform = rir_waveform[:, impulse_index_start:impulse_index_end, :] + rir_waveform = rir_waveform / paddle.norm(rir_waveform, p=2) + rir_waveform = paddle.flip(rir_waveform, [1]) + + waveforms = convolve1d( + waveform=waveforms, + kernel=rir_waveform, + padding=[rir_waveform.shape[1] - 1, 0], ) + + # Rescale to the peak amplitude of the clean waveform + waveforms = rescale(waveforms, waveforms.shape[1], orig_amplitude, + rescale_amp) + + if len(orig_shape) == 1: + waveforms = waveforms.squeeze(0).squeeze(-1) + if len(orig_shape) == 2: + waveforms = waveforms.squeeze(-1) + + return waveforms + + +def rescale(waveforms, lengths, target_lvl, amp_type="avg", scale="linear"): + assert amp_type in ["peak", "avg"] + assert scale in ["linear", "dB"] + + batch_added = False + if len(waveforms.shape) == 1: + batch_added = True + waveforms = waveforms.unsqueeze(0) + + waveforms = normalize(waveforms, lengths, amp_type) + + if scale == "linear": + out = target_lvl * waveforms + elif scale == "dB": + out = dB_to_amplitude(target_lvl) * waveforms + + else: + raise NotImplementedError("Invalid scale, choose between dB and linear") + + if batch_added: + out = out.squeeze(0) + + return out + + +def normalize(waveforms, lengths=None, amp_type="avg", 
eps=1e-14): + assert amp_type in ["avg", "peak"] + + batch_added = False + if len(waveforms.shape) == 1: + batch_added = True + waveforms = waveforms.unsqueeze(0) + + den = compute_amplitude(waveforms, lengths, amp_type) + eps + if batch_added: + waveforms = waveforms.squeeze(0) + return waveforms / den diff --git a/ernie-sat/paddlespeech/vector/models/__init__.py b/ernie-sat/paddlespeech/vector/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..185a92b8d94d3426d616c0624f0f2ee04339349e --- /dev/null +++ b/ernie-sat/paddlespeech/vector/models/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/ernie-sat/paddlespeech/vector/models/ecapa_tdnn.py b/ernie-sat/paddlespeech/vector/models/ecapa_tdnn.py new file mode 100644 index 0000000000000000000000000000000000000000..895ff13f4509c7070d2473aebf8ce693a50dbcee --- /dev/null +++ b/ernie-sat/paddlespeech/vector/models/ecapa_tdnn.py @@ -0,0 +1,520 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import math + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + + +def length_to_mask(length, max_len=None, dtype=None): + assert len(length.shape) == 1 + + if max_len is None: + max_len = length.max().astype( + 'int').item() # using arange to generate mask + mask = paddle.arange( + max_len, dtype=length.dtype).expand( + (len(length), max_len)) < length.unsqueeze(1) + + if dtype is None: + dtype = length.dtype + + mask = paddle.to_tensor(mask, dtype=dtype) + return mask + + +class Conv1d(nn.Layer): + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding="same", + dilation=1, + groups=1, + bias=True, + padding_mode="reflect", ): + """_summary_ + + Args: + in_channels (int): intput channel or input data dimensions + out_channels (int): output channel or output data dimensions + kernel_size (int): kernel size of 1-d convolution + stride (int, optional): strid in 1-d convolution . Defaults to 1. + padding (str, optional): padding value. Defaults to "same". + dilation (int, optional): dilation in 1-d convolution. Defaults to 1. + groups (int, optional): groups in 1-d convolution. Defaults to 1. + bias (bool, optional): bias in 1-d convolution . Defaults to True. + padding_mode (str, optional): padding mode. Defaults to "reflect". 
+ """ + super().__init__() + + self.kernel_size = kernel_size + self.stride = stride + self.dilation = dilation + self.padding = padding + self.padding_mode = padding_mode + + self.conv = nn.Conv1D( + in_channels, + out_channels, + self.kernel_size, + stride=self.stride, + padding=0, + dilation=self.dilation, + groups=groups, + bias_attr=bias, ) + + def forward(self, x): + """Do conv1d forward + + Args: + x (paddle.Tensor): [N, C, L] input data, + N is the batch, + C is the data dimension, + L is the time + + Raises: + ValueError: only support the same padding type + + Returns: + paddle.Tensor: the value of conv1d + """ + if self.padding == "same": + x = self._manage_padding(x, self.kernel_size, self.dilation, + self.stride) + else: + raise ValueError("Padding must be 'same'. Got {self.padding}") + + return self.conv(x) + + def _manage_padding(self, x, kernel_size: int, dilation: int, stride: int): + """Padding the input data + + Args: + x (paddle.Tensor): [N, C, L] input data + N is the batch, + C is the data dimension, + L is the time + kernel_size (int): 1-d convolution kernel size + dilation (int): 1-d convolution dilation + stride (int): 1-d convolution stride + + Returns: + paddle.Tensor: the padded input data + """ + L_in = x.shape[-1] # Detecting input shape + padding = self._get_padding_elem(L_in, stride, kernel_size, + dilation) # Time padding + x = F.pad( + x, padding, mode=self.padding_mode, + data_format="NCL") # Applying padding + return x + + def _get_padding_elem(self, + L_in: int, + stride: int, + kernel_size: int, + dilation: int): + """Calculate the padding value in same mode + + Args: + L_in (int): the times of the input data, + stride (int): 1-d convolution stride + kernel_size (int): 1-d convolution kernel size + dilation (int): 1-d convolution stride + + Returns: + int: return the padding value in same mode + """ + if stride > 1: + n_steps = math.ceil(((L_in - kernel_size * dilation) / stride) + 1) + L_out = stride * (n_steps - 1) + kernel_size * dilation + padding = [kernel_size // 2, kernel_size // 2] + else: + L_out = (L_in - dilation * (kernel_size - 1) - 1) // stride + 1 + + padding = [(L_in - L_out) // 2, (L_in - L_out) // 2] + + return padding + + +class BatchNorm1d(nn.Layer): + def __init__( + self, + input_size, + eps=1e-05, + momentum=0.9, + weight_attr=None, + bias_attr=None, + data_format='NCL', + use_global_stats=None, ): + super().__init__() + + self.norm = nn.BatchNorm1D( + input_size, + epsilon=eps, + momentum=momentum, + weight_attr=weight_attr, + bias_attr=bias_attr, + data_format=data_format, + use_global_stats=use_global_stats, ) + + def forward(self, x): + x_n = self.norm(x) + return x_n + + +class TDNNBlock(nn.Layer): + def __init__( + self, + in_channels, + out_channels, + kernel_size, + dilation, + activation=nn.ReLU, ): + """Implementation of TDNN network + + Args: + in_channels (int): input channels or input embedding dimensions + out_channels (int): output channels or output embedding dimensions + kernel_size (int): the kernel size of the TDNN network block + dilation (int): the dilation of the TDNN network block + activation (paddle class, optional): the activation layers. Defaults to nn.ReLU. 
+ """ + super().__init__() + self.conv = Conv1d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + dilation=dilation, ) + self.activation = activation() + self.norm = BatchNorm1d(input_size=out_channels) + + def forward(self, x): + return self.norm(self.activation(self.conv(x))) + + +class Res2NetBlock(nn.Layer): + def __init__(self, in_channels, out_channels, scale=8, dilation=1): + """Implementation of Res2Net Block with dilation + The paper is refered as "Res2Net: A New Multi-scale Backbone Architecture", + whose url is https://arxiv.org/abs/1904.01169 + Args: + in_channels (int): input channels or input dimensions + out_channels (int): output channels or output dimensions + scale (int, optional): scale in res2net bolck. Defaults to 8. + dilation (int, optional): dilation of 1-d convolution in TDNN block. Defaults to 1. + """ + super().__init__() + assert in_channels % scale == 0 + assert out_channels % scale == 0 + + in_channel = in_channels // scale + hidden_channel = out_channels // scale + + self.blocks = nn.LayerList([ + TDNNBlock( + in_channel, hidden_channel, kernel_size=3, dilation=dilation) + for i in range(scale - 1) + ]) + self.scale = scale + + def forward(self, x): + y = [] + for i, x_i in enumerate(paddle.chunk(x, self.scale, axis=1)): + if i == 0: + y_i = x_i + elif i == 1: + y_i = self.blocks[i - 1](x_i) + else: + y_i = self.blocks[i - 1](x_i + y_i) + y.append(y_i) + y = paddle.concat(y, axis=1) + return y + + +class SEBlock(nn.Layer): + def __init__(self, in_channels, se_channels, out_channels): + """Implementation of SEBlock + The paper is refered as "Squeeze-and-Excitation Networks" + whose url is https://arxiv.org/abs/1709.01507 + Args: + in_channels (int): input channels or input data dimensions + se_channels (_type_): _description_ + out_channels (int): output channels or output data dimensions + """ + super().__init__() + + self.conv1 = Conv1d( + in_channels=in_channels, out_channels=se_channels, kernel_size=1) + self.relu = paddle.nn.ReLU() + self.conv2 = Conv1d( + in_channels=se_channels, out_channels=out_channels, kernel_size=1) + self.sigmoid = paddle.nn.Sigmoid() + + def forward(self, x, lengths=None): + L = x.shape[-1] + if lengths is not None: + mask = length_to_mask(lengths * L, max_len=L) + mask = mask.unsqueeze(1) + total = mask.sum(axis=2, keepdim=True) + s = (x * mask).sum(axis=2, keepdim=True) / total + else: + s = x.mean(axis=2, keepdim=True) + + s = self.relu(self.conv1(s)) + s = self.sigmoid(self.conv2(s)) + + return s * x + + +class AttentiveStatisticsPooling(nn.Layer): + def __init__(self, channels, attention_channels=128, global_context=True): + """Compute the speaker verification statistics + The detail info is section 3.1 in https://arxiv.org/pdf/1709.01507.pdf + Args: + channels (int): input data channel or data dimension + attention_channels (int, optional): attention dimension. Defaults to 128. + global_context (bool, optional): If use the global context information. Defaults to True. 
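A small shape check for `Res2NetBlock` above (illustrative sizes, class assumed in scope): the channels are chunked into `scale` groups, group 0 passes through untouched, and each later group is filtered by a small `TDNNBlock` and added to the previous group's output, so the overall (N, C, L) shape is preserved:

```python
import paddle

x = paddle.randn([2, 64, 50])
block = Res2NetBlock(in_channels=64, out_channels=64, scale=8, dilation=2)
y = block(x)
print(y.shape)   # expected: [2, 64, 50]
```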
+ """ + super().__init__() + + self.eps = 1e-12 + self.global_context = global_context + if global_context: + self.tdnn = TDNNBlock(channels * 3, attention_channels, 1, 1) + else: + self.tdnn = TDNNBlock(channels, attention_channels, 1, 1) + self.tanh = nn.Tanh() + self.conv = Conv1d( + in_channels=attention_channels, + out_channels=channels, + kernel_size=1) + + def forward(self, x, lengths=None): + C, L = x.shape[1], x.shape[2] # KP: (N, C, L) + + def _compute_statistics(x, m, axis=2, eps=self.eps): + mean = (m * x).sum(axis) + std = paddle.sqrt( + (m * (x - mean.unsqueeze(axis)).pow(2)).sum(axis).clip(eps)) + return mean, std + + if lengths is None: + lengths = paddle.ones([x.shape[0]]) + + # Make binary mask of shape [N, 1, L] + mask = length_to_mask(lengths * L, max_len=L) + mask = mask.unsqueeze(1) + + # Expand the temporal context of the pooling layer by allowing the + # self-attention to look at global properties of the utterance. + if self.global_context: + total = mask.sum(axis=2, keepdim=True).astype('float32') + mean, std = _compute_statistics(x, mask / total) + mean = mean.unsqueeze(2).tile((1, 1, L)) + std = std.unsqueeze(2).tile((1, 1, L)) + attn = paddle.concat([x, mean, std], axis=1) + else: + attn = x + + # Apply layers + attn = self.conv(self.tanh(self.tdnn(attn))) + + # Filter out zero-paddings + attn = paddle.where( + mask.tile((1, C, 1)) == 0, + paddle.ones_like(attn) * float("-inf"), attn) + + attn = F.softmax(attn, axis=2) + mean, std = _compute_statistics(x, attn) + + # Append mean and std of the batch + pooled_stats = paddle.concat((mean, std), axis=1) + pooled_stats = pooled_stats.unsqueeze(2) + + return pooled_stats + + +class SERes2NetBlock(nn.Layer): + def __init__( + self, + in_channels, + out_channels, + res2net_scale=8, + se_channels=128, + kernel_size=1, + dilation=1, + activation=nn.ReLU, ): + """Implementation of Squeeze-Extraction Res2Blocks in ECAPA-TDNN network model + The paper is refered "Squeeze-and-Excitation Networks" + whose url is: https://arxiv.org/pdf/1709.01507.pdf + Args: + in_channels (int): input channels or input data dimensions + out_channels (int): output channels or output data dimensions + res2net_scale (int, optional): scale in the res2net block. Defaults to 8. + se_channels (int, optional): embedding dimensions of res2net block. Defaults to 128. + kernel_size (int, optional): kernel size of 1-d convolution in TDNN block. Defaults to 1. + dilation (int, optional): dilation of 1-d convolution in TDNN block. Defaults to 1. + activation (paddle.nn.class, optional): activation function. Defaults to nn.ReLU. 
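A shape sketch for `AttentiveStatisticsPooling` above (class assumed in scope, sizes are illustrative): the layer concatenates the attention-weighted mean and standard deviation over time, so (N, C, L) becomes (N, 2*C, 1); `lengths` holds the relative length of each utterance in the batch:

```python
import paddle

x = paddle.randn([2, 1536, 120])
asp = AttentiveStatisticsPooling(channels=1536, attention_channels=128)
lengths = paddle.to_tensor([1.0, 0.75])   # relative (proportional) lengths
stats = asp(x, lengths=lengths)
print(stats.shape)   # expected: [2, 3072, 1]
```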
+ """ + super().__init__() + self.out_channels = out_channels + self.tdnn1 = TDNNBlock( + in_channels, + out_channels, + kernel_size=1, + dilation=1, + activation=activation, ) + self.res2net_block = Res2NetBlock(out_channels, out_channels, + res2net_scale, dilation) + self.tdnn2 = TDNNBlock( + out_channels, + out_channels, + kernel_size=1, + dilation=1, + activation=activation, ) + self.se_block = SEBlock(out_channels, se_channels, out_channels) + + self.shortcut = None + if in_channels != out_channels: + self.shortcut = Conv1d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, ) + + def forward(self, x, lengths=None): + residual = x + if self.shortcut: + residual = self.shortcut(x) + + x = self.tdnn1(x) + x = self.res2net_block(x) + x = self.tdnn2(x) + x = self.se_block(x, lengths) + + return x + residual + + +class EcapaTdnn(nn.Layer): + def __init__( + self, + input_size, + lin_neurons=192, + activation=nn.ReLU, + channels=[512, 512, 512, 512, 1536], + kernel_sizes=[5, 3, 3, 3, 1], + dilations=[1, 2, 3, 4, 1], + attention_channels=128, + res2net_scale=8, + se_channels=128, + global_context=True, ): + """Implementation of ECAPA-TDNN backbone model network + The paper is refered as "ECAPA-TDNN: Emphasized Channel Attention, Propagation and Aggregation in TDNN Based Speaker Verification" + whose url is: https://arxiv.org/abs/2005.07143 + Args: + input_size (_type_): input fature dimension + lin_neurons (int, optional): speaker embedding size. Defaults to 192. + activation (paddle.nn.class, optional): activation function. Defaults to nn.ReLU. + channels (list, optional): inter embedding dimension. Defaults to [512, 512, 512, 512, 1536]. + kernel_sizes (list, optional): kernel size of 1-d convolution in TDNN block . Defaults to [5, 3, 3, 3, 1]. + dilations (list, optional): dilations of 1-d convolution in TDNN block. Defaults to [1, 2, 3, 4, 1]. + attention_channels (int, optional): attention dimensions. Defaults to 128. + res2net_scale (int, optional): scale value in res2net. Defaults to 8. + se_channels (int, optional): dimensions of squeeze-excitation block. Defaults to 128. + global_context (bool, optional): global context flag. Defaults to True. + """ + super().__init__() + assert len(channels) == len(kernel_sizes) + assert len(channels) == len(dilations) + self.channels = channels + self.blocks = nn.LayerList() + self.emb_size = lin_neurons + + # The initial TDNN layer + self.blocks.append( + TDNNBlock( + input_size, + channels[0], + kernel_sizes[0], + dilations[0], + activation, )) + + # SE-Res2Net layers + for i in range(1, len(channels) - 1): + self.blocks.append( + SERes2NetBlock( + channels[i - 1], + channels[i], + res2net_scale=res2net_scale, + se_channels=se_channels, + kernel_size=kernel_sizes[i], + dilation=dilations[i], + activation=activation, )) + + # Multi-layer feature aggregation + self.mfa = TDNNBlock( + channels[-1], + channels[-1], + kernel_sizes[-1], + dilations[-1], + activation, ) + + # Attentive Statistical Pooling + self.asp = AttentiveStatisticsPooling( + channels[-1], + attention_channels=attention_channels, + global_context=global_context, ) + self.asp_bn = BatchNorm1d(input_size=channels[-1] * 2) + + # Final linear transformation + self.fc = Conv1d( + in_channels=channels[-1] * 2, + out_channels=self.emb_size, + kernel_size=1, ) + + def forward(self, x, lengths=None): + """ + Compute embeddings. + + Args: + x (paddle.Tensor): Input log-fbanks with shape (N, n_mels, T). 
+ lengths (paddle.Tensor, optional): Length proportions of batch length with shape (N). Defaults to None. + + Returns: + paddle.Tensor: Output embeddings with shape (N, self.emb_size, 1) + """ + xl = [] + for layer in self.blocks: + try: + x = layer(x, lengths=lengths) + except TypeError: + x = layer(x) + xl.append(x) + + # Multi-layer feature aggregation + x = paddle.concat(xl[1:], axis=1) + x = self.mfa(x) + + # Attentive Statistical Pooling + x = self.asp(x, lengths=lengths) + x = self.asp_bn(x) + + # Final linear transformation + x = self.fc(x) + + return x diff --git a/ernie-sat/paddlespeech/vector/models/lstm_speaker_encoder.py b/ernie-sat/paddlespeech/vector/models/lstm_speaker_encoder.py new file mode 100644 index 0000000000000000000000000000000000000000..f92fddc0e85c84e3112306d5298e4f76e703471f --- /dev/null +++ b/ernie-sat/paddlespeech/vector/models/lstm_speaker_encoder.py @@ -0,0 +1,147 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import numpy as np +import paddle +from paddle import nn +from paddle.nn import functional as F +from paddle.nn import initializer as I +from scipy.interpolate import interp1d +from scipy.optimize import brentq +from sklearn.metrics import roc_curve + + +class LSTMSpeakerEncoder(nn.Layer): + def __init__(self, n_mels, num_layers, hidden_size, output_size): + super().__init__() + self.lstm = nn.LSTM(n_mels, hidden_size, num_layers) + self.linear = nn.Linear(hidden_size, output_size) + self.similarity_weight = self.create_parameter( + [1], default_initializer=I.Constant(10.)) + self.similarity_bias = self.create_parameter( + [1], default_initializer=I.Constant(-5.)) + + def forward(self, utterances, num_speakers, initial_states=None): + normalized_embeds = self.embed_sequences(utterances, initial_states) + embeds = normalized_embeds.reshape([num_speakers, -1, num_speakers]) + loss, eer = self.loss(embeds) + return loss, eer + + def embed_sequences(self, utterances, initial_states=None, reduce=False): + out, (h, c) = self.lstm(utterances, initial_states) + embeds = F.relu(self.linear(h[-1])) + normalized_embeds = F.normalize(embeds) + if reduce: + embed = paddle.mean(normalized_embeds, 0) + embed = F.normalize(embed, axis=0) + return embed + return normalized_embeds + + def embed_utterance(self, utterances, initial_states=None): + # utterances: [B, T, C] -> embed [C'] + embed = self.embed_sequences(utterances, initial_states, reduce=True) + return embed + + def similarity_matrix(self, embeds): + # (N, M, C) + speakers_per_batch, utterances_per_speaker, embed_dim = embeds.shape + + # Inclusive centroids (1 per speaker). 
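An end-to-end shape sketch for the `EcapaTdnn` backbone defined above (class assumed in scope; feature sizes are illustrative): 80-dimensional log-fbanks over 200 frames are mapped to one 192-dimensional speaker embedding per utterance:

```python
import paddle

model = EcapaTdnn(input_size=80, lin_neurons=192)
feats = paddle.randn([4, 80, 200])   # (N, n_mels, T)
emb = model(feats)
print(emb.shape)   # expected: [4, 192, 1]
```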
Cloning is needed for reverse differentiation + centroids_incl = paddle.mean(embeds, axis=1) + centroids_incl_norm = paddle.norm( + centroids_incl, p=2, axis=1, keepdim=True) + normalized_centroids_incl = centroids_incl / centroids_incl_norm + + # Exclusive centroids (1 per utterance) + centroids_excl = paddle.broadcast_to( + paddle.sum(embeds, axis=1, keepdim=True), embeds.shape) - embeds + centroids_excl /= (utterances_per_speaker - 1) + centroids_excl_norm = paddle.norm( + centroids_excl, p=2, axis=2, keepdim=True) + normalized_centroids_excl = centroids_excl / centroids_excl_norm + + p1 = paddle.matmul( + embeds.reshape([-1, embed_dim]), + normalized_centroids_incl, + transpose_y=True) # (NMN) + p1 = p1.reshape([-1]) + # print("p1: ", p1.shape) + p2 = paddle.bmm( + embeds.reshape([-1, 1, embed_dim]), + normalized_centroids_excl.reshape([-1, embed_dim, 1])) # (NM, 1, 1) + p2 = p2.reshape([-1]) # (NM) + + # begin: alternative implementation for scatter + with paddle.no_grad(): + index = paddle.arange( + 0, speakers_per_batch * utterances_per_speaker, + dtype="int64").reshape( + [speakers_per_batch, utterances_per_speaker]) + index = index * speakers_per_batch + paddle.arange( + 0, speakers_per_batch, dtype="int64").unsqueeze(-1) + index = paddle.reshape(index, [-1]) + ones = paddle.ones( + [speakers_per_batch * utterances_per_speaker * speakers_per_batch]) + zeros = paddle.zeros_like(index, dtype=ones.dtype) + mask_p1 = paddle.scatter(ones, index, zeros) + p = p1 * mask_p1 + (1 - mask_p1) * paddle.scatter(ones, index, p2) + # end: alternative implementation for scatter + # p = paddle.scatter(p1, index, p2) + + p = p * self.similarity_weight + self.similarity_bias # neg + p = p.reshape( + [speakers_per_batch * utterances_per_speaker, speakers_per_batch]) + return p, p1, p2 + + def do_gradient_ops(self): + for p in [self.similarity_weight, self.similarity_bias]: + g = p._grad_ivar() + g = g * 0.01 + + def inv_argmax(self, i, num): + return np.eye(1, num, i, dtype=int)[0] + + def loss(self, embeds): + """ + Computes the softmax loss according the section 2.1 of GE2E. + + :param embeds: the embeddings as a tensor of shape (speakers_per_batch, + utterances_per_speaker, embedding_size) + :return: the loss and the EER for this batch of embeddings. + """ + speakers_per_batch, utterances_per_speaker = embeds.shape[:2] + + # Loss + sim_matrix, *_ = self.similarity_matrix(embeds) + sim_matrix = sim_matrix.reshape( + [speakers_per_batch * utterances_per_speaker, speakers_per_batch]) + target = paddle.arange( + 0, speakers_per_batch, dtype="int64").unsqueeze(-1) + target = paddle.expand(target, + [speakers_per_batch, utterances_per_speaker]) + target = paddle.reshape(target, [-1]) + + loss = nn.CrossEntropyLoss()(sim_matrix, target) + + # EER (not backpropagated) + with paddle.no_grad(): + ground_truth = target.numpy() + labels = np.array( + [self.inv_argmax(i, speakers_per_batch) for i in ground_truth]) + preds = sim_matrix.numpy() + + # Snippet from https://yangcha.github.io/EER-ROC/ + fpr, tpr, thresholds = roc_curve(labels.flatten(), preds.flatten()) + eer = brentq(lambda x: 1. - x - interp1d(fpr, tpr)(x), 0., 1.) + + return loss, eer diff --git a/ernie-sat/paddlespeech/vector/modules/__init__.py b/ernie-sat/paddlespeech/vector/modules/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..97043fd7ba6885aac81cad5a49924c23c67d4d47 --- /dev/null +++ b/ernie-sat/paddlespeech/vector/modules/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. 
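The EER computation used in `LSTMSpeakerEncoder.loss` can be exercised on its own; this stand-alone sketch uses the same `roc_curve`/`brentq` recipe as the code above, on made-up labels and scores:

```python
import numpy as np
from scipy.interpolate import interp1d
from scipy.optimize import brentq
from sklearn.metrics import roc_curve

labels = np.array([1, 1, 0, 0, 1, 0, 0, 1])   # 1 = same speaker, 0 = different
scores = np.array([0.9, 0.8, 0.7, 0.3, 0.6, 0.4, 0.2, 0.5])
fpr, tpr, _ = roc_curve(labels, scores)
eer = brentq(lambda x: 1. - x - interp1d(fpr, tpr)(x), 0., 1.)
print(f"EER = {eer:.3f}")   # about 0.25 for these invented scores
```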
All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/ernie-sat/paddlespeech/vector/modules/loss.py b/ernie-sat/paddlespeech/vector/modules/loss.py new file mode 100644 index 0000000000000000000000000000000000000000..1c80dda4fc7ccd02aa0c66f5c9a24c5dd4e97a64 --- /dev/null +++ b/ernie-sat/paddlespeech/vector/modules/loss.py @@ -0,0 +1,93 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This is modified from SpeechBrain +# https://github.com/speechbrain/speechbrain/blob/085be635c07f16d42cd1295045bc46c407f1e15b/speechbrain/nnet/losses.py +import math + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + + +class AngularMargin(nn.Layer): + def __init__(self, margin=0.0, scale=1.0): + """An implementation of Angular Margin (AM) proposed in the following + paper: '''Margin Matters: Towards More Discriminative Deep Neural Network + Embeddings for Speaker Recognition''' (https://arxiv.org/abs/1906.07317) + + Args: + margin (float, optional): The margin for cosine similiarity. Defaults to 0.0. + scale (float, optional): The scale for cosine similiarity. Defaults to 1.0. + """ + super(AngularMargin, self).__init__() + self.margin = margin + self.scale = scale + + def forward(self, outputs, targets): + outputs = outputs - self.margin * targets + return self.scale * outputs + + +class AdditiveAngularMargin(AngularMargin): + def __init__(self, margin=0.0, scale=1.0, easy_margin=False): + """The Implementation of Additive Angular Margin (AAM) proposed + in the following paper: '''Margin Matters: Towards More Discriminative Deep Neural Network Embeddings for Speaker Recognition''' + (https://arxiv.org/abs/1906.07317) + + Args: + margin (float, optional): margin factor. Defaults to 0.0. + scale (float, optional): scale factor. Defaults to 1.0. + easy_margin (bool, optional): easy_margin flag. Defaults to False. 
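A numeric sketch of the plain `AngularMargin` above (class assumed in scope): the margin is subtracted only from the target-class cosine score before rescaling, which is what makes the target class harder to satisfy during training:

```python
import paddle

cosine = paddle.to_tensor([[0.9, 0.1, 0.2]])    # cosine scores for 3 classes
targets = paddle.to_tensor([[1.0, 0.0, 0.0]])   # one-hot target
am = AngularMargin(margin=0.2, scale=30.0)
print(am(cosine, targets).numpy())              # [[21.  3.  6.]]
```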
+ """ + super(AdditiveAngularMargin, self).__init__(margin, scale) + self.easy_margin = easy_margin + + self.cos_m = math.cos(self.margin) + self.sin_m = math.sin(self.margin) + self.th = math.cos(math.pi - self.margin) + self.mm = math.sin(math.pi - self.margin) * self.margin + + def forward(self, outputs, targets): + cosine = outputs.astype('float32') + sine = paddle.sqrt(1.0 - paddle.pow(cosine, 2)) + phi = cosine * self.cos_m - sine * self.sin_m # cos(theta + m) + if self.easy_margin: + phi = paddle.where(cosine > 0, phi, cosine) + else: + phi = paddle.where(cosine > self.th, phi, cosine - self.mm) + outputs = (targets * phi) + ((1.0 - targets) * cosine) + return self.scale * outputs + + +class LogSoftmaxWrapper(nn.Layer): + def __init__(self, loss_fn): + """Speaker identificatin loss function wrapper + including all of compositions of the loss transformation + Args: + loss_fn (_type_): the loss value of a batch + """ + super(LogSoftmaxWrapper, self).__init__() + self.loss_fn = loss_fn + self.criterion = paddle.nn.KLDivLoss(reduction="sum") + + def forward(self, outputs, targets, length=None): + targets = F.one_hot(targets, outputs.shape[1]) + try: + predictions = self.loss_fn(outputs, targets) + except TypeError: + predictions = self.loss_fn(outputs) + + predictions = F.log_softmax(predictions, axis=1) + loss = self.criterion(predictions, targets) / targets.sum() + return loss diff --git a/ernie-sat/paddlespeech/vector/modules/sid_model.py b/ernie-sat/paddlespeech/vector/modules/sid_model.py new file mode 100644 index 0000000000000000000000000000000000000000..4045f75d1286bf2efc5b9a27f9cef25d715a8690 --- /dev/null +++ b/ernie-sat/paddlespeech/vector/modules/sid_model.py @@ -0,0 +1,87 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + + +class SpeakerIdetification(nn.Layer): + def __init__( + self, + backbone, + num_class, + lin_blocks=0, + lin_neurons=192, + dropout=0.1, ): + """The speaker identification model, which includes the speaker backbone network + and the a linear transform to speaker class num in training + + Args: + backbone (Paddle.nn.Layer class): the speaker identification backbone network model + num_class (_type_): the speaker class num in the training dataset + lin_blocks (int, optional): the linear layer transform between the embedding and the final linear layer. Defaults to 0. + lin_neurons (int, optional): the output dimension of final linear layer. Defaults to 192. + dropout (float, optional): the dropout factor on the embedding. Defaults to 0.1. 
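A hypothetical wiring of the loss pieces above: `AdditiveAngularMargin` wrapped in `LogSoftmaxWrapper`, applied to cosine-like logits and integer speaker labels (both classes assumed in scope; the logits here are a random stand-in for the normalized-embedding/normalized-weight product that the classifier produces):

```python
import paddle

logits = paddle.nn.functional.normalize(paddle.randn([4, 10]), axis=1)  # values in [-1, 1]
labels = paddle.to_tensor([0, 3, 7, 1], dtype='int64')
criterion = LogSoftmaxWrapper(AdditiveAngularMargin(margin=0.2, scale=30.0))
loss = criterion(logits, labels)
print(float(loss))
```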
+ """ + super(SpeakerIdetification, self).__init__() + # speaker idenfication backbone network model + # the output of the backbond network is the target embedding + self.backbone = backbone + if dropout > 0: + self.dropout = nn.Dropout(dropout) + else: + self.dropout = None + + # construct the speaker classifer + input_size = self.backbone.emb_size + self.blocks = nn.LayerList() + for i in range(lin_blocks): + self.blocks.extend([ + nn.BatchNorm1D(input_size), + nn.Linear(in_features=input_size, out_features=lin_neurons), + ]) + input_size = lin_neurons + + # the final layer + self.weight = paddle.create_parameter( + shape=(input_size, num_class), + dtype='float32', + attr=paddle.ParamAttr(initializer=nn.initializer.XavierUniform()), ) + + def forward(self, x, lengths=None): + """Do the speaker identification model forwrd, + including the speaker embedding model and the classifier model network + + Args: + x (paddle.Tensor): input audio feats, + shape=[batch, dimension, times] + lengths (paddle.Tensor, optional): input audio length. + shape=[batch, times] + Defaults to None. + + Returns: + paddle.Tensor: return the logits of the feats + """ + # x.shape: (N, C, L) + x = self.backbone(x, lengths).squeeze( + -1) # (N, emb_size, 1) -> (N, emb_size) + if self.dropout is not None: + x = self.dropout(x) + + for fc in self.blocks: + x = fc(x) + + logits = F.linear(F.normalize(x), F.normalize(self.weight, axis=0)) + + return logits diff --git a/ernie-sat/paddlespeech/vector/training/__init__.py b/ernie-sat/paddlespeech/vector/training/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..97043fd7ba6885aac81cad5a49924c23c67d4d47 --- /dev/null +++ b/ernie-sat/paddlespeech/vector/training/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/ernie-sat/paddlespeech/vector/training/scheduler.py b/ernie-sat/paddlespeech/vector/training/scheduler.py new file mode 100644 index 0000000000000000000000000000000000000000..3dcac0576c6b0eb7e76d60afaab410a5971faafa --- /dev/null +++ b/ernie-sat/paddlespeech/vector/training/scheduler.py @@ -0,0 +1,45 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
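Putting the pieces together, a hypothetical end-to-end check of the speaker-ID head: an `EcapaTdnn` backbone wrapped by `SpeakerIdetification` maps (N, n_mels, T) features to per-speaker logits. The import paths assume `ernie-sat/` (or an installed paddlespeech) is on the Python path:

```python
import paddle
from paddlespeech.vector.models.ecapa_tdnn import EcapaTdnn
from paddlespeech.vector.modules.sid_model import SpeakerIdetification

backbone = EcapaTdnn(input_size=80, lin_neurons=192)
sid = SpeakerIdetification(backbone=backbone, num_class=1000)
feats = paddle.randn([2, 80, 150])
logits = sid(feats)
print(logits.shape)   # expected: [2, 1000]
```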
+from paddle.optimizer.lr import LRScheduler + + +class CyclicLRScheduler(LRScheduler): + def __init__(self, + base_lr: float=1e-8, + max_lr: float=1e-3, + step_size: int=10000): + + super(CyclicLRScheduler, self).__init__() + + self.current_step = -1 + self.base_lr = base_lr + self.max_lr = max_lr + self.step_size = step_size + + def step(self): + if not hasattr(self, 'current_step'): + return + + self.current_step += 1 + if self.current_step >= 2 * self.step_size: + self.current_step %= 2 * self.step_size + + self.last_lr = self.get_lr() + + def get_lr(self): + p = self.current_step / (2 * self.step_size) # Proportion in one cycle. + if p < 0.5: # Increase + return self.base_lr + p / 0.5 * (self.max_lr - self.base_lr) + else: # Decrease + return self.max_lr - (p / 0.5 - 1) * (self.max_lr - self.base_lr) diff --git a/ernie-sat/paddlespeech/vector/training/seeding.py b/ernie-sat/paddlespeech/vector/training/seeding.py new file mode 100644 index 0000000000000000000000000000000000000000..0778a27d61943ad63095a72a045b8ea52d8602d6 --- /dev/null +++ b/ernie-sat/paddlespeech/vector/training/seeding.py @@ -0,0 +1,28 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from paddlespeech.s2t.utils.log import Log + +logger = Log(__name__).getlog() +import random + +import numpy as np +import paddle + + +def seed_everything(seed: int): + """Seed paddle, random and np.random to help reproductivity.""" + paddle.seed(seed) + random.seed(seed) + np.random.seed(seed) + logger.info(f"Set the seed of paddle, random, np.random to {seed}.") diff --git a/ernie-sat/paddlespeech/vector/utils/__init__.py b/ernie-sat/paddlespeech/vector/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..97043fd7ba6885aac81cad5a49924c23c67d4d47 --- /dev/null +++ b/ernie-sat/paddlespeech/vector/utils/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/ernie-sat/paddlespeech/vector/utils/time.py b/ernie-sat/paddlespeech/vector/utils/time.py new file mode 100644 index 0000000000000000000000000000000000000000..8e85b0e120a3f0cf12d2e52aa3c397c873c3a869 --- /dev/null +++ b/ernie-sat/paddlespeech/vector/utils/time.py @@ -0,0 +1,66 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. 
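A small probe of the triangular schedule implemented by `CyclicLRScheduler` above (class assumed in scope, and assuming the Paddle >= 2.0 `LRScheduler` base class). In real training `step()` is called once per iteration; here `current_step` is set directly just to sample the curve:

```python
sched = CyclicLRScheduler(base_lr=1e-8, max_lr=1e-3, step_size=10000)
for step in (0, 5000, 10000, 15000):
    sched.current_step = step
    print(step, sched.get_lr())
# roughly: 1e-8 at step 0, 5e-4 at 5000, 1e-3 at the cycle peak (10000),
# then back down to about 5e-4 at 15000
```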
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import math +import time + + +class Timer(object): + '''Calculate runing speed and estimated time of arrival(ETA)''' + + def __init__(self, total_step: int): + self.total_step = total_step + self.last_start_step = 0 + self.current_step = 0 + self._is_running = True + + def start(self): + self.last_time = time.time() + self.start_time = time.time() + + def stop(self): + self._is_running = False + self.end_time = time.time() + + def count(self) -> int: + if not self.current_step >= self.total_step: + self.current_step += 1 + return self.current_step + + @property + def timing(self) -> float: + run_steps = self.current_step - self.last_start_step + self.last_start_step = self.current_step + time_used = time.time() - self.last_time + self.last_time = time.time() + return time_used / run_steps + + @property + def is_running(self) -> bool: + return self._is_running + + @property + def eta(self) -> str: + if not self.is_running: + return '00:00:00' + remaining_time = time.time() - self.start_time + return seconds_to_hms(remaining_time) + + +def seconds_to_hms(seconds: int) -> str: + '''Convert the number of seconds to hh:mm:ss''' + h = math.floor(seconds / 3600) + m = math.floor((seconds - h * 3600) / 60) + s = int(seconds - h * 3600 - m * 60) + hms_str = '{:0>2}:{:0>2}:{:0>2}'.format(h, m, s) + return hms_str diff --git a/ernie-sat/phn_mapping.txt b/ernie-sat/phn_mapping.txt new file mode 100644 index 0000000000000000000000000000000000000000..9553ced03b20fa0030dc51b3b5543978508dfc2c --- /dev/null +++ b/ernie-sat/phn_mapping.txt @@ -0,0 +1,306 @@ +ou3 ou3 +a3 a3 +eng4 eng4 +u1 u1 +vn2 vn2 +uang3 uang3 +ang3 ang3 +ua1 ua1 +ou1 ou1 +in3 in3 +uai4 uai4 +van1 van1 +en2 en2 +ia4 ia4 +uai2 uai2 +iang4 iang4 +ai3 ai3 +sp sp +in1 in1 +uai3 uai3 +ve1 ve1 +ou4 ou4 +d d +ang2 ang2 +iang3 iang3 +o1 o1 +iao3 iao3 +an1 an1 +en5 en5 +ong3 ong3 +e5 e5 +e3 e3 +van3 van3 +i3 i3 +i2 i2 +uo4 uo4 +i1 i1 +in2 in2 +v1 v1 +uang4 uang4 +en3 en3 +ian5 ian5 +ie3 ie3 +o2 o2 +x x +iang2 iang2 +ei1 ei1 +uang2 uang2 +t t +ao4 ao4 +ch ch +o3 o3 +en1 en1 +ie1 ie1 +uan3 uan3 +uo1 uo1 +iang5 iang5 +iong1 iong1 +l l +a5 a5 +an4 an4 +u2 u2 +ei3 ei3 +uo3 uo3 +ai2 ai2 +v3 v3 +k k +uan4 uan4 +ian2 ian2 +ei2 ei2 +sh sh +g g +ong2 ong2 +ing1 ing1 +vn3 vn3 +r r +ong1 ong1 +ao1 ao1 +ua3 ua3 +ia1 ia1 +u3 u3 +s s +b b +e2 e2 +ua4 ua4 +iang1 iang1 +ie4 ie4 +ou5 ou5 +ing4 ing4 +ai1 ai1 +iong4 iong4 +uo5 uo5 +ei5 ei5 +ueng1 ueng1 +ou2 ou2 +e1 e1 +f f +en4 en4 +v2 v2 +iao2 iao2 +ie2 ie2 +van2 van2 +eng1 eng1 +ai4 ai4 +uo2 uo2 +iao1 iao1 +in4 in4 +er4 er4 +e4 e4 +uan1 uan1 +ia3 ia3 +ao2 ao2 +u4 u4 +ei4 ei4 +eng3 eng3 +z z +j j +ve3 ve3 +n n +an3 an3 +uan2 uan2 +o5 o5 +ve2 ve2 +ang4 ang4 +er2 er2 +ia5 ia5 +ian4 ian4 +er5 er5 +ia2 ia2 +eng2 eng2 +ie5 ie5 +ang1 ang1 +er3 er3 +ian1 ian1 + +c c +v4 v4 +iao4 iao4 +a4 a4 +m m +a2 a2 +ong4 ong4 +q q +uang1 uang1 +an2 an2 +ua2 ua2 +zh zh +ing2 ing2 +ve4 ve4 +van4 van4 +vn4 vn4 +iong3 iong3 +i4 i4 +ian3 ian3 +ing3 ing3 +p p +iong2 iong2 +ao3 ao3 +vn1 vn1 +uai1 uai1 +a1 a1 +o4 o4 +h h +uenr4 un4 ee er5 +iaor3 iao3 ee er2 +iour4 ii iu4 ee er2 +iangr4 ii iang4 ee er5 +iou3 ii iu3 +sil sp +iour1 iu1 
ee er5 +vn5 vn1 +ir1 i1 ee er2 +vanr1 van1 ee er2 +vanr2 van2 ee er5 +air3 ai3 ee er2 +uangr4 uu uang1 +enr1 en1 ee er2 +iour3 ii iu3 ee er5 +uenr1 un1 ee er5 +uenr3 un3 ee er5 +or2 o2 ee er2 +anr3 an3 ee er5 +ai5 ai4 +iaor2 iao2 ee er2 +uanr3 uan3 ee er5 +uanr2 uu uan4 ee er2 +uen1 un1 +ua5 uu ua2 +uen3 uu un3 +iii4 ix4 +uor1 uo1 ee er5 +our2 ou5 ee er2 +uei1 uu ui1 +vr3 v3 ee er5 +uenr2 un2 ee er5 +uanr5 uu uan2 ee er5 +iiir4 ix4 ee er5 +iiir1 ix1 ee er5 +ur2 u3 ee er5 +eng5 eng1 +ingr1 ii ing1 ee er2 +ii4 iy4 +ve5 vv ve1 +? +ii1 iy1 +ao5 ao3 +v5 vv v2 +ing5 ing2 +i5 i1 +iou5 ii iu3 +uen4 un4 +our4 ou4 ee er5 +io3 ii iu3 +ar4 a4 ee er5 +ingr2 ing2 ee er5 +ingr4 ing4 ee er5 +ir3 e5 ee er5 +iaor4 iao4 ee er5 +ii2 ix2 +uanr4 uan4 ee er5 +enr5 en4 ee er2 +ianr3 ian3 ee er5 +uei5 uu ui2 +ianr4 ian4 ee er2 +iar4 ia4 ee er2 +uair4 uai1 ee er2 +enr2 en2 ee er5 +iii1 ix1 +ver3 ve3 ee er2 +ianr5 ian3 ee er5 +ong5 ong1 +air2 ai2 ee er5 +angr4 ang4 ee er5 +iii5 ix2 +ang5 ang1 +iou1 iu1 +uar4 ua4 ee er5 +ur4 u4 ee er5 +iou4 iu4 +iou2 ii iu2 +in5 in1 +uor2 uo2 ee er5 +uar2 ua2 ee er5 +uei2 uu ui2 + +anr1 an1 ee er5 +ar5 a1 ee er5 +uen2 un2 +eir4 ei4 ee er2 +ingr3 ii ing3 ee er5 +aor4 ao4 ee er5 +enr4 en4 ee er5 +iao5 ii iao2 +iii2 ix2 +er1 e1 ee er5 +iaor1 iao1 ee er5 +ueir1 ui1 ee er2 +inr4 in4 ee er5 +ueir2 ui4 ee er5 +uan5 ai2 ee er5 +ir4 i4 ee er2 +ur1 u1 ee er5 +iour2 iu1 ee er2 +ar2 a2 ee er5 +an5 an2 +iii3 ix3 +ver4 vv ve4 ee er2 +。 +aor3 ao3 ee er5 +iong5 ii iong4 +u5 u4 +air4 ai4 ee er5 +ii3 iy3 +our5 ou4 ee er5 +inr1 in1 ee er5 +uor3 uo3 ee er5 +van5 van4 +ur5 u4 ee er2 +aor5 ao4 ee er5 +engr4 eng4 ee er2 +ueir4 ui4 ee er5 + +angr2 ang2 ee er2 +ii5 iy5 +vnr2 vn2 ee er5 +enr3 en3 ee er5 +uar1 ua1 ee er2 +vanr4 van4 ee er5 +, +uor5 uo3 ee er5 +uei4 ui4 +aor1 ao1 ee er5 +uen5 uu un4 +anr4 an4 ee er5 +iar1 ia1 ee er5 +vanr3 van3 ee er5 +uei3 uu ui3 +! 
+io1 ii uo5 +spl +ar3 a3 ee er5 +our3 ou3 ee er5 +ueir3 ui3 ee er5 +ianr2 ian3 ee er5 +ueng4 uu un4 +ianr1 ian1 ee er5 diff --git a/ernie-sat/prompt/dev/mfa_end b/ernie-sat/prompt/dev/mfa_end new file mode 100644 index 0000000000000000000000000000000000000000..70c1237a09a56ba06791b9f05c0f694a3816336b --- /dev/null +++ b/ernie-sat/prompt/dev/mfa_end @@ -0,0 +1,3 @@ +Prompt_003_new 0.0425 0.0925 0.1825 0.2125 0.2425 0.3225 0.3725 0.4725 0.5325 0.5625 0.6225 0.7425 0.8625 0.9725 0.9975 1.0125 1.0825 1.2625 1.3125 +p299_096 0.7525 0.7925 0.8725 0.9125 0.9425 1.0325 1.0625 1.1925 1.2625 1.3225 1.3725 1.4125 1.5125 1.5425 1.6525 1.6925 1.7325 1.7625 1.8425 1.9625 2.0225 2.1825 2.3325 2.6825 +p243_new 1.0225 1.0525 1.0925 1.1325 1.1725 1.2625 1.3625 1.4125 1.5125 1.6225 1.6625 1.7925 1.8625 2.0025 2.0925 2.1725 2.2625 2.4325 2.4725 2.5225 2.5825 2.6125 2.6425 2.7425 2.8025 2.9025 2.9525 3.0525 3.0825 3.2125 3.4525 diff --git a/ernie-sat/prompt/dev/mfa_start b/ernie-sat/prompt/dev/mfa_start new file mode 100644 index 0000000000000000000000000000000000000000..a975f8aafeab902a2da5d981f74126d1debf2290 --- /dev/null +++ b/ernie-sat/prompt/dev/mfa_start @@ -0,0 +1,3 @@ +Prompt_003_new 0.0125 0.0425 0.0925 0.1825 0.2125 0.2425 0.3225 0.3725 0.4725 0.5325 0.5625 0.6225 0.7425 0.8625 0.9725 0.9975 1.0125 1.0825 1.2625 +p243_new 0.0125 1.0225 1.0525 1.0925 1.1325 1.1725 1.2625 1.3625 1.4125 1.5125 1.6225 1.6625 1.7925 1.8625 2.0025 2.0925 2.1725 2.2625 2.4325 2.4725 2.5225 2.5825 2.6125 2.6425 2.7425 2.8025 2.9025 2.9525 3.0525 3.0825 3.2125 +p299_096 0.0125 0.7525 0.7925 0.8725 0.9125 0.9425 1.0325 1.0625 1.1925 1.2625 1.3225 1.3725 1.4125 1.5125 1.5425 1.6525 1.6925 1.7325 1.7625 1.8425 1.9625 2.0225 2.1825 2.3325 diff --git a/ernie-sat/prompt/dev/mfa_text b/ernie-sat/prompt/dev/mfa_text new file mode 100644 index 0000000000000000000000000000000000000000..68a33eb640acd6b1b9101e5bc79f50bd5c778272 --- /dev/null +++ b/ernie-sat/prompt/dev/mfa_text @@ -0,0 +1,3 @@ +Prompt_003_new DH IH1 S W AA1 Z N AA1 T DH AH0 SH OW1 F AO1 R M IY1 sp +p299_096 sp W IY1 AA1 R T R AY1 NG T UW1 AH0 S T AE1 B L IH0 SH AH0 D EY1 T sp +p243_new sp F AO1 R DH AE1 T R IY1 Z AH0 N sp K AH1 V ER0 SH UH1 D N AA1 T B IY1 G IH1 V AH0 N sp diff --git a/ernie-sat/prompt/dev/mfa_wav.scp b/ernie-sat/prompt/dev/mfa_wav.scp new file mode 100644 index 0000000000000000000000000000000000000000..ad5b9d9cae8ccce228f7a922227ea641fdc8fc0e --- /dev/null +++ b/ernie-sat/prompt/dev/mfa_wav.scp @@ -0,0 +1,3 @@ +Prompt_003_new ../../prompt_wav/this_was_not_the_show_for_me.wav +p243_new ../../prompt_wav/p243_313.wav +p299_096 ../../prompt_wav/p299_096.wav diff --git a/ernie-sat/prompt/dev/text b/ernie-sat/prompt/dev/text new file mode 100644 index 0000000000000000000000000000000000000000..026aa9ad4f7952e338dd2e8a1fedf68cfd709f23 --- /dev/null +++ b/ernie-sat/prompt/dev/text @@ -0,0 +1,3 @@ +Prompt_003_new This was not the show for me. +p243_new For that reason cover should not be given. +p299_096 We are trying to establish a date. 
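For readers unfamiliar with the prompt/dev layout above: the i-th token in `mfa_text` spans `[mfa_start[i], mfa_end[i]]` seconds of the corresponding wav. A small illustration with values copied (and truncated) from the `Prompt_003_new` sample entries:

```python
phones = "DH IH1 S W AA1 Z".split()
starts = [0.0125, 0.0425, 0.0925, 0.1825, 0.2125, 0.2425]
ends   = [0.0425, 0.0925, 0.1825, 0.2125, 0.2425, 0.3225]
for p, s, e in zip(phones, starts, ends):
    print(f"{p:>4}: {s:.4f}s -> {e:.4f}s")
```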
diff --git a/ernie-sat/prompt/dev/wav.scp b/ernie-sat/prompt/dev/wav.scp new file mode 100644 index 0000000000000000000000000000000000000000..c0f8a1c7fdc1c76838b68eeeca4c4a5c416f4150 --- /dev/null +++ b/ernie-sat/prompt/dev/wav.scp @@ -0,0 +1,3 @@ +Prompt_003_new ../../prompt_wav/this_was_not_the_show_for_me.wav +p299_096 ../../prompt_wav/p299_096.wav +p243_new ../../prompt_wav/p243_313.wav diff --git a/ernie-sat/prompt_wav/SSB03420111.wav b/ernie-sat/prompt_wav/SSB03420111.wav new file mode 100755 index 0000000000000000000000000000000000000000..fe44397f4d5c4648ad13b67e270f64947bd17a21 Binary files /dev/null and b/ernie-sat/prompt_wav/SSB03420111.wav differ diff --git a/ernie-sat/prompt_wav/SSB03540015.wav b/ernie-sat/prompt_wav/SSB03540015.wav new file mode 100755 index 0000000000000000000000000000000000000000..5f5daea5016acbdf60e637a977660ade2550b898 Binary files /dev/null and b/ernie-sat/prompt_wav/SSB03540015.wav differ diff --git a/ernie-sat/prompt_wav/SSB03540307.wav b/ernie-sat/prompt_wav/SSB03540307.wav new file mode 100755 index 0000000000000000000000000000000000000000..ff9a0c035ac60eeb8432464f10a73812d21596b7 Binary files /dev/null and b/ernie-sat/prompt_wav/SSB03540307.wav differ diff --git a/ernie-sat/prompt_wav/SSB03540428.wav b/ernie-sat/prompt_wav/SSB03540428.wav new file mode 100755 index 0000000000000000000000000000000000000000..3e90b005147bf7fa46fb3c33c2fd57fffd7ff890 Binary files /dev/null and b/ernie-sat/prompt_wav/SSB03540428.wav differ diff --git a/ernie-sat/prompt_wav/p243_313.wav b/ernie-sat/prompt_wav/p243_313.wav new file mode 100644 index 0000000000000000000000000000000000000000..a53362743c66035bb6ee101da4ce453ccf6ecbee Binary files /dev/null and b/ernie-sat/prompt_wav/p243_313.wav differ diff --git a/ernie-sat/prompt_wav/p299_096.wav b/ernie-sat/prompt_wav/p299_096.wav new file mode 100644 index 0000000000000000000000000000000000000000..1686ab30ef296f3d09a78f97ee31386a271dd0b5 Binary files /dev/null and b/ernie-sat/prompt_wav/p299_096.wav differ diff --git a/ernie-sat/prompt_wav/p323_083.wav b/ernie-sat/prompt_wav/p323_083.wav new file mode 100644 index 0000000000000000000000000000000000000000..ff1d4997a82bdc9d7839f49de118cd0f21d1beac Binary files /dev/null and b/ernie-sat/prompt_wav/p323_083.wav differ diff --git a/ernie-sat/prompt_wav/this_was_not_the_show_for_me.wav b/ernie-sat/prompt_wav/this_was_not_the_show_for_me.wav new file mode 100644 index 0000000000000000000000000000000000000000..3f4f109da8c33b01c0ea32c429033e5803a1ab89 Binary files /dev/null and b/ernie-sat/prompt_wav/this_was_not_the_show_for_me.wav differ diff --git a/ernie-sat/read_text.py b/ernie-sat/read_text.py new file mode 100644 index 0000000000000000000000000000000000000000..f140c31f24842dab636dd9701c9047ba2b0181dd --- /dev/null +++ b/ernie-sat/read_text.py @@ -0,0 +1,78 @@ +import logging +from pathlib import Path +from typing import Dict +from typing import List +from typing import Union + + + +def read_2column_text(path: Union[Path, str]) -> Dict[str, str]: + """Read a text file having 2 column as dict object. 
+ + Examples: + wav.scp: + key1 /some/path/a.wav + key2 /some/path/b.wav + + >>> read_2column_text('wav.scp') + {'key1': '/some/path/a.wav', 'key2': '/some/path/b.wav'} + + """ + + data = {} + with Path(path).open("r", encoding="utf-8") as f: + for linenum, line in enumerate(f, 1): + sps = line.rstrip().split(maxsplit=1) + if len(sps) == 1: + k, v = sps[0], "" + else: + k, v = sps + if k in data: + raise RuntimeError(f"{k} is duplicated ({path}:{linenum})") + data[k] = v + return data + + +def load_num_sequence_text( + path: Union[Path, str], loader_type: str = "csv_int" +) -> Dict[str, List[Union[float, int]]]: + """Read a text file indicating sequences of number + + Examples: + key1 1 2 3 + key2 34 5 6 + + >>> d = load_num_sequence_text('text') + >>> np.testing.assert_array_equal(d["key1"], np.array([1, 2, 3])) + """ + if loader_type == "text_int": + delimiter = " " + dtype = int + elif loader_type == "text_float": + delimiter = " " + dtype = float + elif loader_type == "csv_int": + delimiter = "," + dtype = int + elif loader_type == "csv_float": + delimiter = "," + dtype = float + else: + raise ValueError(f"Not supported loader_type={loader_type}") + + # path looks like: + # utta 1,0 + # uttb 3,4,5 + # -> return {'utta': np.ndarray([1, 0]), + # 'uttb': np.ndarray([3, 4, 5])} + d = read_2column_text(path) + + # Using for-loop instead of dict-comprehension for debuggability + retval = {} + for k, v in d.items(): + try: + retval[k] = [dtype(i) for i in v.split(delimiter)] + except TypeError: + logging.error(f'Error happened with path="{path}", id="{k}", value="{v}"') + raise + return retval diff --git a/ernie-sat/run_clone_en_to_zh.sh b/ernie-sat/run_clone_en_to_zh.sh new file mode 100644 index 0000000000000000000000000000000000000000..85b013c7612979e3eb60f318c73f398851663544 --- /dev/null +++ b/ernie-sat/run_clone_en_to_zh.sh @@ -0,0 +1,21 @@ +# en --> zh 的 clone +python sedit_inference_0520.py \ +--task_name cross-lingual_clone \ +--model_name paddle_checkpoint_ench \ +--uid Prompt_003_new \ +--new_str '今天天气很好' \ +--prefix ./prompt/dev/ \ +--clone_prefix ./prompt/dev_aishell3/ \ +--clone_uid SSB07510054 \ +--source_language english \ +--target_language chinese \ +--output_name task_cross_lingual_pred.wav \ +--voc pwgan_aishell3 \ +--voc_config download/pwg_aishell3_ckpt_0.5/default.yaml \ +--voc_ckpt download/pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \ +--voc_stat download/pwg_aishell3_ckpt_0.5/feats_stats.npy \ +--am fastspeech2_csmsc \ +--am_config download/fastspeech2_conformer_baker_ckpt_0.5/conformer.yaml \ +--am_ckpt download/fastspeech2_conformer_baker_ckpt_0.5/snapshot_iter_76000.pdz \ +--am_stat download/fastspeech2_conformer_baker_ckpt_0.5/speech_stats.npy \ +--phones_dict download/fastspeech2_conformer_baker_ckpt_0.5/phone_id_map.txt \ No newline at end of file diff --git a/ernie-sat/run_gen_en.sh b/ernie-sat/run_gen_en.sh new file mode 100644 index 0000000000000000000000000000000000000000..c89431c0653c01dc79a94fe0088c05754a2a521c --- /dev/null +++ b/ernie-sat/run_gen_en.sh @@ -0,0 +1,40 @@ +# 纯英文的语音合成 +# python sedit_inference_0518.py \ +# --task_name synthesize \ +# --model_name paddle_checkpoint_en \ +# --uid p323_083 \ +# --new_str 'I enjoy my life.' 
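A short usage sketch for the two readers defined in read_text.py above, run against the prompt/dev files shipped in this diff (paths assume the working directory is `ernie-sat/`, matching how `get_align_data` in sedit_inference_0520.py uses them):

```python
from read_text import read_2column_text, load_num_sequence_text

wavs = read_2column_text('prompt/dev/wav.scp')
starts = load_num_sequence_text('prompt/dev/mfa_start', loader_type='text_float')
print(wavs['p299_096'])         # ../../prompt_wav/p299_096.wav
print(starts['p299_096'][:3])   # [0.0125, 0.7525, 0.7925]
```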
\ +# --prefix ./prompt/dev/ \ +# --source_language english \ +# --target_language english \ +# --output_name pred.wav \ +# --voc pwgan_aishell3 \ +# --voc_config download/pwg_aishell3_ckpt_0.5/default.yaml \ +# --voc_ckpt download/pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \ +# --voc_stat download/pwg_aishell3_ckpt_0.5/feats_stats.npy \ +# --am fastspeech2_ljspeech \ +# --am_config download/fastspeech2_nosil_ljspeech_ckpt_0.5/default.yaml \ +# --am_ckpt download/fastspeech2_nosil_ljspeech_ckpt_0.5/snapshot_iter_100000.pdz \ +# --am_stat download/fastspeech2_nosil_ljspeech_ckpt_0.5/speech_stats.npy \ +# --phones_dict download/fastspeech2_nosil_ljspeech_ckpt_0.5/phone_id_map.txt + + +# 纯英文的语音合成 +python sedit_inference_0520.py \ +--task_name synthesize \ +--model_name paddle_checkpoint_en \ +--uid p299_096 \ +--new_str 'I enjoy my life.' \ +--prefix ./prompt/dev/ \ +--source_language english \ +--target_language english \ +--output_name task_synthesize_pred.wav \ +--voc pwgan_aishell3 \ +--voc_config download/pwg_aishell3_ckpt_0.5/default.yaml \ +--voc_ckpt download/pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \ +--voc_stat download/pwg_aishell3_ckpt_0.5/feats_stats.npy \ +--am fastspeech2_ljspeech \ +--am_config download/fastspeech2_nosil_ljspeech_ckpt_0.5/default.yaml \ +--am_ckpt download/fastspeech2_nosil_ljspeech_ckpt_0.5/snapshot_iter_100000.pdz \ +--am_stat download/fastspeech2_nosil_ljspeech_ckpt_0.5/speech_stats.npy \ +--phones_dict download/fastspeech2_nosil_ljspeech_ckpt_0.5/phone_id_map.txt \ No newline at end of file diff --git a/ernie-sat/run_sedit_en.sh b/ernie-sat/run_sedit_en.sh new file mode 100644 index 0000000000000000000000000000000000000000..c3d5a7457aa1fa38732fc75eb1823e985cd29b47 --- /dev/null +++ b/ernie-sat/run_sedit_en.sh @@ -0,0 +1,19 @@ +# 纯英文的语音编辑 +python sedit_inference_0520.py \ +--task_name edit \ +--model_name paddle_checkpoint_en \ +--uid p243_new \ +--new_str 'for that reason cover is impossible to be given.' 
\ +--prefix ./prompt/dev/ \ +--source_language english \ +--target_language english \ +--output_name task_edit_pred.wav \ +--voc pwgan_aishell3 \ +--voc_config download/pwg_aishell3_ckpt_0.5/default.yaml \ +--voc_ckpt download/pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \ +--voc_stat download/pwg_aishell3_ckpt_0.5/feats_stats.npy \ +--am fastspeech2_ljspeech \ +--am_config download/fastspeech2_nosil_ljspeech_ckpt_0.5/default.yaml \ +--am_ckpt download/fastspeech2_nosil_ljspeech_ckpt_0.5/snapshot_iter_100000.pdz \ +--am_stat download/fastspeech2_nosil_ljspeech_ckpt_0.5/speech_stats.npy \ +--phones_dict download/fastspeech2_nosil_ljspeech_ckpt_0.5/phone_id_map.txt \ No newline at end of file diff --git a/ernie-sat/sedit_arg_parser.py b/ernie-sat/sedit_arg_parser.py new file mode 100644 index 0000000000000000000000000000000000000000..d0c4296b594e17657da5e0ab8559fdd62e1b767b --- /dev/null +++ b/ernie-sat/sedit_arg_parser.py @@ -0,0 +1,93 @@ +import argparse +from paddlespeech.t2s.utils import str2bool + +def parse_args(): + # parse args and config and redirect to train_sp + parser = argparse.ArgumentParser( + description="Synthesize with acoustic model & vocoder") + # acoustic model + parser.add_argument( + '--am', + type=str, + default='fastspeech2_csmsc', + choices=[ + 'speedyspeech_csmsc', 'fastspeech2_csmsc', 'fastspeech2_ljspeech', + 'fastspeech2_aishell3', 'fastspeech2_vctk', 'tacotron2_csmsc', + 'tacotron2_ljspeech', 'tacotron2_aishell3' + ], + help='Choose acoustic model type of tts task.') + parser.add_argument( + '--am_config', + type=str, + default=None, + help='Config of acoustic model. Use deault config when it is None.') + parser.add_argument( + '--am_ckpt', + type=str, + default=None, + help='Checkpoint file of acoustic model.') + parser.add_argument( + "--am_stat", + type=str, + default=None, + help="mean and standard deviation used to normalize spectrogram when training acoustic model." + ) + parser.add_argument( + "--phones_dict", type=str, default=None, help="phone vocabulary file.") + parser.add_argument( + "--tones_dict", type=str, default=None, help="tone vocabulary file.") + parser.add_argument( + "--speaker_dict", type=str, default=None, help="speaker id map file.") + + # vocoder + parser.add_argument( + '--voc', + type=str, + default='pwgan_aishell3', + choices=[ + 'pwgan_csmsc', 'pwgan_ljspeech', 'pwgan_aishell3', 'pwgan_vctk', + 'mb_melgan_csmsc', 'wavernn_csmsc', 'hifigan_csmsc', + 'hifigan_ljspeech', 'hifigan_aishell3', 'hifigan_vctk', + 'style_melgan_csmsc' + ], + help='Choose vocoder type of tts task.') + parser.add_argument( + '--voc_config', + type=str, + default=None, + help='Config of voc. Use deault config when it is None.') + parser.add_argument( + '--voc_ckpt', type=str, default=None, help='Checkpoint file of voc.') + parser.add_argument( + "--voc_stat", + type=str, + default=None, + help="mean and standard deviation used to normalize spectrogram when training voc." + ) + # other + parser.add_argument( + '--lang', + type=str, + default='en', + help='Choose model language. 
zh or en') + + parser.add_argument( + "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") + parser.add_argument("--test_metadata", type=str, help="test metadata.") + parser.add_argument("--output_dir", type=str, help="output dir.") + + parser.add_argument("--model_name", type=str, help="model name") + parser.add_argument("--uid", type=str, help="uid") + parser.add_argument("--new_str", type=str, help="new string") + parser.add_argument("--prefix", type=str, help="prefix") + parser.add_argument("--clone_prefix", type=str, default=None, help="clone prefix") + parser.add_argument("--clone_uid", type=str, default=None, help="clone uid") + parser.add_argument("--source_language", type=str, help="source language") + parser.add_argument("--target_language", type=str, help="target language") + parser.add_argument("--output_name", type=str, help="output name") + parser.add_argument("--task_name", type=str, help="task name") + + + # pre + args = parser.parse_args() + return args \ No newline at end of file diff --git a/ernie-sat/sedit_inference_0520.py b/ernie-sat/sedit_inference_0520.py new file mode 100644 index 0000000000000000000000000000000000000000..09ca3e567a6d2925091bc619cc8c19410c637ea6 --- /dev/null +++ b/ernie-sat/sedit_inference_0520.py @@ -0,0 +1,1086 @@ +#!/usr/bin/env python3 + +"""Script to run the inference of text-to-speeech model.""" + +import os +os.environ["CUDA_VISIBLE_DEVICES"] = "3" + +from parallel_wavegan.utils import download_pretrained_model +from pathlib import Path +import paddle +import soundfile +import os +import math +import string +import numpy as np + +from espnet2.tasks.mlm import MLMTask +from read_text import read_2column_text,load_num_sequence_text +from util import sentence2phns,get_voc_out, evaluate_durations +import librosa +import random +from ipywidgets import widgets +import IPython.display as ipd +import soundfile as sf +import sys +import pickle +from model_paddle import build_model_from_file + +from sedit_arg_parser import parse_args +import argparse +from typing import Collection +from typing import Dict +from typing import List +from typing import Tuple +from typing import Union + +from paddlespeech.t2s.datasets.get_feats import LogMelFBank +from paddlespeech.t2s.modules.nets_utils import make_non_pad_mask + +duration_path_dict = { + "ljspeech":"/mnt/home/v_baihe/projects/espnet/egs2/ljspeech/tts1/exp/kan-bayashi/ljspeech_tts_train_conformer_fastspeech2_raw_phn_tacotron_g2p_en_no_space_train.loss.ave/train.loss.ave_5best.pth", + "vctk": "/mnt/home/v_baihe/projects/espnet/egs2/vctk/tts1/exp/kan-bayashi/vctk_tts_train_gst+xvector_conformer_fastspeech2_transformer_teacher_raw_phn_tacotron_g2p_en_no_space_train.loss.ave/train.loss.ave_5best.pth", + # "ljspeech":"/home/mnt2/zz/workspace/work/espnet_richard_infer/egs2/ljspeech/tts1/exp/kan-bayashi/ljspeech_tts_train_conformer_fastspeech2_raw_phn_tacotron_g2p_en_no_space_train.loss.ave/train.loss.ave_5best.pth", + # "vctk": "/home/mnt2/zz/workspace/work/espnet_richard_infer/egs2/vctk/tts1/exp/kan-bayashi/vctk_tts_train_gst+xvector_conformer_fastspeech2_transformer_teacher_raw_phn_tacotron_g2p_en_no_space_train.loss.ave/train.loss.ave_5best.pth", + "vctk_unseen":"/mnt/home/v_baihe/projects/espnet/egs2/vctk/tts1/exp/tts_train_fs2_raw_phn_tacotron_g2p_en_no_space/train.loss.ave_5best.pth", + 
"libritts":"/mnt/home/v_baihe/projects/espnet/egs2/libritts/tts1/exp/kan-bayashi/libritts_tts_train_gst+xvector_conformer_fastspeech2_transformer_teacher_raw_phn_tacotron_g2p_en_no_space_train.loss/train.loss.ave_5best.pth" +} + +random.seed(0) +np.random.seed(0) + + +def plot_mel_and_vocode_wav(uid, prefix, clone_uid, clone_prefix, source_language, target_language, model_name, wav_path,full_origin_str, old_str, new_str, vocoder,duration_preditor_path,sid=None, non_autoreg=True): + wav_org, input_feat, output_feat, old_span_boundary, new_span_boundary, fs, hop_length = get_mlm_output( + uid, + prefix, + clone_uid, + clone_prefix, + source_language, + target_language, + model_name, + wav_path, + old_str, + new_str, + duration_preditor_path, + use_teacher_forcing=non_autoreg, + sid=sid + ) + + masked_feat = output_feat[new_span_boundary[0]:new_span_boundary[1]].detach().float().cpu().numpy() + + if target_language == 'english': + output_feat_np = output_feat.detach().float().cpu().numpy() + replaced_wav_paddle_voc = get_voc_out(output_feat_np, target_language) + + elif target_language == 'chinese': + output_feat_np = output_feat.detach().float().cpu().numpy() + replaced_wav_only_mask_fst2_voc = get_voc_out(masked_feat, target_language) + + + old_time_boundary = [hop_length * x for x in old_span_boundary] + new_time_boundary = [hop_length * x for x in new_span_boundary] + + + if target_language == 'english': + wav_org_replaced_paddle_voc = np.concatenate([wav_org[:old_time_boundary[0]], replaced_wav_paddle_voc[new_time_boundary[0]:new_time_boundary[1]], wav_org[old_time_boundary[1]:]]) + + data_dict = {"origin":wav_org, + "output":wav_org_replaced_paddle_voc} + + elif target_language == 'chinese': + wav_org_replaced_only_mask_fst2_voc = np.concatenate([wav_org[:old_time_boundary[0]], replaced_wav_only_mask_fst2_voc, wav_org[old_time_boundary[1]:]]) + data_dict = {"origin":wav_org, + "output": wav_org_replaced_only_mask_fst2_voc,} + + return data_dict, old_span_boundary + + + +def load_vocoder(vocoder_tag="parallel_wavegan/libritts_parallel_wavegan.v1"): + vocoder_tag = vocoder_tag.replace("parallel_wavegan/", "") + vocoder_file = download_pretrained_model(vocoder_tag) + vocoder_config = Path(vocoder_file).parent / "config.yml" + + vocoder = TTSTask.build_vocoder_from_file( + vocoder_config, vocoder_file, None, 'cpu' + ) + return vocoder + +def load_model(model_name): + config_path='./pretrained_model/{}/config.yaml'.format(model_name) + model_path = './pretrained_model/{}/model.pdparams'.format(model_name) + + mlm_model, args = build_model_from_file(config_file=config_path, + model_file=model_path) + return mlm_model, args + + +def read_data(uid,prefix): + mfa_text = read_2column_text(prefix+'/text')[uid] + mfa_wav_path = read_2column_text(prefix+'/wav.scp')[uid] + if 'mnt' not in mfa_wav_path: + mfa_wav_path = prefix.split('dump')[0] + mfa_wav_path + return mfa_text, mfa_wav_path + +def get_align_data(uid,prefix): + mfa_path = prefix+"mfa_" + mfa_text = read_2column_text(mfa_path+'text')[uid] + mfa_start = load_num_sequence_text(mfa_path+'start',loader_type='text_float')[uid] + mfa_end = load_num_sequence_text(mfa_path+'end',loader_type='text_float')[uid] + mfa_wav_path = read_2column_text(mfa_path+'wav.scp')[uid] + return mfa_text, mfa_start, mfa_end, mfa_wav_path + + + +def get_fs2_model(model_name): + model, config = TTSTask.build_model_from_file(model_file=model_name) + processor = TTSTask.build_preprocess_fn(config, train=False) + return model, processor + +def 
get_masked_mel_boundary(mfa_start, mfa_end, fs, hop_length, span_tobe_replaced): + align_start=paddle.to_tensor(mfa_start).unsqueeze(0) + align_end =paddle.to_tensor(mfa_end).unsqueeze(0) + align_start = paddle.floor(fs*align_start/hop_length).int() + align_end = paddle.floor(fs*align_end/hop_length).int() + if span_tobe_replaced[0]>=len(mfa_start): + span_boundary = [align_end[0].tolist()[-1],align_end[0].tolist()[-1]] + else: + span_boundary=[align_start[0].tolist()[span_tobe_replaced[0]],align_end[0].tolist()[span_tobe_replaced[1]-1]] + return span_boundary + + +def get_mapping(phn_mapping="./phn_mapping.txt"): + zh_mapping = {} + with open(phn_mapping, "r") as f: + for line in f: + pd_phn = line.split(" ")[0] + if pd_phn not in zh_mapping.keys(): + zh_mapping[pd_phn] = " ".join(line.split()[1:]) + return zh_mapping + + +def gen_phns(zh_mapping, phns): + new_phns = [] + for x in phns: + if x in zh_mapping.keys(): + new_phns.extend(zh_mapping[x].split(" ")) + else: + new_phns.extend(['']) + return new_phns + +def get_phns_and_spans_paddle(uid, prefix, old_str, new_str, source_language, target_language): + zh_mapping = get_mapping() + old_str = old_str.strip() + new_str = new_str.strip() + words = [] + for pun in [',', '.', ':', ';', '!', '?', '"', '(', ')', '--', '---', u',', u'。', u':', u';', u'!', u'?', u'(', u')']: + old_str = old_str.replace(pun, ' ') + new_str = new_str.replace(pun, ' ') + + + append_new_str = (old_str == new_str[:len(old_str)]) + print("append_new_str: ", append_new_str) + old_phns, mfa_start, mfa_end = [], [], [] + mfa_text, mfa_start, mfa_end, mfa_wav_path = get_align_data(uid, prefix) + old_phns = mfa_text.split(" ") + + if append_new_str: + if source_language != target_language: + is_cross_lingual = True + else: + is_cross_lingual = False + + new_str_origin = new_str[:len(old_str)] + new_str_append = new_str[len(old_str):] + if is_cross_lingual: + if source_language == "english" and target_language == "chinese": + new_phns_origin = old_phns + new_phns_append, _ = sentence2phns(new_str_append, "zh") + + elif source_language=="chinese" and target_language == "english": + new_phns_origin = old_phns + new_phns_append, _ = sentence2phns(new_str_append, "en") + else: + assert target_language == "chinese" or target_language == "english", "cloning is not support for this language, please check it." + + else: + if source_language == target_language and target_language == "english": + new_phns_origin = old_phns + new_phns_append, _ = sentence2phns(new_str_append, "en") + + elif source_language == target_language and target_language == "chinese": + new_phns_origin = old_phns + new_phns_append, _ = sentence2phns(new_str_append, "zh") + else: + assert source_language == target_language, "source language is not same with target language..." + + if target_language == "chinese": + new_phns_append = gen_phns(zh_mapping, new_phns_append) + + new_phns = new_phns_origin + new_phns_append + + span_tobe_replaced = [len(old_phns),len(old_phns)] + span_tobe_added = [len(old_phns),len(new_phns)] + + else: + if source_language == target_language and target_language == "english": + new_phns, _ = sentence2phns(new_str, "en") + # 纯中文 + elif source_language == target_language and target_language == "chinese": + new_phns, _ = sentence2phns(new_str, "zh") + new_phns = gen_phns(zh_mapping, new_phns) + + + else: + assert source_language == target_language, "source language is not same with target language..." 
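A numeric sketch of `get_masked_mel_boundary` above (function and this file's paddle import assumed in scope): each MFA time stamp is turned into a mel-frame index via floor(fs * t / hop_length). The fs=24000 / hop_length=300 values are illustrative assumptions only:

```python
mfa_start = [0.0125, 0.2425]
mfa_end = [0.2425, 0.3225]
boundary = get_masked_mel_boundary(mfa_start, mfa_end, fs=24000, hop_length=300,
                                   span_tobe_replaced=[1, 2])
print(boundary)   # expected: [19, 25], i.e. floor(24000*0.2425/300) .. floor(24000*0.3225/300)
```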
+ + while(new_phns[-1] == 'sp'): + new_phns.pop() + + while(new_phns[0] == 'sp'): + new_phns.pop(0) + + span_tobe_replaced = [0,len(old_phns)-1] + span_tobe_added = [0,len(new_phns)-1] + new_phns_left = [] + left_index = 0 + sp_count = 0 + + # find the left different index + for idx, phn in enumerate(old_phns): + if phn == "sp": + sp_count += 1 + new_phns_left.append('sp') + else: + idx = idx - sp_count + if phn == new_phns[idx]: + left_index += 1 + new_phns_left.append(phn) + else: + span_tobe_replaced[0] = len(new_phns_left) + span_tobe_added[0] = len(new_phns_left) + break + + right_index = 0 + new_phns_middle = [] + new_phns_right = [] + sp_count = 0 + word2phns_max_index = len(old_phns) + new_word2phns_max_index = len(new_phns) + + for idx, phn in enumerate(old_phns[::-1]): + cur_idx = len(old_phns) - 1 - idx + if phn == "sp": + sp_count += 1 + new_phns_right = ['sp']+new_phns_right + else: + cur_idx = new_word2phns_max_index - (word2phns_max_index - cur_idx -sp_count) + if phn == new_phns[cur_idx]: + right_index -= 1 + new_phns_right = [phn] + new_phns_right + + else: + span_tobe_replaced[1] = len(old_phns) - len(new_phns_right) + new_phns_middle = new_phns[left_index:right_index] + span_tobe_added[1] = len(new_phns_left) + len(new_phns_middle) + if len(new_phns_middle) == 0: + span_tobe_added[1] = min(span_tobe_added[1]+1, len(new_phns)) + span_tobe_added[0] = max(0, span_tobe_added[0]-1) + span_tobe_replaced[0] = max(0, span_tobe_replaced[0]-1) + span_tobe_replaced[1] = min(span_tobe_replaced[1]+1, len(old_phns)) + break + + new_phns = new_phns_left+new_phns_middle+new_phns_right + + return mfa_start, mfa_end, old_phns, new_phns, span_tobe_replaced, span_tobe_added + + + + +def duration_adjust_factor(original_dur, pred_dur, phns): + length = 0 + accumulate = 0 + factor_list = [] + for ori,pred,phn in zip(original_dur, pred_dur,phns): + if pred==0 or phn=='sp': + continue + else: + factor_list.append(ori/pred) + factor_list = np.array(factor_list) + factor_list.sort() + if len(factor_list)<5: + return 1 + length = 2 + return np.average(factor_list[length:-length]) + + +def prepare_features_with_duration(uid, prefix, clone_uid, clone_prefix, source_language, target_language, mlm_model, old_str, new_str, wav_path,duration_preditor_path,sid=None, mask_reconstruct=False,duration_adjust=True,start_end_sp=False, train_args=None): + wav_org, rate = librosa.load(wav_path, sr=train_args.feats_extract_conf['fs']) + fs = train_args.feats_extract_conf['fs'] + hop_length = train_args.feats_extract_conf['hop_length'] + + mfa_start, mfa_end, old_phns, new_phns, span_tobe_replaced, span_tobe_added = get_phns_and_spans_paddle(uid, prefix, old_str, new_str, source_language, target_language) + + if start_end_sp: + if new_phns[-1]!='sp': + new_phns = new_phns+['sp'] + + + if target_language == "english": + old_durations = evaluate_durations(old_phns, target_language=target_language) + + elif target_language =="chinese": + if source_language == "english": + old_durations = evaluate_durations(old_phns, target_language=source_language) + elif source_language == "chinese": + old_durations = evaluate_durations(old_phns, target_language=source_language) + + else: + assert target_language == "chinese" or target_language == "english", "calculate duration_predict is not support for this language..." 
+
+
+
+    original_old_durations = [e-s for e,s in zip(mfa_end, mfa_start)]
+    if '[MASK]' in new_str:
+        new_phns = old_phns
+        span_tobe_added = span_tobe_replaced
+        d_factor_left = duration_adjust_factor(original_old_durations[:span_tobe_replaced[0]],old_durations[:span_tobe_replaced[0]], old_phns[:span_tobe_replaced[0]])
+        d_factor_right = duration_adjust_factor(original_old_durations[span_tobe_replaced[1]:],old_durations[span_tobe_replaced[1]:], old_phns[span_tobe_replaced[1]:])
+        d_factor = (d_factor_left+d_factor_right)/2
+        new_durations_adjusted = [d_factor*i for i in old_durations]
+    else:
+        if duration_adjust:
+            d_factor = duration_adjust_factor(original_old_durations,old_durations, old_phns)
+            d_factor_paddle = duration_adjust_factor(original_old_durations,old_durations, old_phns)
+            if target_language =="chinese":
+                d_factor = d_factor * 1.35
+        else:
+            d_factor = 1
+
+        if target_language == "english":
+            new_durations = evaluate_durations(new_phns, target_language=target_language)
+
+
+        elif target_language =="chinese":
+            new_durations = evaluate_durations(new_phns, target_language=target_language)
+
+        new_durations_adjusted = [d_factor*i for i in new_durations]
+
+    if span_tobe_replaced[0]>=len(mfa_start):
+        left_index = len(wav_org)
+        right_index = left_index
+    else:
+        left_index = int(np.floor(mfa_start[span_tobe_replaced[0]]*fs))
+        right_index = int(np.ceil(mfa_end[span_tobe_replaced[1]-1]*fs))
+    new_blank_wav = np.zeros((int(np.ceil(new_span_duration_sum*fs)),), dtype=wav_org.dtype)
+    new_wav_org = np.concatenate([wav_org[:left_index], new_blank_wav, wav_org[right_index:]])
+
+
+    # 4. get old and new mel span to be mask
+    old_span_boundary = get_masked_mel_boundary(mfa_start, mfa_end, fs, hop_length, span_tobe_replaced) # [92, 92]
+    new_span_boundary=get_masked_mel_boundary(new_mfa_start, new_mfa_end, fs, hop_length, span_tobe_added) # [92, 174]
+
+
+    return new_wav_org, new_phns, new_mfa_start, new_mfa_end, old_span_boundary, new_span_boundary
+
+def prepare_features(uid, prefix, clone_uid, clone_prefix, source_language, target_language, mlm_model,processor, wav_path, old_str,new_str,duration_preditor_path, sid=None,duration_adjust=True,start_end_sp=False,
+mask_reconstruct=False, train_args=None):
+    wav_org, phns_list, mfa_start, mfa_end, old_span_boundary, new_span_boundary = prepare_features_with_duration(uid, prefix, clone_uid, clone_prefix, source_language, target_language, mlm_model, old_str,
+    new_str, wav_path,duration_preditor_path,sid=sid,duration_adjust=duration_adjust,start_end_sp=start_end_sp,mask_reconstruct=mask_reconstruct, train_args = train_args)
+    speech = np.array(wav_org,dtype=np.float32)
+    align_start=np.array(mfa_start)
+    align_end =np.array(mfa_end)
+    token_to_id = {item: i for i, item in enumerate(train_args.token_list)}
+    text = np.array(list(map(lambda x: token_to_id.get(x, token_to_id['']), phns_list)))
+    print('unk id is', token_to_id[''])
+    # text = np.array(processor(uid='1', data={'text':" ".join(phns_list)})['text'])
+    span_boundary = np.array(new_span_boundary)
+    batch=[('1', {"speech":speech,"align_start":align_start,"align_end":align_end,"text":text,"span_boundary":span_boundary})]
+
+    return batch, old_span_boundary, new_span_boundary
+
+def decode_with_model(uid, prefix, clone_uid, clone_prefix, source_language, target_language, mlm_model, processor, collate_fn, wav_path, old_str, new_str,duration_preditor_path, sid=None, decoder=False,use_teacher_forcing=False,duration_adjust=True,start_end_sp=False, train_args=None):
+    # fs, hop_length = 
mlm_model.feats_extract.fs, mlm_model.feats_extract.hop_length + fs, hop_length = train_args.feats_extract_conf['fs'], train_args.feats_extract_conf['hop_length'] + + batch,old_span_boundary,new_span_boundary = prepare_features(uid,prefix, clone_uid, clone_prefix, source_language, target_language, mlm_model,processor,wav_path,old_str,new_str,duration_preditor_path, sid,duration_adjust=duration_adjust,start_end_sp=start_end_sp, train_args=train_args) + + feats = pickle.load(open('tmp/tmp_pkl.'+str(uid), 'rb')) + + # wav_len * 80 + # set_all_random_seed(9999) + if 'text_masked_position' in feats.keys(): + feats.pop('text_masked_position') + for k, v in feats.items(): + feats[k] = paddle.to_tensor(v) + rtn = mlm_model.inference(**feats,span_boundary=new_span_boundary,use_teacher_forcing=use_teacher_forcing) + output = rtn['feat_gen'] + if 0 in output[0].shape and 0 not in output[-1].shape: + output_feat = paddle.concat(output[1:-1]+[output[-1].squeeze()], axis=0).cpu() + elif 0 not in output[0].shape and 0 in output[-1].shape: + output_feat = paddle.concat([output[0].squeeze()]+output[1:-1], axis=0).cpu() + elif 0 in output[0].shape and 0 in output[-1].shape: + output_feat = paddle.concat(output[1:-1], axis=0).cpu() + else: + output_feat = paddle.concat([output[0].squeeze(0)]+ output[1:-1]+[output[-1].squeeze(0)], axis=0).cpu() + + + # wav_org, rate = soundfile.read( + # wav_path, always_2d=False) + wav_org, rate = librosa.load(wav_path, sr=train_args.feats_extract_conf['fs']) + origin_speech = paddle.to_tensor(np.array(wav_org,dtype=np.float32)).unsqueeze(0) + speech_lengths = paddle.to_tensor(len(wav_org)).unsqueeze(0) + # input_feat, feats_lengths = mlm_model.feats_extract(origin_speech, speech_lengths) + # return wav_org, input_feat.squeeze(), output_feat, old_span_boundary, new_span_boundary, fs, hop_length + return wav_org, None, output_feat, old_span_boundary, new_span_boundary, fs, hop_length + +class MLMCollateFn: + """Functor class of common_collate_fn()""" + + def __init__( + self, + feats_extract, + float_pad_value: Union[float, int] = 0.0, + int_pad_value: int = -32768, + not_sequence: Collection[str] = (), + mlm_prob: float=0.8, + mean_phn_span: int=8, + attention_window: int=0, + pad_speech: bool=False, + sega_emb: bool=False, + duration_collect: bool=False, + text_masking: bool=False + + ): + self.mlm_prob=mlm_prob + self.mean_phn_span=mean_phn_span + self.feats_extract = feats_extract + self.float_pad_value = float_pad_value + self.int_pad_value = int_pad_value + self.not_sequence = set(not_sequence) + self.attention_window=attention_window + self.pad_speech=pad_speech + self.sega_emb=sega_emb + self.duration_collect = duration_collect + self.text_masking = text_masking + + def __repr__(self): + return ( + f"{self.__class__}(float_pad_value={self.float_pad_value}, " + f"int_pad_value={self.float_pad_value})" + ) + + def __call__( + self, data: Collection[Tuple[str, Dict[str, np.ndarray]]] + ) -> Tuple[List[str], Dict[str, paddle.Tensor]]: + return mlm_collate_fn( + data, + float_pad_value=self.float_pad_value, + int_pad_value=self.int_pad_value, + not_sequence=self.not_sequence, + mlm_prob=self.mlm_prob, + mean_phn_span=self.mean_phn_span, + feats_extract=self.feats_extract, + attention_window=self.attention_window, + pad_speech=self.pad_speech, + sega_emb=self.sega_emb, + duration_collect=self.duration_collect, + text_masking=self.text_masking + ) + +def pad_list(xs, pad_value): + """Perform padding for the list of tensors. 
+ + Args: + xs (List): List of Tensors [(T_1, `*`), (T_2, `*`), ..., (T_B, `*`)]. + pad_value (float): Value for padding. + + Returns: + Tensor: Padded tensor (B, Tmax, `*`). + + Examples: + >>> x = [paddle.ones(4), paddle.ones(2), paddle.ones(1)] + >>> x + [tensor([1., 1., 1., 1.]), tensor([1., 1.]), tensor([1.])] + >>> pad_list(x, 0) + tensor([[1., 1., 1., 1.], + [1., 1., 0., 0.], + [1., 0., 0., 0.]]) + + """ + n_batch = len(xs) + max_len = max(paddle.shape(x)[0] for x in xs) + pad = paddle.full((n_batch, max_len), pad_value, dtype = xs[0].dtype) + + for i in range(n_batch): + pad[i, : paddle.shape(xs[i])[0]] = xs[i] + + return pad + +def pad_to_longformer_att_window(text, max_len, max_tlen,attention_window): + round = max_len % attention_window + if round != 0: + max_tlen += (attention_window - round) + n_batch = paddle.shape(text)[0] + text_pad = paddle.zeros((n_batch, max_tlen, *paddle.shape(text[0])[1:]), dtype=text.dtype) + for i in range(n_batch): + text_pad[i, : paddle.shape(text[i])[0]] = text[i] + else: + text_pad = text[:, : max_tlen] + return text_pad, max_tlen + +def make_pad_mask(lengths, xs=None, length_dim=-1): + print('inputs are:', lengths, xs, length_dim) + """Make mask tensor containing indices of padded part. + + Args: + lengths (LongTensor or List): Batch of lengths (B,). + xs (Tensor, optional): The reference tensor. + If set, masks will be the same shape as this tensor. + length_dim (int, optional): Dimension indicator of the above tensor. + See the example. + + Returns: + Tensor: Mask tensor containing indices of padded part. + + Examples: + With only lengths. + + >>> lengths = [5, 3, 2] + >>> make_non_pad_mask(lengths) + masks = [[0, 0, 0, 0 ,0], + [0, 0, 0, 1, 1], + [0, 0, 1, 1, 1]] + + With the reference tensor. + + >>> xs = paddle.zeros((3, 2, 4)) + >>> make_pad_mask(lengths, xs) + tensor([[[0, 0, 0, 0], + [0, 0, 0, 0]], + [[0, 0, 0, 1], + [0, 0, 0, 1]], + [[0, 0, 1, 1], + [0, 0, 1, 1]]], dtype=paddle.uint8) + >>> xs = paddle.zeros((3, 2, 6)) + >>> make_pad_mask(lengths, xs) + tensor([[[0, 0, 0, 0, 0, 1], + [0, 0, 0, 0, 0, 1]], + [[0, 0, 0, 1, 1, 1], + [0, 0, 0, 1, 1, 1]], + [[0, 0, 1, 1, 1, 1], + [0, 0, 1, 1, 1, 1]]], dtype=paddle.uint8) + + With the reference tensor and dimension indicator. 
+ + >>> xs = paddle.zeros((3, 6, 6)) + >>> make_pad_mask(lengths, xs, 1) + tensor([[[0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0], + [1, 1, 1, 1, 1, 1]], + [[0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0], + [1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1]], + [[0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0], + [1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1]]], dtype=paddle.uint8) + >>> make_pad_mask(lengths, xs, 2) + tensor([[[0, 0, 0, 0, 0, 1], + [0, 0, 0, 0, 0, 1], + [0, 0, 0, 0, 0, 1], + [0, 0, 0, 0, 0, 1], + [0, 0, 0, 0, 0, 1], + [0, 0, 0, 0, 0, 1]], + [[0, 0, 0, 1, 1, 1], + [0, 0, 0, 1, 1, 1], + [0, 0, 0, 1, 1, 1], + [0, 0, 0, 1, 1, 1], + [0, 0, 0, 1, 1, 1], + [0, 0, 0, 1, 1, 1]], + [[0, 0, 1, 1, 1, 1], + [0, 0, 1, 1, 1, 1], + [0, 0, 1, 1, 1, 1], + [0, 0, 1, 1, 1, 1], + [0, 0, 1, 1, 1, 1], + [0, 0, 1, 1, 1, 1]]], dtype=paddle.uint8) + + """ + if length_dim == 0: + raise ValueError("length_dim cannot be 0: {}".format(length_dim)) + + if not isinstance(lengths, list): + lengths = list(lengths) + print('lengths', lengths) + bs = int(len(lengths)) + if xs is None: + maxlen = int(max(lengths)) + else: + maxlen = paddle.shape(xs)[length_dim] + + seq_range = paddle.arange(0, maxlen, dtype=paddle.int64) + seq_range_expand = paddle.expand(paddle.unsqueeze(seq_range, 0), (bs, maxlen)) + seq_length_expand = paddle.unsqueeze(paddle.to_tensor(lengths), -1) + print('seq_length_expand', paddle.shape(seq_length_expand)) + print('seq_range_expand', paddle.shape(seq_range_expand)) + mask = seq_range_expand >= seq_length_expand + + if xs is not None: + assert paddle.shape(xs)[0] == bs, (paddle.shape(xs)[0], bs) + + if length_dim < 0: + length_dim = len(paddle.shape(xs)) + length_dim + # ind = (:, None, ..., None, :, , None, ..., None) + ind = tuple( + slice(None) if i in (0, length_dim) else None for i in range(len(paddle.shape(xs))) + ) + print('0:', paddle.shape(mask)) + print('1:', paddle.shape(mask[ind])) + print('2:', paddle.shape(xs)) + mask = paddle.expand(mask[ind], paddle.shape(xs)) + return mask + + +def make_non_pad_mask(lengths, xs=None, length_dim=-1): + """Make mask tensor containing indices of non-padded part. + + Args: + lengths (LongTensor or List): Batch of lengths (B,). + xs (Tensor, optional): The reference tensor. + If set, masks will be the same shape as this tensor. + length_dim (int, optional): Dimension indicator of the above tensor. + See the example. + + Returns: + ByteTensor: mask tensor containing indices of padded part. + + Examples: + With only lengths. + + >>> lengths = [5, 3, 2] + >>> make_non_pad_mask(lengths) + masks = [[1, 1, 1, 1 ,1], + [1, 1, 1, 0, 0], + [1, 1, 0, 0, 0]] + + With the reference tensor. + + >>> xs = paddle.zeros((3, 2, 4)) + >>> make_non_pad_mask(lengths, xs) + tensor([[[1, 1, 1, 1], + [1, 1, 1, 1]], + [[1, 1, 1, 0], + [1, 1, 1, 0]], + [[1, 1, 0, 0], + [1, 1, 0, 0]]], dtype=paddle.uint8) + >>> xs = paddle.zeros((3, 2, 6)) + >>> make_non_pad_mask(lengths, xs) + tensor([[[1, 1, 1, 1, 1, 0], + [1, 1, 1, 1, 1, 0]], + [[1, 1, 1, 0, 0, 0], + [1, 1, 1, 0, 0, 0]], + [[1, 1, 0, 0, 0, 0], + [1, 1, 0, 0, 0, 0]]], dtype=paddle.uint8) + + With the reference tensor and dimension indicator. 
+ + >>> xs = paddle.zeros((3, 6, 6)) + >>> make_non_pad_mask(lengths, xs, 1) + tensor([[[1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1], + [0, 0, 0, 0, 0, 0]], + [[1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1], + [0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0]], + [[1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1], + [0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0]]], dtype=paddle.uint8) + >>> make_non_pad_mask(lengths, xs, 2) + tensor([[[1, 1, 1, 1, 1, 0], + [1, 1, 1, 1, 1, 0], + [1, 1, 1, 1, 1, 0], + [1, 1, 1, 1, 1, 0], + [1, 1, 1, 1, 1, 0], + [1, 1, 1, 1, 1, 0]], + [[1, 1, 1, 0, 0, 0], + [1, 1, 1, 0, 0, 0], + [1, 1, 1, 0, 0, 0], + [1, 1, 1, 0, 0, 0], + [1, 1, 1, 0, 0, 0], + [1, 1, 1, 0, 0, 0]], + [[1, 1, 0, 0, 0, 0], + [1, 1, 0, 0, 0, 0], + [1, 1, 0, 0, 0, 0], + [1, 1, 0, 0, 0, 0], + [1, 1, 0, 0, 0, 0], + [1, 1, 0, 0, 0, 0]]], dtype=paddle.uint8) + + """ + return ~make_pad_mask(lengths, xs, length_dim) + +def phones_masking(xs_pad, src_mask, align_start, align_end, align_start_lengths, mlm_prob, mean_phn_span, span_boundary=None): + bz, sent_len, _ = paddle.shape(xs_pad) + mask_num_lower = math.ceil(sent_len * mlm_prob) + masked_position = np.zeros((bz, sent_len)) + y_masks = None + # y_masks = torch.ones(bz,sent_len,sent_len,device=xs_pad.device,dtype=xs_pad.dtype) + # tril_masks = torch.tril(y_masks) + if mlm_prob == 1.0: + masked_position += 1 + # y_masks = tril_masks + elif mean_phn_span == 0: + # only speech + length = sent_len + mean_phn_span = min(length*mlm_prob//3, 50) + masked_phn_indices = random_spans_noise_mask(length,mlm_prob, mean_phn_span).nonzero() + masked_position[:,masked_phn_indices]=1 + else: + for idx in range(bz): + if span_boundary is not None: + for s,e in zip(span_boundary[idx][::2], span_boundary[idx][1::2]): + masked_position[idx, s:e] = 1 + + # y_masks[idx, :, s:e] = tril_masks[idx, :, s:e] + # y_masks[idx, e:, s:e ] = 0 + else: + length = align_start_lengths[idx].item() + if length<2: + continue + masked_phn_indices = random_spans_noise_mask(length,mlm_prob, mean_phn_span).nonzero() + masked_start = align_start[idx][masked_phn_indices].tolist() + masked_end = align_end[idx][masked_phn_indices].tolist() + for s,e in zip(masked_start, masked_end): + masked_position[idx, s:e] = 1 + # y_masks[idx, :, s:e] = tril_masks[idx, :, s:e] + # y_masks[idx, e:, s:e ] = 0 + non_eos_mask = np.array(paddle.reshape(src_mask, paddle.shape(xs_pad)[:2]).float().cpu()) + masked_position = masked_position * non_eos_mask + # y_masks = src_mask & y_masks.bool() + + return paddle.cast(paddle.to_tensor(masked_position), paddle.bool), y_masks + +def get_segment_pos(speech_pad, text_pad, align_start, align_end, align_start_lengths,sega_emb): + bz, speech_len, _ = speech_pad.size() + text_segment_pos = paddle.zeros_like(text_pad) + speech_segment_pos = paddle.zeros((bz, speech_len),dtype=text_pad.dtype) + if not sega_emb: + return speech_segment_pos, text_segment_pos + for idx in range(bz): + align_length = align_start_lengths[idx].item() + for j in range(align_length): + s,e = align_start[idx][j].item(), align_end[idx][j].item() + speech_segment_pos[idx][s:e] = j+1 + text_segment_pos[idx][j] = j+1 + + return speech_segment_pos, text_segment_pos + +def mlm_collate_fn( + data: Collection[Tuple[str, Dict[str, np.ndarray]]], + float_pad_value: Union[float, int] = 0.0, + int_pad_value: int = -32768, + not_sequence: Collection[str] = (), + mlm_prob: float = 0.8, + 
mean_phn_span: int = 8, + feats_extract=None, + attention_window: int = 0, + pad_speech: bool=False, + sega_emb: bool=False, + duration_collect: bool=False, + text_masking: bool=False +) -> Tuple[List[str], Dict[str, paddle.Tensor]]: + """Concatenate ndarray-list to an array and convert to paddle.Tensor. + + Examples: + >>> from espnet2.samplers.constant_batch_sampler import ConstantBatchSampler, + >>> import espnet2.tasks.abs_task + >>> from espnet2.train.dataset import ESPnetDataset + >>> sampler = ConstantBatchSampler(...) + >>> dataset = ESPnetDataset(...) + >>> keys = next(iter(sampler) + >>> batch = [dataset[key] for key in keys] + >>> batch = common_collate_fn(batch) + >>> model(**batch) + + Note that the dict-keys of batch are propagated from + that of the dataset as they are. + + """ + uttids = [u for u, _ in data] + data = [d for _, d in data] + + assert all(set(data[0]) == set(d) for d in data), "dict-keys mismatching" + assert all( + not k.endswith("_lengths") for k in data[0] + ), f"*_lengths is reserved: {list(data[0])}" + + output = {} + for key in data[0]: + # NOTE(kamo): + # Each models, which accepts these values finally, are responsible + # to repaint the pad_value to the desired value for each tasks. + if data[0][key].dtype.kind == "i": + pad_value = int_pad_value + else: + pad_value = float_pad_value + + array_list = [d[key] for d in data] + + # Assume the first axis is length: + # tensor_list: Batch x (Length, ...) + tensor_list = [paddle.to_tensor(a) for a in array_list] + # tensor: (Batch, Length, ...) + tensor = pad_list(tensor_list, pad_value) + output[key] = tensor + + # lens: (Batch,) + if key not in not_sequence: + lens = paddle.to_tensor([d[key].shape[0] for d in data], dtype=paddle.long) + output[key + "_lengths"] = lens + + f = open('tmp_var.out', 'w') + for item in [round(item, 6) for item in output["speech"][0].tolist()]: + f.write(str(item)+'\n') + feats = feats_extract.get_log_mel_fbank(np.array(output["speech"][0])) + feats = paddle.to_tensor(feats) + print('out shape', paddle.shape(feats)) + feats_lengths = paddle.shape(feats)[0] + feats = paddle.unsqueeze(feats, 0) + batch_size = paddle.shape(feats)[0] + if 'text' not in output: + text=paddle.zeros_like(feats_lengths.unsqueeze(-1))-2 + text_lengths=paddle.zeros_like(feats_lengths)+1 + max_tlen=1 + align_start=paddle.zeros_like(text) + align_end=paddle.zeros_like(text) + align_start_lengths=paddle.zeros_like(feats_lengths) + align_end_lengths=paddle.zeros_like(feats_lengths) + sega_emb=False + mean_phn_span = 0 + mlm_prob = 0.15 + else: + text, text_lengths = output["text"], output["text_lengths"] + align_start, align_start_lengths, align_end, align_end_lengths = output["align_start"], output["align_start_lengths"], output["align_end"], output["align_end_lengths"] + align_start = paddle.floor(feats_extract.sr*align_start/feats_extract.hop_length).int() + align_end = paddle.floor(feats_extract.sr*align_end/feats_extract.hop_length).int() + max_tlen = max(text_lengths).item() + max_slen = max(feats_lengths).item() + speech_pad = feats[:, : max_slen] + if attention_window>0 and pad_speech: + speech_pad,max_slen = pad_to_longformer_att_window(speech_pad, max_slen, max_slen, attention_window) + max_len = max_slen + max_tlen + if attention_window>0: + text_pad, max_tlen = pad_to_longformer_att_window(text, max_len, max_tlen, attention_window) + else: + text_pad = text + text_mask = make_non_pad_mask(text_lengths.tolist(), text_pad, length_dim=1).unsqueeze(-2) + if attention_window>0: + text_mask = 
text_mask*2 + speech_mask = make_non_pad_mask(feats_lengths.tolist(), speech_pad[:,:,0], length_dim=1).unsqueeze(-2) + span_boundary = None + if 'span_boundary' in output.keys(): + span_boundary = output['span_boundary'] + + if text_masking: + masked_position, text_masked_position,_ = phones_text_masking( + speech_pad, + speech_mask, + text_pad, + text_mask, + align_start, + align_end, + align_start_lengths, + mlm_prob, + mean_phn_span, + span_boundary) + else: + text_masked_position = np.zeros(text_pad.size()) + masked_position, _ = phones_masking( + speech_pad, + speech_mask, + align_start, + align_end, + align_start_lengths, + mlm_prob, + mean_phn_span, + span_boundary) + + output_dict = {} + if duration_collect and 'text' in output: + reordered_index, speech_segment_pos,text_segment_pos, durations,feats_lengths = get_segment_pos_reduce_duration(speech_pad, text_pad, align_start, align_end, align_start_lengths,sega_emb, masked_position, feats_lengths) + speech_mask = make_non_pad_mask(feats_lengths.tolist(), speech_pad[:,:reordered_index.shape[1],0], length_dim=1).unsqueeze(-2) + output_dict['durations'] = durations + output_dict['reordered_index'] = reordered_index + else: + speech_segment_pos, text_segment_pos = get_segment_pos(speech_pad, text_pad, align_start, align_end, align_start_lengths,sega_emb) + output_dict['speech'] = speech_pad + output_dict['text'] = text_pad + output_dict['masked_position'] = masked_position + output_dict['text_masked_position'] = text_masked_position + output_dict['speech_mask'] = speech_mask + output_dict['text_mask'] = text_mask + output_dict['speech_segment_pos'] = speech_segment_pos + output_dict['text_segment_pos'] = text_segment_pos + # output_dict['y_masks'] = y_masks + output_dict['speech_lengths'] = output["speech_lengths"] + output_dict['text_lengths'] = text_lengths + output = (uttids, output_dict) + # assert check_return_type(output) + return output + +def build_collate_fn( + args: argparse.Namespace, train: bool, epoch=-1 + ): + + # assert check_argument_types() + # return CommonCollateFn(float_pad_value=0.0, int_pad_value=0) + feats_extract_class = LogMelFBank + args_dic = {} + print ('type is', type(args.feats_extract_conf)) + for k, v in args.feats_extract_conf.items(): + if k == 'fs': + args_dic['sr'] = v + else: + args_dic[k] = v + # feats_extract = feats_extract_class(**args.feats_extract_conf) + feats_extract = feats_extract_class(**args_dic) + + sega_emb = True if args.encoder_conf['input_layer'] == 'sega_mlm' else False + if args.encoder_conf['selfattention_layer_type'] == 'longformer': + attention_window = args.encoder_conf['attention_window'] + pad_speech = True if 'pre_speech_layer' in args.encoder_conf and args.encoder_conf['pre_speech_layer'] >0 else False + else: + attention_window=0 + pad_speech=False + if epoch==-1: + mlm_prob_factor = 1 + else: + mlm_probs = [1.0, 1.0, 0.7, 0.6, 0.5] + mlm_prob_factor = 0.8 #mlm_probs[epoch // 100] + if 'duration_predictor_layers' in args.model_conf.keys() and args.model_conf['duration_predictor_layers']>0: + duration_collect=True + else: + duration_collect=False + return MLMCollateFn(feats_extract, float_pad_value=0.0, int_pad_value=0, + mlm_prob=args.model_conf['mlm_prob']*mlm_prob_factor,mean_phn_span=args.model_conf['mean_phn_span'],attention_window=attention_window,pad_speech=pad_speech,sega_emb=sega_emb,duration_collect=duration_collect) + + +def get_mlm_output(uid, prefix, clone_uid, clone_prefix, source_language, target_language, model_name, wav_path, old_str, 
new_str,duration_preditor_path, sid=None, decoder=False,use_teacher_forcing=False, dynamic_eval=(0,0),duration_adjust=True,start_end_sp=False): + mlm_model,train_args = load_model(model_name) + mlm_model.eval() + # processor = MLMTask.build_preprocess_fn(train_args, False) + processor = None + collate_fn = MLMTask.build_collate_fn(train_args, False) + # collate_fn = build_collate_fn(train_args, False) + + return decode_with_model(uid,prefix, clone_uid, clone_prefix, source_language, target_language, mlm_model, processor, collate_fn, wav_path, old_str, new_str,duration_preditor_path, sid=sid, decoder=decoder, use_teacher_forcing=use_teacher_forcing, + duration_adjust=duration_adjust,start_end_sp=start_end_sp, train_args = train_args) + +def prompt_decoding_fn(model_name, wav_path,full_origin_str, old_str, new_str, vocoder,duration_preditor_path,sid=None, non_autoreg=True, dynamic_eval=(0,0),duration_adjust=True): + wav_org, input_feat, output_feat, old_span_boundary, new_span_boundary, fs, hop_length = get_mlm_output( + model_name, + wav_path, + old_str, + new_str, + duration_preditor_path, + use_teacher_forcing=non_autoreg, + sid=sid, + dynamic_eval=dynamic_eval, + duration_adjust=duration_adjust, + start_end_sp=False + ) + + replaced_wav = vocoder(output_feat).detach().float().data.cpu().numpy() + + old_time_boundary = [hop_length * x for x in old_span_boundary] + new_time_boundary = [hop_length * x for x in new_span_boundary] + new_wav = replaced_wav[new_time_boundary[0]:] + # "origin_vocoder":vocoder_origin_wav, + data_dict = {"prompt":wav_org, + "new_wav":new_wav} + return data_dict + +def test_vctk(uid, clone_uid, clone_prefix, source_language, target_language, vocoder, prefix='dump/raw/dev', model_name="conformer", old_str="",new_str="",prompt_decoding=False,dynamic_eval=(0,0), task_name = None): + + new_str = new_str.strip() + if clone_uid is not None and clone_prefix is not None: + if target_language == "english": + duration_preditor_path = duration_path_dict['ljspeech'] + elif target_language == "chinese": + duration_preditor_path = duration_path_dict['ljspeech'] + else: + assert target_language == "chinese" or target_language == "english", "duration_preditor_path is not support for this language..." 
+ + else: + duration_preditor_path = duration_path_dict['ljspeech'] + + spemd = None + full_origin_str,wav_path = read_data(uid, prefix) + + new_str = new_str if task_name == 'edit' else full_origin_str + new_str + print('new_str is ', new_str) + + if not old_str: + old_str = full_origin_str + if not new_str: + new_str = input("input the new string:") + if prompt_decoding: + print(new_str) + return prompt_decoding_fn(model_name, wav_path,full_origin_str, old_str, new_str,vocoder,duration_preditor_path,sid=spemd,dynamic_eval=dynamic_eval) + print(full_origin_str) + results_dict, old_span = plot_mel_and_vocode_wav(uid, prefix, clone_uid, clone_prefix, source_language, target_language, model_name, wav_path,full_origin_str, old_str, new_str,vocoder,duration_preditor_path,sid=spemd) + return results_dict + +if __name__ == "__main__": + args = parse_args() + print(args) + data_dict = test_vctk(args.uid, + args.clone_uid, + args.clone_prefix, + args.source_language, + args.target_language, + None, + args.prefix, + args.model_name, + new_str=args.new_str, + task_name=args.task_name) + sf.write('./wavs/%s' % args.output_name, data_dict['output'], samplerate=24000) + \ No newline at end of file diff --git a/ernie-sat/tmp/tmp_pkl.Prompt_003_new b/ernie-sat/tmp/tmp_pkl.Prompt_003_new new file mode 100644 index 0000000000000000000000000000000000000000..c7432dac7c6f3261a37ffc994f852dd7ea89fcd8 Binary files /dev/null and b/ernie-sat/tmp/tmp_pkl.Prompt_003_new differ diff --git a/ernie-sat/tmp/tmp_pkl.p243_new b/ernie-sat/tmp/tmp_pkl.p243_new new file mode 100644 index 0000000000000000000000000000000000000000..33075eb1cf1c5ab94bd85538409ed0c23cc90e0a Binary files /dev/null and b/ernie-sat/tmp/tmp_pkl.p243_new differ diff --git a/ernie-sat/tmp/tmp_pkl.p299_096 b/ernie-sat/tmp/tmp_pkl.p299_096 new file mode 100644 index 0000000000000000000000000000000000000000..c0553e427ffa71571d819f7b9879f062e7bb68fe Binary files /dev/null and b/ernie-sat/tmp/tmp_pkl.p299_096 differ diff --git a/ernie-sat/util.py b/ernie-sat/util.py new file mode 100644 index 0000000000000000000000000000000000000000..45f10e7da157b9e8646c8318ebf6a6f72473944c --- /dev/null +++ b/ernie-sat/util.py @@ -0,0 +1,239 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import os +import argparse +import logging +from pathlib import Path + +import jsonlines +import numpy as np +import paddle +import soundfile as sf +import yaml +from timer import timer +from yacs.config import CfgNode +from paddlespeech.s2t.utils.dynamic_import import dynamic_import + +from paddlespeech.t2s.exps.syn_utils import get_test_dataset +from paddlespeech.t2s.exps.syn_utils import get_voc_inference +from paddlespeech.t2s.utils import str2bool +from paddlespeech.t2s.frontend.zh_frontend import Frontend +from paddlespeech.t2s.models.fastspeech2 import FastSpeech2 +from paddlespeech.t2s.models.fastspeech2 import FastSpeech2Inference +from paddlespeech.t2s.modules.normalizer import ZScore +from yacs.config import CfgNode +# new add +import paddle.nn.functional as F +from paddlespeech.t2s.modules.nets_utils import make_pad_mask +from paddlespeech.t2s.exps.syn_utils import get_frontend + +from sedit_arg_parser import parse_args + +model_alias = { + # acoustic model + "speedyspeech": + "paddlespeech.t2s.models.speedyspeech:SpeedySpeech", + "speedyspeech_inference": + "paddlespeech.t2s.models.speedyspeech:SpeedySpeechInference", + "fastspeech2": + "paddlespeech.t2s.models.fastspeech2:FastSpeech2", + "fastspeech2_inference": + "paddlespeech.t2s.models.fastspeech2:FastSpeech2Inference", + "tacotron2": + "paddlespeech.t2s.models.tacotron2:Tacotron2", + "tacotron2_inference": + "paddlespeech.t2s.models.tacotron2:Tacotron2Inference", +} + + + + + +def get_voc_out(mel, target_language="chinese"): + # vocoder + args = parse_args() + + + assert target_language == "chinese" or target_language == "english", "In get_voc_out function, target_language is illegal..." + + print("current vocoder: ", args.voc) + with open(args.voc_config) as f: + voc_config = CfgNode(yaml.safe_load(f)) + + voc_inference = get_voc_inference(args, voc_config) + + mel = paddle.to_tensor(mel) + with paddle.no_grad(): + wav = voc_inference(mel) + print("shepe of wav (time x n_channels):%s"%wav.shape) # (31800,1) + return np.squeeze(wav) + +# dygraph +def get_am_inference(args, am_config): + with open(args.phones_dict, "r") as f: + phn_id = [line.strip().split() for line in f.readlines()] + vocab_size = len(phn_id) + # print("vocab_size:", vocab_size) + + tone_size = None + if 'tones_dict' in args and args.tones_dict: + with open(args.tones_dict, "r") as f: + tone_id = [line.strip().split() for line in f.readlines()] + tone_size = len(tone_id) + print("tone_size:", tone_size) + + spk_num = None + if 'speaker_dict' in args and args.speaker_dict: + with open(args.speaker_dict, 'rt') as f: + spk_id = [line.strip().split() for line in f.readlines()] + spk_num = len(spk_id) + print("spk_num:", spk_num) + + odim = am_config.n_mels + # model: {model_name}_{dataset} + am_name = args.am[:args.am.rindex('_')] + am_dataset = args.am[args.am.rindex('_') + 1:] + + am_class = dynamic_import(am_name, model_alias) + am_inference_class = dynamic_import(am_name + '_inference', model_alias) + + if am_name == 'fastspeech2': + am = am_class( + idim=vocab_size, odim=odim, spk_num=spk_num, **am_config["model"]) + elif am_name == 'speedyspeech': + am = am_class( + vocab_size=vocab_size, + tone_size=tone_size, + spk_num=spk_num, + **am_config["model"]) + elif am_name == 'tacotron2': + am = am_class(idim=vocab_size, odim=odim, **am_config["model"]) + + am.set_state_dict(paddle.load(args.am_ckpt)["main_params"]) + am.eval() + am_mu, am_std = np.load(args.am_stat) + am_mu = paddle.to_tensor(am_mu) + am_std = paddle.to_tensor(am_std) + am_normalizer = 
ZScore(am_mu, am_std) + am_inference = am_inference_class(am_normalizer, am) + am_inference.eval() + print("acoustic model done!") + return am, am_inference, am_name, am_dataset, phn_id + + +def evaluate_durations(phns, target_language="chinese", fs=24000, hop_length=300): + args = parse_args() + if args.ngpu == 0: + paddle.set_device("cpu") + elif args.ngpu > 0: + paddle.set_device("gpu") + else: + print("ngpu should >= 0 !") + + + + assert target_language == "chinese" or target_language == "english", "In evaluate_durations function, target_language is illegal..." + + # Init body. + with open(args.am_config) as f: + am_config = CfgNode(yaml.safe_load(f)) + # print("========Config========") + # print(am_config) + # print("---------------------") + # acoustic model + am, am_inference, am_name, am_dataset,phn_id = get_am_inference(args, am_config) + + torch_phns = phns + vocab_phones = {} + for tone, id in phn_id: + vocab_phones[tone] = int(id) + # print("vocab_phones: ", len(vocab_phones)) + vocab_size = len(vocab_phones) + phonemes = [ + phn if phn in vocab_phones else "sp" for phn in torch_phns + ] + phone_ids = [vocab_phones[item] for item in phonemes] + phone_ids_new = phone_ids + + phone_ids_new.append(vocab_size-1) + phone_ids_new = paddle.to_tensor(np.array(phone_ids_new, np.int64)) + normalized_mel, d_outs, p_outs, e_outs = am.inference(phone_ids_new, spk_id=None, spk_emb=None) + pre_d_outs = d_outs + phoneme_durations_new = pre_d_outs * hop_length / fs + phoneme_durations_new = phoneme_durations_new.tolist()[:-1] + + return phoneme_durations_new + + +def sentence2phns(sentence, target_language="en"): + args = parse_args() + if target_language == 'en': + args.lang='en' + args.phones_dict = "download/fastspeech2_nosil_ljspeech_ckpt_0.5/phone_id_map.txt" + elif target_language == 'zh': + args.lang='zh' + args.phones_dict="download/fastspeech2_conformer_baker_ckpt_0.5/phone_id_map.txt" + else: + print("target_language should in {'zh', 'en'}!") + + frontend = get_frontend(args) + merge_sentences = True + get_tone_ids = False + + if target_language == 'zh': + input_ids = frontend.get_input_ids( + sentence, + merge_sentences=merge_sentences, + get_tone_ids=get_tone_ids, + print_info=False) + phone_ids = input_ids["phone_ids"] + + phonemes = frontend.get_phonemes( + sentence, + merge_sentences=merge_sentences, + print_info=False) + + return phonemes[0], input_ids["phone_ids"][0] + + elif target_language == 'en': + phonemes = frontend.phoneticize(sentence) + input_ids = frontend.get_input_ids( + sentence, merge_sentences=merge_sentences) + phone_ids = input_ids["phone_ids"] + + phones_list = [] + vocab_phones = {} + punc = ":,;。?!“”‘’':,;.?!" 
+ with open(args.phones_dict, 'rt') as f: + phn_id = [line.strip().split() for line in f.readlines()] + for phn, id in phn_id: + vocab_phones[phn] = int(id) + + phones = phonemes[1:-1] + phones = [phn for phn in phones if not phn.isspace()] + # replace unk phone with sp + phones = [ + phn + if (phn in vocab_phones and phn not in punc) else "sp" + for phn in phones + ] + phones_list.append(phones) + return phones_list[0], input_ids["phone_ids"][0] + + else: + print("lang should in {'zh', 'en'}!") + + + + diff --git a/ernie-sat/wavs/ori.wav b/ernie-sat/wavs/ori.wav new file mode 100644 index 0000000000000000000000000000000000000000..d50fcb59b8c4828138648e57c4dad168b874b4ca Binary files /dev/null and b/ernie-sat/wavs/ori.wav differ diff --git a/ernie-sat/wavs/pred.wav b/ernie-sat/wavs/pred.wav new file mode 100644 index 0000000000000000000000000000000000000000..0210a420898d81f5d90068a8691572c30df25cb3 Binary files /dev/null and b/ernie-sat/wavs/pred.wav differ diff --git a/ernie-sat/wavs/pred_en_edit_paddle_voc.wav b/ernie-sat/wavs/pred_en_edit_paddle_voc.wav new file mode 100644 index 0000000000000000000000000000000000000000..8a05b71046ee44808bdc0228277610f686ef81ca Binary files /dev/null and b/ernie-sat/wavs/pred_en_edit_paddle_voc.wav differ diff --git a/ernie-sat/wavs/pred_zh.wav b/ernie-sat/wavs/pred_zh.wav new file mode 100644 index 0000000000000000000000000000000000000000..124258b94eab1c608a7f95575c72d701894ab5b3 Binary files /dev/null and b/ernie-sat/wavs/pred_zh.wav differ diff --git a/ernie-sat/wavs/pred_zh_fst2_voc.wav b/ernie-sat/wavs/pred_zh_fst2_voc.wav new file mode 100644 index 0000000000000000000000000000000000000000..57ce66e5a4f9a44800292202f1b27c08d72c1b99 Binary files /dev/null and b/ernie-sat/wavs/pred_zh_fst2_voc.wav differ diff --git a/ernie-sat/wavs/task_cross_lingual_pred.wav b/ernie-sat/wavs/task_cross_lingual_pred.wav new file mode 100644 index 0000000000000000000000000000000000000000..cffebaf81c7bb95d3ad65272826efc11fd04fb01 Binary files /dev/null and b/ernie-sat/wavs/task_cross_lingual_pred.wav differ diff --git a/ernie-sat/wavs/task_edit_pred.wav b/ernie-sat/wavs/task_edit_pred.wav new file mode 100644 index 0000000000000000000000000000000000000000..6bfda0fa42584cce5690ac86da095384200647d4 Binary files /dev/null and b/ernie-sat/wavs/task_edit_pred.wav differ diff --git a/ernie-sat/wavs/task_synthesize_pred.wav b/ernie-sat/wavs/task_synthesize_pred.wav new file mode 100644 index 0000000000000000000000000000000000000000..ce1379919274e5a8cc113d09e73ed869db0bdd56 Binary files /dev/null and b/ernie-sat/wavs/task_synthesize_pred.wav differ
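The span arithmetic this patch relies on in model_paddle.py (MFA phone alignments in seconds converted to mel-frame boundaries via floor(fs * t / hop_length), and FastSpeech2-predicted durations rescaled by the trimmed-mean factor from duration_adjust_factor) can be exercised in isolation. The snippet below is a minimal, self-contained sketch of that logic; it uses only numpy, assumes the fs=24000 / hop_length=300 defaults from evaluate_durations in util.py, and the phone labels and durations are made-up illustration values, not data from this patch.

```python
# Sketch of the span/duration arithmetic used in model_paddle.py.
# All inputs below are made-up illustration values, not data from this patch.
import numpy as np

fs = 24000        # sample rate assumed by the pretrained models (util.py default)
hop_length = 300  # mel hop size (util.py default)

def sec_to_frame(t):
    # same conversion as get_masked_mel_boundary: floor(fs * t / hop_length)
    return int(np.floor(fs * t / hop_length))

def duration_adjust_factor(original_dur, pred_dur, phns):
    # trimmed mean of original/predicted duration ratios, skipping silences
    # and zero predictions, mirroring duration_adjust_factor in model_paddle.py
    ratios = [o / p for o, p, ph in zip(original_dur, pred_dur, phns)
              if p != 0 and ph != 'sp']
    ratios = np.sort(np.array(ratios))
    if len(ratios) < 5:
        return 1.0
    return float(np.average(ratios[2:-2]))

# made-up MFA alignment for six phones (start/end in seconds)
mfa_start = [0.00, 0.10, 0.22, 0.30, 0.42, 0.55]
mfa_end   = [0.10, 0.22, 0.30, 0.42, 0.55, 0.70]
phns      = ['HH', 'AH', 'L', 'OW', 'W', 'ER']
span_tobe_replaced = [2, 4]   # regenerate phones 2..3

# mel-frame span handed to the MLM model as the masked region
span_boundary = [sec_to_frame(mfa_start[span_tobe_replaced[0]]),
                 sec_to_frame(mfa_end[span_tobe_replaced[1] - 1])]
print("masked mel-frame span:", span_boundary)  # [17, 33]

# rescale predicted durations to the prompt speaker's tempo
original_dur  = [e - s for s, e in zip(mfa_start, mfa_end)]
predicted_dur = [0.08, 0.10, 0.10, 0.10, 0.10, 0.12]
print("duration adjust factor:",
      duration_adjust_factor(original_dur, predicted_dur, phns))  # ~1.23
```

With these illustrative inputs the sketch reports a masked mel-frame span of [17, 33] and an adjustment factor of roughly 1.23, the trimmed mean of the middle duration ratios.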