diff --git a/README.md b/README.md index 66feb0982025fce8caf819fddca27b0d81598d7a..2f9d992895309f28ccfabc5d0bf83dfa94aaa443 100644 --- a/README.md +++ b/README.md @@ -154,7 +154,7 @@ If you want to try more functions like training and tuning, please see [Speech-t ## Model List -PaddleSpeech supports a series of most popular models, summarized in [released models](./docs/source/released_models.md) with available pretrained models. +PaddleSpeech supports a series of most popular models, summarized in [released models](./docs/source/released_model.md) with available pretrained models. Speech-to-Text module contains *Acoustic Model* and *Language Model*, with the following details: @@ -344,4 +344,4 @@ year={2021} PaddleSpeech is provided under the [Apache-2.0 License](./LICENSE). -PaddleSpeech depends on a lot of open source repositories. See [references](./docs/source/reference.md) for more information. \ No newline at end of file +PaddleSpeech depends on a lot of open source repositories. See [references](./docs/source/reference.md) for more information. diff --git a/demos/style_fs2/style_syn.py b/demos/style_fs2/style_syn.py index db15b7ef3a6c9baa8bdbf92ba3803c170a134932..5b8ce35139aea0edb084cd3b1d33b702b27d2628 100644 --- a/demos/style_fs2/style_syn.py +++ b/demos/style_fs2/style_syn.py @@ -13,7 +13,6 @@ # limitations under the License. import argparse from pathlib import Path -from typing import Union import numpy as np import paddle @@ -23,129 +22,12 @@ from yacs.config import CfgNode from paddlespeech.t2s.frontend.zh_frontend import Frontend from paddlespeech.t2s.models.fastspeech2 import FastSpeech2 -from paddlespeech.t2s.models.fastspeech2 import FastSpeech2Inference +from paddlespeech.t2s.models.fastspeech2 import StyleFastSpeech2Inference from paddlespeech.t2s.models.parallel_wavegan import PWGGenerator from paddlespeech.t2s.models.parallel_wavegan import PWGInference from paddlespeech.t2s.modules.normalizer import ZScore -class StyleFastSpeech2Inference(FastSpeech2Inference): - def __init__(self, normalizer, model, pitch_stats_path, energy_stats_path): - super().__init__(normalizer, model) - pitch_mean, pitch_std = np.load(pitch_stats_path) - self.pitch_mean = paddle.to_tensor(pitch_mean) - self.pitch_std = paddle.to_tensor(pitch_std) - energy_mean, energy_std = np.load(energy_stats_path) - self.energy_mean = paddle.to_tensor(energy_mean) - self.energy_std = paddle.to_tensor(energy_std) - - def denorm(self, data, mean, std): - return data * std + mean - - def norm(self, data, mean, std): - return (data - mean) / std - - def forward(self, - text: paddle.Tensor, - durations: Union[paddle.Tensor, np.ndarray]=None, - durations_scale: Union[int, float]=None, - durations_bias: Union[int, float]=None, - pitch: Union[paddle.Tensor, np.ndarray]=None, - pitch_scale: Union[int, float]=None, - pitch_bias: Union[int, float]=None, - energy: Union[paddle.Tensor, np.ndarray]=None, - energy_scale: Union[int, float]=None, - energy_bias: Union[int, float]=None, - robot: bool=False): - """ - Parameters - ---------- - text : Tensor(int64) - Input sequence of characters (T,). - speech : Tensor, optional - Feature sequence to extract style (N, idim). 
- durations : paddle.Tensor/np.ndarray, optional (int64) - Groundtruth of duration (T,), this will overwrite the set of durations_scale and durations_bias - durations_scale: int/float, optional - durations_bias: int/float, optional - pitch : paddle.Tensor/np.ndarray, optional - Groundtruth of token-averaged pitch (T, 1), this will overwrite the set of pitch_scale and pitch_bias - pitch_scale: int/float, optional - In denormed HZ domain. - pitch_bias: int/float, optional - In denormed HZ domain. - energy : paddle.Tensor/np.ndarray, optional - Groundtruth of token-averaged energy (T, 1), this will overwrite the set of energy_scale and energy_bias - energy_scale: int/float, optional - In denormed domain. - energy_bias: int/float, optional - In denormed domain. - robot : bool, optional - Weather output robot style - Returns - ---------- - Tensor - Output sequence of features (L, odim). - """ - normalized_mel, d_outs, p_outs, e_outs = self.acoustic_model.inference( - text, durations=None, pitch=None, energy=None) - # priority: groundtruth > scale/bias > previous output - # set durations - if isinstance(durations, np.ndarray): - durations = paddle.to_tensor(durations) - elif isinstance(durations, paddle.Tensor): - durations = durations - elif durations_scale or durations_bias: - durations_scale = durations_scale if durations_scale is not None else 1 - durations_bias = durations_bias if durations_bias is not None else 0 - durations = durations_scale * d_outs + durations_bias - else: - durations = d_outs - - if robot: - # set normed pitch to zeros have the same effect with set denormd ones to mean - pitch = paddle.zeros(p_outs.shape) - - # set pitch, can overwrite robot set - if isinstance(pitch, np.ndarray): - pitch = paddle.to_tensor(pitch) - elif isinstance(pitch, paddle.Tensor): - pitch = pitch - elif pitch_scale or pitch_bias: - pitch_scale = pitch_scale if pitch_scale is not None else 1 - pitch_bias = pitch_bias if pitch_bias is not None else 0 - p_Hz = paddle.exp( - self.denorm(p_outs, self.pitch_mean, self.pitch_std)) - p_HZ = pitch_scale * p_Hz + pitch_bias - pitch = self.norm(paddle.log(p_HZ), self.pitch_mean, self.pitch_std) - else: - pitch = p_outs - - # set energy - if isinstance(energy, np.ndarray): - energy = paddle.to_tensor(energy) - elif isinstance(energy, paddle.Tensor): - energy = energy - elif energy_scale or energy_bias: - energy_scale = energy_scale if energy_scale is not None else 1 - energy_bias = energy_bias if energy_bias is not None else 0 - e_dnorm = self.denorm(e_outs, self.energy_mean, self.energy_std) - e_dnorm = energy_scale * e_dnorm + energy_bias - energy = self.norm(e_dnorm, self.energy_mean, self.energy_std) - else: - energy = e_outs - - normalized_mel, d_outs, p_outs, e_outs = self.acoustic_model.inference( - text, - durations=durations, - pitch=pitch, - energy=energy, - use_teacher_forcing=True) - - logmel = self.normalizer.inverse(normalized_mel) - return logmel - - def evaluate(args, fastspeech2_config, pwg_config): # construct dataset for evaluation diff --git a/docs/source/index.rst b/docs/source/index.rst index 53e5d15df5baaf307a3d0c24fce608af0d34a5e2..ea2599abe49e06c2b652488d073bda45a3a3b80e 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -23,7 +23,7 @@ Contents .. toctree:: :maxdepth: 1 - :caption: Speech-To-Text + :caption: Speech-to-Text asr/models_introduction asr/data_preparation @@ -33,7 +33,7 @@ Contents .. 
toctree:: :maxdepth: 1 - :caption: Text-To-Speech + :caption: Text-to-Speech tts/basic_usage tts/advanced_usage diff --git a/docs/source/install.md b/docs/source/install.md index 0700a1667831bee9b303d5590388a5c9a49c0446..d68b990d2a6e3c4b9096808d7712c7792144960e 100644 --- a/docs/source/install.md +++ b/docs/source/install.md @@ -16,6 +16,22 @@ cd DeepSpeech pip install -e . ``` +For users who only need the basic functions of paddlespeech, installing with conda is recommended. +You can go to [miniconda](https://docs.conda.io/en/latest/miniconda.html) to select a version and install it yourself, or you can use the script below to install the latest miniconda version. + +```bash +pushd tools +bash extras/install_miniconda.sh +popd +bash +``` + +After installing conda, run setup.sh to complete the installation. +```bash +bash setup.sh +``` + + ## Setup (Other Platform) - Make sure these libraries or tools in [dependencies](./dependencies.md) installed. More information please see: `setup.py `and ` tools/Makefile`. diff --git a/docs/source/introduction.md b/docs/source/introduction.md index e7dd2892afe37c1391c04ca2bc9a410ea7754756..e3fc8b9ea9e1c2d9b6d80e8ea6edb1c6dbbf1385 100644 --- a/docs/source/introduction.md +++ b/docs/source/introduction.md @@ -1,11 +1,11 @@ # PaddleSpeech ## What is PaddleSpeech? -PaddleSpeech is an open-source toolkit on PaddlePaddle platform for two critical tasks in Speech - Speech-To-Text (Automatic Speech Recognition, ASR) and Text-To-Speech Synthesis (TTS), with modules involving state-of-art and influential models. +PaddleSpeech is an open-source toolkit on the PaddlePaddle platform for two critical tasks in Speech - Speech-to-Text (Automatic Speech Recognition, ASR) and Text-to-Speech Synthesis (TTS), with modules involving state-of-the-art and influential models. ## What can PaddleSpeech do? -### Speech-To-Text +### Speech-to-Text PaddleSpeech ASR mainly consists of components below: - Implementation of models and commonly used neural network layers. - Dataset abstraction and common data preprocessing pipelines. @@ -29,9 +29,9 @@ PaddleSpeech ASR provides you with a complete ASR pipeline, including: - attention decoding (used in Transformer and Conformer) - attention rescoring (used in Transformer and Conformer) -Speech-To-Text helps you training the ASR model very simply. +Speech-to-Text helps you train ASR models with simple commands. -### Text-To-Speech +### Text-to-Speech TTS mainly consists of components below: - Implementation of models and commonly used neural network layers. - Dataset abstraction and common data preprocessing pipelines. @@ -53,4 +53,4 @@ PaddleSpeech TTS provides you with a complete TTS pipeline, including: - Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis - GE2E -Text-To-Speech helps you to train TTS models with simple commands. +Text-to-Speech helps you train TTS models with simple commands.
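A quick sanity check after the conda-based setup above (an editor's suggestion rather than part of the added docs; `paddle.utils.run_check()` is PaddlePaddle's built-in installation check) could be:

```python
# confirm that PaddlePaddle is functional and that the paddlespeech package is importable
import paddle
import paddlespeech  # installed by `pip install -e .` / setup.sh

paddle.utils.run_check()
print(paddle.__version__)
```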
diff --git a/docs/source/released_model.md b/docs/source/released_model.md index bb03689c7e3f1712af5f4d0d47c328206765d770..a7c6a036b455410cc7d88947ef6c99d7b867924c 100644 --- a/docs/source/released_model.md +++ b/docs/source/released_model.md @@ -1,7 +1,7 @@ # Released Models -## Speech-To-Text Models +## Speech-to-Text Models ### Acoustic Model Released in paddle 2.X Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER | Hours of speech :-------------:| :------------:| :-----: | -----: | :----------------- |:--------- | :---------- | :--------- @@ -27,7 +27,7 @@ Language Model | Training Data | Token-based | Size | Descriptions [Mandarin LM Small](https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm) | Baidu Internal Corpus | Char-based | 2.8 GB | Pruned with 0 1 2 4 4;
About 0.13 billion n-grams;
'probing' binary with default settings [Mandarin LM Large](https://deepspeech.bj.bcebos.com/zh_lm/zhidao_giga.klm) | Baidu Internal Corpus | Char-based | 70.4 GB | No Pruning;
About 3.7 billion n-grams;
'probing' binary with default settings -## Text-To-Speech Models +## Text-to-Speech Models ### Acoustic Models Model Type | Dataset| Example Link | Pretrained Models|Static Models|Siize(static) :-------------:| :------------:| :-----: | :-----:| :-----:| :-----: diff --git a/examples/csmsc/voc3/conf/finetune.yaml b/examples/csmsc/voc3/conf/finetune.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e02f3e220147e4ca78fffc1e564efa4c968c9089 --- /dev/null +++ b/examples/csmsc/voc3/conf/finetune.yaml @@ -0,0 +1,139 @@ +# This is the hyperparameter configuration file for MelGAN. +# Please make sure this is adjusted for the CSMSC dataset. If you want to +# apply to the other dataset, you might need to carefully change some parameters. +# This configuration requires ~ 8GB memory and will finish within 7 days on Titan V. + +# This configuration is based on full-band MelGAN but the hop size and sampling +# rate is different from the paper (16kHz vs 24kHz). The number of iteraions +# is not shown in the paper so currently we train 1M iterations (not sure enough +# to converge). The optimizer setting is based on @dathudeptrai advice. +# https://github.com/kan-bayashi/ParallelWaveGAN/issues/143#issuecomment-632539906 + +########################################################### +# FEATURE EXTRACTION SETTING # +########################################################### +fs: 24000 # Sampling rate. +n_fft: 2048 # FFT size. (in samples) +n_shift: 300 # Hop size. (in samples) +win_length: 1200 # Window length. (in samples) + # If set to null, it will be the same as fft_size. +window: "hann" # Window function. +n_mels: 80 # Number of mel basis. +fmin: 80 # Minimum freq in mel basis calculation. (Hz) +fmax: 7600 # Maximum frequency in mel basis calculation. (Hz) + +########################################################### +# GENERATOR NETWORK ARCHITECTURE SETTING # +########################################################### +generator_params: + in_channels: 80 # Number of input channels. + out_channels: 4 # Number of output channels. + kernel_size: 7 # Kernel size of initial and final conv layers. + channels: 384 # Initial number of channels for conv layers. + upsample_scales: [5, 5, 3] # List of Upsampling scales. + stack_kernel_size: 3 # Kernel size of dilated conv layers in residual stack. + stacks: 4 # Number of stacks in a single residual stack module. + use_weight_norm: True # Whether to use weight normalization. + use_causal_conv: False # Whether to use causal convolution. + use_final_nonlinear_activation: True + + +########################################################### +# DISCRIMINATOR NETWORK ARCHITECTURE SETTING # +########################################################### +discriminator_params: + in_channels: 1 # Number of input channels. + out_channels: 1 # Number of output channels. + scales: 3 # Number of multi-scales. + downsample_pooling: "AvgPool1D" # Pooling type for the input downsampling. + downsample_pooling_params: # Parameters of the above pooling function. + kernel_size: 4 + stride: 2 + padding: 1 + exclusive: True + kernel_sizes: [5, 3] # List of kernel size. + channels: 16 # Number of channels of the initial conv layer. + max_downsample_channels: 512 # Maximum number of channels of downsampling layers. + downsample_scales: [4, 4, 4] # List of downsampling scales. + nonlinear_activation: "LeakyReLU" # Nonlinear activation function. + nonlinear_activation_params: # Parameters of nonlinear activation function. 
+ negative_slope: 0.2 + use_weight_norm: True # Whether to use weight norm. + + +########################################################### +# STFT LOSS SETTING # +########################################################### +use_stft_loss: true +stft_loss_params: + fft_sizes: [1024, 2048, 512] # List of FFT size for STFT-based loss. + hop_sizes: [120, 240, 50] # List of hop size for STFT-based loss + win_lengths: [600, 1200, 240] # List of window length for STFT-based loss. + window: "hann" # Window function for STFT-based loss +use_subband_stft_loss: true +subband_stft_loss_params: + fft_sizes: [384, 683, 171] # List of FFT size for STFT-based loss. + hop_sizes: [30, 60, 10] # List of hop size for STFT-based loss + win_lengths: [150, 300, 60] # List of window length for STFT-based loss. + window: "hann" # Window function for STFT-based loss + +########################################################### +# ADVERSARIAL LOSS SETTING # +########################################################### +use_feat_match_loss: false # Whether to use feature matching loss. +lambda_adv: 2.5 # Loss balancing coefficient for adversarial loss. + +########################################################### +# DATA LOADER SETTING # +########################################################### +batch_size: 64 # Batch size. +batch_max_steps: 16200 # Length of each audio in batch. Make sure dividable by hop_size. +num_workers: 2 # Number of workers in DataLoader. + +########################################################### +# OPTIMIZER & SCHEDULER SETTING # +########################################################### +generator_optimizer_params: + epsilon: 1.0e-7 # Generator's epsilon. + weight_decay: 0.0 # Generator's weight decay coefficient. + +generator_grad_norm: -1 # Generator's gradient norm. +generator_scheduler_params: + learning_rate: 1.0e-3 # Generator's learning rate. + gamma: 0.5 # Generator's scheduler gamma. + milestones: # At each milestone, lr will be multiplied by gamma. + - 100000 + - 200000 + - 300000 + - 400000 + - 500000 + - 600000 +discriminator_optimizer_params: + epsilon: 1.0e-7 # Discriminator's epsilon. + weight_decay: 0.0 # Discriminator's weight decay coefficient. + +discriminator_grad_norm: -1 # Discriminator's gradient norm. +discriminator_scheduler_params: + learning_rate: 1.0e-3 # Discriminator's learning rate. + gamma: 0.5 # Discriminator's scheduler gamma. + milestones: # At each milestone, lr will be multiplied by gamma. + - 100000 + - 200000 + - 300000 + - 400000 + - 500000 + - 600000 + +########################################################### +# INTERVAL SETTING # +########################################################### +discriminator_train_start_steps: 200000 # Number of steps to start to train discriminator. +train_max_steps: 1200000 # Number of training steps. +save_interval_steps: 1000 # Interval steps to save checkpoint. +eval_interval_steps: 1000 # Interval steps to evaluate the network. 
+ +########################################################### +# OTHER SETTING # +########################################################### +num_snapshots: 10 # max number of snapshots to keep while training +seed: 42 # random seed for paddle, random, and np.random \ No newline at end of file diff --git a/examples/csmsc/voc3/finetune.sh b/examples/csmsc/voc3/finetune.sh new file mode 100755 index 0000000000000000000000000000000000000000..42e5a39796acdb46a8104876d8c4086b61866fdb --- /dev/null +++ b/examples/csmsc/voc3/finetune.sh @@ -0,0 +1,63 @@ +#!/bin/bash + +source path.sh + +gpus=0 +stage=0 +stop_stage=100 + +source ${MAIN_ROOT}/utils/parse_options.sh || exit 1 + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + python3 ${MAIN_ROOT}/paddlespeech/t2s/exps/fastspeech2/gen_gta_mel.py \ + --fastspeech2-config=fastspeech2_nosil_baker_ckpt_0.4/default.yaml \ + --fastspeech2-checkpoint=fastspeech2_nosil_baker_ckpt_0.4/snapshot_iter_76000.pdz \ + --fastspeech2-stat=fastspeech2_nosil_baker_ckpt_0.4/speech_stats.npy \ + --dur-file=durations.txt \ + --output-dir=dump_finetune \ + --phones-dict=fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt +fi + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + python3 local/link_wav.py \ + --old-dump-dir=dump \ + --dump-dir=dump_finetune + +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + # get features' stats(mean and std) + echo "Get features' stats ..." + cp dump/train/feats_stats.npy dump_finetune/train/ +fi + +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + # normalize, dev and test should use train's stats + echo "Normalize ..." + + python3 ${BIN_DIR}/../normalize.py \ + --metadata=dump_finetune/train/raw/metadata.jsonl \ + --dumpdir=dump_finetune/train/norm \ + --stats=dump_finetune/train/feats_stats.npy + python3 ${BIN_DIR}/../normalize.py \ + --metadata=dump_finetune/dev/raw/metadata.jsonl \ + --dumpdir=dump_finetune/dev/norm \ + --stats=dump_finetune/train/feats_stats.npy + + python3 ${BIN_DIR}/../normalize.py \ + --metadata=dump_finetune/test/raw/metadata.jsonl \ + --dumpdir=dump_finetune/test/norm \ + --stats=dump_finetune/train/feats_stats.npy +fi + +if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then + CUDA_VISIBLE_DEVICES=${gpus} \ + FLAGS_cudnn_exhaustive_search=true \ + FLAGS_conv_workspace_size_limit=4000 \ + python ${BIN_DIR}/train.py \ + --train-metadata=dump_finetune/train/norm/metadata.jsonl \ + --dev-metadata=dump_finetune/dev/norm/metadata.jsonl \ + --config=conf/finetune.yaml \ + --output-dir=exp/finetune \ + --ngpu=1 +fi \ No newline at end of file diff --git a/examples/csmsc/voc3/local/link_wav.py b/examples/csmsc/voc3/local/link_wav.py new file mode 100644 index 0000000000000000000000000000000000000000..c81e0d4b83320665b98720d09a940e9de6dc63cd --- /dev/null +++ b/examples/csmsc/voc3/local/link_wav.py @@ -0,0 +1,85 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import argparse +import os +from operator import itemgetter +from pathlib import Path + +import jsonlines +import numpy as np + + +def main(): + # parse config and args + parser = argparse.ArgumentParser( + description="Preprocess audio and then extract features .") + + parser.add_argument( + "--old-dump-dir", + default=None, + type=str, + help="directory to dump feature files.") + parser.add_argument( + "--dump-dir", + type=str, + required=True, + help="directory to finetune dump feature files.") + args = parser.parse_args() + + old_dump_dir = Path(args.old_dump_dir).expanduser() + old_dump_dir = old_dump_dir.resolve() + dump_dir = Path(args.dump_dir).expanduser() + # use absolute path + dump_dir = dump_dir.resolve() + dump_dir.mkdir(parents=True, exist_ok=True) + + assert old_dump_dir.is_dir() + assert dump_dir.is_dir() + + for sub in ["train", "dev", "test"]: + # 把 old_dump_dir 里面的 *-wave.npy 软连接到 dump_dir 的对应位置 + output_dir = dump_dir / sub + output_dir.mkdir(parents=True, exist_ok=True) + results = [] + for name in os.listdir(output_dir / "raw"): + # 003918_feats.npy + utt_id = name.split("_")[0] + mel_path = output_dir / ("raw/" + name) + gen_mel = np.load(mel_path) + wave_name = utt_id + "_wave.npy" + wav = np.load(old_dump_dir / sub / ("raw/" + wave_name)) + os.symlink(old_dump_dir / sub / ("raw/" + wave_name), + output_dir / ("raw/" + wave_name)) + num_sample = wav.shape[0] + num_frames = gen_mel.shape[0] + wav_path = output_dir / ("raw/" + wave_name) + + record = { + "utt_id": utt_id, + "num_samples": num_sample, + "num_frames": num_frames, + "feats": str(mel_path), + "wave": str(wav_path), + } + results.append(record) + + results.sort(key=itemgetter("utt_id")) + + with jsonlines.open(output_dir / "raw/metadata.jsonl", 'w') as writer: + for item in results: + writer.write(item) + + +if __name__ == "__main__": + main() diff --git a/examples/librispeech/s2/conf/transformer.yaml b/examples/librispeech/s2/conf/transformer.yaml index b2babca7bcde8b6e68480a5dddeb4950d49159ec..d77329f50843e270b52750ef5dcc2e9429bd8617 100644 --- a/examples/librispeech/s2/conf/transformer.yaml +++ b/examples/librispeech/s2/conf/transformer.yaml @@ -1,36 +1,6 @@ # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test-clean - -collator: - vocab_filepath: data/lang_char/train_960_unigram5000_units.txt - unit_type: spm - spm_model_prefix: data/lang_char/train_960_unigram5000 - feat_dim: 83 - stride_ms: 10.0 - window_ms: 25.0 - sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs - batch_size: 30 - maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced - maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced - minibatches: 0 # for debug - batch_count: auto - batch_bins: 0 - batch_frames_in: 0 - batch_frames_out: 0 - batch_frames_inout: 0 - augmentation_config: conf/augmentation.json - num_workers: 0 - subsampling_factor: 1 - num_encs: 1 - - # network architecture model: - cmvn_file: - cmvn_file_type: "json" # encoder related encoder: transformer encoder_conf: @@ -63,6 +33,33 @@ model: lsm_weight: 0.1 # label smoothing option length_normalized_loss: false +data: + train_manifest: data/manifest.train + dev_manifest: data/manifest.dev + test_manifest: data/manifest.test-clean + +collator: + vocab_filepath: data/lang_char/train_960_unigram5000_units.txt + unit_type: spm + spm_model_prefix: 
data/lang_char/train_960_unigram5000 + feat_dim: 83 + stride_ms: 10.0 + window_ms: 25.0 + sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs + batch_size: 30 + maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced + maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced + minibatches: 0 # for debug + batch_count: auto + batch_bins: 0 + batch_frames_in: 0 + batch_frames_out: 0 + batch_frames_inout: 0 + augmentation_config: conf/augmentation.json + num_workers: 0 + subsampling_factor: 1 + num_encs: 1 + training: n_epoch: 120 diff --git a/paddlespeech/t2s/datasets/vocoder_batch_fn.py b/paddlespeech/t2s/datasets/vocoder_batch_fn.py index 2de4fb124e1e50a7c5481366c8cec675922d8a98..2e4f740fb6f048dd91a1e799d598261a88a6419c 100644 --- a/paddlespeech/t2s/datasets/vocoder_batch_fn.py +++ b/paddlespeech/t2s/datasets/vocoder_batch_fn.py @@ -110,10 +110,10 @@ class Clip(object): if len(x) < c.shape[0] * self.hop_size: x = np.pad(x, (0, c.shape[0] * self.hop_size - len(x)), mode="edge") elif len(x) > c.shape[0] * self.hop_size: - print( - f"wave length: ({len(x)}), mel length: ({c.shape[0]}), hop size: ({self.hop_size })" - ) - x = x[:c.shape[1] * self.hop_size] + # print( + # f"wave length: ({len(x)}), mel length: ({c.shape[0]}), hop size: ({self.hop_size })" + # ) + x = x[:c.shape[0] * self.hop_size] # check the legnth is valid assert len(x) == c.shape[ diff --git a/paddlespeech/t2s/exps/fastspeech2/gen_gta_mel.py b/paddlespeech/t2s/exps/fastspeech2/gen_gta_mel.py new file mode 100644 index 0000000000000000000000000000000000000000..8a9ef370c0f2916149b62c50d2425e969b49a5cb --- /dev/null +++ b/paddlespeech/t2s/exps/fastspeech2/gen_gta_mel.py @@ -0,0 +1,167 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# generate mels using durations.txt +# for mb melgan finetune +# 长度和原本的 mel 不一致怎么办? 
+import argparse +from pathlib import Path + +import numpy as np +import paddle +import yaml +from yacs.config import CfgNode + +from paddlespeech.t2s.datasets.preprocess_utils import get_phn_dur +from paddlespeech.t2s.datasets.preprocess_utils import merge_silence +from paddlespeech.t2s.models.fastspeech2 import FastSpeech2 +from paddlespeech.t2s.models.fastspeech2 import StyleFastSpeech2Inference +from paddlespeech.t2s.modules.normalizer import ZScore + + +def evaluate(args, fastspeech2_config): + + # construct dataset for evaluation + with open(args.phones_dict, "r") as f: + phn_id = [line.strip().split() for line in f.readlines()] + vocab_size = len(phn_id) + print("vocab_size:", vocab_size) + + phone_dict = {} + for phn, id in phn_id: + phone_dict[phn] = int(id) + + odim = fastspeech2_config.n_mels + model = FastSpeech2( + idim=vocab_size, odim=odim, **fastspeech2_config["model"]) + + model.set_state_dict( + paddle.load(args.fastspeech2_checkpoint)["main_params"]) + model.eval() + + stat = np.load(args.fastspeech2_stat) + mu, std = stat + mu = paddle.to_tensor(mu) + std = paddle.to_tensor(std) + fastspeech2_normalizer = ZScore(mu, std) + + fastspeech2_inference = StyleFastSpeech2Inference(fastspeech2_normalizer, + model) + fastspeech2_inference.eval() + + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + sentences, speaker_set = get_phn_dur(args.dur_file) + merge_silence(sentences) + + for i, utt_id in enumerate(sentences): + phones = sentences[utt_id][0] + durations = sentences[utt_id][1] + speaker = sentences[utt_id][2] + # trim the leading and trailing sil + if args.cut_sil: + if phones[0] == "sil" and len(durations) > 1: + durations = durations[1:] + phones = phones[1:] + if phones[-1] == 'sil' and len(durations) > 1: + durations = durations[:-1] + phones = phones[:-1] + # sentences[utt_id][0] = phones + # sentences[utt_id][1] = durations + + phone_ids = [phone_dict[phn] for phn in phones] + phone_ids = paddle.to_tensor(np.array(phone_ids)) + durations = paddle.to_tensor(np.array(durations)) + # the generated mel may differ from the ground truth by 1 or 2 frames, but batch_fn will fix it + # split data into 3 sections + if args.dataset == "baker": + num_train = 9800 + num_dev = 100 + if i in range(0, num_train): + sub_output_dir = output_dir / ("train/raw") + elif i in range(num_train, num_train + num_dev): + sub_output_dir = output_dir / ("dev/raw") + else: + sub_output_dir = output_dir / ("test/raw") + sub_output_dir.mkdir(parents=True, exist_ok=True) + with paddle.no_grad(): + mel = fastspeech2_inference(phone_ids, durations=durations) + np.save(sub_output_dir / (utt_id + "_feats.npy"), mel) + + +def main(): + # parse args and config + parser = argparse.ArgumentParser( + description="Generate ground-truth aligned (GTA) mels with FastSpeech2.") + parser.add_argument( + "--dataset", + default="baker", + type=str, + help="name of dataset, should be in {baker, ljspeech, vctk} now") + parser.add_argument( + "--fastspeech2-config", type=str, help="fastspeech2 config file.") + parser.add_argument( + "--fastspeech2-checkpoint", + type=str, + help="fastspeech2 checkpoint to load.") + parser.add_argument( + "--fastspeech2-stat", + type=str, + help="mean and standard deviation used to normalize spectrogram when training fastspeech2."
+ ) + + parser.add_argument( + "--phones-dict", + type=str, + default="phone_id_map.txt", + help="phone vocabulary file.") + + parser.add_argument( + "--dur-file", default=None, type=str, help="path to durations.txt.") + parser.add_argument("--output-dir", type=str, help="output dir.") + parser.add_argument( + "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") + parser.add_argument("--verbose", type=int, default=1, help="verbose.") + + def str2bool(str): + return True if str.lower() == 'true' else False + + parser.add_argument( + "--cut-sil", + type=str2bool, + default=True, + help="whether cut sil in the edge of audio") + + args = parser.parse_args() + + if args.ngpu == 0: + paddle.set_device("cpu") + elif args.ngpu > 0: + paddle.set_device("gpu") + else: + print("ngpu should >= 0 !") + + with open(args.fastspeech2_config) as f: + fastspeech2_config = CfgNode(yaml.safe_load(f)) + + print("========Args========") + print(yaml.safe_dump(vars(args))) + print("========Config========") + print(fastspeech2_config) + + evaluate(args, fastspeech2_config) + + +if __name__ == "__main__": + main() diff --git a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py index 2202d156e85731919c3b44fe8c498230dded740c..2e52c10376e41a6cca508b7be2a6dea1ca4a2943 100644 --- a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py +++ b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py @@ -16,23 +16,25 @@ from typing import Dict from typing import Sequence from typing import Tuple +from typing import Union +import numpy as np import paddle import paddle.nn.functional as F from paddle import nn from typeguard import check_argument_types -from paddlespeech.t2s.modules.fastspeech2_predictor.duration_predictor import DurationPredictor -from paddlespeech.t2s.modules.fastspeech2_predictor.duration_predictor import DurationPredictorLoss -from paddlespeech.t2s.modules.fastspeech2_predictor.length_regulator import LengthRegulator -from paddlespeech.t2s.modules.fastspeech2_predictor.variance_predictor import VariancePredictor -from paddlespeech.t2s.modules.fastspeech2_transformer.embedding import PositionalEncoding -from paddlespeech.t2s.modules.fastspeech2_transformer.embedding import ScaledPositionalEncoding -from paddlespeech.t2s.modules.fastspeech2_transformer.encoder import Encoder as TransformerEncoder from paddlespeech.t2s.modules.nets_utils import initialize from paddlespeech.t2s.modules.nets_utils import make_non_pad_mask from paddlespeech.t2s.modules.nets_utils import make_pad_mask +from paddlespeech.t2s.modules.predictor.duration_predictor import DurationPredictor +from paddlespeech.t2s.modules.predictor.duration_predictor import DurationPredictorLoss +from paddlespeech.t2s.modules.predictor.length_regulator import LengthRegulator +from paddlespeech.t2s.modules.predictor.variance_predictor import VariancePredictor from paddlespeech.t2s.modules.tacotron2.decoder import Postnet +from paddlespeech.t2s.modules.transformer.embedding import PositionalEncoding +from paddlespeech.t2s.modules.transformer.embedding import ScaledPositionalEncoding +from paddlespeech.t2s.modules.transformer.encoder import Encoder as TransformerEncoder class FastSpeech2(nn.Layer): @@ -687,6 +689,129 @@ class FastSpeech2Inference(nn.Layer): return logmel +class StyleFastSpeech2Inference(FastSpeech2Inference): + def __init__(self, + normalizer, + model, + pitch_stats_path=None, + energy_stats_path=None): + super().__init__(normalizer, model) + if pitch_stats_path: + pitch_mean, pitch_std 
= np.load(pitch_stats_path) + self.pitch_mean = paddle.to_tensor(pitch_mean) + self.pitch_std = paddle.to_tensor(pitch_std) + if energy_stats_path: + energy_mean, energy_std = np.load(energy_stats_path) + self.energy_mean = paddle.to_tensor(energy_mean) + self.energy_std = paddle.to_tensor(energy_std) + + def denorm(self, data, mean, std): + return data * std + mean + + def norm(self, data, mean, std): + return (data - mean) / std + + def forward(self, + text: paddle.Tensor, + durations: Union[paddle.Tensor, np.ndarray]=None, + durations_scale: Union[int, float]=None, + durations_bias: Union[int, float]=None, + pitch: Union[paddle.Tensor, np.ndarray]=None, + pitch_scale: Union[int, float]=None, + pitch_bias: Union[int, float]=None, + energy: Union[paddle.Tensor, np.ndarray]=None, + energy_scale: Union[int, float]=None, + energy_bias: Union[int, float]=None, + robot: bool=False): + """ + Parameters + ---------- + text : Tensor(int64) + Input sequence of characters (T,). + speech : Tensor, optional + Feature sequence to extract style (N, idim). + durations : paddle.Tensor/np.ndarray, optional (int64) + Groundtruth of duration (T,), this will overwrite the set of durations_scale and durations_bias + durations_scale: int/float, optional + durations_bias: int/float, optional + pitch : paddle.Tensor/np.ndarray, optional + Groundtruth of token-averaged pitch (T, 1), this will overwrite the set of pitch_scale and pitch_bias + pitch_scale: int/float, optional + In denormed HZ domain. + pitch_bias: int/float, optional + In denormed HZ domain. + energy : paddle.Tensor/np.ndarray, optional + Groundtruth of token-averaged energy (T, 1), this will overwrite the set of energy_scale and energy_bias + energy_scale: int/float, optional + In denormed domain. + energy_bias: int/float, optional + In denormed domain. + robot : bool, optional + Weather output robot style + Returns + ---------- + Tensor + Output sequence of features (L, odim). 
+ """ + normalized_mel, d_outs, p_outs, e_outs = self.acoustic_model.inference( + text, durations=None, pitch=None, energy=None) + # priority: groundtruth > scale/bias > previous output + # set durations + if isinstance(durations, np.ndarray): + durations = paddle.to_tensor(durations) + elif isinstance(durations, paddle.Tensor): + durations = durations + elif durations_scale or durations_bias: + durations_scale = durations_scale if durations_scale is not None else 1 + durations_bias = durations_bias if durations_bias is not None else 0 + durations = durations_scale * d_outs + durations_bias + else: + durations = d_outs + + if robot: + # set normed pitch to zeros have the same effect with set denormd ones to mean + pitch = paddle.zeros(p_outs.shape) + + # set pitch, can overwrite robot set + if isinstance(pitch, np.ndarray): + pitch = paddle.to_tensor(pitch) + elif isinstance(pitch, paddle.Tensor): + pitch = pitch + elif pitch_scale or pitch_bias: + pitch_scale = pitch_scale if pitch_scale is not None else 1 + pitch_bias = pitch_bias if pitch_bias is not None else 0 + p_Hz = paddle.exp( + self.denorm(p_outs, self.pitch_mean, self.pitch_std)) + p_HZ = pitch_scale * p_Hz + pitch_bias + pitch = self.norm(paddle.log(p_HZ), self.pitch_mean, self.pitch_std) + else: + pitch = p_outs + + # set energy + if isinstance(energy, np.ndarray): + energy = paddle.to_tensor(energy) + elif isinstance(energy, paddle.Tensor): + energy = energy + elif energy_scale or energy_bias: + energy_scale = energy_scale if energy_scale is not None else 1 + energy_bias = energy_bias if energy_bias is not None else 0 + e_dnorm = self.denorm(e_outs, self.energy_mean, self.energy_std) + e_dnorm = energy_scale * e_dnorm + energy_bias + energy = self.norm(e_dnorm, self.energy_mean, self.energy_std) + else: + energy = e_outs + + normalized_mel, d_outs, p_outs, e_outs = self.acoustic_model.inference( + text, + durations=durations, + pitch=pitch, + energy=energy, + use_teacher_forcing=True) + + logmel = self.normalizer.inverse(normalized_mel) + return logmel + + class FastSpeech2Loss(nn.Layer): """Loss function module for FastSpeech2.""" diff --git a/paddlespeech/t2s/models/transformer_tts/transformer_tts.py b/paddlespeech/t2s/models/transformer_tts/transformer_tts.py index 97233c766eb0d71ad51a17d7d33c87a8bc2f4da3..03620fd4e0b50ad827508deb8efba4459ea4bf05 100644 --- a/paddlespeech/t2s/models/transformer_tts/transformer_tts.py +++ b/paddlespeech/t2s/models/transformer_tts/transformer_tts.py @@ -23,12 +23,6 @@ import paddle.nn.functional as F from paddle import nn from typeguard import check_argument_types -from paddlespeech.t2s.modules.fastspeech2_transformer.attention import MultiHeadedAttention -from paddlespeech.t2s.modules.fastspeech2_transformer.decoder import Decoder -from paddlespeech.t2s.modules.fastspeech2_transformer.embedding import PositionalEncoding -from paddlespeech.t2s.modules.fastspeech2_transformer.embedding import ScaledPositionalEncoding -from paddlespeech.t2s.modules.fastspeech2_transformer.encoder import Encoder -from paddlespeech.t2s.modules.fastspeech2_transformer.mask import subsequent_mask from paddlespeech.t2s.modules.nets_utils import initialize from paddlespeech.t2s.modules.nets_utils import make_non_pad_mask from paddlespeech.t2s.modules.nets_utils import make_pad_mask @@ -36,6 +30,12 @@ from paddlespeech.t2s.modules.style_encoder import StyleEncoder from paddlespeech.t2s.modules.tacotron2.decoder import Postnet from paddlespeech.t2s.modules.tacotron2.decoder import Prenet as DecoderPrenet from 
paddlespeech.t2s.modules.tacotron2.encoder import Encoder as EncoderPrenet +from paddlespeech.t2s.modules.transformer.attention import MultiHeadedAttention +from paddlespeech.t2s.modules.transformer.decoder import Decoder +from paddlespeech.t2s.modules.transformer.embedding import PositionalEncoding +from paddlespeech.t2s.modules.transformer.embedding import ScaledPositionalEncoding +from paddlespeech.t2s.modules.transformer.encoder import Encoder +from paddlespeech.t2s.modules.transformer.mask import subsequent_mask class TransformerTTS(nn.Layer): diff --git a/paddlespeech/t2s/modules/__init__.py b/paddlespeech/t2s/modules/__init__.py index 664267895491c47ad0b3ecaaaae9412f3ce5110f..5b569f5d05100fa80587c9a06dc2c16f1d58a936 100644 --- a/paddlespeech/t2s/modules/__init__.py +++ b/paddlespeech/t2s/modules/__init__.py @@ -11,10 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from .attention import * from .conv import * from .geometry import * from .losses import * from .masking import * from .positional_encoding import * -from .transformer import * diff --git a/paddlespeech/t2s/modules/attention.py b/paddlespeech/t2s/modules/attention.py deleted file mode 100644 index 154625cc3c1d9426ed2d21edc3064798b73ccd3a..0000000000000000000000000000000000000000 --- a/paddlespeech/t2s/modules/attention.py +++ /dev/null @@ -1,348 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import math - -import numpy as np -import paddle -from paddle import nn -from paddle.nn import functional as F - - -def scaled_dot_product_attention(q, k, v, mask=None, dropout=0.0, - training=True): - r"""Scaled dot product attention with masking. - - Assume that q, k, v all have the same leading dimensions (denoted as * in - descriptions below). Dropout is applied to attention weights before - weighted sum of values. - - Parameters - ----------- - q : Tensor [shape=(\*, T_q, d)] - the query tensor. - k : Tensor [shape=(\*, T_k, d)] - the key tensor. - v : Tensor [shape=(\*, T_k, d_v)] - the value tensor. - mask : Tensor, [shape=(\*, T_q, T_k) or broadcastable shape], optional - the mask tensor, zeros correspond to paddings. Defaults to None. - - Returns - ---------- - out : Tensor [shape=(\*, T_q, d_v)] - the context vector. - attn_weights : Tensor [shape=(\*, T_q, T_k)] - the attention weights. 
- """ - d = q.shape[-1] # we only support imperative execution - qk = paddle.matmul(q, k, transpose_y=True) - scaled_logit = paddle.scale(qk, 1.0 / math.sqrt(d)) - - if mask is not None: - scaled_logit += paddle.scale((1.0 - mask), -1e9) # hard coded here - - attn_weights = F.softmax(scaled_logit, axis=-1) - attn_weights = F.dropout(attn_weights, dropout, training=training) - out = paddle.matmul(attn_weights, v) - return out, attn_weights - - -def drop_head(x, drop_n_heads, training=True): - """Drop n context vectors from multiple ones. - - Parameters - ---------- - x : Tensor [shape=(batch_size, num_heads, time_steps, channels)] - The input, multiple context vectors. - drop_n_heads : int [0<= drop_n_heads <= num_heads] - Number of vectors to drop. - training : bool - A flag indicating whether it is in training. If `False`, no dropout is - applied. - - Returns - ------- - Tensor - The output. - """ - if not training or (drop_n_heads == 0): - return x - - batch_size, num_heads, _, _ = x.shape - # drop all heads - if num_heads == drop_n_heads: - return paddle.zeros_like(x) - - mask = np.ones([batch_size, num_heads]) - mask[:, :drop_n_heads] = 0 - for subarray in mask: - np.random.shuffle(subarray) - scale = float(num_heads) / (num_heads - drop_n_heads) - mask = scale * np.reshape(mask, [batch_size, num_heads, 1, 1]) - out = x * paddle.to_tensor(mask) - return out - - -def _split_heads(x, num_heads): - batch_size, time_steps, _ = x.shape - x = paddle.reshape(x, [batch_size, time_steps, num_heads, -1]) - x = paddle.transpose(x, [0, 2, 1, 3]) - return x - - -def _concat_heads(x): - batch_size, _, time_steps, _ = x.shape - x = paddle.transpose(x, [0, 2, 1, 3]) - x = paddle.reshape(x, [batch_size, time_steps, -1]) - return x - - -# Standard implementations of Monohead Attention & Multihead Attention -class MonoheadAttention(nn.Layer): - """Monohead Attention module. - - Parameters - ---------- - model_dim : int - Feature size of the query. - dropout : float, optional - Dropout probability of scaled dot product attention and final context - vector. Defaults to 0.0. - k_dim : int, optional - Feature size of the key of each scaled dot product attention. If not - provided, it is set to `model_dim / num_heads`. Defaults to None. - v_dim : int, optional - Feature size of the key of each scaled dot product attention. If not - provided, it is set to `model_dim / num_heads`. Defaults to None. - """ - - def __init__(self, - model_dim: int, - dropout: float=0.0, - k_dim: int=None, - v_dim: int=None): - super(MonoheadAttention, self).__init__() - k_dim = k_dim or model_dim - v_dim = v_dim or model_dim - self.affine_q = nn.Linear(model_dim, k_dim) - self.affine_k = nn.Linear(model_dim, k_dim) - self.affine_v = nn.Linear(model_dim, v_dim) - self.affine_o = nn.Linear(v_dim, model_dim) - - self.model_dim = model_dim - self.dropout = dropout - - def forward(self, q, k, v, mask): - """Compute context vector and attention weights. - - Parameters - ----------- - q : Tensor [shape=(batch_size, time_steps_q, model_dim)] - The queries. - k : Tensor [shape=(batch_size, time_steps_k, model_dim)] - The keys. - v : Tensor [shape=(batch_size, time_steps_k, model_dim)] - The values. - mask : Tensor [shape=(batch_size, times_steps_q, time_steps_k] or broadcastable shape - The mask. - - Returns - ---------- - out : Tensor [shape=(batch_size, time_steps_q, model_dim)] - The context vector. - attention_weights : Tensor [shape=(batch_size, times_steps_q, time_steps_k)] - The attention weights. 
- """ - q = self.affine_q(q) # (B, T, C) - k = self.affine_k(k) - v = self.affine_v(v) - - context_vectors, attention_weights = scaled_dot_product_attention( - q, k, v, mask, self.dropout, self.training) - - out = self.affine_o(context_vectors) - return out, attention_weights - - -class MultiheadAttention(nn.Layer): - """Multihead Attention module. - - Parameters - ----------- - model_dim: int - The feature size of query. - num_heads : int - The number of attention heads. - dropout : float, optional - Dropout probability of scaled dot product attention and final context - vector. Defaults to 0.0. - k_dim : int, optional - Feature size of the key of each scaled dot product attention. If not - provided, it is set to ``model_dim / num_heads``. Defaults to None. - v_dim : int, optional - Feature size of the key of each scaled dot product attention. If not - provided, it is set to ``model_dim / num_heads``. Defaults to None. - - Raises - --------- - ValueError - If ``model_dim`` is not divisible by ``num_heads``. - """ - - def __init__(self, - model_dim: int, - num_heads: int, - dropout: float=0.0, - k_dim: int=None, - v_dim: int=None): - super(MultiheadAttention, self).__init__() - if model_dim % num_heads != 0: - raise ValueError("model_dim must be divisible by num_heads") - depth = model_dim // num_heads - k_dim = k_dim or depth - v_dim = v_dim or depth - self.affine_q = nn.Linear(model_dim, num_heads * k_dim) - self.affine_k = nn.Linear(model_dim, num_heads * k_dim) - self.affine_v = nn.Linear(model_dim, num_heads * v_dim) - self.affine_o = nn.Linear(num_heads * v_dim, model_dim) - - self.num_heads = num_heads - self.model_dim = model_dim - self.dropout = dropout - - def forward(self, q, k, v, mask): - """Compute context vector and attention weights. - - Parameters - ----------- - q : Tensor [shape=(batch_size, time_steps_q, model_dim)] - The queries. - k : Tensor [shape=(batch_size, time_steps_k, model_dim)] - The keys. - v : Tensor [shape=(batch_size, time_steps_k, model_dim)] - The values. - mask : Tensor [shape=(batch_size, times_steps_q, time_steps_k] or broadcastable shape - The mask. - - Returns - ---------- - out : Tensor [shape=(batch_size, time_steps_q, model_dim)] - The context vector. - attention_weights : Tensor [shape=(batch_size, times_steps_q, time_steps_k)] - The attention weights. - """ - q = _split_heads(self.affine_q(q), self.num_heads) # (B, h, T, C) - k = _split_heads(self.affine_k(k), self.num_heads) - v = _split_heads(self.affine_v(v), self.num_heads) - mask = paddle.unsqueeze(mask, 1) # unsqueeze for the h dim - - context_vectors, attention_weights = scaled_dot_product_attention( - q, k, v, mask, self.dropout, self.training) - # NOTE: there is more sophisticated implementation: Scheduled DropHead - context_vectors = _concat_heads(context_vectors) # (B, T, h*C) - out = self.affine_o(context_vectors) - return out, attention_weights - - -class LocationSensitiveAttention(nn.Layer): - """Location Sensitive Attention module. - - Reference: `Attention-Based Models for Speech Recognition `_ - - Parameters - ----------- - d_query: int - The feature size of query. - d_key : int - The feature size of key. - d_attention : int - The feature size of dimension. - location_filters : int - Filter size of attention convolution. - location_kernel_size : int - Kernel size of attention convolution. 
- """ - - def __init__(self, - d_query: int, - d_key: int, - d_attention: int, - location_filters: int, - location_kernel_size: int): - super().__init__() - - self.query_layer = nn.Linear(d_query, d_attention, bias_attr=False) - self.key_layer = nn.Linear(d_key, d_attention, bias_attr=False) - self.value = nn.Linear(d_attention, 1, bias_attr=False) - - # Location Layer - self.location_conv = nn.Conv1D( - 2, - location_filters, - kernel_size=location_kernel_size, - padding=int((location_kernel_size - 1) / 2), - bias_attr=False, - data_format='NLC') - self.location_layer = nn.Linear( - location_filters, d_attention, bias_attr=False) - - def forward(self, - query, - processed_key, - value, - attention_weights_cat, - mask=None): - """Compute context vector and attention weights. - - Parameters - ----------- - query : Tensor [shape=(batch_size, d_query)] - The queries. - processed_key : Tensor [shape=(batch_size, time_steps_k, d_attention)] - The keys after linear layer. - value : Tensor [shape=(batch_size, time_steps_k, d_key)] - The values. - attention_weights_cat : Tensor [shape=(batch_size, time_step_k, 2)] - Attention weights concat. - mask : Tensor, optional - The mask. Shape should be (batch_size, times_steps_k, 1). - Defaults to None. - - Returns - ---------- - attention_context : Tensor [shape=(batch_size, d_attention)] - The context vector. - attention_weights : Tensor [shape=(batch_size, time_steps_k)] - The attention weights. - """ - - processed_query = self.query_layer(paddle.unsqueeze(query, axis=[1])) - processed_attention_weights = self.location_layer( - self.location_conv(attention_weights_cat)) - # (B, T_enc, 1) - alignment = self.value( - paddle.tanh(processed_attention_weights + processed_key + - processed_query)) - - if mask is not None: - alignment = alignment + (1.0 - mask) * -1e9 - - attention_weights = F.softmax(alignment, axis=1) - attention_context = paddle.matmul( - attention_weights, value, transpose_x=True) - - attention_weights = paddle.squeeze(attention_weights, axis=-1) - attention_context = paddle.squeeze(attention_context, axis=1) - - return attention_context, attention_weights diff --git a/paddlespeech/t2s/modules/conformer/convolution.py b/paddlespeech/t2s/modules/conformer/convolution.py new file mode 100644 index 0000000000000000000000000000000000000000..25246736b92dfda364cf53a02ed37bb670e99c55 --- /dev/null +++ b/paddlespeech/t2s/modules/conformer/convolution.py @@ -0,0 +1,84 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from espnet(https://github.com/espnet/espnet) +"""ConvolutionModule definition.""" +from paddle import nn + + +class ConvolutionModule(nn.Layer): + """ConvolutionModule in Conformer model. + Parameters + ---------- + channels : int + The number of channels of conv layers. + kernel_size : int + Kernerl size of conv layers. 
+ """ + + def __init__(self, channels, kernel_size, activation=nn.ReLU(), bias=True): + """Construct an ConvolutionModule object.""" + super().__init__() + # kernerl_size should be a odd number for 'SAME' padding + assert (kernel_size - 1) % 2 == 0 + + self.pointwise_conv1 = nn.Conv1D( + channels, + 2 * channels, + kernel_size=1, + stride=1, + padding=0, + bias_attr=bias, ) + self.depthwise_conv = nn.Conv1D( + channels, + channels, + kernel_size, + stride=1, + padding=(kernel_size - 1) // 2, + groups=channels, + bias_attr=bias, ) + self.norm = nn.BatchNorm1D(channels) + self.pointwise_conv2 = nn.Conv1D( + channels, + channels, + kernel_size=1, + stride=1, + padding=0, + bias_attr=bias, ) + self.activation = activation + + def forward(self, x): + """Compute convolution module. + Parameters + ---------- + x : paddle.Tensor + Input tensor (#batch, time, channels). + Returns + ---------- + paddle.Tensor + Output tensor (#batch, time, channels). + """ + # exchange the temporal dimension and the feature dimension + x = x.transpose([0, 2, 1]) + + # GLU mechanism + x = self.pointwise_conv1(x) # (batch, 2*channel, dim) + x = nn.functional.glu(x, axis=1) # (batch, channel, dim) + + # 1D Depthwise Conv + x = self.depthwise_conv(x) + x = self.activation(self.norm(x)) + + x = self.pointwise_conv2(x) + + return x.transpose([0, 2, 1]) diff --git a/paddlespeech/t2s/modules/conformer/encoder.py b/paddlespeech/t2s/modules/conformer/encoder.py new file mode 100644 index 0000000000000000000000000000000000000000..568597ba56d8ac27691577c23a244068240958ae --- /dev/null +++ b/paddlespeech/t2s/modules/conformer/encoder.py @@ -0,0 +1,274 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# Modified from espnet(https://github.com/espnet/espnet) +"""Encoder definition.""" +import logging + +import paddle + +from paddlespeech.t2s.modules.conformer.convolution import ConvolutionModule +from paddlespeech.t2s.modules.conformer.encoder_layer import EncoderLayer +from paddlespeech.t2s.modules.layer_norm import LayerNorm +from paddlespeech.t2s.modules.nets_utils import get_activation +from paddlespeech.t2s.modules.transformer.attention import LegacyRelPositionMultiHeadedAttention +from paddlespeech.t2s.modules.transformer.attention import MultiHeadedAttention +from paddlespeech.t2s.modules.transformer.attention import RelPositionMultiHeadedAttention +from paddlespeech.t2s.modules.transformer.embedding import LegacyRelPositionalEncoding +from paddlespeech.t2s.modules.transformer.embedding import PositionalEncoding +from paddlespeech.t2s.modules.transformer.embedding import RelPositionalEncoding +from paddlespeech.t2s.modules.transformer.embedding import ScaledPositionalEncoding +from paddlespeech.t2s.modules.transformer.multi_layer_conv import Conv1dLinear +from paddlespeech.t2s.modules.transformer.multi_layer_conv import MultiLayeredConv1d +from paddlespeech.t2s.modules.transformer.positionwise_feed_forward import PositionwiseFeedForward +from paddlespeech.t2s.modules.transformer.repeat import repeat +from paddlespeech.t2s.modules.transformer.subsampling import Conv2dSubsampling + + +class Encoder(paddle.nn.Layer): + """Conformer encoder module. + Parameters + ---------- + idim : int + Input dimension. + attention_dim : int + Dimension of attention. + attention_heads : int + The number of heads of multi head attention. + linear_units : int + The number of units of position-wise feed forward. + num_blocks : int + The number of decoder blocks. + dropout_rate : float + Dropout rate. + positional_dropout_rate : float + Dropout rate after adding positional encoding. + attention_dropout_rate : float + Dropout rate in attention. + input_layer : Union[str, paddle.nn.Layer] + Input layer type. + normalize_before : bool + Whether to use layer_norm before the first block. + concat_after : bool + Whether to concat attention layer's input and output. + if True, additional linear will be applied. + i.e. x -> x + linear(concat(x, att(x))) + if False, no additional linear will be applied. i.e. x -> x + att(x) + positionwise_layer_type : str + "linear", "conv1d", or "conv1d-linear". + positionwise_conv_kernel_size : int + Kernel size of positionwise conv1d layer. + macaron_style : bool + Whether to use macaron style for positionwise layer. + pos_enc_layer_type : str + Encoder positional encoding layer type. + selfattention_layer_type : str + Encoder attention layer type. + activation_type : str + Encoder activation function type. + use_cnn_module : bool + Whether to use convolution module. + zero_triu : bool + Whether to zero the upper triangular part of attention matrix. + cnn_module_kernel : int + Kernerl size of convolution module. + padding_idx : int + Padding idx for input_layer=embed. + stochastic_depth_rate : float + Maximum probability to skip the encoder layer. + intermediate_layers : Union[List[int], None] + indices of intermediate CTC layer. + indices start from 1. + if not None, intermediate outputs are returned (which changes return type + signature.) 
+ """ + + def __init__( + self, + idim, + attention_dim=256, + attention_heads=4, + linear_units=2048, + num_blocks=6, + dropout_rate=0.1, + positional_dropout_rate=0.1, + attention_dropout_rate=0.0, + input_layer="conv2d", + normalize_before=True, + concat_after=False, + positionwise_layer_type="linear", + positionwise_conv_kernel_size=1, + macaron_style=False, + pos_enc_layer_type="abs_pos", + selfattention_layer_type="selfattn", + activation_type="swish", + use_cnn_module=False, + zero_triu=False, + cnn_module_kernel=31, + padding_idx=-1, + stochastic_depth_rate=0.0, + intermediate_layers=None, ): + """Construct an Encoder object.""" + super(Encoder, self).__init__() + + activation = get_activation(activation_type) + if pos_enc_layer_type == "abs_pos": + pos_enc_class = PositionalEncoding + elif pos_enc_layer_type == "scaled_abs_pos": + pos_enc_class = ScaledPositionalEncoding + elif pos_enc_layer_type == "rel_pos": + assert selfattention_layer_type == "rel_selfattn" + pos_enc_class = RelPositionalEncoding + elif pos_enc_layer_type == "legacy_rel_pos": + pos_enc_class = LegacyRelPositionalEncoding + assert selfattention_layer_type == "legacy_rel_selfattn" + else: + raise ValueError("unknown pos_enc_layer: " + pos_enc_layer_type) + + self.conv_subsampling_factor = 1 + if input_layer == "linear": + self.embed = paddle.nn.Sequential( + paddle.nn.Linear(idim, attention_dim), + paddle.nn.LayerNorm(attention_dim), + paddle.nn.Dropout(dropout_rate), + pos_enc_class(attention_dim, positional_dropout_rate), ) + elif input_layer == "conv2d": + self.embed = Conv2dSubsampling( + idim, + attention_dim, + dropout_rate, + pos_enc_class(attention_dim, positional_dropout_rate), ) + self.conv_subsampling_factor = 4 + + elif input_layer == "embed": + self.embed = paddle.nn.Sequential( + paddle.nn.Embedding( + idim, attention_dim, padding_idx=padding_idx), + pos_enc_class(attention_dim, positional_dropout_rate), ) + elif isinstance(input_layer, paddle.nn.Layer): + self.embed = paddle.nn.Sequential( + input_layer, + pos_enc_class(attention_dim, positional_dropout_rate), ) + elif input_layer is None: + self.embed = paddle.nn.Sequential( + pos_enc_class(attention_dim, positional_dropout_rate)) + else: + raise ValueError("unknown input_layer: " + input_layer) + self.normalize_before = normalize_before + + # self-attention module definition + if selfattention_layer_type == "selfattn": + logging.info("encoder self-attention layer type = self-attention") + encoder_selfattn_layer = MultiHeadedAttention + encoder_selfattn_layer_args = (attention_heads, attention_dim, + attention_dropout_rate, ) + elif selfattention_layer_type == "legacy_rel_selfattn": + assert pos_enc_layer_type == "legacy_rel_pos" + encoder_selfattn_layer = LegacyRelPositionMultiHeadedAttention + encoder_selfattn_layer_args = (attention_heads, attention_dim, + attention_dropout_rate, ) + elif selfattention_layer_type == "rel_selfattn": + logging.info( + "encoder self-attention layer type = relative self-attention") + assert pos_enc_layer_type == "rel_pos" + encoder_selfattn_layer = RelPositionMultiHeadedAttention + encoder_selfattn_layer_args = (attention_heads, attention_dim, + attention_dropout_rate, zero_triu, ) + else: + raise ValueError("unknown encoder_attn_layer: " + + selfattention_layer_type) + + # feed-forward module definition + if positionwise_layer_type == "linear": + positionwise_layer = PositionwiseFeedForward + positionwise_layer_args = (attention_dim, linear_units, + dropout_rate, activation, ) + elif positionwise_layer_type == 
"conv1d": + positionwise_layer = MultiLayeredConv1d + positionwise_layer_args = (attention_dim, linear_units, + positionwise_conv_kernel_size, + dropout_rate, ) + elif positionwise_layer_type == "conv1d-linear": + positionwise_layer = Conv1dLinear + positionwise_layer_args = (attention_dim, linear_units, + positionwise_conv_kernel_size, + dropout_rate, ) + else: + raise NotImplementedError("Support only linear or conv1d.") + + # convolution module definition + convolution_layer = ConvolutionModule + convolution_layer_args = (attention_dim, cnn_module_kernel, activation) + + self.encoders = repeat( + num_blocks, + lambda lnum: EncoderLayer( + attention_dim, + encoder_selfattn_layer(*encoder_selfattn_layer_args), + positionwise_layer(*positionwise_layer_args), + positionwise_layer(*positionwise_layer_args) if macaron_style else None, + convolution_layer(*convolution_layer_args) if use_cnn_module else None, + dropout_rate, + normalize_before, + concat_after, + stochastic_depth_rate * float(1 + lnum) / num_blocks, ), ) + if self.normalize_before: + self.after_norm = LayerNorm(attention_dim) + + self.intermediate_layers = intermediate_layers + + def forward(self, xs, masks): + """Encode input sequence. + Parameters + ---------- + xs : paddle.Tensor + Input tensor (#batch, time, idim). + masks (paddle.Tensor): Mask tensor (#batch, 1, time). + Returns + ---------- + paddle.Tensor + Output tensor (#batch, time, attention_dim). + paddle.Tensor + Mask tensor (#batch, time). + """ + if isinstance(self.embed, (Conv2dSubsampling)): + xs, masks = self.embed(xs, masks) + else: + xs = self.embed(xs) + + if self.intermediate_layers is None: + xs, masks = self.encoders(xs, masks) + else: + intermediate_outputs = [] + for layer_idx, encoder_layer in enumerate(self.encoders): + xs, masks = encoder_layer(xs, masks) + + if (self.intermediate_layers is not None and + layer_idx + 1 in self.intermediate_layers): + # intermediate branches also require normalization. + encoder_output = xs + if isinstance(encoder_output, tuple): + encoder_output = encoder_output[0] + if self.normalize_before: + encoder_output = self.after_norm(encoder_output) + intermediate_outputs.append(encoder_output) + + if isinstance(xs, tuple): + xs = xs[0] + + if self.normalize_before: + xs = self.after_norm(xs) + + if self.intermediate_layers is not None: + return xs, masks, intermediate_outputs + return xs, masks diff --git a/paddlespeech/t2s/modules/conformer/encoder_layer.py b/paddlespeech/t2s/modules/conformer/encoder_layer.py new file mode 100644 index 0000000000000000000000000000000000000000..a7a4936786f9b47f945740d4b45eb7a2b98101ee --- /dev/null +++ b/paddlespeech/t2s/modules/conformer/encoder_layer.py @@ -0,0 +1,196 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# Modified from espnet(https://github.com/espnet/espnet)
+"""Encoder self-attention layer definition."""
+import paddle
+from paddle import nn
+
+from paddlespeech.t2s.modules.layer_norm import LayerNorm
+
+
+class EncoderLayer(nn.Layer):
+    """Encoder layer module.
+    Parameters
+    ----------
+    size : int
+        Input dimension.
+    self_attn : paddle.nn.Layer
+        Self-attention module instance.
+        `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` instance
+        can be used as the argument.
+    feed_forward : paddle.nn.Layer
+        Feed-forward module instance.
+        `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance
+        can be used as the argument.
+    feed_forward_macaron : paddle.nn.Layer
+        Additional feed-forward module instance.
+        `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance
+        can be used as the argument.
+    conv_module : paddle.nn.Layer
+        Convolution module instance.
+        `ConvolutionModule` instance can be used as the argument.
+    dropout_rate : float
+        Dropout rate.
+    normalize_before : bool
+        Whether to use layer_norm before the first block.
+    concat_after : bool
+        Whether to concat attention layer's input and output.
+        if True, additional linear will be applied.
+        i.e. x -> x + linear(concat(x, att(x)))
+        if False, no additional linear will be applied. i.e. x -> x + att(x)
+    stochastic_depth_rate : float
+        Probability to skip this layer.
+        During training, the layer may skip residual computation and return input
+        as-is with given probability.
+    """
+
+    def __init__(
+            self,
+            size,
+            self_attn,
+            feed_forward,
+            feed_forward_macaron,
+            conv_module,
+            dropout_rate,
+            normalize_before=True,
+            concat_after=False,
+            stochastic_depth_rate=0.0, ):
+        """Construct an EncoderLayer object."""
+        super(EncoderLayer, self).__init__()
+        self.self_attn = self_attn
+        self.feed_forward = feed_forward
+        self.feed_forward_macaron = feed_forward_macaron
+        self.conv_module = conv_module
+        self.norm_ff = LayerNorm(size)  # for the FNN module
+        self.norm_mha = LayerNorm(size)  # for the MHA module
+        if feed_forward_macaron is not None:
+            self.norm_ff_macaron = LayerNorm(size)
+            self.ff_scale = 0.5
+        else:
+            self.ff_scale = 1.0
+        if self.conv_module is not None:
+            self.norm_conv = LayerNorm(size)  # for the CNN module
+            self.norm_final = LayerNorm(
+                size)  # for the final output of the block
+        self.dropout = nn.Dropout(dropout_rate)
+        self.size = size
+        self.normalize_before = normalize_before
+        self.concat_after = concat_after
+        if self.concat_after:
+            self.concat_linear = nn.Linear(size + size, size)
+        self.stochastic_depth_rate = stochastic_depth_rate
+
+    def forward(self, x_input, mask, cache=None):
+        """Compute encoded features.
+        Parameters
+        ----------
+        x_input : Union[Tuple, paddle.Tensor]
+            Input tensor w/ or w/o pos emb.
+            - w/ pos emb: Tuple of tensors [(#batch, time, size), (1, time, size)].
+            - w/o pos emb: Tensor (#batch, time, size).
+        mask : paddle.Tensor
+            Mask tensor for the input (#batch, time).
+        cache : paddle.Tensor
+            Cache tensor of the input (#batch, time - 1, size).
+        Returns
+        ----------
+        paddle.Tensor
+            Output tensor (#batch, time, size).
+        paddle.Tensor
+            Mask tensor (#batch, time).
+        """
+        if isinstance(x_input, tuple):
+            x, pos_emb = x_input[0], x_input[1]
+        else:
+            x, pos_emb = x_input, None
+
+        skip_layer = False
+        # with stochastic depth, residual connection `x + f(x)` becomes
+        # `x <- x + 1 / (1 - p) * f(x)` at training time.
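+        # For example (illustration only, not part of the original code):
+        # with stochastic_depth_rate = 0.1, roughly 1 in 10 training passes
+        # skips this layer entirely, and a kept pass scales every residual
+        # branch by 1 / (1 - 0.1), about 1.11, so the expected output matches
+        # inference, where the layer is always applied with coefficient 1.0.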
+        stoch_layer_coeff = 1.0
+        if self.training and self.stochastic_depth_rate > 0:
+            skip_layer = paddle.rand([1]).item() < self.stochastic_depth_rate
+            stoch_layer_coeff = 1.0 / (1 - self.stochastic_depth_rate)
+
+        if skip_layer:
+            if cache is not None:
+                x = paddle.concat([cache, x], axis=1)
+            if pos_emb is not None:
+                return (x, pos_emb), mask
+            return x, mask
+
+        # whether to use macaron style
+        if self.feed_forward_macaron is not None:
+            residual = x
+            if self.normalize_before:
+                x = self.norm_ff_macaron(x)
+            x = residual + stoch_layer_coeff * self.ff_scale * self.dropout(
+                self.feed_forward_macaron(x))
+            if not self.normalize_before:
+                x = self.norm_ff_macaron(x)
+
+        # multi-headed self-attention module
+        residual = x
+        if self.normalize_before:
+            x = self.norm_mha(x)
+
+        if cache is None:
+            x_q = x
+        else:
+            assert cache.shape == (x.shape[0], x.shape[1] - 1, self.size)
+            x_q = x[:, -1:, :]
+            residual = residual[:, -1:, :]
+            mask = None if mask is None else mask[:, -1:, :]
+
+        if pos_emb is not None:
+            x_att = self.self_attn(x_q, x, x, pos_emb, mask)
+        else:
+            x_att = self.self_attn(x_q, x, x, mask)
+
+        if self.concat_after:
+            x_concat = paddle.concat((x, x_att), axis=-1)
+            x = residual + stoch_layer_coeff * self.concat_linear(x_concat)
+        else:
+            x = residual + stoch_layer_coeff * self.dropout(x_att)
+        if not self.normalize_before:
+            x = self.norm_mha(x)
+
+        # convolution module
+        if self.conv_module is not None:
+            residual = x
+            if self.normalize_before:
+                x = self.norm_conv(x)
+            x = residual + stoch_layer_coeff * self.dropout(self.conv_module(x))
+            if not self.normalize_before:
+                x = self.norm_conv(x)
+
+        # feed forward module
+        residual = x
+        if self.normalize_before:
+            x = self.norm_ff(x)
+        x = residual + stoch_layer_coeff * self.ff_scale * self.dropout(
+            self.feed_forward(x))
+        if not self.normalize_before:
+            x = self.norm_ff(x)
+
+        if self.conv_module is not None:
+            x = self.norm_final(x)
+
+        if cache is not None:
+            x = paddle.concat([cache, x], axis=1)
+
+        if pos_emb is not None:
+            return (x, pos_emb), mask
+
+        return x, mask
diff --git a/paddlespeech/t2s/modules/nets_utils.py b/paddlespeech/t2s/modules/nets_utils.py
index 30d3db86c885a56204562b82f7e0d709a96717e2..fbb3a9a3d65f83fd43b19902c9e97137691f2a2d 100644
--- a/paddlespeech/t2s/modules/nets_utils.py
+++ b/paddlespeech/t2s/modules/nets_utils.py
@@ -17,6 +17,14 @@
 from paddle import nn
 from typeguard import check_argument_types


+class Swish(paddle.nn.Layer):
+    """Construct a Swish object."""
+
+    def forward(self, x):
+        """Return the Swish activation, x * sigmoid(x)."""
+        return x * paddle.nn.functional.sigmoid(x)
+
+
 def pad_list(xs, pad_value):
     """Perform padding for the list of tensors.
@@ -150,3 +158,17 @@ def initialize(model: nn.Layer, init: str): nn.initializer.Constant()) else: raise ValueError("Unknown initialization: " + init) + + +def get_activation(act): + """Return activation function.""" + + activation_funcs = { + "hardtanh": paddle.nn.Hardtanh, + "tanh": paddle.nn.Tanh, + "relu": paddle.nn.ReLU, + "selu": paddle.nn.SELU, + "swish": Swish, + } + + return activation_funcs[act]() diff --git a/paddlespeech/t2s/modules/fastspeech2_predictor/__init__.py b/paddlespeech/t2s/modules/predictor/__init__.py similarity index 100% rename from paddlespeech/t2s/modules/fastspeech2_predictor/__init__.py rename to paddlespeech/t2s/modules/predictor/__init__.py diff --git a/paddlespeech/t2s/modules/fastspeech2_predictor/duration_predictor.py b/paddlespeech/t2s/modules/predictor/duration_predictor.py similarity index 100% rename from paddlespeech/t2s/modules/fastspeech2_predictor/duration_predictor.py rename to paddlespeech/t2s/modules/predictor/duration_predictor.py diff --git a/paddlespeech/t2s/modules/fastspeech2_predictor/length_regulator.py b/paddlespeech/t2s/modules/predictor/length_regulator.py similarity index 100% rename from paddlespeech/t2s/modules/fastspeech2_predictor/length_regulator.py rename to paddlespeech/t2s/modules/predictor/length_regulator.py diff --git a/paddlespeech/t2s/modules/fastspeech2_predictor/variance_predictor.py b/paddlespeech/t2s/modules/predictor/variance_predictor.py similarity index 100% rename from paddlespeech/t2s/modules/fastspeech2_predictor/variance_predictor.py rename to paddlespeech/t2s/modules/predictor/variance_predictor.py diff --git a/paddlespeech/t2s/modules/style_encoder.py b/paddlespeech/t2s/modules/style_encoder.py index 868a73a969edb6d3dc1affe6b0e401a88fb7d11b..8a23e85c61dd2f2cbbd06e281335317575dfc5ff 100644 --- a/paddlespeech/t2s/modules/style_encoder.py +++ b/paddlespeech/t2s/modules/style_encoder.py @@ -19,7 +19,7 @@ import paddle from paddle import nn from typeguard import check_argument_types -from paddlespeech.t2s.modules.fastspeech2_transformer.attention import MultiHeadedAttention as BaseMultiHeadedAttention +from paddlespeech.t2s.modules.transformer.attention import MultiHeadedAttention as BaseMultiHeadedAttention class StyleEncoder(nn.Layer): diff --git a/paddlespeech/t2s/modules/transformer.py b/paddlespeech/t2s/modules/transformer.py deleted file mode 100644 index e50d58d44bc6663414a7390589d3a8d7ad6f2c5b..0000000000000000000000000000000000000000 --- a/paddlespeech/t2s/modules/transformer.py +++ /dev/null @@ -1,208 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from paddle import nn -from paddle.nn import functional as F - -from paddlespeech.t2s.modules import attention as attn - -__all__ = [ - "PositionwiseFFN", - "TransformerEncoderLayer", - "TransformerDecoderLayer", -] - - -class PositionwiseFFN(nn.Layer): - """A faithful implementation of Position-wise Feed-Forward Network - in `Attention is All You Need `_. 
- It is basically a 2-layer MLP, with relu actication and dropout in between. - - Parameters - ---------- - input_size: int - The feature size of the intput. It is also the feature size of the - output. - hidden_size: int - The hidden size. - dropout: float - The probability of the Dropout applied to the output of the first - layer, by default 0. - """ - - def __init__(self, input_size: int, hidden_size: int, dropout=0.0): - super(PositionwiseFFN, self).__init__() - self.linear1 = nn.Linear(input_size, hidden_size) - self.linear2 = nn.Linear(hidden_size, input_size) - self.dropout = nn.Dropout(dropout) - - self.input_size = input_size - self.hidden_szie = hidden_size - - def forward(self, x): - r"""Forward pass of positionwise feed forward network. - - Parameters - ---------- - x : Tensor [shape=(\*, input_size)] - The input tensor, where ``\*`` means arbitary shape. - - Returns - ------- - Tensor [shape=(\*, input_size)] - The output tensor. - """ - l1 = self.dropout(F.relu(self.linear1(x))) - l2 = self.linear2(l1) - return l2 - - -class TransformerEncoderLayer(nn.Layer): - """A faithful implementation of Transformer encoder layer in - `Attention is All You Need `_. - - Parameters - ---------- - d_model :int - The feature size of the input. It is also the feature size of the - output. - n_heads : int - The number of heads of self attention (a ``MultiheadAttention`` - layer). - d_ffn : int - The hidden size of the positional feed forward network (a - ``PositionwiseFFN`` layer). - dropout : float, optional - The probability of the dropout in MultiHeadAttention and - PositionwiseFFN, by default 0. - - Notes - ------ - It uses the PostLN (post layer norm) scheme. - """ - - def __init__(self, d_model, n_heads, d_ffn, dropout=0.): - super(TransformerEncoderLayer, self).__init__() - self.self_mha = attn.MultiheadAttention(d_model, n_heads, dropout) - self.layer_norm1 = nn.LayerNorm([d_model], epsilon=1e-6) - - self.ffn = PositionwiseFFN(d_model, d_ffn, dropout) - self.layer_norm2 = nn.LayerNorm([d_model], epsilon=1e-6) - - self.dropout = dropout - - def forward(self, x, mask): - """Forward pass of TransformerEncoderLayer. - - Parameters - ---------- - x : Tensor [shape=(batch_size, time_steps, d_model)] - The input. - mask : Tensor - The padding mask. The shape is (batch_size, time_steps, - time_steps) or broadcastable shape. - - Returns - ------- - x :Tensor [shape=(batch_size, time_steps, d_model)] - The encoded output. - - attn_weights : Tensor [shape=(batch_size, n_heads, time_steps, time_steps)] - The attention weights of the self attention. - """ - context_vector, attn_weights = self.self_mha(x, x, x, mask) - x = self.layer_norm1( - F.dropout(x + context_vector, self.dropout, training=self.training)) - - x = self.layer_norm2( - F.dropout(x + self.ffn(x), self.dropout, training=self.training)) - return x, attn_weights - - -class TransformerDecoderLayer(nn.Layer): - """A faithful implementation of Transformer decoder layer in - `Attention is All You Need `_. - - Parameters - ---------- - d_model :int - The feature size of the input. It is also the feature size of the - output. - n_heads : int - The number of heads of attentions (``MultiheadAttention`` - layers). - d_ffn : int - The hidden size of the positional feed forward network (a - ``PositionwiseFFN`` layer). - dropout : float, optional - The probability of the dropout in MultiHeadAttention and - PositionwiseFFN, by default 0. - - Notes - ------ - It uses the PostLN (post layer norm) scheme. 
- """ - - def __init__(self, d_model, n_heads, d_ffn, dropout=0.): - super(TransformerDecoderLayer, self).__init__() - self.self_mha = attn.MultiheadAttention(d_model, n_heads, dropout) - self.layer_norm1 = nn.LayerNorm([d_model], epsilon=1e-6) - - self.cross_mha = attn.MultiheadAttention(d_model, n_heads, dropout) - self.layer_norm2 = nn.LayerNorm([d_model], epsilon=1e-6) - - self.ffn = PositionwiseFFN(d_model, d_ffn, dropout) - self.layer_norm3 = nn.LayerNorm([d_model], epsilon=1e-6) - - self.dropout = dropout - - def forward(self, q, k, v, encoder_mask, decoder_mask): - """Forward pass of TransformerEncoderLayer. - - Parameters - ---------- - q : Tensor [shape=(batch_size, time_steps_q, d_model)] - The decoder input. - k : Tensor [shape=(batch_size, time_steps_k, d_model)] - The keys. - v : Tensor [shape=(batch_size, time_steps_k, d_model)] - The values - encoder_mask : Tensor - Encoder padding mask, shape is ``(batch_size, time_steps_k, - time_steps_k)`` or broadcastable shape. - decoder_mask : Tensor - Decoder mask, shape is ``(batch_size, time_steps_q, time_steps_k)`` - or broadcastable shape. - - Returns - -------- - q : Tensor [shape=(batch_size, time_steps_q, d_model)] - The decoder output. - self_attn_weights : Tensor [shape=(batch_size, n_heads, time_steps_q, time_steps_q)] - Decoder self attention. - - cross_attn_weights : Tensor [shape=(batch_size, n_heads, time_steps_q, time_steps_k)] - Decoder-encoder cross attention. - """ - context_vector, self_attn_weights = self.self_mha(q, q, q, decoder_mask) - q = self.layer_norm1( - F.dropout(q + context_vector, self.dropout, training=self.training)) - - context_vector, cross_attn_weights = self.cross_mha(q, k, v, - encoder_mask) - q = self.layer_norm2( - F.dropout(q + context_vector, self.dropout, training=self.training)) - - q = self.layer_norm3( - F.dropout(q + self.ffn(q), self.dropout, training=self.training)) - return q, self_attn_weights, cross_attn_weights diff --git a/paddlespeech/t2s/modules/fastspeech2_transformer/__init__.py b/paddlespeech/t2s/modules/transformer/__init__.py similarity index 100% rename from paddlespeech/t2s/modules/fastspeech2_transformer/__init__.py rename to paddlespeech/t2s/modules/transformer/__init__.py diff --git a/paddlespeech/t2s/modules/fastspeech2_transformer/attention.py b/paddlespeech/t2s/modules/transformer/attention.py similarity index 100% rename from paddlespeech/t2s/modules/fastspeech2_transformer/attention.py rename to paddlespeech/t2s/modules/transformer/attention.py diff --git a/paddlespeech/t2s/modules/fastspeech2_transformer/decoder.py b/paddlespeech/t2s/modules/transformer/decoder.py similarity index 94% rename from paddlespeech/t2s/modules/fastspeech2_transformer/decoder.py rename to paddlespeech/t2s/modules/transformer/decoder.py index 489fda12bc9d5708418ef2b8e3b96ea264f7101e..072fc813737f3963ccfb6536a1e90e033116e7d4 100644 --- a/paddlespeech/t2s/modules/fastspeech2_transformer/decoder.py +++ b/paddlespeech/t2s/modules/transformer/decoder.py @@ -23,14 +23,14 @@ import paddle import paddle.nn.functional as F from paddle import nn -from paddlespeech.t2s.modules.fastspeech2_transformer.attention import MultiHeadedAttention -from paddlespeech.t2s.modules.fastspeech2_transformer.decoder_layer import DecoderLayer -from paddlespeech.t2s.modules.fastspeech2_transformer.embedding import PositionalEncoding -from paddlespeech.t2s.modules.fastspeech2_transformer.lightconv import LightweightConvolution -from paddlespeech.t2s.modules.fastspeech2_transformer.mask import subsequent_mask -from 
paddlespeech.t2s.modules.fastspeech2_transformer.positionwise_feed_forward import PositionwiseFeedForward -from paddlespeech.t2s.modules.fastspeech2_transformer.repeat import repeat from paddlespeech.t2s.modules.layer_norm import LayerNorm +from paddlespeech.t2s.modules.transformer.attention import MultiHeadedAttention +from paddlespeech.t2s.modules.transformer.decoder_layer import DecoderLayer +from paddlespeech.t2s.modules.transformer.embedding import PositionalEncoding +from paddlespeech.t2s.modules.transformer.lightconv import LightweightConvolution +from paddlespeech.t2s.modules.transformer.mask import subsequent_mask +from paddlespeech.t2s.modules.transformer.positionwise_feed_forward import PositionwiseFeedForward +from paddlespeech.t2s.modules.transformer.repeat import repeat class Decoder(nn.Layer): diff --git a/paddlespeech/t2s/modules/fastspeech2_transformer/decoder_layer.py b/paddlespeech/t2s/modules/transformer/decoder_layer.py similarity index 100% rename from paddlespeech/t2s/modules/fastspeech2_transformer/decoder_layer.py rename to paddlespeech/t2s/modules/transformer/decoder_layer.py diff --git a/paddlespeech/t2s/modules/fastspeech2_transformer/embedding.py b/paddlespeech/t2s/modules/transformer/embedding.py similarity index 100% rename from paddlespeech/t2s/modules/fastspeech2_transformer/embedding.py rename to paddlespeech/t2s/modules/transformer/embedding.py diff --git a/paddlespeech/t2s/modules/fastspeech2_transformer/encoder.py b/paddlespeech/t2s/modules/transformer/encoder.py similarity index 92% rename from paddlespeech/t2s/modules/fastspeech2_transformer/encoder.py rename to paddlespeech/t2s/modules/transformer/encoder.py index f91c76b727e8af153ec82bf70410c3c6cae0f227..f088ac7fad38a2b3fc77b6251cea9dc845ebd813 100644 --- a/paddlespeech/t2s/modules/fastspeech2_transformer/encoder.py +++ b/paddlespeech/t2s/modules/transformer/encoder.py @@ -14,13 +14,13 @@ # Modified from espnet(https://github.com/espnet/espnet) from paddle import nn -from paddlespeech.t2s.modules.fastspeech2_transformer.attention import MultiHeadedAttention -from paddlespeech.t2s.modules.fastspeech2_transformer.embedding import PositionalEncoding -from paddlespeech.t2s.modules.fastspeech2_transformer.encoder_layer import EncoderLayer -from paddlespeech.t2s.modules.fastspeech2_transformer.multi_layer_conv import Conv1dLinear -from paddlespeech.t2s.modules.fastspeech2_transformer.multi_layer_conv import MultiLayeredConv1d -from paddlespeech.t2s.modules.fastspeech2_transformer.positionwise_feed_forward import PositionwiseFeedForward -from paddlespeech.t2s.modules.fastspeech2_transformer.repeat import repeat +from paddlespeech.t2s.modules.transformer.attention import MultiHeadedAttention +from paddlespeech.t2s.modules.transformer.embedding import PositionalEncoding +from paddlespeech.t2s.modules.transformer.encoder_layer import EncoderLayer +from paddlespeech.t2s.modules.transformer.multi_layer_conv import Conv1dLinear +from paddlespeech.t2s.modules.transformer.multi_layer_conv import MultiLayeredConv1d +from paddlespeech.t2s.modules.transformer.positionwise_feed_forward import PositionwiseFeedForward +from paddlespeech.t2s.modules.transformer.repeat import repeat class Encoder(nn.Layer): diff --git a/paddlespeech/t2s/modules/fastspeech2_transformer/encoder_layer.py b/paddlespeech/t2s/modules/transformer/encoder_layer.py similarity index 100% rename from paddlespeech/t2s/modules/fastspeech2_transformer/encoder_layer.py rename to paddlespeech/t2s/modules/transformer/encoder_layer.py diff --git 
a/paddlespeech/t2s/modules/fastspeech2_transformer/lightconv.py b/paddlespeech/t2s/modules/transformer/lightconv.py
similarity index 100%
rename from paddlespeech/t2s/modules/fastspeech2_transformer/lightconv.py
rename to paddlespeech/t2s/modules/transformer/lightconv.py
diff --git a/paddlespeech/t2s/modules/fastspeech2_transformer/mask.py b/paddlespeech/t2s/modules/transformer/mask.py
similarity index 100%
rename from paddlespeech/t2s/modules/fastspeech2_transformer/mask.py
rename to paddlespeech/t2s/modules/transformer/mask.py
diff --git a/paddlespeech/t2s/modules/fastspeech2_transformer/multi_layer_conv.py b/paddlespeech/t2s/modules/transformer/multi_layer_conv.py
similarity index 100%
rename from paddlespeech/t2s/modules/fastspeech2_transformer/multi_layer_conv.py
rename to paddlespeech/t2s/modules/transformer/multi_layer_conv.py
diff --git a/paddlespeech/t2s/modules/fastspeech2_transformer/positionwise_feed_forward.py b/paddlespeech/t2s/modules/transformer/positionwise_feed_forward.py
similarity index 100%
rename from paddlespeech/t2s/modules/fastspeech2_transformer/positionwise_feed_forward.py
rename to paddlespeech/t2s/modules/transformer/positionwise_feed_forward.py
diff --git a/paddlespeech/t2s/modules/fastspeech2_transformer/repeat.py b/paddlespeech/t2s/modules/transformer/repeat.py
similarity index 100%
rename from paddlespeech/t2s/modules/fastspeech2_transformer/repeat.py
rename to paddlespeech/t2s/modules/transformer/repeat.py
diff --git a/paddlespeech/t2s/modules/transformer/subsampling.py b/paddlespeech/t2s/modules/transformer/subsampling.py
new file mode 100644
index 0000000000000000000000000000000000000000..300b35beda72dda735629b525a0f00bb25129e94
--- /dev/null
+++ b/paddlespeech/t2s/modules/transformer/subsampling.py
@@ -0,0 +1,291 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Modified from espnet(https://github.com/espnet/espnet)
+# Conv2dSubsampling: tests passed
+"""Subsampling layer definition."""
+import paddle
+
+from paddlespeech.t2s.modules.transformer.embedding import PositionalEncoding
+
+
+class TooShortUttError(Exception):
+    """Raised when the utt is too short for subsampling.
+ Parameters + ---------- + message : str + Message for error catch + actual_size : int + the short size that cannot pass the subsampling + limit : int + the limit size for subsampling + """ + + def __init__(self, message, actual_size, limit): + """Construct a TooShortUttError for error handler.""" + super().__init__(message) + self.actual_size = actual_size + self.limit = limit + + +def check_short_utt(ins, size): + """Check if the utterance is too short for subsampling.""" + if isinstance(ins, Conv2dSubsampling2) and size < 3: + return True, 3 + if isinstance(ins, Conv2dSubsampling) and size < 7: + return True, 7 + if isinstance(ins, Conv2dSubsampling6) and size < 11: + return True, 11 + if isinstance(ins, Conv2dSubsampling8) and size < 15: + return True, 15 + return False, -1 + + +class Conv2dSubsampling(paddle.nn.Layer): + """Convolutional 2D subsampling (to 1/4 length). + Parameters + ---------- + idim : int + Input dimension. + odim : int + Output dimension. + dropout_rate : float + Dropout rate. + pos_enc : paddle.nn.Layer + Custom position encoding layer. + """ + + def __init__(self, idim, odim, dropout_rate, pos_enc=None): + """Construct an Conv2dSubsampling object.""" + super(Conv2dSubsampling, self).__init__() + self.conv = paddle.nn.Sequential( + paddle.nn.Conv2D(1, odim, 3, 2), + paddle.nn.ReLU(), + paddle.nn.Conv2D(odim, odim, 3, 2), + paddle.nn.ReLU(), ) + self.out = paddle.nn.Sequential( + paddle.nn.Linear(odim * (((idim - 1) // 2 - 1) // 2), odim), + pos_enc if pos_enc is not None else + PositionalEncoding(odim, dropout_rate), ) + + def forward(self, x, x_mask): + """Subsample x. + Parameters + ---------- + x : paddle.Tensor + Input tensor (#batch, time, idim). + x_mask : paddle.Tensor + Input mask (#batch, 1, time). + Returns + ---------- + paddle.Tensor + Subsampled tensor (#batch, time', odim), + where time' = time // 4. + paddle.Tensor + Subsampled mask (#batch, 1, time'), + where time' = time // 4. + """ + # (b, c, t, f) + x = x.unsqueeze(1) + x = self.conv(x) + b, c, t, f = x.shape + # x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f)) + x = self.out(x.transpose([0, 2, 1, 3]).reshape([b, t, c * f])) + if x_mask is None: + return x, None + return x, x_mask[:, :, :-2:2][:, :, :-2:2] + + def __getitem__(self, key): + """Get item. + When reset_parameters() is called, if use_scaled_pos_enc is used, + return the positioning encoding. + """ + if key != -1: + raise NotImplementedError( + "Support only `-1` (for `reset_parameters`).") + return self.out[key] + + +class Conv2dSubsampling2(paddle.nn.Layer): + """Convolutional 2D subsampling (to 1/2 length). + Parameters + ---------- + idim : int + Input dimension. + odim : int + Output dimension. + dropout_rate : float + Dropout rate. + pos_enc : paddle.nn.Layer + Custom position encoding layer. + """ + + def __init__(self, idim, odim, dropout_rate, pos_enc=None): + """Construct an Conv2dSubsampling2 object.""" + super(Conv2dSubsampling2, self).__init__() + self.conv = paddle.nn.Sequential( + paddle.nn.Conv2D(1, odim, 3, 2), + paddle.nn.ReLU(), + paddle.nn.Conv2D(odim, odim, 3, 1), + paddle.nn.ReLU(), ) + self.out = paddle.nn.Sequential( + paddle.nn.Linear(odim * (((idim - 1) // 2 - 2)), odim), + pos_enc if pos_enc is not None else + PositionalEncoding(odim, dropout_rate), ) + + def forward(self, x, x_mask): + """Subsample x. + Parameters + ---------- + x : paddle.Tensor + Input tensor (#batch, time, idim). + x_mask : paddle.Tensor + Input mask (#batch, 1, time). 
+ Returns + ---------- + paddle.Tensor + ubsampled tensor (#batch, time', odim), + where time' = time // 2. + paddle.Tensor + Subsampled mask (#batch, 1, time'), + where time' = time // 2. + """ + # (b, c, t, f) + x = x.unsqueeze(1) + x = self.conv(x) + b, c, t, f = x.shape + x = self.out(x.transpose([0, 2, 1, 3]).reshape([b, t, c * f])) + if x_mask is None: + return x, None + return x, x_mask[:, :, :-2:2][:, :, :-2:1] + + def __getitem__(self, key): + """Get item. + When reset_parameters() is called, if use_scaled_pos_enc is used, + return the positioning encoding. + """ + if key != -1: + raise NotImplementedError( + "Support only `-1` (for `reset_parameters`).") + return self.out[key] + + +class Conv2dSubsampling6(paddle.nn.Layer): + """Convolutional 2D subsampling (to 1/6 length). + Parameters + ---------- + idim : int + Input dimension. + odim : int + Output dimension. + dropout_rate : float + Dropout rate. + pos_enc : paddle.nn.Layer + Custom position encoding layer. + """ + + def __init__(self, idim, odim, dropout_rate, pos_enc=None): + """Construct an Conv2dSubsampling6 object.""" + super(Conv2dSubsampling6, self).__init__() + self.conv = paddle.nn.Sequential( + paddle.nn.Conv2D(1, odim, 3, 2), + paddle.nn.ReLU(), + paddle.nn.Conv2D(odim, odim, 5, 3), + paddle.nn.ReLU(), ) + self.out = paddle.nn.Sequential( + paddle.nn.Linear(odim * (((idim - 1) // 2 - 2) // 3), odim), + pos_enc if pos_enc is not None else + PositionalEncoding(odim, dropout_rate), ) + + def forward(self, x, x_mask): + """Subsample x. + Parameters + ---------- + x : paddle.Tensor + Input tensor (#batch, time, idim). + x_mask paddle.Tensor + Input mask (#batch, 1, time). + Returns + ---------- + paddle.Tensor + Subsampled tensor (#batch, time', odim), + where time' = time // 6. + paddle.Tensor + Subsampled mask (#batch, 1, time'), + where time' = time // 6. + """ + # (b, c, t, f) + x = x.unsqueeze(1) + x = self.conv(x) + b, c, t, f = x.shape + x = self.out(x.transpose([0, 2, 1, 3]).reshape([b, t, c * f])) + if x_mask is None: + return x, None + return x, x_mask[:, :, :-2:2][:, :, :-4:3] + + +class Conv2dSubsampling8(paddle.nn.Layer): + """Convolutional 2D subsampling (to 1/8 length). + Parameters + ---------- + idim : int + Input dimension. + odim : int + Output dimension. + dropout_rate : float + Dropout rate. + pos_enc : paddle.nn.Layer + Custom position encoding layer. + """ + + def __init__(self, idim, odim, dropout_rate, pos_enc=None): + """Construct an Conv2dSubsampling8 object.""" + super(Conv2dSubsampling8, self).__init__() + self.conv = paddle.nn.Sequential( + paddle.nn.Conv2D(1, odim, 3, 2), + paddle.nn.ReLU(), + paddle.nn.Conv2D(odim, odim, 3, 2), + paddle.nn.ReLU(), + paddle.nn.Conv2D(odim, odim, 3, 2), + paddle.nn.ReLU(), ) + self.out = paddle.nn.Sequential( + paddle.nn.Linear(odim * ((( + (idim - 1) // 2 - 1) // 2 - 1) // 2), odim), + pos_enc if pos_enc is not None else + PositionalEncoding(odim, dropout_rate), ) + + def forward(self, x, x_mask): + """Subsample x. + Parameters + ---------- + x : paddle.Tensor + Input tensor (#batch, time, idim). + x_mask : paddle.Tensor + Input mask (#batch, 1, time). + Returns + ---------- + paddle.Tensor + Subsampled tensor (#batch, time', odim), + where time' = time // 8. + paddle.Tensor + Subsampled mask (#batch, 1, time'), + where time' = time // 8. 
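+        Notes
+        ----------
+        A rough worked example (illustrative figures, not from the original
+        code): three stride-2 convolutions with kernel size 3 map 100 input
+        frames to (100 - 1) // 2 = 49, then 24, then 11 frames, and the three
+        `[:, :, :-2:2]` slices below shrink the mask by the same amounts.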
+ """ + # (b, c, t, f) + x = x.unsqueeze(1) + x = self.conv(x) + b, c, t, f = x.shape + x = self.out(x.transpose([0, 2, 1, 3]).reshape([b, t, c * f])) + if x_mask is None: + return x, None + return x, x_mask[:, :, :-2:2][:, :, :-2:2][:, :, :-2:2] diff --git a/requirements.txt b/requirements.txt index 2b34d36bdb467e0286c5d1e87d2f1383a9356f98..8e2552e7059e24ebbedb1ef2b67530e6780eb0cb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -28,7 +28,7 @@ python-dateutil pyworld resampy==0.2.2 sacrebleu -scipy==1.2.1 +scipy sentencepiece snakeviz soundfile~=0.10 @@ -44,3 +44,9 @@ visualdl==2.2.0 webrtcvad yacs yq +pypi-kenlm +GPUtil +psutil +pynvml +distro + diff --git a/setup.sh b/setup.sh new file mode 100644 index 0000000000000000000000000000000000000000..0bfacb548bfa6eb61bcb506c1fbc0a5acc185577 --- /dev/null +++ b/setup.sh @@ -0,0 +1,20 @@ +# Install conda dependencies +conda install -c conda-forge sox libsndfile swig bzip2 bottleneck gcc_linux-64=8.4.0 gxx_linux-64=8.4.0 --yes + +# Install the python lib +pip install -r requirements.txt + +# Install the auto_log +pushd tools/extras +bash install_autolog.sh +popd + +# Install the ctcdecoder +pushd paddlespeech/s2t/decoders/ctcdecoder/swig +bash -e setup.sh +popd + +# Install the python_speech_features +pushd third_party +bash -e install.sh +popd diff --git a/tests/benchmark/conformer/README.md b/tests/benchmark/conformer/README.md index 71d5f91b8f283fe65afed2cfdf54eb7691e56ed8..22e0009d4445820a9ca6a226a1978ac065d698a9 100644 --- a/tests/benchmark/conformer/README.md +++ b/tests/benchmark/conformer/README.md @@ -43,16 +43,6 @@ bash prepare.sh bash run.sh ``` -### Analyse the sp -``` -bash run_analysis_sp.sh -``` - -### Analyse the mp -``` -bash run_analysis_mp.sh -``` - ### The log ``` {"log_file": "recoder_sp_bs16_fp32_ngpu1.txt", diff --git a/tests/benchmark/conformer/analysis.py b/tests/benchmark/conformer/analysis.py deleted file mode 100644 index 610791c8cf11640a4d1142441cd1d349cf8b3be1..0000000000000000000000000000000000000000 --- a/tests/benchmark/conformer/analysis.py +++ /dev/null @@ -1,345 +0,0 @@ -# copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-from __future__ import print_function - -import argparse -import json -import re -import traceback - - -def parse_args(): - parser = argparse.ArgumentParser(description=__doc__) - parser.add_argument( - "--filename", type=str, help="The name of log which need to analysis.") - parser.add_argument( - "--log_with_profiler", - type=str, - help="The path of train log with profiler") - parser.add_argument( - "--profiler_path", type=str, help="The path of profiler timeline log.") - parser.add_argument( - "--keyword", type=str, help="Keyword to specify analysis data") - parser.add_argument( - "--separator", - type=str, - default=None, - help="Separator of different field in log") - parser.add_argument( - '--position', type=int, default=None, help='The position of data field') - parser.add_argument( - '--range', - type=str, - default="", - help='The range of data field to intercept') - parser.add_argument( - '--base_batch_size', type=int, help='base_batch size on gpu') - parser.add_argument( - '--skip_steps', - type=int, - default=0, - help='The number of steps to be skipped') - parser.add_argument( - '--model_mode', - type=int, - default=-1, - help='Analysis mode, default value is -1') - parser.add_argument('--ips_unit', type=str, default=None, help='IPS unit') - parser.add_argument( - '--model_name', - type=str, - default=0, - help='training model_name, transformer_base') - parser.add_argument( - '--mission_name', type=str, default=0, help='training mission name') - parser.add_argument( - '--direction_id', type=int, default=0, help='training direction_id') - parser.add_argument( - '--run_mode', - type=str, - default="sp", - help='multi process or single process') - parser.add_argument( - '--index', - type=int, - default=1, - help='{1: speed, 2:mem, 3:profiler, 6:max_batch_size}') - parser.add_argument( - '--gpu_num', type=int, default=1, help='nums of training gpus') - parser.add_argument( - '--use_num', type=int, default=1, help='nums of used recoders') - args = parser.parse_args() - args.separator = None if args.separator == "None" else args.separator - return args - - -def _is_number(num): - pattern = re.compile(r'^[-+]?[-0-9]\d*\.\d*|[-+]?\.?[0-9]\d*$') - result = pattern.match(num) - if result: - return True - else: - return False - - -class TimeAnalyzer(object): - def __init__(self, - filename, - keyword=None, - separator=None, - position=None, - range="-1"): - if filename is None: - raise Exception("Please specify the filename!") - - if keyword is None: - raise Exception("Please specify the keyword!") - - self.filename = filename - self.keyword = keyword - self.separator = separator - self.position = position - self.range = range - self.records = None - self._distil() - - def _distil(self): - self.records = [] - with open(self.filename, "r") as f_object: - lines = f_object.readlines() - for line in lines: - if self.keyword not in line: - continue - try: - result = None - - # Distil the string from a line. - line = line.strip() - line_words = line.split( - self.separator) if self.separator else line.split() - print("line_words", line_words) - if args.position: - result = line_words[self.position] - else: - # Distil the string following the keyword. - for i in range(len(line_words) - 1): - if line_words[i] == self.keyword: - result = line_words[i + 1] - break - - # Distil the result from the picked string. 
- if not self.range: - result = result[0:] - elif _is_number(self.range): - result = result[0:int(self.range)] - else: - result = result[int(self.range.split(":")[0]):int( - self.range.split(":")[1])] - self.records.append(float(result)) - except Exception as exc: - pass - #print("line is: {}; separator={}; position={}".format(line, self.separator, self.position)) - self.records.sort() - self.records = self.records[:args.use_num] - print("records", self.records) - print("Extract {} records: separator={}; position={}".format( - len(self.records), self.separator, self.position)) - - def _get_fps(self, - mode, - batch_size, - gpu_num, - avg_of_records, - run_mode, - unit=None): - if mode == -1 and run_mode == 'sp': - assert unit, "Please set the unit when mode is -1." - fps = gpu_num * avg_of_records - elif mode == -1 and run_mode == 'mp': - assert unit, "Please set the unit when mode is -1." - fps = gpu_num * avg_of_records #temporarily, not used now - print("------------this is mp") - elif mode == 0: - # s/step -> samples/s - fps = (batch_size * gpu_num) / avg_of_records - unit = "samples/s" - elif mode == 1: - # steps/s -> steps/s - fps = avg_of_records - unit = "steps/s" - elif mode == 2: - # s/step -> steps/s - fps = 1 / avg_of_records - unit = "steps/s" - elif mode == 3: - # steps/s -> samples/s - fps = batch_size * gpu_num * avg_of_records - unit = "samples/s" - elif mode == 4: - # s/epoch -> s/epoch - fps = avg_of_records - unit = "s/epoch" - else: - ValueError("Unsupported analysis mode.") - - return fps, unit - - def analysis(self, - batch_size, - gpu_num=1, - skip_steps=0, - mode=-1, - run_mode='sp', - unit=None): - if batch_size <= 0: - print("base_batch_size should larger than 0.") - return 0, '' - - if len( - self.records - ) <= skip_steps: # to address the condition which item of log equals to skip_steps - print("no records") - return 0, '' - - sum_of_records = 0 - sum_of_records_skipped = 0 - skip_min = self.records[skip_steps] - skip_max = self.records[skip_steps] - - count = len(self.records) - for i in range(count): - sum_of_records += self.records[i] - if i >= skip_steps: - sum_of_records_skipped += self.records[i] - if self.records[i] < skip_min: - skip_min = self.records[i] - if self.records[i] > skip_max: - skip_max = self.records[i] - - avg_of_records = sum_of_records / float(count) - avg_of_records_skipped = sum_of_records_skipped / float(count - - skip_steps) - - fps, fps_unit = self._get_fps(mode, batch_size, gpu_num, avg_of_records, - run_mode, unit) - fps_skipped, _ = self._get_fps(mode, batch_size, gpu_num, - avg_of_records_skipped, run_mode, unit) - if mode == -1: - print("average ips of %d steps, skip 0 step:" % count) - print("\tAvg: %.3f %s" % (avg_of_records, fps_unit)) - print("\tFPS: %.3f %s" % (fps, fps_unit)) - if skip_steps > 0: - print("average ips of %d steps, skip %d steps:" % - (count, skip_steps)) - print("\tAvg: %.3f %s" % (avg_of_records_skipped, fps_unit)) - print("\tMin: %.3f %s" % (skip_min, fps_unit)) - print("\tMax: %.3f %s" % (skip_max, fps_unit)) - print("\tFPS: %.3f %s" % (fps_skipped, fps_unit)) - elif mode == 1 or mode == 3: - print("average latency of %d steps, skip 0 step:" % count) - print("\tAvg: %.3f steps/s" % avg_of_records) - print("\tFPS: %.3f %s" % (fps, fps_unit)) - if skip_steps > 0: - print("average latency of %d steps, skip %d steps:" % - (count, skip_steps)) - print("\tAvg: %.3f steps/s" % avg_of_records_skipped) - print("\tMin: %.3f steps/s" % skip_min) - print("\tMax: %.3f steps/s" % skip_max) - print("\tFPS: %.3f %s" % 
(fps_skipped, fps_unit)) - elif mode == 0 or mode == 2: - print("average latency of %d steps, skip 0 step:" % count) - print("\tAvg: %.3f s/step" % avg_of_records) - print("\tFPS: %.3f %s" % (fps, fps_unit)) - if skip_steps > 0: - print("average latency of %d steps, skip %d steps:" % - (count, skip_steps)) - print("\tAvg: %.3f s/step" % avg_of_records_skipped) - print("\tMin: %.3f s/step" % skip_min) - print("\tMax: %.3f s/step" % skip_max) - print("\tFPS: %.3f %s" % (fps_skipped, fps_unit)) - - return round(fps_skipped, 3), fps_unit - - -if __name__ == "__main__": - args = parse_args() - run_info = dict() - run_info["log_file"] = args.filename - run_info["model_name"] = args.model_name - run_info["mission_name"] = args.mission_name - run_info["direction_id"] = args.direction_id - run_info["run_mode"] = args.run_mode - run_info["index"] = args.index - run_info["gpu_num"] = args.gpu_num - run_info["FINAL_RESULT"] = 0 - run_info["JOB_FAIL_FLAG"] = 0 - - try: - if args.index == 1: - if args.gpu_num == 1: - run_info["log_with_profiler"] = args.log_with_profiler - run_info["profiler_path"] = args.profiler_path - analyzer = TimeAnalyzer(args.filename, args.keyword, args.separator, - args.position, args.range) - run_info["FINAL_RESULT"], run_info["UNIT"] = analyzer.analysis( - batch_size=args.base_batch_size, - gpu_num=args.gpu_num, - skip_steps=args.skip_steps, - mode=args.model_mode, - run_mode=args.run_mode, - unit=args.ips_unit) - # if int(os.getenv('job_fail_flag')) == 1 or int(run_info["FINAL_RESULT"]) == 0: - # run_info["JOB_FAIL_FLAG"] = 1 - elif args.index == 3: - run_info["FINAL_RESULT"] = {} - records_fo_total = TimeAnalyzer(args.filename, 'Framework overhead', - None, 3, '').records - records_fo_ratio = TimeAnalyzer(args.filename, 'Framework overhead', - None, 5).records - records_ct_total = TimeAnalyzer(args.filename, 'Computation time', - None, 3, '').records - records_gm_total = TimeAnalyzer(args.filename, - 'GpuMemcpy Calls', - None, 4, '').records - records_gm_ratio = TimeAnalyzer(args.filename, - 'GpuMemcpy Calls', - None, 6).records - records_gmas_total = TimeAnalyzer(args.filename, - 'GpuMemcpyAsync Calls', - None, 4, '').records - records_gms_total = TimeAnalyzer(args.filename, - 'GpuMemcpySync Calls', - None, 4, '').records - run_info["FINAL_RESULT"]["Framework_Total"] = records_fo_total[ - 0] if records_fo_total else 0 - run_info["FINAL_RESULT"]["Framework_Ratio"] = records_fo_ratio[ - 0] if records_fo_ratio else 0 - run_info["FINAL_RESULT"][ - "ComputationTime_Total"] = records_ct_total[ - 0] if records_ct_total else 0 - run_info["FINAL_RESULT"]["GpuMemcpy_Total"] = records_gm_total[ - 0] if records_gm_total else 0 - run_info["FINAL_RESULT"]["GpuMemcpy_Ratio"] = records_gm_ratio[ - 0] if records_gm_ratio else 0 - run_info["FINAL_RESULT"][ - "GpuMemcpyAsync_Total"] = records_gmas_total[ - 0] if records_gmas_total else 0 - run_info["FINAL_RESULT"]["GpuMemcpySync_Total"] = records_gms_total[ - 0] if records_gms_total else 0 - else: - print("Not support!") - except Exception: - traceback.print_exc() - print("{}".format(json.dumps(run_info)) - ) # it's required, for the log file path insert to the database diff --git a/tests/benchmark/conformer/prepare.sh b/tests/benchmark/conformer/prepare.sh index 8f03fd1b988fb458a681d7c8612416cf9ef65895..c5fae06a59d41147c9aaa89f3074914e7ea9906f 100644 --- a/tests/benchmark/conformer/prepare.sh +++ b/tests/benchmark/conformer/prepare.sh @@ -1,5 +1,6 @@ -source ../../../tools/venv/bin/activate - +cd ../../../ +pip install -e . 
# install paddlespeech
+cd -
 #Enter the example dir
 pushd ../../../examples/aishell/s1
diff --git a/tests/benchmark/conformer/run.sh b/tests/benchmark/conformer/run.sh
index c09bbf09b0547f3f0214f85a437dae23f764df98..79beb4e961fc01d7b1d5a80e81d94289057c0398 100644
--- a/tests/benchmark/conformer/run.sh
+++ b/tests/benchmark/conformer/run.sh
@@ -1,8 +1,12 @@
 # Script for stable, reproducible benchmark runs; by default it is executed with py37 inside the standard docker image: paddlepaddle/paddle:latest-gpu-cuda10.1-cudnn7 paddle=2.1.2 py=37
 # Working directory: must be documented
-CUR_DIR=${PWD}
-source ../../../tools/venv/bin/activate
+CUR_DIR=${PWD}    # PaddleSpeech/tests/benchmark/conformer
+cd ../../../
+log_path=${LOG_PATH_INDEX_DIR:-$(pwd)}    # set by the benchmark system; when no profiling run is needed, log_path points to the directory that stores the speed logs
+cd ${CUR_DIR}
+sed -i '/set\ -xe/d' run_benchmark.sh
+
 #cd **
 pushd ../../../examples/aishell/s1
 # 1. install the dependencies required by this model (please note any optimization strategies that are enabled)
@@ -11,26 +15,33 @@ pushd ../../../examples/aishell/s1
 source path.sh
 source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
-
+mkdir -p conf/benchmark
+#yq e ".training.accum_grad=1" conf/conformer.yaml > conf/benchmark/conformer.yaml
+cp conf/conformer.yaml conf/benchmark/conformer.yaml
+sed -i "s/ accum_grad: 2/ accum_grad: 1/g" conf/benchmark/conformer.yaml
 fp_item_list=(fp32)
 bs_item=(16 30)
-config_path=conf/conformer.yaml
+config_path=conf/benchmark/conformer.yaml
 seed=0
 output=exp/conformer
 profiler_options=None
+model_item=conformer
 for fp_item in ${fp_item_list[@]}; do
-    for batch_size in ${bs_item[@]}
+    for bs_item in ${bs_item[@]}
     do
     rm exp -rf
+    log_name=speech_${model_item}_bs${bs_item}_${fp_item}    # e.g. clas_MobileNetv1_mp_bs32_fp32_8
     echo "index is speed, 8gpus, run_mode is multi_process, begin, conformer"
     run_mode=mp
     ngpu=8
-    CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash ${CUR_DIR}/run_benchmark.sh ${run_mode} ${config_path} ${output} ${seed} ${ngpu} ${profiler_options} ${batch_size} ${fp_item} ${CUR_DIR}
-    rm exp -rf
-    echo "index is speed, 1gpus, begin, conformer"
+    CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash ${CUR_DIR}/run_benchmark.sh ${run_mode} ${config_path} ${output} ${seed} ${ngpu} ${profiler_options} ${bs_item} ${fp_item} ${model_item} | tee ${log_path}/${log_name}_speed_8gpus8p 2>&1
+    sleep 60
+    log_name=speech_${model_item}_bs${bs_item}_${fp_item}    # e.g. clas_MobileNetv1_mp_bs32_fp32_8
+    echo "index is speed, 1gpus, begin, ${log_name}"
     run_mode=sp
     ngpu=1
-    CUDA_VISIBLE_DEVICES=0 bash ${CUR_DIR}/run_benchmark.sh ${run_mode} ${config_path} ${output} ${seed} ${ngpu} ${profiler_options} ${batch_size} ${fp_item} ${CUR_DIR}
+    CUDA_VISIBLE_DEVICES=0 bash ${CUR_DIR}/run_benchmark.sh ${run_mode} ${config_path} ${output} ${seed} ${ngpu} ${profiler_options} ${bs_item} ${fp_item} ${model_item} | tee ${log_path}/${log_name}_speed_1gpus 2>&1    # (5min)
+    sleep 60
     done
 done
diff --git a/tests/benchmark/conformer/run_benchmark.sh b/tests/benchmark/conformer/run_benchmark.sh
index c03a08f3b000e4d81649a392124ee1dbb445dace..56b63e76b1f23abf8f36c237dcd2232e20792d39 100644
--- a/tests/benchmark/conformer/run_benchmark.sh
+++ b/tests/benchmark/conformer/run_benchmark.sh
@@ -12,17 +12,24 @@ function _set_params(){
     profiler_options=${6:-"None"}
     batch_size=${7:-"32"}
     fp_item=${8:-"fp32"}
-    TRAIN_LOG_DIR=${9:-$(pwd)}
-
+    model_item=${9:-"conformer"}
     benchmark_max_step=0
-    run_log_path=${TRAIN_LOG_DIR:-$(pwd)} # TRAIN_LOG_DIR 后续QA设置该参数
+# parameters required for log parsing
+    base_batch_size=${batch_size}
+    mission_name="语音识别"
+    direction_id="1"
+    ips_unit="sent./sec"
+    skip_steps=10                  # for log parsing: the first few steps of some models are slow and must be skipped (required)
+    keyword="ips:"                 # for log parsing: keyword that selects the lines containing the data (required)
+    index="1"
+    model_name=${model_item}_bs${batch_size}_${fp_item}    # no changes needed below
    device=${CUDA_VISIBLE_DEVICES//,/ }
    arr=(${device})
    num_gpu_devices=${#arr[*]}
-    log_file=${run_log_path}/recoder_${run_mode}_bs${batch_size}_${fp_item}_ngpu${ngpu}.txt
+    log_file=${run_log_path}/recoder_${model_item}_${run_mode}_bs${batch_size}_${fp_item}_ngpu${ngpu}
 }
 function _train(){
@@ -36,11 +43,9 @@ function _train(){
               --benchmark-batch-size ${batch_size}
               --benchmark-max-step ${benchmark_max_step} "
-    echo "run_mode "${run_mode}
-
     case ${run_mode} in
-    sp) train_cmd="python3 -u ${BIN_DIR}/train.py "${train_cmd} ;;
-    mp) train_cmd="python3 -u ${BIN_DIR}/train.py "${train_cmd} ;;
+    sp) train_cmd="python -u ${BIN_DIR}/train.py "${train_cmd} ;;
+    mp) train_cmd="python -u ${BIN_DIR}/train.py "${train_cmd} ;;
     *) echo "choose run_mode(sp or mp)"; exit 1;
     esac
     echo ${train_cmd}
@@ -61,5 +66,8 @@ function _train(){
     fi
 }
+source ${BENCHMARK_ROOT}/scripts/run_model.sh    # this script parses performance data from benchmark-compliant logs with analysis.py; for joint debugging it can be downloaded from the benchmark repo: https://github.com/PaddlePaddle/benchmark/blob/master/scripts/run_model.sh; if you only want to produce training logs without parsing, you may comment this line out, but it must be enabled before submitting
 _set_params $@
-_train
+# _train    # uncomment if you only want to produce training logs without parsing
+_run    # this function is defined in run_model.sh and calls _train when executed; if you only want training logs without joint debugging, you may comment this line out, but it must be enabled before submitting
+
diff --git a/tools/extras/install_miniconda.sh b/tools/extras/install_miniconda.sh
index 3d1909af6f4f8a23e261e8983bf9ee6d1275cb4f..c6ee4b361ca7733d46ecb9b6d3b260199c190203 100755
--- a/tools/extras/install_miniconda.sh
+++ b/tools/extras/install_miniconda.sh
@@ -13,6 +13,8 @@
 else
 fi
 bash Miniconda3-latest-Linux-x86_64.sh -b
+$HOME/miniconda3/bin/conda init
+
 $HOME/miniconda3/bin/python -m pip install --user tqdm
 $HOME/miniconda3/bin/python -m pip install --user scikit-learn
 $HOME/miniconda3/bin/python -m pip install --user librosa