diff --git a/README.md b/README.md
index 532b24cc3a8b7ea67fb42dc5a445dd53db76b2af..da413001a51a74be9de4c70f6c1b8f23732a3597 100644
--- a/README.md
+++ b/README.md
@@ -110,7 +110,7 @@ Developers can have a try of our model with only a few lines of code.
 A tiny DeepSpeech2 **Speech-to-Text** model training on toy set of LibriSpeech:
 
 ```shell
-cd examples/tiny/s0/
+cd examples/tiny/asr0/
 # source the environment
 source path.sh
 source ../../../utils/parse_options.sh
@@ -177,20 +177,20 @@ The current hyperlinks redirect to [Previous Parakeet](https://github.com/Paddle
       <td rowspan="2" >Aishell</td>
       <td >DeepSpeech2 RNN + Conv based Models</td>
       <td>
-      <a href = "./examples/aishell/s0">deepspeech2-aishell</a>
+      <a href = "./examples/aishell/asr0">deepspeech2-aishell</a>
       </td>
     </tr>
     <tr>
       <td>Transformer based Attention Models </td>
       <td>
-      <a href = "./examples/aishell/s1">u2.transformer.conformer-aishell</a>
+      <a href = "./examples/aishell/asr1">u2.transformer.conformer-aishell</a>
       </td>
     </tr>
       <tr>
       <td> Librispeech</td>
       <td>Transformer based Attention Models </td>
       <td>
-      <a href = "./examples/librispeech/s0">deepspeech2-librispeech</a> / <a href = "./examples/librispeech/s1">transformer.conformer.u2-librispeech</a>  / <a href = "./examples/librispeech/s2">transformer.conformer.u2-kaldi-librispeech</a>
+      <a href = "./examples/librispeech/asr0">deepspeech2-librispeech</a> / <a href = "./examples/librispeech/asr1">transformer.conformer.u2-librispeech</a>  / <a href = "./examples/librispeech/asr2">transformer.conformer.u2-kaldi-librispeech</a>
       </td>
       </td>
     </tr>
@@ -199,7 +199,7 @@ The current hyperlinks redirect to [Previous Parakeet](https://github.com/Paddle
   <td>THCHS30</td>
   <td>MFA</td>
   <td>
-  <a href = ".examples/thchs30/a0">mfa-thchs30</a>
+  <a href = "./examples/thchs30/align0">mfa-thchs30</a>
   </td>
   </tr>
    <tr>
@@ -213,7 +213,7 @@ The current hyperlinks redirect to [Previous Parakeet](https://github.com/Paddle
       <td>TIMIT</td>
       <td>Unified Streaming & Non-streaming Two-pass</td>
       <td>
-    <a href = "./examples/timit/s1"> u2-timit</a>
+    <a href = "./examples/timit/asr1"> u2-timit</a>
       </td>
     </tr>
   </tbody>
diff --git a/dataset/ted_en_zh/ted_en_zh.py b/dataset/ted_en_zh/ted_en_zh.py
index 9a3ba3b31c2f7a9b9e050ceebaa8da9ace0ccb89..2d1fc67100e9fd2bef7cb0f0efc929b2ef075875 100644
--- a/dataset/ted_en_zh/ted_en_zh.py
+++ b/dataset/ted_en_zh/ted_en_zh.py
@@ -28,7 +28,7 @@ import soundfile
 
 parser = argparse.ArgumentParser(description=__doc__)
 parser.add_argument(
-    "--src_dir",
+    "--src-dir",
     default="",
     type=str,
     help="Directory to kaldi splited data. (default: %(default)s)")
diff --git a/demos/README.md b/demos/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..2183c1f2dec3487936837604125ee28c8355c674
--- /dev/null
+++ b/demos/README.md
@@ -0,0 +1 @@
+# Demos for PaddleSpeech
diff --git a/demos/style_fs2/README.md b/demos/style_fs2/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..b9b0469f8778538fff97acbbdc130d74c0c67d38
--- /dev/null
+++ b/demos/style_fs2/README.md
@@ -0,0 +1,8 @@
+# Style FastSpeech2
+You can change the `pitch`, `duration` and `energy` of `FastSpeech2`.
+
+Run the following command line to get started:
+```
+./run.sh
+```
+For more details, please see `style_syn.py`
diff --git a/docs/tutorial/tts/tts_tutorial.ipynb b/docs/tutorial/tts/tts_tutorial.ipynb
index 587924cda4a09606b29025c6bdac815fc5c6c47d..2bb407bebf401717e4b5b75d5e26492e88055229 100644
--- a/docs/tutorial/tts/tts_tutorial.ipynb
+++ b/docs/tutorial/tts/tts_tutorial.ipynb
@@ -252,7 +252,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": 25,
    "metadata": {
     "scrolled": true
    },
@@ -261,8 +261,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "The autoreload extension is already loaded. To reload it, use:\n",
-      "  %reload_ext autoreload\n"
+      "env: CUDA_VISIBLE_DEVICES=0\n"
      ]
     }
    ],
@@ -284,7 +283,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": 28,
    "metadata": {
     "scrolled": true
    },
@@ -317,7 +316,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 20,
+   "execution_count": 30,
    "metadata": {
     "scrolled": true
    },
@@ -596,11 +595,19 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 32,
+   "execution_count": 31,
    "metadata": {
     "scrolled": true
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Frontend done!\n"
+     ]
+    }
+   ],
    "source": [
     "# 传入 phones_dict 会把相应的 phones 转换成 phone_ids\n",
     "frontend = Frontend(phone_vocab_path=phones_dict)\n",
@@ -619,25 +626,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 23,
+   "execution_count": 35,
    "metadata": {
     "scrolled": true
    },
    "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Building prefix dict from the default dictionary ...\n",
-      "DEBUG:jieba:Building prefix dict from the default dictionary ...\n",
-      "Loading model from cache /tmp/jieba.cache\n",
-      "DEBUG:jieba:Loading model from cache /tmp/jieba.cache\n",
-      "Loading model cost 5.331 seconds.\n",
-      "DEBUG:jieba:Loading model cost 5.331 seconds.\n",
-      "Prefix dict has been built successfully.\n",
-      "DEBUG:jieba:Prefix dict has been built successfully.\n"
-     ]
-    },
     {
      "name": "stdout",
      "output_type": "stream",
@@ -701,8 +694,10 @@
     "<br></br>\n",
     "在本教程中，我们使用 `FastSpeech2` 作为声学模型。\n",
     "![FastSpeech2](source/fastspeech2.png)\n",
+    "\n",
     "PaddleSpeech TTS 实现的 FastSpeech2 与论文不同的地方在于，我们使用的的是 phone 级别的 `pitch` 和 `energy`(与 FastPitch 类似)。\n",
     "![FastPitch](source/fastpitch.png)\n",
+    "\n",
     "更多关于[声学模型的发展及改进](https://paddlespeech.readthedocs.io/en/latest/tts/models_introduction.html)。"
    ]
   },
@@ -1020,13 +1015,16 @@
     "odim = fastspeech2_config.n_mels\n",
     "model = FastSpeech2(\n",
     "    idim=vocab_size, odim=odim, **fastspeech2_config[\"model\"])\n",
-    "\n",
-    "model.set_state_dict(paddle.load(fastspeech2_checkpoint)[\"main_params\"]) # 加载预训练模型参数\n",
-    "model.eval() # 推理阶段不启用 batch norm 和 dropout\n",
+    "# 加载预训练模型参数\n",
+    "model.set_state_dict(paddle.load(fastspeech2_checkpoint)[\"main_params\"])\n",
+    "# 推理阶段不启用 batch norm 和 dropout\n",
+    "model.eval()\n",
     "stat = np.load(fastspeech2_stat)\n",
-    "mu, std = stat # 读取数据预处理阶段数据集的均值和标准差\n",
+    "# 读取数据预处理阶段数据集的均值和标准差\n",
+    "mu, std = stat\n",
     "mu, std = paddle.to_tensor(mu), paddle.to_tensor(std)\n",
-    "fastspeech2_normalizer = ZScore(mu, std) # 构造归一化的新模型\n",
+    "# 构造归一化的新模型\n",
+    "fastspeech2_normalizer = ZScore(mu, std)\n",
     "fastspeech2_inference = FastSpeech2Inference(fastspeech2_normalizer, model)\n",
     "fastspeech2_inference.eval()\n",
     "print(fastspeech2_inference)\n",
@@ -1153,16 +1151,18 @@
    ],
    "source": [
     "vocoder = PWGGenerator(**pwg_config[\"generator_params\"])\n",
-    "\n",
-    "vocoder.set_state_dict(paddle.load(pwg_checkpoint)[\"generator_params\"]) # 模型加载预训练参数\n",
+    "# 模型加载预训练参数\n",
+    "vocoder.set_state_dict(paddle.load(pwg_checkpoint)[\"generator_params\"]) \n",
     "vocoder.remove_weight_norm()\n",
-    "vocoder.eval() # 推理阶段不启用 batch norm 和 dropout\n",
-    "\n",
-    "stat = np.load(pwg_stat) # 读取数据预处理阶段数据集的均值和标准差\n",
+    "# 推理阶段不启用 batch norm 和 dropout\n",
+    "vocoder.eval()\n",
+    "# 读取数据预处理阶段数据集的均值和标准差\n",
+    "stat = np.load(pwg_stat)\n",
     "mu, std = stat\n",
     "mu, std = paddle.to_tensor(mu), paddle.to_tensor(std)\n",
     "pwg_normalizer = ZScore(mu, std)\n",
-    "pwg_inference = PWGInference(pwg_normalizer, vocoder) # 构建归一化的模型\n",
+    "# 构建归一化的模型\n",
+    "pwg_inference = PWGInference(pwg_normalizer, vocoder)\n",
     "pwg_inference.eval()\n",
     "print(\"Parallel WaveGAN done!\")"
    ]
@@ -1266,7 +1266,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 40,
+   "execution_count": 36,
    "metadata": {},
    "outputs": [
     {
diff --git a/examples/aishell/asr1/README.md b/examples/aishell/asr1/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..2eea233da021b90de3b96c7089c80c23228279bb
--- /dev/null
+++ b/examples/aishell/asr1/README.md
@@ -0,0 +1,341 @@
+# Transformer/Conformer ASR with Aishell
+
+This example contains code used to train a Transformer or [Conformer](http://arxiv.org/abs/2008.03802) model with [Aishell dataset](http://www.openslr.org/resources/33)
+
+## Overview
+
+All the scripts you need are in ```run.sh```. There are several stages in ```run.sh```, and each stage has its function.
+
+| Stage | Function                                                     |
+| :---- | :----------------------------------------------------------- |
+| 0     | Process data. It includes: <br>       (1) Download the dataset <br>       (2) Calculate the CMVN of the train dataset <br>       (3) Get the vocabulary file <br>       (4) Get the manifest files of the train, development and test dataset |
+| 1     | Train the model                                              |
+| 2     | Get the final model by averaging the top-k models; setting k = 1 selects the single best model |
+| 3     | Test the final model performance                             |
+| 4     | Get ctc alignment of test data using the final model         |
+| 5     | Infer the single audio file                                  |
+
+
+You can choose to run a range of stages by setting ```stage``` and ```stop_stage```.
+
+For example, if you want to execute the code in stage 2 and stage 3, you can run this script:
+
+```bash
+bash run.sh --stage 2 --stop_stage 3
+```
+
+Or you can set ```stage``` equal to ```stop_stage``` to only run one stage.
+For example, if you only want to run ```stage 0```, you can use the script below:
+
+```bash
+bash run.sh --stage 0 --stop_stage 0
+```
+
+
+
+The document below will describe the scripts in ```run.sh``` in detail.
+
+## The Environment Variables
+
+The path.sh contains the environment variables. 
+
+```bash
+source path.sh
+```
+
+This script needs to be run first. Another script is also needed:
+
+```bash
+source ${MAIN_ROOT}/utils/parse_options.sh
+```
+
+It allows options to be passed as ```--variable value``` in the shell scripts.
+
+
+
+## The Local Variables
+
+Some local variables are set in ```run.sh```. 
+```gpus``` denotes the GPU number you want to use. If you set ```gpus=```, it means you only use CPU. 
+
+```stage``` denotes the number of the stage you want to start from in the experiments.
+```stop_stage``` denotes the number of the stage you want to end at in the experiments.
+
+```conf_path``` denotes the config path of the model.
+
+```avg_num``` denotes the number K of top-K models you want to average to get the final model.
+
+```audio_file``` denotes the path of the single audio file you want to infer in stage 5.
+
+```ckpt``` denotes the checkpoint prefix of the model, e.g. "conformer"
+
+You can set the local variables (except ```ckpt```) when you use ```run.sh```
+
+For example, you can set ```gpus``` and ``avg_num`` on the command line:
+
+```bash
+bash run.sh --gpus 0,1 --avg_num 20
+```
+
+
+
+## Stage 0: Data Processing
+
+To use this example, you need to process the data first, and you can use stage 0 in ```run.sh``` to do this. The code is shown below:
+
+```bash
+ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+     # prepare data
+     bash ./local/data.sh || exit -1
+ fi
+```
+
+Stage 0 is for processing the data.
+
+If you only want to process the data, you can run:
+
+```bash
+bash run.sh --stage 0 --stop_stage 0
+```
+
+You can also just run these scripts in your command line.
+
+```bash
+source path.sh
+bash ./local/data.sh
+```
+
+After processing the data, the ``data`` directory will look like this:
+
+```bash
+data/
+|-- dev.meta
+|-- lang_char
+|   `-- vocab.txt
+|-- manifest.dev
+|-- manifest.dev.raw
+|-- manifest.test
+|-- manifest.test.raw
+|-- manifest.train
+|-- manifest.train.raw
+|-- mean_std.json
+|-- test.meta
+`-- train.meta
+```
+
+
+
+## Stage 1: Model Training
+
+If you want to train the model, you can use stage 1 in ```run.sh```. The code is shown below:
+
+```bash
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+     # train model, all `ckpt` under `exp` dir
+     CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt}
+ fi
+```
+
+If you want to train the model, you can use the script below to execute stage 0 and stage 1:
+
+```bash
+bash run.sh --stage 0 --stop_stage 1
+```
+
+or you can run these scripts in the command line (only use CPU).
+
+```bash
+source path.sh
+bash ./local/data.sh
+CUDA_VISIBLE_DEVICES= ./local/train.sh conf/conformer.yaml conformer
+```
+
+
+
+## Stage 2: Top-k Models Averaging
+
+After training the model, we need to get the final model for testing and inference. In every epoch, the model checkpoint is saved, so we can choose the best model from them based on the validation loss or we can sort them and average the parameters of the top-k models to get the final model. We can use stage 2 to do this, and the code is shown below:
+
+```bash
+ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+     # avg n best model
+     avg.sh best exp/${ckpt}/checkpoints ${avg_num}
+ fi
+```
+
+The ```avg.sh``` script is in ```../../../utils/```, which is defined in ```path.sh```.
+If you want to get the final model, you can use the script below to execute stage 0, stage 1, and stage 2:
+
+```bash
+bash run.sh --stage 0 --stop_stage 2
+```
+
+or you can run these scripts in the command line (only use CPU).
+
+```bash
+source path.sh
+bash ./local/data.sh
+CUDA_VISIBLE_DEVICES= ./local/train.sh conf/conformer.yaml conformer
+avg.sh best exp/conformer/checkpoints 20
+```
+
+
+
+## Stage 3: Model Testing
+
+The test stage is to evaluate the model performance. The code of test stage is shown below:
+
+```bash
+ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+     # test ckpt avg_n
+     CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
+ fi
+```
+
+If you want to train a model and test it, you can use the script below to execute stage 0, stage 1, stage 2, and stage 3 :
+
+```bash
+bash run.sh --stage 0 --stop_stage 3
+```
+
+or you can run these scripts in the command line (only use CPU).
+
+```bash
+source path.sh
+bash ./local/data.sh
+CUDA_VISIBLE_DEVICES= ./local/train.sh conf/conformer.yaml conformer
+avg.sh best exp/conformer/checkpoints 20
+CUDA_VISIBLE_DEVICES= ./local/test.sh conf/conformer.yaml exp/conformer/checkpoints/avg_20
+```
+
+
+
+## Pretrained Model
+
+You can get the pretrained transformer or conformer using the scripts below:
+
+```bash
+Conformer:
+wget https://deepspeech.bj.bcebos.com/release2.1/aishell/s1/aishell.release.tar.gz
+
+Chunk Conformer:
+wget https://deepspeech.bj.bcebos.com/release2.1/aishell/s1/aishell.chunk.release.tar.gz
+
+Transformer:
+wget https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/transformer.model.tar.gz
+
+```
+
+Use ```tar``` to unpack the model, and then you can use the script to test the model.
+
+For example:
+
+```
+wget https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/transformer.model.tar.gz
+tar xzvf transformer.model.tar.gz
+source path.sh
+# If you have already processed the data and got the manifest file, you can skip the following 2 steps
+bash local/data.sh --stage -1 --stop_stage -1
+bash local/data.sh --stage 2 --stop_stage 2
+
+CUDA_VISIBLE_DEVICES= ./local/test.sh conf/transformer.yaml exp/transformer/checkpoints/avg_20
+```
+
+
+
+The performance of the released models is shown below:
+
+### Conformer
+
+| Model     | Params | Config              | Augmentation     | Test set | Decode method          | Loss | CER      |
+| --------- | ------ | ------------------- | ---------------- | -------- | ---------------------- | ---- | -------- |
+| conformer | 47.07M | conf/conformer.yaml | spec_aug + shift | test     | attention              | -    | 0.059858 |
+| conformer | 47.07M | conf/conformer.yaml | spec_aug + shift | test     | ctc_greedy_search      | -    | 0.062311 |
+| conformer | 47.07M | conf/conformer.yaml | spec_aug + shift | test     | ctc_prefix_beam_search | -    | 0.062196 |
+| conformer | 47.07M | conf/conformer.yaml | spec_aug + shift | test     | attention_rescoring    | -    | 0.054694 |
+
+
+### Chunk Conformer
+
+You need to set `decoding.decoding_chunk_size=16` when decoding.
+
+| Model     | Params | Config                    | Augmentation     | Test set | Decode method          | Chunk Size & Left Chunks | Loss | CER      |
+| --------- | ------ | ------------------------- | ---------------- | -------- | ---------------------- | ------------------------ | ---- | -------- |
+| conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test     | attention              | 16, -1                   | -    | 0.061939 |
+| conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test     | ctc_greedy_search      | 16, -1                   | -    | 0.070806 |
+| conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test     | ctc_prefix_beam_search | 16, -1                   | -    | 0.070739 |
+| conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test     | attention_rescoring    | 16, -1                   | -    | 0.059400 |
+
+
+### Transformer 
+
+| Model       | Params | Config                | Augmentation | Test set | Decode method          | Loss              | CER      |
+| ----------- | ------ | --------------------- | ------------ | -------- | ---------------------- | ----------------- | -------- |
+| transformer | 31.95M | conf/transformer.yaml | spec_aug     | test     | attention              | 3.858648955821991 | 0.057293 |
+| transformer | 31.95M | conf/transformer.yaml | spec_aug     | test     | ctc_greedy_search      | 3.858648955821991 | 0.061837 |
+| transformer | 31.95M | conf/transformer.yaml | spec_aug     | test     | ctc_prefix_beam_search | 3.858648955821991 | 0.061685 |
+| transformer | 31.95M | conf/transformer.yaml | spec_aug     | test     | attention_rescoring    | 3.858648955821991 | 0.053844 |
+
+
+
+## Stage 4: CTC Alignment 
+
+If you want to get the alignment between the audio and the text, you can use the ctc alignment. The code of this stage is shown below:
+
+```bash
+ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+     # ctc alignment of test data
+     CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
+ fi
+```
+
+If you want to train the model, test it and do the alignment, you can use the script below to execute stage 0, stage 1, stage 2, stage 3, and stage 4:
+
+```bash
+bash run.sh --stage 0 --stop_stage 4
+```
+
+or if you only need to train a model and do the alignment, you can use these scripts to skip stage 3 (the test stage):
+
+```bash
+bash run.sh --stage 0 --stop_stage 2
+bash run.sh --stage 4 --stop_stage 4
+```
+
+or you can also use these scripts in the command line (only use CPU).
+
+```bash
+source path.sh
+bash ./local/data.sh
+CUDA_VISIBLE_DEVICES= ./local/train.sh conf/conformer.yaml conformer
+avg.sh best exp/conformer/checkpoints 20
+# test stage is optional
+CUDA_VISIBLE_DEVICES= ./local/test.sh conf/conformer.yaml exp/conformer/checkpoints/avg_20
+CUDA_VISIBLE_DEVICES= ./local/align.sh conf/conformer.yaml exp/conformer/checkpoints/avg_20
+```
+
+
+
+## Stage 5: Single Audio File Inference
+
+In some situations, you want to use the trained model to do the inference for the single audio file. You can use stage 5. The code is shown below
+
+```bash
+ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
+     # test a single .wav file
+     CUDA_VISIBLE_DEVICES=0 ./local/test_hub.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1
+ fi
+```
+
+you can train the model by yourself using ```bash run.sh --stage 0 --stop_stage 3```, or you can download the pretrained model through the script below:
+
+```bash
+wget https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/transformer.model.tar.gz
+tar xzvf transformer.model.tar.gz
+```
+
+You need to prepare an audio file, please confirm the sample rate of the audio is 16K. Assume the path of the audio file is ```data/test_audio.wav```, you can get the result by running the script below.
+
+```bash
+CUDA_VISIBLE_DEVICES= ./local/test_hub.sh conf/transformer.yaml exp/transformer/checkpoints/avg_20 data/test_audio.wav
+```
diff --git a/examples/aishell/asr1/run.sh b/examples/aishell/asr1/run.sh
index 1464f2c918cab7922297b907cfbb7530cd043cc8..d9c0ee3e002d1c869a400119487a9068cde53c6d 100644
--- a/examples/aishell/asr1/run.sh
+++ b/examples/aishell/asr1/run.sh
@@ -4,7 +4,7 @@ set -e
 
 gpus=0,1,2,3
 stage=0
-stop_stage=100
+stop_stage=50
 conf_path=conf/conformer.yaml
 avg_num=20
 audio_file=data/demo_01_03.wav
@@ -15,7 +15,6 @@ avg_ckpt=avg_${avg_num}
 ckpt=$(basename ${conf_path} | awk -F'.' '{print $1}')
 echo "checkpoint name ${ckpt}"
 
-
 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
     # prepare data
     bash ./local/data.sh || exit -1
@@ -41,18 +40,20 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
     CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
 fi
 
-# if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
-#     # export ckpt avg_n
-#     CUDA_VISIBLE_DEVICES=0 ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
-# fi
-
 # Optionally, you can add LM and test it with runtime.
-if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
+if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
     # test a single .wav file
     CUDA_VISIBLE_DEVICES=0 ./local/test_hub.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1
 fi
 
-if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then
+# Not supported yet!
+if [ ${stage} -le 51 ] && [ ${stop_stage} -ge 51 ]; then
+     # export ckpt avg_n
+     CUDA_VISIBLE_DEVICES=0 ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
+fi
+
+# Need further installation! Read the install.md to complete further installation
+if [ ${stage} -le 101 ] && [ ${stop_stage} -ge 101 ]; then
     echo "warning: deps on kaldi and srilm, please make sure installed."
     # train lm and build TLG
     ./local/tlg.sh --corpus aishell --lmtype srilm
diff --git a/examples/aishell3/vc1/README.md b/examples/aishell3/vc1/README.md
index 974b84cade3536da775fcfb2aab5ae0db3df0683..676678c999b9f0b08d963483f09fe22279da879b 100644
--- a/examples/aishell3/vc1/README.md
+++ b/examples/aishell3/vc1/README.md
@@ -21,7 +21,7 @@ We use [MFA2.x](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner)
 You can download from here [aishell3_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/with_tone/aishell3_alignment_tone.tar.gz), or train your own MFA model reference to [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) (use MFA1.x now) of our repo.
 
 ## Pretrained GE2E Model
-We use pretrained GE2E model to generate spwaker embedding for each sentence.
+We use pretrained GE2E model to generate speaker embedding for each sentence.
 
 Download pretrained GE2E model from here [ge2e_ckpt_0.3.zip](https://bj.bcebos.com/paddlespeech/Parakeet/released_models/ge2e/ge2e_ckpt_0.3.zip), and `unzip` it.
 
diff --git a/examples/csmsc/voc3/conf/default.yaml b/examples/csmsc/voc3/conf/default.yaml
index cc27220fca0a4d6dcad4aff463df4a6bfcc0c1eb..5dda835aee23c8eb68b536a2947a98675987f814 100644
--- a/examples/csmsc/voc3/conf/default.yaml
+++ b/examples/csmsc/voc3/conf/default.yaml
@@ -6,8 +6,7 @@
 # This configuration is based on full-band MelGAN but the hop size and sampling
 # rate is different from the paper (16kHz vs 24kHz). The number of iteraions
 # is not shown in the paper so currently we train 1M iterations (not sure enough
-# to converge). The optimizer setting is based on @dathudeptrai advice.
-# https://github.com/kan-bayashi/ParallelWaveGAN/issues/143#issuecomment-632539906
+# to converge). 
 
 ###########################################################
 #                FEATURE EXTRACTION SETTING               #
@@ -136,4 +135,4 @@ eval_interval_steps: 1000               # Interval steps to evaluate the network
 #                     OTHER SETTING                       #
 ###########################################################
 num_snapshots: 10                 # max number of snapshots to keep while training
-seed: 42                          # random seed for paddle, random, and np.random
\ No newline at end of file
+seed: 42                          # random seed for paddle, random, and np.random
diff --git a/examples/csmsc/voc3/conf/finetune.yaml b/examples/csmsc/voc3/conf/finetune.yaml
index 80ab6bed707b4cdf89a367634490a5bfba122fea..302274019d62a1134bdd4e0179e1e88e84d4d649 100644
--- a/examples/csmsc/voc3/conf/finetune.yaml
+++ b/examples/csmsc/voc3/conf/finetune.yaml
@@ -6,8 +6,7 @@
 # This configuration is based on full-band MelGAN but the hop size and sampling
 # rate is different from the paper (16kHz vs 24kHz). The number of iteraions
 # is not shown in the paper so currently we train 1M iterations (not sure enough
-# to converge). The optimizer setting is based on @dathudeptrai advice.
-# https://github.com/kan-bayashi/ParallelWaveGAN/issues/143#issuecomment-632539906
+# to converge). 
 
 ###########################################################
 #                FEATURE EXTRACTION SETTING               #
diff --git a/examples/librispeech/asr1/RESULTS.md b/examples/librispeech/asr1/RESULTS.md
index 19300adea6fb3bdf2743956209aa64268f2f1b3c..a5257c9d4e40bfdae1cf1a35ca1524b30ab9ae3a 100644
--- a/examples/librispeech/asr1/RESULTS.md
+++ b/examples/librispeech/asr1/RESULTS.md
@@ -21,7 +21,7 @@
 ## Transformer
 | Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER |  
 | --- | --- | --- | --- | --- | --- | --- | --- |
-| transformer | 32.52 M | conf/transformer.yaml | spec_aug  | test-clean | attention | 6.725063021977743 | 0.047417 |  
-| transformer | 32.52 M | conf/transformer.yaml | spec_aug  | test-clean | ctc_greedy_search | 6.725063021977743 | 0.053922 |  
-| transformer | 32.52 M | conf/transformer.yaml | spec_aug  | test-clean | ctc_prefix_beam_search | 6.725063021977743 | 0.053180 |  
-| transformer | 32.52 M | conf/transformer.yaml | spec_aug  | test-clean | attention_rescoring | 6.725063021977743 | 0.041026 |  
\ No newline at end of file
+| transformer | 32.52 M | conf/transformer.yaml | spec_aug  | test-clean | attention | 6.484564081827799 | 0.044355 |  
+| transformer | 32.52 M | conf/transformer.yaml | spec_aug  | test-clean | ctc_greedy_search | 6.484564081827799 | 0.050479 |  
+| transformer | 32.52 M | conf/transformer.yaml | spec_aug  | test-clean | ctc_prefix_beam_search | 6.484564081827799 | 0.049890 |  
+| transformer | 32.52 M | conf/transformer.yaml | spec_aug  | test-clean | attention_rescoring | 6.484564081827799 | 0.039200 |  
\ No newline at end of file
diff --git a/examples/librispeech/asr1/conf/transformer.yaml b/examples/librispeech/asr1/conf/transformer.yaml
index 1806f3fd67a212f7bfaf370a9ce2bce72e960290..0cc0dae63766c3947fab44ed5990ddf7310a0d2d 100644
--- a/examples/librispeech/asr1/conf/transformer.yaml
+++ b/examples/librispeech/asr1/conf/transformer.yaml
@@ -1,3 +1,40 @@
+# network architecture
+model:
+    cmvn_file: 
+    cmvn_file_type: "json"
+    # encoder related
+    encoder: transformer
+    encoder_conf:
+        output_size: 256    # dimension of attention
+        attention_heads: 4
+        linear_units: 2048  # the number of units of position-wise feed forward
+        num_blocks: 12      # the number of encoder blocks
+        dropout_rate: 0.1
+        positional_dropout_rate: 0.1
+        attention_dropout_rate: 0.0
+        input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
+        normalize_before: true
+
+    # decoder related
+    decoder: transformer
+    decoder_conf:
+        attention_heads: 4
+        linear_units: 2048
+        num_blocks: 6
+        dropout_rate: 0.1
+        positional_dropout_rate: 0.1
+        self_attention_dropout_rate: 0.0
+        src_attention_dropout_rate: 0.0
+
+    # hybrid CTC/attention
+    model_conf:
+        ctc_weight: 0.3
+        ctc_dropoutrate: 0.0
+        ctc_grad_norm_type: null 
+        lsm_weight: 0.1     # label smoothing option
+        length_normalized_loss: false
+
+
 # https://yaml.org/type/float.html
 data:
   train_manifest: data/manifest.train
@@ -36,43 +73,6 @@ collator:
   num_workers: 2
 
 
-# network architecture
-model:
-    cmvn_file: 
-    cmvn_file_type: "json"
-    # encoder related
-    encoder: transformer
-    encoder_conf:
-        output_size: 256    # dimension of attention
-        attention_heads: 4
-        linear_units: 2048  # the number of units of position-wise feed forward
-        num_blocks: 12      # the number of encoder blocks
-        dropout_rate: 0.1
-        positional_dropout_rate: 0.1
-        attention_dropout_rate: 0.0
-        input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8
-        normalize_before: true
-
-    # decoder related
-    decoder: transformer
-    decoder_conf:
-        attention_heads: 4
-        linear_units: 2048
-        num_blocks: 6
-        dropout_rate: 0.1
-        positional_dropout_rate: 0.1
-        self_attention_dropout_rate: 0.0
-        src_attention_dropout_rate: 0.0
-
-    # hybrid CTC/attention
-    model_conf:
-        ctc_weight: 0.3
-        ctc_dropoutrate: 0.0
-        ctc_grad_norm_type: null 
-        lsm_weight: 0.1     # label smoothing option
-        length_normalized_loss: false
-
-
 training:
   n_epoch: 120 
   accum_grad: 4
diff --git a/examples/librispeech/asr2/RESULTS.md b/examples/librispeech/asr2/RESULTS.md
index 41655565dc12b6fe5e1b1164e3f76ab426f0fd37..4cb2d2872e62774ebc4ffb1911a91d88725d25a0 100644
--- a/examples/librispeech/asr2/RESULTS.md
+++ b/examples/librispeech/asr2/RESULTS.md
@@ -3,9 +3,9 @@
 
 ## Transformer
 
-| Model | Params | GPUS | Averaged Model | Config | Augmentation| Loss |
-| --- | --- | --- | --- | --- | --- |  
-| transformer | 32.52 M | 8 Tesla V100-SXM2-32GB | 10-best val_loss | conf/transformer.yaml | spec_aug | 6.3197922706604 |
+| Model | Params | GPUS | Averaged Model | Config | Augmentation| Loss |  
+| :-: | :-: | :------------: | :------------: | :-: | :-: | :-: |     
+| transformer | 32.52M | 8 Tesla V100-SXM2-32GB | 10-best val_loss | conf/transformer.yaml | spec_aug | 6.3197922706604 |  
 
 ### Attention Rescore
 
diff --git a/examples/librispeech/asr2/conf/transformer.yaml b/examples/librispeech/asr2/conf/transformer.yaml
index 4a50183a9c84ce1470966e833f163efe38a960e5..3e9350abc0f118e152132e8c45e579fb29cbdc8a 100644
--- a/examples/librispeech/asr2/conf/transformer.yaml
+++ b/examples/librispeech/asr2/conf/transformer.yaml
@@ -31,7 +31,7 @@ model:
     model_conf:
         ctc_weight: 0.3
         ctc_dropoutrate: 0.0
-        ctc_grad_norm_type: batch
+        ctc_grad_norm_type: null 
         lsm_weight: 0.1     # label smoothing option
         length_normalized_loss: false
 
diff --git a/examples/ted_en_zh/st0/conf/transformer_joint_noam.yaml b/examples/ted_en_zh/st0/conf/transformer_mtl_noam.yaml
similarity index 99%
rename from examples/ted_en_zh/st0/conf/transformer_joint_noam.yaml
rename to examples/ted_en_zh/st0/conf/transformer_mtl_noam.yaml
index 8256f7160598b882eae9ca06a476ca3ffd85742e..7e886cca3e02509ec5a79ec708800ae6a49a6390 100644
--- a/examples/ted_en_zh/st0/conf/transformer_joint_noam.yaml
+++ b/examples/ted_en_zh/st0/conf/transformer_mtl_noam.yaml
@@ -86,7 +86,7 @@ training:
   scheduler_conf:
     warmup_steps: 25000
     lr_decay: 1.0
-  log_interval: 5
+  log_interval: 50
   checkpoint:
     kbest_n: 50
     latest_n: 5
diff --git a/examples/ted_en_zh/st0/local/data.sh b/examples/ted_en_zh/st0/local/data.sh
index 097cd3a85d9ae6a78857e2051223a80d10fc8862..c4de1749e93c31ee2e2b5eeddf1f305b34d22fc4 100755
--- a/examples/ted_en_zh/st0/local/data.sh
+++ b/examples/ted_en_zh/st0/local/data.sh
@@ -42,7 +42,7 @@ if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
     # generate manifests
     python3 ${TARGET_DIR}/ted_en_zh/ted_en_zh.py \
     --manifest_prefix="data/manifest" \
-    --src_dir="${data_dir}"
+    --src-dir="${data_dir}"
 
     echo "Complete raw data pre-process."
 fi
@@ -76,8 +76,9 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     --spm_vocab_size=${nbpe} \
     --spm_mode ${bpemode} \
     --spm_model_prefix ${bpeprefix} \
+    --spm_character_coverage 1. \
     --vocab_path="${dict_dir}/vocab.txt" \
-    --text_keys 'text' 'text1' \
+    --text_keys 'text' \
     --manifest_paths="data/manifest.train.raw"
 
     if [ $? -ne 0 ]; then
diff --git a/examples/ted_en_zh/st0/run.sh b/examples/ted_en_zh/st0/run.sh
index fb4bc33880b0013d05e19c734554a51348b6a484..bc5ee4e6048acacfb65aa76a189a7c6611a7f99b 100755
--- a/examples/ted_en_zh/st0/run.sh
+++ b/examples/ted_en_zh/st0/run.sh
@@ -5,7 +5,7 @@ source path.sh
 gpus=0,1,2,3
 stage=0
 stop_stage=100
-conf_path=conf/transformer_joint_noam.yaml
+conf_path=conf/transformer_mtl_noam.yaml
 avg_num=5
 data_path=./TED_EnZh # path to unzipped data
 source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
diff --git a/examples/ted_en_zh/st1/cmd.sh b/examples/ted_en_zh/st1/cmd.sh
new file mode 100644
index 0000000000000000000000000000000000000000..7b70ef5e06e550d466e41668a44a86263952262c
--- /dev/null
+++ b/examples/ted_en_zh/st1/cmd.sh
@@ -0,0 +1,89 @@
+# ====== About run.pl, queue.pl, slurm.pl, and ssh.pl ======
+# Usage: <cmd>.pl [options] JOB=1:<nj> <log> <command...>
+# e.g.
+#   run.pl --mem 4G JOB=1:10 echo.JOB.log echo JOB
+#
+# Options:
+#   --time <time>: Limit the maximum time to execute.
+#   --mem <mem>: Limit the maximum memory usage.
+#   --max-jobs-run <njob>: Limit the number of parallel jobs. This is ignored for non-array jobs.
+#   --num-threads <nthreads>: Specify the number of CPU cores.
+#   --gpu <ngpu>: Specify the number of GPU devices.
+#   --config: Change the configuration file from default.
+#
+# "JOB=1:10" is used for "array jobs" and it can control the number of parallel jobs.
+# The left string of "=", i.e. "JOB", is replaced by <N>(Nth job) in the command and the log file name,
+# e.g. "echo JOB" is changed to "echo 3" for the 3rd job and "echo 8" for 8th job respectively.
+# Note that the number must start with a positive number, so you can't use "JOB=0:10" for example.
+#
+# run.pl, queue.pl, slurm.pl, and ssh.pl have unified interface, not depending on its backend.
+# These options are mapping to specific options for each backend and
+# it is configured by "conf/queue.conf" and "conf/slurm.conf" by default.
+# If jobs failed, your configuration might be wrong for your environment.
+#
+#
+# The official documentation for run.pl, queue.pl, slurm.pl, and ssh.pl:
+#   "Parallelization in Kaldi": http://kaldi-asr.org/doc/queue.html
+# =========================================================~
+
+
+# Select the backend used by run.sh from "local", "sge", "slurm", or "ssh"
+cmd_backend='local'
+
+# Local machine, without any Job scheduling system
+if [ "${cmd_backend}" = local ]; then
+
+    # The other usage
+    export train_cmd="run.pl"
+    # Used for "*_train.py": "--gpu" is appended optionally by run.sh
+    export cuda_cmd="run.pl"
+    # Used for "*_recog.py"
+    export decode_cmd="run.pl"
+
+# "qsub" (SGE, Torque, PBS, etc.)
+elif [ "${cmd_backend}" = sge ]; then
+    # The default setting is written in conf/queue.conf.
+    # You must change "-q g.q" for the "queue" for your environment.
+    # To know the "queue" names, type "qhost -q"
+    # Note that to use "--gpu *", you have to setup "complex_value" for the system scheduler.
+
+    export train_cmd="queue.pl"
+    export cuda_cmd="queue.pl"
+    export decode_cmd="queue.pl"
+
+# "sbatch" (Slurm)
+elif [ "${cmd_backend}" = slurm ]; then
+    # The default setting is written in conf/slurm.conf.
+    # You must change "-p cpu" and "-p gpu" for the "partition" for your environment.
+    # To know the "partition" names, type "sinfo".
+    # You can use "--gpu * " by default for slurm and it is interpreted as "--gres gpu:*"
+    # The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}".
+
+    export train_cmd="slurm.pl"
+    export cuda_cmd="slurm.pl"
+    export decode_cmd="slurm.pl"
+
+elif [ "${cmd_backend}" = ssh ]; then
+    # You have to create ".queue/machines" to specify the host to execute jobs.
+    # e.g. .queue/machines
+    #   host1
+    #   host2
+    #   host3
+    # Assuming you can log in to them without a password, i.e. you have to set up ssh keys.
+
+    export train_cmd="ssh.pl"
+    export cuda_cmd="ssh.pl"
+    export decode_cmd="ssh.pl"
+
+# This is an example of specifying several unique options in the JHU CLSP cluster setup.
+# Users can modify/add their own command options according to their cluster environments.
+elif [ "${cmd_backend}" = jhu ]; then
+
+    export train_cmd="queue.pl --mem 2G"
+    export cuda_cmd="queue-freegpu.pl --mem 2G --gpu 1 --config conf/gpu.conf"
+    export decode_cmd="queue.pl --mem 4G"
+
+else
+    echo "$0: Error: Unknown cmd_backend=${cmd_backend}" 1>&2
+    return 1
+fi
diff --git a/examples/ted_en_zh/st1/conf/fbank.conf b/examples/ted_en_zh/st1/conf/fbank.conf
new file mode 100644
index 0000000000000000000000000000000000000000..82ac7bd0dbc50d716b9a901e9bd0d02186809f11
--- /dev/null
+++ b/examples/ted_en_zh/st1/conf/fbank.conf
@@ -0,0 +1,2 @@
+--sample-frequency=16000 
+--num-mel-bins=80
diff --git a/examples/ted_en_zh/st1/conf/pitch.conf b/examples/ted_en_zh/st1/conf/pitch.conf
new file mode 100644
index 0000000000000000000000000000000000000000..e959a19d5b8988562d539b98c905bc7129bcd4d6
--- /dev/null
+++ b/examples/ted_en_zh/st1/conf/pitch.conf
@@ -0,0 +1 @@
+--sample-frequency=16000
diff --git a/examples/ted_en_zh/st1/conf/transformer_mtl_noam.yaml b/examples/ted_en_zh/st1/conf/transformer_mtl_noam.yaml
index b4fb510754713f6c63f5bce9561032225b31c6c5..3175aad9f7b36d340ce30a56a6a00d4f0291491c 100644
--- a/examples/ted_en_zh/st1/conf/transformer_mtl_noam.yaml
+++ b/examples/ted_en_zh/st1/conf/transformer_mtl_noam.yaml
@@ -11,9 +11,9 @@ data:
   max_output_input_ratio: 20.0
 
 collator:
-  vocab_filepath: data/lang_char/vocab.txt
+  vocab_filepath: data/lang_char/ted_en_zh_bpe8000.txt
   unit_type: 'spm'
-  spm_model_prefix: data/train_sp.en-zh-nlpr.zh-nlpr_bpe8000_tc
+  spm_model_prefix: data/lang_char/ted_en_zh_bpe8000
   mean_std_filepath: ""
   # augmentation_config: conf/augmentation.json
   batch_size: 10
diff --git a/examples/ted_en_zh/st1/local/data.sh b/examples/ted_en_zh/st1/local/data.sh
index aa958cfde38fadebc8d6e042cb9c1ce9ce21db16..f9c876b1631d069dadcba5a547afa9505f66675a 100755
--- a/examples/ted_en_zh/st1/local/data.sh
+++ b/examples/ted_en_zh/st1/local/data.sh
@@ -8,10 +8,13 @@ dict_dir=data/lang_char
 
 # bpemode (unigram or bpe)
 nbpe=8000
-bpemode=unigram
+bpemode=bpe
 bpeprefix="${dict_dir}/bpe_${bpemode}_${nbpe}"
 data_dir=./TED_EnZh
-
+target_dir=data/ted_en_zh
+dumpdir=data/dump
+do_delta=false
+nj=20
 
 source ${MAIN_ROOT}/utils/parse_options.sh
 
@@ -38,75 +41,167 @@ if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
         exit 1
     fi
 
-    # generate manifests
-    python3 ${TARGET_DIR}/ted_en_zh/ted_en_zh.py \
-    --manifest_prefix="data/manifest" \
-    --src_dir="${data_dir}"
+    # extract data 
+    echo "data Extraction"
+    python3 local/ted_en_zh.py \
+    --tgt-dir=${target_dir} \
+    --src-dir=${data_dir}
 
-    echo "Complete raw data pre-process."
 fi
-
+prep_dir=${target_dir}/data_prep 
 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
-    # compute mean and stddev for normalizer
-    num_workers=$(nproc)
-    python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
-    --manifest_path="data/manifest.train.raw" \
-    --num_samples=-1 \
-    --spectrum_type="fbank" \
-    --feat_dim=80 \
-    --delta_delta=false \
-    --sample_rate=16000 \
-    --stride_ms=10.0 \
-    --window_ms=25.0 \
-    --use_dB_normalization=False \
-    --num_workers=${num_workers} \
-    --output_path="data/mean_std.json"
-
-    if [ $? -ne 0 ]; then
-        echo "Compute mean and stddev failed. Terminated."
-        exit 1
-    fi
+    ### Task dependent. You have to do the following data preparation part by yourself.
+    ### But you can utilize Kaldi recipes in most cases
+    echo "stage 0: Data preparation"
+    for set in train dev test; do
+    # for set in train; do
+        dst=${target_dir}/${set}
+        for lang in en zh; do
+
+            if [ ${lang} = 'en' ]; then
+                echo "remove punctuation $lang"
+                # remove punctuation
+                local/remove_punctuation.pl < ${dst}/${lang}.org > ${dst}/${lang}.raw
+            else
+                cp ${dst}/${lang}.org ${dst}/${lang}.raw
+            fi
+
+            paste -d " " ${dst}/.yaml ${dst}/${lang}.raw | sort > ${dst}/text.${lang}
+
+
+        done
+        # error check
+        n=$(cat ${dst}/.yaml | wc -l)
+        n_en=$(cat ${dst}/en.raw | wc -l)
+        n_tgt=$(cat ${dst}/zh.raw | wc -l)
+        [ ${n} -ne ${n_en} ] && echo "Warning: expected ${n} data data files, found ${n_en}" && exit 1;
+        [ ${n} -ne ${n_tgt} ] && echo "Warning: expected ${n} data data files, found ${n_tgt}" && exit 1;
+
+        echo "done text processing"
+        cat ${dst}/wav.scp.org | uniq | sort -k1,1 -u > ${dst}/wav.scp
+        cat ${dst}/utt2spk.org | uniq | sort -k1,1 -u > ${dst}/utt2spk
+
+        cat ${dst}/utt2spk | utils/utt2spk_to_spk2utt.pl | sort -k1,1 -u > ${dst}/spk2utt
+        rm -rf ${prep_dir}/${set}.en-zh
+        mkdir -p ${prep_dir}/${set}.en-zh
+        echo "remove duplicate lines..."
+        cut -d ' ' -f 1 ${dst}/text.en | sort | uniq -c | sort -n -k1 -r | grep -v '1 ted-en-zh' \
+            | sed 's/^[ \t]*//' > ${dst}/duplicate_lines
+        cut -d ' ' -f 1 ${dst}/text.en | sort | uniq -c | sort -n -k1 -r | grep '1 ted-en-zh' \
+            | cut -d '1' -f 2- | sed 's/^[ \t]*//' > ${dst}/reclist
+        reduce_data_dir.sh ${dst} ${dst}/reclist ${prep_dir}/${set}.en-zh
+        echo "done wav processing"
+        for l in en zh; do
+            cp ${dst}/text.${l} ${prep_dir}/${set}.en-zh/text.${l}
+        done
+        utils/fix_data_dir.sh --utt_extra_files \
+        "text.en text.zh" \
+        ${prep_dir}/${set}.en-zh
+    done
 fi
 
+feat_tr_dir=${dumpdir}/train_sp/delta${do_delta}; mkdir -p ${feat_tr_dir}
+feat_dt_dir=${dumpdir}/dev/delta${do_delta}; mkdir -p ${feat_dt_dir}
+feat_trans_dir=${dumpdir}/test/delta${do_delta}; mkdir -p ${feat_trans_dir}
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
-    # build vocabulary
-    python3 ${MAIN_ROOT}/utils/build_vocab.py \
-    --unit_type "spm" \
-    --spm_vocab_size=${nbpe} \
-    --spm_mode ${bpemode} \
-    --spm_model_prefix ${bpeprefix} \
-    --vocab_path="${dict_dir}/vocab.txt" \
-    --text_keys 'text' 'text1' \
-    --manifest_paths="data/manifest.train.raw"
-
-
-    if [ $? -ne 0 ]; then
-        echo "Build vocabulary failed. Terminated."
-        exit 1
-    fi
+    ### Task dependent. You have to design training and dev sets by yourself.
+    ### But you can utilize Kaldi recipes in most cases
+    echo "stage 1: Feature Generation"
+    fbankdir=data/fbank
+    # Generate the fbank features; by default 80-dimensional fbanks with pitch on each frame
+    for x in train dev test; do
+        steps/make_fbank_pitch.sh --cmd "$train_cmd" --nj ${nj} --write_utt2num_frames true \
+            ${prep_dir}/${x}.en-zh data/make_fbank/${x} ${fbankdir}
+    done
+    
+    echo "speed perturbation"
+    utils/perturb_data_dir_speed.sh 0.9 ${prep_dir}/train.en-zh ${prep_dir}/temp1.en-zh
+    utils/perturb_data_dir_speed.sh 1.0 ${prep_dir}/train.en-zh ${prep_dir}/temp2.en-zh
+    utils/perturb_data_dir_speed.sh 1.1 ${prep_dir}/train.en-zh ${prep_dir}/temp3.en-zh
+
+    utils/combine_data.sh --extra-files utt2uniq ${prep_dir}/train_sp.en-zh \
+    ${prep_dir}/temp1.en-zh ${prep_dir}/temp2.en-zh ${prep_dir}/temp3.en-zh
+    rm -r ${prep_dir}/temp*.en-zh 
+    utils/fix_data_dir.sh ${prep_dir}/train_sp.en-zh
+
+    steps/make_fbank_pitch.sh --cmd "$train_cmd" --nj ${nj} --write_utt2num_frames true \
+        ${prep_dir}/train_sp.en-zh exp/make_fbank/train_sp.en-zh ${fbankdir}
+
+    for lang in en zh; do
+        cat /dev/null > ${prep_dir}/train_sp.en-zh/text.${lang}
+        for p in "sp0.9-" "sp1.0-" "sp1.1-"; do
+            awk -v p=${p} '{printf("%s %s%s\n", $1, p, $1);}' ${prep_dir}/train.en-zh/utt2spk > ${prep_dir}/train_sp.en-zh/utt_map
+            utils/apply_map.pl -f 1 ${prep_dir}/train_sp.en-zh/utt_map < ${prep_dir}/train.en-zh/text.${lang} >>${prep_dir}/train_sp.en-zh/text.${lang}
+        done
+    done
+
+    for x in train_sp dev test; do
+        local/divide_lang.sh ${prep_dir}/${x}.en-zh zh
+    done
+
+    for x in train_sp dev; do
+        # drop utterances with more than 3000 frames or more than 400 characters (per language),
+        # then keep only the utterance ids that survived the filter in BOTH en and zh
+        for lang in zh en; do
+            remove_longshortdata.sh --maxframes 3000 --maxchars 400 ${prep_dir}/${x}.en-zh.${lang} ${prep_dir}/${x}.en-zh.${lang}.tmp
+        done
+        cut -f 1 -d " " ${prep_dir}/${x}.en-zh.en.tmp/text > ${prep_dir}/${x}.en-zh.zh.tmp/reclist1
+        cut -f 1 -d " " ${prep_dir}/${x}.en-zh.zh.tmp/text > ${prep_dir}/${x}.en-zh.zh.tmp/reclist2
+        comm -12 ${prep_dir}/${x}.en-zh.zh.tmp/reclist1 ${prep_dir}/${x}.en-zh.zh.tmp/reclist2 > ${prep_dir}/${x}.en-zh.en.tmp/reclist
+
+        for lang in zh en; do
+            reduce_data_dir.sh ${prep_dir}/${x}.en-zh.${lang}.tmp ${prep_dir}/${x}.en-zh.en.tmp/reclist ${prep_dir}/${x}.en-zh.${lang}
+            utils/fix_data_dir.sh  ${prep_dir}/${x}.en-zh.${lang}
+        done
+        rm -rf ${prep_dir}/${x}.en-zh.*.tmp
+    done
+
+    compute-cmvn-stats scp:${prep_dir}/train_sp.en-zh.zh/feats.scp ${prep_dir}/train_sp.en-zh.zh/cmvn.ark
+
+    dump.sh --cmd "$train_cmd" --nj ${nj} --do_delta $do_delta \
+        ${prep_dir}/train_sp.en-zh.zh/feats.scp ${prep_dir}/train_sp.en-zh.zh/cmvn.ark ${prep_dir}/dump_feats/train_sp.en-zh.zh ${feat_tr_dir}
+    dump.sh --cmd "$train_cmd" --nj ${nj} --do_delta $do_delta \
+        ${prep_dir}/dev.en-zh.zh/feats.scp ${prep_dir}/train_sp.en-zh.zh/cmvn.ark ${prep_dir}/dump_feats/dev.en-zh.zh ${feat_dt_dir}
+    dump.sh --cmd "$train_cmd" --nj ${nj} --do_delta $do_delta \
+        ${prep_dir}/test.en-zh.zh/feats.scp ${prep_dir}/train_sp.en-zh.zh/cmvn.ark ${prep_dir}/dump_feats/test.en-zh.zh ${feat_trans_dir}
 fi
 
+dict=${dict_dir}/ted_en_zh_${bpemode}${nbpe}.txt
+nlsyms=${dict_dir}/ted_en_zh_non_lang_syms.txt
+bpemodel=${dict_dir}/ted_en_zh_${bpemode}${nbpe}
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
-    # format manifest with tokenids, vocab size
-    for set in train dev test; do
-    {
-        python3 ${MAIN_ROOT}/utils/format_triplet_data.py \
-        --feat_type "raw" \
-        --cmvn_path "data/mean_std.json" \
-        --unit_type "spm" \
-        --spm_model_prefix ${bpeprefix} \
-        --vocab_path="${dict_dir}/vocab.txt" \
-        --manifest_path="data/manifest.${set}.raw" \
-        --output_path="data/manifest.${set}"
-
-        if [ $? -ne 0 ]; then
-            echo "Formt mnaifest failed. Terminated."
-            exit 1
-        fi
-    }&
+    echo "stage 2: Dictionary and Json Data Preparation"
+
+    echo "make a joint source and target dictionary"
+    echo "<unk> 1" > ${dict} # <unk> must be 1, 0 will be used for "blank" in CTC
+    offset=$(wc -l < ${dict})
+    grep sp1.0 ${prep_dir}/train_sp.en-zh.*/text | cut -f 2- -d' ' | grep -v -e '^\s*$' > ${dict_dir}/input.txt
+    spm_train  --input=${dict_dir}/input.txt --vocab_size=${nbpe} --model_type=${bpemode} --model_prefix=${bpemodel} --input_sentence_size=100000000 --character_coverage=1.0
+    spm_encode --model=${bpemodel}.model --output_format=piece < ${dict_dir}/input.txt | tr ' ' '\n' | sort | uniq | awk -v offset=${offset} '{print $0 " " NR+offset}' >> ${dict}
+    wc -l ${dict}
+
+    echo "make json files"
+    data2json.sh --nj ${nj} --feat ${feat_tr_dir}/feats.scp --text ${prep_dir}/train_sp.en-zh.zh/text --bpecode ${bpemodel}.model --lang zh \
+        ${prep_dir}/train_sp.en-zh.zh ${dict} > ${feat_tr_dir}/data_${bpemode}${nbpe}.json
+    data2json.sh --feat ${feat_dt_dir}/feats.scp --text ${prep_dir}/dev.en-zh.zh/text --bpecode ${bpemodel}.model --lang zh \
+        ${prep_dir}/dev.en-zh.zh ${dict} > ${feat_dt_dir}/data_${bpemode}${nbpe}.json
+    data2json.sh --feat ${feat_trans_dir}/feats.scp --text ${prep_dir}/test.en-zh.zh/text --bpecode ${bpemodel}.model --lang zh \
+        ${prep_dir}/test.en-zh.zh ${dict} > ${feat_trans_dir}/data_${bpemode}${nbpe}.json
+    echo "update json (add source references)"
+    # update json (add source references)
+    for x in train_sp dev; do
+        feat_dir=${dumpdir}/${x}/delta${do_delta}
+        data_dir=${prep_dir}/$(echo ${x} | cut -f 1 -d ".").en-zh.en
+        update_json.sh --text ${data_dir}/text --bpecode ${bpemodel}.model \
+            ${feat_dir}/data_${bpemode}${nbpe}.json ${data_dir} ${dict}
     done
-    wait
 fi
 
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+    echo "stage 3: Format the Json Data"
+    python3 local/espnet_json_to_manifest.py --json-file ${feat_tr_dir}/data_${bpemode}${nbpe}.json --manifest-file data/manifest.train
+    python3 local/espnet_json_to_manifest.py --json-file ${feat_dt_dir}/data_${bpemode}${nbpe}.json --manifest-file data/manifest.dev
+    python3 local/espnet_json_to_manifest.py --json-file ${feat_trans_dir}/data_${bpemode}${nbpe}.json --manifest-file data/manifest.test
+fi
 echo "Ted En-Zh Data preparation done."
 exit 0
diff --git a/examples/ted_en_zh/st1/local/divide_lang.sh b/examples/ted_en_zh/st1/local/divide_lang.sh
new file mode 100755
index 0000000000000000000000000000000000000000..4e5f85c86ba8bcd7f962db5cc2ca9d88d0f65ff9
--- /dev/null
+++ b/examples/ted_en_zh/st1/local/divide_lang.sh
@@ -0,0 +1,48 @@
+#!/bin/bash
+
+# Copyright 2019 Kyoto University (Hirofumi Inaguma)
+#           2021 PaddlePaddle
+#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
+
+. ./path.sh
+
+if [ "$#" -ne 2 ]; then  # both the data directory and the target language are required
+    echo "Usage: $0 <set> <lang>"
+    echo "e.g.: $0 data/dev.en-zh zh"
+    exit 1
+fi
+
+set=$1
+lang=$2
+export LC_ALL=en_US.UTF-8
+# Copy stuff into its final locations [this has been moved from the format_data script]
+# for En
+mkdir -p ${set}.en
+for f in spk2utt utt2spk segments wav.scp feats.scp utt2num_frames; do
+    if [ -f ${set}/${f} ]; then
+        sort ${set}/${f} > ${set}.en/${f}
+    fi
+done
+sort ${set}/text.en | sed $'s/[^[:print:]]//g' > ${set}.en/text 
+
+utils/fix_data_dir.sh ${set}.en
+if [ -f ${set}.en/feats.scp ]; then
+    utils/validate_data_dir.sh ${set}.en || exit 1;
+else
+    utils/validate_data_dir.sh --no-feats --no-wav ${set}.en || exit 1;
+fi
+
+# for target language
+mkdir -p ${set}.${lang}
+for f in spk2utt utt2spk segments wav.scp feats.scp utt2num_frames; do
+    if [ -f ${set}/${f} ]; then
+        sort ${set}/${f} > ${set}.${lang}/${f}
+    fi
+done
+sort ${set}/text.${lang} | sed $'s/[^[:print:]]//g' > ${set}.${lang}/text 
+utils/fix_data_dir.sh  ${set}.${lang}
+if [ -f ${set}.${lang}/feats.scp ]; then
+    utils/validate_data_dir.sh ${set}.${lang} || exit 1;
+else
+    utils/validate_data_dir.sh --no-feats --no-wav ${set}.${lang} || exit 1;
+fi
diff --git a/examples/ted_en_zh/st1/local/espnet_json_to_manifest.py b/examples/ted_en_zh/st1/local/espnet_json_to_manifest.py
new file mode 100644
index 0000000000000000000000000000000000000000..60d254367919cc1f83afe2d9da7a0f97d8732bab
--- /dev/null
+++ b/examples/ted_en_zh/st1/local/espnet_json_to_manifest.py
@@ -0,0 +1,27 @@
+#!/usr/bin/env python
+import argparse
+import json
+
+
+def main(args):
+    with open(args.json_file, 'r') as fin:
+        data_json = json.load(fin)
+
+    with open(args.manifest_file, 'w') as fout:
+        for key, value in data_json['utts'].items():
+            value['utt'] = key
+            fout.write(json.dumps(value, ensure_ascii=False))
+            fout.write("\n")
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument(
+        '--json-file', type=str, default=None, help="espnet data json file.")
+    parser.add_argument(
+        '--manifest-file',
+        type=str,
+        default='manifest.train',
+        help='manifest data json line file.')
+    args = parser.parse_args()
+    main(args)
diff --git a/examples/ted_en_zh/st1/local/remove_punctuation.pl b/examples/ted_en_zh/st1/local/remove_punctuation.pl
new file mode 100755
index 0000000000000000000000000000000000000000..89e19c6f4f566119555d149724f6f404b1d56b5d
--- /dev/null
+++ b/examples/ted_en_zh/st1/local/remove_punctuation.pl
@@ -0,0 +1,25 @@
+#!/usr/bin/perl
+
+use warnings;
+use strict;
+
+binmode(STDIN,":utf8");
+binmode(STDOUT,":utf8");
+
+while(<STDIN>) {
+  $_ = " $_ ";
+
+  # remove punctuation except apostrophe
+  s/<space>/spacemark/g;  # for scoring
+  s/'/apostrophe/g;
+  s/[[:punct:]]//g;
+  s/apostrophe/'/g;
+  s/spacemark/<space>/g;  # for scoring
+
+  # remove whitespace
+  s/\s+/ /g;
+  s/^\s+//;
+  s/\s+$//;
+
+  print "$_\n";
+}
diff --git a/examples/ted_en_zh/st1/local/ted_en_zh.py b/examples/ted_en_zh/st1/local/ted_en_zh.py
new file mode 100644
index 0000000000000000000000000000000000000000..f30573b7e0aeba8e1c9d83b62064b863fd0a79e7
--- /dev/null
+++ b/examples/ted_en_zh/st1/local/ted_en_zh.py
@@ -0,0 +1,104 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import codecs
+import os
+
+
+# org_split = 'train-split/train-segment'
+# text_file = 'En-Zh/train.en-zh'
+# data_split = 'train'
+def data_process(src_dir, tgt_dir, wav_dir_list, text_file_list,
+                 data_split_list):
+
+    for org_split, text_file, data_split in zip(wav_dir_list, text_file_list,
+                                                data_split_list):
+        local_data_split_dir = os.path.join(tgt_dir, data_split)
+
+        os.makedirs(local_data_split_dir, exist_ok=True)
+        utts = []
+        utt2spk = {}
+        with open(os.path.join(local_data_split_dir, 'wav.scp.org'), 'w') as wav_wf, \
+            open(os.path.join(local_data_split_dir, 'utt2spk.org'), 'w') as utt2spk_wf:
+            for files in os.listdir(os.path.join(src_dir, org_split)):
+                files = files.strip()
+                file_path = os.path.join(src_dir, org_split, files)
+                size = os.path.getsize(file_path)
+                if size <= 30000:
+                    continue
+                utt = files.split('.')[0]
+                audio_name = utt.split('_')[0]
+                # left-pad the audio name (and the utterance id with it) with zeros to at least 6 characters
+                while len(audio_name) < 6:
+                    utt = '0' + utt
+                    audio_name = '0' + audio_name
+                utt = 'ted-en-zh-' + utt
+                utts.append(utt)
+                spk = utt.split('_')[0]
+                utt2spk[utt] = spk
+                assert len(spk) == 16, "%r" % spk
+                print(utt, 'cat', os.path.abspath(file_path), '|', file=wav_wf)
+            for utt in sorted(utts):
+                print(utt, utt2spk[utt], file=utt2spk_wf)
+
+        with open(os.path.join(local_data_split_dir, 'en.org'), 'w') as en_wf, \
+            open(os.path.join(local_data_split_dir, 'zh.org'), 'w') as zh_wf, \
+            open(os.path.join(local_data_split_dir, '.yaml'), 'w') as yaml_wf, \
+            codecs.open(os.path.join(src_dir, text_file), 'r', encoding='utf-8',
+                        errors='ignore') as rf:
+            count = 0
+            for line in rf:
+                line = line.strip()
+                line_spl = line.split('\t')
+                assert len(line_spl) == 3, "%r" % line
+                wav, en, zh = line_spl
+                assert wav.endswith('wav'), "%r" % wav[-3:]
+                utt = wav.split('.')[0]
+                audio_name = utt.split('_')[0]
+                while len(audio_name) < 6:
+                    utt = '0' + utt
+                    audio_name = '0' + audio_name
+                utt = 'ted-en-zh-' + utt
+                print(utt, file=yaml_wf)
+                print(en.lower(), file=en_wf)
+                print(zh, file=zh_wf)
+                count += 1
+            print('%s set lines count: %d' % (data_split, count))
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description=__doc__)
+
+    parser.add_argument(
+        "--src-dir",
+        default="",
+        type=str,
+        help="Directory to kaldi splited data. (default: %(default)s)")
+    parser.add_argument(
+        "--tgt-dir",
+        default="local/ted_en_zh",
+        type=str,
+        help="Directory to save processed data. (default: %(default)s)")
+    args = parser.parse_args()
+
+    wav_dir_list = [
+        'train-split/train-segment', 'test-segment/tst2014',
+        'test-segment/tst2015'
+    ]
+    text_file_list = [
+        'En-Zh/train.en-zh', 'En-Zh/tst2014.en-zh', 'En-Zh/tst2015.en-zh'
+    ]
+    data_split_list = ['train', 'dev', 'test']
+    data_process(args.src_dir, args.tgt_dir, wav_dir_list, text_file_list,
+                 data_split_list)
diff --git a/examples/ted_en_zh/st1/local/train_finetune.sh b/examples/ted_en_zh/st1/local/train.sh
similarity index 93%
rename from examples/ted_en_zh/st1/local/train_finetune.sh
rename to examples/ted_en_zh/st1/local/train.sh
index e54c7fff4c20dfd2dcb6bd75c3eed6dd6c983858..a8e4acaa01428c13ab2ba36b73c706119160c3bf 100755
--- a/examples/ted_en_zh/st1/local/train_finetune.sh
+++ b/examples/ted_en_zh/st1/local/train.sh
@@ -24,7 +24,7 @@ python3 -u ${BIN_DIR}/train.py \
 --ngpu ${ngpu} \
 --config ${config_path} \
 --output exp/${ckpt_name} \
---checkpoint_path ${ckpt_path} \
+--checkpoint_path "${ckpt_path}" \
 --seed ${seed}
 
 if [ ${seed} != 0 ]; then
diff --git a/examples/ted_en_zh/st1/path.sh b/examples/ted_en_zh/st1/path.sh
index fd537917a8204848957a6cf14268da5d3dbc4b35..867cdb48abc1065e07b8a67932ec735ae6b3ab47 100644
--- a/examples/ted_en_zh/st1/path.sh
+++ b/examples/ted_en_zh/st1/path.sh
@@ -1,6 +1,6 @@
 export MAIN_ROOT=`realpath ${PWD}/../../../`
 
-export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
+export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PWD}/utils:${PATH}
 export LC_ALL=C
 
 export PYTHONDONTWRITEBYTECODE=1
@@ -13,3 +13,10 @@ export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/
 
 MODEL=u2_st
 export BIN_DIR=${MAIN_ROOT}/paddlespeech/s2t/exps/${MODEL}/bin
+
+# Kaldi
+export KALDI_ROOT=${MAIN_ROOT}/tools/kaldi
+[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh
+export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH
+[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present, can not using Kaldi!"
+[ -f $KALDI_ROOT/tools/config/common_path.sh ] && . $KALDI_ROOT/tools/config/common_path.sh
\ No newline at end of file
diff --git a/examples/ted_en_zh/st1/run.sh b/examples/ted_en_zh/st1/run.sh
index f8adf4f652f9da7773c3b048b0993c829def3913..f6362a8b358b9020eb0ec79c79358ebd0ba0f5ad 100755
--- a/examples/ted_en_zh/st1/run.sh
+++ b/examples/ted_en_zh/st1/run.sh
@@ -1,12 +1,13 @@
 #!/bin/bash
 set -e
-source path.sh
+. ./path.sh || exit 1;
+. ./cmd.sh || exit 1;
 
 gpus=0,1,2,3
 stage=1
 stop_stage=4
 conf_path=conf/transformer_mtl_noam.yaml
-ckpt_path=paddle.98
+ckpt_path= # paddle.98 # (finetune from FAT-ST pretrained model)
 avg_num=5
 data_path=./TED_EnZh # path to unzipped data
 source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
@@ -22,21 +23,20 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
 fi
 
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
-    # download pretrained
-    bash ./local/download_pretrain.sh || exit -1
-fi
-
-if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
     # train model, all `ckpt` under `exp` dir
-    CUDA_VISIBLE_DEVICES=${gpus} ./local/train_finetune.sh ${conf_path}  ${ckpt} ${ckpt_path}
+    if [ -n "${ckpt_path}" ]; then
+        echo "Finetune from Pretrained Model" ${ckpt_path}
+        ./local/download_pretrain.sh || exit -1
+    fi 
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} "${ckpt_path}"
 fi
 
-if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
     # avg n best model
     avg.sh best exp/${ckpt}/checkpoints ${avg_num}
 fi
 
-if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
     # test ckpt avg_n
     CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
 fi
\ No newline at end of file
diff --git a/examples/ted_en_zh/st1/steps b/examples/ted_en_zh/st1/steps
new file mode 120000
index 0000000000000000000000000000000000000000..91f2d234e20b07329572dc4fb9435b3a7e0831ee
--- /dev/null
+++ b/examples/ted_en_zh/st1/steps
@@ -0,0 +1 @@
+../../../tools/kaldi/egs/wsj/s5/steps
\ No newline at end of file
diff --git a/examples/ted_en_zh/st1/utils b/examples/ted_en_zh/st1/utils
new file mode 120000
index 0000000000000000000000000000000000000000..f49247da82775373b379191434cc6252d0246270
--- /dev/null
+++ b/examples/ted_en_zh/st1/utils
@@ -0,0 +1 @@
+../../../tools/kaldi/egs/wsj/s5/utils
\ No newline at end of file
diff --git a/examples/tiny/asr1/conf/chunk_confermer.yaml b/examples/tiny/asr1/conf/chunk_confermer.yaml
index 6183a903bde34783a15dc58d9c3f5ce47b3553d7..728a82e3c8377b4a401dc5f4eb89820faa988df3 100644
--- a/examples/tiny/asr1/conf/chunk_confermer.yaml
+++ b/examples/tiny/asr1/conf/chunk_confermer.yaml
@@ -14,7 +14,7 @@ collator:
   mean_std_filepath: ""
   vocab_filepath: data/lang_char/vocab.txt 
   unit_type: 'spm'
-  spm_model_prefix: 'data/bpe_unigram_200'
+  spm_model_prefix: 'data/lang_char/bpe_unigram_200'
   augmentation_config: conf/preprocess.yaml
   batch_size: 4
   raw_wav: True  # use raw_wav or kaldi feature
diff --git a/examples/tiny/asr1/conf/chunk_transformer.yaml b/examples/tiny/asr1/conf/chunk_transformer.yaml
index 01d383fb82d01ceb424d1dad3d48a6c401e1ce82..7c927122b30ad3728d19bbc158c2d09c0d6d3873 100644
--- a/examples/tiny/asr1/conf/chunk_transformer.yaml
+++ b/examples/tiny/asr1/conf/chunk_transformer.yaml
@@ -14,7 +14,7 @@ collator:
   mean_std_filepath: ""
   vocab_filepath: data/lang_char/vocab.txt 
   unit_type: 'spm'
-  spm_model_prefix: 'data/bpe_unigram_200'
+  spm_model_prefix: 'data/lang_char/bpe_unigram_200'
   augmentation_config: conf/preprocess.yaml
   batch_size: 4
   raw_wav: True  # use raw_wav or kaldi feature
diff --git a/examples/tiny/asr1/conf/conformer.yaml b/examples/tiny/asr1/conf/conformer.yaml
index a3fee6901df062e04e91f54f478910ef5c716a93..21cc112862438b46477895b4ad98e58ef075bdd8 100644
--- a/examples/tiny/asr1/conf/conformer.yaml
+++ b/examples/tiny/asr1/conf/conformer.yaml
@@ -14,7 +14,7 @@ collator:
   mean_std_filepath: ""
   vocab_filepath: data/lang_char/vocab.txt 
   unit_type: 'spm'
-  spm_model_prefix: 'data/bpe_unigram_200'
+  spm_model_prefix: 'data/lang_char/bpe_unigram_200'
   augmentation_config: conf/preprocess.yaml
   batch_size: 4
   raw_wav: True  # use raw_wav or kaldi feature
diff --git a/examples/tiny/asr1/conf/transformer.yaml b/examples/tiny/asr1/conf/transformer.yaml
index 5a87d6d245a36e30f50ced4c7777ff90ef1c9f28..f4645c681b8ba637ee9a9ca738d570321d81173e 100644
--- a/examples/tiny/asr1/conf/transformer.yaml
+++ b/examples/tiny/asr1/conf/transformer.yaml
@@ -14,7 +14,7 @@ collator:
   mean_std_filepath: data/mean_std.json
   vocab_filepath: data/lang_char/vocab.txt 
   unit_type: 'spm'
-  spm_model_prefix: 'data/bpe_unigram_200'
+  spm_model_prefix: 'data/lang_char/bpe_unigram_200'
   augmentation_config: conf/preprocess.yaml
   batch_size: 4
   raw_wav: True  # use raw_wav or kaldi feature
diff --git a/paddlespeech/s2t/exps/deepspeech2/bin/test_hub.py b/paddlespeech/s2t/exps/deepspeech2/bin/test_hub.py
index 831bd1adb40c878be4daac9b07592086975c2c9c..b8544dc2bf4666cb62ff61c5fbc476c33be783a9 100644
--- a/paddlespeech/s2t/exps/deepspeech2/bin/test_hub.py
+++ b/paddlespeech/s2t/exps/deepspeech2/bin/test_hub.py
@@ -110,8 +110,8 @@ class DeepSpeech2Tester_hub():
     def setup_model(self):
         config = self.config.clone()
         with UpdateConfig(config):
-            config.model.feat_size = self.collate_fn_test.feature_size
-            config.model.dict_size = self.collate_fn_test.vocab_size
+            config.model.input_dim = self.collate_fn_test.feature_size
+            config.model.output_dim = self.collate_fn_test.vocab_size
 
         if self.args.model_type == 'offline':
             model = DeepSpeech2Model.from_config(config.model)
diff --git a/paddlespeech/s2t/exps/deepspeech2/model.py b/paddlespeech/s2t/exps/deepspeech2/model.py
index e827414d3c67a5790a381f4877bf6a7618ff7d46..3e4ff1a8b10be88fc6990850ae1dda63fe1a2b21 100644
--- a/paddlespeech/s2t/exps/deepspeech2/model.py
+++ b/paddlespeech/s2t/exps/deepspeech2/model.py
@@ -154,11 +154,11 @@ class DeepSpeech2Trainer(Trainer):
         config = self.config.clone()
         with UpdateConfig(config):
             if self.train:
-                config.model.feat_size = self.train_loader.collate_fn.feature_size
-                config.model.dict_size = self.train_loader.collate_fn.vocab_size
+                config.model.input_dim = self.train_loader.collate_fn.feature_size
+                config.model.output_dim = self.train_loader.collate_fn.vocab_size
             else:
-                config.model.feat_size = self.test_loader.collate_fn.feature_size
-                config.model.dict_size = self.test_loader.collate_fn.vocab_size
+                config.model.input_dim = self.test_loader.collate_fn.feature_size
+                config.model.output_dim = self.test_loader.collate_fn.vocab_size
 
         if self.args.model_type == 'offline':
             model = DeepSpeech2Model.from_config(config.model)
diff --git a/paddlespeech/s2t/exps/u2/model.py b/paddlespeech/s2t/exps/u2/model.py
index b6dbcf443e94d752615295c20c7aae2a95525896..d448021cb1bc3d58d76d29f5c703bedf113a8713 100644
--- a/paddlespeech/s2t/exps/u2/model.py
+++ b/paddlespeech/s2t/exps/u2/model.py
@@ -128,8 +128,9 @@ class U2Trainer(Trainer):
             if dist.get_rank() == 0 and self.visualizer:
                 losses_np_v = losses_np.copy()
                 losses_np_v.update({"lr": self.lr_scheduler()})
-                self.visualizer.add_scalars("step", losses_np_v,
-                                            self.iteration - 1)
+                for key, val in losses_np_v.items():
+                    self.visualizer.add_scalar(
+                        tag='train/' + key, value=val, step=self.iteration - 1)
 
     @paddle.no_grad()
     def valid(self):
@@ -237,9 +238,10 @@ class U2Trainer(Trainer):
             logger.info(
                 'Epoch {} Val info val_loss {}'.format(self.epoch, cv_loss))
             if self.visualizer:
-                self.visualizer.add_scalars(
-                    'epoch', {'cv_loss': cv_loss,
-                              'lr': self.lr_scheduler()}, self.epoch)
+                self.visualizer.add_scalar(
+                    tag='eval/cv_loss', value=cv_loss, step=self.epoch)
+                self.visualizer.add_scalar(
+                    tag='eval/lr', value=self.lr_scheduler(), step=self.epoch)
 
             self.save(tag=self.epoch, infos={'val_loss': cv_loss})
             self.new_epoch()
diff --git a/paddlespeech/s2t/exps/u2_kaldi/model.py b/paddlespeech/s2t/exps/u2_kaldi/model.py
index c23b4c24545c2dda21b4b366a8d1e76c3acdf7a6..43e31a60dc127d607b7a644d1ce553832cb909a3 100644
--- a/paddlespeech/s2t/exps/u2_kaldi/model.py
+++ b/paddlespeech/s2t/exps/u2_kaldi/model.py
@@ -131,8 +131,9 @@ class U2Trainer(Trainer):
             if dist.get_rank() == 0 and self.visualizer:
                 losses_np_v = losses_np.copy()
                 losses_np_v.update({"lr": self.lr_scheduler()})
-                self.visualizer.add_scalars("step", losses_np_v,
-                                            self.iteration - 1)
+                for key, val in losses_np_v.items():
+                    self.visualizer.add_scalar(
+                        tag="train/" + key, value=val, step=self.iteration - 1)
 
     @paddle.no_grad()
     def valid(self):
@@ -222,9 +223,11 @@ class U2Trainer(Trainer):
             logger.info(
                 'Epoch {} Val info val_loss {}'.format(self.epoch, cv_loss))
             if self.visualizer:
-                self.visualizer.add_scalars(
-                    'epoch', {'cv_loss': cv_loss,
-                              'lr': self.lr_scheduler()}, self.epoch)
+                self.visualizer.add_scalar(
+                    tag='eval/cv_loss', value=cv_loss, step=self.epoch)
+                self.visualizer.add_scalar(
+                    tag='eval/lr', value=self.lr_scheduler(), step=self.epoch)
+
             self.save(tag=self.epoch, infos={'val_loss': cv_loss})
             self.new_epoch()
 
diff --git a/paddlespeech/s2t/exps/u2_st/model.py b/paddlespeech/s2t/exps/u2_st/model.py
index 52d3c3b7d1df9e3c3583525614dda8e5b10412a3..144eed9d85f8ec6214c6263eb24059b18a8b96f0 100644
--- a/paddlespeech/s2t/exps/u2_st/model.py
+++ b/paddlespeech/s2t/exps/u2_st/model.py
@@ -26,8 +26,10 @@ from paddle import distributed as dist
 from paddle.io import DataLoader
 from yacs.config import CfgNode
 
+from paddlespeech.s2t.frontend.featurizer import TextFeaturizer
 from paddlespeech.s2t.io.collator import SpeechCollator
 from paddlespeech.s2t.io.collator import TripletSpeechCollator
+from paddlespeech.s2t.io.dataloader import BatchDataLoader
 from paddlespeech.s2t.io.dataset import ManifestDataset
 from paddlespeech.s2t.io.sampler import SortagradBatchSampler
 from paddlespeech.s2t.io.sampler import SortagradDistributedBatchSampler
@@ -136,8 +138,9 @@ class U2STTrainer(Trainer):
             if dist.get_rank() == 0 and self.visualizer:
                 losses_np_v = losses_np.copy()
                 losses_np_v.update({"lr": self.lr_scheduler()})
-                self.visualizer.add_scalars("step", losses_np_v,
-                                            self.iteration - 1)
+                for key, val in losses_np_v.items():
+                    self.visualizer.add_scalar(
+                        tag="train/" + key, value=val, step=self.iteration - 1)
 
     @paddle.no_grad()
     def valid(self):
@@ -233,9 +236,11 @@ class U2STTrainer(Trainer):
             logger.info(
                 'Epoch {} Val info val_loss {}'.format(self.epoch, cv_loss))
             if self.visualizer:
-                self.visualizer.add_scalars(
-                    'epoch', {'cv_loss': cv_loss,
-                              'lr': self.lr_scheduler()}, self.epoch)
+                self.visualizer.add_scalar(
+                    tag='eval/cv_loss', value=cv_loss, step=self.epoch)
+                self.visualizer.add_scalar(
+                    tag='eval/lr', value=self.lr_scheduler(), step=self.epoch)
+
             self.save(tag=self.epoch, infos={'val_loss': cv_loss})
             self.new_epoch()
 
@@ -423,6 +428,31 @@ class U2STTester(U2STTrainer):
             trans.append(''.join([chr(i) for i in ids]))
         return trans
 
+    def translate(self, audio, audio_len):
+        """E2E translation: decode extracted audio features into hypotheses."""
+        cfg = self.config.decoding
+        text_feature = self.test_loader.collate_fn.text_feature
+        self.model.eval()
+
+        hyps = self.model.decode(
+            audio,
+            audio_len,
+            text_feature=text_feature,
+            decoding_method=cfg.decoding_method,
+            lang_model_path=cfg.lang_model_path,
+            beam_alpha=cfg.alpha,
+            beam_beta=cfg.beta,
+            beam_size=cfg.beam_size,
+            cutoff_prob=cfg.cutoff_prob,
+            cutoff_top_n=cfg.cutoff_top_n,
+            num_processes=cfg.num_proc_bsearch,
+            ctc_weight=cfg.ctc_weight,
+            word_reward=cfg.word_reward,
+            decoding_chunk_size=cfg.decoding_chunk_size,
+            num_decoding_left_chunks=cfg.num_decoding_left_chunks,
+            simulate_streaming=cfg.simulate_streaming)
+        return hyps
+
     def compute_translation_metrics(self,
                                     utts,
                                     audio,
diff --git a/paddlespeech/s2t/frontend/augmentor/impulse_response.py b/paddlespeech/s2t/frontend/augmentor/impulse_response.py
index 6cc9c0d43991cd3f6141916ca5778485ce49dec5..5ba45bb20c2e3816358b34209fa3cb03142b285b 100644
--- a/paddlespeech/s2t/frontend/augmentor/impulse_response.py
+++ b/paddlespeech/s2t/frontend/augmentor/impulse_response.py
@@ -12,9 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Contains the impulse response augmentation model."""
+import jsonlines
+
 from paddlespeech.s2t.frontend.audio import AudioSegment
 from paddlespeech.s2t.frontend.augmentor.base import AugmentorBase
-from paddlespeech.s2t.frontend.utility import read_manifest
 
 
 class ImpulseResponseAugmentor(AugmentorBase):
@@ -28,7 +29,8 @@ class ImpulseResponseAugmentor(AugmentorBase):
 
     def __init__(self, rng, impulse_manifest_path):
         self._rng = rng
-        self._impulse_manifest = read_manifest(impulse_manifest_path)
+        with jsonlines.open(impulse_manifest_path, 'r') as reader:
+            self._impulse_manifest = list(reader)
 
     def __call__(self, x, uttid=None, train=True):
         if not train:
diff --git a/paddlespeech/s2t/frontend/augmentor/noise_perturb.py b/paddlespeech/s2t/frontend/augmentor/noise_perturb.py
index 9d6da1a8f34818546ebc579efb4cb7a49cb559e5..71165dac893a526963b0e870ccbf99cf979aac42 100644
--- a/paddlespeech/s2t/frontend/augmentor/noise_perturb.py
+++ b/paddlespeech/s2t/frontend/augmentor/noise_perturb.py
@@ -12,9 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Contains the noise perturb augmentation model."""
+import jsonlines
+
 from paddlespeech.s2t.frontend.audio import AudioSegment
 from paddlespeech.s2t.frontend.augmentor.base import AugmentorBase
-from paddlespeech.s2t.frontend.utility import read_manifest
 
 
 class NoisePerturbAugmentor(AugmentorBase):
@@ -34,7 +35,8 @@ class NoisePerturbAugmentor(AugmentorBase):
         self._min_snr_dB = min_snr_dB
         self._max_snr_dB = max_snr_dB
         self._rng = rng
-        self._noise_manifest = read_manifest(manifest_path=noise_manifest_path)
+        with jsonlines.open(noise_manifest_path, 'r') as reader:
+            self._noise_manifest = list(reader)
 
     def __call__(self, x, uttid=None, train=True):
         if not train:
diff --git a/paddlespeech/s2t/frontend/normalizer.py b/paddlespeech/s2t/frontend/normalizer.py
index a29cddc387c8c8336a09a6b2792bcd73a72b0908..017851e6344a83932f92e7f20f6fa680b4d650f1 100644
--- a/paddlespeech/s2t/frontend/normalizer.py
+++ b/paddlespeech/s2t/frontend/normalizer.py
@@ -14,6 +14,7 @@
 """Contains feature normalizers."""
 import json
 
+import jsonlines
 import numpy as np
 import paddle
 from paddle.io import DataLoader
@@ -21,7 +22,6 @@ from paddle.io import Dataset
 
 from paddlespeech.s2t.frontend.audio import AudioSegment
 from paddlespeech.s2t.frontend.utility import load_cmvn
-from paddlespeech.s2t.frontend.utility import read_manifest
 from paddlespeech.s2t.utils.log import Log
 
 __all__ = ["FeatureNormalizer"]
@@ -61,7 +61,10 @@ class CollateFunc(object):
 class AudioDataset(Dataset):
     def __init__(self, manifest_path, num_samples=-1, rng=None, random_seed=0):
         self._rng = rng if rng else np.random.RandomState(random_seed)
-        manifest = read_manifest(manifest_path)
+
+        with jsonlines.open(manifest_path, 'r') as reader:
+            manifest = list(reader)
+
         if num_samples == -1:
             sampled_manifest = manifest
         else:
diff --git a/paddlespeech/s2t/frontend/utility.py b/paddlespeech/s2t/frontend/utility.py
index 703f2127d7e71b093030658f6d88c45979770c61..e6c7603fa20a8fcfa91ced38a61f1b329191ddf5 100644
--- a/paddlespeech/s2t/frontend/utility.py
+++ b/paddlespeech/s2t/frontend/utility.py
@@ -98,14 +98,13 @@ def read_manifest(
     Returns:
         List[dict]: Manifest parsing results.
     """
-
     manifest = []
     with jsonlines.open(manifest_path, 'r') as reader:
         for json_data in reader:
-            feat_len = json_data["feat_shape"][
-                0] if 'feat_shape' in json_data else 1.0
-            token_len = json_data["token_shape"][
-                0] if 'token_shape' in json_data else 1.0
+            feat_len = json_data["input"][0]["shape"][
+                0] if "input" in json_data and "shape" in json_data["input"][0] else 1.0
+            token_len = json_data["output"][0]["shape"][
+                0] if "output" in json_data and "shape" in json_data["output"][0] else 1.0
             conditions = [
                 feat_len >= min_input_len,
                 feat_len <= max_input_len,
diff --git a/paddlespeech/s2t/io/dataloader.py b/paddlespeech/s2t/io/dataloader.py
index 3b5000a280621502a6b20ca2bf71b43789912ab8..b8eb33679dfadfb155ae4be9cce54876fe991666 100644
--- a/paddlespeech/s2t/io/dataloader.py
+++ b/paddlespeech/s2t/io/dataloader.py
@@ -16,10 +16,10 @@ from typing import Dict
 from typing import List
 from typing import Text
 
+import jsonlines
 import numpy as np
 from paddle.io import DataLoader
 
-from paddlespeech.s2t.frontend.utility import read_manifest
 from paddlespeech.s2t.io.batchfy import make_batchset
 from paddlespeech.s2t.io.converter import CustomConverter
 from paddlespeech.s2t.io.dataset import TransformDataset
@@ -91,7 +91,9 @@ class BatchDataLoader():
         self.n_iter_processes = n_iter_processes
 
         # read json data
-        self.data_json = read_manifest(json_file)
+        with jsonlines.open(json_file, 'r') as reader:
+            self.data_json = list(reader)
+
         self.feat_dim, self.vocab_size = feat_dim_and_vocab_size(
             self.data_json, mode='asr')
 
diff --git a/paddlespeech/s2t/io/dataset.py b/paddlespeech/s2t/io/dataset.py
index 61eeb00f1c41b586762035fe1a097b2197a74008..d64d7d3ec16527c7b1f18e0f7c439d14aff3b0ad 100644
--- a/paddlespeech/s2t/io/dataset.py
+++ b/paddlespeech/s2t/io/dataset.py
@@ -15,6 +15,7 @@
 # Modified from wenet(https://github.com/wenet-e2e/wenet)
 from typing import Optional
 
+import jsonlines
 from paddle.io import Dataset
 from yacs.config import CfgNode
 
@@ -184,7 +185,8 @@ class AudioDataset(Dataset):
         """
         assert batch_type in ['static', 'dynamic']
         # read manifest
-        data = read_manifest(data_file)
+        with jsonlines.open(data_file, 'r') as reader:
+            data = list(reader)
         if sort:
             data = sorted(data, key=lambda x: x["feat_shape"][0])
         if raw_wav:
diff --git a/paddlespeech/s2t/models/ds2/deepspeech2.py b/paddlespeech/s2t/models/ds2/deepspeech2.py
index 4a7a7c15e9b96edfe5ee4b3f29f406b4b6b62a64..317abc69e3d9dd0a919e692328764f1319b7b76d 100644
--- a/paddlespeech/s2t/models/ds2/deepspeech2.py
+++ b/paddlespeech/s2t/models/ds2/deepspeech2.py
@@ -249,8 +249,8 @@ class DeepSpeech2Model(nn.Layer):
             The model built from config.
         """
         model = cls(
-            feat_size=config.feat_size,
-            dict_size=config.dict_size,
+            feat_size=config.input_dim,
+            dict_size=config.output_dim,
             num_conv_layers=config.num_conv_layers,
             num_rnn_layers=config.num_rnn_layers,
             rnn_size=config.rnn_layer_size,
diff --git a/paddlespeech/s2t/models/ds2_online/deepspeech2.py b/paddlespeech/s2t/models/ds2_online/deepspeech2.py
index da04d5c5de8f56ac17801b2871c6c2f89b16712c..d134239f295f3dc90e7cde55bdb211240b892d22 100644
--- a/paddlespeech/s2t/models/ds2_online/deepspeech2.py
+++ b/paddlespeech/s2t/models/ds2_online/deepspeech2.py
@@ -381,8 +381,8 @@ class DeepSpeech2ModelOnline(nn.Layer):
             The model built from config.
         """
         model = cls(
-            feat_size=config.feat_size,
-            dict_size=config.dict_size,
+            feat_size=config.input_dim,
+            dict_size=config.output_dim,
             num_conv_layers=config.num_conv_layers,
             num_rnn_layers=config.num_rnn_layers,
             rnn_size=config.rnn_layer_size,
diff --git a/paddlespeech/s2t/training/trainer.py b/paddlespeech/s2t/training/trainer.py
index f5fb2db03be0090d6e99656db0be4836745783a2..cc8f50317144b94be19aef424674053b977b1b3b 100644
--- a/paddlespeech/s2t/training/trainer.py
+++ b/paddlespeech/s2t/training/trainer.py
@@ -19,7 +19,7 @@ from pathlib import Path
 
 import paddle
 from paddle import distributed as dist
-from tensorboardX import SummaryWriter
+from visualdl import LogWriter
 
 from paddlespeech.s2t.training.reporter import ObsScope
 from paddlespeech.s2t.training.reporter import report
@@ -245,8 +245,9 @@ class Trainer():
         self.maybe_batch_sampler_step()
 
     def after_train_batch(self):
-        if self.args.benchmark_max_step and self.iteration > self.args.benchmark_max_step:
+        if self.args.benchmark_max_step:
             profiler.add_profiler_step(self.args.profiler_options)
+        if self.args.benchmark_max_step and self.iteration > self.args.benchmark_max_step:
             logger.info(
                 f"Reach benchmark-max-step: {self.args.benchmark_max_step}")
             sys.exit(
@@ -309,9 +310,10 @@ class Trainer():
             logger.info(
                 'Epoch {} Val info val_loss {}'.format(self.epoch, cv_loss))
             if self.visualizer:
-                self.visualizer.add_scalars(
-                    'epoch', {'cv_loss': cv_loss,
-                              'lr': self.lr_scheduler()}, self.epoch)
+                self.visualizer.add_scalar(
+                    tag='eval/cv_loss', value=cv_loss, step=self.epoch)
+                self.visualizer.add_scalar(
+                    tag='eval/lr', value=self.lr_scheduler(), step=self.epoch)
 
             # after epoch
             self.save(tag=self.epoch, infos={'val_loss': cv_loss})
@@ -427,7 +429,7 @@ class Trainer():
         unexpected behaviors.
         """
         # visualizer
-        visualizer = SummaryWriter(logdir=str(self.visual_dir))
+        visualizer = LogWriter(logdir=str(self.visual_dir))
         self.visualizer = visualizer
 
     @mp_tools.rank_zero_only
diff --git a/paddlespeech/s2t/utils/checkpoint.py b/paddlespeech/s2t/utils/checkpoint.py
index 5105f95efaebf4347a0d13b4383a1db962835308..1d24c880add78e13d2d87fc5fd1a4fc67ffd123b 100644
--- a/paddlespeech/s2t/utils/checkpoint.py
+++ b/paddlespeech/s2t/utils/checkpoint.py
@@ -94,7 +94,7 @@ class Checkpoint():
         """
         configs = {}
 
-        if checkpoint_path is not None:
+        if checkpoint_path:
             pass
         elif checkpoint_dir is not None and record_file is not None:
             # load checkpint from record file
diff --git a/paddlespeech/s2t/utils/socket_server.py b/paddlespeech/s2t/utils/socket_server.py
index 43b56d723c154c35be9b71afad67123bf99c238f..691ea966821dce4923652b68937dd4a4fbb17ede 100644
--- a/paddlespeech/s2t/utils/socket_server.py
+++ b/paddlespeech/s2t/utils/socket_server.py
@@ -21,7 +21,7 @@ import wave
 from time import gmtime
 from time import strftime
 
-from paddlespeech.s2t.frontend.utility import read_manifest
+import jsonlines
 
 __all__ = ["socket_send", "warm_up_test", "AsrTCPServer", "AsrRequestHandler"]
 
@@ -44,7 +44,8 @@ def warm_up_test(audio_process_handler,
                  num_test_cases,
                  random_seed=0):
     """Warming-up test."""
-    manifest = read_manifest(manifest_path)
+    with jsonlines.open(manifest_path) as reader:
+        manifest = list(reader)
     rng = random.Random(random_seed)
     samples = rng.sample(manifest, num_test_cases)
     for idx, sample in enumerate(samples):
diff --git a/paddlespeech/text/speechtask/punctuation_restoration/training/trainer.py b/paddlespeech/text/speechtask/punctuation_restoration/training/trainer.py
index d6b6eeb65ea338029b9a58d2dda1d3824f093866..78512796be0de321c8e2bef865b0bce6af1f226a 100644
--- a/paddlespeech/text/speechtask/punctuation_restoration/training/trainer.py
+++ b/paddlespeech/text/speechtask/punctuation_restoration/training/trainer.py
@@ -34,7 +34,7 @@ from speechtask.punctuation_restoration.model.lstm import RnnLm
 from speechtask.punctuation_restoration.utils import layer_tools
 from speechtask.punctuation_restoration.utils import mp_tools
 from speechtask.punctuation_restoration.utils.checkpoint import Checkpoint
-from tensorboardX import SummaryWriter
+from visualdl import LogWriter
 
 __all__ = ["Trainer", "Tester"]
 
@@ -252,10 +252,10 @@ class Trainer():
             self.logger.info("Epoch {} Val info val_loss {}, F1_score {}".
                              format(self.epoch, total_loss, F1_score))
             if self.visualizer:
-                self.visualizer.add_scalars("epoch", {
-                    "total_loss": total_loss,
-                    "lr": self.lr_scheduler()
-                }, self.epoch)
+                self.visualizer.add_scalar(
+                    tag='eval/cv_loss', value=total_loss, step=self.epoch)
+                self.visualizer.add_scalar(
+                    tag='eval/lr', value=self.lr_scheduler(), step=self.epoch)
 
             self.save(
                 tag=self.epoch, infos={"val_loss": total_loss,
@@ -341,7 +341,7 @@ class Trainer():
         unexpected behaviors.
         """
         # visualizer
-        visualizer = SummaryWriter(logdir=str(self.output_dir))
+        visualizer = LogWriter(logdir=str(self.output_dir))
         self.visualizer = visualizer
 
     @mp_tools.rank_zero_only
diff --git a/requirements.txt b/requirements.txt
index 99e485f867ea3dfaeee2f7e440d230f8802881b6..2ee60d3f620f803b8c4eebf4747bfee2fba55fa6 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -40,7 +40,6 @@ snakeviz
 soundfile~=0.10
 sox
 soxbindings
-tensorboardX
 textgrid
 timer
 tqdm
diff --git a/tests/benchmark/conformer/prepare.sh b/tests/benchmark/conformer/prepare.sh
index c5fae06a59d41147c9aaa89f3074914e7ea9906f..4a7be1b69a0cbee1d8c85baf86c6a4be3fbc8f57 100644
--- a/tests/benchmark/conformer/prepare.sh
+++ b/tests/benchmark/conformer/prepare.sh
@@ -2,7 +2,7 @@ cd ../../../
 pip install -e .   # 安装pdspeech
 cd -
 #Enter the example dir
-pushd ../../../examples/aishell/s1
+pushd ../../../examples/aishell/asr1
 
 #Prepare the data
 bash run.sh --stage 0 --stop_stage 0
diff --git a/tests/benchmark/conformer/run.sh b/tests/benchmark/conformer/run.sh
index 79beb4e961fc01d7b1d5a80e81d94289057c0398..9fd13fbcb1691f2a6a6ab9ac42fce97912efd062 100644
--- a/tests/benchmark/conformer/run.sh
+++ b/tests/benchmark/conformer/run.sh
@@ -8,7 +8,7 @@ cd ${CUR_DIR}
 sed -i '/set\ -xe/d' run_benchmark.sh
 
 #cd **
-pushd ../../../examples/aishell/s1
+pushd ../../../examples/aishell/asr1
 # 1 安装该模型需要的依赖 (如需开启优化策略请注明)
 # 2 拷贝该模型需要数据、预训练模型
 
diff --git a/tests/benchmark/conformer/run_benchmark.sh b/tests/benchmark/conformer/run_benchmark.sh
index d4efe2b96a03d045040b37715abd9bb0844ae219..5b83b15ce5fda954ee272e5040e419f74b2a97a6 100644
--- a/tests/benchmark/conformer/run_benchmark.sh
+++ b/tests/benchmark/conformer/run_benchmark.sh
@@ -1,5 +1,4 @@
 #!/usr/bin/env bash
-set -xe
 # 运行示例：CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh ${run_mode} ${bs_item} ${fp_item} 500 ${model_mode}
 # 参数说明
 function _set_params(){
@@ -35,13 +34,15 @@ function _set_params(){
 function _train(){
     echo "Train on ${num_gpu_devices} GPUs"
     echo "current CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES, gpus=$num_gpu_devices, batch_size=$batch_size"
-    train_cmd="--config=${config_path}
-               --output=${output}
-               --seed=${seed}
-               --ngpu=${ngpu}
-               --profiler-options "${profiler_options}"
-               --benchmark-batch-size ${batch_size}
-               --benchmark-max-step ${benchmark_max_step} "
+    train_cmd="--config=${config_path} \
+           --output=${output} \
+           --seed=${seed} \
+           --ngpu=${ngpu} \
+           --benchmark-batch-size ${batch_size} \
+           --benchmark-max-step ${benchmark_max_step} "
+    if [ -n "${profiler_options}" ] && [ "${profiler_options}" != "None" ]; then
+        train_cmd=${train_cmd}" --profiler-options=${profiler_options}"
+    fi
 
     case ${run_mode} in
     sp) train_cmd="python -u ${BIN_DIR}/train.py "${train_cmd} ;;
diff --git a/utils/addjson.py b/utils/addjson.py
new file mode 100755
index 0000000000000000000000000000000000000000..013d14727f40d8b95a9d54f1f60a0682e8bf3977
--- /dev/null
+++ b/utils/addjson.py
@@ -0,0 +1,152 @@
+#!/usr/bin/env python3
+# encoding: utf-8
+# Copyright 2018 Nagoya University (Tomoki Hayashi)
+#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
+from __future__ import print_function
+from __future__ import unicode_literals
+
+import argparse
+import codecs
+import json
+import logging
+import sys
+from distutils.util import strtobool
+
+from espnet.utils.cli_utils import get_commandline_args
+
+is_python2 = sys.version_info[0] == 2
+
+
+def get_parser():
+    parser = argparse.ArgumentParser(
+        description="add multiple json values to an input or output value",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter, )
+    parser.add_argument("jsons", type=str, nargs="+", help="json files")
+    parser.add_argument(
+        "-i",
+        "--is-input",
+        default=True,
+        type=strtobool,
+        help="If true, add to input. If false, add to output", )
+    parser.add_argument(
+        "--verbose", "-V", default=0, type=int, help="Verbose option")
+    return parser
+
+
+if __name__ == "__main__":
+    parser = get_parser()
+    args = parser.parse_args()
+
+    # logging info
+    logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
+    if args.verbose > 0:
+        logging.basicConfig(level=logging.INFO, format=logfmt)
+    else:
+        logging.basicConfig(level=logging.WARN, format=logfmt)
+    logging.info(get_commandline_args())
+
+    # make intersection set for utterance keys
+    js = []
+    intersec_ks = []
+    for x in args.jsons:
+        with codecs.open(x, "r", encoding="utf-8") as f:
+            j = json.load(f)
+        ks = j["utts"].keys()
+        logging.info(x + ": has " + str(len(ks)) + " utterances")
+        if len(intersec_ks) > 0:
+            intersec_ks = intersec_ks.intersection(set(ks))
+            if len(intersec_ks) == 0:
+                logging.warning("Empty intersection")
+                break
+        else:
+            intersec_ks = set(ks)
+        js.append(j)
+    logging.info("new json has " + str(len(intersec_ks)) + " utterances")
+
+    # updated original dict to keep intersection
+    intersec_org_dic = dict()
+    for k in intersec_ks:
+        v = js[0]["utts"][k]
+        intersec_org_dic[k] = v
+
+    intersec_add_dic = dict()
+    for k in intersec_ks:
+        v = js[1]["utts"][k]
+        for j in js[2:]:
+            v.update(j["utts"][k])
+        intersec_add_dic[k] = v
+
+    new_dic = dict()
+    for key_id in intersec_org_dic:
+        orgdic = intersec_org_dic[key_id]
+        adddic = intersec_add_dic[key_id]
+
+        if "utt2spk" not in orgdic:
+            orgdic["utt2spk"] = ""
+        # NOTE: for machine translation
+
+        # add as input
+        if args.is_input:
+            # original input
+            input_list = orgdic["input"]
+            # additional input
+            in_add_dic = {}
+            if "idim" in adddic and "ilen" in adddic:
+                in_add_dic["shape"] = [int(adddic["ilen"]), int(adddic["idim"])]
+            elif "idim" in adddic:
+                in_add_dic["shape"] = [int(adddic["idim"])]
+            # add all other key value
+            for key, value in adddic.items():
+                if key in ["idim", "ilen"]:
+                    continue
+                in_add_dic[key] = value
+            # add name
+            in_add_dic["name"] = "input%d" % (len(input_list) + 1)
+
+            input_list.append(in_add_dic)
+            new_dic[key_id] = {
+                "input": input_list,
+                "output": orgdic["output"],
+                "utt2spk": orgdic["utt2spk"],
+            }
+        # add as output
+        else:
+            # original output
+            output_list = orgdic["output"]
+            # additional output
+            out_add_dic = {}
+            # add shape
+            if "odim" in adddic and "olen" in adddic:
+                out_add_dic[
+                    "shape"] = [int(adddic["olen"]), int(adddic["odim"])]
+            elif "odim" in adddic:
+                out_add_dic["shape"] = [int(adddic["odim"])]
+            # add all other key value
+            for key, value in adddic.items():
+                if key in ["odim", "olen"]:
+                    continue
+                out_add_dic[key] = value
+            # add name
+            out_add_dic["name"] = "target%d" % (len(output_list) + 1)
+
+            output_list.append(out_add_dic)
+            new_dic[key_id] = {
+                "input": orgdic["input"],
+                "output": output_list,
+                "utt2spk": orgdic["utt2spk"],
+            }
+            if "lang" in orgdic.keys():
+                new_dic[key_id]["lang"] = orgdic["lang"]
+
+    # ensure_ascii=False keeps non-ASCII text readable (no \uXXXX escapes)
+    jsonstring = json.dumps(
+        {
+            "utts": new_dic
+        },
+        indent=4,
+        ensure_ascii=False,
+        sort_keys=True,
+        separators=(",", ": "), )
+    sys.stdout = codecs.getwriter("utf-8")(sys.stdout
+                                           if is_python2 else sys.stdout.buffer)
+    print(jsonstring)
diff --git a/utils/build_vocab.py b/utils/build_vocab.py
index 6a90314759a20d8ade3e33995a1afd9bfa770d3c..e364e821e7089d083fb3a8731c37b6afc40cc2ef 100755
--- a/utils/build_vocab.py
+++ b/utils/build_vocab.py
@@ -21,9 +21,10 @@ import os
 import tempfile
 from collections import Counter
 
+import jsonlines
+
 from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
 from paddlespeech.s2t.frontend.utility import BLANK
-from paddlespeech.s2t.frontend.utility import read_manifest
 from paddlespeech.s2t.frontend.utility import SOS
 from paddlespeech.s2t.frontend.utility import SPACE
 from paddlespeech.s2t.frontend.utility import UNK
@@ -54,20 +55,41 @@ add_arg('text_keys', str,
 add_arg('spm_vocab_size', int, 0, "Vocab size for spm.")
 add_arg('spm_mode', str, 'unigram', "spm model type, e.g. unigram, spm, char, word. only need when `unit_type` is spm")
 add_arg('spm_model_prefix', str, "", "spm_model_%(spm_mode)_%(count_threshold), spm model prefix, only need when `unit_type` is spm")
+add_arg('spm_character_coverage', float, 0.9995, "character coverage to determine the minimum symbols")
+
 # yapf: disable
 args = parser.parse_args()
 
 
 def count_manifest(counter, text_feature, manifest_path):
-    manifest_jsons = read_manifest(manifest_path)
+    manifest_jsons = []
+    with jsonlines.open(manifest_path, 'r') as reader:
+        for json_data in reader:
+            manifest_jsons.append(json_data)
+
     for line_json in manifest_jsons:
-        line = text_feature.tokenize(line_json['text'], replace_space=False)
-        counter.update(line)
+        if isinstance(line_json['text'], str):
+            line = text_feature.tokenize(line_json['text'], replace_space=False)
+            counter.update(line)
+        else:
+            assert isinstance(line_json['text'], list)
+            for text in line_json['text']:
+                line = text_feature.tokenize(text, replace_space=False)
+                counter.update(line)
 
 def dump_text_manifest(fileobj, manifest_path, key='text'):
-    manifest_jsons = read_manifest(manifest_path)
+    manifest_jsons = []
+    with jsonlines.open(manifest_path, 'r') as reader:
+        for json_data in reader:
+            manifest_jsons.append(json_data)
+
     for line_json in manifest_jsons:
-        fileobj.write(line_json[key] + "\n")
+        if isinstance(line_json[key], str):
+            fileobj.write(line_json[key] + "\n")
+        else:
+            assert isinstance(line_json[key], list)
+            for line in line_json[key]:
+                fileobj.write(line + "\n")
 
 def main():
     print_arguments(args, globals())
@@ -95,7 +117,7 @@ def main():
             model_type=args.spm_mode,
             model_prefix=args.spm_model_prefix,
             input_sentence_size=100000000,
-            character_coverage=0.9995)
+            character_coverage=args.spm_character_coverage)
         os.unlink(fp.name)
 
     # encode
diff --git a/utils/dump_manifest.py b/utils/dump_manifest.py
index b5f7b64a44dead5bac2ea66ee5fdab09d9c255b6..58d91755886ad14538cc88afaf609a06b6d3341a 100755
--- a/utils/dump_manifest.py
+++ b/utils/dump_manifest.py
@@ -17,7 +17,7 @@ import argparse
 from pathlib import Path
 from typing import Union
 
-from paddlespeech.s2t.frontend.utility import read_manifest
+import jsonlines
 
 key_whitelist = set(['feat', 'text', 'syllable', 'phone'])
 filename = {
@@ -32,7 +32,10 @@ def dump_manifest(manifest_path, output_dir: Union[str, Path]):
 
     output_dir = Path(output_dir).expanduser()
     manifest_path = Path(manifest_path).expanduser()
-    manifest_jsons = read_manifest(manifest_path)
+
+    with jsonlines.open(str(manifest_path), 'r') as reader:
+        manifest_jsons = list(reader)
+
     first_line = manifest_jsons[0]
     file_map = {}
 
diff --git a/utils/format_data.py b/utils/format_data.py
index 2fa1924a072faa67ad559f68174896846f8cbdf9..6db2a1bbbeb568c4cc0a6c3ef0ab17039247966a 100755
--- a/utils/format_data.py
+++ b/utils/format_data.py
@@ -17,9 +17,10 @@ import argparse
 import functools
 import json
 
+import jsonlines
+
 from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
 from paddlespeech.s2t.frontend.utility import load_cmvn
-from paddlespeech.s2t.frontend.utility import read_manifest
 from paddlespeech.s2t.io.utility import feat_type
 from paddlespeech.s2t.utils.utility import add_arguments
 from paddlespeech.s2t.utils.utility import print_arguments
@@ -71,7 +72,9 @@ def main():
     # }
     count = 0
     for manifest_path in args.manifest_paths:
-        manifest_jsons = read_manifest(manifest_path)
+        with jsonlines.open(str(manifest_path), 'r') as reader:
+            manifest_jsons = list(reader)
+
         for line_json in manifest_jsons:
             output_json = {
                 "input": [],
diff --git a/utils/format_triplet_data.py b/utils/format_triplet_data.py
index e0b5ece37353dc9cc592d440ce11ba486569bfaf..44ff4527c897a826fdaade9ed8c6276f86add54e 100755
--- a/utils/format_triplet_data.py
+++ b/utils/format_triplet_data.py
@@ -17,9 +17,10 @@ import argparse
 import functools
 import json
 
+import jsonlines
+
 from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
 from paddlespeech.s2t.frontend.utility import load_cmvn
-from paddlespeech.s2t.frontend.utility import read_manifest
 from paddlespeech.s2t.io.utility import feat_type
 from paddlespeech.s2t.utils.utility import add_arguments
 from paddlespeech.s2t.utils.utility import print_arguments
@@ -63,7 +64,8 @@ def main():
 
     count = 0
     for manifest_path in args.manifest_paths:
-        manifest_jsons = read_manifest(manifest_path)
+        with jsonlines.open(str(manifest_path), 'r') as reader:
+            manifest_jsons = list(reader)
         for line_json in manifest_jsons:
             # text: translation text, text1: transcript text.
             # Currently only support joint-vocab, will add separate vocabs setting.
diff --git a/utils/manifest_key_value.py b/utils/manifest_key_value.py
index b409236fc05e8661e136c76d4b780becbf0ee19d..fb3d3aaaf47948428cd5eaf4a9ae6b0fe82b93e1 100755
--- a/utils/manifest_key_value.py
+++ b/utils/manifest_key_value.py
@@ -4,9 +4,10 @@ import argparse
 import functools
 from pathlib import Path
 
+import jsonlines
+
 from utils.utility import add_arguments
 from utils.utility import print_arguments
-from utils.utility import read_manifest
 
 
 def main(args):
@@ -19,7 +20,8 @@ def main(args):
     dur_scp = outdir / 'duration'
     text_scp = outdir / 'text'
 
-    manifest_jsons = read_manifest(args.manifest_path)
+    with jsonlines.open(args.manifest_path, 'r') as reader:
+        manifest_jsons = list(reader)
 
     with wav_scp.open('w') as fwav, dur_scp.open('w') as fdur, text_scp.open(
             'w') as ftxt:
diff --git a/utils/pack_model.sh b/utils/pack_model.sh
index 8acd59a640bb6fe10e22a38fc1c2f6d6c71ca46a..fe6ab51bca877ac2f8c3691b981e5eec31f7bbb4 100755
--- a/utils/pack_model.sh
+++ b/utils/pack_model.sh
@@ -57,7 +57,7 @@ else
     echo "missing ${dec_conf}"
     exit 1
 fi
-# NOTE(kan-bayashi): preprocess conf is optional
+# preprocess conf is optional
 if [ -n "${preprocess_conf}" ]; then
     tar rfh ${outfile}.tar ${preprocess_conf}
     echo -n "    - preprocess config file: \`"
@@ -82,12 +82,12 @@ if [ -e ${e2e} ]; then
 
     e2e_conf=$(dirname ${e2e})/model.json
     if [ ! -e ${e2e_conf} ]; then
-	echo missing ${e2e_conf}
-	#exit 1
+	    echo missing ${e2e_conf}
+	    #exit 1
     else
-	echo -n "    - e2e JSON file: \`"
-	echo ${e2e_conf} | sed -e "s/$/\`/"
-	tar rfh ${outfile}.tar ${e2e_conf}
+	    echo -n "    - e2e JSON file: \`"
+	    echo ${e2e_conf} | sed -e "s/$/\`/"
+	    tar rfh ${outfile}.tar ${e2e_conf}
     fi
 else
     echo "missing ${e2e}"
@@ -104,7 +104,7 @@ if [ -n "${lm}" ]; then
 	lm_conf=$(dirname ${lm})/model.json
 	if [ ! -e ${lm_conf} ]; then
 	    echo missing ${lm_conf}
-	    exit 1
+	    #exit 1
 	else
 	    echo -n "    - lm JSON file: \`"
 	    echo ${lm_conf} | sed -e "s/$/\`/"
diff --git a/utils/scp2json.py b/utils/scp2json.py
new file mode 100755
index 0000000000000000000000000000000000000000..e2a757665cb26163370e9994e8ab581b429da4ae
--- /dev/null
+++ b/utils/scp2json.py
@@ -0,0 +1,59 @@
+#!/usr/bin/env python3
+# encoding: utf-8
+# Copyright 2017 Johns Hopkins University (Shinji Watanabe)
+#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
+"""Convert scp-style text on stdin to a JSON document on stdout.
+
+Each non-blank input line is `<id> <value ...>`; the output has the form
+{"utts": {<id>: {<key>: "<value ...>"}}} where <key> comes from --key.
+"""
+from __future__ import print_function
+from __future__ import unicode_literals
+
+import argparse
+import codecs
+import json
+import sys
+
+is_python2 = sys.version_info[0] == 2
+
+
+def get_parser():
+    """Build the command-line parser (single option: --key/-k)."""
+    parser = argparse.ArgumentParser(
+        description="convert scp to json",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter, )
+    parser.add_argument("--key", "-k", type=str, help="key")
+    return parser
+
+
+if __name__ == "__main__":
+    parser = get_parser()
+    args = parser.parse_args()
+
+    new_line = {}
+    # Decode/encode as UTF-8 regardless of the locale; on Python 3 the codec
+    # wrappers must sit on the underlying byte streams.
+    sys.stdin = codecs.getreader("utf-8")(sys.stdin
+                                          if is_python2 else sys.stdin.buffer)
+    sys.stdout = codecs.getwriter("utf-8")(sys.stdout
+                                           if is_python2 else sys.stdout.buffer)
+    for line in sys.stdin:
+        x = line.split()
+        if not x:
+            # A blank line has no id; indexing x[0] would raise IndexError.
+            continue
+        # First field is the id, the remaining fields are re-joined with
+        # single spaces as the value.
+        new_line[x[0]] = {args.key: " ".join(x[1:])}
+
+    all_l = {"utts": new_line}
+
+    # ensure_ascii=False keeps non-ASCII text as-is instead of \uXXXX escapes
+    jsonstring = json.dumps(
+        all_l,
+        indent=4,
+        ensure_ascii=False,
+        sort_keys=True,
+        separators=(",", ": "))
+    print(jsonstring)
diff --git a/utils/tokenizer.perl b/utils/tokenizer.perl
new file mode 100644
index 0000000000000000000000000000000000000000..ae97d6582bd4fa2433349e8d668c0b128d9097ad
--- /dev/null
+++ b/utils/tokenizer.perl
@@ -0,0 +1,596 @@
+#!/usr/bin/env perl
+#
+# This file is part of moses.  Its use is licensed under the GNU Lesser General
+# Public License version 2.1 or, at your option, any later version.
+
+use warnings;
+
+# Sample Tokenizer
+### Version 1.1
+# written by Pidong Wang, based on the code written by Josh Schroeder and Philipp Koehn
+# Version 1.1 updates:
+#       (1) add multithreading option "-threads NUM_THREADS" (default is 1);
+#       (2) add a timing option "-time" to calculate the average speed of this tokenizer;
+#       (3) add an option "-lines NUM_SENTENCES_PER_THREAD" to set the number of lines for each thread (default is 2000), and this option controls the memory amount needed: the larger this number is, the larger memory is required (the higher tokenization speed);
+### Version 1.0
+# $Id: tokenizer.perl 915 2009-08-10 08:15:49Z philipp $
+# written by Josh Schroeder, based on code by Philipp Koehn
+
+binmode(STDIN, ":utf8");
+binmode(STDOUT, ":utf8");
+
+use warnings;
+use FindBin qw($RealBin);
+use strict;
+use Time::HiRes;
+
+if  (eval {require Thread;1;}) {
+  #module loaded
+  Thread->import();
+}
+
+my $mydir = "$RealBin/../share/nonbreaking_prefixes";
+
+my %NONBREAKING_PREFIX = ();
+my @protected_patterns = ();
+my $protected_patterns_file = "";
+my $language = "en";
+my $QUIET = 0;
+my $HELP = 0;
+my $AGGRESSIVE = 0;
+my $SKIP_XML = 0;
+my $TIMING = 0;
+my $NUM_THREADS = 1;
+my $NUM_SENTENCES_PER_THREAD = 2000;
+my $PENN = 0;
+my $NO_ESCAPING = 0;
+while (@ARGV)
+{
+	$_ = shift;
+	/^-b$/ && ($| = 1, next);
+	/^-l$/ && ($language = shift, next);
+	/^-q$/ && ($QUIET = 1, next);
+	/^-h$/ && ($HELP = 1, next);
+	/^-x$/ && ($SKIP_XML = 1, next);
+	/^-a$/ && ($AGGRESSIVE = 1, next);
+	/^-time$/ && ($TIMING = 1, next);
+  # Option to add list of regexps to be protected
+  /^-protected/ && ($protected_patterns_file = shift, next);
+	/^-threads$/ && ($NUM_THREADS = int(shift), next);
+	/^-lines$/ && ($NUM_SENTENCES_PER_THREAD = int(shift), next);
+	/^-penn$/ && ($PENN = 1, next);
+	/^-no-escape/ && ($NO_ESCAPING = 1, next);
+}
+
+# for time calculation
+my $start_time;
+if ($TIMING)
+{
+    $start_time = [ Time::HiRes::gettimeofday( ) ];
+}
+
+# print help message
+if ($HELP)
+{
+	print "Usage ./tokenizer.perl (-l [en|de|...]) (-threads 4) < textfile > tokenizedfile\n";
+        print "Options:\n";
+        print "  -q     ... quiet.\n";
+        print "  -a     ... aggressive hyphen splitting.\n";
+        print "  -b     ... disable Perl buffering.\n";
+        print "  -time  ... enable processing time calculation.\n";
+        print "  -penn  ... use Penn treebank-like tokenization.\n";
+        print "  -protected FILE  ... specify file with patters to be protected in tokenisation.\n";
+	print "  -no-escape ... don't perform HTML escaping on apostrophy, quotes, etc.\n";
+	exit;
+}
+
+if (!$QUIET)
+{
+	print STDERR "Tokenizer Version 1.1\n";
+	print STDERR "Language: $language\n";
+	print STDERR "Number of threads: $NUM_THREADS\n";
+}
+
+# load the language-specific non-breaking prefix info from files in the directory nonbreaking_prefixes
+load_prefixes($language,\%NONBREAKING_PREFIX);
+
+if (scalar(%NONBREAKING_PREFIX) eq 0)
+{
+	print STDERR "Warning: No known abbreviations for language '$language'\n";
+}
+
+# Load protected patterns
+if ($protected_patterns_file)
+{
+  open(PP,$protected_patterns_file) || die "Unable to open $protected_patterns_file";
+  while(<PP>) {
+    chomp;
+    push @protected_patterns, $_;
+  }
+}
+
+my @batch_sentences = ();
+my @thread_list = ();
+my $count_sentences = 0;
+
+if ($NUM_THREADS > 1)
+{# multi-threading tokenization
+    while(<STDIN>)
+    {
+        $count_sentences = $count_sentences + 1;
+        push(@batch_sentences, $_);
+        if (scalar(@batch_sentences)>=($NUM_SENTENCES_PER_THREAD*$NUM_THREADS))
+        {
+            # assign each thread work
+            for (my $i=0; $i<$NUM_THREADS; $i++)
+            {
+                my $start_index = $i*$NUM_SENTENCES_PER_THREAD;
+                my $end_index = $start_index+$NUM_SENTENCES_PER_THREAD-1;
+                my @subbatch_sentences = @batch_sentences[$start_index..$end_index];
+                my $new_thread = new Thread \&tokenize_batch, @subbatch_sentences;
+                push(@thread_list, $new_thread);
+            }
+            foreach (@thread_list)
+            {
+                my $tokenized_list = $_->join;
+                foreach (@$tokenized_list)
+                {
+                    print $_;
+                }
+            }
+            # reset for the new run
+            @thread_list = ();
+            @batch_sentences = ();
+        }
+    }
+    # the last batch
+    if (scalar(@batch_sentences)>0)
+    {
+        # assign each thread work
+        for (my $i=0; $i<$NUM_THREADS; $i++)
+        {
+            my $start_index = $i*$NUM_SENTENCES_PER_THREAD;
+            if ($start_index >= scalar(@batch_sentences))
+            {
+                last;
+            }
+            my $end_index = $start_index+$NUM_SENTENCES_PER_THREAD-1;
+            if ($end_index >= scalar(@batch_sentences))
+            {
+                $end_index = scalar(@batch_sentences)-1;
+            }
+            my @subbatch_sentences = @batch_sentences[$start_index..$end_index];
+            my $new_thread = new Thread \&tokenize_batch, @subbatch_sentences;
+            push(@thread_list, $new_thread);
+        }
+        foreach (@thread_list)
+        {
+            my $tokenized_list = $_->join;
+            foreach (@$tokenized_list)
+            {
+                print $_;
+            }
+        }
+    }
+}
+else
+{# single thread only
+    while(<STDIN>) {
+        $count_sentences = $count_sentences + 1; # consumed by the -time report below
+        if (($SKIP_XML && /^<.+>$/) || /^\s*$/)
+        {
+            #don't try to tokenize XML/HTML tag lines
+            print $_;
+        }
+        else
+        {
+            print &tokenize($_);
+        }
+    }
+}
+
+if ($TIMING) {
+    my $duration = Time::HiRes::tv_interval( $start_time );
+    print STDERR ("TOTAL EXECUTION TIME: ".$duration."\n");
+    # empty input leaves $count_sentences at 0: skip the per-line speed rather than divide by zero
+    print STDERR ("TOKENIZATION SPEED: ".($duration/$count_sentences*1000)." milliseconds/line\n") if ($count_sentences > 0);
+}
+
+#####################################################################################
+# subroutines afterward
+
+# tokenize a batch of texts saved in an array
+# input: an array containing a batch of texts
+# return: another array containing a batch of tokenized texts for the input array
+sub tokenize_batch
+{
+    my(@text_list) = @_;
+    my(@tokenized_list) = ();
+    foreach (@text_list)
+    {
+        if (($SKIP_XML && /^<.+>$/) || /^\s*$/)
+        {
+            #don't try to tokenize XML/HTML tag lines
+            push(@tokenized_list, $_);
+        }
+        else
+        {
+            push(@tokenized_list, &tokenize($_));
+        }
+    }
+    return \@tokenized_list;
+}
+
+# the actual tokenize function which tokenizes one input string
+# input: one string
+# return: the tokenized string for the input string
+sub tokenize
+{
+    my($text) = @_;
+
+    if ($PENN) {
+      return tokenize_penn($text);
+    }
+
+    chomp($text);
+    $text = " $text ";
+
+    # remove ASCII junk
+    $text =~ s/\s+/ /g;
+    $text =~ s/[\000-\037]//g;
+
+    # Find protected patterns
+    my @protected = ();
+    foreach my $protected_pattern (@protected_patterns) {
+      my $t = $text;
+      while ($t =~ /(?<PATTERN>$protected_pattern)(?<TAIL>.*)$/) {
+        push @protected, $+{PATTERN};
+        $t = $+{TAIL};
+      }
+    }
+
+    for (my $i = 0; $i < scalar(@protected); ++$i) {
+      my $subst = sprintf("THISISPROTECTED%.3d", $i);
+      $text =~ s,\Q$protected[$i], $subst ,g;
+    }
+    $text =~ s/ +/ /g;
+    $text =~ s/^ //g;
+    $text =~ s/ $//g;
+
+    # separate out all "other" special characters
+    if (($language eq "fi") or ($language eq "sv")) {
+        # in Finnish and Swedish, the colon can be used inside words as an apostrophe-like character:
+        # USA:n, 20:een, EU:ssa, USA:s, S:t
+        $text =~ s/([^\p{IsAlnum}\s\.\:\'\`\,\-])/ $1 /g;
+        # if a colon is not immediately followed by lower-case characters, separate it out anyway
+        $text =~ s/(:)(?=$|[^\p{Ll}])/ $1 /g;
+    }
+    elsif ($language eq "tdt") {
+        # in Tetun, the apostrophe can be used inside words as an apostrophe-like character:
+        $text =~ s/([^\p{IsAlnum}\s\.\'\`\,\-])/ $1 /g;
+        # if an apostrophe is not immediately followed by lower-case characters, separate it out anyway
+        $text =~ s/(\')(?=$|[^\p{Ll}])/ $1 /g;
+    }
+    elsif (($language eq "ca")) {
+        # in Catalan, the middle dot can be used inside words:
+        # il·lusio
+        $text =~ s/([^\p{IsAlnum}\s\.\·\'\`\,\-])/ $1 /g;
+        # if a middot is not immediately followed by lower-case characters, separate it out anyway
+        $text =~ s/(·)(?=$|[^\p{Ll}])/ $1 /g;
+    }   
+    else {
+        $text =~ s/([^\p{IsAlnum}\s\.\'\`\,\-])/ $1 /g;
+    }
+
+    # aggressive hyphen splitting
+    if ($AGGRESSIVE)
+    {
+        $text =~ s/([\p{IsAlnum}])\-(?=[\p{IsAlnum}])/$1 \@-\@ /g;
+    }
+
+    #multi-dots stay together
+    $text =~ s/\.([\.]+)/ DOTMULTI$1/g;
+    while($text =~ /DOTMULTI\./)
+    {
+        $text =~ s/DOTMULTI\.([^\.])/DOTDOTMULTI $1/g;
+        $text =~ s/DOTMULTI\./DOTDOTMULTI/g;
+    }
+
+    # separate out "," except if within numbers (5,300)
+    #$text =~ s/([^\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
+
+    # separate out "," except if within numbers (5,300)
+    # previous "global" application skips some:  A,B,C,D,E > A , B,C , D,E
+    # first application uses up B so rule can't see B,C
+    # two-step version here may create extra spaces but these are removed later
+    # will also space digit,letter or letter,digit forms (redundant with next section)
+    $text =~ s/([^\p{IsN}])[,]/$1 , /g;
+    $text =~ s/[,]([^\p{IsN}])/ , $1/g;
+    
+    # separate "," after a number if it's the end of a sentence
+    $text =~ s/([\p{IsN}])[,]$/$1 ,/g;
+
+    # separate , pre and post number
+    #$text =~ s/([\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
+    #$text =~ s/([^\p{IsN}])[,]([\p{IsN}])/$1 , $2/g;
+
+    # turn `into '
+    #$text =~ s/\`/\'/g;
+
+    #turn '' into "
+    #$text =~ s/\'\'/ \" /g;
+
+    if ($language eq "en")
+    {
+        #split contractions right
+        $text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
+        $text =~ s/([^\p{IsAlpha}\p{IsN}])[']([\p{IsAlpha}])/$1 ' $2/g;
+        $text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
+        $text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1 '$2/g;
+        #special case for "1990's"
+        $text =~ s/([\p{IsN}])[']([s])/$1 '$2/g;
+    }
+    elsif (($language eq "fr") or ($language eq "it") or ($language eq "ga") or ($language eq "ca"))
+    {
+        #split contractions left
+        $text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
+        $text =~ s/([^\p{IsAlpha}])[']([\p{IsAlpha}])/$1 ' $2/g;
+        $text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
+        $text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1' $2/g;
+    }
+    elsif (($language eq "so")  or ($language eq "tdt"))
+    {
+        # Don't split glottals
+        $text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
+        $text =~ s/([^\p{IsAlpha}])[']([\p{IsAlpha}])/$1 ' $2/g;
+        $text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
+    }
+    else
+    {
+        $text =~ s/\'/ \' /g;
+    }
+
+    #word token method
+    my @words = split(/\s/,$text);
+    $text = "";
+    for (my $i=0;$i<(scalar(@words));$i++)
+    {
+        my $word = $words[$i];
+        if ( $word =~ /^(\S+)\.$/)
+        {
+            my $pre = $1;
+            if ($i == scalar(@words)-1) {
+                # split last words independently as they are unlikely to be non-breaking prefixes
+                $word = $pre." .";
+            }
+            elsif (($pre =~ /\./ && $pre =~ /\p{IsAlpha}/) || ($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==1) || ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[\p{IsLower}]/)))
+            {
+                #no change
+            }
+            elsif (($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==2) && ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[0-9]+/)))
+            {
+                #no change
+            }
+            else
+            {
+                $word = $pre." .";
+            }
+        }
+        $text .= $word." ";
+    }
+
+    # clean up extraneous spaces
+    $text =~ s/ +/ /g;
+    $text =~ s/^ //g;
+    $text =~ s/ $//g;
+
+    # .' at end of sentence is missed
+    $text =~ s/\.\' ?$/ . ' /;
+
+    # restore protected
+    for (my $i = 0; $i < scalar(@protected); ++$i) {
+      my $subst = sprintf("THISISPROTECTED%.3d", $i);
+      $text =~ s/$subst/$protected[$i]/g;
+    }
+
+    #restore multi-dots
+    while($text =~ /DOTDOTMULTI/)
+    {
+        $text =~ s/DOTDOTMULTI/DOTMULTI./g;
+    }
+    $text =~ s/DOTMULTI/./g;
+
+    #escape special chars
+    if (!$NO_ESCAPING)
+      {
+	$text =~ s/\&/\&amp;/g;   # escape escape
+	$text =~ s/\|/\&#124;/g;  # factor separator
+	$text =~ s/\</\&lt;/g;    # xml
+	$text =~ s/\>/\&gt;/g;    # xml
+	$text =~ s/\'/\&apos;/g;  # xml
+	$text =~ s/\"/\&quot;/g;  # xml
+	$text =~ s/\[/\&#91;/g;   # syntax non-terminal
+	$text =~ s/\]/\&#93;/g;   # syntax non-terminal
+      }
+
+    #ensure final line break
+    $text .= "\n" unless $text =~ /\n$/;
+
+    return $text;
+}
+
+sub tokenize_penn
+{
+    # Improved compatibility with Penn Treebank tokenization.  Useful if
+    # the text is to later be parsed with a PTB-trained parser.
+    #
+    # Adapted from Robert MacIntyre's sed script:
+    #   http://www.cis.upenn.edu/~treebank/tokenizer.sed
+
+    my($text) = @_;
+    chomp($text);
+
+    # remove ASCII junk
+    $text =~ s/\s+/ /g;
+    $text =~ s/[\000-\037]//g;
+
+    # attempt to get correct directional quotes
+    $text =~ s/^``/`` /g;
+    $text =~ s/^"/`` /g;
+    $text =~ s/^`([^`])/` $1/g;
+    $text =~ s/^'/`  /g;
+    $text =~ s/([ ([{<])"/$1 `` /g;
+    $text =~ s/([ ([{<])``/$1 `` /g;
+    $text =~ s/([ ([{<])`([^`])/$1 ` $2/g;
+    $text =~ s/([ ([{<])'/$1 ` /g;
+    # close quotes handled at end
+
+    $text =~ s=\.\.\.= _ELLIPSIS_ =g;
+
+    # separate out "," except if within numbers (5,300)
+    $text =~ s/([^\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
+    # separate , pre and post number
+    $text =~ s/([\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
+    $text =~ s/([^\p{IsN}])[,]([\p{IsN}])/$1 , $2/g;
+
+    #$text =~ s=([;:@#\$%&\p{IsSc}])= $1 =g;
+$text =~ s=([;:@#\$%&\p{IsSc}\p{IsSo}])= $1 =g;
+
+    # Separate out intra-token slashes.  PTB tokenization doesn't do this, so
+    # the tokens should be merged prior to parsing with a PTB-trained parser
+    # (see syntax-hyphen-splitting.perl).
+    $text =~ s/([\p{IsAlnum}])\/([\p{IsAlnum}])/$1 \@\/\@ $2/g;
+
+    # Assume sentence tokenization has been done first, so split FINAL periods
+    # only.
+    $text =~ s=([^.])([.])([\]\)}>"']*) ?$=$1 $2$3 =g;
+    # however, we may as well split ALL question marks and exclamation points,
+    # since they shouldn't have the abbrev.-marker ambiguity problem
+    $text =~ s=([?!])= $1 =g;
+
+    # parentheses, brackets, etc.
+    $text =~ s=([\]\[\(\){}<>])= $1 =g;
+    $text =~ s/\(/-LRB-/g;
+    $text =~ s/\)/-RRB-/g;
+    $text =~ s/\[/-LSB-/g;
+    $text =~ s/\]/-RSB-/g;
+    $text =~ s/{/-LCB-/g;
+    $text =~ s/}/-RCB-/g;
+
+    $text =~ s=--= -- =g;
+
+    # First off, add a space to the beginning and end of each line, to reduce
+    # necessary number of regexps.
+    $text =~ s=$= =;
+    $text =~ s=^= =;
+
+    $text =~ s="= '' =g;
+    # possessive or close-single-quote
+    $text =~ s=([^'])' =$1 ' =g;
+    # as in it's, I'm, we'd
+    $text =~ s='([sSmMdD]) = '$1 =g;
+    $text =~ s='ll = 'll =g;
+    $text =~ s='re = 're =g;
+    $text =~ s='ve = 've =g;
+    $text =~ s=n't = n't =g;
+    $text =~ s='LL = 'LL =g;
+    $text =~ s='RE = 'RE =g;
+    $text =~ s='VE = 'VE =g;
+    $text =~ s=N'T = N'T =g;
+
+    $text =~ s= ([Cc])annot = $1an not =g;
+    $text =~ s= ([Dd])'ye = $1' ye =g;
+    $text =~ s= ([Gg])imme = $1im me =g;
+    $text =~ s= ([Gg])onna = $1on na =g;
+    $text =~ s= ([Gg])otta = $1ot ta =g;
+    $text =~ s= ([Ll])emme = $1em me =g;
+    $text =~ s= ([Mm])ore'n = $1ore 'n =g;
+    $text =~ s= '([Tt])is = '$1 is =g;
+    $text =~ s= '([Tt])was = '$1 was =g;
+    $text =~ s= ([Ww])anna = $1an na =g;
+
+    #word token method
+    my @words = split(/\s/,$text);
+    $text = "";
+    for (my $i=0;$i<(scalar(@words));$i++)
+    {
+        my $word = $words[$i];
+        if ( $word =~ /^(\S+)\.$/)
+        {
+            my $pre = $1;
+            if (($pre =~ /\./ && $pre =~ /\p{IsAlpha}/) || ($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==1) || ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[\p{IsLower}]/)))
+            {
+                #no change
+            }
+            elsif (($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==2) && ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[0-9]+/)))
+            {
+                #no change
+            }
+            else
+            {
+                $word = $pre." .";
+            }
+        }
+        $text .= $word." ";
+    }
+
+    # restore ellipses
+    $text =~ s=_ELLIPSIS_=\.\.\.=g;
+
+    # clean out extra spaces
+    $text =~ s=  *= =g;
+    $text =~ s=^ *==g;
+    $text =~ s= *$==g;
+
+    #escape special chars
+    $text =~ s/\&/\&amp;/g;   # escape escape
+    $text =~ s/\|/\&#124;/g;  # factor separator
+    $text =~ s/\</\&lt;/g;    # xml
+    $text =~ s/\>/\&gt;/g;    # xml
+    $text =~ s/\'/\&apos;/g;  # xml
+    $text =~ s/\"/\&quot;/g;  # xml
+    $text =~ s/\[/\&#91;/g;   # syntax non-terminal
+    $text =~ s/\]/\&#93;/g;   # syntax non-terminal
+
+    #ensure final line break
+    $text .= "\n" unless $text =~ /\n$/;
+
+    return $text;
+}
+
+sub load_prefixes
+{
+    my ($language, $PREFIX_REF) = @_;
+
+    my $prefixfile = "$mydir/nonbreaking_prefix.$language";
+
+    #default back to English if we don't have a language-specific prefix file
+    if (!(-e $prefixfile))
+    {
+        $prefixfile = "$mydir/nonbreaking_prefix.en";
+        print STDERR "WARNING: No known abbreviations for language '$language', attempting fall-back to English version...\n";
+        die ("ERROR: No abbreviations files found in $mydir\n") unless (-e $prefixfile);
+    }
+
+    if (-e "$prefixfile")
+    {
+        open(PREFIX, "<:utf8", "$prefixfile");
+        while (<PREFIX>)
+        {
+            my $item = $_;
+            chomp($item);
+            if (($item) && (substr($item,0,1) ne "#"))
+            {
+                if ($item =~ /(.*)[\s]+(\#NUMERIC_ONLY\#)/)
+                {
+                    $PREFIX_REF->{$1} = 2;
+                }
+                else
+                {
+                    $PREFIX_REF->{$item} = 1;
+                }
+            }
+        }
+        close(PREFIX);
+    }
+}
\ No newline at end of file
diff --git a/utils/update_json.sh b/utils/update_json.sh
new file mode 100755
index 0000000000000000000000000000000000000000..bf69747554b1e1aa405449151540a6e03dfe6ded
--- /dev/null
+++ b/utils/update_json.sh
@@ -0,0 +1,88 @@
+#!/bin/bash
+
+# Copyright 2020 Kyoto University (Hirofumi Inaguma)
+#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
+
+echo "$0 $*" >&2 # Print the command line for logging
+. ./path.sh
+
+nlsyms=""
+oov="<unk>"
+bpecode=""
+verbose=0
+
+text=""
+multilingual=false
+
+help_message=$(cat << EOF
+Usage: $0 <json> <data-dir> <dict>
+e.g. $0 dump/train/deltafalse/data.json data/train data/lang_1char/train_units.txt
+Options:
+  --oov <oov-word>                                 # Default: <unk>
+  --verbose <num>                                  # Default: 0
+EOF
+)
+. utils/parse_options.sh
+
+if [ $# != 3 ]; then
+    echo "${help_message}" 1>&2
+    exit 1;
+fi
+
+set -euo pipefail
+
+json=$1
+dir=$2
+dic=$3
+json_dir=$(dirname ${json})
+tmpdir=$(mktemp -d ${dir}/tmp-XXXXX)
+trap 'rm -rf ${tmpdir}' EXIT
+
+if [ -z ${text} ]; then
+    text=${dir}/text
+fi
+
+# 2. Create scp files for outputs
+mkdir -p ${tmpdir}/output
+if [ -n "${bpecode}" ]; then
+    if [ ${multilingual} = true ]; then
+        # remove a space before the language ID
+        paste -d " " <(awk '{print $1}' ${text}) <(cut -f 2- -d" " ${text} \
+            | spm_encode --model=${bpecode} --output_format=piece | cut -f 2- -d" ") \
+            > ${tmpdir}/output/token.scp
+    else
+        paste -d " " <(awk '{print $1}' ${text}) <(cut -f 2- -d" " ${text} \
+            | spm_encode --model=${bpecode} --output_format=piece) \
+            > ${tmpdir}/output/token.scp
+    fi
+elif [ -n "${nlsyms}" ]; then
+    text2token.py -s 1 -n 1 -l ${nlsyms} ${text} > ${tmpdir}/output/token.scp
+else
+    text2token.py -s 1 -n 1 ${text} > ${tmpdir}/output/token.scp
+fi
+< ${tmpdir}/output/token.scp utils/sym2int.pl --map-oov ${oov} -f 2- ${dic} > ${tmpdir}/output/tokenid.scp
+awk '{print $1 " " NF-1}' ${tmpdir}/output/tokenid.scp > ${tmpdir}/output/olen.scp
+# +2 comes from CTC blank and EOS
+vocsize=$(tail -n 1 ${dic} | awk '{print $2}')
+odim=$(echo "$vocsize + 2" | bc)
+awk -v odim=${odim} '{print $1 " " odim}' ${text} > ${tmpdir}/output/odim.scp
+
+cat ${text} > ${tmpdir}/output/text.scp
+
+
+# 4. Create JSON files from each scp files
+rm -f ${tmpdir}/*/*.json
+for x in "${tmpdir}"/output/*.scp; do
+    k=$(basename ${x} .scp)
+    < ${x} scp2json.py --key ${k} > ${tmpdir}/output/${k}.json
+done
+
+# add to json
+addjson.py --verbose ${verbose} -i false \
+  ${json} ${tmpdir}/output/text.json ${tmpdir}/output/token.json ${tmpdir}/output/tokenid.json ${tmpdir}/output/olen.json ${tmpdir}/output/odim.json > ${tmpdir}/data.json
+mkdir -p ${json_dir}/.backup
+echo "json updated. original json is kept in ${json_dir}/.backup."
+cp ${json} ${json_dir}/.backup/"$(basename ${json})"
+cp ${tmpdir}/data.json ${json}
+
+rm -fr ${tmpdir}
diff --git a/utils/utility.py b/utils/utility.py
index b4db518a414de78fcc1c95dc0b9ab63d0a3c1733..dbf8b1d7fe24cc8595dfbd6858125b2213987c88 100755
--- a/utils/utility.py
+++ b/utils/utility.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import hashlib
-import json
 import os
 import sys
 import tarfile
@@ -22,31 +21,10 @@ from typing import Text
 __all__ = [
     "check_md5sum", "getfile_insensitive", "download_multi", "download",
     "unpack", "unzip", "md5file", "print_arguments", "add_arguments",
-    "read_manifest", "get_commandline_args"
+    "get_commandline_args"
 ]
 
 
-def read_manifest(manifest_path):
-    """Load and parse manifest file.
-    Args:
-        manifest_path ([type]): Manifest file to load and parse.
-
-    Raises:
-        IOError: If failed to parse the manifest.
-
-    Returns:
-        List[dict]: Manifest parsing results.
-    """
-
-    manifest = []
-    for json_line in open(manifest_path, 'r'):
-        try:
-            json_data = json.loads(json_line)
-        except Exception as e:
-            raise IOError("Error reading manifest: %s" % str(e))
-    return manifest
-
-
 def get_commandline_args():
     extra_chars = [
         " ",