...
 
Commits (4)
[s2t] mv dataset into paddlespeech.dataset (#3183)
Hui Zhang <zhtclz@foxmail.com>, 2023-04-21T11:33:17+08:00
https://gitcode.net/paddlepaddle/DeepSpeech/-/commit/35d874c5321e16eb57d8d9d77e7cbaec1ff3058d
* mv dataset into paddlespeech.dataset
* add aidatatang
* fix import

Fix some typos. (#3178)
Shuangchi He <34329208+Yulv-git@users.noreply.github.com>, 2023-04-21T13:06:20+08:00
Signed-off-by: Yulv-git <yulvchi@qq.com>
https://gitcode.net/paddlepaddle/DeepSpeech/-/commit/8c7859d3bc10f9f01eac4c1ee9099fb0b5e2c50f

[s2t] move s2t data preprocess into paddlespeech.dataset (#3189)
Hui Zhang <zhtclz@foxmail.com>, 2023-04-23T10:47:22+08:00
https://gitcode.net/paddlepaddle/DeepSpeech/-/commit/df3be4acae5ac109c839927eef5dd64dfc759e5a
* move s2t data preprocess into paddlespeech.dataset
* avg model, compute wer, format rsl into paddlespeech.dataset
* fix format rsl
* fix avg ckpts

Update pretrained model in README (#3193)
ljhzxc <33015549+ljhzxc@users.noreply.github.com>, 2023-04-23T19:40:06+08:00
https://gitcode.net/paddlepaddle/DeepSpeech/-/commit/84cc5fc98f339250ff11fc509705fa0a9929eb5d
......@@ -27,4 +27,4 @@ git commit -m "xxxxxx, test=doc"
1. Even though CI is skipped, the job still has to wait its turn in the queue before it can be skipped, so don't worry when jobs outside your own area show as pending 🤣
2. Adding `test=xxx` only when running `git commit --amend` may not take effect
3. If a PR contains multiple commits, remember to add `test=xxx` to every commit message, because each commit triggers CI
4. Remove any paddlespeech already installed in the Python environment, otherwise it may affect the import order of `import paddlespeech`
......@@ -191,7 +191,7 @@ def soundfile_save(y: np.ndarray, sr: int, file: os.PathLike) -> None:
if sr <= 0:
raise ParameterError(
f'Sample rate should be larger than 0, recieved sr = {sr}')
f'Sample rate should be larger than 0, received sr = {sr}')
if y.dtype not in ['int16', 'int8']:
warnings.warn(
......
......@@ -18,139 +18,7 @@ Manifest file is a json-format file with each line containing the
meta data (i.e. audio filepath, transcript and audio duration)
of each audio file in the data set.
"""
import argparse
import codecs
import json
import os
from pathlib import Path
import soundfile
from utils.utility import download
from utils.utility import unpack
DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')
URL_ROOT = 'http://www.openslr.org/resources/62'
# URL_ROOT = 'https://openslr.magicdatatech.com/resources/62'
DATA_URL = URL_ROOT + '/aidatatang_200zh.tgz'
MD5_DATA = '6e0f4f39cd5f667a7ee53c397c8d0949'
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
"--target_dir",
default=DATA_HOME + "/aidatatang_200zh",
type=str,
help="Directory to save the dataset. (default: %(default)s)")
parser.add_argument(
"--manifest_prefix",
default="manifest",
type=str,
help="Filepath prefix for output manifests. (default: %(default)s)")
args = parser.parse_args()
def create_manifest(data_dir, manifest_path_prefix):
print("Creating manifest %s ..." % manifest_path_prefix)
json_lines = []
transcript_path = os.path.join(data_dir, 'transcript',
'aidatatang_200_zh_transcript.txt')
transcript_dict = {}
for line in codecs.open(transcript_path, 'r', 'utf-8'):
line = line.strip()
if line == '':
continue
audio_id, text = line.split(' ', 1)
# remove whitespace; character-level text
text = ''.join(text.split())
transcript_dict[audio_id] = text
data_types = ['train', 'dev', 'test']
for dtype in data_types:
del json_lines[:]
total_sec = 0.0
total_text = 0.0
total_num = 0
audio_dir = os.path.join(data_dir, 'corpus/', dtype)
for subfolder, _, filelist in sorted(os.walk(audio_dir)):
for fname in filelist:
if not fname.endswith('.wav'):
continue
audio_path = os.path.abspath(os.path.join(subfolder, fname))
audio_id = os.path.basename(fname)[:-4]
utt2spk = Path(audio_path).parent.name
audio_data, samplerate = soundfile.read(audio_path)
duration = float(len(audio_data) / samplerate)
text = transcript_dict[audio_id]
json_lines.append(
json.dumps(
{
'utt': audio_id,
'utt2spk': str(utt2spk),
'feat': audio_path,
'feat_shape': (duration, ), # second
'text': text,
},
ensure_ascii=False))
total_sec += duration
total_text += len(text)
total_num += 1
manifest_path = manifest_path_prefix + '.' + dtype
with codecs.open(manifest_path, 'w', 'utf-8') as fout:
for line in json_lines:
fout.write(line + '\n')
manifest_dir = os.path.dirname(manifest_path_prefix)
meta_path = os.path.join(manifest_dir, dtype) + '.meta'
with open(meta_path, 'w') as f:
print(f"{dtype}:", file=f)
print(f"{total_num} utts", file=f)
print(f"{total_sec / (60*60)} h", file=f)
print(f"{total_text} text", file=f)
print(f"{total_text / total_sec} text/sec", file=f)
print(f"{total_sec / total_num} sec/utt", file=f)
def prepare_dataset(url, md5sum, target_dir, manifest_path, subset):
"""Download, unpack and create manifest file."""
data_dir = os.path.join(target_dir, subset)
if not os.path.exists(data_dir):
filepath = download(url, md5sum, target_dir)
unpack(filepath, target_dir)
# unpack all audio tar files
audio_dir = os.path.join(data_dir, 'corpus')
for subfolder, dirlist, filelist in sorted(os.walk(audio_dir)):
for sub in dirlist:
print(f"unpack dir {sub}...")
for folder, _, filelist in sorted(
os.walk(os.path.join(subfolder, sub))):
for ftar in filelist:
unpack(os.path.join(folder, ftar), folder, True)
else:
print("Skip downloading and unpacking. Data already exists in %s." %
target_dir)
create_manifest(data_dir, manifest_path)
def main():
if args.target_dir.startswith('~'):
args.target_dir = os.path.expanduser(args.target_dir)
prepare_dataset(
url=DATA_URL,
md5sum=MD5_DATA,
target_dir=args.target_dir,
manifest_path=args.manifest_prefix,
subset='aidatatang_200zh')
print("Data download and manifest prepare done!")
from paddlespeech.dataset.aidatatang_200zh import aidatatang_200zh_main
if __name__ == '__main__':
main()
aidatatang_200zh_main()
......@@ -18,143 +18,7 @@ Manifest file is a json-format file with each line containing the
meta data (i.e. audio filepath, transcript and audio duration)
of each audio file in the data set.
"""
import argparse
import codecs
import json
import os
from pathlib import Path
import soundfile
from utils.utility import download
from utils.utility import unpack
DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')
URL_ROOT = 'http://openslr.elda.org/resources/33'
# URL_ROOT = 'https://openslr.magicdatatech.com/resources/33'
DATA_URL = URL_ROOT + '/data_aishell.tgz'
MD5_DATA = '2f494334227864a8a8fec932999db9d8'
RESOURCE_URL = URL_ROOT + '/resource_aishell.tgz'
MD5_RESOURCE = '957d480a0fcac85fc18e550756f624e5'
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
"--target_dir",
default=DATA_HOME + "/Aishell",
type=str,
help="Directory to save the dataset. (default: %(default)s)")
parser.add_argument(
"--manifest_prefix",
default="manifest",
type=str,
help="Filepath prefix for output manifests. (default: %(default)s)")
args = parser.parse_args()
def create_manifest(data_dir, manifest_path_prefix):
print("Creating manifest %s ..." % manifest_path_prefix)
json_lines = []
transcript_path = os.path.join(data_dir, 'transcript',
'aishell_transcript_v0.8.txt')
transcript_dict = {}
for line in codecs.open(transcript_path, 'r', 'utf-8'):
line = line.strip()
if line == '':
continue
audio_id, text = line.split(' ', 1)
# remove whitespace; character-level text
text = ''.join(text.split())
transcript_dict[audio_id] = text
data_types = ['train', 'dev', 'test']
for dtype in data_types:
del json_lines[:]
total_sec = 0.0
total_text = 0.0
total_num = 0
audio_dir = os.path.join(data_dir, 'wav', dtype)
for subfolder, _, filelist in sorted(os.walk(audio_dir)):
for fname in filelist:
audio_path = os.path.abspath(os.path.join(subfolder, fname))
audio_id = os.path.basename(fname)[:-4]
# skip audio that has no transcription
if audio_id not in transcript_dict:
continue
utt2spk = Path(audio_path).parent.name
audio_data, samplerate = soundfile.read(audio_path)
duration = float(len(audio_data) / samplerate)
text = transcript_dict[audio_id]
json_lines.append(
json.dumps(
{
'utt': audio_id,
'utt2spk': str(utt2spk),
'feat': audio_path,
'feat_shape': (duration, ), # second
'text': text
},
ensure_ascii=False))
total_sec += duration
total_text += len(text)
total_num += 1
manifest_path = manifest_path_prefix + '.' + dtype
with codecs.open(manifest_path, 'w', 'utf-8') as fout:
for line in json_lines:
fout.write(line + '\n')
manifest_dir = os.path.dirname(manifest_path_prefix)
meta_path = os.path.join(manifest_dir, dtype) + '.meta'
with open(meta_path, 'w') as f:
print(f"{dtype}:", file=f)
print(f"{total_num} utts", file=f)
print(f"{total_sec / (60*60)} h", file=f)
print(f"{total_text} text", file=f)
print(f"{total_text / total_sec} text/sec", file=f)
print(f"{total_sec / total_num} sec/utt", file=f)
def prepare_dataset(url, md5sum, target_dir, manifest_path=None):
"""Download, unpack and create manifest file."""
data_dir = os.path.join(target_dir, 'data_aishell')
if not os.path.exists(data_dir):
filepath = download(url, md5sum, target_dir)
unpack(filepath, target_dir)
# unpack all audio tar files
audio_dir = os.path.join(data_dir, 'wav')
for subfolder, _, filelist in sorted(os.walk(audio_dir)):
for ftar in filelist:
unpack(os.path.join(subfolder, ftar), subfolder, True)
else:
print("Skip downloading and unpacking. Data already exists in %s." %
target_dir)
if manifest_path:
create_manifest(data_dir, manifest_path)
def main():
if args.target_dir.startswith('~'):
args.target_dir = os.path.expanduser(args.target_dir)
prepare_dataset(
url=DATA_URL,
md5sum=MD5_DATA,
target_dir=args.target_dir,
manifest_path=args.manifest_prefix)
prepare_dataset(
url=RESOURCE_URL,
md5sum=MD5_RESOURCE,
target_dir=args.target_dir,
manifest_path=None)
print("Data download and manifest prepare done!")
from paddlespeech.dataset.aishell import aishell_main
if __name__ == '__main__':
main()
aishell_main()
......@@ -28,8 +28,8 @@ from multiprocessing.pool import Pool
import distutils.util
import soundfile
from utils.utility import download
from utils.utility import unpack
from paddlespeech.dataset.download import download
from paddlespeech.dataset.download import unpack
URL_ROOT = "http://openslr.elda.org/resources/12"
#URL_ROOT = "https://openslr.magicdatatech.com/resources/12"
......
......@@ -27,8 +27,8 @@ from multiprocessing.pool import Pool
import soundfile
from utils.utility import download
from utils.utility import unpack
from paddlespeech.dataset.download import download
from paddlespeech.dataset.download import unpack
URL_ROOT = "http://openslr.elda.org/resources/31"
URL_TRAIN_CLEAN = URL_ROOT + "/train-clean-5.tar.gz"
......
......@@ -29,8 +29,8 @@ import os
import soundfile
from utils.utility import download
from utils.utility import unpack
from paddlespeech.dataset.download import download
from paddlespeech.dataset.download import unpack
DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')
......
......@@ -29,8 +29,8 @@ import os
import soundfile
from utils.utility import download
from utils.utility import unzip
from paddlespeech.dataset.download import download
from paddlespeech.dataset.download import unzip
DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')
......
......@@ -27,8 +27,8 @@ from pathlib import Path
import soundfile
from utils.utility import download
from utils.utility import unpack
from paddlespeech.dataset.download import download
from paddlespeech.dataset.download import unpack
DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')
......
......@@ -28,7 +28,7 @@ from pathlib import Path
import soundfile
from utils.utility import unzip
from paddlespeech.dataset.download import unzip
URL_ROOT = ""
MD5_DATA = "45c68037c7fdfe063a43c851f181fb2d"
......
......@@ -31,9 +31,9 @@ from pathlib import Path
import soundfile
from utils.utility import check_md5sum
from utils.utility import download
from utils.utility import unzip
from paddlespeech.dataset.download import check_md5sum
from paddlespeech.dataset.download import download
from paddlespeech.dataset.download import unzip
# by default, all the data will be downloaded into the current data/voxceleb directory
DATA_HOME = os.path.expanduser('.')
......
......@@ -27,9 +27,9 @@ from pathlib import Path
import soundfile
from utils.utility import check_md5sum
from utils.utility import download
from utils.utility import unzip
from paddlespeech.dataset.download import check_md5sum
from paddlespeech.dataset.download import download
from paddlespeech.dataset.download import unzip
# by default, all the data will be downloaded into the current data/voxceleb directory
DATA_HOME = os.path.expanduser('.')
......
......@@ -28,9 +28,9 @@ import subprocess
import soundfile
from utils.utility import download_multi
from utils.utility import getfile_insensitive
from utils.utility import unpack
from paddlespeech.dataset.download import download_multi
from paddlespeech.dataset.download import getfile_insensitive
from paddlespeech.dataset.download import unpack
DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')
......
# Text-to-Speech Java API Demo Usage Guide
This demo implements text-to-speech on Android. It is easy to use and open to extension; for example, you can run your own trained model in the demo.
This document mainly describes how to run the text-to-speech demo.
......
......@@ -6,13 +6,13 @@
--jieba_stop_word_path=./dict/jieba/stop_words.utf8
# dict conf fastspeech2_0.4
--seperate_tone=false
--separate_tone=false
--word2phone_path=./dict/fastspeech2_nosil_baker_ckpt_0.4/word2phone_fs2.dict
--phone2id_path=./dict/fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt
--tone2id_path=./dict/fastspeech2_nosil_baker_ckpt_0.4/word2phone_fs2.dict
# dict conf speedyspeech_0.5
#--seperate_tone=true
#--separate_tone=true
#--word2phone_path=./dict/speedyspeech_nosil_baker_ckpt_0.5/word2phone.dict
#--phone2id_path=./dict/speedyspeech_nosil_baker_ckpt_0.5/phone_id_map.txt
#--tone2id_path=./dict/speedyspeech_nosil_baker_ckpt_0.5/tone_id_map.txt
......
......@@ -6,13 +6,13 @@
--jieba_stop_word_path=./front_demo/dict/jieba/stop_words.utf8
# dict conf fastspeech2_0.4
--seperate_tone=false
--separate_tone=false
--word2phone_path=./front_demo/dict/fastspeech2_nosil_baker_ckpt_0.4/word2phone_fs2.dict
--phone2id_path=./front_demo/dict/fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt
--tone2id_path=./front_demo/dict/fastspeech2_nosil_baker_ckpt_0.4/word2phone_fs2.dict
# dict conf speedyspeech_0.5
#--seperate_tone=true
#--separate_tone=true
#--word2phone_path=./front_demo/dict/speedyspeech_nosil_baker_ckpt_0.5/word2phone.dict
#--phone2id_path=./front_demo/dict/speedyspeech_nosil_baker_ckpt_0.5/phone_id_map.txt
#--tone2id_path=./front_demo/dict/speedyspeech_nosil_baker_ckpt_0.5/tone_id_map.txt
......
......@@ -20,7 +20,7 @@
DEFINE_string(sentence, "你好,欢迎使用语音合成服务", "Text to be synthesized");
DEFINE_string(front_conf, "./front_demo/front.conf", "Front conf file");
// DEFINE_string(seperate_tone, "true", "If true, get phoneids and tonesid");
// DEFINE_string(separate_tone, "true", "If true, get phoneids and tonesid");
int main(int argc, char** argv) {
......
......@@ -20,7 +20,7 @@ worddict = "./dict/jieba_part.dict.utf8"
newdict = "./dict/word_phones.dict"
def GenPhones(initials, finals, seperate=True):
def GenPhones(initials, finals, separate=True):
phones = []
for c, v in zip(initials, finals):
......@@ -30,9 +30,9 @@ def GenPhones(initials, finals, seperate=True):
elif c in ['zh', 'ch', 'sh', 'r']:
v = re.sub('i', 'iii', v)
if c:
if seperate is True:
if separate is True:
phones.append(c + '0')
elif seperate is False:
elif separate is False:
phones.append(c)
else:
print("Not sure whether phone and tone need to be separated")
......
......@@ -126,7 +126,7 @@ int FrontEngineInterface::init() {
}
// Generate the tone dictionary (mapping tones to tone ids)
if (_seperate_tone == "true") {
if (_separate_tone == "true") {
if (0 != GenDict(_tone2id_path, &tone_id_map)) {
LOG(ERROR) << "Genarate tone2id dict failed";
return -1;
......@@ -168,7 +168,7 @@ int FrontEngineInterface::ReadConfFile() {
_jieba_stop_word_path = conf_map["jieba_stop_word_path"];
// dict path
_seperate_tone = conf_map["seperate_tone"];
_separate_tone = conf_map["separate_tone"];
_word2phone_path = conf_map["word2phone_path"];
_phone2id_path = conf_map["phone2id_path"];
_tone2id_path = conf_map["tone2id_path"];
......@@ -295,7 +295,7 @@ int FrontEngineInterface::GetWordsIds(
}
}
} else { // punctuation
if (_seperate_tone == "true") {
if (_separate_tone == "true") {
phone = "sp0"; // speedyspeech
} else {
phone = "sp"; // fastspeech2
......@@ -354,7 +354,7 @@ int FrontEngineInterface::Phone2Phoneid(const std::string &phone,
std::string temp_phone;
for (int i = 0; i < phone_vec.size(); i++) {
temp_phone = phone_vec[i];
if (_seperate_tone == "true") {
if (_separate_tone == "true") {
phoneid->push_back(atoi(
(phone_id_map[temp_phone.substr(0, temp_phone.length() - 1)])
.c_str()));
......
......@@ -182,7 +182,7 @@ class FrontEngineInterface : public TextNormalizer {
std::string _jieba_idf_path;
std::string _jieba_stop_word_path;
std::string _seperate_tone;
std::string _separate_tone;
std::string _word2phone_path;
std::string _phone2id_path;
std::string _tone2id_path;
......
......@@ -14,8 +14,8 @@
from audio_search import app
from fastapi.testclient import TestClient
from utils.utility import download
from utils.utility import unpack
from paddlespeech.dataset.download import download
from paddlespeech.dataset.download import unpack
client = TestClient(app)
......
......@@ -14,8 +14,8 @@
from fastapi.testclient import TestClient
from vpr_search import app
from utils.utility import download
from utils.utility import unpack
from paddlespeech.dataset.download import download
from paddlespeech.dataset.download import unpack
client = TestClient(app)
......
......@@ -23,7 +23,7 @@ Paddle Speech Demo 是一个以 PaddleSpeech 的语音交互功能为主体开
+ ERNIE-SAT: a visualization demo of ERNIE-SAT, a cross-modal (language-speech) large model. It supports personalized synthesis, cross-lingual synthesis (for Chinese audio, input English text to synthesize), and speech editing (modifying words in the middle of an audio clip). For more implementation details of ERNIE-SAT, see:
+ [【ERNIE-SAT with AISHELL-3 dataset】](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/ernie_sat)
+ [【ERNIE-SAT with with AISHELL3 and VCTK datasets】](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3_vctk/ernie_sat)
+ [【ERNIE-SAT with AISHELL3 and VCTK datasets】](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3_vctk/ernie_sat)
+ [【ERNIE-SAT with VCTK dataset】](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/ernie_sat)
What it looks like when running:
......
......@@ -260,7 +260,7 @@ async def websocket_endpoint_online(websocket: WebSocket):
# and we break the loop
if message['signal'] == 'start':
resp = {"status": "ok", "signal": "server_ready"}
# do something at begining here
# do something at beginning here
# create the instance to process the audio
# connection_handler = chatbot.asr.connection_handler
connection_handler = PaddleASRConnectionHanddler(engine)
......
......@@ -62,7 +62,7 @@
"collapsed": false
},
"source": [
"# 使用Transformer进行端到端语音翻译的基本流程\n",
"# 使用Transformer进行端到端语音翻译的基本流程\n",
"## 基础模型\n",
"由于 ASR 章节已经介绍了 Transformer 以及语音特征抽取,在此便不做过多介绍,感兴趣的同学可以去相关章节进行了解。\n",
"\n",
......
......@@ -464,7 +464,7 @@
"<br><center> FastSpeech2 网络结构图</center></br>\n",
"\n",
"\n",
"PaddleSpeech TTS 实现的 FastSpeech2 与论文不同的地方在于,我们使用的是 phone 级别的 `pitch` 和 `energy`(与 FastPitch 类似),这样的合成结果可以更加**稳定**。\n",
"PaddleSpeech TTS 实现的 FastSpeech2 与论文不同的地方在于,我们使用的是 phone 级别的 `pitch` 和 `energy`(与 FastPitch 类似),这样的合成结果可以更加**稳定**。\n",
"<center><img src=\"https://ai-studio-static-online.cdn.bcebos.com/862c21456c784c41a83a308b7d9707f0810cc3b3c6f94ed48c60f5d32d0072f0\"></center>\n",
"<br><center> FastPitch 网络结构图</center></br>\n",
"\n",
......
#!/bin/bash
if [ $# != 3 ];then
echo "usage: ${0} config_path decode_config_path ckpt_path_prefix"
exit -1
fi
set -e
stage=0
stop_stage=100
source utils/parse_options.sh || exit 1;
ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..."
if [ $# != 3 ];then
echo "usage: ${0} config_path decode_config_path ckpt_path_prefix"
exit -1
fi
config_path=$1
decode_config_path=$2
ckpt_prefix=$3
......@@ -92,6 +98,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
fi
if [ ${stage} -le 101 ] && [ ${stop_stage} -ge 101 ]; then
echo "using sclite to compute cer..."
# format the reference test file for sclite
python utils/format_rsl.py \
--origin_ref data/manifest.test.raw \
......
......@@ -96,3 +96,13 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p
```bash
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name}
```
## Pretrained Model
The pretrained model can be downloaded here:
- [jets_csmsc_ckpt_1.5.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/jets_csmsc_ckpt_1.5.0.zip)
The static model can be downloaded here:
- [jets_csmsc_static_1.5.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/jets_csmsc_static_1.5.0.zip)
......@@ -153,7 +153,7 @@ After training the model, we need to get the final model for testing and inferen
```bash
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# avg n best model
avg.sh lastest exp/${ckpt}/checkpoints ${avg_num}
avg.sh latest exp/${ckpt}/checkpoints ${avg_num}
fi
```
The `avg.sh` script is in `../../../utils/`, which is defined in `path.sh`.
......
......@@ -48,7 +48,7 @@ def rule(C, V, R, T):
'i' is distinguished when it appears in phonemes, and is separated into 3 categories: 'i', 'ii' and 'iii'.
Erhua is is possibly applied to every finals, except for finals that already ends with 'r'.
Erhua may be applied to every final, except for finals that already end with 'r'.
When a syllable is impossible or does not have any characters with this pronunciation, return None
to filter it out.
......
......@@ -37,7 +37,7 @@ It will support the way of using `--variable value` in the shell scripts.
Some local variables are set in `run.sh`.
`gpus` denotes the GPU number you want to use. If you set `gpus=`, it means you only use CPU.
`stage` denotes the stage you want to start from in the experiments.
`stop stage` denotes the number of stage you want the stop at in the expriments.
`stop stage` denotes the stage you want to stop at in the experiments.
`conf_path` denotes the config path of the model.
`avg_num` denotes the number K of top-K models you want to average to get the final model.
`ckpt` denotes the checkpoint prefix of the model, e.g. "transformer"
......
......@@ -13,3 +13,7 @@
# limitations under the License.
import _locale
_locale._getdefaultlocale = (lambda *args: ['en_US', 'utf8'])
__version__ = '0.0.0'
__commit__ = '9cf8c1985a98bb380c183116123672976bdfe5c9'
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .aidatatang_200zh import main as aidatatang_200zh_main
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Prepare aidatatang_200zh mandarin dataset
Download, unpack and create manifest files.
Manifest file is a json-format file with each line containing the
meta data (i.e. audio filepath, transcript and audio duration)
of each audio file in the data set.
"""
import argparse
import codecs
import json
import os
from pathlib import Path
import soundfile
from paddlespeech.dataset.download import download
from paddlespeech.dataset.download import unpack
from paddlespeech.utils.argparse import print_arguments
DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')
URL_ROOT = 'http://www.openslr.org/resources/62'
# URL_ROOT = 'https://openslr.magicdatatech.com/resources/62'
DATA_URL = URL_ROOT + '/aidatatang_200zh.tgz'
MD5_DATA = '6e0f4f39cd5f667a7ee53c397c8d0949'
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
"--target_dir",
default=DATA_HOME + "/aidatatang_200zh",
type=str,
help="Directory to save the dataset. (default: %(default)s)")
parser.add_argument(
"--manifest_prefix",
default="manifest",
type=str,
help="Filepath prefix for output manifests. (default: %(default)s)")
args = parser.parse_args()
def create_manifest(data_dir, manifest_path_prefix):
print("Creating manifest %s ..." % manifest_path_prefix)
json_lines = []
transcript_path = os.path.join(data_dir, 'transcript',
'aidatatang_200_zh_transcript.txt')
transcript_dict = {}
for line in codecs.open(transcript_path, 'r', 'utf-8'):
line = line.strip()
if line == '':
continue
audio_id, text = line.split(' ', 1)
# remove whitespace; character-level text
text = ''.join(text.split())
transcript_dict[audio_id] = text
data_types = ['train', 'dev', 'test']
for dtype in data_types:
del json_lines[:]
total_sec = 0.0
total_text = 0.0
total_num = 0
audio_dir = os.path.join(data_dir, 'corpus/', dtype)
for subfolder, _, filelist in sorted(os.walk(audio_dir)):
for fname in filelist:
if not fname.endswith('.wav'):
continue
audio_path = os.path.abspath(os.path.join(subfolder, fname))
audio_id = os.path.basename(fname)[:-4]
utt2spk = Path(audio_path).parent.name
audio_data, samplerate = soundfile.read(audio_path)
duration = float(len(audio_data) / samplerate)
text = transcript_dict[audio_id]
json_lines.append(
json.dumps(
{
'utt': audio_id,
'utt2spk': str(utt2spk),
'feat': audio_path,
'feat_shape': (duration, ), # second
'text': text,
},
ensure_ascii=False))
total_sec += duration
total_text += len(text)
total_num += 1
manifest_path = manifest_path_prefix + '.' + dtype
with codecs.open(manifest_path, 'w', 'utf-8') as fout:
for line in json_lines:
fout.write(line + '\n')
manifest_dir = os.path.dirname(manifest_path_prefix)
meta_path = os.path.join(manifest_dir, dtype) + '.meta'
with open(meta_path, 'w') as f:
print(f"{dtype}:", file=f)
print(f"{total_num} utts", file=f)
print(f"{total_sec / (60*60)} h", file=f)
print(f"{total_text} text", file=f)
print(f"{total_text / total_sec} text/sec", file=f)
print(f"{total_sec / total_num} sec/utt", file=f)
def prepare_dataset(url, md5sum, target_dir, manifest_path, subset):
"""Download, unpack and create manifest file."""
data_dir = os.path.join(target_dir, subset)
if not os.path.exists(data_dir):
filepath = download(url, md5sum, target_dir)
unpack(filepath, target_dir)
# unpack all audio tar files
audio_dir = os.path.join(data_dir, 'corpus')
for subfolder, dirlist, filelist in sorted(os.walk(audio_dir)):
for sub in dirlist:
print(f"unpack dir {sub}...")
for folder, _, filelist in sorted(
os.walk(os.path.join(subfolder, sub))):
for ftar in filelist:
unpack(os.path.join(folder, ftar), folder, True)
else:
print("Skip downloading and unpacking. Data already exists in %s." %
target_dir)
create_manifest(data_dir, manifest_path)
def main():
print_arguments(args, globals())
if args.target_dir.startswith('~'):
args.target_dir = os.path.expanduser(args.target_dir)
prepare_dataset(
url=DATA_URL,
md5sum=MD5_DATA,
target_dir=args.target_dir,
manifest_path=args.manifest_prefix,
subset='aidatatang_200zh')
print("Data download and manifest prepare done!")
if __name__ == '__main__':
main()
# [Aishell1](http://openslr.elda.org/33/)
This Open Source Mandarin Speech Corpus, AISHELL-ASR0009-OS1, is 178 hours long. It is a part of AISHELL-ASR0009, whose utterances cover 11 domains, including smart home, autonomous driving, and industrial production. All recording took place in a quiet indoor environment, using 3 different devices at the same time: a high-fidelity microphone (44.1kHz, 16-bit), an Android-system mobile phone (16kHz, 16-bit), and an iOS-system mobile phone (16kHz, 16-bit). The high-fidelity audio was re-sampled to 16kHz to build AISHELL-ASR0009-OS1. 400 speakers from different accent areas in China were invited to participate in the recording. Through professional speech annotation and strict quality inspection, the manual transcription accuracy is above 95%. The corpus is divided into training, development and testing sets. (This database is free for academic research; it may not be used commercially without permission.)
## Dataset Architecture
```bash
data_aishell
├── transcript # transcript directory
└── wav # wav directory
├── dev # dev split directory
│ ├── S0724 # speaker directory
│ ├── S0725
│ ├── S0726
├── train
│ ├── S0724
│ ├── S0725
│ ├── S0726
├── test
│ ├── S0724
│ ├── S0725
│ ├── S0726
data_aishell
├── transcript
│ └── aishell_transcript_v0.8.txt # transcript annotation file
└── wav
├── dev
│ ├── S0724
│ │ ├── BAC009S0724W0121.wav # audio of speaker S0724
│ │ ├── BAC009S0724W0122.wav
│ │ ├── BAC009S0724W0123.wav
├── test
│ ├── S0724
│ │ ├── BAC009S0724W0121.wav
│ │ ├── BAC009S0724W0122.wav
│ │ ├── BAC009S0724W0123.wav
├── train
│ ├── S0724
│ │ ├── BAC009S0724W0121.wav
│ │ ├── BAC009S0724W0122.wav
│ │ ├── BAC009S0724W0123.wav
Transcript file format: <utt> <tokens>
> head data_aishell/transcript/aishell_transcript_v0.8.txt
BAC009S0002W0122 而 对 楼市 成交 抑制 作用 最 大 的 限 购
BAC009S0002W0123 也 成为 地方 政府 的 眼中 钉
BAC009S0002W0124 自 六月 底 呼和浩特 市 率先 宣布 取消 限 购 后
BAC009S0002W0125 各地 政府 便 纷纷 跟进
BAC009S0002W0126 仅 一 个 多 月 的 时间 里
BAC009S0002W0127 除了 北京 上海 广州 深圳 四 个 一 线 城市 和 三亚 之外
BAC009S0002W0128 四十六 个 限 购 城市 当中
BAC009S0002W0129 四十一 个 已 正式 取消 或 变相 放松 了 限 购
BAC009S0002W0130 财政 金融 政策 紧随 其后 而来
BAC009S0002W0131 显示 出 了 极 强 的 威力
```
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .aishell import check_dataset
from .aishell import create_manifest
from .aishell import download_dataset
from .aishell import main as aishell_main
from .aishell import prepare_dataset
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Prepare Aishell mandarin dataset
Download, unpack and create manifest files.
Manifest file is a json-format file with each line containing the
meta data (i.e. audio filepath, transcript and audio duration)
of each audio file in the data set.
"""
import argparse
import codecs
import json
import os
from pathlib import Path
import soundfile
from paddlespeech.dataset.download import download
from paddlespeech.dataset.download import unpack
from paddlespeech.utils.argparse import print_arguments
DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')
URL_ROOT = 'http://openslr.elda.org/resources/33'
# URL_ROOT = 'https://openslr.magicdatatech.com/resources/33'
DATA_URL = URL_ROOT + '/data_aishell.tgz'
MD5_DATA = '2f494334227864a8a8fec932999db9d8'
RESOURCE_URL = URL_ROOT + '/resource_aishell.tgz'
MD5_RESOURCE = '957d480a0fcac85fc18e550756f624e5'
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
"--target_dir",
default=DATA_HOME + "/Aishell",
type=str,
help="Directory to save the dataset. (default: %(default)s)")
parser.add_argument(
"--manifest_prefix",
default="manifest",
type=str,
help="Filepath prefix for output manifests. (default: %(default)s)")
args = parser.parse_args()
def create_manifest(data_dir, manifest_path_prefix):
print("Creating manifest %s ..." % os.path.join(data_dir,
manifest_path_prefix))
json_lines = []
transcript_path = os.path.join(data_dir, 'transcript',
'aishell_transcript_v0.8.txt')
transcript_dict = {}
for line in codecs.open(transcript_path, 'r', 'utf-8'):
line = line.strip()
if line == '':
continue
audio_id, text = line.split(' ', 1)
# remove whitespace; character-level text
text = ''.join(text.split())
transcript_dict[audio_id] = text
data_metas = dict()
data_types = ['train', 'dev', 'test']
for dtype in data_types:
del json_lines[:]
total_sec = 0.0
total_text = 0.0
total_num = 0
audio_dir = os.path.join(data_dir, 'wav', dtype)
for subfolder, _, filelist in sorted(os.walk(audio_dir)):
for fname in filelist:
audio_path = os.path.abspath(os.path.join(subfolder, fname))
audio_id = os.path.basename(fname)[:-4]
# skip audio that has no transcription
if audio_id not in transcript_dict:
continue
utt2spk = Path(audio_path).parent.name
audio_data, samplerate = soundfile.read(audio_path)
duration = float(len(audio_data) / samplerate)
text = transcript_dict[audio_id]
json_lines.append(
json.dumps(
{
'utt': audio_id,
'utt2spk': str(utt2spk),
'feat': audio_path,
'feat_shape': (duration, ), # second
'text': text
},
ensure_ascii=False))
total_sec += duration
total_text += len(text)
total_num += 1
manifest_path = manifest_path_prefix + '.' + dtype
with codecs.open(manifest_path, 'w', 'utf-8') as fout:
for line in json_lines:
fout.write(line + '\n')
meta = dict()
meta["dtype"] = dtype # train, dev, test
meta["utts"] = total_num
meta["hours"] = total_sec / (60 * 60)
meta["text"] = total_text
meta["text/sec"] = total_text / total_sec
meta["sec/utt"] = total_sec / total_num
data_metas[dtype] = meta
manifest_dir = os.path.dirname(manifest_path_prefix)
meta_path = os.path.join(manifest_dir, dtype) + '.meta'
with open(meta_path, 'w') as f:
for key, val in meta.items():
print(f"{key}: {val}", file=f)
return data_metas
def download_dataset(url, md5sum, target_dir):
"""Download, unpack and create manifest file."""
data_dir = os.path.join(target_dir, 'data_aishell')
if not os.path.exists(data_dir):
filepath = download(url, md5sum, target_dir)
unpack(filepath, target_dir)
# unpack all audio tar files
audio_dir = os.path.join(data_dir, 'wav')
for subfolder, _, filelist in sorted(os.walk(audio_dir)):
for ftar in filelist:
unpack(os.path.join(subfolder, ftar), subfolder, True)
else:
print("Skip downloading and unpacking. Data already exists in %s." %
os.path.abspath(target_dir))
return os.path.abspath(data_dir)
def check_dataset(data_dir):
print(f"check dataset {os.path.abspath(data_dir)} ...")
transcript_path = os.path.join(data_dir, 'transcript',
'aishell_transcript_v0.8.txt')
if not os.path.exists(transcript_path):
raise FileNotFoundError(f"no transcript file found in {data_dir}.")
transcript_dict = {}
for line in codecs.open(transcript_path, 'r', 'utf-8'):
line = line.strip()
if line == '':
continue
audio_id, text = line.split(' ', 1)
# remove whitespace; character-level text
text = ''.join(text.split())
transcript_dict[audio_id] = text
no_label = 0
data_types = ['train', 'dev', 'test']
for dtype in data_types:
audio_dir = os.path.join(data_dir, 'wav', dtype)
if not os.path.exists(audio_dir):
raise IOError(f"{audio_dir} does not exist.")
for subfolder, _, filelist in sorted(os.walk(audio_dir)):
for fname in filelist:
audio_path = os.path.abspath(os.path.join(subfolder, fname))
audio_id = os.path.basename(fname)[:-4]
# skip audio that has no transcription
if audio_id not in transcript_dict:
print(f"Warning: {audio_id} not has transcript.")
no_label += 1
continue
utt2spk = Path(audio_path).parent.name
audio_data, samplerate = soundfile.read(audio_path)
assert samplerate == 16000, f"{audio_path} sample rate is {samplerate} not 16k, please check."
print(f"Warning: {dtype} has {no_label} audio does not has transcript.")
def prepare_dataset(url, md5sum, target_dir, manifest_path=None, check=False):
"""Download, unpack and create manifest file."""
data_dir = download_dataset(url, md5sum, target_dir)
if check:
try:
check_dataset(data_dir)
except Exception as e:
raise ValueError(
f"{data_dir} dataset format is not right, please check it.") from e
meta = None
if manifest_path:
meta = create_manifest(data_dir, manifest_path)
return data_dir, meta
def main():
print_arguments(args, globals())
if args.target_dir.startswith('~'):
args.target_dir = os.path.expanduser(args.target_dir)
data_dir, meta = prepare_dataset(
url=DATA_URL,
md5sum=MD5_DATA,
target_dir=args.target_dir,
manifest_path=args.manifest_prefix,
check=True)
resource_dir, _ = prepare_dataset(
url=RESOURCE_URL,
md5sum=MD5_RESOURCE,
target_dir=args.target_dir,
manifest_path=None)
print("Data download and manifest prepare done!")
if __name__ == '__main__':
main()
......@@ -19,91 +19,16 @@ import zipfile
from typing import Text
__all__ = [
"check_md5sum", "getfile_insensitive", "download_multi", "download",
"unpack", "unzip", "md5file", "print_arguments", "add_arguments",
"get_commandline_args"
"check_md5sum",
"getfile_insensitive",
"download_multi",
"download",
"unpack",
"unzip",
"md5file",
]
def get_commandline_args():
extra_chars = [
" ",
";",
"&",
"(",
")",
"|",
"^",
"<",
">",
"?",
"*",
"[",
"]",
"$",
"`",
'"',
"\\",
"!",
"{",
"}",
]
# Escape the extra characters for shell
argv = [
arg.replace("'", "'\\''") if all(char not in arg
for char in extra_chars) else
"'" + arg.replace("'", "'\\''") + "'" for arg in sys.argv
]
return sys.executable + " " + " ".join(argv)
def print_arguments(args, info=None):
"""Print argparse's arguments.
Usage:
.. code-block:: python
parser = argparse.ArgumentParser()
parser.add_argument("name", default="Jonh", type=str, help="User name.")
args = parser.parse_args()
print_arguments(args)
:param args: Input argparse.Namespace for printing.
:type args: argparse.Namespace
"""
filename = ""
if info:
filename = info["__file__"]
filename = os.path.basename(filename)
print(f"----------- {filename} Configuration Arguments -----------")
for arg, value in sorted(vars(args).items()):
print("%s: %s" % (arg, value))
print("-----------------------------------------------------------")
def add_arguments(argname, type, default, help, argparser, **kwargs):
"""Add argparse's argument.
Usage:
.. code-block:: python
parser = argparse.ArgumentParser()
add_argument("name", str, "Jonh", "User name.", parser)
args = parser.parse_args()
"""
type = distutils.util.strtobool if type == bool else type
argparser.add_argument(
"--" + argname,
default=default,
type=type,
help=help + ' Default: %(default)s.',
**kwargs)
def md5file(fname):
hash_md5 = hashlib.md5()
f = open(fname, "rb")
......
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# s2t utils binaries.
from .avg_model import main as avg_ckpts_main
from .build_vocab import main as build_vocab_main
from .compute_mean_std import main as compute_mean_std_main
from .compute_wer import main as compute_wer_main
from .format_data import main as format_data_main
from .format_rsl import main as format_rsl_main
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import glob
import json
import os
import numpy as np
import paddle
def define_argparse():
parser = argparse.ArgumentParser(description='average model')
parser.add_argument('--dst_model', required=True, help='averaged model')
parser.add_argument(
'--ckpt_dir', required=True, help='ckpt model dir for average')
parser.add_argument(
'--val_best', action="store_true", help='average the checkpoints with the best validation loss')
parser.add_argument(
'--num', default=5, type=int, help='number of checkpoints to average')
parser.add_argument(
'--min_epoch',
default=0,
type=int,
help='min epoch used for averaging model')
parser.add_argument(
'--max_epoch',
default=65536, # Big enough
type=int,
help='max epoch used for averaging model')
args = parser.parse_args()
return args
def average_checkpoints(dst_model="",
ckpt_dir="",
val_best=True,
num=5,
min_epoch=0,
max_epoch=65536):
paddle.set_device('cpu')
val_scores = []
jsons = glob.glob(f'{ckpt_dir}/[!train]*.json')
jsons = sorted(jsons, key=os.path.getmtime, reverse=True)
for y in jsons:
with open(y, 'r') as f:
dic_json = json.load(f)
loss = dic_json['val_loss']
epoch = dic_json['epoch']
if epoch >= min_epoch and epoch <= max_epoch:
val_scores.append((epoch, loss))
assert val_scores, f"No valid checkpoints found in {ckpt_dir}"
val_scores = np.array(val_scores)
if val_best:
sort_idx = np.argsort(val_scores[:, 1])
sorted_val_scores = val_scores[sort_idx]
else:
sorted_val_scores = val_scores
best_val_scores = sorted_val_scores[:num, 1]
selected_epochs = sorted_val_scores[:num, 0].astype(np.int64)
avg_val_score = np.mean(best_val_scores)
print("selected val scores = " + str(best_val_scores))
print("selected epochs = " + str(selected_epochs))
print("averaged val score = " + str(avg_val_score))
path_list = [
ckpt_dir + '/{}.pdparams'.format(int(epoch))
for epoch in sorted_val_scores[:num, 0]
]
print(path_list)
avg = None
# `num` is a parameter of this function; do not re-read it from a global `args`
assert num == len(path_list)
for path in path_list:
print(f'Processing {path}')
states = paddle.load(path)
if avg is None:
avg = states
else:
for k in avg.keys():
avg[k] += states[k]
# average
for k in avg.keys():
if avg[k] is not None:
avg[k] /= num
paddle.save(avg, dst_model)
print(f'Saving to {dst_model}')
meta_path = os.path.splitext(dst_model)[0] + '.avg.json'
with open(meta_path, 'w') as f:
data = json.dumps({
"mode": 'val_best' if args.val_best else 'latest',
"avg_ckpt": args.dst_model,
"val_loss_mean": avg_val_score,
"ckpts": path_list,
"epochs": selected_epochs.tolist(),
"val_losses": beat_val_scores.tolist(),
})
f.write(data + "\n")
def main():
args = define_argparse()
average_checkpoints(**vars(args))
if __name__ == '__main__':
main()
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Build vocabulary from manifest files.
Each item in vocabulary file is a character.
"""
import argparse
import functools
import os
import tempfile
from collections import Counter
import jsonlines
from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
from paddlespeech.s2t.frontend.utility import BLANK
from paddlespeech.s2t.frontend.utility import SOS
from paddlespeech.s2t.frontend.utility import SPACE
from paddlespeech.s2t.frontend.utility import UNK
from paddlespeech.utils.argparse import add_arguments
from paddlespeech.utils.argparse import print_arguments
def count_manifest(counter, text_feature, manifest_path):
manifest_jsons = []
with jsonlines.open(manifest_path, 'r') as reader:
for json_data in reader:
manifest_jsons.append(json_data)
for line_json in manifest_jsons:
if isinstance(line_json['text'], str):
tokens = text_feature.tokenize(
line_json['text'], replace_space=False)
counter.update(tokens)
else:
assert isinstance(line_json['text'], list)
for text in line_json['text']:
tokens = text_feature.tokenize(text, replace_space=False)
counter.update(tokens)
def dump_text_manifest(fileobj, manifest_path, key='text'):
manifest_jsons = []
with jsonlines.open(manifest_path, 'r') as reader:
for json_data in reader:
manifest_jsons.append(json_data)
for line_json in manifest_jsons:
if isinstance(line_json[key], str):
fileobj.write(line_json[key] + "\n")
else:
assert isinstance(line_json[key], list)
for line in line_json[key]:
fileobj.write(line + "\n")
def build_vocab(manifest_paths="",
vocab_path="examples/librispeech/data/vocab.txt",
unit_type="char",
count_threshold=0,
text_keys='text',
spm_mode="unigram",
spm_vocab_size=0,
spm_model_prefix="",
spm_character_coverage=0.9995):
fout = open(vocab_path, 'w', encoding='utf-8')
fout.write(BLANK + "\n") # 0 will be used for "blank" in CTC
fout.write(UNK + '\n') # <unk> must be 1
if unit_type == 'spm':
# tools/spm_train --input=$wave_data/lang_char/input.txt
# --vocab_size=${nbpe} --model_type=${bpemode}
# --model_prefix=${bpemodel} --input_sentence_size=100000000
import sentencepiece as spm
fp = tempfile.NamedTemporaryFile(mode='w', delete=False)
for manifest_path in manifest_paths:
_text_keys = [text_keys] if type(
text_keys) is not list else text_keys
for text_key in _text_keys:
dump_text_manifest(fp, manifest_path, key=text_key)
fp.close()
# train
spm.SentencePieceTrainer.Train(
input=fp.name,
vocab_size=spm_vocab_size,
model_type=spm_mode,
model_prefix=spm_model_prefix,
input_sentence_size=100000000,
character_coverage=spm_character_coverage)
os.unlink(fp.name)
# encode
text_feature = TextFeaturizer(unit_type, "", spm_model_prefix)
counter = Counter()
for manifest_path in manifest_paths:
count_manifest(counter, text_feature, manifest_path)
count_sorted = sorted(counter.items(), key=lambda x: x[1], reverse=True)
tokens = []
for token, count in count_sorted:
if count < count_threshold:
break
# replace space by `<space>`
token = SPACE if token == ' ' else token
tokens.append(token)
tokens = sorted(tokens)
for token in tokens:
fout.write(token + '\n')
fout.write(SOS + "\n") # <sos/eos>
fout.close()
def define_argparse():
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
add_arg('unit_type', str, "char", "Unit type, e.g. char, word, spm")
add_arg('count_threshold', int, 0,
"Truncation threshold for char/word counts.Default 0, no truncate.")
add_arg('vocab_path', str,
'examples/librispeech/data/vocab.txt',
"Filepath to write the vocabulary.")
add_arg('manifest_paths', str,
None,
"Filepaths of manifests for building vocabulary. "
"You can provide multiple manifest files.",
nargs='+',
required=True)
add_arg('text_keys', str,
'text',
"keys of the text in manifest for building vocabulary. "
"You can provide multiple k.",
nargs='+')
# bpe
add_arg('spm_vocab_size', int, 0, "Vocab size for spm.")
add_arg('spm_mode', str, 'unigram', "spm model type, e.g. unigram, bpe, char, word; only needed when `unit_type` is spm")
add_arg('spm_model_prefix', str, "", "spm model prefix, e.g. spm_model_%(spm_mode)_%(count_threshold); only needed when `unit_type` is spm")
add_arg('spm_character_coverage', float, 0.9995, "character coverage to determine the minimum symbols")
# yapf: disable
args = parser.parse_args()
return args
def main():
args = define_argparse()
print_arguments(args, globals())
build_vocab(**vars(args))
if __name__ == '__main__':
main()
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Compute mean and std for feature normalizer, and save to file."""
import argparse
import functools
from paddlespeech.s2t.frontend.augmentor.augmentation import AugmentationPipeline
from paddlespeech.s2t.frontend.featurizer.audio_featurizer import AudioFeaturizer
from paddlespeech.s2t.frontend.normalizer import FeatureNormalizer
from paddlespeech.utils.argparse import add_arguments
from paddlespeech.utils.argparse import print_arguments
def compute_cmvn(manifest_path="data/librispeech/manifest.train",
output_path="data/librispeech/mean_std.npz",
num_samples=2000,
num_workers=0,
spectrum_type="linear",
feat_dim=13,
delta_delta=False,
stride_ms=10,
window_ms=20,
sample_rate=16000,
use_dB_normalization=True,
target_dB=-20):
augmentation_pipeline = AugmentationPipeline('{}')
audio_featurizer = AudioFeaturizer(
spectrum_type=spectrum_type,
feat_dim=feat_dim,
delta_delta=delta_delta,
stride_ms=float(stride_ms),
window_ms=float(window_ms),
n_fft=None,
max_freq=None,
target_sample_rate=sample_rate,
use_dB_normalization=use_dB_normalization,
target_dB=target_dB,
dither=0.0)
def augment_and_featurize(audio_segment):
augmentation_pipeline.transform_audio(audio_segment)
return audio_featurizer.featurize(audio_segment)
normalizer = FeatureNormalizer(
mean_std_filepath=None,
manifest_path=manifest_path,
featurize_func=augment_and_featurize,
num_samples=num_samples,
num_workers=num_workers)
normalizer.write_to_file(output_path)
def define_argparse():
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
add_arg('manifest_path', str,
'data/librispeech/manifest.train',
"Filepath of manifest to compute normalizer's mean and stddev.")
add_arg('output_path', str,
'data/librispeech/mean_std.npz',
"Filepath of write mean and stddev to (.npz).")
add_arg('num_samples', int, 2000, "# of samples used for statistics.")
add_arg('num_workers',
default=0,
type=int,
help='num of subprocess workers for processing')
add_arg('spectrum_type', str,
'linear',
"Audio feature type. Options: linear, mfcc, fbank.",
choices=['linear', 'mfcc', 'fbank'])
add_arg('feat_dim', int, 13, "Audio feature dim.")
add_arg('delta_delta', bool, False, "Audio feature with delta delta.")
add_arg('stride_ms', int, 10, "stride length in ms.")
add_arg('window_ms', int, 20, "window length in ms.")
add_arg('sample_rate', int, 16000, "target sample rate.")
add_arg('use_dB_normalization', bool, True, "do dB normalization.")
add_arg('target_dB', int, -20, "target dB.")
# yapf: disable
args = parser.parse_args()
return args
def main():
args = define_argparse()
print_arguments(args, globals())
compute_cmvn(**vars(args))
if __name__ == '__main__':
main()
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""format manifest with more metadata."""
import argparse
import functools
import json
import jsonlines
from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
from paddlespeech.s2t.frontend.utility import load_cmvn
from paddlespeech.s2t.io.utility import feat_type
from paddlespeech.utils.argparse import add_arguments
from paddlespeech.utils.argparse import print_arguments
def define_argparse():
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
add_arg('manifest_paths', str,
None,
"Filepaths of manifests for building vocabulary. "
"You can provide multiple manifest files.",
nargs='+',
required=True)
add_arg('output_path', str, None, "filepath of the formatted manifest.", required=True)
add_arg('cmvn_path', str,
'examples/librispeech/data/mean_std.json',
"Filepath of cmvn.")
add_arg('unit_type', str, "char", "Unit type, e.g. char, word, spm")
add_arg('vocab_path', str,
'examples/librispeech/data/vocab.txt',
"Filepath of the vocabulary.")
# bpe
add_arg('spm_model_prefix', str, None,
"spm model prefix, spm_model_%(bpe_mode)_%(count_threshold), only need when `unit_type` is spm")
# yapf: disable
args = parser.parse_args()
return args
def format_data(
manifest_paths="",
output_path="",
cmvn_path="examples/librispeech/data/mean_std.json",
unit_type="char",
vocab_path="examples/librispeech/data/vocab.txt",
spm_model_prefix=""):
fout = open(output_path, 'w', encoding='utf-8')
# get feat dim
filetype = cmvn_path.split(".")[-1]
mean, istd = load_cmvn(cmvn_path, filetype=filetype)
feat_dim = mean.shape[0] #(D)
print(f"Feature dim: {feat_dim}")
text_feature = TextFeaturizer(unit_type, vocab_path, spm_model_prefix)
vocab_size = text_feature.vocab_size
print(f"Vocab size: {vocab_size}")
# each jsonlines record looks like this
# {
# "input": [{"name": "input1", "shape": (100, 83), "feat": "xxx.ark:123"}],
# "output": [{"name":"target1", "shape": (40, 5002), "text": "a b c de"}],
# "utt2spk": "111-2222",
# "utt": "111-2222-333"
# }
count = 0
for manifest_path in manifest_paths:
with jsonlines.open(str(manifest_path), 'r') as reader:
manifest_jsons = list(reader)
for line_json in manifest_jsons:
output_json = {
"input": [],
"output": [],
'utt': line_json['utt'],
'utt2spk': line_json.get('utt2spk', 'global'),
}
# output
line = line_json['text']
if isinstance(line, str):
# only one target
tokens = text_feature.tokenize(line)
tokenids = text_feature.featurize(line)
output_json['output'].append({
'name': 'target1',
'shape': (len(tokenids), vocab_size),
'text': line,
'token': ' '.join(tokens),
'tokenid': ' '.join(map(str, tokenids)),
})
else:
# isinstance(line, list), multi target in one vocab
for i, item in enumerate(line, 1):
tokens = text_feature.tokenize(item)
tokenids = text_feature.featurize(item)
output_json['output'].append({
'name': f'target{i}',
'shape': (len(tokenids), vocab_size),
'text': item,
'token': ' '.join(tokens),
'tokenid': ' '.join(map(str, tokenids)),
})
# input
line = line_json['feat']
if isinstance(line, str):
# only one input
feat_shape = line_json['feat_shape']
assert isinstance(feat_shape, (list, tuple)), type(feat_shape)
filetype = feat_type(line)
if filetype == 'sound':
feat_shape.append(feat_dim)
else: # kaldi
raise NotImplementedError('kaldi feat is not supported yet!')
output_json['input'].append({
"name": "input1",
"shape": feat_shape,
"feat": line,
"filetype": filetype,
})
else:
# isinstance(line, list), multi input
raise NotImplementedError("not support multi input now!")
fout.write(json.dumps(output_json) + '\n')
count += 1
print(f"{manifest_paths} Examples number: {count}")
fout.close()
def main():
args = define_argparse()
print_arguments(args, globals())
format_data(**vars(args))
if __name__ == '__main__':
main()
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Format ref/hyp files into the `utt text` format to compute CER/WER/MER.
norm:
BAC009S0764W0196 明确了发展目标和重点任务
BAC009S0764W0186 实现我国房地产市场的平稳运行
sclite:
加大对结构机械化环境和收集谈控机制力度(BAC009S0906W0240.wav)
河南省新乡市丰秋县刘光镇政府东五零左右(BAC009S0770W0441.wav)
"""
import argparse
import jsonlines
from paddlespeech.utils.argparse import print_arguments
def transform_hyp(origin, trans, trans_sclite):
"""
Args:
origin: The input json file which contains the model output
trans: The output file for calculating CER/WER
trans_sclite: The output file for calculating CER/WER using sclite
"""
input_dict = {}
with open(origin, "r+", encoding="utf8") as f:
for item in jsonlines.Reader(f):
input_dict[item["utt"]] = item["hyps"][0]
if trans:
with open(trans, "w+", encoding="utf8") as f:
for key in input_dict.keys():
f.write(key + " " + input_dict[key] + "\n")
print(f"transform_hyp output: {trans}")
if trans_sclite:
with open(trans_sclite, "w+") as f:
for key in input_dict.keys():
line = input_dict[key] + "(" + key + ".wav" + ")" + "\n"
f.write(line)
print(f"transform_hyp output: {trans_sclite}")
def transform_ref(origin, trans, trans_sclite):
"""
Args:
origin: The input json file which contains the reference text
trans: The output file for calculating CER/WER
trans_sclite: The output file for calculating CER/WER using sclite
"""
input_dict = {}
with open(origin, "r", encoding="utf8") as f:
for item in jsonlines.Reader(f):
input_dict[item["utt"]] = item["text"]
if trans:
with open(trans, "w", encoding="utf8") as f:
for key in input_dict.keys():
f.write(key + " " + input_dict[key] + "\n")
print(f"transform_hyp output: {trans}")
if trans_sclite:
with open(trans_sclite, "w") as f:
for key in input_dict.keys():
line = input_dict[key] + "(" + key + ".wav" + ")" + "\n"
f.write(line)
print(f"transform_hyp output: {trans_sclite}")
def define_argparse():
parser = argparse.ArgumentParser(
prog='format ref/hyp file to compute CER/WER', add_help=True)
parser.add_argument(
'--origin_hyp', type=str, default="", help='origin hyp file')
parser.add_argument(
'--trans_hyp',
type=str,
default="",
help='hyp file for calculating CER/WER')
parser.add_argument(
'--trans_hyp_sclite',
type=str,
default="",
help='hyp file for calculating CER/WER by sclite')
parser.add_argument(
'--origin_ref', type=str, default="", help='origin ref file')
parser.add_argument(
'--trans_ref',
type=str,
default="",
help='ref file for calculating CER/WER')
parser.add_argument(
'--trans_ref_sclite',
type=str,
default="",
help='ref file for calculating CER/WER by sclite')
parser_args = parser.parse_args()
return parser_args
def format_result(origin_hyp="",
trans_hyp="",
trans_hyp_sclite="",
origin_ref="",
trans_ref="",
trans_ref_sclite=""):
if origin_hyp:
transform_hyp(
origin=origin_hyp, trans=trans_hyp, trans_sclite=trans_hyp_sclite)
if origin_ref:
transform_ref(
origin=origin_ref, trans=trans_ref, trans_sclite=trans_ref_sclite)
def main():
args = define_argparse()
print_arguments(args, globals())
format_result(**vars(args))
if __name__ == "__main__":
main()
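A minimal usage sketch of `format_result` defined above; the file names are hypothetical:
# Hypothetical paths. origin_hyp is a jsonlines decode result with entries like
#   {"utt": "BAC009S0764W0196", "hyps": ["..."]}
format_result(
    origin_hyp="exp/decode.rsl",
    trans_hyp="exp/decode.hyp",             # `utt text` format
    trans_hyp_sclite="exp/decode.hyp.trn",  # `text(utt.wav)` format for sclite
    origin_ref="data/manifest.test",
    trans_ref="exp/decode.ref",
    trans_ref_sclite="exp/decode.ref.trn")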
......@@ -267,7 +267,7 @@ def to(x: paddle.Tensor, *args, **kwargs) -> paddle.Tensor:
if not hasattr(paddle.Tensor, 'to'):
logger.debug("register user to to paddle.Tensor, remove this when fixed!")
logger.debug("register user to paddle.Tensor, remove this when fixed!")
setattr(paddle.Tensor, 'to', to)
setattr(paddle.static.Variable, 'to', to)
......
......@@ -28,8 +28,8 @@ from paddlespeech.s2t.training.cli import default_argument_parser
from paddlespeech.s2t.utils.socket_server import AsrRequestHandler
from paddlespeech.s2t.utils.socket_server import AsrTCPServer
from paddlespeech.s2t.utils.socket_server import warm_up_test
from paddlespeech.s2t.utils.utility import add_arguments
from paddlespeech.s2t.utils.utility import print_arguments
from paddlespeech.utils.argparse import add_arguments
from paddlespeech.utils.argparse import print_arguments
def init_predictor(args):
......
......@@ -26,8 +26,8 @@ from paddlespeech.s2t.training.cli import default_argument_parser
from paddlespeech.s2t.utils.socket_server import AsrRequestHandler
from paddlespeech.s2t.utils.socket_server import AsrTCPServer
from paddlespeech.s2t.utils.socket_server import warm_up_test
from paddlespeech.s2t.utils.utility import add_arguments
from paddlespeech.s2t.utils.utility import print_arguments
from paddlespeech.utils.argparse import add_arguments
from paddlespeech.utils.argparse import print_arguments
def start_server(config, args):
......
......@@ -16,7 +16,7 @@ from yacs.config import CfgNode
from paddlespeech.s2t.exps.deepspeech2.model import DeepSpeech2Tester as Tester
from paddlespeech.s2t.training.cli import default_argument_parser
from paddlespeech.s2t.utils.utility import print_arguments
from paddlespeech.utils.argparse import print_arguments
def main_sp(config, args):
......
......@@ -16,7 +16,7 @@ from yacs.config import CfgNode
from paddlespeech.s2t.exps.deepspeech2.model import DeepSpeech2Tester as Tester
from paddlespeech.s2t.training.cli import default_argument_parser
from paddlespeech.s2t.utils.utility import print_arguments
from paddlespeech.utils.argparse import print_arguments
def main_sp(config, args):
......
......@@ -16,7 +16,7 @@ from yacs.config import CfgNode
from paddlespeech.s2t.exps.deepspeech2.model import DeepSpeech2ExportTester as ExportTester
from paddlespeech.s2t.training.cli import default_argument_parser
from paddlespeech.s2t.utils.utility import print_arguments
from paddlespeech.utils.argparse import print_arguments
def main_sp(config, args):
......
......@@ -27,8 +27,8 @@ from paddlespeech.s2t.training.cli import default_argument_parser
from paddlespeech.s2t.utils import mp_tools
from paddlespeech.s2t.utils.checkpoint import Checkpoint
from paddlespeech.s2t.utils.log import Log
from paddlespeech.s2t.utils.utility import print_arguments
from paddlespeech.s2t.utils.utility import UpdateConfig
from paddlespeech.utils.argparse import print_arguments
logger = Log(__name__).getlog()
......
......@@ -16,7 +16,7 @@ from yacs.config import CfgNode
from paddlespeech.s2t.exps.deepspeech2.model import DeepSpeech2Trainer as Trainer
from paddlespeech.s2t.training.cli import default_argument_parser
from paddlespeech.s2t.utils.utility import print_arguments
from paddlespeech.utils.argparse import print_arguments
def main_sp(config, args):
......
......@@ -16,7 +16,7 @@ from yacs.config import CfgNode
from paddlespeech.s2t.exps.u2.model import U2Tester as Tester
from paddlespeech.s2t.training.cli import default_argument_parser
from paddlespeech.s2t.utils.utility import print_arguments
from paddlespeech.utils.argparse import print_arguments
def main_sp(config, args):
......
......@@ -16,7 +16,7 @@ from yacs.config import CfgNode
from paddlespeech.s2t.exps.u2.model import U2Tester as Tester
from paddlespeech.s2t.training.cli import default_argument_parser
from paddlespeech.s2t.utils.utility import print_arguments
from paddlespeech.utils.argparse import print_arguments
def main_sp(config, args):
......
......@@ -18,7 +18,7 @@ from yacs.config import CfgNode
from paddlespeech.s2t.exps.u2.model import U2Tester as Tester
from paddlespeech.s2t.training.cli import default_argument_parser
from paddlespeech.s2t.utils.utility import print_arguments
from paddlespeech.utils.argparse import print_arguments
def main_sp(config, args):
......
......@@ -19,7 +19,7 @@ from yacs.config import CfgNode
from paddlespeech.s2t.exps.u2.model import U2Trainer as Trainer
from paddlespeech.s2t.training.cli import default_argument_parser
from paddlespeech.s2t.utils.utility import print_arguments
from paddlespeech.utils.argparse import print_arguments
# from paddlespeech.s2t.exps.u2.trainer import U2Trainer as Trainer
......
......@@ -18,7 +18,7 @@ from yacs.config import CfgNode
from paddlespeech.s2t.training.cli import default_argument_parser
from paddlespeech.s2t.utils.dynamic_import import dynamic_import
from paddlespeech.s2t.utils.utility import print_arguments
from paddlespeech.utils.argparse import print_arguments
model_test_alias = {
"u2": "paddlespeech.s2t.exps.u2.model:U2Tester",
......
......@@ -19,7 +19,7 @@ from yacs.config import CfgNode
from paddlespeech.s2t.training.cli import default_argument_parser
from paddlespeech.s2t.utils.dynamic_import import dynamic_import
from paddlespeech.s2t.utils.utility import print_arguments
from paddlespeech.utils.argparse import print_arguments
model_train_alias = {
"u2": "paddlespeech.s2t.exps.u2.model:U2Trainer",
......
......@@ -16,7 +16,7 @@ from yacs.config import CfgNode
from paddlespeech.s2t.exps.u2_st.model import U2STTester as Tester
from paddlespeech.s2t.training.cli import default_argument_parser
from paddlespeech.s2t.utils.utility import print_arguments
from paddlespeech.utils.argparse import print_arguments
def main_sp(config, args):
......
......@@ -18,7 +18,7 @@ from yacs.config import CfgNode
from paddlespeech.s2t.exps.u2_st.model import U2STTester as Tester
from paddlespeech.s2t.training.cli import default_argument_parser
from paddlespeech.s2t.utils.utility import print_arguments
from paddlespeech.utils.argparse import print_arguments
def main_sp(config, args):
......
......@@ -19,7 +19,7 @@ from yacs.config import CfgNode
from paddlespeech.s2t.exps.u2_st.model import U2STTrainer as Trainer
from paddlespeech.s2t.training.cli import default_argument_parser
from paddlespeech.s2t.utils.utility import print_arguments
from paddlespeech.utils.argparse import print_arguments
def main_sp(config, args):
......
......@@ -18,7 +18,7 @@ from yacs.config import CfgNode
from paddlespeech.s2t.exps.wav2vec2.model import Wav2Vec2ASRTester as Tester
from paddlespeech.s2t.training.cli import default_argument_parser
from paddlespeech.s2t.utils.utility import print_arguments
from paddlespeech.utils.argparse import print_arguments
def main_sp(config, args):
......
......@@ -19,7 +19,7 @@ from yacs.config import CfgNode
from paddlespeech.s2t.exps.wav2vec2.model import Wav2Vec2ASRTrainer as Trainer
from paddlespeech.s2t.training.cli import default_argument_parser
from paddlespeech.s2t.utils.utility import print_arguments
from paddlespeech.utils.argparse import print_arguments
def main_sp(config, args):
......
......@@ -45,7 +45,7 @@ class AugmentationPipeline():
samples to make the model invariant to certain types of perturbations in the
real world, improving model's generalization ability.
The pipeline is built according the the augmentation configuration in json
The pipeline is built according to the augmentation configuration in json
string, e.g.
.. code-block::
......
......@@ -48,13 +48,16 @@ class TextFeaturizer():
self.unit_type = unit_type
self.unk = UNK
self.maskctc = maskctc
self.vocab_path_or_list = vocab
if vocab:
if self.vocab_path_or_list:
self.vocab_dict, self._id2token, self.vocab_list, self.unk_id, self.eos_id, self.blank_id = self._load_vocabulary_from_file(
vocab, maskctc)
self.vocab_size = len(self.vocab_list)
else:
logger.warning("TextFeaturizer: not have vocab file or vocab list.")
logger.warning(
"TextFeaturizer: not have vocab file or vocab list. Only Tokenizer can use, can not convert to token idx"
)
if unit_type == 'spm':
spm_model = spm_model_prefix + '.model'
......@@ -62,6 +65,7 @@ class TextFeaturizer():
self.sp.Load(spm_model)
def tokenize(self, text, replace_space=True):
"""tokenizer split text into text tokens"""
if self.unit_type == 'char':
tokens = self.char_tokenize(text, replace_space)
elif self.unit_type == 'word':
......@@ -71,6 +75,7 @@ class TextFeaturizer():
return tokens
def detokenize(self, tokens):
"""tokenizer convert text tokens back to text"""
if self.unit_type == 'char':
text = self.char_detokenize(tokens)
elif self.unit_type == 'word':
......@@ -88,6 +93,7 @@ class TextFeaturizer():
Returns:
List[int]: List of token indices.
"""
assert self.vocab_path_or_list, "toidx need vocab path or vocab list"
tokens = self.tokenize(text)
ids = []
for token in tokens:
......@@ -107,6 +113,7 @@ class TextFeaturizer():
Returns:
str: Text.
"""
assert self.vocab_path_or_list, "toidx need vocab path or vocab list"
tokens = []
for idx in idxs:
if idx == self.eos_id:
......@@ -127,10 +134,10 @@ class TextFeaturizer():
"""
text = text.strip()
if replace_space:
text_list = [SPACE if item == " " else item for item in list(text)]
tokens = [SPACE if item == " " else item for item in list(text)]
else:
text_list = list(text)
return text_list
tokens = list(text)
return tokens
def char_detokenize(self, tokens):
"""Character detokenizer.
......
......@@ -283,7 +283,7 @@ class DynamicBatchSampler(Sampler):
num_quantiles, )
# get quantiles using lognormal distribution
quantiles = lognorm.ppf(latent_boundaries, 1)
# scale up to to max_batch_length
# scale up to max_batch_length
bucket_boundaries = quantiles * max_batch_length / quantiles[-1]
# compute resulting bucket length multipliers
length_multipliers = [
......
......@@ -560,7 +560,7 @@ class U2BaseModel(ASRInterface, nn.Layer):
[len(hyp[0]) for hyp in hyps], place=device,
dtype=paddle.long) # (beam_size,)
hyps_pad, _ = add_sos_eos(hyps_pad, self.sos, self.eos, self.ignore_id)
hyps_lens = hyps_lens + 1 # Add <sos> at begining
hyps_lens = hyps_lens + 1 # Add <sos> at beginning
logger.debug(
f"hyps pad: {hyps_pad} {self.sos} {self.eos} {self.ignore_id}")
......@@ -709,7 +709,7 @@ class U2BaseModel(ASRInterface, nn.Layer):
hypothesis from ctc prefix beam search and one encoder output
Args:
hyps (paddle.Tensor): hyps from ctc prefix beam search, already
pad sos at the begining, (B, T)
pad sos at the beginning, (B, T)
hyps_lens (paddle.Tensor): length of each hyp in hyps, (B)
encoder_out (paddle.Tensor): corresponding encoder output, (B=1, T, D)
Returns:
......
......@@ -455,7 +455,7 @@ class U2STBaseModel(nn.Layer):
hypothesis from ctc prefix beam search and one encoder output
Args:
hyps (paddle.Tensor): hyps from ctc prefix beam search, already
pad sos at the begining, (B, T)
pad sos at the beginning, (B, T)
hyps_lens (paddle.Tensor): length of each hyp in hyps, (B)
encoder_out (paddle.Tensor): corresponding encoder output, (B=1, T, D)
Returns:
......
......@@ -29,10 +29,7 @@ from paddlespeech.s2t.utils.log import Log
logger = Log(__name__).getlog()
__all__ = [
"all_version", "UpdateConfig", "seed_all", 'print_arguments',
'add_arguments', "log_add"
]
__all__ = ["all_version", "UpdateConfig", "seed_all", "log_add"]
def all_version():
......@@ -60,51 +57,6 @@ def seed_all(seed: int=20210329):
paddle.seed(seed)
def print_arguments(args, info=None):
"""Print argparse's arguments.
Usage:
.. code-block:: python
parser = argparse.ArgumentParser()
parser.add_argument("name", default="Jonh", type=str, help="User name.")
args = parser.parse_args()
print_arguments(args)
:param args: Input argparse.Namespace for printing.
:type args: argparse.Namespace
"""
filename = ""
if info:
filename = info["__file__"]
filename = os.path.basename(filename)
print(f"----------- {filename} Arguments -----------")
for arg, value in sorted(vars(args).items()):
print("%s: %s" % (arg, value))
print("-----------------------------------------------------------")
def add_arguments(argname, type, default, help, argparser, **kwargs):
"""Add argparse's argument.
Usage:
.. code-block:: python
parser = argparse.ArgumentParser()
add_argument("name", str, "Jonh", "User name.", parser)
args = parser.parse_args()
"""
type = distutils.util.strtobool if type == bool else type
argparser.add_argument(
"--" + argname,
default=default,
type=type,
help=help + ' Default: %(default)s.',
**kwargs)
def log_add(args: List[int]) -> float:
"""Stable log add
......
......@@ -609,7 +609,7 @@ class PaddleASRConnectionHanddler:
dtype=paddle.long) # (beam_size,)
hyps_pad, _ = add_sos_eos(hyps_pad, self.model.sos, self.model.eos,
self.model.ignore_id)
hyps_lens = hyps_lens + 1 # Add <sos> at begining
hyps_lens = hyps_lens + 1 # Add <sos> at beginning
# ctc score in ln domain
# (beam_size, max_hyps_len, vocab_size)
......
......@@ -67,7 +67,7 @@ async def websocket_endpoint(websocket: WebSocket):
# and we break the loop
if message['signal'] == 'start':
resp = {"status": "ok", "signal": "server_ready"}
# do something at begining here
# do something at beginning here
# create the instance to process the audio
#connection_handler = PaddleASRConnectionHanddler(asr_model)
connection_handler = asr_model.new_handler()
......
......@@ -45,7 +45,7 @@ def rule(C, V, R, T):
'u' in syllables when certain conditions are satisfied.
'i' is distinguished when appeared in phonemes, and separated into 3 categories, 'i', 'ii' and 'iii'.
Erhua is is possibly applied to every finals, except for finals that already ends with 'r'.
Erhua is possibly applied to every finals, except for finals that already ends with 'r'.
When a syllable is impossible or does not have any characters with this pronunciation, return None
to filter it out.
"""
......
......@@ -236,7 +236,7 @@ class ResidualBlock(nn.Layer):
Returns:
res (Tensor):
A row of the the residual output. shape=(batch_size, channel, 1, width)
A row of the residual output. shape=(batch_size, channel, 1, width)
skip (Tensor):
A row of the skip output. shape=(batch_size, channel, 1, width)
......@@ -343,7 +343,7 @@ class ResidualNet(nn.LayerList):
Returns:
res (Tensor):
A row of the the residual output. shape=(batch_size, channel, 1, width)
A row of the residual output. shape=(batch_size, channel, 1, width)
skip (Tensor):
A row of the skip output. shape=(batch_size, channel, 1, width)
......@@ -465,7 +465,7 @@ class Flow(nn.Layer):
self.resnet.start_sequence()
def inverse(self, z, condition):
"""Sampling from the the distrition p(X). It is done by sample form
"""Sampling from the distrition p(X). It is done by sample form
p(Z) and transform the sample. It is a auto regressive transformation.
Args:
......@@ -600,7 +600,7 @@ class WaveFlow(nn.LayerList):
return z, log_det_jacobian
def inverse(self, z, condition):
"""Sampling from the the distrition p(X).
"""Sampling from the distrition p(X).
It is done by sample a ``z`` form p(Z) and transform it into ``x``.
Each Flow transform .. math:: `z_{i-1}` to .. math:: `z_{i}` in an
......
......@@ -110,7 +110,7 @@ class LightweightConvolution(nn.Layer):
(batch, time1, time2) mask
Return:
Tensor: ouput. (batch, time1, d_model)
Tensor: output. (batch, time1, d_model)
"""
# linear -> GLU -> lightconv -> linear
......
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import hashlib
import os
import sys
from typing import Text
import distutils.util
__all__ = ["print_arguments", "add_arguments", "get_commandline_args"]
def get_commandline_args():
extra_chars = [
" ",
";",
"&",
"(",
")",
"|",
"^",
"<",
">",
"?",
"*",
"[",
"]",
"$",
"`",
'"',
"\\",
"!",
"{",
"}",
]
# Escape the extra characters for shell
argv = [
arg.replace("'", "'\\''") if all(char not in arg
for char in extra_chars) else
"'" + arg.replace("'", "'\\''") + "'" for arg in sys.argv
]
return sys.executable + " " + " ".join(argv)
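For illustration, an argument containing a shell metacharacter comes back single-quoted; the argv below is a made-up example:
import sys

sys.argv = ["train.py", "--tag", "exp 1"]  # hypothetical argv
print(get_commandline_args())
# prints something like: /usr/bin/python3 train.py --tag 'exp 1'
# (the space in "exp 1" triggers the quoting branch above)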
def print_arguments(args, info=None):
"""Print argparse's arguments.
Usage:
.. code-block:: python
parser = argparse.ArgumentParser()
parser.add_argument("name", default="Jonh", type=str, help="User name.")
args = parser.parse_args()
print_arguments(args)
:param args: Input argparse.Namespace for printing.
:type args: argparse.Namespace
"""
filename = ""
if info:
filename = info["__file__"]
filename = os.path.basename(filename)
print(f"----------- {filename} Configuration Arguments -----------")
for arg, value in sorted(vars(args).items()):
print("%s: %s" % (arg, value))
print("-----------------------------------------------------------")
def add_arguments(argname, type, default, help, argparser, **kwargs):
"""Add argparse's argument.
Usage:
.. code-block:: python
parser = argparse.ArgumentParser()
add_argument("name", str, "Jonh", "User name.", parser)
args = parser.parse_args()
"""
type = distutils.util.strtobool if type == bool else type
argparser.add_argument(
"--" + argname,
default=default,
type=type,
help=help + ' Default: %(default)s.',
**kwargs)
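Callers in this changeset bind these helpers with `functools.partial`; a minimal sketch of that pattern:
import argparse
import functools

parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
add_arg('unit_type', str, 'char', "Unit type, e.g. char, word, spm")
args = parser.parse_args()
print_arguments(args, globals())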
......@@ -51,7 +51,7 @@ def main(args, config):
# stage0: set the training device, cpu or gpu
paddle.set_device(args.device)
# stage1: we must call the paddle.distributed.init_parallel_env() api at the begining
# stage1: we must call the paddle.distributed.init_parallel_env() api at the beginning
paddle.distributed.init_parallel_env()
nranks = paddle.distributed.get_world_size()
rank = paddle.distributed.get_rank()
......@@ -146,7 +146,7 @@ def main(args, config):
timer.start()
for epoch in range(start_epoch + 1, config.epochs + 1):
# at the begining, model must set to train mode
# at the beginning, model must set to train mode
model.train()
avg_loss = 0
......
......@@ -42,7 +42,7 @@ if __name__ == "__main__":
parser.add_argument(
"--skip_existing",
action="store_true",
help="Whether to skip ouput files with the same name. Useful if this script was interrupted."
help="Whether to skip output files with the same name. Useful if this script was interrupted."
)
parser.add_argument(
"--no_trim",
......
......@@ -2078,7 +2078,7 @@ class SymbolicShapeInference:
output_tensor_ranks = get_attribute(node, 'output_tensor_ranks')
assert output_tensor_ranks
# set the context output seperately.
# set the context output separately.
# The first output is autograd's context.
vi = self.known_vi_[node.output[0]]
vi.CopyFrom(
......
......@@ -76,7 +76,7 @@ bool DecibelNormalizer::Compute(VectorBase<BaseFloat>* waves) const {
if (gain > opts_.max_gain_db) {
LOG(ERROR)
<< "Unable to normalize segment to " << opts_.target_db << "dB,"
<< "because the the probable gain have exceeds opts_.max_gain_db"
<< "because the probable gain has exceeded opts_.max_gain_db"
<< opts_.max_gain_db << "dB.";
return false;
}
......
......@@ -40,7 +40,7 @@ typedef float BaseFloat;
#include <stdint.h>
// for discussion on what to do if you need compile kaldi
// without OpenFST, see the bottom of this this file
// without OpenFST, see the bottom of this file
#ifndef COMPILE_WITHOUT_OPENFST
......
......@@ -746,7 +746,7 @@ OnlinePitchFeatureImpl::OnlinePitchFeatureImpl(
Vector<BaseFloat> lags_offset(lags_);
// lags_offset equals lags_ (which are the log-spaced lag values we want to
// measure the NCCF at) with nccf_first_lag_ / opts.resample_freq subtracted
// from each element, so we can treat the measured NCCF values as as starting
// from each element, so we can treat the measured NCCF values as starting
// from sample zero in a signal that starts at the point start /
// opts.resample_freq. This is necessary because the ArbitraryResample code
// assumes that the input signal starts from sample zero.
......
......@@ -355,12 +355,12 @@ bool PruneLattice(BaseFloat beam, LatticeType *lat);
//
//
// /// This function returns the number of words in the longest sentence in a
// /// CompactLattice (i.e. the the maximum of any path, of the count of
// /// CompactLattice (i.e. the maximum of any path, of the count of
// /// olabels on that path).
// int32 LongestSentenceLength(const Lattice &lat);
//
// /// This function returns the number of words in the longest sentence in a
// /// CompactLattice, i.e. the the maximum of any path, of the count of
// /// CompactLattice, i.e. the maximum of any path, of the count of
// /// labels on that path... note, in CompactLattice, the ilabels and olabels
// /// are identical because it is an acceptor.
// int32 LongestSentenceLength(const CompactLattice &lat);
......@@ -408,7 +408,7 @@ bool PruneLattice(BaseFloat beam, LatticeType *lat);
//
// /// This function computes the mapping from the pair
// /// (frame-index, transition-id) to the pair
// /// (sum-of-acoustic-scores, num-of-occurences) over all occurences of the
// /// (sum-of-acoustic-scores, num-of-occurrences) over all occurrences of the
// /// transition-id in that frame.
// /// frame-index in the lattice.
// /// This function is useful for retaining the acoustic scores in a
......@@ -422,13 +422,13 @@ bool PruneLattice(BaseFloat beam, LatticeType *lat);
// /// @param [out] acoustic_scores
// /// Pointer to a map from the pair (frame-index,
// /// transition-id) to a pair (sum-of-acoustic-scores,
// /// num-of-occurences).
// /// num-of-occurrences).
// /// Usually the acoustic scores for a pdf-id (and hence
// /// transition-id) on a frame will be the same for all the
// /// occurences of the pdf-id in that frame.
// /// occurrences of the pdf-id in that frame.
// /// But if not, we will take the average of the acoustic
// /// scores. Hence, we store both the sum-of-acoustic-scores
// /// and the num-of-occurences of the transition-id in that
// /// and the num-of-occurrences of the transition-id in that
// /// frame.
// void ComputeAcousticScoresMap(
// const Lattice &lat,
......@@ -440,8 +440,8 @@ bool PruneLattice(BaseFloat beam, LatticeType *lat);
// ///
// /// @param [in] acoustic_scores
// /// A map from the pair (frame-index, transition-id) to a
// /// pair (sum-of-acoustic-scores, num-of-occurences) of
// /// the occurences of the transition-id in that frame.
// /// pair (sum-of-acoustic-scores, num-of-occurrences) of
// /// the occurrences of the transition-id in that frame.
// /// See the comments for ComputeAcousticScoresMap for
// /// details.
// /// @param [out] lat Pointer to the output lattice.
......
......@@ -1646,7 +1646,7 @@ SubMatrix<Real>::SubMatrix(const MatrixBase<Real> &M,
static_cast<UnsignedMatrixIndexT>(M.num_rows_ - ro) &&
static_cast<UnsignedMatrixIndexT>(c) <=
static_cast<UnsignedMatrixIndexT>(M.num_cols_ - co));
// point to the begining of window
// point to the beginning of window
MatrixBase<Real>::num_rows_ = r;
MatrixBase<Real>::num_cols_ = c;
MatrixBase<Real>::stride_ = M.Stride();
......
......@@ -998,7 +998,7 @@ void FilterCompressedMatrixRows(const CompressedMatrix &in,
// iterating row-wise versus column-wise in compressed-matrix uncompression.
if (num_kept_rows > heuristic * in.NumRows()) {
// if quite a few of the the rows are kept, it may be more efficient
// if quite a few of the rows are kept, it may be more efficient
// to uncompress the entire compressed matrix, since per-column operation
// is more efficient.
Matrix<BaseFloat> full_mat(in);
......
......@@ -1587,7 +1587,7 @@ template<class Holder> class RandomAccessTableReaderImplBase {
// this from a pipe. In principle we could read it on-demand as for the
// archives, but this would probably be overkill.
// Note: the code for this this class is similar to TableWriterScriptImpl:
// Note: the code for this class is similar to TableWriterScriptImpl:
// try to keep them in sync.
template<class Holder>
class RandomAccessTableReaderScriptImpl:
......
......@@ -105,7 +105,7 @@ paddle_infer::Predictor* PaddleNnet::GetPredictor() {
while (pred_id < pool_usages.size()) {
if (pool_usages[pred_id] == false) {
predictor = pool->Retrive(pred_id);
predictor = pool->Retrieve(pred_id);
break;
}
++pred_id;
......
......@@ -32,14 +32,14 @@ void ConnectionHandler::OnSpeechStart() {
decode_thread_ = std::make_shared<std::thread>(
&ConnectionHandler::DecodeThreadFunc, this);
got_start_tag_ = true;
LOG(INFO) << "Server: Recieved speech start signal, start reading speech";
LOG(INFO) << "Server: Received speech start signal, start reading speech";
json::value rv = {{"status", "ok"}, {"type", "server_ready"}};
ws_.text(true);
ws_.write(asio::buffer(json::serialize(rv)));
}
void ConnectionHandler::OnSpeechEnd() {
LOG(INFO) << "Server: Recieved speech end signal";
LOG(INFO) << "Server: Received speech end signal";
if (recognizer_ != nullptr) {
recognizer_->SetFinished();
}
......@@ -70,8 +70,8 @@ void ConnectionHandler::OnSpeechData(const beast::flat_buffer& buffer) {
pcm_data(i) = static_cast<float>(*pdata);
pdata++;
}
VLOG(2) << "Server: Recieved " << num_samples << " samples";
LOG(INFO) << "Server: Recieved " << num_samples << " samples";
VLOG(2) << "Server: Received " << num_samples << " samples";
LOG(INFO) << "Server: Received " << num_samples << " samples";
CHECK(recognizer_ != nullptr);
recognizer_->Accept(pcm_data);
......
......@@ -26,8 +26,8 @@ from pathlib import Path
import soundfile
from utils.utility import download
from utils.utility import unpack
from paddlespeech.dataset.download import download
from paddlespeech.dataset.download import unpack
DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')
......
......@@ -25,8 +25,8 @@ from pathlib import Path
import soundfile
from utils.utility import download
from utils.utility import unpack
from paddlespeech.dataset.download import download
from paddlespeech.dataset.download import unpack
DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')
......
......@@ -166,7 +166,7 @@ variable, sudo might not allow it to propagate to the command that it invokes."
fi
# The install variants, each in a function to simplify error reporting.
# Each one invokes a subshell with a 'set -x' to to show system-modifying
# Each one invokes a subshell with a 'set -x' to show system-modifying
# commands it runs. The subshells simply limit the scope of this diagnostics
# and avoid creating noise (if we were using 'set +x', it would be printed).
Install_redhat () {
......
......@@ -12,105 +12,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import glob
import json
import os
import numpy as np
import paddle
def main(args):
paddle.set_device('cpu')
val_scores = []
beat_val_scores = None
selected_epochs = None
jsons = glob.glob(f'{args.ckpt_dir}/[!train]*.json')
jsons = sorted(jsons, key=os.path.getmtime, reverse=True)
for y in jsons:
with open(y, 'r') as f:
dic_json = json.load(f)
loss = dic_json['val_loss']
epoch = dic_json['epoch']
if epoch >= args.min_epoch and epoch <= args.max_epoch:
val_scores.append((epoch, loss))
val_scores = np.array(val_scores)
if args.val_best:
sort_idx = np.argsort(val_scores[:, 1])
sorted_val_scores = val_scores[sort_idx]
else:
sorted_val_scores = val_scores
beat_val_scores = sorted_val_scores[:args.num, 1]
selected_epochs = sorted_val_scores[:args.num, 0].astype(np.int64)
avg_val_score = np.mean(beat_val_scores)
print("selected val scores = " + str(beat_val_scores))
print("selected epochs = " + str(selected_epochs))
print("averaged val score = " + str(avg_val_score))
path_list = [
args.ckpt_dir + '/{}.pdparams'.format(int(epoch))
for epoch in sorted_val_scores[:args.num, 0]
]
print(path_list)
avg = None
num = args.num
assert num == len(path_list)
for path in path_list:
print(f'Processing {path}')
states = paddle.load(path)
if avg is None:
avg = states
else:
for k in avg.keys():
avg[k] += states[k]
# average
for k in avg.keys():
if avg[k] is not None:
avg[k] /= num
paddle.save(avg, args.dst_model)
print(f'Saving to {args.dst_model}')
meta_path = os.path.splitext(args.dst_model)[0] + '.avg.json'
with open(meta_path, 'w') as f:
data = json.dumps({
"mode": 'val_best' if args.val_best else 'latest',
"avg_ckpt": args.dst_model,
"val_loss_mean": avg_val_score,
"ckpts": path_list,
"epochs": selected_epochs.tolist(),
"val_losses": beat_val_scores.tolist(),
})
f.write(data + "\n")
from paddlespeech.dataset.s2t import avg_ckpts_main
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='average model')
parser.add_argument('--dst_model', required=True, help='averaged model')
parser.add_argument(
'--ckpt_dir', required=True, help='ckpt model dir for average')
parser.add_argument(
'--val_best', action="store_true", help='averaged model')
parser.add_argument(
'--num', default=5, type=int, help='nums for averaged model')
parser.add_argument(
'--min_epoch',
default=0,
type=int,
help='min epoch used for averaging model')
parser.add_argument(
'--max_epoch',
default=65536, # Big enough
type=int,
help='max epoch used for averaging model')
args = parser.parse_args()
print(args)
main(args)
avg_ckpts_main()
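The removed body above is plain element-wise averaging of checkpoint parameters; a standalone sketch of that technique, with hypothetical checkpoint paths:
import paddle

paths = ["exp/ckpt/28.pdparams", "exp/ckpt/29.pdparams", "exp/ckpt/30.pdparams"]
avg = None
for path in paths:
    states = paddle.load(path)
    if avg is None:
        avg = states
    else:
        for k in avg.keys():
            avg[k] += states[k]
for k in avg.keys():
    avg[k] /= len(paths)  # arithmetic mean over checkpoints
paddle.save(avg, "exp/ckpt/avg_3.pdparams")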
......@@ -15,134 +15,7 @@
"""Build vocabulary from manifest files.
Each item in vocabulary file is a character.
"""
import argparse
import functools
import os
import tempfile
from collections import Counter
import jsonlines
from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
from paddlespeech.s2t.frontend.utility import BLANK
from paddlespeech.s2t.frontend.utility import SOS
from paddlespeech.s2t.frontend.utility import SPACE
from paddlespeech.s2t.frontend.utility import UNK
from paddlespeech.s2t.utils.utility import add_arguments
from paddlespeech.s2t.utils.utility import print_arguments
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
add_arg('unit_type', str, "char", "Unit type, e.g. char, word, spm")
add_arg('count_threshold', int, 0,
"Truncation threshold for char/word counts.Default 0, no truncate.")
add_arg('vocab_path', str,
'examples/librispeech/data/vocab.txt',
"Filepath to write the vocabulary.")
add_arg('manifest_paths', str,
None,
"Filepaths of manifests for building vocabulary. "
"You can provide multiple manifest files.",
nargs='+',
required=True)
add_arg('text_keys', str,
'text',
"keys of the text in manifest for building vocabulary. "
"You can provide multiple k.",
nargs='+')
# bpe
add_arg('spm_vocab_size', int, 0, "Vocab size for spm.")
add_arg('spm_mode', str, 'unigram', "spm model type, e.g. unigram, spm, char, word. only need when `unit_type` is spm")
add_arg('spm_model_prefix', str, "", "spm_model_%(spm_mode)_%(count_threshold), spm model prefix, only need when `unit_type` is spm")
add_arg('spm_character_coverage', float, 0.9995, "character coverage to determine the minimum symbols")
# yapf: disable
args = parser.parse_args()
def count_manifest(counter, text_feature, manifest_path):
manifest_jsons = []
with jsonlines.open(manifest_path, 'r') as reader:
for json_data in reader:
manifest_jsons.append(json_data)
for line_json in manifest_jsons:
if isinstance(line_json['text'], str):
line = text_feature.tokenize(line_json['text'], replace_space=False)
counter.update(line)
else:
assert isinstance(line_json['text'], list)
for text in line_json['text']:
line = text_feature.tokenize(text, replace_space=False)
counter.update(line)
def dump_text_manifest(fileobj, manifest_path, key='text'):
manifest_jsons = []
with jsonlines.open(manifest_path, 'r') as reader:
for json_data in reader:
manifest_jsons.append(json_data)
for line_json in manifest_jsons:
if isinstance(line_json[key], str):
fileobj.write(line_json[key] + "\n")
else:
assert isinstance(line_json[key], list)
for line in line_json[key]:
fileobj.write(line + "\n")
def main():
print_arguments(args, globals())
fout = open(args.vocab_path, 'w', encoding='utf-8')
fout.write(BLANK + "\n") # 0 will be used for "blank" in CTC
fout.write(UNK + '\n') # <unk> must be 1
if args.unit_type == 'spm':
# tools/spm_train --input=$wave_data/lang_char/input.txt
# --vocab_size=${nbpe} --model_type=${bpemode}
# --model_prefix=${bpemodel} --input_sentence_size=100000000
import sentencepiece as spm
fp = tempfile.NamedTemporaryFile(mode='w', delete=False)
for manifest_path in args.manifest_paths:
text_keys = [args.text_keys] if type(args.text_keys) is not list else args.text_keys
for text_key in text_keys:
dump_text_manifest(fp, manifest_path, key=text_key)
fp.close()
# train
spm.SentencePieceTrainer.Train(
input=fp.name,
vocab_size=args.spm_vocab_size,
model_type=args.spm_mode,
model_prefix=args.spm_model_prefix,
input_sentence_size=100000000,
character_coverage=args.spm_character_coverage)
os.unlink(fp.name)
# encode
text_feature = TextFeaturizer(args.unit_type, "", args.spm_model_prefix)
counter = Counter()
for manifest_path in args.manifest_paths:
count_manifest(counter, text_feature, manifest_path)
count_sorted = sorted(counter.items(), key=lambda x: x[1], reverse=True)
tokens = []
for token, count in count_sorted:
if count < args.count_threshold:
break
# replace space by `<space>`
token = SPACE if token == ' ' else token
tokens.append(token)
tokens = sorted(tokens)
for token in tokens:
fout.write(token + '\n')
fout.write(SOS + "\n") # <sos/eos>
fout.close()
from paddlespeech.dataset.s2t import build_vocab_main
if __name__ == '__main__':
main()
build_vocab_main()
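The relocated vocabulary builder boils down to token counting plus special symbols; a toy sketch of that core, where the literal symbol strings are assumptions standing in for the BLANK/UNK/SOS constants above:
from collections import Counter

counter = Counter()
for text in ["i t s", "i t"]:  # stands in for tokenized manifest text
    counter.update(text.split())
count_threshold = 0  # 0 means no truncation, as in the removed script
tokens = sorted(tok for tok, cnt in counter.items() if cnt >= count_threshold)
vocab = ["<blank>", "<unk>"] + tokens + ["<sos/eos>"]  # blank=0, unk=1, sos/eos last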
This diff is collapsed.
......@@ -13,75 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Compute mean and std for feature normalizer, and save to file."""
import argparse
import functools
from paddlespeech.s2t.frontend.augmentor.augmentation import AugmentationPipeline
from paddlespeech.s2t.frontend.featurizer.audio_featurizer import AudioFeaturizer
from paddlespeech.s2t.frontend.normalizer import FeatureNormalizer
from paddlespeech.s2t.utils.utility import add_arguments
from paddlespeech.s2t.utils.utility import print_arguments
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
add_arg('num_samples', int, 2000, "# of samples to for statistics.")
add_arg('spectrum_type', str,
'linear',
"Audio feature type. Options: linear, mfcc, fbank.",
choices=['linear', 'mfcc', 'fbank'])
add_arg('feat_dim', int, 13, "Audio feature dim.")
add_arg('delta_delta', bool, False, "Audio feature with delta delta.")
add_arg('stride_ms', int, 10, "stride length in ms.")
add_arg('window_ms', int, 20, "stride length in ms.")
add_arg('sample_rate', int, 16000, "target sample rate.")
add_arg('use_dB_normalization', bool, True, "do dB normalization.")
add_arg('target_dB', int, -20, "target dB.")
add_arg('manifest_path', str,
'data/librispeech/manifest.train',
"Filepath of manifest to compute normalizer's mean and stddev.")
add_arg('num_workers',
default=0,
type=int,
help='num of subprocess workers for processing')
add_arg('output_path', str,
'data/librispeech/mean_std.npz',
"Filepath of write mean and stddev to (.npz).")
# yapf: disable
args = parser.parse_args()
def main():
print_arguments(args, globals())
augmentation_pipeline = AugmentationPipeline('{}')
audio_featurizer = AudioFeaturizer(
spectrum_type=args.spectrum_type,
feat_dim=args.feat_dim,
delta_delta=args.delta_delta,
stride_ms=float(args.stride_ms),
window_ms=float(args.window_ms),
n_fft=None,
max_freq=None,
target_sample_rate=args.sample_rate,
use_dB_normalization=args.use_dB_normalization,
target_dB=args.target_dB,
dither=0.0)
def augment_and_featurize(audio_segment):
augmentation_pipeline.transform_audio(audio_segment)
return audio_featurizer.featurize(audio_segment)
normalizer = FeatureNormalizer(
mean_std_filepath=None,
manifest_path=args.manifest_path,
featurize_func=augment_and_featurize,
num_samples=args.num_samples,
num_workers=args.num_workers)
normalizer.write_to_file(args.output_path)
from paddlespeech.dataset.s2t import compute_mean_std_main
if __name__ == '__main__':
main()
compute_mean_std_main()
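What the relocated script computes is a per-dimension feature mean and standard deviation over sampled utterances; a NumPy-only toy sketch, with made-up shapes and values:
import numpy as np

feats = [np.random.randn(100, 13), np.random.randn(120, 13)]  # toy (T, D) features
stacked = np.concatenate(feats, axis=0)  # (sum of T, D)
mean = stacked.mean(axis=0)              # (D,)
std = stacked.std(axis=0)                # (D,)
np.savez("data/librispeech/mean_std.npz", mean=mean, std=std)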
......@@ -13,130 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""format manifest with more metadata."""
import argparse
import functools
import json
import jsonlines
from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
from paddlespeech.s2t.frontend.utility import load_cmvn
from paddlespeech.s2t.io.utility import feat_type
from paddlespeech.s2t.utils.utility import add_arguments
from paddlespeech.s2t.utils.utility import print_arguments
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
add_arg('cmvn_path', str,
'examples/librispeech/data/mean_std.json',
"Filepath of cmvn.")
add_arg('unit_type', str, "char", "Unit type, e.g. char, word, spm")
add_arg('vocab_path', str,
'examples/librispeech/data/vocab.txt',
"Filepath of the vocabulary.")
add_arg('manifest_paths', str,
None,
"Filepaths of manifests for building vocabulary. "
"You can provide multiple manifest files.",
nargs='+',
required=True)
# bpe
add_arg('spm_model_prefix', str, None,
"spm model prefix, spm_model_%(bpe_mode)_%(count_threshold), only need when `unit_type` is spm")
add_arg('output_path', str, None, "filepath of formated manifest.", required=True)
# yapf: disable
args = parser.parse_args()
def main():
print_arguments(args, globals())
fout = open(args.output_path, 'w', encoding='utf-8')
# get feat dim
filetype = args.cmvn_path.split(".")[-1]
mean, istd = load_cmvn(args.cmvn_path, filetype=filetype)
feat_dim = mean.shape[0] #(D)
print(f"Feature dim: {feat_dim}")
text_feature = TextFeaturizer(args.unit_type, args.vocab_path, args.spm_model_prefix)
vocab_size = text_feature.vocab_size
print(f"Vocab size: {vocab_size}")
# josnline like this
# {
# "input": [{"name": "input1", "shape": (100, 83), "feat": "xxx.ark:123"}],
# "output": [{"name":"target1", "shape": (40, 5002), "text": "a b c de"}],
# "utt2spk": "111-2222",
# "utt": "111-2222-333"
# }
count = 0
for manifest_path in args.manifest_paths:
with jsonlines.open(str(manifest_path), 'r') as reader:
manifest_jsons = list(reader)
for line_json in manifest_jsons:
output_json = {
"input": [],
"output": [],
'utt': line_json['utt'],
'utt2spk': line_json.get('utt2spk', 'global'),
}
# output
line = line_json['text']
if isinstance(line, str):
# only one target
tokens = text_feature.tokenize(line)
tokenids = text_feature.featurize(line)
output_json['output'].append({
'name': 'target1',
'shape': (len(tokenids), vocab_size),
'text': line,
'token': ' '.join(tokens),
'tokenid': ' '.join(map(str, tokenids)),
})
else:
# isinstance(line, list), multi target in one vocab
for i, item in enumerate(line, 1):
tokens = text_feature.tokenize(item)
tokenids = text_feature.featurize(item)
output_json['output'].append({
'name': f'target{i}',
'shape': (len(tokenids), vocab_size),
'text': item,
'token': ' '.join(tokens),
'tokenid': ' '.join(map(str, tokenids)),
})
# input
line = line_json['feat']
if isinstance(line, str):
# only one input
feat_shape = line_json['feat_shape']
assert isinstance(feat_shape, (list, tuple)), type(feat_shape)
filetype = feat_type(line)
if filetype == 'sound':
feat_shape.append(feat_dim)
else: # kaldi
raise NotImplementedError('no support kaldi feat now!')
output_json['input'].append({
"name": "input1",
"shape": feat_shape,
"feat": line,
"filetype": filetype,
})
else:
# isinstance(line, list), multi input
raise NotImplementedError("not support multi input now!")
fout.write(json.dumps(output_json) + '\n')
count += 1
print(f"{args.manifest_paths} Examples number: {count}")
fout.close()
from paddlespeech.dataset.s2t import format_data_main
if __name__ == '__main__':
main()
format_data_main()
......@@ -11,96 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
from paddlespeech.dataset.s2t import format_rsl_main
import jsonlines
def trans_hyp(origin_hyp, trans_hyp=None, trans_hyp_sclite=None):
"""
Args:
origin_hyp: The input json file which contains the model output
trans_hyp: The output file for caculate CER/WER
trans_hyp_sclite: The output file for caculate CER/WER using sclite
"""
input_dict = {}
with open(origin_hyp, "r+", encoding="utf8") as f:
for item in jsonlines.Reader(f):
input_dict[item["utt"]] = item["hyps"][0]
if trans_hyp is not None:
with open(trans_hyp, "w+", encoding="utf8") as f:
for key in input_dict.keys():
f.write(key + " " + input_dict[key] + "\n")
if trans_hyp_sclite is not None:
with open(trans_hyp_sclite, "w+") as f:
for key in input_dict.keys():
line = input_dict[key] + "(" + key + ".wav" + ")" + "\n"
f.write(line)
def trans_ref(origin_ref, trans_ref=None, trans_ref_sclite=None):
"""
Args:
origin_hyp: The input json file which contains the model output
trans_hyp: The output file for caculate CER/WER
trans_hyp_sclite: The output file for caculate CER/WER using sclite
"""
input_dict = {}
with open(origin_ref, "r", encoding="utf8") as f:
for item in jsonlines.Reader(f):
input_dict[item["utt"]] = item["text"]
if trans_ref is not None:
with open(trans_ref, "w", encoding="utf8") as f:
for key in input_dict.keys():
f.write(key + " " + input_dict[key] + "\n")
if trans_ref_sclite is not None:
with open(trans_ref_sclite, "w") as f:
for key in input_dict.keys():
line = input_dict[key] + "(" + key + ".wav" + ")" + "\n"
f.write(line)
if __name__ == "__main__":
parser = argparse.ArgumentParser(
prog='format hyp file for compute CER/WER', add_help=True)
parser.add_argument(
'--origin_hyp', type=str, default=None, help='origin hyp file')
parser.add_argument(
'--trans_hyp',
type=str,
default=None,
help='hyp file for caculating CER/WER')
parser.add_argument(
'--trans_hyp_sclite',
type=str,
default=None,
help='hyp file for caculating CER/WER by sclite')
parser.add_argument(
'--origin_ref', type=str, default=None, help='origin ref file')
parser.add_argument(
'--trans_ref',
type=str,
default=None,
help='ref file for caculating CER/WER')
parser.add_argument(
'--trans_ref_sclite',
type=str,
default=None,
help='ref file for caculating CER/WER by sclite')
parser_args = parser.parse_args()
if parser_args.origin_hyp is not None:
trans_hyp(
origin_hyp=parser_args.origin_hyp,
trans_hyp=parser_args.trans_hyp,
trans_hyp_sclite=parser_args.trans_hyp_sclite, )
if parser_args.origin_ref is not None:
trans_ref(
origin_ref=parser_args.origin_ref,
trans_ref=parser_args.trans_ref,
trans_ref_sclite=parser_args.trans_ref_sclite, )
if __name__ == '__main__':
format_rsl_main()
......@@ -22,8 +22,8 @@ import jsonlines
from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
from paddlespeech.s2t.frontend.utility import load_cmvn
from paddlespeech.s2t.io.utility import feat_type
from paddlespeech.s2t.utils.utility import add_arguments
from paddlespeech.s2t.utils.utility import print_arguments
from paddlespeech.utils.argparse import add_arguments
from paddlespeech.utils.argparse import print_arguments
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.